1/* Optimized memcpy implementation for CELL BE PowerPC.
2 Copyright (C) 2010-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20
21#define PREFETCH_AHEAD 6 /* number of SRC cache lines to prefetch (dcbt) ahead of the copy */
22#define ZERO_AHEAD 4 /* number of DST cache lines to zero (dcbz) ahead of the copy */
23
24/* memcpy routine optimized for CELL-BE-PPC v2.0
25 *
26 * The CELL PPC core has 1 integer unit and 1 load/store unit
27 * CELL:
28 * 1st level data cache = 32K
29 * 2nd level data cache = 512K
30 * 3rd level data cache = 0K
31 * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
32 * latency to memory is >400 clocks
33 * To improve copy performance we need to prefetch source data
34 * far ahead to hide this latency
35 * For best performance, instruction forms ending in "." (such as "andi.")
36 * should be avoided as they are implemented in microcode on CELL.
37 * The below code is loop unrolled for the CELL cache line of 128 bytes
38 */
39
/* memcpy: dst in r3, src in r4, length in r5.
   r3 is never written below, so it still holds the original destination
   pointer on every return path.

   Working registers used by this routine:
     r0        scratch for the 1/2/4-byte moves
     r6        running destination pointer (starts as a copy of r3)
     r7        src - dst offset for the indexed loads; later reused as a
               count of 16-byte chunks
     r8        bytes needed to bring the destination to a 16-byte boundary
     r10       whole cache lines to copy; clamped to PREFETCH_AHEAD and
               then used as the line count for .Lloop2
     r11       cache lines copied by .Lloop, then reused as the dcbz offset
     r12       dcbt (source prefetch) offset
     fp9-fp12  carry the 8-byte data moves
   ctr, cr0, cr1, cr5, cr6 and cr7 are also modified.  */
40.align 7
41
42EALIGN (memcpy, 5, 0)
43 CALL_MCOUNT
44
45 dcbt 0,r4 /* Prefetch ONE SRC cacheline */
46 cmplwi cr1,r5,16 /* is size < 16 ? (unsigned compare) */
47 mr r6,r3 /* r6 = working dst pointer; r3 itself stays untouched */
48 blt+ cr1,.Lshortcopy /* fewer than 16 bytes: go straight to the tail copy */
49
50.Lbigcopy:
51 neg r8,r3 /* r8 = -dst */
52 clrlwi r8,r8,32-4 /* keep the low 4 bits: 0..15 bytes up to a 16-byte dst boundary */
53 sub r7,r4,r3 /* r7 = src - dst, so (r7 + r6) addresses the matching src byte */
54 cmplwi cr0,r8,0
55 beq+ .Ldst_aligned /* dst is already 16-byte aligned */
56
57.Ldst_unaligned:
/* Copy r8 (1..15) bytes so that r6 reaches a 16-byte boundary.  The low
   four bits of r8 are placed in cr7 and each bit selects one move:
   cr7*4+3 -> 1 byte, cr7*4+2 -> 2 bytes, cr7*4+1 -> 4 bytes,
   cr7*4+0 -> 8 bytes.  */
58 mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */
59 subf r5,r8,r5 /* r5 = length remaining after the alignment bytes */
60
61 bf cr7*4+3,1f
62 lbzx r0,r7,r6 /* copy 1 byte */
63 stb r0,0(r6)
64 addi r6,r6,1
651: bf cr7*4+2,2f
66 lhzx r0,r7,r6 /* copy 2 byte */
67 sth r0,0(r6)
68 addi r6,r6,2
692: bf cr7*4+1,4f
70 lwzx r0,r7,r6 /* copy 4 byte */
71 stw r0,0(r6)
72 addi r6,r6,4
734: bf cr7*4+0,8f
74 lfdx fp9,r7,r6 /* copy 8 byte */
75 stfd fp9,0(r6)
76 addi r6,r6,8
778:
78 add r4,r7,r6 /* r4 = src advanced by the bytes just copied */
79
80.Ldst_aligned:
/* r6 is 16-byte aligned here and r5 holds the bytes still to copy.  */
81
82 cmpwi cr5,r5,128-1 /* compare remaining length against 127 (signed) */
/* NOTE(review): cmpwi is a signed compare, while r5 is treated as unsigned
   elsewhere in this routine (cmplwi/srwi).  A length with the top bit set
   would compare as negative and take the .Llessthancacheline path -
   confirm whether cmplwi was intended.  */
83
84 neg r7,r6
85 addi r6,r6,-8 /* prepare for stfdu: bias dst by -8 to match the 0x08-based offsets */
86 addi r4,r4,-8 /* prepare for lfdu: bias src by -8 likewise */
87
88 clrlwi r7,r7,32-7 /* r7 = bytes from dst up to the next 128-byte cacheline boundary */
89 ble+ cr5,.Llessthancacheline /* length <= 127: skip whole-cacheline processing */
90
91 cmplwi cr6,r7,0 /* cr6.eq set when dst is already cacheline aligned */
92 subf r5,r7,r5 /* r5 = length left once dst is cacheline aligned */
93 srwi r7,r7,4 /* r7 = number of 16-byte chunks up to the cacheline boundary */
94 srwi r10,r5,7 /* number of cache lines to copy */
95
96 cmplwi r10,0
97 li r11,0 /* number cachelines to copy with prefetch */
98 beq .Lnocacheprefetch /* no whole cacheline left after alignment */
99
100 cmplwi r10,PREFETCH_AHEAD
101 li r12,128+8 /* prefetch distance */
102 ble .Llessthanmaxprefetch
103
104 subi r11,r10,PREFETCH_AHEAD /* r11 = lines beyond the first PREFETCH_AHEAD */
105 li r10,PREFETCH_AHEAD /* r10 clamped to PREFETCH_AHEAD */
106
107.Llessthanmaxprefetch:
108 mtctr r10
109
/* Touch r10 source cachelines, starting at offset r12 from r4 and stepping
   128 bytes.  r12 is left just past the last line touched and is used
   again as the dcbt offset inside .Lloop.  */
110.LprefetchSRC:
111 dcbt r12,r4
112 addi r12,r12,128
113 bdnz .LprefetchSRC
114
115.Lnocacheprefetch:
116 mtctr r7 /* ctr = 16-byte chunks for .Laligntocacheline */
117 cmplwi cr1,r5,128 /* cr1: is at least one whole cacheline left? */
118 clrlwi r5,r5,32-7 /* r5 = length mod 128: bytes left after all whole cachelines */
119 beq cr6,.Lcachelinealigned /* r7 was 0: skip the alignment loop */
120
/* Copy 16 bytes per iteration until dst reaches a cacheline boundary.  */
121.Laligntocacheline:
122 lfd fp9,0x08(r4)
123 lfdu fp10,0x10(r4)
124 stfd fp9,0x08(r6)
125 stfdu fp10,0x10(r6)
126 bdnz .Laligntocacheline
127
128
129.Lcachelinealigned: /* copy while cache lines */
130
131 blt- cr1,.Llessthancacheline /* size <128 */
132
133.Louterloop:
134 cmpwi r11,0
135 mtctr r11 /* ctr = cachelines to copy with dcbt + dcbz */
136 beq- .Lendloop /* none: only the .Lloop2 lines remain */
137
138 li r11,128*ZERO_AHEAD +8 /* DCBZ dist: ZERO_AHEAD lines ahead, +8 undoes the r6 bias */
139
140.align 4
141 /* Copy whole cachelines, optimized by prefetching SRC cacheline */
/* Each iteration moves one 128-byte line; the final lfdu/stfdu pair
   advances r4 and r6 by 0x80.  */
142.Lloop: /* Copy aligned body */
143 dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
144 lfd fp9, 0x08(r4)
145 dcbz r11,r6 /* zero the dst cacheline ZERO_AHEAD lines ahead */
146 lfd fp10, 0x10(r4) /* 4 register stride copy is optimal */
147 lfd fp11, 0x18(r4) /* to hide 1st level cache latency. */
148 lfd fp12, 0x20(r4)
149 stfd fp9, 0x08(r6)
150 stfd fp10, 0x10(r6)
151 stfd fp11, 0x18(r6)
152 stfd fp12, 0x20(r6)
153 lfd fp9, 0x28(r4)
154 lfd fp10, 0x30(r4)
155 lfd fp11, 0x38(r4)
156 lfd fp12, 0x40(r4)
157 stfd fp9, 0x28(r6)
158 stfd fp10, 0x30(r6)
159 stfd fp11, 0x38(r6)
160 stfd fp12, 0x40(r6)
161 lfd fp9, 0x48(r4)
162 lfd fp10, 0x50(r4)
163 lfd fp11, 0x58(r4)
164 lfd fp12, 0x60(r4)
165 stfd fp9, 0x48(r6)
166 stfd fp10, 0x50(r6)
167 stfd fp11, 0x58(r6)
168 stfd fp12, 0x60(r6)
169 lfd fp9, 0x68(r4)
170 lfd fp10, 0x70(r4)
171 lfd fp11, 0x78(r4)
172 lfdu fp12, 0x80(r4)
173 stfd fp9, 0x68(r6)
174 stfd fp10, 0x70(r6)
175 stfd fp11, 0x78(r6)
176 stfdu fp12, 0x80(r6)
177
178 bdnz .Lloop
179
180.Lendloop:
181 cmpwi r10,0
182 slwi r10,r10,2 /* adjust from 128 to 32 byte stride */
183 beq- .Lendloop2 /* tests the cmpwi above, i.e. r10 before the shift */
184 mtctr r10
185
/* Copy the remaining whole cachelines 32 bytes per iteration, without
   dcbt or dcbz.  */
186.Lloop2: /* Copy aligned body */
187 lfd fp9, 0x08(r4)
188 lfd fp10, 0x10(r4)
189 lfd fp11, 0x18(r4)
190 lfdu fp12, 0x20(r4)
191 stfd fp9, 0x08(r6)
192 stfd fp10, 0x10(r6)
193 stfd fp11, 0x18(r6)
194 stfdu fp12, 0x20(r6)
195
196 bdnz .Lloop2
197.Lendloop2:
198
199.Llessthancacheline: /* less than cache to do ? */
200 cmplwi cr0,r5,16
201 srwi r7,r5,4 /* r7 = number of 16-byte chunks in the remaining length */
202 blt- .Ldo_lt16 /* fewer than 16 bytes left */
203 mtctr r7
204
/* Copy 16 bytes per iteration.  r5 is not updated here; only its low four
   bits are consumed afterwards by .Lshortcopy.  */
205.Lcopy_remaining:
206 lfd fp9,0x08(r4)
207 lfdu fp10,0x10(r4)
208 stfd fp9,0x08(r6)
209 stfdu fp10,0x10(r6)
210 bdnz .Lcopy_remaining
211
212.Ldo_lt16: /* less than 16 ? */
213 cmplwi cr0,r5,0 /* copy remaining bytes (0-15) */
214 beqlr+ /* no rest to copy */
215 addi r4,r4,8 /* undo the -8 bias applied at .Ldst_aligned */
216 addi r6,r6,8
217
/* Copy (r5 & 15) bytes.  Reached with unbiased r4/r6, either directly from
   the entry check or by falling through from .Ldo_lt16.  The low four bits
   of r5 go into cr7 and select the 8-, 4-, 2- and 1-byte moves.  */
218.Lshortcopy: /* SIMPLE COPY to handle size =< 15 bytes */
219 mtcrf 0x01,r5
220 sub r7,r4,r6 /* r7 = src - dst offset for the indexed loads */
221 bf- cr7*4+0,8f
222 lfdx fp9,r7,r6 /* copy 8 byte */
223 stfd fp9,0(r6)
224 addi r6,r6,8
2258:
226 bf cr7*4+1,4f
227 lwzx r0,r7,r6 /* copy 4 byte */
228 stw r0,0(r6)
229 addi r6,r6,4
2304:
231 bf cr7*4+2,2f
232 lhzx r0,r7,r6 /* copy 2 byte */
233 sth r0,0(r6)
234 addi r6,r6,2
2352:
236 bf cr7*4+3,1f
237 lbzx r0,r7,r6 /* copy 1 byte */
238 stb r0,0(r6)
2391: blr
240
241END (memcpy)
242libc_hidden_builtin_def (memcpy)
243

source code of glibc/sysdeps/powerpc/powerpc32/cell/memcpy.S