/* Optimized memcpy implementation for CELL BE PowerPC.
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define PREFETCH_AHEAD 6        /* number of cache lines to prefetch ahead of SRC  */
#define ZERO_AHEAD 4            /* number of cache lines to zero ahead of DST  */

/* memcpy routine optimized for CELL-BE-PPC v2.0
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit.
 * CELL:
 * 1st level data cache = 32K
 * 2nd level data cache = 512K
 * 3rd level data cache = 0K
 * With a 3.2 GHz clock rate the latency to the 2nd level cache is >36 clocks,
 * latency to memory is >400 clocks.
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency.
 * For best performance, instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 * The code below is loop unrolled for the CELL cache line size of 128 bytes.
 */
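
/* For illustration only (not part of the build): the copy strategy above
   corresponds roughly to the C sketch below.  The helper name
   copy_cachelines, the CACHE_LINE and PREFETCH_AHEAD_LINES constants and
   the use of __builtin_prefetch are assumptions chosen to mirror
   PREFETCH_AHEAD and the 128-byte line size; it assumes 8-byte aligned
   pointers and a whole number of cache lines.  The real kernel below
   additionally uses dcbz to establish destination lines without reading
   them from memory.

     #include <stddef.h>

     #define CACHE_LINE 128
     #define PREFETCH_AHEAD_LINES 6

     static void
     copy_cachelines (unsigned char *dst, const unsigned char *src,
                      size_t lines)
     {
       for (size_t i = 0; i < lines; i++)
         {
           // Touch a source line PREFETCH_AHEAD_LINES ahead so it is
           // already in cache when the copy loop reaches it.
           __builtin_prefetch (src + PREFETCH_AHEAD_LINES * CACHE_LINE, 0, 0);

           // Copy one 128-byte line in 8-byte words.
           const unsigned long long *s = (const unsigned long long *) src;
           unsigned long long *d = (unsigned long long *) dst;
           for (int j = 0; j < CACHE_LINE / 8; j++)
             d[j] = s[j];

           src += CACHE_LINE;
           dst += CACHE_LINE;
         }
     }
*/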

        .align 7

ENTRY_TOCLESS (MEMCPY, 5)
        CALL_MCOUNT 3

        dcbt    0,r4            /* Prefetch ONE SRC cacheline  */
        cmpldi  cr1,r5,16       /* is size < 16 ?  */
        mr      r6,r3
        blt+    cr1,.Lshortcopy

.Lbigcopy:
        neg     r8,r3           /* LS 4 bits = # bytes to 16-byte dest bdry  */
        clrldi  r8,r8,64-4      /* align to 16-byte boundary  */
        sub     r7,r4,r3
        cmpldi  cr0,r8,0
        beq+    .Ldst_aligned

.Ldst_unaligned:
        mtcrf   0x01,r8         /* put #bytes to boundary into cr7  */
        subf    r5,r8,r5

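        /* Copy 1, 2, 4 and 8 bytes as selected by the low four bits of the
           alignment count now held in cr7, until DST is 16-byte aligned.
           r7 = SRC - DST, so the indexed loads below address the source
           while only DST (r6) is advanced.  */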
        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte  */
        stb     r0,0(r6)
        addi    r6,r6,1
1:      bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 byte  */
        sth     r0,0(r6)
        addi    r6,r6,2
2:      bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 byte  */
        stw     r0,0(r6)
        addi    r6,r6,4
4:      bf      cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 byte  */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        add     r4,r7,r6

.Ldst_aligned:

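        /* DST is now 16-byte aligned.  Compute r7 = bytes up to the next
           128-byte cache line boundary and pre-bias SRC and DST by -8 for
           the ldu/stdu update forms used below; if fewer than 128 bytes
           remain, take the short tail path.  */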
        cmpdi   cr5,r5,128-1

        neg     r7,r6
        addi    r6,r6,-8        /* prepare for stdu  */
        addi    r4,r4,-8        /* prepare for ldu  */

        clrldi  r7,r7,64-7      /* align to cacheline boundary  */
        ble+    cr5,.Llessthancacheline

        cmpldi  cr6,r7,0
        subf    r5,r7,r5
        srdi    r7,r7,4         /* divide size by 16  */
        srdi    r10,r5,7        /* number of cache lines to copy  */

        cmpldi  r10,0
        li      r11,0           /* number of cache lines to copy with prefetch  */
        beq     .Lnocacheprefetch

        cmpldi  r10,PREFETCH_AHEAD
        li      r12,128+8       /* prefetch distance  */
        ble     .Llessthanmaxprefetch

        subi    r11,r10,PREFETCH_AHEAD
        li      r10,PREFETCH_AHEAD

.Llessthanmaxprefetch:
        mtctr   r10

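        /* Touch the first r10 source cache lines (at most PREFETCH_AHEAD)
           before starting the copy; r12 is left holding the steady-state
           prefetch distance used by the main loop below.  */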
.LprefetchSRC:
        dcbt    r12,r4
        addi    r12,r12,128
        bdnz    .LprefetchSRC

.Lnocacheprefetch:
        mtctr   r7
        cmpldi  cr1,r5,128
        clrldi  r5,r5,64-7
        beq     cr6,.Lcachelinealigned

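        /* Copy 16 bytes per iteration (ctr = r7 chunks, computed above)
           until DST reaches the next 128-byte cache line boundary.  */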
.Laligntocacheline:
        ld      r9,0x08(r4)
        ldu     r7,0x10(r4)
        std     r9,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    .Laligntocacheline


.Lcachelinealigned:             /* copy whole cache lines  */

        blt-    cr1,.Llessthancacheline /* size < 128  */

.Louterloop:
        cmpdi   r11,0
        mtctr   r11
        beq-    .Lendloop

        li      r11,128*ZERO_AHEAD+8    /* DCBZ dist  */

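        /* r11 now holds the dcbz distance: zeroing the destination line
           ZERO_AHEAD lines ahead establishes it in the cache without
           reading its old contents from memory.  */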
        .align  4
        /* Copy whole cachelines, optimized by prefetching SRC cacheline  */
.Lloop:                         /* Copy aligned body  */
        dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
        ld      r9, 0x08(r4)
        dcbz    r11,r6
        ld      r7, 0x10(r4)    /* 4 register stride copy is optimal  */
        ld      r8, 0x18(r4)    /* to hide 1st level cache latency.  */
        ld      r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        std     r0, 0x20(r6)
        ld      r9, 0x28(r4)
        ld      r7, 0x30(r4)
        ld      r8, 0x38(r4)
        ld      r0, 0x40(r4)
        std     r9, 0x28(r6)
        std     r7, 0x30(r6)
        std     r8, 0x38(r6)
        std     r0, 0x40(r6)
        ld      r9, 0x48(r4)
        ld      r7, 0x50(r4)
        ld      r8, 0x58(r4)
        ld      r0, 0x60(r4)
        std     r9, 0x48(r6)
        std     r7, 0x50(r6)
        std     r8, 0x58(r6)
        std     r0, 0x60(r6)
        ld      r9, 0x68(r4)
        ld      r7, 0x70(r4)
        ld      r8, 0x78(r4)
        ldu     r0, 0x80(r4)
        std     r9, 0x68(r6)
        std     r7, 0x70(r6)
        std     r8, 0x78(r6)
        stdu    r0, 0x80(r6)

        bdnz    .Lloop

.Lendloop:
        cmpdi   r10,0
        sldi    r10,r10,2       /* adjust from 128 to 32 byte stride  */
        beq-    .Lendloop2
        mtctr   r10

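        /* Copy the last r10 cache lines, which have already been
           prefetched, in 32-byte strides (r10 was converted from a cache
           line count to a 32-byte chunk count above); no further
           dcbt/dcbz is needed here.  */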
.Lloop2:                        /* Copy aligned body  */
        ld      r9, 0x08(r4)
        ld      r7, 0x10(r4)
        ld      r8, 0x18(r4)
        ldu     r0, 0x20(r4)
        std     r9, 0x08(r6)
        std     r7, 0x10(r6)
        std     r8, 0x18(r6)
        stdu    r0, 0x20(r6)

        bdnz    .Lloop2
.Lendloop2:

.Llessthancacheline:            /* less than a cache line left to do?  */
        cmpldi  cr0,r5,16
        srdi    r7,r5,4         /* divide size by 16  */
        blt-    .Ldo_lt16
        mtctr   r7

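        /* Copy the remaining full 16-byte chunks; anything under 16 bytes
           is handled by the tail code below.  */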
.Lcopy_remaining:
        ld      r8,0x08(r4)
        ldu     r7,0x10(r4)
        std     r8,0x08(r6)
        stdu    r7,0x10(r6)
        bdnz    .Lcopy_remaining

.Ldo_lt16:                      /* less than 16 ?  */
        cmpldi  cr0,r5,0        /* copy remaining bytes (0-15)  */
        beqlr+                  /* no rest to copy  */
        addi    r4,r4,8
        addi    r6,r6,8

.Lshortcopy:                    /* SIMPLE COPY to handle size <= 15 bytes  */
        mtcrf   0x01,r5
        sub     r7,r4,r6
        bf-     cr7*4+0,8f
        ldx     r0,r7,r6        /* copy 8 byte  */
        std     r0,0(r6)
        addi    r6,r6,8
8:
        bf      cr7*4+1,4f
        lwzx    r0,r7,r6        /* copy 4 byte  */
        stw     r0,0(r6)
        addi    r6,r6,4
4:
        bf      cr7*4+2,2f
        lhzx    r0,r7,r6        /* copy 2 byte  */
        sth     r0,0(r6)
        addi    r6,r6,2
2:
        bf      cr7*4+3,1f
        lbzx    r0,r7,r6        /* copy 1 byte  */
        stb     r0,0(r6)
1:      blr

END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)