1 | /* Optimized memcpy implementation for cached memory on PowerPC64/POWER8. |
2 | Copyright (C) 2017-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | |
21 | |
22 | /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); |
23 | Returns 'dst'. */ |
24 | |
.machine power8
/* void *__memcpy_power8_cached (void *dst [r3], const void *src [r4],
				 size_t len [r5]);
   Returns dst (r3 is never clobbered).

   Register roles:
     r3  - dst, preserved as the return value.
     r4  - running src pointer.
     r5  - remaining length.
     r9  - running dst pointer on the scalar (< 16 bytes) path.
     r12 - running (16-byte-aligned) dst pointer on the vector path.
     v0-v3, v10-v12 - VSX copy temporaries.

   Strategy: lengths < 16 are copied with one scalar load/store per bit
   set in len (1/2/4/8 bytes); larger lengths use unaligned-capable
   lxvd2x/stxvd2x 16-byte vector moves, with a 128-byte unrolled main
   loop once dst has been aligned to 16 bytes.  */
ENTRY_TOCLESS (__memcpy_power8_cached, 5)
	CALL_MCOUNT 3

	cmpldi	cr7,r5,15
	bgt	cr7,L(ge_16)

	/* len < 16: copy the 1, 2, 4 and 8 byte chunks selected by the
	   low bits of len.  r9 tracks the current dst position.  */
	andi.	r9,r5,0x1	/* len & 1: need a byte copy?  */
	mr	r9,r3
	beq	cr0,1f
	lbz	r10,0(r4)
	addi	r9,r3,1
	addi	r4,r4,1
	stb	r10,0(r3)
1:
	andi.	r10,r5,0x2	/* len & 2: need a halfword copy?  */
	beq	cr0,2f
	lhz	r10,0(r4)
	addi	r9,r9,2
	addi	r4,r4,2
	sth	r10,-2(r9)
2:
	andi.	r10,r5,0x4	/* len & 4: need a word copy?  */
	beq	cr0,3f
	lwz	r10,0(r4)
	addi	r9,r9,4		/* Was "addi r9,9,4": same encoding, use
				   the symbolic register name.  */
	addi	r4,r4,4		/* Was "addi r4,4,4": operand 2 is
				   register r4, not an immediate.  */
	stw	r10,-4(r9)
3:
	andi.	r10,r5,0x8	/* len & 8: need a doubleword copy?  */
	beqlr	cr0
	ld	r10,0(r4)
	std	r10,0(r9)
	blr

	.align 4
L(ge_16):
	cmpldi	cr7,r5,32
	ble	cr7,L(ge_16_le_32)
	cmpldi	cr7,r5,64
	ble	cr7,L(gt_32_le_64)

	/* len > 64: align dst to 16 bytes by copying one (possibly
	   overlapping) unaligned 16-byte vector first.  */
	andi.	r9,r3,0xf
	mr	r12,r3
	beq	cr0,L(dst_is_align_16)
	lxvd2x	v0,0,r4
	subfic	r12,r9,16	/* r12 = bytes needed to reach alignment.  */
	subf	r5,r12,r5
	add	r4,r4,r12
	add	r12,r3,r12	/* r12 = first 16-byte-aligned dst.  */
	stxvd2x	v0,0,r3

L(dst_is_align_16):
	cmpldi	cr7,r5,127
	ble	cr7,L(tail_copy)
	mr	r9,r12
	srdi	r10,r5,7	/* Loop count = len / 128.  */
	li	r11,16		/* r11/r6/r7: constant vector offsets.  */
	li	r6,32
	li	r7,48
	mtctr	r10
	clrrdi	r0,r5,7		/* r0 = bytes the main loop will copy.  */

	/* Main loop, copy 128 bytes each time.  */
	.align 4
L(copy_128):
	lxvd2x	v10,0,r4
	lxvd2x	v11,r4,r11
	addi	r8,r4,64	/* r8/r10: pointers for the second half.  */
	addi	r10,r9,64
	lxvd2x	v12,r4,r6
	lxvd2x	v0,r4,r7
	addi	r4,r4,128
	stxvd2x	v10,0,r9
	stxvd2x	v11,r9,r11
	stxvd2x	v12,r9,r6
	stxvd2x	v0,r9,r7
	addi	r9,r9,128
	lxvd2x	v10,0,r8
	lxvd2x	v11,r8,r11
	lxvd2x	v12,r8,r6
	lxvd2x	v0,r8,r7
	stxvd2x	v10,0,r10
	stxvd2x	v11,r10,r11
	stxvd2x	v12,r10,r6
	stxvd2x	v0,r10,r7
	bdnz	L(copy_128)

	add	r12,r12,r0	/* Advance dst past the copied bytes.  */
	rldicl	r5,r5,0,57	/* r5 = len % 128 (low 7 bits).  */
L(tail_copy):
	/* 64 <= r5 <= 127: copy one 64-byte chunk, then fall through.  */
	cmpldi	cr7,r5,63
	ble	cr7,L(tail_le_64)
	li	r8,16
	li	r10,32
	lxvd2x	v10,0,r4
	li	r9,48
	addi	r5,r5,-64
	lxvd2x	v11,r4,r8
	lxvd2x	v12,r4,r10
	lxvd2x	v0,r4,r9
	addi	r4,r4,64
	stxvd2x	v10,0,r12
	stxvd2x	v11,r12,r8
	stxvd2x	v12,r12,r10
	stxvd2x	v0,r12,r9	/* Was "stxvd2x v0,r12,9": same register,
				   use the symbolic name.  */
	addi	r12,r12,64

L(tail_le_64):
	cmpldi	cr7,r5,32
	bgt	cr7,L(tail_gt_32_le_64)
	cmpdi	cr7,r5,0
	beqlr	cr7
	/* 1 <= r5 <= 32: two possibly-overlapping 16-byte copies that
	   together cover exactly r5 bytes ending at dst+r5.  */
	addi	r5,r5,-32
	li	r9,16
	add	r8,r4,r5
	add	r10,r12,r5
	lxvd2x	v12,r4,r5
	lxvd2x	v0,r8,r9
	stxvd2x	v12,r12,r5
	stxvd2x	v0,r10,r9
	blr

	.align 4
L(ge_16_le_32):
	/* Two possibly-overlapping 16-byte copies: the first from the
	   start, the second ending exactly at dst+len.  */
	addi	r5,r5,-16
	lxvd2x	v0,0,r4
	lxvd2x	v1,r4,r5
	stxvd2x	v0,0,r3
	stxvd2x	v1,r3,r5
	blr

	.align 4
L(gt_32_le_64):
	mr	r12,r3		/* Tail code below copies via r12.  */

	.align 4
L(tail_gt_32_le_64):
	/* 32 < r5 <= 64: four possibly-overlapping 16-byte copies, two
	   from the front and two ending exactly at dst+r5.  */
	li	r9,16
	lxvd2x	v0,0,r4
	addi	r5,r5,-32
	lxvd2x	v1,r4,r9
	add	r8,r4,r5
	lxvd2x	v2,r4,r5
	add	r10,r12,r5
	lxvd2x	v3,r8,r9
	stxvd2x	v0,0,r12
	stxvd2x	v1,r12,r9
	stxvd2x	v2,r12,r5
	stxvd2x	v3,r10,r9
	blr
END_GEN_TB (__memcpy_power8_cached,TB_TOCLESS)
177 | |