/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/mcsafe_test.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, which set REP_GOOD. On CPUs that also have
 * the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs are changed
 * to a jmp to memcpy_erms, which does the copy with a single REP MOVSB.
 */

.weak memcpy

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count
 *
 * Output:
 *	rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

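	/*
	 * REP_GOOD fast path: copy count/8 qwords with REP MOVSQ, then
	 * the remaining count%8 bytes with REP MOVSB. The original
	 * destination is kept in %rax as the return value.
	 */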
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. It is selected by the ALTERNATIVE above when
 * the CPU advertises ERMS.
 */
ENTRY(memcpy_erms)
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

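/*
 * memcpy_orig - fallback for CPUs without REP_GOOD. It copies in
 * unrolled 32-byte chunks, forward or backward, and finishes the
 * 0..31 byte tail with overlapping loads and stores rather than a
 * byte-at-a-time loop.
 */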
ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur, then jump
	 * to the corresponding copy direction.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
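	/*
	 * The count is biased by 32 here and again by the SUB at the top
	 * of the loop, so JAE falls through once fewer than 32 bytes
	 * remain; the ADD after the loop restores the tail count.
	 */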
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
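	/* Same 32-byte count bias as in the forward loop */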
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPs within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
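	/*
	 * Handle the remaining 0..31 bytes. Each size class below copies
	 * the head and the tail of the remainder with possibly overlapping
	 * accesses, which avoids a byte loop.
	 */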
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move 16 to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move 8 to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move 4 to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move 1 to 3 bytes.
	 */
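	/*
	 * %edx now holds count - 1: the JB above caught count == 0, and
	 * the JZ below reuses the SUBL flags (MOVZBL does not modify
	 * flags) to catch count == 1.
	 */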
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
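	/*
	 * Two or three bytes: copy byte 1 and the last byte (%rdx is
	 * count - 1). For two bytes these are the same byte, which is
	 * harmless.
	 */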
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML

MCSAFE_TEST_CTL
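/*
 * MCSAFE_TEST_CTL and the MCSAFE_TEST_{SRC,DST} hooks below are
 * fault-injection aids for exercising the exception paths; in normal
 * builds they expand to nothing.
 */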

/*
 * __memcpy_mcsafe - memory copy with machine check exception handling
 * Note that we only catch machine checks when reading the source
 * addresses. Writes to the target are posted and don't generate
 * machine checks.
 */
ENTRY(__memcpy_mcsafe)
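	/* Returns 0 on success, or the number of bytes not copied on failure */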
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
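	/* %ecx = 8 - (source & 7) leading bytes; %edx = bytes left after them */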
.L_read_leading_bytes:
	movb (%rsi), %al
	MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes
	MCSAFE_TEST_DST %rdi 1 .E_leading_bytes
.L_write_leading_bytes:
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_read_leading_bytes

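	/* Split the remaining count into whole 8-byte words and trailing bytes */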
.L_8byte_aligned:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

.L_read_words:
	movq (%rsi), %r8
	MCSAFE_TEST_SRC %rsi 8 .E_read_words
	MCSAFE_TEST_DST %rdi 8 .E_write_words
.L_write_words:
	movq %r8, (%rdi)
	addq $8, %rsi
	addq $8, %rdi
	decl %ecx
	jnz .L_read_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_read_trailing_bytes:
	movb (%rsi), %al
	MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes
	MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes
.L_write_trailing_bytes:
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_read_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorl %eax, %eax
	ret
ENDPROC(__memcpy_mcsafe)
EXPORT_SYMBOL_GPL(__memcpy_mcsafe)

	.section .fixup, "ax"
	/*
	 * Return number of bytes not copied for any failure. Note that
	 * there is no "tail" handling since the source buffer is 8-byte
	 * aligned and poison is cacheline aligned.
	 */
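	/*
	 * The three labels below fall through into one another:
	 * .E_read_words converts the remaining word count to bytes,
	 * .E_leading_bytes adds the bytes not yet reached in %edx, and
	 * .E_trailing_bytes returns the total in %eax.
	 */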
.E_read_words:
	shll $3, %ecx
.E_leading_bytes:
	addl %edx, %ecx
.E_trailing_bytes:
	mov %ecx, %eax
	ret

	/*
	 * For write fault handling, given the destination is unaligned,
	 * we handle faults on multi-byte writes with a byte-by-byte
	 * copy up to the write-protected page.
	 */
.E_write_words:
	shll $3, %ecx
	addl %edx, %ecx
	movl %ecx, %edx
	jmp mcsafe_handle_tail

	.previous

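/*
 * Reads use the _FAULT extable variant so machine checks on the source
 * are recovered; writes only need ordinary fault handling.
 */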
	_ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
	_ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
	_ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
	_ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
	_ASM_EXTABLE(.L_write_words, .E_write_words)
	_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
#endif