/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
 */

#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/asm.h>

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 *
 * This copies from user space into kernel space, but the kernel
 * space accesses can take a machine check exception, so they too
 * need exception handling.
 *
 * Note: only 32-bit and 64-bit stores have non-temporal versions,
 * and we only use aligned versions. Any unaligned parts at the
 * start or end of the copy will be done using normal cached stores.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rax uncopied bytes or 0 if successful.
 */
SYM_FUNC_START(__copy_user_nocache)
	/* If destination is not 8-byte aligned, we'll have to align it */
	testb $7,%dil
	jne .Lalign

.Lis_aligned:
	cmp $64,%edx
	jb .Lquadwords

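	/*
	 * Main unrolled loop: 64 bytes per iteration. The loads are
	 * issued in two groups of four so that a faulting load has a
	 * known amount of data already stored for this iteration
	 * (none for the first group, 32 bytes for the second), which
	 * the exception table entries below depend on.
	 */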
	.p2align 4,0x90
.Lunrolled:
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
20:	movnti %r8,(%rdi)
21:	movnti %r9,8(%rdi)
22:	movnti %r10,16(%rdi)
23:	movnti %r11,24(%rdi)
30:	movq 32(%rsi),%r8
31:	movq 40(%rsi),%r9
32:	movq 48(%rsi),%r10
33:	movq 56(%rsi),%r11
40:	movnti %r8,32(%rdi)
41:	movnti %r9,40(%rdi)
42:	movnti %r10,48(%rdi)
43:	movnti %r11,56(%rdi)

	addq $64,%rsi
	addq $64,%rdi
	sub $64,%edx
	cmp $64,%edx
	jae .Lunrolled

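	/*
	 * The "NN:" local labels above are referenced as "NNb"
	 * (backwards) by the _ASM_EXTABLE_UA() entries, each of which
	 * maps a potentially faulting instruction to its fixup code.
	 */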
	/*
	 * First set of user mode loads have been done
	 * without any stores, so if they fail, we can
	 * just try the non-unrolled loop.
	 */
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)

	/*
	 * The second set of user mode loads have been
	 * done with 32 bytes stored to the destination,
	 * so we need to take that into account before
	 * falling back to the non-unrolled loop.
	 */
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)

	/*
	 * An exception on a write means that we're
	 * done, but we need to update the count
	 * depending on where in the unrolled loop
	 * we were. The .LdoneNN label encodes how
	 * many bytes of the current 64-byte block
	 * had already been stored when the fault hit.
	 */
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)

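	/*
	 * Copy one quadword at a time. This handles the tail of the
	 * copy, and is also the fallback when a load in the first
	 * half of the unrolled loop faults.
	 */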
.Lquadwords:
	cmp $8,%edx
	jb .Llong
50:	movq (%rsi),%rax
51:	movnti %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%edx
	jmp .Lquadwords

	/*
	 * If we fail on the last full quadword, we will
	 * not try to do any byte-wise cached accesses.
	 * We will try to do one more 4-byte uncached
	 * one, though.
	 */
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)

.Llong:
	test $4,%dl
	je .Lword
60:	movl (%rsi),%eax
61:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
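	/*
	 * movnti stores are weakly ordered; the sfence below makes
	 * all non-temporal stores issued so far globally visible
	 * before the remaining (cached) tail stores and the return.
	 */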
.Lword:
	sfence
	test $2,%dl
	je .Lbyte
70:	movw (%rsi),%ax
71:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lbyte:
	test $1,%dl
	je .Ldone
80:	movb (%rsi),%al
81:	movb %al,(%rdi)
	dec %edx
.Ldone:
	mov %edx,%eax
	RET

	/*
	 * If we fail on the last four bytes, we won't
	 * bother with any fixups. It's dead, Jim. Note
	 * that there's no need for 'sfence' for any
	 * of this, since the exception will have been
	 * serializing.
	 */
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)

	/*
	 * This is the "head needs aligning" case when
	 * the destination isn't 8-byte aligned. The
	 * 4-byte case can be done uncached, but any
	 * smaller alignment is done with regular stores.
	 *
	 * If fewer bytes remain than an alignment step
	 * needs, we jump straight into the matching
	 * part of the tail code instead.
	 */
.Lalign:
	test $1,%dil
	je .Lalign_word
	test %edx,%edx
	je .Ldone
90:	movb (%rsi),%al
91:	movb %al,(%rdi)
	inc %rsi
	inc %rdi
	dec %edx
.Lalign_word:
	test $2,%dil
	je .Lalign_long
	cmp $2,%edx
	jb .Lbyte
92:	movw (%rsi),%ax
93:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lalign_long:
	test $4,%dil
	je .Lis_aligned
	cmp $4,%edx
	jb .Lword
94:	movl (%rsi),%eax
95:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
	jmp .Lis_aligned

	/*
	 * If we fail on the initial alignment accesses,
	 * we're all done. Again, no point in trying to
	 * do byte-by-byte probing if the 4-byte load
	 * fails - we're not doing any uncached accesses
	 * any more.
	 */
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)

	/*
	 * Exception table fixups for faults in the middle of the
	 * unrolled loop: each label falls through to the next one,
	 * subtracting 8 for every quadword that had already been
	 * stored before the fault.
	 */
.Ldone56:	sub $8,%edx
.Ldone48:	sub $8,%edx
.Ldone40:	sub $8,%edx
.Ldone32:	sub $8,%edx
.Ldone24:	sub $8,%edx
.Ldone16:	sub $8,%edx
.Ldone8:	sub $8,%edx
.Ldone0:
	mov %edx,%eax
	RET

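	/*
	 * A load in the second group of the unrolled loop faulted:
	 * 32 bytes had already been stored, so account for them and
	 * retry the rest through the non-unrolled loop.
	 */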
.Lfixup32:
	addq $32,%rsi
	addq $32,%rdi
	sub $32,%edx
	jmp .Lquadwords

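	/*
	 * A full quadword load faulted: try to salvage at least four
	 * more bytes with one final uncached 4-byte copy.
	 */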
.Llast4:
52:	movl (%rsi),%eax
53:	movnti %eax,(%rdi)
	sfence
	sub $4,%edx
	mov %edx,%eax
	RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)

SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)