// SPDX-License-Identifier: GPL-2.0-only
/*
 * xsave/xrstor support.
 *
 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
 */
#include <linux/bitops.h>
#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/mman.h>
#include <linux/nospec.h>
#include <linux/pkeys.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>

#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/xcr.h>

#include <asm/tlbflush.h>
#include <asm/prctl.h>
#include <asm/elf.h>

#include "context.h"
#include "internal.h"
#include "legacy.h"
#include "xstate.h"

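/*
 * Iterate over all xfeature bits set in @mask, starting at the first
 * extended xfeature, i.e. skipping the legacy FP and SSE states:
 */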
#define for_each_extended_xfeature(bit, mask)				\
        (bit) = FIRST_EXTENDED_XFEATURE;				\
        for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))

/*
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused. We use other mechanisms
 * to save/restore PT state in Linux.
 */
static const char *xfeature_names[] =
{
        "x87 floating point registers",
        "SSE registers",
        "AVX registers",
        "MPX bounds registers",
        "MPX CSR",
        "AVX-512 opmask",
        "AVX-512 Hi256",
        "AVX-512 ZMM_Hi256",
        "Processor Trace (unused)",
        "Protection Keys User registers",
        "PASID state",
        "Control-flow User registers",
        "Control-flow Kernel registers (unused)",
        "unknown xstate feature",
        "unknown xstate feature",
        "unknown xstate feature",
        "unknown xstate feature",
        "AMX Tile config",
        "AMX Tile data",
        "unknown xstate feature",
};

static unsigned short xsave_cpuid_features[] __initdata = {
        [XFEATURE_FP] = X86_FEATURE_FPU,
        [XFEATURE_SSE] = X86_FEATURE_XMM,
        [XFEATURE_YMM] = X86_FEATURE_AVX,
        [XFEATURE_BNDREGS] = X86_FEATURE_MPX,
        [XFEATURE_BNDCSR] = X86_FEATURE_MPX,
        [XFEATURE_OPMASK] = X86_FEATURE_AVX512F,
        [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F,
        [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F,
        [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT,
        [XFEATURE_PKRU] = X86_FEATURE_OSPKE,
        [XFEATURE_PASID] = X86_FEATURE_ENQCMD,
        [XFEATURE_CET_USER] = X86_FEATURE_SHSTK,
        [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE,
        [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE,
};

static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
        { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
        { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;

#define XSTATE_FLAG_SUPERVISOR	BIT(0)
#define XSTATE_FLAG_ALIGNED64	BIT(1)

/*
 * Return whether the system supports a given xfeature.
 *
 * Also return the name of the (most advanced) feature that the caller requested:
 */
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
        u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;

        if (unlikely(feature_name)) {
                long xfeature_idx, max_idx;
                u64 xfeatures_print;
                /*
                 * We use fls64() here to be able to print the most advanced
                 * feature that was requested but is missing. So if a driver
                 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we print
                 * the missing AVX feature - this is the most informative
                 * message to users:
                 */
                if (xfeatures_missing)
                        xfeatures_print = xfeatures_missing;
                else
                        xfeatures_print = xfeatures_needed;

                xfeature_idx = fls64(xfeatures_print) - 1;
                max_idx = ARRAY_SIZE(xfeature_names) - 1;
                xfeature_idx = min(xfeature_idx, max_idx);

                *feature_name = xfeature_names[xfeature_idx];
        }

        if (xfeatures_missing)
                return 0;

        return 1;
}
EXPORT_SYMBOL_GPL(cpu_has_xfeatures);

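/*
 * Query the per-feature flags which setup_xstate_cache() caches from
 * the ECX output of the per-feature CPUID subleaves:
 */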
static bool xfeature_is_aligned64(int xfeature_nr)
{
        return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
}

static bool xfeature_is_supervisor(int xfeature_nr)
{
        return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
}

static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
{
        unsigned int offs, i;

        /*
         * Non-compacted format and legacy features use the cached fixed
         * offsets.
         */
        if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
            xfeature <= XFEATURE_SSE)
                return xstate_offsets[xfeature];

        /*
         * Compacted format offsets depend on the actual content of the
         * compacted xsave area which is determined by the xcomp_bv header
         * field.
         */
        offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
        for_each_extended_xfeature(i, xcomp_bv) {
                if (xfeature_is_aligned64(i))
                        offs = ALIGN(offs, 64);
                if (i == xfeature)
                        break;
                offs += xstate_sizes[i];
        }
        return offs;
}

/*
 * Enable the extended processor state save/restore feature.
 * Called once per CPU onlining.
 */
void fpu__init_cpu_xstate(void)
{
        if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
                return;

        cr4_set_bits(X86_CR4_OSXSAVE);

        /*
         * Must happen after CR4 setup and before xsetbv() to allow KVM
         * lazy passthrough. Write independently of the dynamic state
         * static key as that does not work on the boot CPU. This also
         * ensures that any stale state is wiped out from XFD. Reset the
         * per CPU xfd cache too.
         */
        if (cpu_feature_enabled(X86_FEATURE_XFD))
                xfd_set_state(init_fpstate.xfd);

        /*
         * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
         * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user
         * states can be set here.
         */
        xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);

        /*
         * MSR_IA32_XSS sets supervisor states managed by XSAVES.
         */
        if (boot_cpu_has(X86_FEATURE_XSAVES)) {
                wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
                                     xfeatures_mask_independent());
        }
}

static bool xfeature_enabled(enum xfeature xfeature)
{
        return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
}

/*
 * Record the offsets and sizes of various xstates contained
 * in the XSAVE state memory layout.
 */
static void __init setup_xstate_cache(void)
{
        u32 eax, ebx, ecx, edx, i;
        /* start at the beginning of the "extended state" */
        unsigned int last_good_offset = offsetof(struct xregs_state,
                                                 extended_state_area);
        /*
         * The FP xstates and SSE xstates are legacy states. They are always
         * in the fixed offsets in the xsave area in either compacted form
         * or standard form.
         */
        xstate_offsets[XFEATURE_FP] = 0;
        xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state,
                                             xmm_space);

        xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP];
        xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state,
                                                  xmm_space);

        for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
                cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);

                xstate_sizes[i] = eax;
                xstate_flags[i] = ecx;

                /*
                 * If an xfeature is supervisor state, the offset in EBX is
                 * invalid, leave it as -1.
                 */
                if (xfeature_is_supervisor(i))
                        continue;

                xstate_offsets[i] = ebx;

                /*
                 * In our xstate size checks, we assume that the highest-numbered
                 * xstate feature has the highest offset in the buffer. Ensure
                 * it does.
                 */
                WARN_ONCE(last_good_offset > xstate_offsets[i],
                          "x86/fpu: misordered xstate at %d\n", last_good_offset);

                last_good_offset = xstate_offsets[i];
        }
}

static void __init print_xstate_feature(u64 xstate_mask)
{
        const char *feature_name;

        if (cpu_has_xfeatures(xstate_mask, &feature_name))
                pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
}

/*
 * Print out all the supported xstate features:
 */
static void __init print_xstate_features(void)
{
        print_xstate_feature(XFEATURE_MASK_FP);
        print_xstate_feature(XFEATURE_MASK_SSE);
        print_xstate_feature(XFEATURE_MASK_YMM);
        print_xstate_feature(XFEATURE_MASK_BNDREGS);
        print_xstate_feature(XFEATURE_MASK_BNDCSR);
        print_xstate_feature(XFEATURE_MASK_OPMASK);
        print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
        print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
        print_xstate_feature(XFEATURE_MASK_PKRU);
        print_xstate_feature(XFEATURE_MASK_PASID);
        print_xstate_feature(XFEATURE_MASK_CET_USER);
        print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
        print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
}

/*
 * This check is important because it is easy to get XSTATE_*
 * confused with XSTATE_BIT_*.
 */
#define CHECK_XFEATURE(nr) do {			\
        WARN_ON(nr < FIRST_EXTENDED_XFEATURE);	\
        WARN_ON(nr >= XFEATURE_MAX);		\
} while (0)

/*
 * Print out xstate component offsets and sizes
 */
static void __init print_xstate_offset_size(void)
{
        int i;

        for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
                pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
                        i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
                        i, xstate_sizes[i]);
        }
}

/*
 * This function is called only during boot time when x86 caps are not set
 * up and alternatives can not be used yet.
 */
static __init void os_xrstor_booting(struct xregs_state *xstate)
{
        u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
        u32 lmask = mask;
        u32 hmask = mask >> 32;
        int err;

        if (cpu_feature_enabled(X86_FEATURE_XSAVES))
                XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
        else
                XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);

        /*
         * We should never fault when copying from a kernel buffer, and the FPU
         * state we set at boot time should be valid.
         */
        WARN_ON_FPU(err);
}

/*
 * All supported features have either init state all zeros or are
 * handled in setup_init_fpu_buf() individually. This is an explicit
 * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
 * newly added supported features at build time and make people
 * actually look at the init state for the new feature.
 */
#define XFEATURES_INIT_FPSTATE_HANDLED		\
        (XFEATURE_MASK_FP |			\
         XFEATURE_MASK_SSE |			\
         XFEATURE_MASK_YMM |			\
         XFEATURE_MASK_OPMASK |			\
         XFEATURE_MASK_ZMM_Hi256 |		\
         XFEATURE_MASK_Hi16_ZMM |		\
         XFEATURE_MASK_PKRU |			\
         XFEATURE_MASK_BNDREGS |		\
         XFEATURE_MASK_BNDCSR |			\
         XFEATURE_MASK_PASID |			\
         XFEATURE_MASK_CET_USER |		\
         XFEATURE_MASK_XTILE)

/*
 * setup the xstate image representing the init state
 */
static void __init setup_init_fpu_buf(void)
{
        BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
                      XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
                     XFEATURES_INIT_FPSTATE_HANDLED);

        if (!boot_cpu_has(X86_FEATURE_XSAVE))
                return;

        print_xstate_features();

        xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);

        /*
         * Init all the features state with header.xfeatures being 0x0
         */
        os_xrstor_booting(&init_fpstate.regs.xsave);

        /*
         * All components are now in init state. Read the state back so
         * that init_fpstate contains all non-zero init state. This only
         * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
         * those use the init optimization which skips writing data for
         * components in init state.
         *
         * XSAVE could be used, but that would require reshuffling the
         * data when XSAVEC/S is available because XSAVEC/S uses xstate
         * compaction. Doing so is a pointless exercise anyway because
         * most components have an all zeros init state except for the
         * legacy ones (FP and SSE). Those can be saved with FXSAVE into
         * the legacy area. Adding new features requires ensuring that
         * their init state is all zeroes, or adding the necessary
         * handling here if it is not.
         */
        fxsave(&init_fpstate.regs.fxsave);
}

int xfeature_size(int xfeature_nr)
{
        u32 eax, ebx, ecx, edx;

        CHECK_XFEATURE(xfeature_nr);
        cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
        return eax;
}

/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
static int validate_user_xstate_header(const struct xstate_header *hdr,
				       struct fpstate *fpstate)
{
        /* No unknown or supervisor features may be set */
        if (hdr->xfeatures & ~fpstate->user_xfeatures)
                return -EINVAL;

        /* Userspace must use the uncompacted format */
        if (hdr->xcomp_bv)
                return -EINVAL;

        /*
         * If 'reserved' is shrunken to add a new field, make sure to validate
         * that new field here!
         */
        BUILD_BUG_ON(sizeof(hdr->reserved) != 48);

        /* No reserved bits may be set */
        if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
                return -EINVAL;

        return 0;
}

static void __init __xstate_dump_leaves(void)
{
        int i;
        u32 eax, ebx, ecx, edx;
        static int should_dump = 1;

        if (!should_dump)
                return;
        should_dump = 0;
        /*
         * Dump out a few leaves past the ones that we support
         * just in case there are some goodies up there
         */
        for (i = 0; i < XFEATURE_MAX + 10; i++) {
                cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
                pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
                        XSTATE_CPUID, i, eax, ebx, ecx, edx);
        }
}

#define XSTATE_WARN_ON(x, fmt, ...) do {					\
        if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) {	\
                __xstate_dump_leaves();						\
        }									\
} while (0)

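/*
 * Compare the state size reported by CPUID against the size of the
 * corresponding C struct and dump the xstate leaves on a mismatch:
 */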
#define XCHECK_SZ(sz, nr, __struct) ({					\
        if (WARN_ONCE(sz != sizeof(__struct),				\
                      "[%s]: struct is %zu bytes, cpu state %d bytes\n",	\
                      xfeature_names[nr], sizeof(__struct), sz)) {	\
                __xstate_dump_leaves();					\
        }								\
        true;								\
})

/**
 * check_xtile_data_against_struct - Check tile data state size.
 *
 * Calculate the state size by multiplying the single tile size which is
 * recorded in a C struct, and the number of tiles that the CPU informs.
 * Compare the provided size with the calculation.
 *
 * @size: The tile data state size
 *
 * Returns: 0 on success, -EINVAL on mismatch.
 */
static int __init check_xtile_data_against_struct(int size)
{
        u32 max_palid, palid, state_size;
        u32 eax, ebx, ecx, edx;
        u16 max_tile;

        /*
         * Check the maximum palette id:
         * eax: the highest numbered palette subleaf.
         */
        cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);

        /*
         * Cross-check each tile size and find the maximum number of
         * supported tiles.
         */
        for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
                u16 tile_size, max;

                /*
                 * Check the tile size info:
                 * eax[31:16]: bytes per tile
                 * ebx[31:16]: the max names (or max number of tiles)
                 */
                cpuid_count(TILE_CPUID, palid, &eax, &ebx, &ecx, &edx);
                tile_size = eax >> 16;
                max = ebx >> 16;

                if (tile_size != sizeof(struct xtile_data)) {
                        pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
                               __stringify(XFEATURE_XTILE_DATA),
                               sizeof(struct xtile_data), tile_size);
                        __xstate_dump_leaves();
                        return -EINVAL;
                }

                if (max > max_tile)
                        max_tile = max;
        }

        state_size = sizeof(struct xtile_data) * max_tile;
        if (size != state_size) {
                pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
                       __stringify(XFEATURE_XTILE_DATA), state_size, size);
                __xstate_dump_leaves();
                return -EINVAL;
        }
        return 0;
}

/*
 * We have a C struct for each 'xstate'. We need to ensure
 * that our software representation matches what the CPU
 * tells us about the state's size.
 */
static bool __init check_xstate_against_struct(int nr)
{
        /*
         * Ask the CPU for the size of the state.
         */
        int sz = xfeature_size(nr);

        /*
         * Match each CPU state with the corresponding software
         * structure.
         */
        switch (nr) {
        case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct);
        case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
        case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
        case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
        case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
        case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
        case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state);
        case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
        case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg);
        case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state);
        case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
        default:
                XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
                return false;
        }

        return true;
}

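/*
 * The required size is the offset of the highest-numbered enabled
 * xfeature plus the size of that feature's state. For the compacted
 * format the offset has to be recomputed from the feature mask.
 */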
static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
{
        unsigned int topmost = fls64(xfeatures) - 1;
        unsigned int offset = xstate_offsets[topmost];

        if (topmost <= XFEATURE_SSE)
                return sizeof(struct xregs_state);

        if (compacted)
                offset = xfeature_get_offset(xfeatures, topmost);
        return offset + xstate_sizes[topmost];
}

/*
 * This essentially double-checks what the cpu told us about
 * how large the XSAVE buffer needs to be. We are recalculating
 * it to be safe.
 *
 * Independent XSAVE features allocate their own buffers and are not
 * covered by these checks. Only the size of the buffer for task->fpu
 * is checked here.
 */
static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
{
        bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
        bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
        unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
        int i;

        for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
                if (!check_xstate_against_struct(i))
                        return false;
                /*
                 * Supervisor state components can be managed only by
                 * XSAVES.
                 */
                if (!xsaves && xfeature_is_supervisor(i)) {
                        XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
                        return false;
                }
        }
        size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
        XSTATE_WARN_ON(size != kernel_size,
                       "size %u != kernel_size %u\n", size, kernel_size);
        return size == kernel_size;
}

/*
 * Get total size of enabled xstates in XCR0 | IA32_XSS.
 *
 * Note the SDM's wording here. "sub-function 0" only enumerates
 * the size of the *user* states. If we use it to size a buffer
 * that we use 'XSAVES' on, we could potentially overflow the
 * buffer because 'XSAVES' saves system states too.
 *
 * This also takes compaction into account. So this works for
 * XSAVEC as well.
 */
static unsigned int __init get_compacted_size(void)
{
        unsigned int eax, ebx, ecx, edx;
        /*
         * - CPUID function 0DH, sub-function 1:
         *    EBX enumerates the size (in bytes) required by
         *    the XSAVES instruction for an XSAVE area
         *    containing all the state components
         *    corresponding to bits currently set in
         *    XCR0 | IA32_XSS.
         *
         * When XSAVES is not available but XSAVEC is (virt), then there
         * are no supervisor states, but XSAVEC still uses compacted
         * format.
         */
        cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
        return ebx;
}

/*
 * Get the total size of the enabled xstates without the independent supervisor
 * features.
 */
static unsigned int __init get_xsave_compacted_size(void)
{
        u64 mask = xfeatures_mask_independent();
        unsigned int size;

        if (!mask)
                return get_compacted_size();

        /* Disable independent features. */
        wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());

        /*
         * Ask the hardware what size is required of the buffer.
         * This is the size required for the task->fpu buffer.
         */
        size = get_compacted_size();

        /* Re-enable independent features so XSAVES will work on them again. */
        wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);

        return size;
}

static unsigned int __init get_xsave_size_user(void)
{
        unsigned int eax, ebx, ecx, edx;
        /*
         * - CPUID function 0DH, sub-function 0:
         *    EBX enumerates the size (in bytes) required by
         *    the XSAVE instruction for an XSAVE area
         *    containing all the *user* state components
         *    corresponding to bits currently set in XCR0.
         */
        cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
        return ebx;
}

static int __init init_xstate_size(void)
{
        /* Recompute the context size for enabled features: */
        unsigned int user_size, kernel_size, kernel_default_size;
        bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);

        /* Uncompacted user space size */
        user_size = get_xsave_size_user();

        /*
         * XSAVES kernel size includes supervisor states and uses compacted
         * format. XSAVEC uses compacted format, but does not save
         * supervisor states.
         *
         * XSAVE[OPT] do not support supervisor states so kernel and user
         * size is identical.
         */
        if (compacted)
                kernel_size = get_xsave_compacted_size();
        else
                kernel_size = user_size;

        kernel_default_size =
                xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);

        if (!paranoid_xstate_size_valid(kernel_size))
                return -EINVAL;

        fpu_kernel_cfg.max_size = kernel_size;
        fpu_user_cfg.max_size = user_size;

        fpu_kernel_cfg.default_size = kernel_default_size;
        fpu_user_cfg.default_size =
                xstate_calculate_size(fpu_user_cfg.default_features, false);

        return 0;
}

/*
 * We enabled the XSAVE hardware, but something went wrong and
 * we can not use it. Disable it.
 */
static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
{
        fpu_kernel_cfg.max_features = 0;
        cr4_clear_bits(X86_CR4_OSXSAVE);
        setup_clear_cpu_cap(X86_FEATURE_XSAVE);

        /* Restore the legacy size. */
        fpu_kernel_cfg.max_size = legacy_size;
        fpu_kernel_cfg.default_size = legacy_size;
        fpu_user_cfg.max_size = legacy_size;
        fpu_user_cfg.default_size = legacy_size;

        /*
         * Prevent enabling the static branch which enables writes to the
         * XFD MSR.
         */
        init_fpstate.xfd = 0;

        fpstate_reset(&current->thread.fpu);
}

/*
 * Enable and initialize the xsave feature.
 * Called once per system bootup.
 */
void __init fpu__init_system_xstate(unsigned int legacy_size)
{
        unsigned int eax, ebx, ecx, edx;
        u64 xfeatures;
        int err;
        int i;

        if (!boot_cpu_has(X86_FEATURE_FPU)) {
                pr_info("x86/fpu: No FPU detected\n");
                return;
        }

        if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
                pr_info("x86/fpu: x87 FPU will use %s\n",
                        boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
                return;
        }

        if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
                WARN_ON_FPU(1);
                return;
        }

        /*
         * Find user xstates supported by the processor.
         */
        cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
        fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);

        /*
         * Find supervisor xstates supported by the processor.
         */
        cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
        fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);

        if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
                /*
                 * This indicates that something really unexpected happened
                 * with the enumeration. Disable XSAVE and try to continue
                 * booting without it. This is too early to BUG().
                 */
                pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
                       fpu_kernel_cfg.max_features);
                goto out_disable;
        }

        /*
         * Clear XSAVE features that are disabled in the normal CPUID.
         */
        for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
                unsigned short cid = xsave_cpuid_features[i];

                /* Careful: X86_FEATURE_FPU is 0! */
                if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
                        fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
        }

        if (!cpu_feature_enabled(X86_FEATURE_XFD))
                fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;

        if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
                fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
        else
                fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
                                               XFEATURE_MASK_SUPERVISOR_SUPPORTED;

        fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
        fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;

        /* Clean out dynamic features from default */
        fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
        fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

        fpu_user_cfg.default_features = fpu_user_cfg.max_features;
        fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

        /* Store it for paranoia check at the end */
        xfeatures = fpu_kernel_cfg.max_features;

        /*
         * Initialize the default XFD state in init_fpstate and enable the
         * dynamic sizing mechanism if dynamic states are available. The
         * static key cannot be enabled here because this runs before
         * jump_label_init(). This is delayed to an initcall.
         */
        init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;

        /* Set up compaction feature bit */
        if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
            cpu_feature_enabled(X86_FEATURE_XSAVES))
                setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);

        /* Enable xstate instructions to be able to continue with initialization: */
        fpu__init_cpu_xstate();

        /* Cache size, offset and flags for initialization */
        setup_xstate_cache();

        err = init_xstate_size();
        if (err)
                goto out_disable;

        /* Reset the state for the current task */
        fpstate_reset(&current->thread.fpu);

        /*
         * Update info used for ptrace frames; use standard-format size and no
         * supervisor xstates:
         */
        update_regset_xstate_info(fpu_user_cfg.max_size,
                                  fpu_user_cfg.max_features);

        /*
         * init_fpstate excludes dynamic states as they are large but init
         * state is zero.
         */
        init_fpstate.size = fpu_kernel_cfg.default_size;
        init_fpstate.xfeatures = fpu_kernel_cfg.default_features;

        if (init_fpstate.size > sizeof(init_fpstate.regs)) {
                pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n",
                        sizeof(init_fpstate.regs), init_fpstate.size);
                goto out_disable;
        }

        setup_init_fpu_buf();

        /*
         * Paranoia check whether something in the setup modified the
         * xfeatures mask.
         */
        if (xfeatures != fpu_kernel_cfg.max_features) {
                pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
                       xfeatures, fpu_kernel_cfg.max_features);
                goto out_disable;
        }

        /*
         * CPU capabilities initialization runs before FPU init. So
         * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
         * functional, set the feature bit so depending code works.
         */
        setup_force_cpu_cap(X86_FEATURE_OSXSAVE);

        print_xstate_offset_size();
        pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
                fpu_kernel_cfg.max_features,
                fpu_kernel_cfg.max_size,
                boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
        return;

out_disable:
        /* something went wrong, try to boot without any XSAVE support */
        fpu__init_disable_system_xstate(legacy_size);
}

/*
 * Restore minimal FPU state after suspend:
 */
void fpu__resume_cpu(void)
{
        /*
         * Restore XCR0 on xsave capable CPUs:
         */
        if (cpu_feature_enabled(X86_FEATURE_XSAVE))
                xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);

        /*
         * Restore IA32_XSS. The same CPUID bit enumerates support
         * of XSAVES and MSR_IA32_XSS.
         */
        if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
                wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
                                     xfeatures_mask_independent());
        }

        if (fpu_state_size_dynamic())
                wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
}

/*
 * Given an xstate feature nr, calculate where in the xsave
 * buffer the state is. Callers should ensure that the buffer
 * is valid.
 */
static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
{
        u64 xcomp_bv = xsave->header.xcomp_bv;

        if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
                return NULL;

        if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
                if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
                        return NULL;
        }

        return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
}

/*
 * Given the xsave area and a state inside, this function returns the
 * address of the state.
 *
 * This is the API that is called to get xstate address in either
 * standard format or compacted format of xsave area.
 *
 * Note that if there is no data for the field in the xsave buffer
 * this will return NULL.
 *
 * Inputs:
 *	xstate: the thread's storage area for all FPU data
 *	xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
 *	XFEATURE_SSE, etc...)
 * Output:
 *	address of the state in the xsave area, or NULL if the
 *	field is not present in the xsave buffer.
 */
void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
{
        /*
         * Do we even *have* xsave state?
         */
        if (!boot_cpu_has(X86_FEATURE_XSAVE))
                return NULL;

        /*
         * We should not ever be requesting features that we
         * have not enabled.
         */
        if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
                return NULL;

        /*
         * This assumes the last 'xsave*' instruction to
         * have requested that 'xfeature_nr' be saved.
         * If it did not, we might be seeing an old value
         * of the field in the buffer.
         *
         * This can happen because the last 'xsave' did not
         * request that this feature be saved (unlikely)
         * or because the "init optimization" caused it
         * to not be saved.
         */
        if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
                return NULL;

        return __raw_xsave_addr(xsave, xfeature_nr);
}

#ifdef CONFIG_ARCH_HAS_PKEYS

/*
 * This will go out and modify PKRU register to set the access
 * rights for @pkey to @init_val.
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                              unsigned long init_val)
{
        u32 old_pkru, new_pkru_bits = 0;
        int pkey_shift;

        /*
         * This check implies XSAVE support. OSPKE only gets
         * set if we enable XSAVE and we enable PKU in XCR0.
         */
        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return -EINVAL;

        /*
         * This code should only be called with valid 'pkey'
         * values originating from in-kernel users. Complain
         * if a bad value is observed.
         */
        if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
                return -EINVAL;

        /* Set the bits we need in PKRU: */
        if (init_val & PKEY_DISABLE_ACCESS)
                new_pkru_bits |= PKRU_AD_BIT;
        if (init_val & PKEY_DISABLE_WRITE)
                new_pkru_bits |= PKRU_WD_BIT;

        /* Shift the bits in to the correct place in PKRU for pkey: */
        pkey_shift = pkey * PKRU_BITS_PER_PKEY;
        new_pkru_bits <<= pkey_shift;

        /* Get old PKRU and mask off any old bits in place: */
        old_pkru = read_pkru();
        old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);

        /* Write old part along with new part: */
        write_pkru(old_pkru | new_pkru_bits);

        return 0;
}
#endif /* CONFIG_ARCH_HAS_PKEYS */

static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
                         void *init_xstate, unsigned int size)
{
        membuf_write(to, from_xstate ? xstate : init_xstate, size);
}

/**
 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to: membuf descriptor
 * @fpstate: The fpstate buffer from which to copy
 * @xfeatures: The mask of xfeatures to save (XSAVE mode only)
 * @pkru_val: The PKRU value to store in the PKRU component
 * @copy_mode: The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
 *
 * It supports partial copy but @to.pos always starts from zero.
 */
void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
                               u64 xfeatures, u32 pkru_val,
                               enum xstate_copy_mode copy_mode)
{
        const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
        struct xregs_state *xinit = &init_fpstate.regs.xsave;
        struct xregs_state *xsave = &fpstate->regs.xsave;
        struct xstate_header header;
        unsigned int zerofrom;
        u64 mask;
        int i;

        memset(&header, 0, sizeof(header));
        header.xfeatures = xsave->header.xfeatures;

        /* Mask out the feature bits depending on copy mode */
        switch (copy_mode) {
        case XSTATE_COPY_FP:
                header.xfeatures &= XFEATURE_MASK_FP;
                break;

        case XSTATE_COPY_FX:
                header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
                break;

        case XSTATE_COPY_XSAVE:
                header.xfeatures &= fpstate->user_xfeatures & xfeatures;
                break;
        }

        /* Copy FP state up to MXCSR */
        copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
                     &xinit->i387, off_mxcsr);

        /* Copy MXCSR when SSE or YMM are set in the feature mask */
        copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
                     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
                     MXCSR_AND_FLAGS_SIZE);

        /* Copy the remaining FP state */
        copy_feature(header.xfeatures & XFEATURE_MASK_FP,
                     &to, &xsave->i387.st_space, &xinit->i387.st_space,
                     sizeof(xsave->i387.st_space));

        /* Copy the SSE state - shared with YMM, but independently managed */
        copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
                     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
                     sizeof(xsave->i387.xmm_space));

        if (copy_mode != XSTATE_COPY_XSAVE)
                goto out;

        /* Zero the padding area */
        membuf_zero(&to, sizeof(xsave->i387.padding));

        /* Copy xsave->i387.sw_reserved */
        membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));

        /* Copy the user space relevant state of @xsave->header */
        membuf_write(&to, &header, sizeof(header));

        zerofrom = offsetof(struct xregs_state, extended_state_area);

        /*
         * This 'mask' indicates which states to copy from fpstate.
         * Those extended states that are not present in fpstate are
         * either disabled or initialized:
         *
         * In non-compacted format, disabled features still occupy
         * state space but there is no state to copy from in the
         * compacted init_fpstate. The gap tracking will zero these
         * states.
         *
         * The extended features have an all zeroes init state. Thus,
         * remove them from 'mask' to zero those features in the user
         * buffer instead of retrieving them from init_fpstate.
         */
        mask = header.xfeatures;

        for_each_extended_xfeature(i, mask) {
                /*
                 * If there was a feature or alignment gap, zero the space
                 * in the destination buffer.
                 */
                if (zerofrom < xstate_offsets[i])
                        membuf_zero(&to, xstate_offsets[i] - zerofrom);

                if (i == XFEATURE_PKRU) {
                        struct pkru_state pkru = {0};
                        /*
                         * PKRU is not necessarily up to date in the
                         * XSAVE buffer. Use the provided value.
                         */
                        pkru.pkru = pkru_val;
                        membuf_write(&to, &pkru, sizeof(pkru));
                } else {
                        membuf_write(&to,
                                     __raw_xsave_addr(xsave, i),
                                     xstate_sizes[i]);
                }
                /*
                 * Keep track of the last copied state in the non-compacted
                 * target buffer for gap zeroing.
                 */
                zerofrom = xstate_offsets[i] + xstate_sizes[i];
        }

out:
        if (to.left)
                membuf_zero(&to, to.left);
}

/**
 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to: membuf descriptor
 * @tsk: The task from which to copy the saved xstate
 * @copy_mode: The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
 *
 * It supports partial copy but @to.pos always starts from zero.
 */
void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
                             enum xstate_copy_mode copy_mode)
{
        __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
                                  tsk->thread.fpu.fpstate->user_xfeatures,
                                  tsk->thread.pkru, copy_mode);
}

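/*
 * Copy @size bytes at @offset from either a kernel buffer (@kbuf) or a
 * user space buffer (@ubuf). The callers supply exactly one of the two.
 */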
static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
                            const void *kbuf, const void __user *ubuf)
{
        if (kbuf) {
                memcpy(dst, kbuf + offset, size);
        } else {
                if (copy_from_user(dst, ubuf + offset, size))
                        return -EFAULT;
        }
        return 0;
}

/**
 * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
 * @fpstate: The fpstate buffer to copy to
 * @kbuf: The UABI format buffer, if it comes from the kernel
 * @ubuf: The UABI format buffer, if it comes from userspace
 * @pkru: The location to write the PKRU value to
 *
 * Converts from the UABI format into the kernel internal hardware
 * dependent format.
 *
 * This function ultimately has three different callers with distinct PKRU
 * behavior.
 * 1. When called from sigreturn the PKRU register will be restored from
 *    @fpstate via an XRSTOR. Correctly copying the UABI format buffer to
 *    @fpstate is sufficient to cover this case, but the caller will also
 *    pass a pointer to the thread_struct's pkru field in @pkru and updating
 *    it is harmless.
 * 2. When called from ptrace the PKRU register will be restored from the
 *    thread_struct's pkru field. A pointer to that is passed in @pkru.
 *    The kernel will restore it manually, so the XRSTOR behavior that resets
 *    the PKRU register to the hardware init value (0) if the corresponding
 *    xfeatures bit is not set is emulated here.
 * 3. When called from KVM the PKRU register will be restored from the vcpu's
 *    pkru field. A pointer to that is passed in @pkru. KVM hasn't used
 *    XRSTOR and hasn't had the PKRU resetting behavior described above. To
 *    preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
 *    bit is not set.
 */
static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
                               const void __user *ubuf, u32 *pkru)
{
        struct xregs_state *xsave = &fpstate->regs.xsave;
        unsigned int offset, size;
        struct xstate_header hdr;
        u64 mask;
        int i;

        offset = offsetof(struct xregs_state, header);
        if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
                return -EFAULT;

        if (validate_user_xstate_header(&hdr, fpstate))
                return -EINVAL;

        /* Validate MXCSR when any of the related features is in use */
        mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
        if (hdr.xfeatures & mask) {
                u32 mxcsr[2];

                offset = offsetof(struct fxregs_state, mxcsr);
                if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
                        return -EFAULT;

                /* Reserved bits in MXCSR must be zero. */
                if (mxcsr[0] & ~mxcsr_feature_mask)
                        return -EINVAL;

                /* SSE and YMM require MXCSR even when FP is not in use. */
                if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
                        xsave->i387.mxcsr = mxcsr[0];
                        xsave->i387.mxcsr_mask = mxcsr[1];
                }
        }

        for (i = 0; i < XFEATURE_MAX; i++) {
                mask = BIT_ULL(i);

                if (hdr.xfeatures & mask) {
                        void *dst = __raw_xsave_addr(xsave, i);

                        offset = xstate_offsets[i];
                        size = xstate_sizes[i];

                        if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
                                return -EFAULT;
                }
        }

        if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
                struct pkru_state *xpkru;

                xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
                *pkru = xpkru->pkru;
        } else {
                /*
                 * KVM may pass NULL here to indicate that it does not need
                 * PKRU updated.
                 */
                if (pkru)
                        *pkru = 0;
        }

        /*
         * The state that came in from userspace was user-state only.
         * Mask all the user states out of 'xfeatures':
         */
        xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;

        /*
         * Add back in the features that came in from userspace:
         */
        xsave->header.xfeatures |= hdr.xfeatures;

        return 0;
}

/*
 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
 * format and copy to the target thread. Used by ptrace and KVM.
 */
int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
{
        return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
}

/*
 * Convert from a sigreturn standard-format user-space buffer to kernel
 * XSAVE[S] format and copy to the target thread. This is called from the
 * sigreturn() and rt_sigreturn() system calls.
 */
int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
                                      const void __user *ubuf)
{
        return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
}

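/*
 * Ensure that @mask is non-zero, contains only independent features and
 * that XSAVES is actually available:
 */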
static bool validate_independent_components(u64 mask)
{
        u64 xchk;

        if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
                return false;

        xchk = ~xfeatures_mask_independent();

        if (WARN_ON_ONCE(!mask || mask & xchk))
                return false;

        return true;
}

/**
 * xsaves - Save selected components to a kernel xstate buffer
 * @xstate: Pointer to the buffer
 * @mask: Feature mask to select the components to save
 *
 * The @xstate buffer must be 64 byte aligned and correctly initialized as
 * XSAVES does not write the full xstate header. Before first use the
 * buffer should be zeroed, otherwise a consecutive XRSTORS from that
 * buffer can #GP.
 *
 * The feature mask must be a subset of the independent features.
 */
void xsaves(struct xregs_state *xstate, u64 mask)
{
        int err;

        if (!validate_independent_components(mask))
                return;

        XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
        WARN_ON_ONCE(err);
}

/**
 * xrstors - Restore selected components from a kernel xstate buffer
 * @xstate: Pointer to the buffer
 * @mask: Feature mask to select the components to restore
 *
 * The @xstate buffer must be 64 byte aligned and correctly initialized
 * otherwise XRSTORS from that buffer can #GP.
 *
 * Proper usage is to restore the state which was saved with
 * xsaves() into @xstate.
 *
 * The feature mask must be a subset of the independent features.
 */
void xrstors(struct xregs_state *xstate, u64 mask)
{
        int err;

        if (!validate_independent_components(mask))
                return;

        XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
        WARN_ON_ONCE(err);
}

#if IS_ENABLED(CONFIG_KVM)
void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
{
        void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);

        if (addr)
                memset(addr, 0, xstate_sizes[xfeature]);
}
EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
#endif

#ifdef CONFIG_X86_64

#ifdef CONFIG_X86_DEBUG_FPU
/*
 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
 * can safely operate on the @fpstate buffer.
 */
static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
{
        u64 xfd = __this_cpu_read(xfd_state);

        if (fpstate->xfd == xfd)
                return true;

        /*
         * The XFD MSR does not match fpstate->xfd. That's invalid when
         * the passed in fpstate is current's fpstate.
         */
        if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
                return false;

        /*
         * XRSTOR(S) from init_fpstate are always correct as it will just
         * bring all components into init state and not read from the
         * buffer. XSAVE(S) raises #PF after init.
         */
        if (fpstate == &init_fpstate)
                return rstor;

        /*
         * XSAVE(S): clone(), fpu_swap_kvm_fpu()
         * XRSTOR(S): fpu_swap_kvm_fpu()
         */

        /*
         * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
         * the buffer area for XFD-disabled state components.
         */
        mask &= ~xfd;

        /*
         * Remove features which are valid in fpstate. They
         * have space allocated in fpstate.
         */
        mask &= ~fpstate->xfeatures;

        /*
         * Any remaining state components in 'mask' might be written
         * by XSAVE/XRSTOR. Fail validation if found.
         */
        return !mask;
}

void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
{
        WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
}
#endif /* CONFIG_X86_DEBUG_FPU */

static int __init xfd_update_static_branch(void)
{
        /*
         * If init_fpstate.xfd has bits set then dynamic features are
         * available and the dynamic sizing must be enabled.
         */
        if (init_fpstate.xfd)
                static_branch_enable(&__fpu_state_size_dynamic);
        return 0;
}
arch_initcall(xfd_update_static_branch)

void fpstate_free(struct fpu *fpu)
{
        if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
                vfree(fpu->fpstate);
}

/**
 * fpstate_realloc - Reallocate struct fpstate for the requested new features
 *
 * @xfeatures: A bitmap of xstate features which extend the enabled features
 *	of that task
 * @ksize: The required size for the kernel buffer
 * @usize: The required size for user space buffers
 * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations
 *
 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
 * with large states are likely to live longer.
 *
 * Returns: 0 on success, -ENOMEM on allocation error.
 */
static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
                           unsigned int usize, struct fpu_guest *guest_fpu)
{
        struct fpu *fpu = &current->thread.fpu;
        struct fpstate *curfps, *newfps = NULL;
        unsigned int fpsize;
        bool in_use;

        fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);

        newfps = vzalloc(fpsize);
        if (!newfps)
                return -ENOMEM;
        newfps->size = ksize;
        newfps->user_size = usize;
        newfps->is_valloc = true;

        /*
         * When a guest FPU is supplied, use @guest_fpu->fpstate
         * as reference independent of whether it is in use or not.
         */
        curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;

        /* Determine whether @curfps is the active fpstate */
        in_use = fpu->fpstate == curfps;

        if (guest_fpu) {
                newfps->is_guest = true;
                newfps->is_confidential = curfps->is_confidential;
                newfps->in_use = curfps->in_use;
                guest_fpu->xfeatures |= xfeatures;
                guest_fpu->uabi_size = usize;
        }

        fpregs_lock();
        /*
         * If @curfps is in use, ensure that the current state is in the
         * registers before swapping fpstate as that might invalidate it
         * due to layout changes.
         */
        if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
                fpregs_restore_userregs();

        newfps->xfeatures = curfps->xfeatures | xfeatures;
        newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
        newfps->xfd = curfps->xfd & ~xfeatures;

        /* Do the final updates within the locked region */
        xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);

        if (guest_fpu) {
                guest_fpu->fpstate = newfps;
                /* If curfps is active, update the FPU fpstate pointer */
                if (in_use)
                        fpu->fpstate = newfps;
        } else {
                fpu->fpstate = newfps;
        }

        if (in_use)
                xfd_update_state(fpu->fpstate);
        fpregs_unlock();

        /* Only free valloc'ed state */
        if (curfps && curfps->is_valloc)
                vfree(curfps);

        return 0;
}

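/*
 * Check that the sigaltstack of every thread in the group is large
 * enough for the signal frame which results from growing the user
 * state size to @usize bytes:
 */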
1569 | static int validate_sigaltstack(unsigned int usize) |
1570 | { |
1571 | struct task_struct *thread, *leader = current->group_leader; |
1572 | unsigned long framesize = get_sigframe_size(); |
1573 | |
1574 | lockdep_assert_held(¤t->sighand->siglock); |
1575 | |
1576 | /* get_sigframe_size() is based on fpu_user_cfg.max_size */ |
1577 | framesize -= fpu_user_cfg.max_size; |
1578 | framesize += usize; |
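	/*
	 * Illustrative arithmetic (numbers hypothetical): with a 13 KiB
	 * signal frame of which 11 KiB is fpu_user_cfg.max_size, a
	 * requested 10 KiB user buffer yields 13 - 11 + 10 = 12 KiB.
	 * Any thread whose sigaltstack is smaller than that fails the
	 * check below with -ENOSPC.
	 */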
	for_each_thread(leader, thread) {
		if (thread->sas_ss_size && thread->sas_ss_size < framesize)
			return -ENOSPC;
	}
	return 0;
}

static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
{
	/*
	 * This deliberately does not exclude !XSAVES as we still might
	 * decide to optionally context switch XCR0 or talk the silicon
	 * vendors into extending XFD for the pre-AMX states, especially
	 * AVX-512.
	 */
	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
	struct fpu *fpu = &current->group_leader->thread.fpu;
	struct fpu_state_perm *perm;
	unsigned int ksize, usize;
	u64 mask;
	int ret = 0;

	/* Check whether the request is already fully permitted */
	if ((permitted & requested) == requested)
		return 0;

	/* Calculate the resulting kernel state size */
	mask = permitted | requested;
	/* Take supervisor states into account on the host */
	if (!guest)
		mask |= xfeatures_mask_supervisor();
	ksize = xstate_calculate_size(mask, compacted);

	/* Calculate the resulting user state size */
	mask &= XFEATURE_MASK_USER_SUPPORTED;
	usize = xstate_calculate_size(mask, false);

	if (!guest) {
		ret = validate_sigaltstack(usize);
		if (ret)
			return ret;
	}

	perm = guest ? &fpu->guest_perm : &fpu->perm;
	/* Pairs with the READ_ONCE() in xstate_get_group_perm() */
	WRITE_ONCE(perm->__state_perm, mask);
	/* Protected by sighand lock */
	perm->__state_size = ksize;
	perm->__user_state_size = usize;
	return ret;
}

/*
 * Permissions array to map facilities with more than one component
 */
static const u64 xstate_prctl_req[XFEATURE_MAX] = {
	[XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
};

static int xstate_request_perm(unsigned long idx, bool guest)
{
	u64 permitted, requested;
	int ret;

	if (idx >= XFEATURE_MAX)
		return -EINVAL;

	/*
	 * Look up the facility mask which can require more than
	 * one xstate component.
	 */
	idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
	requested = xstate_prctl_req[idx];
	if (!requested)
		return -EOPNOTSUPP;

	if ((fpu_user_cfg.max_features & requested) != requested)
		return -EOPNOTSUPP;

	/* Lockless quick check */
	permitted = xstate_get_group_perm(guest);
	if ((permitted & requested) == requested)
		return 0;

	/* Protect against concurrent modifications */
	spin_lock_irq(&current->sighand->siglock);
	permitted = xstate_get_group_perm(guest);

	/* The first vCPU allocation locks the permissions. */
	if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
		ret = -EBUSY;
	else
		ret = __xstate_request_perm(permitted, requested, guest);
	spin_unlock_irq(&current->sighand->siglock);
	return ret;
}

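/*
 * Invoked when a task trips an XFD fault by touching a disarmed dynamic
 * feature; @xfd_err is the hardware-reported bitmap of the faulting
 * features (IA32_XFD_ERR). Host tasks arrive here via the #NM exception
 * path; guest fpstates via KVM's FPU helpers.
 */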
int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
{
	u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
	struct fpu_state_perm *perm;
	unsigned int ksize, usize;
	struct fpu *fpu;

	if (!xfd_event) {
		if (!guest_fpu)
			pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
		return 0;
	}

	/* Protect against concurrent modifications */
	spin_lock_irq(&current->sighand->siglock);

	/* If not permitted let it die */
	if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
		spin_unlock_irq(&current->sighand->siglock);
		return -EPERM;
	}

	fpu = &current->group_leader->thread.fpu;
	perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
	ksize = perm->__state_size;
	usize = perm->__user_state_size;

	/*
	 * The feature is permitted and the state size is sufficient.
	 * Dropping the lock is safe here even if more features are added
	 * from another task; the retrieved buffer sizes are valid for
	 * the currently requested feature(s).
	 */
	spin_unlock_irq(&current->sighand->siglock);

	/*
	 * Try to allocate a new fpstate. If that fails there is no way
	 * out.
	 */
	if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
		return -EFAULT;
	return 0;
}

int xfd_enable_feature(u64 xfd_err)
{
	return __xfd_enable_feature(xfd_err, NULL);
}

#else /* CONFIG_X86_64 */
static inline int xstate_request_perm(unsigned long idx, bool guest)
{
	return -EPERM;
}
#endif /* !CONFIG_X86_64 */

u64 xstate_get_guest_group_perm(void)
{
	return xstate_get_group_perm(true);
}
EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);

/**
 * fpu_xstate_prctl - xstate permission operations
 * @option:	A subfunction of arch_prctl()
 * @arg2:	option argument
 * Return:	0 if successful; otherwise, an error code
 *
 * Option arguments:
 *
 * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
 * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
 * ARCH_REQ_XCOMP_PERM: Facility number requested
 *
 * For facilities which require more than one XSTATE component, the request
 * must be the highest state component number related to that facility,
 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
 */
long fpu_xstate_prctl(int option, unsigned long arg2)
{
	u64 __user *uptr = (u64 __user *)arg2;
	u64 permitted, supported;
	unsigned long idx = arg2;
	bool guest = false;

	switch (option) {
	case ARCH_GET_XCOMP_SUPP:
		supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
		return put_user(supported, uptr);

	case ARCH_GET_XCOMP_PERM:
		/*
		 * Lockless snapshot as it can also change right after
		 * dropping the lock.
		 */
		permitted = xstate_get_host_group_perm();
		permitted &= XFEATURE_MASK_USER_SUPPORTED;
		return put_user(permitted, uptr);

	case ARCH_GET_XCOMP_GUEST_PERM:
		permitted = xstate_get_guest_group_perm();
		permitted &= XFEATURE_MASK_USER_SUPPORTED;
		return put_user(permitted, uptr);

	case ARCH_REQ_XCOMP_GUEST_PERM:
		guest = true;
		fallthrough;

	case ARCH_REQ_XCOMP_PERM:
		if (!IS_ENABLED(CONFIG_X86_64))
			return -EOPNOTSUPP;

		return xstate_request_perm(idx, guest);

	default:
		return -EINVAL;
	}
}
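
/*
 * Illustrative userspace usage (a sketch, not kernel code; assumes an
 * x86-64 libc with syscall(2) and the uapi <asm/prctl.h> constants).
 * A task requests AMX permission with the highest AMX component
 * number, XFEATURE_XTILE_DATA (18), before touching tile registers:
 *
 *	#include <err.h>
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, 18UL))
 *		err(1, "ARCH_REQ_XCOMP_PERM");
 *
 * Once permission is granted, the first AMX use raises an XFD #NM
 * fault and __xfd_enable_feature() enlarges the fpstate on demand.
 */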

#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
 * Report the amount of time in milliseconds since the task last used
 * AVX-512.
 */
static void avx512_status(struct seq_file *m, struct task_struct *task)
{
	unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
	long delta;

	if (!timestamp) {
		/*
		 * Report -1 if no AVX-512 usage
		 */
		delta = -1;
	} else {
		delta = (long)(jiffies - timestamp);
		/*
		 * Cap to LONG_MAX if time difference > LONG_MAX
		 */
		if (delta < 0)
			delta = LONG_MAX;
		delta = jiffies_to_msecs(delta);
	}

	seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
	seq_putc(m, '\n');
}
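
/*
 * Resulting /proc/<pid>/arch_status line (value illustrative; -1 means
 * the task never used AVX-512):
 *
 *	AVX512_elapsed_ms:	120
 */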

/*
 * Report architecture specific information
 */
int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
			 struct pid *pid, struct task_struct *task)
{
	/*
	 * Report AVX-512 state if the processor and the build option
	 * support it.
	 */
	if (cpu_feature_enabled(X86_FEATURE_AVX512F))
		avx512_status(m, task);

	return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */