process_64.c source code [linux/arch/x86/kernel/process_64.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Copyright (C) 1995 Linus Torvalds
4	*
5	* Pentium III FXSR, SSE support
6	* Gareth Hughes <gareth@valinux.com>, May 2000
7	*
8	* X86-64 port
9	* Andi Kleen.
10	*
11	* CPU hotplug support - ashok.raj@intel.com
12	*/
13
14	/*
15	* This file handles the architecture-dependent parts of process handling.
16	*/
17
18	#include <linux/cpu.h>
19	#include <linux/errno.h>
20	#include <linux/sched.h>
21	#include <linux/sched/task.h>
22	#include <linux/sched/task_stack.h>
23	#include <linux/fs.h>
24	#include <linux/kernel.h>
25	#include <linux/mm.h>
26	#include <linux/elfcore.h>
27	#include <linux/smp.h>
28	#include <linux/slab.h>
29	#include <linux/user.h>
30	#include <linux/interrupt.h>
31	#include <linux/delay.h>
32	#include <linux/export.h>
33	#include <linux/ptrace.h>
34	#include <linux/notifier.h>
35	#include <linux/kprobes.h>
36	#include <linux/kdebug.h>
37	#include <linux/prctl.h>
38	#include <linux/uaccess.h>
39	#include <linux/io.h>
40	#include <linux/ftrace.h>
41	#include <linux/syscalls.h>
42	#include <linux/iommu.h>
43
44	#include <asm/processor.h>
45	#include <asm/pkru.h>
46	#include <asm/fpu/sched.h>
47	#include <asm/mmu_context.h>
48	#include <asm/prctl.h>
49	#include <asm/desc.h>
50	#include <asm/proto.h>
51	#include <asm/ia32.h>
52	#include <asm/debugreg.h>
53	#include <asm/switch_to.h>
54	#include <asm/xen/hypervisor.h>
55	#include <asm/vdso.h>
56	#include <asm/resctrl.h>
57	#include <asm/unistd.h>
58	#include <asm/fsgsbase.h>
59	#include <asm/fred.h>
60	#ifdef CONFIG_IA32_EMULATION
61	/* Not included via unistd.h */
62	#include <asm/unistd_32_ia32.h>
63	#endif
64
65	#include "process.h"
66
67	/ Prints also some state that isn't saved in the pt_regs /
68	void __show_regs(struct pt_regs regs, enum* show_regs_mode mode,
69	const char *log_lvl)
70	{
71	unsigned long cr0 = `0L`, cr2 = `0L`, cr3 = `0L`, cr4 = `0L`, fs, gs, shadowgs;
72	unsigned long d0, d1, d2, d3, d6, d7;
73	unsigned int fsindex, gsindex;
74	unsigned int ds, es;
75
76	show_iret_regs(regs, log_lvl);
77
78	if (regs->orig_ax != -`1`)
79	pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
80	else
81	pr_cont("\n");
82
83	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
84	log_lvl, regs->ax, regs->bx, regs->cx);
85	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
86	log_lvl, regs->dx, regs->si, regs->di);
87	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
88	log_lvl, regs->bp, regs->r8, regs->r9);
89	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
90	log_lvl, regs->r10, regs->r11, regs->r12);
91	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
92	log_lvl, regs->r13, regs->r14, regs->r15);
93
94	if (mode == SHOW_REGS_SHORT)
95	return;
96
97	if (mode == SHOW_REGS_USER) {
98	rdmsrl(MSR_FS_BASE, fs);
99	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
100	printk("%sFS: %016lx GS: %016lx\n",
101	log_lvl, fs, shadowgs);
102	return;
103	}
104
105	asm("movl %%ds,%0" : "=r" (ds));
106	asm("movl %%es,%0" : "=r" (es));
107	asm("movl %%fs,%0" : "=r" (fsindex));
108	asm("movl %%gs,%0" : "=r" (gsindex));
109
110	rdmsrl(MSR_FS_BASE, fs);
111	rdmsrl(MSR_GS_BASE, gs);
112	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
113
114	cr0 = read_cr0();
115	cr2 = read_cr2();
116	cr3 = __read_cr3();
117	cr4 = __read_cr4();
118
119	printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
120	log_lvl, fs, fsindex, gs, gsindex, shadowgs);
121	printk("%sCS: %04x DS: %04x ES: %04x CR0: %016lx\n",
122	log_lvl, regs->cs, ds, es, cr0);
123	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
124	log_lvl, cr2, cr3, cr4);
125
126	get_debugreg(d0, `0`);
127	get_debugreg(d1, `1`);
128	get_debugreg(d2, `2`);
129	get_debugreg(d3, `3`);
130	get_debugreg(d6, `6`);
131	get_debugreg(d7, `7`);
132
133	/ Only print out debug registers if they are in their non-default state. /
134	if (!((d0 == `0`) && (d1 == `0`) && (d2 == `0`) && (d3 == `0`) &&
135	(d6 == DR6_RESERVED) && (d7 == `0x400`))) {
136	printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
137	log_lvl, d0, d1, d2);
138	printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
139	log_lvl, d3, d6, d7);
140	}
141
142	if (cpu_feature_enabled(X86_FEATURE_OSPKE))
143	printk("%sPKRU: %08x\n", log_lvl, read_pkru());
144	}
145
146	void release_thread(struct task_struct *dead_task)
147	{
148	WARN_ON(dead_task->mm);
149	}
150
/* Tells the segment helpers in this file whether to act on FS or on GS. */
enum which_selector {
	FS = 0,
	GS = 1,
};
155
156	/*
157	* Out of line to be protected from kprobes and tracing. If this would be
158	* traced or probed than any access to a per CPU variable happens with
159	* the wrong GS.
160	*
161	* It is not used on Xen paravirt. When paravirt support is needed, it
162	* needs to be renamed with native_ prefix.
163	*/
164	static noinstr unsigned long __rdgsbase_inactive(void)
165	{
166	unsigned long gsbase;
167
168	lockdep_assert_irqs_disabled();
169
170	/*
171	* SWAPGS is no longer needed thus NOT allowed with FRED because
172	* FRED transitions ensure that an operating system can _always_
173	* operate with its own GS base address:
174	* - For events that occur in ring 3, FRED event delivery swaps
175	* the GS base address with the IA32_KERNEL_GS_BASE MSR.
176	* - ERETU (the FRED transition that returns to ring 3) also swaps
177	* the GS base address with the IA32_KERNEL_GS_BASE MSR.
178	*
179	* And the operating system can still setup the GS segment for a
180	* user thread without the need of loading a user thread GS with:
181	* - Using LKGS, available with FRED, to modify other attributes
182	* of the GS segment without compromising its ability always to
183	* operate with its own GS base address.
184	* - Accessing the GS segment base address for a user thread as
185	* before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR.
186	*
187	* Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE
188	* MSR instead of the GS segment’s descriptor cache. As such, the
189	* operating system never changes its runtime GS base address.
190	*/
191	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
192	!cpu_feature_enabled(X86_FEATURE_XENPV)) {
193	native_swapgs();
194	gsbase = rdgsbase();
195	native_swapgs();
196	} else {
197	instrumentation_begin();
198	rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
199	instrumentation_end();
200	}
201
202	return gsbase;
203	}
204
205	/*
206	* Out of line to be protected from kprobes and tracing. If this would be
207	* traced or probed than any access to a per CPU variable happens with
208	* the wrong GS.
209	*
210	* It is not used on Xen paravirt. When paravirt support is needed, it
211	* needs to be renamed with native_ prefix.
212	*/
213	static noinstr void __wrgsbase_inactive(unsigned long gsbase)
214	{
215	lockdep_assert_irqs_disabled();
216
217	if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
218	!cpu_feature_enabled(X86_FEATURE_XENPV)) {
219	native_swapgs();
220	wrgsbase(gsbase);
221	native_swapgs();
222	} else {
223	instrumentation_begin();
224	wrmsrl(MSR_KERNEL_GS_BASE, val: gsbase);
225	instrumentation_end();
226	}
227	}
228
229	/*
230	* Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
231	* not available. The goal is to be reasonably fast on non-FSGSBASE systems.
232	* It's forcibly inlined because it'll generate better code and this function
233	* is hot.
234	*/
235	static __always_inline void save_base_legacy(struct task_struct *prev_p,
236	unsigned short selector,
237	enum which_selector which)
238	{
239	if (likely(selector == `0`)) {
240	/*
241	* On Intel (without X86_BUG_NULL_SEG), the segment base could
242	* be the pre-existing saved base or it could be zero. On AMD
243	* (with X86_BUG_NULL_SEG), the segment base could be almost
244	* anything.
245	*
246	* This branch is very hot (it's hit twice on almost every
247	* context switch between 64-bit programs), and avoiding
248	* the RDMSR helps a lot, so we just assume that whatever
249	* value is already saved is correct. This matches historical
250	* Linux behavior, so it won't break existing applications.
251	*
252	* To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
253	* report that the base is zero, it needs to actually be zero:
254	* see the corresponding logic in load_seg_legacy.
255	*/
256	} else {
257	/*
258	* If the selector is 1, 2, or 3, then the base is zero on
259	* !X86_BUG_NULL_SEG CPUs and could be anything on
260	* X86_BUG_NULL_SEG CPUs. In the latter case, Linux
261	* has never attempted to preserve the base across context
262	* switches.
263	*
264	* If selector > 3, then it refers to a real segment, and
265	* saving the base isn't necessary.
266	*/
267	if (which == FS)
268	prev_p->thread.fsbase = `0`;
269	else
270	prev_p->thread.gsbase = `0`;
271	}
272	}
273
274	static __always_inline void save_fsgs(struct task_struct *task)
275	{
276	savesegment(fs, task->thread.fsindex);
277	savesegment(gs, task->thread.gsindex);
278	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
279	/*
280	* If FSGSBASE is enabled, we can't make any useful guesses
281	* about the base, and user code expects us to save the current
282	* value. Fortunately, reading the base directly is efficient.
283	*/
284	task->thread.fsbase = rdfsbase();
285	task->thread.gsbase = __rdgsbase_inactive();
286	} else {
287	save_base_legacy(prev_p: task, selector: task->thread.fsindex, which: FS);
288	save_base_legacy(prev_p: task, selector: task->thread.gsindex, which: GS);
289	}
290	}
291
292	/*
293	* While a process is running,current->thread.fsbase and current->thread.gsbase
294	* may not match the corresponding CPU registers (see save_base_legacy()).
295	*/
296	void current_save_fsgs(void)
297	{
298	unsigned long flags;
299
300	/ Interrupts need to be off for FSGSBASE /
301	local_irq_save(flags);
302	save_fsgs(current);
303	local_irq_restore(flags);
304	}
305	#if IS_ENABLED(CONFIG_KVM)
306	EXPORT_SYMBOL_GPL(current_save_fsgs);
307	#endif
308
309	static __always_inline void loadseg(enum which_selector which,
310	unsigned short sel)
311	{
312	if (which == FS)
313	loadsegment(fs, sel);
314	else
315	load_gs_index(gs: sel);
316	}
317
318	static __always_inline void load_seg_legacy(unsigned short prev_index,
319	unsigned long prev_base,
320	unsigned short next_index,
321	unsigned long next_base,
322	enum which_selector which)
323	{
324	if (likely(next_index <= `3`)) {
325	/*
326	* The next task is using 64-bit TLS, is not using this
327	* segment at all, or is having fun with arcane CPU features.
328	*/
329	if (next_base == `0`) {
330	/*
331	* Nasty case: on AMD CPUs, we need to forcibly zero
332	* the base.
333	*/
334	if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
335	loadseg(which, __USER_DS);
336	loadseg(which, sel: next_index);
337	} else {
338	/*
339	* We could try to exhaustively detect cases
340	* under which we can skip the segment load,
341	* but there's really only one case that matters
342	* for performance: if both the previous and
343	* next states are fully zeroed, we can skip
344	* the load.
345	*
346	* (This assumes that prev_base == 0 has no
347	* false positives. This is the case on
348	* Intel-style CPUs.)
349	*/
350	if (likely(prev_index \| next_index \| prev_base))
351	loadseg(which, sel: next_index);
352	}
353	} else {
354	if (prev_index != next_index)
355	loadseg(which, sel: next_index);
356	wrmsrl(msr: which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
357	val: next_base);
358	}
359	} else {
360	/*
361	* The next task is using a real segment. Loading the selector
362	* is sufficient.
363	*/
364	loadseg(which, sel: next_index);
365	}
366	}
367
368	/*
369	* Store prev's PKRU value and load next's PKRU value if they differ. PKRU
370	* is not XSTATE managed on context switch because that would require a
371	* lookup in the task's FPU xsave buffer and require to keep that updated
372	* in various places.
373	*/
374	static __always_inline void x86_pkru_load(struct thread_struct *prev,
375	struct thread_struct *next)
376	{
377	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
378	return;
379
380	/ Stash the prev task's value: /
381	prev->pkru = rdpkru();
382
383	/*
384	* PKRU writes are slightly expensive. Avoid them when not
385	* strictly necessary:
386	*/
387	if (prev->pkru != next->pkru)
388	wrpkru(pkru: next->pkru);
389	}
390
391	static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
392	struct thread_struct *next)
393	{
394	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
395	/ Update the FS and GS selectors if they could have changed. /
396	if (unlikely(prev->fsindex \|\| next->fsindex))
397	loadseg(which: FS, sel: next->fsindex);
398	if (unlikely(prev->gsindex \|\| next->gsindex))
399	loadseg(which: GS, sel: next->gsindex);
400
401	/ Update the bases. /
402	wrfsbase(fsbase: next->fsbase);
403	__wrgsbase_inactive(gsbase: next->gsbase);
404	} else {
405	load_seg_legacy(prev_index: prev->fsindex, prev_base: prev->fsbase,
406	next_index: next->fsindex, next_base: next->fsbase, which: FS);
407	load_seg_legacy(prev_index: prev->gsindex, prev_base: prev->gsbase,
408	next_index: next->gsindex, next_base: next->gsbase, which: GS);
409	}
410	}
411
412	unsigned long x86_fsgsbase_read_task(struct task_struct *task,
413	unsigned short selector)
414	{
415	unsigned short idx = selector >> `3`;
416	unsigned long base;
417
418	if (likely((selector & SEGMENT_TI_MASK) == `0`)) {
419	if (unlikely(idx >= GDT_ENTRIES))
420	return `0`;
421
422	/*
423	* There are no user segments in the GDT with nonzero bases
424	* other than the TLS segments.
425	*/
426	if (idx < GDT_ENTRY_TLS_MIN \|\| idx > GDT_ENTRY_TLS_MAX)
427	return `0`;
428
429	idx -= GDT_ENTRY_TLS_MIN;
430	base = get_desc_base(desc: &task->thread.tls_array[idx]);
431	} else {
432	#ifdef CONFIG_MODIFY_LDT_SYSCALL
433	struct ldt_struct *ldt;
434
435	/*
436	* If performance here mattered, we could protect the LDT
437	* with RCU. This is a slow path, though, so we can just
438	* take the mutex.
439	*/
440	mutex_lock(&task->mm->context.lock);
441	ldt = task->mm->context.ldt;
442	if (unlikely(!ldt \|\| idx >= ldt->nr_entries))
443	base = `0`;
444	else
445	base = get_desc_base(desc: ldt->entries + idx);
446	mutex_unlock(lock: &task->mm->context.lock);
447	#else
448	base = `0`;
449	#endif
450	}
451
452	return base;
453	}
454
455	unsigned long x86_gsbase_read_cpu_inactive(void)
456	{
457	unsigned long gsbase;
458
459	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
460	unsigned long flags;
461
462	local_irq_save(flags);
463	gsbase = __rdgsbase_inactive();
464	local_irq_restore(flags);
465	} else {
466	rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
467	}
468
469	return gsbase;
470	}
471
472	void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
473	{
474	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
475	unsigned long flags;
476
477	local_irq_save(flags);
478	__wrgsbase_inactive(gsbase);
479	local_irq_restore(flags);
480	} else {
481	wrmsrl(MSR_KERNEL_GS_BASE, val: gsbase);
482	}
483	}
484
485	unsigned long x86_fsbase_read_task(struct task_struct *task)
486	{
487	unsigned long fsbase;
488
489	if (task == current)
490	fsbase = x86_fsbase_read_cpu();
491	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) \|\|
492	(task->thread.fsindex == `0`))
493	fsbase = task->thread.fsbase;
494	else
495	fsbase = x86_fsgsbase_read_task(task, selector: task->thread.fsindex);
496
497	return fsbase;
498	}
499
500	unsigned long x86_gsbase_read_task(struct task_struct *task)
501	{
502	unsigned long gsbase;
503
504	if (task == current)
505	gsbase = x86_gsbase_read_cpu_inactive();
506	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) \|\|
507	(task->thread.gsindex == `0`))
508	gsbase = task->thread.gsbase;
509	else
510	gsbase = x86_fsgsbase_read_task(task, selector: task->thread.gsindex);
511
512	return gsbase;
513	}
514
515	void x86_fsbase_write_task(struct task_struct task, unsigned* long fsbase)
516	{
517	WARN_ON_ONCE(task == current);
518
519	task->thread.fsbase = fsbase;
520	}
521
522	void x86_gsbase_write_task(struct task_struct task, unsigned* long gsbase)
523	{
524	WARN_ON_ONCE(task == current);
525
526	task->thread.gsbase = gsbase;
527	}
528
529	static void
530	start_thread_common(struct pt_regs regs, unsigned* long new_ip,
531	unsigned long new_sp,
532	u16 _cs, u16 _ss, u16 _ds)
533	{
534	WARN_ON_ONCE(regs != current_pt_regs());
535
536	if (static_cpu_has(X86_BUG_NULL_SEG)) {
537	/ Loading zero below won't clear the base. /
538	loadsegment(fs, __USER_DS);
539	load_gs_index(__USER_DS);
540	}
541
542	reset_thread_features();
543
544	loadsegment(fs, `0`);
545	loadsegment(es, _ds);
546	loadsegment(ds, _ds);
547	load_gs_index(gs: `0`);
548
549	regs->ip = new_ip;
550	regs->sp = new_sp;
551	regs->csx = _cs;
552	regs->ssx = _ss;
553	/*
554	* Allow single-step trap and NMI when starting a new task, thus
555	* once the new task enters user space, single-step trap and NMI
556	* are both enabled immediately.
557	*
558	* Entering a new task is logically speaking a return from a
559	* system call (exec, fork, clone, etc.). As such, if ptrace
560	* enables single stepping a single step exception should be
561	* allowed to trigger immediately upon entering user space.
562	* This is not optional.
563	*
564	* NMI should never be disabled in user space. As such, this
565	* is an optional, opportunistic way to catch errors.
566	*
567	* Paranoia: High-order 48 bits above the lowest 16 bit SS are
568	* discarded by the legacy IRET instruction on all Intel, AMD,
569	* and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally,
570	* even when FRED is not enabled. But we choose the safer side
571	* to use these bits only when FRED is enabled.
572	*/
573	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
574	regs->fred_ss.swevent = true;
575	regs->fred_ss.nmi = true;
576	}
577
578	regs->flags = X86_EFLAGS_IF \| X86_EFLAGS_FIXED;
579	}
580
581	void
582	start_thread(struct pt_regs regs, unsigned* long new_ip, unsigned long new_sp)
583	{
584	start_thread_common(regs, new_ip, new_sp,
585	__USER_CS, __USER_DS, ds: `0`);
586	}
587	EXPORT_SYMBOL_GPL(start_thread);
588
#ifdef CONFIG_COMPAT
/*
 * Compat counterpart of start_thread(): @x32 selects __USER_CS, otherwise
 * __USER32_CS is used. SS, DS and ES are all set to __USER_DS.
 */
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
{
	start_thread_common(regs, new_ip, new_sp,
			    x32 ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif
597
598	/*
599	* switch_to(x,y) should switch tasks from x to y.
600	*
601	* This could still be optimized:
602	* - fold all the options into a flag word and test it with a single test.
603	* - could test fs/gs bitsliced
604	*
605	* Kprobes not supported here. Set the probe on schedule instead.
606	* Function graph tracer not supported too.
607	*/
608	__no_kmsan_checks
609	__visible __notrace_funcgraph struct task_struct *
610	__switch_to(struct task_struct prev_p, struct* task_struct *next_p)
611	{
612	struct thread_struct *prev = &prev_p->thread;
613	struct thread_struct *next = &next_p->thread;
614	int cpu = smp_processor_id();
615
616	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
617	this_cpu_read(pcpu_hot.hardirq_stack_inuse));
618
619	if (!test_tsk_thread_flag(tsk: prev_p, TIF_NEED_FPU_LOAD))
620	switch_fpu_prepare(old: prev_p, cpu);
621
622	/ We must save %fs and %gs before load_TLS() because*
623	* %fs and %gs may be cleared by load_TLS().
624	*
625	* (e.g. xen_load_tls())
626	*/
627	save_fsgs(task: prev_p);
628
629	/*
630	* Load TLS before restoring any segments so that segment loads
631	* reference the correct GDT entries.
632	*/
633	load_TLS(t: next, cpu);
634
635	/*
636	* Leave lazy mode, flushing any hypercalls made here. This
637	* must be done after loading TLS entries in the GDT but before
638	* loading segments that might reference them.
639	*/
640	arch_end_context_switch(next: next_p);
641
642	/ Switch DS and ES.*
643	*
644	* Reading them only returns the selectors, but writing them (if
645	* nonzero) loads the full descriptor from the GDT or LDT. The
646	* LDT for next is loaded in switch_mm, and the GDT is loaded
647	* above.
648	*
649	* We therefore need to write new values to the segment
650	* registers on every context switch unless both the new and old
651	* values are zero.
652	*
653	* Note that we don't need to do anything for CS and SS, as
654	* those are saved and restored as part of pt_regs.
655	*/
656	savesegment(es, prev->es);
657	if (unlikely(next->es \| prev->es))
658	loadsegment(es, next->es);
659
660	savesegment(ds, prev->ds);
661	if (unlikely(next->ds \| prev->ds))
662	loadsegment(ds, next->ds);
663
664	x86_fsgsbase_load(prev, next);
665
666	x86_pkru_load(prev, next);
667
668	/*
669	* Switch the PDA and FPU contexts.
670	*/
671	raw_cpu_write(pcpu_hot.current_task, next_p);
672	raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p));
673
674	switch_fpu_finish(new: next_p);
675
676	/ Reload sp0. /
677	update_task_stack(task: next_p);
678
679	switch_to_extra(prev: prev_p, next: next_p);
680
681	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
682	/*
683	* AMD CPUs have a misfeature: SYSRET sets the SS selector but
684	* does not update the cached descriptor. As a result, if we
685	* do SYSRET while SS is NULL, we'll end up in user mode with
686	* SS apparently equal to __USER_DS but actually unusable.
687	*
688	* The straightforward workaround would be to fix it up just
689	* before SYSRET, but that would slow down the system call
690	* fast paths. Instead, we ensure that SS is never NULL in
691	* system call context. We do this by replacing NULL SS
692	* selectors at every context switch. SYSCALL sets up a valid
693	* SS, so the only way to get NULL is to re-enter the kernel
694	* from CPL 3 through an interrupt. Since that can't happen
695	* in the same task as a running syscall, we are guaranteed to
696	* context switch between every interrupt vector entry and a
697	* subsequent SYSRET.
698	*
699	* We read SS first because SS reads are much faster than
700	* writes. Out of caution, we force SS to __KERNEL_DS even if
701	* it previously had a different non-NULL value.
702	*/
703	unsigned short ss_sel;
704	savesegment(ss, ss_sel);
705	if (ss_sel != __KERNEL_DS)
706	loadsegment(ss, __KERNEL_DS);
707	}
708
709	/ Load the Intel cache allocation PQR MSR. /
710	resctrl_sched_in(tsk: next_p);
711
712	return prev_p;
713	}
714
715	void set_personality_64bit(void)
716	{
717	/ inherit personality from parent /
718
719	/ Make sure to be in 64bit mode /
720	clear_thread_flag(TIF_ADDR32);
721	/ Pretend that this comes from a 64bit execve /
722	task_pt_regs(current)->orig_ax = __NR_execve;
723	current_thread_info()->status &= ~TS_COMPAT;
724	if (current->mm)
725	__set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags);
726
727	/ TBD: overwrites user setup. Should have two bits.*
728	But 64bit processes have always behaved this way,
729	so it's not too bad. The main problem is just that
730	32bit children are affected again. /*
731	current->personality &= ~READ_IMPLIES_EXEC;
732	}
733
/*
 * Set up the current task as an x32 task. Compiles to an empty function
 * unless CONFIG_X86_X32_ABI is enabled.
 */
static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32_ABI
	/* Start from a clean set of mm context flags. */
	if (current->mm)
		current->mm->context.flags = 0;

	current->personality &= ~READ_IMPLIES_EXEC;
	/*
	 * in_32bit_syscall() uses the presence of the x32 syscall bit
	 * flag to determine compat status.  The x86 mmap() code relies on
	 * the syscall bitness so set x32 syscall bit right here to make
	 * in_32bit_syscall() work during exec().
	 *
	 * Pretend to come from a x32 execve.
	 */
	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}
753
/*
 * Set up the current task as an ia32 task. Compiles to an empty function
 * unless CONFIG_IA32_EMULATION is enabled.
 */
static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	if (current->mm) {
		/*
		 * uprobes applied to this MM need to know this and
		 * cannot use user_64bit_mode() at that time.
		 */
		__set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags);
	}

	current->personality |= force_personality32;
	/* Prepare the first "return" to user space */
	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}
771
772	void set_personality_ia32(bool x32)
773	{
774	/ Make sure to be in 32bit mode /
775	set_thread_flag(TIF_ADDR32);
776
777	if (x32)
778	__set_personality_x32();
779	else
780	__set_personality_ia32();
781	}
782	EXPORT_SYMBOL_GPL(set_personality_ia32);
783
#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * Map the vDSO described by @image at @addr via map_vdso_once().
 *
 * Returns the image size on success, or the non-zero value returned by
 * map_vdso_once() on failure.
 */
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int ret;

	ret = map_vdso_once(image, addr);
	if (ret)
		return ret;

	return (long)image->size;
}
#endif
796
#ifdef CONFIG_ADDRESS_MASKING

/* Largest tag width, in bits, that prctl_enable_tagged_addr() accepts. */
#define LAM_U57_BITS 6

/*
 * Enable tagged addresses (LAM_U57) for @mm, which must be current->mm.
 *
 * @nr_bits is the requested number of tag bits; any value from 1 to
 * LAM_U57_BITS selects LAM_U57.
 *
 * Returns 0 on success, or:
 *   -ENODEV  X86_FEATURE_LAM is not enabled
 *   -EINVAL  @mm is not current->mm, @mm has a valid PASID without
 *            MM_CONTEXT_FORCE_TAGGED_SVA set, or @nr_bits is 0 or
 *            larger than LAM_U57_BITS
 *   -EINTR   taking the mmap write lock was interrupted
 *   -EBUSY   MM_CONTEXT_LOCK_LAM is already set
 */
static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
{
	if (!cpu_feature_enabled(X86_FEATURE_LAM))
		return -ENODEV;

	/* PTRACE_ARCH_PRCTL */
	if (current->mm != mm)
		return -EINVAL;

	if (mm_valid_pasid(mm) &&
	    !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
		return -EINVAL;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
		mmap_write_unlock(mm);
		return -EBUSY;
	}

	if (!nr_bits) {
		mmap_write_unlock(mm);
		return -EINVAL;
	} else if (nr_bits <= LAM_U57_BITS) {
		mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
		mm->context.untag_mask = ~GENMASK(62, 57);
	} else {
		mmap_write_unlock(mm);
		return -EINVAL;
	}

	/* Apply the new mode on this CPU, then lock the setting in. */
	write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
	set_tlbstate_lam_mode(mm);
	set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);

	mmap_write_unlock(mm);

	return 0;
}
#endif
842
843	long do_arch_prctl_64(struct task_struct task, int* option, unsigned long arg2)
844	{
845	int ret = `0`;
846
847	switch (option) {
848	case ARCH_SET_GS: {
849	if (unlikely(arg2 >= TASK_SIZE_MAX))
850	return -EPERM;
851
852	preempt_disable();
853	/*
854	* ARCH_SET_GS has always overwritten the index
855	* and the base. Zero is the most sensible value
856	* to put in the index, and is the only value that
857	* makes any sense if FSGSBASE is unavailable.
858	*/
859	if (task == current) {
860	loadseg(which: GS, sel: `0`);
861	x86_gsbase_write_cpu_inactive(gsbase: arg2);
862
863	/*
864	* On non-FSGSBASE systems, save_base_legacy() expects
865	* that we also fill in thread.gsbase.
866	*/
867	task->thread.gsbase = arg2;
868
869	} else {
870	task->thread.gsindex = `0`;
871	x86_gsbase_write_task(task, gsbase: arg2);
872	}
873	preempt_enable();
874	break;
875	}
876	case ARCH_SET_FS: {
877	/*
878	* Not strictly needed for %fs, but do it for symmetry
879	* with %gs
880	*/
881	if (unlikely(arg2 >= TASK_SIZE_MAX))
882	return -EPERM;
883
884	preempt_disable();
885	/*
886	* Set the selector to 0 for the same reason
887	* as %gs above.
888	*/
889	if (task == current) {
890	loadseg(which: FS, sel: `0`);
891	x86_fsbase_write_cpu(fsbase: arg2);
892
893	/*
894	* On non-FSGSBASE systems, save_base_legacy() expects
895	* that we also fill in thread.fsbase.
896	*/
897	task->thread.fsbase = arg2;
898	} else {
899	task->thread.fsindex = `0`;
900	x86_fsbase_write_task(task, fsbase: arg2);
901	}
902	preempt_enable();
903	break;
904	}
905	case ARCH_GET_FS: {
906	unsigned long base = x86_fsbase_read_task(task);
907
908	ret = put_user(base, (unsigned long __user *)arg2);
909	break;
910	}
911	case ARCH_GET_GS: {
912	unsigned long base = x86_gsbase_read_task(task);
913
914	ret = put_user(base, (unsigned long __user *)arg2);
915	break;
916	}
917
918	#ifdef CONFIG_CHECKPOINT_RESTORE
919	# ifdef CONFIG_X86_X32_ABI
920	case ARCH_MAP_VDSO_X32:
921	return prctl_map_vdso(image: &vdso_image_x32, addr: arg2);
922	# endif
923	# if defined CONFIG_X86_32 \|\| defined CONFIG_IA32_EMULATION
924	case ARCH_MAP_VDSO_32:
925	return prctl_map_vdso(image: &vdso_image_32, addr: arg2);
926	# endif
927	case ARCH_MAP_VDSO_64:
928	return prctl_map_vdso(image: &vdso_image_64, addr: arg2);
929	#endif
930	#ifdef CONFIG_ADDRESS_MASKING
931	case ARCH_GET_UNTAG_MASK:
932	return put_user(task->mm->context.untag_mask,
933	(unsigned long __user *)arg2);
934	case ARCH_ENABLE_TAGGED_ADDR:
935	return prctl_enable_tagged_addr(mm: task->mm, nr_bits: arg2);
936	case ARCH_FORCE_TAGGED_SVA:
937	if (current != task)
938	return -EINVAL;
939	set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, addr: &task->mm->context.flags);
940	return `0`;
941	case ARCH_GET_MAX_TAG_BITS:
942	if (!cpu_feature_enabled(X86_FEATURE_LAM))
943	return put_user(`0`, (unsigned long __user *)arg2);
944	else
945	return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
946	#endif
947	case ARCH_SHSTK_ENABLE:
948	case ARCH_SHSTK_DISABLE:
949	case ARCH_SHSTK_LOCK:
950	case ARCH_SHSTK_UNLOCK:
951	case ARCH_SHSTK_STATUS:
952	return shstk_prctl(task, option, arg2);
953	default:
954	ret = -EINVAL;
955	break;
956	}
957
958	return ret;
959	}
960
961	SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
962	{
963	long ret;
964
965	ret = do_arch_prctl_64(current, option, arg2);
966	if (ret == -EINVAL)
967	ret = do_arch_prctl_common(option, arg2);
968
969	return ret;
970	}
971
#ifdef CONFIG_IA32_EMULATION
/* Compat arch_prctl() entry point: only the common handler is consulted. */
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	long rc = do_arch_prctl_common(option, arg2);

	return rc;
}
#endif
978
979	unsigned long KSTK_ESP(struct task_struct *task)
980	{
981	return task_pt_regs(task)->sp;
982	}
983

source code of linux/arch/x86/kernel/process_64.c