// SPDX-License-Identifier: GPL-2.0-only
/*
 * kexec.c - kexec system call core code.
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/btf.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/utsname.h>
#include <linux/numa.h>
#include <linux/suspend.h>
#include <linux/device.h>
#include <linux/freezer.h>
#include <linux/panic_notifier.h>
#include <linux/pm.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/console.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/syscore_ops.h>
#include <linux/compiler.h>
#include <linux/hugetlb.h>
#include <linux/objtool.h>
#include <linux/kmsg_dump.h>

#include <asm/page.h>
#include <asm/sections.h>

#include <crypto/hash.h>
#include "kexec_internal.h"

atomic_t __kexec_lock = ATOMIC_INIT(0);

/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;

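/*
 * kexec_should_crash - check whether a dying task should trigger crash_kexec()
 * @p: the task that is dying
 *
 * Returns 1 when the oops/die path should start the loaded crash kernel
 * immediately, and 0 when the decision is deferred to panic() (see
 * crash_kexec_post_notifiers) or none of the fatal conditions below apply.
 */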
int kexec_should_crash(struct task_struct *p)
{
	/*
	 * If crash_kexec_post_notifiers is enabled, don't run
	 * crash_kexec() here yet, which must be run after panic
	 * notifiers in panic().
	 */
	if (crash_kexec_post_notifiers)
		return 0;
	/*
	 * There are 4 panic() calls in make_task_dead() path, each of which
	 * corresponds to each of these 4 conditions.
	 */
	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
		return 1;
	return 0;
}

int kexec_crash_loaded(void)
{
	return !!kexec_crash_image;
}
EXPORT_SYMBOL_GPL(kexec_crash_loaded);

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses. On processors
 * where you can disable the MMU this is trivial and easy. For
 * others it is still a simple, predictable page table to set up.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place. This means I can only support memory whose
 * physical address can fit in an unsigned long. In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages. As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it). The end product of this is that most of the
 * physical address space, and most of RAM, can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */
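/*
 * Illustrative sketch of the entry list built below (see kimage_add_entry()
 * and for_each_kimage_entry()): each entry is a page-aligned physical
 * address with a type flag in the low bits:
 *
 *	dest | IND_DESTINATION	set the running destination address
 *	src  | IND_SOURCE	copy this source page to the destination,
 *				then advance the destination by one page
 *	next | IND_INDIRECTION	continue in another page of entries
 *	IND_DONE		end of the list
 */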

/*
 * KIMAGE_NO_DEST is an impossible destination address, used for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)
#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

static struct page *kimage_alloc_page(struct kimage *image,
				       gfp_t gfp_mask,
				       unsigned long dest);

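/*
 * sanity_check_segment_list - validate the destination layout of an image
 * @image: the kimage whose segments are checked
 *
 * Checks that every segment is page aligned, below the architecture's
 * destination limit and non-overlapping, that each buffer fits inside its
 * segment (bufsz <= memsz), and that the total does not consume more than
 * half of RAM. Crash images must additionally fit entirely inside the
 * reserved crashkernel region.
 */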
int sanity_check_segment_list(struct kimage *image)
{
	int i;
	unsigned long nr_segments = image->nr_segments;
	unsigned long total_pages = 0;
	unsigned long nr_pages = totalram_pages();

	/*
	 * Verify we have good destination addresses. The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM. This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned. Too many
	 * special cases crop up when we don't do this. The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if (mstart > mend)
			return -EADDRNOTAVAIL;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			return -EADDRNOTAVAIL;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			return -EADDRNOTAVAIL;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stops on another.
	 */
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;

			pstart = image->segment[j].mem;
			pend = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				return -EINVAL;
		}
	}

	/* Ensure our buffer sizes are no larger than
	 * our memory sizes. This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			return -EINVAL;
	}

	/*
	 * Verify that no more than half of memory will be consumed. If the
	 * request from userspace is too large, a large amount of time will be
	 * wasted allocating pages, which can cause a soft lockup.
	 */
	for (i = 0; i < nr_segments; i++) {
		if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2)
			return -EINVAL;

		total_pages += PAGE_COUNT(image->segment[i].memsz);
	}

	if (total_pages > nr_pages / 2)
		return -EINVAL;

	/*
	 * Verify we have good destination addresses. Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM. But crash kernels are preloaded into a
	 * reserved area of RAM. We must ensure the addresses
	 * are in the reserved area, otherwise preloading the
	 * kernel could corrupt things.
	 */

	if (image->type == KEXEC_TYPE_CRASH) {
		for (i = 0; i < nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend = mstart + image->segment[i].memsz - 1;
			/* Ensure we are within the crash kernel limits */
			if ((mstart < phys_to_boot_phys(crashk_res.start)) ||
			    (mend > phys_to_boot_phys(crashk_res.end)))
				return -EADDRNOTAVAIL;
		}
	}

	return 0;
}

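/*
 * do_kimage_alloc_init - allocate and initialize a bare struct kimage
 *
 * Returns a zeroed kimage with its entry pointers aimed at image->head,
 * an unset control page, the default (non-crash) type and empty control,
 * destination and unusable page lists, or NULL on allocation failure.
 */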
struct kimage *do_kimage_alloc_init(void)
{
	struct kimage *image;

	/* Allocate a controlling structure */
	image = kzalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		return NULL;

	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unusable pages */
	INIT_LIST_HEAD(&image->unusable_pages);

#ifdef CONFIG_CRASH_HOTPLUG
	image->hp_action = KEXEC_CRASH_HP_NONE;
	image->elfcorehdr_index = -1;
	image->elfcorehdr_updated = false;
#endif

	return image;
}

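/*
 * kimage_is_destination_range - test a physical range against the segments
 * @image: the kimage to check against
 * @start: start of the range
 * @end: end of the range
 *
 * Returns 1 if [start, end) overlaps the destination memory of any segment,
 * 0 otherwise.
 */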
int kimage_is_destination_range(struct kimage *image,
				unsigned long start,
				unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend))
			return 1;
	}

	return 0;
}

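/*
 * kimage_alloc_pages - allocate a block of pages for kexec use
 *
 * Allocates 2^order pages, marks them reserved, stashes the order in
 * page_private() so kimage_free_pages() can undo the allocation, gives the
 * architecture a chance to post-process them (e.g. clear memory encryption)
 * and zeroes them by hand when __GFP_ZERO was requested.
 */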
static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *pages;

	if (fatal_signal_pending(current))
		return NULL;
	pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
	if (pages) {
		unsigned int count, i;

		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);

		arch_kexec_post_alloc_pages(page_address(pages), count,
					    gfp_mask);

		if (gfp_mask & __GFP_ZERO)
			for (i = 0; i < count; i++)
				clear_highpage(pages + i);
	}

	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;

	arch_kexec_pre_free_pages(page_address(page), count);

	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}

void kimage_free_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
						       unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place. As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
		if (!pages)
			break;
		pfn = page_to_boot_pfn(pages);
		epfn = pfn + count;
		addr = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
		    kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address. Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}

static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place. As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel. All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these is for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		cond_resched();

		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			image->control_page = hole_end;
			break;
		}
	}

	/* Ensure that these pages are decrypted if SME is enabled. */
	if (pages)
		arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);

	return pages;
}


struct page *kimage_alloc_control_pages(struct kimage *image,
					unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}

	return pages;
}

int kimage_crash_copy_vmcoreinfo(struct kimage *image)
{
	struct page *vmcoreinfo_page;
	void *safecopy;

	if (image->type != KEXEC_TYPE_CRASH)
		return 0;

	/*
	 * For kdump, allocate one vmcoreinfo safe copy from the
	 * crash memory. As we have arch_kexec_protect_crashkres()
	 * after the kexec syscall, we naturally protect it from write
	 * (even read) access under the kernel direct mapping. But on
	 * the other hand, we still need to operate on it when a crash
	 * happens to generate the vmcoreinfo note, hereby we rely on
	 * vmap for this purpose.
	 */
	vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
	if (!vmcoreinfo_page) {
		pr_warn("Could not allocate vmcoreinfo buffer\n");
		return -ENOMEM;
	}
	safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
	if (!safecopy) {
		pr_warn("Could not vmap vmcoreinfo buffer\n");
		return -ENOMEM;
	}

	image->vmcoreinfo_data_copy = safecopy;
	crash_update_vmcoreinfo_safecopy(safecopy);

	return 0;
}

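/*
 * kimage_add_entry - append one entry to the kimage entry list
 *
 * Writes @entry at the current position and keeps a zero terminator behind
 * it. When the current indirection page is full, a fresh page is allocated,
 * linked in with IND_INDIRECTION, and the list continues there.
 */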
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				    ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

static int kimage_set_destination(struct kimage *image,
				  unsigned long destination)
{
	destination &= PAGE_MASK;

	return kimage_add_entry(image, destination | IND_DESTINATION);
}


static int kimage_add_page(struct kimage *image, unsigned long page)
{
	page &= PAGE_MASK;

	return kimage_add_entry(image, page | IND_SOURCE);
}


static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unusable pages I have cached */
	kimage_free_page_list(&image->unusable_pages);

}

void kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;
}

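/*
 * Walk every entry of the image, transparently following IND_INDIRECTION
 * links and stopping at IND_DONE (or at the zero terminator of an image
 * that has not been terminated yet).
 */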
#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION) ? \
			boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1)

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = boot_pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

	if (image->vmcoreinfo_data_copy) {
		crash_update_vmcoreinfo_safecopy(NULL);
		vunmap(image->vmcoreinfo_data_copy);
	}

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		} else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);

	/*
	 * Free up any temporary buffers allocated. This path can be
	 * reached if an error occurred long after the buffers were
	 * allocated.
	 */
	if (image->file_mode)
		kimage_file_post_load_cleanup(image);

	kfree(image);
}

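/*
 * kimage_dst_used - find the source entry headed for a given destination
 * @image: the kimage to search
 * @page: the destination address of interest
 *
 * Replays the entry list, tracking the implicit destination cursor, and
 * returns a pointer to the IND_SOURCE entry whose data will be copied to
 * @page, or NULL if no source page targets that address.
 */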
static kimage_entry_t *kimage_dst_used(struct kimage *image,
				       unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
				      gfp_t gfp_mask,
				      unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time. If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_boot_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used file it away */
		if (page_to_boot_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unusable_pages);
			continue;
		}
		addr = page_to_boot_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						 addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page. And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it if its
			 * gfp_flags honor the ones passed in.
			 */
			if (!(gfp_mask & __GFP_HIGHMEM) &&
			    PageHighMem(old_page)) {
				kimage_free_pages(old_page);
				continue;
			}
			page = old_page;
			break;
		}
		/* Place the page on the destination list, to be used later */
		list_add(&page->lru, &image->dest_pages);
	}

	return page;
}

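/*
 * kimage_load_normal_segment - stage one segment for a normal kexec
 *
 * For each destination page the segment covers, allocate an intermediate
 * source page, record it in the entry list, and fill it from the user
 * buffer (or the kernel buffer for file based kexec), zero-padding up to
 * memsz. The copy to the final destination happens later, from the control
 * code during the kexec transition.
 */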
static int kimage_load_normal_segment(struct kimage *image,
				      struct kexec_segment *segment)
{
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf = NULL;
	unsigned char *kbuf = NULL;

	if (image->file_mode)
		kbuf = segment->kbuf;
	else
		buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_boot_pfn(page)
								<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap_local_page(page);
		/* Start with a clear page */
		clear_page(ptr);
		ptr += maddr & ~PAGE_MASK;
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		uchunk = min(ubytes, mchunk);

		/* For file based kexec, source pages are in kernel memory */
		if (image->file_mode)
			memcpy(ptr, kbuf, uchunk);
		else
			result = copy_from_user(ptr, buf, uchunk);
		kunmap_local(ptr);
		if (result) {
			result = -EFAULT;
			goto out;
		}
		ubytes -= uchunk;
		maddr += mchunk;
		if (image->file_mode)
			kbuf += mchunk;
		else
			buf += mchunk;
		mbytes -= mchunk;

		cond_resched();
	}
out:
	return result;
}

static int kimage_load_crash_segment(struct kimage *image,
				     struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf = NULL;
	unsigned char *kbuf = NULL;

	result = 0;
	if (image->file_mode)
		kbuf = segment->kbuf;
	else
		buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = boot_pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result = -ENOMEM;
			goto out;
		}
		arch_kexec_post_alloc_pages(page_address(page), 1, 0);
		ptr = kmap_local_page(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		uchunk = min(ubytes, mchunk);
		if (mchunk > uchunk) {
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}

		/* For file based kexec, source pages are in kernel memory */
		if (image->file_mode)
			memcpy(ptr, kbuf, uchunk);
		else
			result = copy_from_user(ptr, buf, uchunk);
		kexec_flush_icache_page(page);
		kunmap_local(ptr);
		arch_kexec_pre_free_pages(page_address(page), 1);
		if (result) {
			result = -EFAULT;
			goto out;
		}
		ubytes -= uchunk;
		maddr += mchunk;
		if (image->file_mode)
			kbuf += mchunk;
		else
			buf += mchunk;
		mbytes -= mchunk;

		cond_resched();
	}
out:
	return result;
}

int kimage_load_segment(struct kimage *image,
			struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}

	return result;
}

struct kexec_load_limit {
	/* Mutex protects the limit count. */
	struct mutex mutex;
	int limit;
};

static struct kexec_load_limit load_limit_reboot = {
	.mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex),
	.limit = -1,
};

static struct kexec_load_limit load_limit_panic = {
	.mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex),
	.limit = -1,
};

struct kimage *kexec_image;
struct kimage *kexec_crash_image;
static int kexec_load_disabled;

#ifdef CONFIG_SYSCTL
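/*
 * Sysctl handler for the kexec load limit counters: reads report the
 * remaining number of loads, writes can set a limit or lower an existing
 * one but never raise it; -1 means unlimited.
 */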
static int kexec_limit_handler(struct ctl_table *table, int write,
			       void *buffer, size_t *lenp, loff_t *ppos)
{
	struct kexec_load_limit *limit = table->data;
	int val;
	struct ctl_table tmp = {
		.data = &val,
		.maxlen = sizeof(val),
		.mode = table->mode,
	};
	int ret;

	if (write) {
		ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
		if (ret)
			return ret;

		if (val < 0)
			return -EINVAL;

		mutex_lock(&limit->mutex);
		if (limit->limit != -1 && val >= limit->limit)
			ret = -EINVAL;
		else
			limit->limit = val;
		mutex_unlock(&limit->mutex);

		return ret;
	}

	mutex_lock(&limit->mutex);
	val = limit->limit;
	mutex_unlock(&limit->mutex);

	return proc_dointvec(&tmp, write, buffer, lenp, ppos);
}

static struct ctl_table kexec_core_sysctls[] = {
	{
		.procname	= "kexec_load_disabled",
		.data		= &kexec_load_disabled,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		/* only handle a transition from default "0" to "1" */
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "kexec_load_limit_panic",
		.data		= &load_limit_panic,
		.mode		= 0644,
		.proc_handler	= kexec_limit_handler,
	},
	{
		.procname	= "kexec_load_limit_reboot",
		.data		= &load_limit_reboot,
		.mode		= 0644,
		.proc_handler	= kexec_limit_handler,
	},
	{ }
};

static int __init kexec_core_sysctl_init(void)
{
	register_sysctl_init("kernel", kexec_core_sysctls);
	return 0;
}
late_initcall(kexec_core_sysctl_init);
#endif

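/*
 * kexec_load_permitted - gate a kexec image load
 * @kexec_image_type: KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH
 *
 * Requires CAP_SYS_BOOT and that loading has not been disabled, then
 * consumes one slot from the matching load-limit counter (reboot or panic).
 * Returns false once that counter has reached zero.
 */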
bool kexec_load_permitted(int kexec_image_type)
{
	struct kexec_load_limit *limit;

	/*
	 * Only the superuser can use the kexec syscall, and only if it
	 * has not been disabled.
	 */
	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
		return false;

	/* Check limit counter and decrease it. */
	limit = (kexec_image_type == KEXEC_TYPE_CRASH) ?
		&load_limit_panic : &load_limit_reboot;
	mutex_lock(&limit->mutex);
	if (!limit->limit) {
		mutex_unlock(&limit->mutex);
		return false;
	}
	if (limit->limit != -1)
		limit->limit--;
	mutex_unlock(&limit->mutex);

	return true;
}

/*
 * No panic_cpu check version of crash_kexec(). This function is called
 * only when panic_cpu holds the current CPU number; this is the only CPU
 * which processes crash_kexec routines.
 */
void __noclone __crash_kexec(struct pt_regs *regs)
{
	/* Take the kexec_lock here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient. But since I reuse the memory...
	 */
	if (kexec_trylock()) {
		if (kexec_crash_image) {
			struct pt_regs fixed_regs;

			crash_setup_regs(&fixed_regs, regs);
			crash_save_vmcoreinfo();
			machine_crash_shutdown(&fixed_regs);
			machine_kexec(kexec_crash_image);
		}
		kexec_unlock();
	}
}
STACK_FRAME_NON_STANDARD(__crash_kexec);

__bpf_kfunc void crash_kexec(struct pt_regs *regs)
{
	int old_cpu, this_cpu;

	/*
	 * Only one CPU is allowed to execute the crash_kexec() code as with
	 * panic(). Otherwise parallel calls of panic() and crash_kexec()
	 * may stop each other. To exclude them, we use panic_cpu here too.
	 */
	this_cpu = raw_smp_processor_id();
	old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
	if (old_cpu == PANIC_CPU_INVALID) {
		/* This is the 1st CPU which comes here, so go ahead. */
		__crash_kexec(regs);

		/*
		 * Reset panic_cpu to allow another panic()/crash_kexec()
		 * call.
		 */
		atomic_set(&panic_cpu, PANIC_CPU_INVALID);
	}
}

static inline resource_size_t crash_resource_size(const struct resource *res)
{
	return !res->end ? 0 : resource_size(res);
}

ssize_t crash_get_memory_size(void)
{
	ssize_t size = 0;

	if (!kexec_trylock())
		return -EBUSY;

	size += crash_resource_size(&crashk_res);
	size += crash_resource_size(&crashk_low_res);

	kexec_unlock();
	return size;
}

static int __crash_shrink_memory(struct resource *old_res,
				 unsigned long new_size)
{
	struct resource *ram_res;

	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
	if (!ram_res)
		return -ENOMEM;

	ram_res->start = old_res->start + new_size;
	ram_res->end = old_res->end;
	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
	ram_res->name = "System RAM";

	if (!new_size) {
		release_resource(old_res);
		old_res->start = 0;
		old_res->end = 0;
	} else {
		crashk_res.end = ram_res->start - 1;
	}

	crash_free_reserved_phys_range(ram_res->start, ram_res->end);
	insert_resource(&iomem_resource, ram_res);

	return 0;
}

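/*
 * crash_shrink_memory - shrink the reserved crash kernel memory to new_size
 *
 * Fails with -ENOENT while a crash image is loaded and with -EINVAL if the
 * rounded-up new size would be larger than the current reservation. The
 * freed tail (from crashk_res and, if necessary, crashk_low_res) is handed
 * back to the system as ordinary RAM.
 */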
int crash_shrink_memory(unsigned long new_size)
{
	int ret = 0;
	unsigned long old_size, low_size;

	if (!kexec_trylock())
		return -EBUSY;

	if (kexec_crash_image) {
		ret = -ENOENT;
		goto unlock;
	}

	low_size = crash_resource_size(&crashk_low_res);
	old_size = crash_resource_size(&crashk_res) + low_size;
	new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN);
	if (new_size >= old_size) {
		ret = (new_size == old_size) ? 0 : -EINVAL;
		goto unlock;
	}

	/*
	 * (low_size > new_size) implies that low_size is greater than zero.
	 * This also means that if low_size is zero, the else branch is taken.
	 *
	 * If low_size is greater than 0, (low_size > new_size) indicates that
	 * crashk_low_res also needs to be shrunk. Otherwise, only crashk_res
	 * needs to be shrunk.
	 */
	if (low_size > new_size) {
		ret = __crash_shrink_memory(&crashk_res, 0);
		if (ret)
			goto unlock;

		ret = __crash_shrink_memory(&crashk_low_res, new_size);
	} else {
		ret = __crash_shrink_memory(&crashk_res, new_size - low_size);
	}

	/* Swap crashk_res and crashk_low_res if needed */
	if (!crashk_res.end && crashk_low_res.end) {
		crashk_res.start = crashk_low_res.start;
		crashk_res.end = crashk_low_res.end;
		release_resource(&crashk_low_res);
		crashk_low_res.start = 0;
		crashk_low_res.end = 0;
		insert_resource(&iomem_resource, &crashk_res);
	}

unlock:
	kexec_unlock();
	return ret;
}

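/*
 * crash_save_cpu - save a CPU's register state for the crash dump
 * @regs: the register state to record
 * @cpu: the CPU the state belongs to
 *
 * Fills an ELF NT_PRSTATUS note in the per-cpu crash_notes buffer so the
 * dump-capture kernel can recover the register state of this CPU.
 */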
void crash_save_cpu(struct pt_regs *regs, int cpu)
{
	struct elf_prstatus prstatus;
	u32 *buf;

	if ((cpu < 0) || (cpu >= nr_cpu_ids))
		return;

	/* Using ELF notes here is opportunistic.
	 * I need a well defined structure format
	 * for the data I pass, and I need tags
	 * on the data to indicate what information I have
	 * squirrelled away. ELF notes happen to provide
	 * all of that, so there is no need to invent something new.
	 */
	buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
	if (!buf)
		return;
	memset(&prstatus, 0, sizeof(prstatus));
	prstatus.common.pr_pid = current->pid;
	elf_core_copy_regs(&prstatus.pr_reg, regs);
	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
			      &prstatus, sizeof(prstatus));
	final_note(buf);
}

/*
 * Move into place and start executing a preloaded standalone
 * executable. If nothing was preloaded return an error.
 */
int kernel_kexec(void)
{
	int error = 0;

	if (!kexec_trylock())
		return -EBUSY;
	if (!kexec_image) {
		error = -EINVAL;
		goto Unlock;
	}

#ifdef CONFIG_KEXEC_JUMP
	if (kexec_image->preserve_context) {
		pm_prepare_console();
		error = freeze_processes();
		if (error) {
			error = -EBUSY;
			goto Restore_console;
		}
		suspend_console();
		error = dpm_suspend_start(PMSG_FREEZE);
		if (error)
			goto Resume_console;
		/* At this point, dpm_suspend_start() has been called,
		 * but *not* dpm_suspend_end(). We *must* call
		 * dpm_suspend_end() now. Otherwise, drivers for
		 * some devices (e.g. interrupt controllers) become
		 * desynchronized with the actual state of the
		 * hardware at resume time, and evil weirdness ensues.
		 */
		error = dpm_suspend_end(PMSG_FREEZE);
		if (error)
			goto Resume_devices;
		error = suspend_disable_secondary_cpus();
		if (error)
			goto Enable_cpus;
		local_irq_disable();
		error = syscore_suspend();
		if (error)
			goto Enable_irqs;
	} else
#endif
	{
		kexec_in_progress = true;
		kernel_restart_prepare("kexec reboot");
		migrate_to_reboot_cpu();

		/*
		 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
		 * no further code needs to use CPU hotplug (which is true in
		 * the reboot case). However, the kexec path depends on using
		 * CPU hotplug again; so re-enable it here.
		 */
		cpu_hotplug_enable();
		pr_notice("Starting new kernel\n");
		machine_shutdown();
	}

	kmsg_dump(KMSG_DUMP_SHUTDOWN);
	machine_kexec(kexec_image);

#ifdef CONFIG_KEXEC_JUMP
	if (kexec_image->preserve_context) {
		syscore_resume();
 Enable_irqs:
		local_irq_enable();
 Enable_cpus:
		suspend_enable_secondary_cpus();
		dpm_resume_start(PMSG_RESTORE);
 Resume_devices:
		dpm_resume_end(PMSG_RESTORE);
 Resume_console:
		resume_console();
		thaw_processes();
 Restore_console:
		pm_restore_console();
	}
#endif

 Unlock:
	kexec_unlock();
	return error;
}


source code of linux/kernel/kexec_core.c