// SPDX-License-Identifier: GPL-2.0
/*
 * kaslr.c
 *
 * This contains the routines needed to generate a reasonable level of
 * entropy to choose a randomized kernel base address offset in support
 * of Kernel Address Space Layout Randomization (KASLR). Additionally
 * handles walking the physical memory maps (and tracking memory regions
 * to avoid) in order to select a physical memory location that can
 * contain the entire properly aligned running kernel image.
 *
 */

/*
 * isspace() in linux/ctype.h is expected by next_args() to filter
 * out "space/lf/tab". boot/ctype.h conflicts with linux/ctype.h,
 * since isdigit() is implemented in both of them, so disable
 * boot/ctype.h here.
 */
#define BOOT_CTYPE_H

#include "misc.h"
#include "error.h"
#include "../string.h"
#include "efi.h"

#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
#include <linux/ctype.h>
#include <generated/utsversion.h>
#include <generated/utsrelease.h>

#define _SETUP
#include <asm/setup.h>	/* For COMMAND_LINE_SIZE */
#undef _SETUP

extern unsigned long get_cmd_line_ptr(void);

/* Simplified build-specific string for starting entropy. */
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
		LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
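
/*
 * An illustrative (hypothetical) expansion of build_str:
 *   "6.6.1 (user@buildhost) (gcc (GCC) 13.2.0) #1 SMP ..."
 * The exact contents do not matter; the string only needs to vary
 * between builds so that it can seed the entropy hash below.
 */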

static unsigned long rotate_xor(unsigned long hash, const void *area,
				size_t size)
{
	size_t i;
	unsigned long *ptr = (unsigned long *)area;

	for (i = 0; i < size / sizeof(hash); i++) {
		/* Rotate by odd number of bits and XOR. */
		hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
		hash ^= ptr[i];
	}

	return hash;
}
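
/*
 * Note: on 64-bit, the rotation in rotate_xor() is a right-rotate by
 * 7 bits: (hash << 57) | (hash >> 7). Rotating (rather than shifting)
 * preserves all bits of the accumulated state, and XORing each
 * unsigned-long-sized word of 'area' folds the input into that state.
 * E.g. (illustrative) starting from hash = 0, the first iteration
 * yields hash == ptr[0], since rotating zero is still zero.
 */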

/* Attempt to create a simple but unpredictable starting entropy. */
static unsigned long get_boot_seed(void)
{
	unsigned long hash = 0;

	hash = rotate_xor(hash, build_str, sizeof(build_str));
	hash = rotate_xor(hash, boot_params_ptr, sizeof(*boot_params_ptr));

	return hash;
}

#define KASLR_COMPRESSED_BOOT
#include "../../lib/kaslr.c"


/* Only supporting at most 4 unusable memmap regions with kaslr */
#define MAX_MEMMAP_REGIONS	4

static bool memmap_too_large;


/*
 * Store memory limit: MAXMEM on 64-bit and KERNEL_IMAGE_SIZE on 32-bit.
 * It may be reduced by "mem=nn[KMG]" or "memmap=nn[KMG]" command line options.
 */
static u64 mem_limit;

/* Number of immovable memory regions */
static int num_immovable_mem;

enum mem_avoid_index {
	MEM_AVOID_ZO_RANGE = 0,
	MEM_AVOID_INITRD,
	MEM_AVOID_CMDLINE,
	MEM_AVOID_BOOTPARAMS,
	MEM_AVOID_MEMMAP_BEGIN,
	MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1,
	MEM_AVOID_MAX,
};

static struct mem_vector mem_avoid[MEM_AVOID_MAX];

static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
{
	/* Item one is entirely before item two. */
	if (one->start + one->size <= two->start)
		return false;
	/* Item one is entirely after item two. */
	if (one->start >= two->start + two->size)
		return false;
	return true;
}
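
/*
 * The ranges above are half-open: [start, start + size). E.g.
 * (illustrative) [0x100, 0x200) and [0x1ff, 0x300) overlap, while
 * [0x100, 0x200) and [0x200, 0x300) merely touch and do not.
 */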

char *skip_spaces(const char *str)
{
	while (isspace(*str))
		++str;
	return (char *)str;
}
#include "../../../../lib/ctype.c"
#include "../../../../lib/cmdline.c"

enum parse_mode {
	PARSE_MEMMAP,
	PARSE_EFI,
};

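/*
 * Parse one comma-separated memmap=/efi_fake_mem= region, e.g.
 * "memmap=nn[KMG]$ss[KMG]". The '#' (ACPI data), '$' (reserved) and
 * '!' (persistent memory) forms each yield a region to avoid; "nn@ss"
 * describes usable memory and is skipped, and a bare "nn" acts like
 * mem=nn[KMG] (it caps mem_limit). For efi_fake_mem, the region is
 * avoided only if its attribute implies a soft-reservation
 * (EFI_MEMORY_SP).
 */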
static int
parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode)
{
	char *oldp;

	if (!p)
		return -EINVAL;

	/* We don't care about this option here */
	if (!strncmp(p, "exactmap", 8))
		return -EINVAL;

	oldp = p;
	*size = memparse(p, &p);
	if (p == oldp)
		return -EINVAL;

	switch (*p) {
	case '#':
	case '$':
	case '!':
		*start = memparse(p + 1, &p);
		return 0;
	case '@':
		if (mode == PARSE_MEMMAP) {
			/*
			 * memmap=nn@ss specifies usable region, should
			 * be skipped
			 */
			*size = 0;
		} else {
			u64 flags;

			/*
			 * For efi_fake_mem=nn@ss:attr, the attr specifies
			 * flags that might imply a soft-reservation.
			 */
			*start = memparse(p + 1, &p);
			if (p && *p == ':') {
				p++;
				if (kstrtoull(p, 0, &flags) < 0)
					*size = 0;
				else if (flags & EFI_MEMORY_SP)
					return 0;
			}
			*size = 0;
		}
		fallthrough;
	default:
		/*
		 * If only a size is specified without an offset,
		 * memmap=nn[KMG] behaves the same as mem=nn[KMG]: it
		 * limits the maximum address the system can use.
		 * Regions above the limit should be avoided.
		 */
		*start = 0;
		return 0;
	}

	return -EINVAL;
}

static void mem_avoid_memmap(enum parse_mode mode, char *str)
{
	static int i;

	if (i >= MAX_MEMMAP_REGIONS)
		return;

	while (str && (i < MAX_MEMMAP_REGIONS)) {
		int rc;
		u64 start, size;
		char *k = strchr(str, ',');

		if (k)
			*k++ = 0;

		rc = parse_memmap(str, &start, &size, mode);
		if (rc < 0)
			break;
		str = k;

		if (start == 0) {
			/* Store the specified memory limit if size > 0 */
			if (size > 0 && size < mem_limit)
				mem_limit = size;

			continue;
		}

		mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
		mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
		i++;
	}

	/* More than 4 memmaps, fail kaslr */
	if ((i >= MAX_MEMMAP_REGIONS) && str)
		memmap_too_large = true;
}

/* Store the number of 1GB huge pages which users specified: */
static unsigned long max_gb_huge_pages;

static void parse_gb_huge_pages(char *param, char *val)
{
	static bool gbpage_sz;
	char *p;

	if (!strcmp(param, "hugepagesz")) {
		p = val;
		if (memparse(p, &p) != PUD_SIZE) {
			gbpage_sz = false;
			return;
		}

		if (gbpage_sz)
			warn("Repeatedly set hugeTLB page size of 1G!\n");
		gbpage_sz = true;
		return;
	}

	if (!strcmp(param, "hugepages") && gbpage_sz) {
		p = val;
		max_gb_huge_pages = simple_strtoull(p, &p, 0);
		return;
	}
}
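
/*
 * E.g. (illustrative) "hugepagesz=1G hugepages=2" on the command line
 * first sets gbpage_sz (1G == PUD_SIZE), then records
 * max_gb_huge_pages = 2. A "hugepages=" that is not preceded by
 * "hugepagesz=1G" is ignored here.
 */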

static void handle_mem_options(void)
{
	char *args = (char *)get_cmd_line_ptr();
	size_t len;
	char *tmp_cmdline;
	char *param, *val;
	u64 mem_size;

	if (!args)
		return;

	len = strnlen(args, COMMAND_LINE_SIZE-1);
	tmp_cmdline = malloc(len + 1);
	if (!tmp_cmdline)
		error("Failed to allocate space for tmp_cmdline");

	memcpy(tmp_cmdline, args, len);
	tmp_cmdline[len] = 0;
	args = tmp_cmdline;

	/* Chew leading spaces */
	args = skip_spaces(args);

	while (*args) {
		args = next_arg(args, &param, &val);
		/* Stop at -- */
		if (!val && strcmp(param, "--") == 0)
			break;

		if (!strcmp(param, "memmap")) {
			mem_avoid_memmap(PARSE_MEMMAP, val);
		} else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) {
			parse_gb_huge_pages(param, val);
		} else if (!strcmp(param, "mem")) {
			char *p = val;

			if (!strcmp(p, "nopentium"))
				continue;
			mem_size = memparse(p, &p);
			if (mem_size == 0)
				break;

			if (mem_size < mem_limit)
				mem_limit = mem_size;
		} else if (!strcmp(param, "efi_fake_mem")) {
			mem_avoid_memmap(PARSE_EFI, val);
		}
	}

	free(tmp_cmdline);
	return;
}

/*
 * In theory, KASLR can put the kernel anywhere in the range of [16M, MAXMEM)
 * on 64-bit, and [16M, KERNEL_IMAGE_SIZE) on 32-bit.
 *
 * The mem_avoid array is used to store the ranges that need to be avoided
 * when KASLR searches for an appropriate random address. We must avoid any
 * regions that are unsafe to overlap with during decompression, and other
 * things like the initrd, cmdline and boot_params. This comment seeks to
 * explain mem_avoid as clearly as possible since incorrect mem_avoid
 * memory ranges lead to really hard to debug boot failures.
 *
 * The initrd, cmdline, and boot_params are trivial to identify for
 * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and
 * MEM_AVOID_BOOTPARAMS respectively below.
 *
 * What is less obvious is how to avoid the range of memory that is used
 * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover
 * the compressed kernel (ZO) and its run space, which is used to extract
 * the uncompressed kernel (VO) and relocs.
 *
 * ZO's full run size sits against the end of the decompression buffer, so
 * we can calculate where the text, data, bss, etc. of ZO are positioned
 * more easily.
 *
 * For additional background, the decompression calculations can be found
 * in header.S, and the memory diagram is based on the one found in misc.c.
 *
 * The following conditions are already enforced by the image layouts and
 * associated code:
 *  - input + input_size >= output + output_size
 *  - kernel_total_size <= init_size
 *  - kernel_total_size <= output_size (see Note below)
 *  - output + init_size >= output + output_size
 *
 * (Note that kernel_total_size and output_size have no fundamental
 * relationship, but output_size is passed to choose_random_location
 * as a maximum of the two. The diagram is showing a case where
 * kernel_total_size is larger than output_size, but this case is
 * handled by bumping output_size.)
 *
 * The above conditions can be illustrated by a diagram:
 *
 * 0   output            input            input+input_size    output+init_size
 * |     |                 |                             |             |
 * |     |                 |                             |             |
 * |-----|--------|--------|--------------|-----------|--|-------------|
 *                |                       |           |
 *                |                       |           |
 * output+init_size-ZO_INIT_SIZE  output+output_size  output+kernel_total_size
 *
 * [output, output+init_size) is the entire memory range used for
 * extracting the compressed image.
 *
 * [output, output+kernel_total_size) is the range needed for the
 * uncompressed kernel (VO) and its run size (bss, brk, etc).
 *
 * [output, output+output_size) is VO plus relocs (i.e. the entire
 * uncompressed payload contained by ZO). This is the area of the buffer
 * written to during decompression.
 *
 * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case
 * range of the copied ZO and decompression code. (i.e. the range
 * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.)
 *
 * [input, input+input_size) is the original copied compressed image (ZO)
 * (i.e. it does not include its run size). This range must be avoided
 * because it contains the data used for decompression.
 *
 * [input+input_size, output+init_size) is [_text, _end) for ZO. This
 * range includes ZO's heap and stack, and must be avoided since it
 * performs the decompression.
 *
 * Since the above two ranges need to be avoided and they are adjacent,
 * they can be merged, resulting in: [input, output+init_size) which
 * becomes the MEM_AVOID_ZO_RANGE below.
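 *
 * For example (purely illustrative numbers): with output = 16M,
 * init_size = 64M, input = 60M and input_size = 8M, the merged avoid
 * range would be [60M, 80M), i.e. start = input and
 * size = (output + init_size) - input = 20M.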
 */
static void mem_avoid_init(unsigned long input, unsigned long input_size,
			   unsigned long output)
{
	unsigned long init_size = boot_params_ptr->hdr.init_size;
	u64 initrd_start, initrd_size;
	unsigned long cmd_line, cmd_line_size;

	/*
	 * Avoid the region that is unsafe to overlap during
	 * decompression.
	 */
	mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
	mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;

	/* Avoid initrd. */
	initrd_start = (u64)boot_params_ptr->ext_ramdisk_image << 32;
	initrd_start |= boot_params_ptr->hdr.ramdisk_image;
	initrd_size = (u64)boot_params_ptr->ext_ramdisk_size << 32;
	initrd_size |= boot_params_ptr->hdr.ramdisk_size;
	mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
	mem_avoid[MEM_AVOID_INITRD].size = initrd_size;
	/* No need to set mapping for initrd, it will be handled in VO. */

	/* Avoid kernel command line. */
	cmd_line = get_cmd_line_ptr();
	/* Calculate size of cmd_line. */
	if (cmd_line) {
		cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1;
		mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
		mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
	}

	/* Avoid boot parameters. */
	mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params_ptr;
	mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params_ptr);

	/* We don't need to set a mapping for setup_data. */

	/* Mark the memmap regions we need to avoid */
	handle_mem_options();

	/* Enumerate the immovable memory regions */
	num_immovable_mem = count_immovable_mem_regions();
}

/*
 * Does this memory vector overlap a known avoided area? If so, record the
 * overlap region with the lowest address.
 */
static bool mem_avoid_overlap(struct mem_vector *img,
			      struct mem_vector *overlap)
{
	int i;
	struct setup_data *ptr;
	u64 earliest = img->start + img->size;
	bool is_overlapping = false;

	for (i = 0; i < MEM_AVOID_MAX; i++) {
		if (mem_overlaps(img, &mem_avoid[i]) &&
		    mem_avoid[i].start < earliest) {
			*overlap = mem_avoid[i];
			earliest = overlap->start;
			is_overlapping = true;
		}
	}

	/* Avoid all entries in the setup_data linked list. */
	ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data;
	while (ptr) {
		struct mem_vector avoid;

		avoid.start = (unsigned long)ptr;
		avoid.size = sizeof(*ptr) + ptr->len;

		if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
			*overlap = avoid;
			earliest = overlap->start;
			is_overlapping = true;
		}

		if (ptr->type == SETUP_INDIRECT &&
		    ((struct setup_indirect *)ptr->data)->type != SETUP_INDIRECT) {
			avoid.start = ((struct setup_indirect *)ptr->data)->addr;
			avoid.size = ((struct setup_indirect *)ptr->data)->len;

			if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
				*overlap = avoid;
				earliest = overlap->start;
				is_overlapping = true;
			}
		}

		ptr = (struct setup_data *)(unsigned long)ptr->next;
	}

	return is_overlapping;
}

struct slot_area {
	u64 addr;
	unsigned long num;
};

#define MAX_SLOT_AREA 100

static struct slot_area slot_areas[MAX_SLOT_AREA];
static unsigned int slot_area_index;
static unsigned long slot_max;

static void store_slot_info(struct mem_vector *region, unsigned long image_size)
{
	struct slot_area slot_area;

	if (slot_area_index == MAX_SLOT_AREA)
		return;

	slot_area.addr = region->start;
	slot_area.num = 1 + (region->size - image_size) / CONFIG_PHYSICAL_ALIGN;

	slot_areas[slot_area_index++] = slot_area;
	slot_max += slot_area.num;
}
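
/*
 * E.g. (illustrative) a 6M region with image_size = 4M and
 * CONFIG_PHYSICAL_ALIGN = 2M yields 1 + (6M - 4M) / 2M = 2 slots:
 * the image can start at the region base or 2M into it. Callers
 * guarantee region->size >= image_size before storing.
 */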

/*
 * Skip as many 1GB huge pages as possible in the passed region
 * according to the number which users specified:
 */
static void
process_gb_huge_pages(struct mem_vector *region, unsigned long image_size)
{
	u64 pud_start, pud_end;
	unsigned long gb_huge_pages;
	struct mem_vector tmp;

	if (!IS_ENABLED(CONFIG_X86_64) || !max_gb_huge_pages) {
		store_slot_info(region, image_size);
		return;
	}

	/* Are there any 1GB pages in the region? */
	pud_start = ALIGN(region->start, PUD_SIZE);
	pud_end = ALIGN_DOWN(region->start + region->size, PUD_SIZE);

	/* No good 1GB huge pages found: */
	if (pud_start >= pud_end) {
		store_slot_info(region, image_size);
		return;
	}

	/* Check if the head part of the region is usable. */
	if (pud_start >= region->start + image_size) {
		tmp.start = region->start;
		tmp.size = pud_start - region->start;
		store_slot_info(&tmp, image_size);
	}

	/* Skip the good 1GB pages. */
	gb_huge_pages = (pud_end - pud_start) >> PUD_SHIFT;
	if (gb_huge_pages > max_gb_huge_pages) {
		pud_end = pud_start + (max_gb_huge_pages << PUD_SHIFT);
		max_gb_huge_pages = 0;
	} else {
		max_gb_huge_pages -= gb_huge_pages;
	}

	/* Check if the tail part of the region is usable. */
	if (region->start + region->size >= pud_end + image_size) {
		tmp.start = pud_end;
		tmp.size = region->start + region->size - pud_end;
		store_slot_info(&tmp, image_size);
	}
}
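
/*
 * E.g. (illustrative) for a region [512M, 3584M) with
 * max_gb_huge_pages = 1 and a small image_size: pud_start = 1G and
 * pud_end = 3G, but only one 1G page is reserved, so pud_end is pulled
 * back to 2G. The head [512M, 1G) and the tail [2G, 3584M) are then
 * stored as slot areas, leaving [1G, 2G) free for the huge page.
 */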

static u64 slots_fetch_random(void)
{
	unsigned long slot;
	unsigned int i;

	/* Handle case of no slots stored. */
	if (slot_max == 0)
		return 0;

	slot = kaslr_get_random_long("Physical") % slot_max;

	for (i = 0; i < slot_area_index; i++) {
		if (slot >= slot_areas[i].num) {
			slot -= slot_areas[i].num;
			continue;
		}
		return slot_areas[i].addr + ((u64)slot * CONFIG_PHYSICAL_ALIGN);
	}

	if (i == slot_area_index)
		debug_putstr("slots_fetch_random() failed!?\n");
	return 0;
}
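
/*
 * E.g. (illustrative) with two areas of num = 3 and num = 5 slots,
 * slot_max = 8. A random pick of 4 skips past the first area
 * (4 - 3 = 1) and resolves to the second area's base address plus
 * 1 * CONFIG_PHYSICAL_ALIGN.
 */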

static void __process_mem_region(struct mem_vector *entry,
				 unsigned long minimum,
				 unsigned long image_size)
{
	struct mem_vector region, overlap;
	u64 region_end;

	/* Enforce minimum and memory limit. */
	region.start = max_t(u64, entry->start, minimum);
	region_end = min(entry->start + entry->size, mem_limit);

	/* Give up if slot area array is full. */
	while (slot_area_index < MAX_SLOT_AREA) {
		/* Potentially raise address to meet alignment needs. */
		region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);

		/* Did we raise the address above the passed in memory entry? */
		if (region.start > region_end)
			return;

		/* Reduce size by any delta from the original address. */
		region.size = region_end - region.start;

		/* Return if region can't contain decompressed kernel */
		if (region.size < image_size)
			return;

		/* If nothing overlaps, store the region and return. */
		if (!mem_avoid_overlap(&region, &overlap)) {
			process_gb_huge_pages(&region, image_size);
			return;
		}

		/* Store beginning of region if holds at least image_size. */
		if (overlap.start >= region.start + image_size) {
			region.size = overlap.start - region.start;
			process_gb_huge_pages(&region, image_size);
		}

		/* Clip off the overlapping region and start over. */
		region.start = overlap.start + overlap.size;
	}
}

static bool process_mem_region(struct mem_vector *region,
			       unsigned long minimum,
			       unsigned long image_size)
{
	int i;
	/*
	 * If no immovable memory found, or MEMORY_HOTREMOVE disabled,
	 * use @region directly.
	 */
	if (!num_immovable_mem) {
		__process_mem_region(region, minimum, image_size);

		if (slot_area_index == MAX_SLOT_AREA) {
			debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n");
			return true;
		}
		return false;
	}

#if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
	/*
	 * If immovable memory found, filter the intersection between
	 * immovable memory and @region.
	 */
	for (i = 0; i < num_immovable_mem; i++) {
		u64 start, end, entry_end, region_end;
		struct mem_vector entry;

		if (!mem_overlaps(region, &immovable_mem[i]))
			continue;

		start = immovable_mem[i].start;
		end = start + immovable_mem[i].size;
		region_end = region->start + region->size;

		entry.start = clamp(region->start, start, end);
		entry_end = clamp(region_end, start, end);
		entry.size = entry_end - entry.start;

		__process_mem_region(&entry, minimum, image_size);

		if (slot_area_index == MAX_SLOT_AREA) {
			debug_putstr("Aborted e820/efi memmap scan when walking immovable regions(slot_areas full)!\n");
			return true;
		}
	}
#endif
	return false;
}
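
/*
 * E.g. (illustrative) a candidate region [2G, 6G) intersected with an
 * immovable region [4G, 8G) is clamped to [4G, 6G), so only memory
 * that cannot be hot-removed is considered for placing the kernel.
 */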

#ifdef CONFIG_EFI

/*
 * Only EFI_CONVENTIONAL_MEMORY and EFI_UNACCEPTED_MEMORY (if supported) are
 * guaranteed to be free.
 *
 * Pick free memory more conservatively than the EFI spec allows: according to
 * the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also free memory and thus
 * available to place the kernel image into, but in practice there's firmware
 * where using that memory leads to crashes. Buggy vendor EFI code registers
 * for an event that triggers on SetVirtualAddressMap(). The handler assumes
 * that EFI_BOOT_SERVICES_DATA memory has not been touched by the loader yet,
 * which is probably true for Windows.
 *
 * Preserve EFI_BOOT_SERVICES_* regions until after SetVirtualAddressMap().
 */
static inline bool memory_type_is_free(efi_memory_desc_t *md)
{
	if (md->type == EFI_CONVENTIONAL_MEMORY)
		return true;

	if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) &&
	    md->type == EFI_UNACCEPTED_MEMORY)
		return true;

	return false;
}

/*
 * Returns true if we processed the EFI memmap, which we prefer over the E820
 * table if it is available.
 */
static bool
process_efi_entries(unsigned long minimum, unsigned long image_size)
{
	struct efi_info *e = &boot_params_ptr->efi_info;
	bool efi_mirror_found = false;
	struct mem_vector region;
	efi_memory_desc_t *md;
	unsigned long pmap;
	char *signature;
	u32 nr_desc;
	int i;

	signature = (char *)&e->efi_loader_signature;
	if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
	    strncmp(signature, EFI64_LOADER_SIGNATURE, 4))
		return false;

#ifdef CONFIG_X86_32
	/* Can't handle data above 4GB at this time */
	if (e->efi_memmap_hi) {
		warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n");
		return false;
	}
	pmap = e->efi_memmap;
#else
	pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
#endif

	nr_desc = e->efi_memmap_size / e->efi_memdesc_size;
	for (i = 0; i < nr_desc; i++) {
		md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
		if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
			efi_mirror_found = true;
			break;
		}
	}

	for (i = 0; i < nr_desc; i++) {
		md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);

		if (!memory_type_is_free(md))
			continue;

		if (efi_soft_reserve_enabled() &&
		    (md->attribute & EFI_MEMORY_SP))
			continue;

		if (efi_mirror_found &&
		    !(md->attribute & EFI_MEMORY_MORE_RELIABLE))
			continue;

		region.start = md->phys_addr;
		region.size = md->num_pages << EFI_PAGE_SHIFT;
		if (process_mem_region(&region, minimum, image_size))
			break;
	}
	return true;
}
#else
static inline bool
process_efi_entries(unsigned long minimum, unsigned long image_size)
{
	return false;
}
#endif

static void process_e820_entries(unsigned long minimum,
				 unsigned long image_size)
{
	int i;
	struct mem_vector region;
	struct boot_e820_entry *entry;

	/* Verify potential e820 positions, appending to slots list. */
	for (i = 0; i < boot_params_ptr->e820_entries; i++) {
		entry = &boot_params_ptr->e820_table[i];
		/* Skip non-RAM entries. */
		if (entry->type != E820_TYPE_RAM)
			continue;
		region.start = entry->addr;
		region.size = entry->size;
		if (process_mem_region(&region, minimum, image_size))
			break;
	}
}

static unsigned long find_random_phys_addr(unsigned long minimum,
					   unsigned long image_size)
{
	u64 phys_addr;

	/* Bail out early if it's impossible to succeed. */
	if (minimum + image_size > mem_limit)
		return 0;

	/* Check if we had too many memmaps. */
	if (memmap_too_large) {
		debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
		return 0;
	}

	if (!process_efi_entries(minimum, image_size))
		process_e820_entries(minimum, image_size);

	phys_addr = slots_fetch_random();

	/* Perform a final check to make sure the address is in range. */
	if (phys_addr < minimum || phys_addr + image_size > mem_limit) {
		warn("Invalid physical address chosen!\n");
		return 0;
	}

	return (unsigned long)phys_addr;
}

static unsigned long find_random_virt_addr(unsigned long minimum,
					   unsigned long image_size)
{
	unsigned long slots, random_addr;

	/*
	 * How many CONFIG_PHYSICAL_ALIGN-sized slots can hold image_size
	 * within the range from minimum to KERNEL_IMAGE_SIZE?
	 */
	slots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN;

	random_addr = kaslr_get_random_long("Virtual") % slots;

	return random_addr * CONFIG_PHYSICAL_ALIGN + minimum;
}
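
/*
 * E.g. (illustrative, assuming KERNEL_IMAGE_SIZE = 1G and
 * CONFIG_PHYSICAL_ALIGN = 2M): with minimum = 16M and
 * image_size = 64M, slots = 1 + (1024M - 16M - 64M) / 2M = 473, so the
 * virtual offset is chosen from 473 2M-aligned positions starting at
 * minimum.
 */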

/*
 * Since this function examines addresses much more numerically,
 * it takes the input and output pointers as 'unsigned long'.
 */
void choose_random_location(unsigned long input,
			    unsigned long input_size,
			    unsigned long *output,
			    unsigned long output_size,
			    unsigned long *virt_addr)
{
	unsigned long random_addr, min_addr;

	if (cmdline_find_option_bool("nokaslr")) {
		warn("KASLR disabled: 'nokaslr' on cmdline.");
		return;
	}

	boot_params_ptr->hdr.loadflags |= KASLR_FLAG;

	if (IS_ENABLED(CONFIG_X86_32))
		mem_limit = KERNEL_IMAGE_SIZE;
	else
		mem_limit = MAXMEM;

	/* Record the various known unsafe memory ranges. */
	mem_avoid_init(input, input_size, *output);

	/*
	 * Low end of the randomization range should be the
	 * smaller of 512M or the initial kernel image
	 * location:
	 */
	min_addr = min(*output, 512UL << 20);
	/* Make sure minimum is aligned. */
	min_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN);

	/* Walk available memory entries to find a random address. */
	random_addr = find_random_phys_addr(min_addr, output_size);
	if (!random_addr) {
		warn("Physical KASLR disabled: no suitable memory region!");
	} else {
		/* Update the new physical address location. */
		if (*output != random_addr)
			*output = random_addr;
	}

	/* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */
	if (IS_ENABLED(CONFIG_X86_64))
		random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);
	*virt_addr = random_addr;
}