1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Copyright (c) 2017-2019 Borislav Petkov, SUSE Labs. |
4 | */ |
5 | #include <linux/mm.h> |
6 | #include <linux/gfp.h> |
7 | #include <linux/ras.h> |
8 | #include <linux/kernel.h> |
9 | #include <linux/workqueue.h> |
10 | |
11 | #include <asm/mce.h> |
12 | |
13 | #include "debugfs.h" |
14 | |
15 | /* |
16 | * RAS Correctable Errors Collector |
17 | * |
18 | * This is a simple gadget which collects correctable errors and counts their |
19 | * occurrence per physical page address. |
20 | * |
21 | * We've opted for possibly the simplest data structure to collect those - an |
22 | * array of the size of a memory page. It stores 512 u64's with the following |
23 | * structure: |
24 | * |
25 | * [63 ... PFN ... 12 | 11 ... generation ... 10 | 9 ... count ... 0] |
26 | * |
27 | * The generation in the two highest order bits is two bits which are set to 11b |
28 | * on every insertion. During the course of each entry's existence, the |
29 | * generation field gets decremented during spring cleaning to 10b, then 01b and |
30 | * then 00b. |
31 | * |
32 | * This way we're employing the natural numeric ordering to make sure that newly |
33 | * inserted/touched elements have higher 12-bit counts (which we've manufactured) |
34 | * and thus iterating over the array initially won't kick out those elements |
35 | * which were inserted last. |
36 | * |
37 | * Spring cleaning is what we do when we reach a certain number CLEAN_ELEMS of |
38 | * elements entered into the array, during which, we're decaying all elements. |
39 | * If, after decay, an element gets inserted again, its generation is set to 11b |
40 | * to make sure it has higher numerical count than other, older elements and |
41 | * thus emulate an LRU-like behavior when deleting elements to free up space |
42 | * in the page. |
43 | * |
44 | * When an element reaches its max count of action_threshold, we try to poison |
45 | * it by assuming that errors triggered action_threshold times in a single page |
46 | * are excessive and that page shouldn't be used anymore. action_threshold is |
47 | * initialized to COUNT_MASK which is the maximum. |
48 | * |
49 | * That error event entry causes cec_add_elem() to return !0 value and thus |
50 | * signal to its callers to log the error. |
51 | * |
52 | * To the question why we've chosen a page and moving elements around with |
53 | * memmove(), it is because it is a very simple structure to handle and max data |
54 | * movement is 4K which on highly optimized modern CPUs is almost unnoticeable. |
55 | * We wanted to avoid the pointer traversal of more complex structures like a |
56 | * linked list or some sort of a balancing search tree. |
57 | * |
58 | * Deleting an element takes O(n) but since it is only a single page, it should |
59 | * be fast enough and it shouldn't happen all too often depending on error |
60 | * patterns. |
61 | */ |
62 | |
#undef pr_fmt
#define pr_fmt(fmt) "RAS: " fmt

/*
 * Layout of a single array element:
 *
 *   u64: [ 63 ... PFN ... PAGE_SHIFT | DECAY_BITS | COUNT_BITS ]
 *
 * The low PAGE_SHIFT bits of every element are split into a decay
 * ("generation") field on top and an error counter below it.
 */

/* Number of u64 elements which fit into the container page. */
#define MAX_ELEMS	(PAGE_SIZE / sizeof(u64))

/*
 * DECAY_BITS of the low PAGE_SHIFT bits track how long an element has stayed
 * in the array without having been touched again.
 */
#define DECAY_BITS	2
#define DECAY_MASK	((1ULL << DECAY_BITS) - 1)

/* Number of insertions/increments after which spring cleaning is started. */
#define CLEAN_ELEMS	(MAX_ELEMS >> DECAY_BITS)

/* The remaining low bits count the errors seen in this page. */
#define COUNT_BITS	(PAGE_SHIFT - DECAY_BITS)
#define COUNT_MASK	((1ULL << COUNT_BITS) - 1)

/* Decay and count fields taken together, i.e. everything below the PFN. */
#define FULL_COUNT_MASK	(PAGE_SIZE - 1)

/* Field accessors for an element @e. */
#define PFN(e)		((e) >> PAGE_SHIFT)
#define DECAY(e)	(((e) >> COUNT_BITS) & DECAY_MASK)
#define COUNT(e)	((unsigned int)(e) & COUNT_MASK)
#define FULL_COUNT(e)	((e) & (PAGE_SIZE - 1))
93 | |
94 | static struct ce_array { |
95 | u64 *array; /* container page */ |
96 | unsigned int n; /* number of elements in the array */ |
97 | |
98 | unsigned int decay_count; /* |
99 | * number of element insertions/increments |
100 | * since the last spring cleaning. |
101 | */ |
102 | |
103 | u64 pfns_poisoned; /* |
104 | * number of PFNs which got poisoned. |
105 | */ |
106 | |
107 | u64 ces_entered; /* |
108 | * The number of correctable errors |
109 | * entered into the collector. |
110 | */ |
111 | |
112 | u64 decays_done; /* |
113 | * Times we did spring cleaning. |
114 | */ |
115 | |
116 | union { |
117 | struct { |
118 | __u32 disabled : 1, /* cmdline disabled */ |
119 | __resv : 31; |
120 | }; |
121 | __u32 flags; |
122 | }; |
123 | } ce_arr; |
124 | |
125 | static DEFINE_MUTEX(ce_mutex); |
126 | static u64 dfs_pfn; |
127 | |
128 | /* Amount of errors after which we offline */ |
129 | static u64 action_threshold = COUNT_MASK; |
130 | |
/*
 * Each element "decays" each decay_interval which is 24hrs by default. All
 * values are in seconds.
 *
 * The expansions are parenthesized so that the macros evaluate as a single
 * value no matter which operators surround them at the point of use.
 */
#define CEC_DECAY_DEFAULT_INTERVAL	(24 * 60 * 60)		/* 24 hrs */
#define CEC_DECAY_MIN_INTERVAL		(1 * 60 * 60)		/* 1h */
#define CEC_DECAY_MAX_INTERVAL		(30 * 24 * 60 * 60)	/* one month */
135 | static struct delayed_work cec_work; |
136 | static u64 decay_interval = CEC_DECAY_DEFAULT_INTERVAL; |
137 | |
138 | /* |
139 | * Decrement decay value. We're using DECAY_BITS bits to denote decay of an |
140 | * element in the array. On insertion and any access, it gets reset to max. |
141 | */ |
142 | static void do_spring_cleaning(struct ce_array *ca) |
143 | { |
144 | int i; |
145 | |
146 | for (i = 0; i < ca->n; i++) { |
147 | u8 decay = DECAY(ca->array[i]); |
148 | |
149 | if (!decay) |
150 | continue; |
151 | |
152 | decay--; |
153 | |
154 | ca->array[i] &= ~(DECAY_MASK << COUNT_BITS); |
155 | ca->array[i] |= (decay << COUNT_BITS); |
156 | } |
157 | ca->decay_count = 0; |
158 | ca->decays_done++; |
159 | } |
160 | |
161 | /* |
162 | * @interval in seconds |
163 | */ |
164 | static void cec_mod_work(unsigned long interval) |
165 | { |
166 | unsigned long iv; |
167 | |
168 | iv = interval * HZ; |
169 | mod_delayed_work(wq: system_wq, dwork: &cec_work, delay: round_jiffies(j: iv)); |
170 | } |
171 | |
172 | static void cec_work_fn(struct work_struct *work) |
173 | { |
174 | mutex_lock(&ce_mutex); |
175 | do_spring_cleaning(ca: &ce_arr); |
176 | mutex_unlock(lock: &ce_mutex); |
177 | |
178 | cec_mod_work(interval: decay_interval); |
179 | } |
180 | |
181 | /* |
182 | * @to: index of the smallest element which is >= then @pfn. |
183 | * |
184 | * Return the index of the pfn if found, otherwise negative value. |
185 | */ |
186 | static int __find_elem(struct ce_array *ca, u64 pfn, unsigned int *to) |
187 | { |
188 | int min = 0, max = ca->n - 1; |
189 | u64 this_pfn; |
190 | |
191 | while (min <= max) { |
192 | int i = (min + max) >> 1; |
193 | |
194 | this_pfn = PFN(ca->array[i]); |
195 | |
196 | if (this_pfn < pfn) |
197 | min = i + 1; |
198 | else if (this_pfn > pfn) |
199 | max = i - 1; |
200 | else if (this_pfn == pfn) { |
201 | if (to) |
202 | *to = i; |
203 | |
204 | return i; |
205 | } |
206 | } |
207 | |
208 | /* |
209 | * When the loop terminates without finding @pfn, min has the index of |
210 | * the element slot where the new @pfn should be inserted. The loop |
211 | * terminates when min > max, which means the min index points to the |
212 | * bigger element while the max index to the smaller element, in-between |
213 | * which the new @pfn belongs to. |
214 | * |
215 | * For more details, see exercise 1, Section 6.2.1 in TAOCP, vol. 3. |
216 | */ |
217 | if (to) |
218 | *to = min; |
219 | |
220 | return -ENOKEY; |
221 | } |
222 | |
223 | static int find_elem(struct ce_array *ca, u64 pfn, unsigned int *to) |
224 | { |
225 | WARN_ON(!to); |
226 | |
227 | if (!ca->n) { |
228 | *to = 0; |
229 | return -ENOKEY; |
230 | } |
231 | return __find_elem(ca, pfn, to); |
232 | } |
233 | |
234 | static void del_elem(struct ce_array *ca, int idx) |
235 | { |
236 | /* Save us a function call when deleting the last element. */ |
237 | if (ca->n - (idx + 1)) |
238 | memmove((void *)&ca->array[idx], |
239 | (void *)&ca->array[idx + 1], |
240 | (ca->n - (idx + 1)) * sizeof(u64)); |
241 | |
242 | ca->n--; |
243 | } |
244 | |
245 | static u64 del_lru_elem_unlocked(struct ce_array *ca) |
246 | { |
247 | unsigned int min = FULL_COUNT_MASK; |
248 | int i, min_idx = 0; |
249 | |
250 | for (i = 0; i < ca->n; i++) { |
251 | unsigned int this = FULL_COUNT(ca->array[i]); |
252 | |
253 | if (min > this) { |
254 | min = this; |
255 | min_idx = i; |
256 | } |
257 | } |
258 | |
259 | del_elem(ca, idx: min_idx); |
260 | |
261 | return PFN(ca->array[min_idx]); |
262 | } |
263 | |
264 | /* |
265 | * We return the 0th pfn in the error case under the assumption that it cannot |
266 | * be poisoned and excessive CEs in there are a serious deal anyway. |
267 | */ |
268 | static u64 __maybe_unused del_lru_elem(void) |
269 | { |
270 | struct ce_array *ca = &ce_arr; |
271 | u64 pfn; |
272 | |
273 | if (!ca->n) |
274 | return 0; |
275 | |
276 | mutex_lock(&ce_mutex); |
277 | pfn = del_lru_elem_unlocked(ca); |
278 | mutex_unlock(lock: &ce_mutex); |
279 | |
280 | return pfn; |
281 | } |
282 | |
283 | static bool sanity_check(struct ce_array *ca) |
284 | { |
285 | bool ret = false; |
286 | u64 prev = 0; |
287 | int i; |
288 | |
289 | for (i = 0; i < ca->n; i++) { |
290 | u64 this = PFN(ca->array[i]); |
291 | |
292 | if (WARN(prev > this, "prev: 0x%016llx <-> this: 0x%016llx\n" , prev, this)) |
293 | ret = true; |
294 | |
295 | prev = this; |
296 | } |
297 | |
298 | if (!ret) |
299 | return ret; |
300 | |
301 | pr_info("Sanity check dump:\n{ n: %d\n" , ca->n); |
302 | for (i = 0; i < ca->n; i++) { |
303 | u64 this = PFN(ca->array[i]); |
304 | |
305 | pr_info(" %03d: [%016llx|%03llx]\n" , i, this, FULL_COUNT(ca->array[i])); |
306 | } |
307 | pr_info("}\n" ); |
308 | |
309 | return ret; |
310 | } |
311 | |
312 | /** |
313 | * cec_add_elem - Add an element to the CEC array. |
314 | * @pfn: page frame number to insert |
315 | * |
316 | * Return values: |
317 | * - <0: on error |
318 | * - 0: on success |
319 | * - >0: when the inserted pfn was offlined |
320 | */ |
321 | static int cec_add_elem(u64 pfn) |
322 | { |
323 | struct ce_array *ca = &ce_arr; |
324 | int count, err, ret = 0; |
325 | unsigned int to = 0; |
326 | |
327 | /* |
328 | * We can be called very early on the identify_cpu() path where we are |
329 | * not initialized yet. We ignore the error for simplicity. |
330 | */ |
331 | if (!ce_arr.array || ce_arr.disabled) |
332 | return -ENODEV; |
333 | |
334 | mutex_lock(&ce_mutex); |
335 | |
336 | ca->ces_entered++; |
337 | |
338 | /* Array full, free the LRU slot. */ |
339 | if (ca->n == MAX_ELEMS) |
340 | WARN_ON(!del_lru_elem_unlocked(ca)); |
341 | |
342 | err = find_elem(ca, pfn, to: &to); |
343 | if (err < 0) { |
344 | /* |
345 | * Shift range [to-end] to make room for one more element. |
346 | */ |
347 | memmove((void *)&ca->array[to + 1], |
348 | (void *)&ca->array[to], |
349 | (ca->n - to) * sizeof(u64)); |
350 | |
351 | ca->array[to] = pfn << PAGE_SHIFT; |
352 | ca->n++; |
353 | } |
354 | |
355 | /* Add/refresh element generation and increment count */ |
356 | ca->array[to] |= DECAY_MASK << COUNT_BITS; |
357 | ca->array[to]++; |
358 | |
359 | /* Check action threshold and soft-offline, if reached. */ |
360 | count = COUNT(ca->array[to]); |
361 | if (count >= action_threshold) { |
362 | u64 pfn = ca->array[to] >> PAGE_SHIFT; |
363 | |
364 | if (!pfn_valid(pfn)) { |
365 | pr_warn("CEC: Invalid pfn: 0x%llx\n" , pfn); |
366 | } else { |
367 | /* We have reached max count for this page, soft-offline it. */ |
368 | pr_err("Soft-offlining pfn: 0x%llx\n" , pfn); |
369 | memory_failure_queue(pfn, flags: MF_SOFT_OFFLINE); |
370 | ca->pfns_poisoned++; |
371 | } |
372 | |
373 | del_elem(ca, idx: to); |
374 | |
375 | /* |
376 | * Return a >0 value to callers, to denote that we've reached |
377 | * the offlining threshold. |
378 | */ |
379 | ret = 1; |
380 | |
381 | goto unlock; |
382 | } |
383 | |
384 | ca->decay_count++; |
385 | |
386 | if (ca->decay_count >= CLEAN_ELEMS) |
387 | do_spring_cleaning(ca); |
388 | |
389 | WARN_ON_ONCE(sanity_check(ca)); |
390 | |
391 | unlock: |
392 | mutex_unlock(lock: &ce_mutex); |
393 | |
394 | return ret; |
395 | } |
396 | |
397 | static int u64_get(void *data, u64 *val) |
398 | { |
399 | *val = *(u64 *)data; |
400 | |
401 | return 0; |
402 | } |
403 | |
404 | static int pfn_set(void *data, u64 val) |
405 | { |
406 | *(u64 *)data = val; |
407 | |
408 | cec_add_elem(pfn: val); |
409 | |
410 | return 0; |
411 | } |
412 | |
413 | DEFINE_DEBUGFS_ATTRIBUTE(pfn_ops, u64_get, pfn_set, "0x%llx\n" ); |
414 | |
415 | static int decay_interval_set(void *data, u64 val) |
416 | { |
417 | if (val < CEC_DECAY_MIN_INTERVAL) |
418 | return -EINVAL; |
419 | |
420 | if (val > CEC_DECAY_MAX_INTERVAL) |
421 | return -EINVAL; |
422 | |
423 | *(u64 *)data = val; |
424 | decay_interval = val; |
425 | |
426 | cec_mod_work(interval: decay_interval); |
427 | |
428 | return 0; |
429 | } |
430 | DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n" ); |
431 | |
432 | static int action_threshold_set(void *data, u64 val) |
433 | { |
434 | *(u64 *)data = val; |
435 | |
436 | if (val > COUNT_MASK) |
437 | val = COUNT_MASK; |
438 | |
439 | action_threshold = val; |
440 | |
441 | return 0; |
442 | } |
443 | DEFINE_DEBUGFS_ATTRIBUTE(action_threshold_ops, u64_get, action_threshold_set, "%lld\n" ); |
444 | |
445 | static const char * const bins[] = { "00" , "01" , "10" , "11" }; |
446 | |
447 | static int array_show(struct seq_file *m, void *v) |
448 | { |
449 | struct ce_array *ca = &ce_arr; |
450 | int i; |
451 | |
452 | mutex_lock(&ce_mutex); |
453 | |
454 | seq_printf(m, fmt: "{ n: %d\n" , ca->n); |
455 | for (i = 0; i < ca->n; i++) { |
456 | u64 this = PFN(ca->array[i]); |
457 | |
458 | seq_printf(m, fmt: " %3d: [%016llx|%s|%03llx]\n" , |
459 | i, this, bins[DECAY(ca->array[i])], COUNT(ca->array[i])); |
460 | } |
461 | |
462 | seq_printf(m, fmt: "}\n" ); |
463 | |
464 | seq_printf(m, fmt: "Stats:\nCEs: %llu\nofflined pages: %llu\n" , |
465 | ca->ces_entered, ca->pfns_poisoned); |
466 | |
467 | seq_printf(m, fmt: "Flags: 0x%x\n" , ca->flags); |
468 | |
469 | seq_printf(m, fmt: "Decay interval: %lld seconds\n" , decay_interval); |
470 | seq_printf(m, fmt: "Decays: %lld\n" , ca->decays_done); |
471 | |
472 | seq_printf(m, fmt: "Action threshold: %lld\n" , action_threshold); |
473 | |
474 | mutex_unlock(lock: &ce_mutex); |
475 | |
476 | return 0; |
477 | } |
478 | |
479 | DEFINE_SHOW_ATTRIBUTE(array); |
480 | |
481 | static int __init create_debugfs_nodes(void) |
482 | { |
483 | struct dentry *d, *pfn, *decay, *count, *array; |
484 | |
485 | d = debugfs_create_dir(name: "cec" , parent: ras_debugfs_dir); |
486 | if (!d) { |
487 | pr_warn("Error creating cec debugfs node!\n" ); |
488 | return -1; |
489 | } |
490 | |
491 | decay = debugfs_create_file(name: "decay_interval" , S_IRUSR | S_IWUSR, parent: d, |
492 | data: &decay_interval, fops: &decay_interval_ops); |
493 | if (!decay) { |
494 | pr_warn("Error creating decay_interval debugfs node!\n" ); |
495 | goto err; |
496 | } |
497 | |
498 | count = debugfs_create_file(name: "action_threshold" , S_IRUSR | S_IWUSR, parent: d, |
499 | data: &action_threshold, fops: &action_threshold_ops); |
500 | if (!count) { |
501 | pr_warn("Error creating action_threshold debugfs node!\n" ); |
502 | goto err; |
503 | } |
504 | |
505 | if (!IS_ENABLED(CONFIG_RAS_CEC_DEBUG)) |
506 | return 0; |
507 | |
508 | pfn = debugfs_create_file(name: "pfn" , S_IRUSR | S_IWUSR, parent: d, data: &dfs_pfn, fops: &pfn_ops); |
509 | if (!pfn) { |
510 | pr_warn("Error creating pfn debugfs node!\n" ); |
511 | goto err; |
512 | } |
513 | |
514 | array = debugfs_create_file(name: "array" , S_IRUSR, parent: d, NULL, fops: &array_fops); |
515 | if (!array) { |
516 | pr_warn("Error creating array debugfs node!\n" ); |
517 | goto err; |
518 | } |
519 | |
520 | return 0; |
521 | |
522 | err: |
523 | debugfs_remove_recursive(dentry: d); |
524 | |
525 | return 1; |
526 | } |
527 | |
528 | static int cec_notifier(struct notifier_block *nb, unsigned long val, |
529 | void *data) |
530 | { |
531 | struct mce *m = (struct mce *)data; |
532 | |
533 | if (!m) |
534 | return NOTIFY_DONE; |
535 | |
536 | /* We eat only correctable DRAM errors with usable addresses. */ |
537 | if (mce_is_memory_error(m) && |
538 | mce_is_correctable(m) && |
539 | mce_usable_address(m)) { |
540 | if (!cec_add_elem(pfn: m->addr >> PAGE_SHIFT)) { |
541 | m->kflags |= MCE_HANDLED_CEC; |
542 | return NOTIFY_OK; |
543 | } |
544 | } |
545 | |
546 | return NOTIFY_DONE; |
547 | } |
548 | |
549 | static struct notifier_block cec_nb = { |
550 | .notifier_call = cec_notifier, |
551 | .priority = MCE_PRIO_CEC, |
552 | }; |
553 | |
554 | static int __init cec_init(void) |
555 | { |
556 | if (ce_arr.disabled) |
557 | return -ENODEV; |
558 | |
559 | /* |
560 | * Intel systems may avoid uncorrectable errors |
561 | * if pages with corrected errors are aggressively |
562 | * taken offline. |
563 | */ |
564 | if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) |
565 | action_threshold = 2; |
566 | |
567 | ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL); |
568 | if (!ce_arr.array) { |
569 | pr_err("Error allocating CE array page!\n" ); |
570 | return -ENOMEM; |
571 | } |
572 | |
573 | if (create_debugfs_nodes()) { |
574 | free_page((unsigned long)ce_arr.array); |
575 | return -ENOMEM; |
576 | } |
577 | |
578 | INIT_DELAYED_WORK(&cec_work, cec_work_fn); |
579 | schedule_delayed_work(dwork: &cec_work, CEC_DECAY_DEFAULT_INTERVAL); |
580 | |
581 | mce_register_decode_chain(nb: &cec_nb); |
582 | |
583 | pr_info("Correctable Errors collector initialized.\n" ); |
584 | return 0; |
585 | } |
586 | late_initcall(cec_init); |
587 | |
588 | int __init parse_cec_param(char *str) |
589 | { |
590 | if (!str) |
591 | return 0; |
592 | |
593 | if (*str == '=') |
594 | str++; |
595 | |
596 | if (!strcmp(str, "cec_disable" )) |
597 | ce_arr.disabled = 1; |
598 | else |
599 | return 0; |
600 | |
601 | return 1; |
602 | } |
603 | |