1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | md.c : Multiple Devices driver for Linux |
4 | Copyright (C) 1998, 1999, 2000 Ingo Molnar |
5 | |
6 | completely rewritten, based on the MD driver code from Marc Zyngier |
7 | |
8 | Changes: |
9 | |
10 | - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar |
11 | - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> |
12 | - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> |
13 | - kerneld support by Boris Tobotras <boris@xtalk.msk.su> |
14 | - kmod support by: Cyrus Durgin |
15 | - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> |
16 | - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> |
17 | |
18 | - lots of fixes and improvements to the RAID1/RAID5 and generic |
19 | RAID code (such as request based resynchronization): |
20 | |
21 | Neil Brown <neilb@cse.unsw.edu.au>. |
22 | |
23 | - persistent bitmap code |
24 | Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. |
25 | |
26 | |
27 | Errors, Warnings, etc. |
28 | Please use: |
29 | pr_crit() for error conditions that risk data loss |
30 | pr_err() for error conditions that are unexpected, like an IO error |
31 | or internal inconsistency |
pr_warn() for error conditions that could have been predicted, like
adding a device to an array when it has incompatible metadata
pr_info() for interesting, very rare events, like an array starting
or stopping, or resync starting or stopping
36 | pr_debug() for everything else. |
37 | |
38 | */ |
39 | |
40 | #include <linux/sched/mm.h> |
41 | #include <linux/sched/signal.h> |
42 | #include <linux/kthread.h> |
43 | #include <linux/blkdev.h> |
44 | #include <linux/blk-integrity.h> |
45 | #include <linux/badblocks.h> |
46 | #include <linux/sysctl.h> |
47 | #include <linux/seq_file.h> |
48 | #include <linux/fs.h> |
49 | #include <linux/poll.h> |
50 | #include <linux/ctype.h> |
51 | #include <linux/string.h> |
52 | #include <linux/hdreg.h> |
53 | #include <linux/proc_fs.h> |
54 | #include <linux/random.h> |
55 | #include <linux/major.h> |
56 | #include <linux/module.h> |
57 | #include <linux/reboot.h> |
58 | #include <linux/file.h> |
59 | #include <linux/compat.h> |
60 | #include <linux/delay.h> |
61 | #include <linux/raid/md_p.h> |
62 | #include <linux/raid/md_u.h> |
63 | #include <linux/raid/detect.h> |
64 | #include <linux/slab.h> |
65 | #include <linux/percpu-refcount.h> |
66 | #include <linux/part_stat.h> |
67 | |
68 | #include "md.h" |
69 | #include "md-bitmap.h" |
70 | #include "md-cluster.h" |
71 | |
72 | /* pers_list is a list of registered personalities protected by pers_lock. */ |
73 | static LIST_HEAD(pers_list); |
74 | static DEFINE_SPINLOCK(pers_lock); |
75 | |
76 | static const struct kobj_type md_ktype; |
77 | |
78 | struct md_cluster_operations *md_cluster_ops; |
79 | EXPORT_SYMBOL(md_cluster_ops); |
80 | static struct module *md_cluster_mod; |
81 | |
82 | static DECLARE_WAIT_QUEUE_HEAD(resync_wait); |
83 | static struct workqueue_struct *md_wq; |
84 | |
85 | /* |
86 | * This workqueue is used for sync_work to register new sync_thread, and for |
87 | * del_work to remove rdev, and for event_work that is only set by dm-raid. |
88 | * |
 * Note that sync_work will grab reconfig_mutex, hence never flush this
 * workqueue with reconfig_mutex grabbed.
91 | */ |
92 | static struct workqueue_struct *md_misc_wq; |
93 | struct workqueue_struct *md_bitmap_wq; |
94 | |
95 | static int remove_and_add_spares(struct mddev *mddev, |
96 | struct md_rdev *this); |
97 | static void mddev_detach(struct mddev *mddev); |
98 | static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); |
99 | static void md_wakeup_thread_directly(struct md_thread __rcu *thread); |
100 | |
101 | /* |
102 | * Default number of read corrections we'll attempt on an rdev |
103 | * before ejecting it from the array. We divide the read error |
104 | * count by 2 for every hour elapsed between read errors. |
105 | */ |
106 | #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 |
107 | /* Default safemode delay: 200 msec */ |
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ) / 1000 + 1)
109 | /* |
110 | * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' |
111 | * is 1000 KB/sec, so the extra system load does not show up that much. |
112 | * Increase it if you want to have more _guaranteed_ speed. Note that |
113 | * the RAID driver will use the maximum available bandwidth if the IO |
114 | * subsystem is idle. There is also an 'absolute maximum' reconstruction |
115 | * speed limit - in case reconstruction slows down your system despite |
116 | * idle IO detection. |
117 | * |
118 | * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. |
119 | * or /sys/block/mdX/md/sync_speed_{min,max} |
120 | */ |
121 | |
122 | static int sysctl_speed_limit_min = 1000; |
123 | static int sysctl_speed_limit_max = 200000; |
124 | static inline int speed_min(struct mddev *mddev) |
125 | { |
126 | return mddev->sync_speed_min ? |
127 | mddev->sync_speed_min : sysctl_speed_limit_min; |
128 | } |
129 | |
130 | static inline int speed_max(struct mddev *mddev) |
131 | { |
132 | return mddev->sync_speed_max ? |
133 | mddev->sync_speed_max : sysctl_speed_limit_max; |
134 | } |
135 | |
136 | static void rdev_uninit_serial(struct md_rdev *rdev) |
137 | { |
if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
return;

kvfree(rdev->serial);
142 | rdev->serial = NULL; |
143 | } |
144 | |
145 | static void rdevs_uninit_serial(struct mddev *mddev) |
146 | { |
147 | struct md_rdev *rdev; |
148 | |
149 | rdev_for_each(rdev, mddev) |
150 | rdev_uninit_serial(rdev); |
151 | } |
152 | |
153 | static int rdev_init_serial(struct md_rdev *rdev) |
154 | { |
/* serial_nums equals BARRIER_BUCKETS_NR */
156 | int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); |
157 | struct serial_in_rdev *serial = NULL; |
158 | |
159 | if (test_bit(CollisionCheck, &rdev->flags)) |
160 | return 0; |
161 | |
serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
GFP_KERNEL);
164 | if (!serial) |
165 | return -ENOMEM; |
166 | |
167 | for (i = 0; i < serial_nums; i++) { |
168 | struct serial_in_rdev *serial_tmp = &serial[i]; |
169 | |
170 | spin_lock_init(&serial_tmp->serial_lock); |
171 | serial_tmp->serial_rb = RB_ROOT_CACHED; |
172 | init_waitqueue_head(&serial_tmp->serial_io_wait); |
173 | } |
174 | |
175 | rdev->serial = serial; |
set_bit(CollisionCheck, &rdev->flags);
177 | |
178 | return 0; |
179 | } |
180 | |
181 | static int rdevs_init_serial(struct mddev *mddev) |
182 | { |
183 | struct md_rdev *rdev; |
184 | int ret = 0; |
185 | |
186 | rdev_for_each(rdev, mddev) { |
187 | ret = rdev_init_serial(rdev); |
188 | if (ret) |
189 | break; |
190 | } |
191 | |
/* Free all resources if the pool does not exist */
193 | if (ret && !mddev->serial_info_pool) |
194 | rdevs_uninit_serial(mddev); |
195 | |
196 | return ret; |
197 | } |
198 | |
199 | /* |
200 | * rdev needs to enable serial stuffs if it meets the conditions: |
201 | * 1. it is multi-queue device flaged with writemostly. |
202 | * 2. the write-behind mode is enabled. |
203 | */ |
204 | static int rdev_need_serial(struct md_rdev *rdev) |
205 | { |
206 | return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && |
207 | rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && |
208 | test_bit(WriteMostly, &rdev->flags)); |
209 | } |
210 | |
211 | /* |
212 | * Init resource for rdev(s), then create serial_info_pool if: |
213 | * 1. rdev is the first device which return true from rdev_enable_serial. |
214 | * 2. rdev is NULL, means we want to enable serialization for all rdevs. |
215 | */ |
216 | void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) |
217 | { |
218 | int ret = 0; |
219 | |
220 | if (rdev && !rdev_need_serial(rdev) && |
221 | !test_bit(CollisionCheck, &rdev->flags)) |
222 | return; |
223 | |
224 | if (!rdev) |
225 | ret = rdevs_init_serial(mddev); |
226 | else |
227 | ret = rdev_init_serial(rdev); |
228 | if (ret) |
229 | return; |
230 | |
231 | if (mddev->serial_info_pool == NULL) { |
232 | /* |
233 | * already in memalloc noio context by |
234 | * mddev_suspend() |
235 | */ |
mddev->serial_info_pool =
mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
sizeof(struct serial_info));
if (!mddev->serial_info_pool) {
rdevs_uninit_serial(mddev);
pr_err("can't alloc memory pool for serialization\n");
242 | } |
243 | } |
244 | } |
245 | |
246 | /* |
247 | * Free resource from rdev(s), and destroy serial_info_pool under conditions: |
 * 1. rdev is the last device flagged with CollisionCheck.
249 | * 2. when bitmap is destroyed while policy is not enabled. |
250 | * 3. for disable policy, the pool is destroyed only when no rdev needs it. |
251 | */ |
252 | void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) |
253 | { |
254 | if (rdev && !test_bit(CollisionCheck, &rdev->flags)) |
255 | return; |
256 | |
257 | if (mddev->serial_info_pool) { |
258 | struct md_rdev *temp; |
259 | int num = 0; /* used to track if other rdevs need the pool */ |
260 | |
261 | rdev_for_each(temp, mddev) { |
262 | if (!rdev) { |
263 | if (!mddev->serialize_policy || |
!rdev_need_serial(temp))
rdev_uninit_serial(temp);
266 | else |
267 | num++; |
268 | } else if (temp != rdev && |
269 | test_bit(CollisionCheck, &temp->flags)) |
270 | num++; |
271 | } |
272 | |
273 | if (rdev) |
274 | rdev_uninit_serial(rdev); |
275 | |
276 | if (num) |
277 | pr_info("The mempool could be used by other devices\n" ); |
278 | else { |
279 | mempool_destroy(pool: mddev->serial_info_pool); |
280 | mddev->serial_info_pool = NULL; |
281 | } |
282 | } |
283 | } |
284 | |
static struct ctl_table_header *raid_table_header;
286 | |
287 | static struct ctl_table raid_table[] = { |
288 | { |
289 | .procname = "speed_limit_min" , |
290 | .data = &sysctl_speed_limit_min, |
291 | .maxlen = sizeof(int), |
292 | .mode = S_IRUGO|S_IWUSR, |
293 | .proc_handler = proc_dointvec, |
294 | }, |
295 | { |
296 | .procname = "speed_limit_max" , |
297 | .data = &sysctl_speed_limit_max, |
298 | .maxlen = sizeof(int), |
299 | .mode = S_IRUGO|S_IWUSR, |
300 | .proc_handler = proc_dointvec, |
301 | }, |
302 | }; |
303 | |
304 | static int start_readonly; |
305 | |
306 | /* |
307 | * The original mechanism for creating an md device is to create |
308 | * a device node in /dev and to open it. This causes races with device-close. |
309 | * The preferred method is to write to the "new_array" module parameter. |
310 | * This can avoid races. |
311 | * Setting create_on_open to false disables the original mechanism |
312 | * so all the races disappear. |
313 | */ |
314 | static bool create_on_open = true; |
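
/*
 * Editor's sketch of the race-free path mentioned above; the "new_array"
 * parameter is exposed by this module (md_mod), e.g.:
 *
 *	echo md_test > /sys/module/md_mod/parameters/new_array
 *
 * creates the array node without relying on an open() of a /dev entry.
 */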
315 | |
316 | /* |
317 | * We have a system wide 'event count' that is incremented |
318 | * on any 'interesting' event, and readers of /proc/mdstat |
319 | * can use 'poll' or 'select' to find out when the event |
320 | * count increases. |
321 | * |
322 | * Events are: |
323 | * start array, stop array, error, add device, remove device, |
324 | * start build, activate spare |
325 | */ |
326 | static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); |
327 | static atomic_t md_event_count; |
328 | void md_new_event(void) |
329 | { |
atomic_inc(&md_event_count);
331 | wake_up(&md_event_waiters); |
332 | } |
333 | EXPORT_SYMBOL_GPL(md_new_event); |
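
/*
 * Hedged userspace sketch (not part of this driver) of how a monitor can
 * consume that event count via /proc/mdstat, whose poll method signals
 * POLLPRI | POLLERR once md_event_count moves past the value seen at the
 * last read:
 *
 *	int fd = open("/proc/mdstat", O_RDONLY);
 *	char buf[4096];
 *	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *
 *	read(fd, buf, sizeof(buf));		// consume current state
 *	while (poll(&pfd, 1, -1) > 0) {
 *		lseek(fd, 0, SEEK_SET);		// rewind ...
 *		read(fd, buf, sizeof(buf));	// ... and re-read the new state
 *	}
 */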
334 | |
335 | /* |
 * Enables iteration over all existing md arrays;
 * all_mddevs_lock protects this list.
338 | */ |
339 | static LIST_HEAD(all_mddevs); |
340 | static DEFINE_SPINLOCK(all_mddevs_lock); |
341 | |
342 | static bool is_md_suspended(struct mddev *mddev) |
343 | { |
return percpu_ref_is_dying(&mddev->active_io);
345 | } |
346 | /* Rather than calling directly into the personality make_request function, |
347 | * IO requests come here first so that we can check if the device is |
348 | * being suspended pending a reconfiguration. |
349 | * We hold a refcount over the call to ->make_request. By the time that |
350 | * call has finished, the bio has been linked into some internal structure |
351 | * and so is visible to ->quiesce(), so we don't need the refcount any more. |
352 | */ |
353 | static bool is_suspended(struct mddev *mddev, struct bio *bio) |
354 | { |
355 | if (is_md_suspended(mddev)) |
356 | return true; |
357 | if (bio_data_dir(bio) != WRITE) |
358 | return false; |
359 | if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) |
360 | return false; |
361 | if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) |
362 | return false; |
363 | if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) |
364 | return false; |
365 | return true; |
366 | } |
367 | |
368 | bool md_handle_request(struct mddev *mddev, struct bio *bio) |
369 | { |
370 | check_suspended: |
371 | if (is_suspended(mddev, bio)) { |
372 | DEFINE_WAIT(__wait); |
373 | /* Bail out if REQ_NOWAIT is set for the bio */ |
374 | if (bio->bi_opf & REQ_NOWAIT) { |
375 | bio_wouldblock_error(bio); |
376 | return true; |
377 | } |
378 | for (;;) { |
prepare_to_wait(&mddev->sb_wait, &__wait,
380 | TASK_UNINTERRUPTIBLE); |
381 | if (!is_suspended(mddev, bio)) |
382 | break; |
383 | schedule(); |
384 | } |
finish_wait(&mddev->sb_wait, &__wait);
}
if (!percpu_ref_tryget_live(&mddev->active_io))
goto check_suspended;

if (!mddev->pers->make_request(mddev, bio)) {
percpu_ref_put(&mddev->active_io);
if (!mddev->gendisk && mddev->pers->prepare_suspend)
return false;
goto check_suspended;
}

percpu_ref_put(&mddev->active_io);
398 | return true; |
399 | } |
400 | EXPORT_SYMBOL(md_handle_request); |
401 | |
402 | static void md_submit_bio(struct bio *bio) |
403 | { |
404 | const int rw = bio_data_dir(bio); |
405 | struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; |
406 | |
407 | if (mddev == NULL || mddev->pers == NULL) { |
408 | bio_io_error(bio); |
409 | return; |
410 | } |
411 | |
412 | if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { |
413 | bio_io_error(bio); |
414 | return; |
415 | } |
416 | |
417 | bio = bio_split_to_limits(bio); |
418 | if (!bio) |
419 | return; |
420 | |
421 | if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { |
422 | if (bio_sectors(bio) != 0) |
423 | bio->bi_status = BLK_STS_IOERR; |
424 | bio_endio(bio); |
425 | return; |
426 | } |
427 | |
/* the bio may become mergeable after being passed to the underlying layer */
429 | bio->bi_opf &= ~REQ_NOMERGE; |
430 | |
431 | md_handle_request(mddev, bio); |
432 | } |
433 | |
434 | /* |
435 | * Make sure no new requests are submitted to the device, and any requests that |
436 | * have been submitted are completely handled. |
437 | */ |
438 | int mddev_suspend(struct mddev *mddev, bool interruptible) |
439 | { |
440 | int err = 0; |
441 | |
442 | /* |
443 | * hold reconfig_mutex to wait for normal io will deadlock, because |
444 | * other context can't update super_block, and normal io can rely on |
445 | * updating super_block. |
446 | */ |
447 | lockdep_assert_not_held(&mddev->reconfig_mutex); |
448 | |
449 | if (interruptible) |
450 | err = mutex_lock_interruptible(&mddev->suspend_mutex); |
451 | else |
452 | mutex_lock(&mddev->suspend_mutex); |
453 | if (err) |
454 | return err; |
455 | |
456 | if (mddev->suspended) { |
457 | WRITE_ONCE(mddev->suspended, mddev->suspended + 1); |
mutex_unlock(&mddev->suspend_mutex);
459 | return 0; |
460 | } |
461 | |
percpu_ref_kill(&mddev->active_io);
463 | if (interruptible) |
464 | err = wait_event_interruptible(mddev->sb_wait, |
465 | percpu_ref_is_zero(&mddev->active_io)); |
466 | else |
467 | wait_event(mddev->sb_wait, |
468 | percpu_ref_is_zero(&mddev->active_io)); |
469 | if (err) { |
percpu_ref_resurrect(&mddev->active_io);
mutex_unlock(&mddev->suspend_mutex);
472 | return err; |
473 | } |
474 | |
475 | /* |
476 | * For raid456, io might be waiting for reshape to make progress, |
477 | * allow new reshape to start while waiting for io to be done to |
478 | * prevent deadlock. |
479 | */ |
480 | WRITE_ONCE(mddev->suspended, mddev->suspended + 1); |
481 | |
del_timer_sync(&mddev->safemode_timer);
/* restrict memory reclaim I/O while the raid array is suspended */
484 | mddev->noio_flag = memalloc_noio_save(); |
485 | |
mutex_unlock(&mddev->suspend_mutex);
487 | return 0; |
488 | } |
489 | EXPORT_SYMBOL_GPL(mddev_suspend); |
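
/*
 * Sketch of the expected suspend/resume pairing (illustrative; actual
 * callers live elsewhere in the md code):
 *
 *	err = mddev_suspend(mddev, true);
 *	if (err)
 *		return err;
 *	... reconfigure the array while no normal IO is in flight ...
 *	mddev_resume(mddev);
 */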
490 | |
491 | static void __mddev_resume(struct mddev *mddev, bool recovery_needed) |
492 | { |
493 | lockdep_assert_not_held(&mddev->reconfig_mutex); |
494 | |
495 | mutex_lock(&mddev->suspend_mutex); |
496 | WRITE_ONCE(mddev->suspended, mddev->suspended - 1); |
497 | if (mddev->suspended) { |
mutex_unlock(&mddev->suspend_mutex);
499 | return; |
500 | } |
501 | |
/* entered the memalloc scope from mddev_suspend() */
memalloc_noio_restore(mddev->noio_flag);
504 | |
percpu_ref_resurrect(&mddev->active_io);
wake_up(&mddev->sb_wait);

if (recovery_needed)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */

mutex_unlock(&mddev->suspend_mutex);
514 | } |
515 | |
516 | void mddev_resume(struct mddev *mddev) |
517 | { |
return __mddev_resume(mddev, true);
519 | } |
520 | EXPORT_SYMBOL_GPL(mddev_resume); |
521 | |
/* sync bdev before setting device to readonly or stopping raid */
523 | static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num) |
524 | { |
525 | mutex_lock(&mddev->open_mutex); |
if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
mutex_unlock(&mddev->open_mutex);
return -EBUSY;
}
if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
mutex_unlock(&mddev->open_mutex);
return -EBUSY;
}
mutex_unlock(&mddev->open_mutex);

sync_blockdev(mddev->gendisk->part0);
537 | return 0; |
538 | } |
539 | |
540 | /* |
541 | * Generic flush handling for md |
542 | */ |
543 | |
544 | static void md_end_flush(struct bio *bio) |
545 | { |
546 | struct md_rdev *rdev = bio->bi_private; |
547 | struct mddev *mddev = rdev->mddev; |
548 | |
549 | bio_put(bio); |
550 | |
551 | rdev_dec_pending(rdev, mddev); |
552 | |
if (atomic_dec_and_test(&mddev->flush_pending)) {
/* The pair is percpu_ref_get() from md_flush_request() */
percpu_ref_put(&mddev->active_io);

/* The pre-request flush has finished */
queue_work(md_wq, &mddev->flush_work);
559 | } |
560 | } |
561 | |
562 | static void md_submit_flush_data(struct work_struct *ws); |
563 | |
564 | static void submit_flushes(struct work_struct *ws) |
565 | { |
566 | struct mddev *mddev = container_of(ws, struct mddev, flush_work); |
567 | struct md_rdev *rdev; |
568 | |
569 | mddev->start_flush = ktime_get_boottime(); |
570 | INIT_WORK(&mddev->flush_work, md_submit_flush_data); |
atomic_set(&mddev->flush_pending, 1);
572 | rcu_read_lock(); |
573 | rdev_for_each_rcu(rdev, mddev) |
574 | if (rdev->raid_disk >= 0 && |
575 | !test_bit(Faulty, &rdev->flags)) { |
576 | struct bio *bi; |
577 | |
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
bi = bio_alloc_bioset(rdev->bdev, 0,
REQ_OP_WRITE | REQ_PREFLUSH,
GFP_NOIO, &mddev->bio_set);
bi->bi_end_io = md_end_flush;
bi->bi_private = rdev;
atomic_inc(&mddev->flush_pending);
submit_bio(bi);
587 | rcu_read_lock(); |
588 | } |
589 | rcu_read_unlock(); |
if (atomic_dec_and_test(&mddev->flush_pending)) {
/* The pair is percpu_ref_get() from md_flush_request() */
percpu_ref_put(&mddev->active_io);

queue_work(md_wq, &mddev->flush_work);
595 | } |
596 | } |
597 | |
598 | static void md_submit_flush_data(struct work_struct *ws) |
599 | { |
600 | struct mddev *mddev = container_of(ws, struct mddev, flush_work); |
601 | struct bio *bio = mddev->flush_bio; |
602 | |
603 | /* |
604 | * must reset flush_bio before calling into md_handle_request to avoid a |
605 | * deadlock, because other bios passed md_handle_request suspend check |
606 | * could wait for this and below md_handle_request could wait for those |
607 | * bios because of suspend check |
608 | */ |
spin_lock_irq(&mddev->lock);
mddev->prev_flush_start = mddev->start_flush;
mddev->flush_bio = NULL;
spin_unlock_irq(&mddev->lock);
613 | wake_up(&mddev->sb_wait); |
614 | |
615 | if (bio->bi_iter.bi_size == 0) { |
616 | /* an empty barrier - all done */ |
617 | bio_endio(bio); |
618 | } else { |
619 | bio->bi_opf &= ~REQ_PREFLUSH; |
620 | md_handle_request(mddev, bio); |
621 | } |
622 | } |
623 | |
624 | /* |
625 | * Manages consolidation of flushes and submitting any flushes needed for |
626 | * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is |
627 | * being finished in another context. Returns false if the flushing is |
628 | * complete but still needs the I/O portion of the bio to be processed. |
629 | */ |
630 | bool md_flush_request(struct mddev *mddev, struct bio *bio) |
631 | { |
632 | ktime_t req_start = ktime_get_boottime(); |
spin_lock_irq(&mddev->lock);
634 | /* flush requests wait until ongoing flush completes, |
635 | * hence coalescing all the pending requests. |
636 | */ |
637 | wait_event_lock_irq(mddev->sb_wait, |
638 | !mddev->flush_bio || |
639 | ktime_before(req_start, mddev->prev_flush_start), |
640 | mddev->lock); |
641 | /* new request after previous flush is completed */ |
if (ktime_after(req_start, mddev->prev_flush_start)) {
643 | WARN_ON(mddev->flush_bio); |
644 | /* |
645 | * Grab a reference to make sure mddev_suspend() will wait for |
646 | * this flush to be done. |
647 | * |
 * md_flush_request() is called under md_handle_request() and
649 | * 'active_io' is already grabbed, hence percpu_ref_is_zero() |
650 | * won't pass, percpu_ref_tryget_live() can't be used because |
651 | * percpu_ref_kill() can be called by mddev_suspend() |
652 | * concurrently. |
653 | */ |
654 | WARN_ON(percpu_ref_is_zero(&mddev->active_io)); |
percpu_ref_get(&mddev->active_io);
mddev->flush_bio = bio;
bio = NULL;
}
spin_unlock_irq(&mddev->lock);

if (!bio) {
INIT_WORK(&mddev->flush_work, submit_flushes);
queue_work(md_wq, &mddev->flush_work);
664 | } else { |
665 | /* flush was performed for some other bio while we waited. */ |
666 | if (bio->bi_iter.bi_size == 0) |
667 | /* an empty barrier - all done */ |
668 | bio_endio(bio); |
669 | else { |
670 | bio->bi_opf &= ~REQ_PREFLUSH; |
671 | return false; |
672 | } |
673 | } |
674 | return true; |
675 | } |
676 | EXPORT_SYMBOL(md_flush_request); |
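
/*
 * Typical caller pattern in a personality's ->make_request (a sketch of how
 * e.g. raid1 uses this helper; not part of this file):
 *
 *	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
 *	    && md_flush_request(mddev, bio))
 *		return true;	// bio finished, or being finished elsewhere
 *	// otherwise the flush part is done; submit the data part of the bio
 */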
677 | |
678 | static inline struct mddev *mddev_get(struct mddev *mddev) |
679 | { |
680 | lockdep_assert_held(&all_mddevs_lock); |
681 | |
682 | if (test_bit(MD_DELETED, &mddev->flags)) |
683 | return NULL; |
atomic_inc(&mddev->active);
685 | return mddev; |
686 | } |
687 | |
688 | static void mddev_delayed_delete(struct work_struct *ws); |
689 | |
690 | static void __mddev_put(struct mddev *mddev) |
691 | { |
if (mddev->raid_disks || !list_empty(&mddev->disks) ||
mddev->ctime || mddev->hold_active)
return;

/* Array is not configured at all, and not held active, so destroy it */
set_bit(MD_DELETED, &mddev->flags);

/*
 * Call queue_work inside the spinlock so that flush_workqueue() after
 * mddev_find will succeed in waiting for the work to be done.
 */
queue_work(md_misc_wq, &mddev->del_work);
704 | } |
705 | |
706 | void mddev_put(struct mddev *mddev) |
707 | { |
708 | if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) |
709 | return; |
710 | |
711 | __mddev_put(mddev); |
spin_unlock(&all_mddevs_lock);
713 | } |
714 | |
715 | static void md_safemode_timeout(struct timer_list *t); |
716 | static void md_start_sync(struct work_struct *ws); |
717 | |
718 | static void active_io_release(struct percpu_ref *ref) |
719 | { |
720 | struct mddev *mddev = container_of(ref, struct mddev, active_io); |
721 | |
722 | wake_up(&mddev->sb_wait); |
723 | } |
724 | |
725 | static void no_op(struct percpu_ref *r) {} |
726 | |
727 | int mddev_init(struct mddev *mddev) |
728 | { |
729 | |
if (percpu_ref_init(&mddev->active_io, active_io_release,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
return -ENOMEM;

if (percpu_ref_init(&mddev->writes_pending, no_op,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
percpu_ref_exit(&mddev->active_io);
return -ENOMEM;
}

/* We want to start with the refcount at zero */
percpu_ref_put(&mddev->writes_pending);

mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->sync_mutex);
mutex_init(&mddev->suspend_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
INIT_LIST_HEAD(&mddev->deleting);
timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->sync_seq, 0);
spin_lock_init(&mddev->lock);
atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait);
mddev->reshape_position = MaxSector;
mddev->reshape_backwards = 0;
mddev->last_sync_action = "none";
762 | mddev->resync_min = 0; |
763 | mddev->resync_max = MaxSector; |
764 | mddev->level = LEVEL_NONE; |
765 | |
766 | INIT_WORK(&mddev->sync_work, md_start_sync); |
767 | INIT_WORK(&mddev->del_work, mddev_delayed_delete); |
768 | |
769 | return 0; |
770 | } |
771 | EXPORT_SYMBOL_GPL(mddev_init); |
772 | |
773 | void mddev_destroy(struct mddev *mddev) |
774 | { |
percpu_ref_exit(&mddev->active_io);
percpu_ref_exit(&mddev->writes_pending);
777 | } |
778 | EXPORT_SYMBOL_GPL(mddev_destroy); |
779 | |
780 | static struct mddev *mddev_find_locked(dev_t unit) |
781 | { |
782 | struct mddev *mddev; |
783 | |
784 | list_for_each_entry(mddev, &all_mddevs, all_mddevs) |
785 | if (mddev->unit == unit) |
786 | return mddev; |
787 | |
788 | return NULL; |
789 | } |
790 | |
791 | /* find an unused unit number */ |
792 | static dev_t mddev_alloc_unit(void) |
793 | { |
794 | static int next_minor = 512; |
795 | int start = next_minor; |
bool is_free = false;
797 | dev_t dev = 0; |
798 | |
799 | while (!is_free) { |
800 | dev = MKDEV(MD_MAJOR, next_minor); |
801 | next_minor++; |
802 | if (next_minor > MINORMASK) |
803 | next_minor = 0; |
804 | if (next_minor == start) |
805 | return 0; /* Oh dear, all in use. */ |
is_free = !mddev_find_locked(dev);
807 | } |
808 | |
809 | return dev; |
810 | } |
811 | |
812 | static struct mddev *mddev_alloc(dev_t unit) |
813 | { |
814 | struct mddev *new; |
815 | int error; |
816 | |
817 | if (unit && MAJOR(unit) != MD_MAJOR) |
818 | unit &= ~((1 << MdpMinorShift) - 1); |
819 | |
new = kzalloc(sizeof(*new), GFP_KERNEL);
if (!new)
return ERR_PTR(-ENOMEM);
823 | |
824 | error = mddev_init(new); |
825 | if (error) |
826 | goto out_free_new; |
827 | |
spin_lock(&all_mddevs_lock);
829 | if (unit) { |
830 | error = -EEXIST; |
831 | if (mddev_find_locked(unit)) |
832 | goto out_destroy_new; |
833 | new->unit = unit; |
834 | if (MAJOR(unit) == MD_MAJOR) |
835 | new->md_minor = MINOR(unit); |
836 | else |
837 | new->md_minor = MINOR(unit) >> MdpMinorShift; |
838 | new->hold_active = UNTIL_IOCTL; |
839 | } else { |
840 | error = -ENODEV; |
841 | new->unit = mddev_alloc_unit(); |
842 | if (!new->unit) |
843 | goto out_destroy_new; |
844 | new->md_minor = MINOR(new->unit); |
845 | new->hold_active = UNTIL_STOP; |
846 | } |
847 | |
list_add(&new->all_mddevs, &all_mddevs);
spin_unlock(&all_mddevs_lock);
850 | return new; |
851 | |
852 | out_destroy_new: |
spin_unlock(&all_mddevs_lock);
854 | mddev_destroy(new); |
855 | out_free_new: |
kfree(new);
857 | return ERR_PTR(error); |
858 | } |
859 | |
860 | static void mddev_free(struct mddev *mddev) |
861 | { |
spin_lock(&all_mddevs_lock);
list_del(&mddev->all_mddevs);
spin_unlock(&all_mddevs_lock);

mddev_destroy(mddev);
kfree(mddev);
868 | } |
869 | |
870 | static const struct attribute_group md_redundancy_group; |
871 | |
872 | void mddev_unlock(struct mddev *mddev) |
873 | { |
874 | struct md_rdev *rdev; |
875 | struct md_rdev *tmp; |
876 | LIST_HEAD(delete); |
877 | |
if (!list_empty(&mddev->deleting))
list_splice_init(&mddev->deleting, &delete);
880 | |
881 | if (mddev->to_remove) { |
882 | /* These cannot be removed under reconfig_mutex as |
883 | * an access to the files will try to take reconfig_mutex |
884 | * while holding the file unremovable, which leads to |
885 | * a deadlock. |
 * So set sysfs_active while the removal is happening,
 * and anything else which might set ->to_remove or may
888 | * otherwise change the sysfs namespace will fail with |
889 | * -EBUSY if sysfs_active is still set. |
890 | * We set sysfs_active under reconfig_mutex and elsewhere |
891 | * test it under the same mutex to ensure its correct value |
892 | * is seen. |
893 | */ |
894 | const struct attribute_group *to_remove = mddev->to_remove; |
895 | mddev->to_remove = NULL; |
896 | mddev->sysfs_active = 1; |
mutex_unlock(&mddev->reconfig_mutex);
898 | |
899 | if (mddev->kobj.sd) { |
900 | if (to_remove != &md_redundancy_group) |
sysfs_remove_group(&mddev->kobj, to_remove);
if (mddev->pers == NULL ||
mddev->pers->sync_request == NULL) {
sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
if (mddev->sysfs_action)
sysfs_put(mddev->sysfs_action);
if (mddev->sysfs_completed)
sysfs_put(mddev->sysfs_completed);
if (mddev->sysfs_degraded)
sysfs_put(mddev->sysfs_degraded);
911 | mddev->sysfs_action = NULL; |
912 | mddev->sysfs_completed = NULL; |
913 | mddev->sysfs_degraded = NULL; |
914 | } |
915 | } |
916 | mddev->sysfs_active = 0; |
917 | } else |
mutex_unlock(&mddev->reconfig_mutex);

md_wakeup_thread(mddev->thread);
921 | wake_up(&mddev->sb_wait); |
922 | |
923 | list_for_each_entry_safe(rdev, tmp, &delete, same_set) { |
list_del_init(&rdev->same_set);
kobject_del(&rdev->kobj);
926 | export_rdev(rdev, mddev); |
927 | } |
928 | } |
929 | EXPORT_SYMBOL_GPL(mddev_unlock); |
930 | |
931 | struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) |
932 | { |
933 | struct md_rdev *rdev; |
934 | |
935 | rdev_for_each_rcu(rdev, mddev) |
936 | if (rdev->desc_nr == nr) |
937 | return rdev; |
938 | |
939 | return NULL; |
940 | } |
941 | EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); |
942 | |
943 | static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) |
944 | { |
945 | struct md_rdev *rdev; |
946 | |
947 | rdev_for_each(rdev, mddev) |
948 | if (rdev->bdev->bd_dev == dev) |
949 | return rdev; |
950 | |
951 | return NULL; |
952 | } |
953 | |
954 | struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) |
955 | { |
956 | struct md_rdev *rdev; |
957 | |
958 | rdev_for_each_rcu(rdev, mddev) |
959 | if (rdev->bdev->bd_dev == dev) |
960 | return rdev; |
961 | |
962 | return NULL; |
963 | } |
964 | EXPORT_SYMBOL_GPL(md_find_rdev_rcu); |
965 | |
966 | static struct md_personality *find_pers(int level, char *clevel) |
967 | { |
968 | struct md_personality *pers; |
969 | list_for_each_entry(pers, &pers_list, list) { |
970 | if (level != LEVEL_NONE && pers->level == level) |
971 | return pers; |
if (strcmp(pers->name, clevel) == 0)
973 | return pers; |
974 | } |
975 | return NULL; |
976 | } |
977 | |
978 | /* return the offset of the super block in 512byte sectors */ |
979 | static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) |
980 | { |
981 | return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); |
982 | } |
983 | |
984 | static int alloc_disk_sb(struct md_rdev *rdev) |
985 | { |
986 | rdev->sb_page = alloc_page(GFP_KERNEL); |
987 | if (!rdev->sb_page) |
988 | return -ENOMEM; |
989 | return 0; |
990 | } |
991 | |
992 | void md_rdev_clear(struct md_rdev *rdev) |
993 | { |
994 | if (rdev->sb_page) { |
put_page(rdev->sb_page);
rdev->sb_loaded = 0;
rdev->sb_page = NULL;
rdev->sb_start = 0;
rdev->sectors = 0;
}
if (rdev->bb_page) {
put_page(rdev->bb_page);
rdev->bb_page = NULL;
}
badblocks_exit(&rdev->badblocks);
1006 | } |
1007 | EXPORT_SYMBOL_GPL(md_rdev_clear); |
1008 | |
1009 | static void super_written(struct bio *bio) |
1010 | { |
1011 | struct md_rdev *rdev = bio->bi_private; |
1012 | struct mddev *mddev = rdev->mddev; |
1013 | |
1014 | if (bio->bi_status) { |
1015 | pr_err("md: %s gets error=%d\n" , __func__, |
1016 | blk_status_to_errno(bio->bi_status)); |
1017 | md_error(mddev, rdev); |
1018 | if (!test_bit(Faulty, &rdev->flags) |
1019 | && (bio->bi_opf & MD_FAILFAST)) { |
1020 | set_bit(nr: MD_SB_NEED_REWRITE, addr: &mddev->sb_flags); |
1021 | set_bit(nr: LastDev, addr: &rdev->flags); |
1022 | } |
1023 | } else |
1024 | clear_bit(nr: LastDev, addr: &rdev->flags); |
1025 | |
1026 | bio_put(bio); |
1027 | |
1028 | rdev_dec_pending(rdev, mddev); |
1029 | |
1030 | if (atomic_dec_and_test(v: &mddev->pending_writes)) |
1031 | wake_up(&mddev->sb_wait); |
1032 | } |
1033 | |
1034 | void md_super_write(struct mddev *mddev, struct md_rdev *rdev, |
1035 | sector_t sector, int size, struct page *page) |
1036 | { |
1037 | /* write first size bytes of page to sector of rdev |
1038 | * Increment mddev->pending_writes before returning |
1039 | * and decrement it on completion, waking up sb_wait |
1040 | * if zero is reached. |
1041 | * If an error occurred, call md_error |
1042 | */ |
1043 | struct bio *bio; |
1044 | |
1045 | if (!page) |
1046 | return; |
1047 | |
1048 | if (test_bit(Faulty, &rdev->flags)) |
1049 | return; |
1050 | |
bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
1,
REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
| REQ_PREFLUSH | REQ_FUA,
GFP_NOIO, &mddev->sync_set);
1056 | |
atomic_inc(&rdev->nr_pending);

bio->bi_iter.bi_sector = sector;
__bio_add_page(bio, page, size, 0);
1061 | bio->bi_private = rdev; |
1062 | bio->bi_end_io = super_written; |
1063 | |
1064 | if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && |
1065 | test_bit(FailFast, &rdev->flags) && |
1066 | !test_bit(LastDev, &rdev->flags)) |
1067 | bio->bi_opf |= MD_FAILFAST; |
1068 | |
atomic_inc(&mddev->pending_writes);
1070 | submit_bio(bio); |
1071 | } |
1072 | |
1073 | int md_super_wait(struct mddev *mddev) |
1074 | { |
1075 | /* wait for all superblock writes that were scheduled to complete */ |
1076 | wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); |
if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
1078 | return -EAGAIN; |
1079 | return 0; |
1080 | } |
1081 | |
1082 | int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, |
1083 | struct page *page, blk_opf_t opf, bool metadata_op) |
1084 | { |
1085 | struct bio bio; |
1086 | struct bio_vec bvec; |
1087 | |
1088 | if (metadata_op && rdev->meta_bdev) |
bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf);
else
bio_init(&bio, rdev->bdev, &bvec, 1, opf);
1092 | |
1093 | if (metadata_op) |
1094 | bio.bi_iter.bi_sector = sector + rdev->sb_start; |
1095 | else if (rdev->mddev->reshape_position != MaxSector && |
1096 | (rdev->mddev->reshape_backwards == |
1097 | (sector >= rdev->mddev->reshape_position))) |
1098 | bio.bi_iter.bi_sector = sector + rdev->new_data_offset; |
1099 | else |
1100 | bio.bi_iter.bi_sector = sector + rdev->data_offset; |
__bio_add_page(&bio, page, size, 0);

submit_bio_wait(&bio);
1104 | |
1105 | return !bio.bi_status; |
1106 | } |
1107 | EXPORT_SYMBOL_GPL(sync_page_io); |
1108 | |
1109 | static int read_disk_sb(struct md_rdev *rdev, int size) |
1110 | { |
1111 | if (rdev->sb_loaded) |
1112 | return 0; |
1113 | |
1114 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) |
1115 | goto fail; |
1116 | rdev->sb_loaded = 1; |
1117 | return 0; |
1118 | |
1119 | fail: |
1120 | pr_err("md: disabled device %pg, could not read superblock.\n" , |
1121 | rdev->bdev); |
1122 | return -EINVAL; |
1123 | } |
1124 | |
1125 | static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) |
1126 | { |
1127 | return sb1->set_uuid0 == sb2->set_uuid0 && |
1128 | sb1->set_uuid1 == sb2->set_uuid1 && |
1129 | sb1->set_uuid2 == sb2->set_uuid2 && |
1130 | sb1->set_uuid3 == sb2->set_uuid3; |
1131 | } |
1132 | |
1133 | static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) |
1134 | { |
1135 | int ret; |
1136 | mdp_super_t *tmp1, *tmp2; |
1137 | |
tmp1 = kmalloc(sizeof(*tmp1), GFP_KERNEL);
tmp2 = kmalloc(sizeof(*tmp2), GFP_KERNEL);
1140 | |
1141 | if (!tmp1 || !tmp2) { |
1142 | ret = 0; |
1143 | goto abort; |
1144 | } |
1145 | |
1146 | *tmp1 = *sb1; |
1147 | *tmp2 = *sb2; |
1148 | |
1149 | /* |
1150 | * nr_disks is not constant |
1151 | */ |
1152 | tmp1->nr_disks = 0; |
1153 | tmp2->nr_disks = 0; |
1154 | |
ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
kfree(tmp1);
kfree(tmp2);
1159 | return ret; |
1160 | } |
1161 | |
1162 | static u32 md_csum_fold(u32 csum) |
1163 | { |
1164 | csum = (csum & 0xffff) + (csum >> 16); |
1165 | return (csum & 0xffff) + (csum >> 16); |
1166 | } |
1167 | |
1168 | static unsigned int calc_sb_csum(mdp_super_t *sb) |
1169 | { |
1170 | u64 newcsum = 0; |
1171 | u32 *sb32 = (u32*)sb; |
1172 | int i; |
1173 | unsigned int disk_csum, csum; |
1174 | |
1175 | disk_csum = sb->sb_csum; |
1176 | sb->sb_csum = 0; |
1177 | |
1178 | for (i = 0; i < MD_SB_BYTES/4 ; i++) |
1179 | newcsum += sb32[i]; |
1180 | csum = (newcsum & 0xffffffff) + (newcsum>>32); |
1181 | |
1182 | #ifdef CONFIG_ALPHA |
1183 | /* This used to use csum_partial, which was wrong for several |
1184 | * reasons including that different results are returned on |
1185 | * different architectures. It isn't critical that we get exactly |
1186 | * the same return value as before (we always csum_fold before |
1187 | * testing, and that removes any differences). However as we |
1188 | * know that csum_partial always returned a 16bit value on |
1189 | * alphas, do a fold to maximise conformity to previous behaviour. |
1190 | */ |
1191 | sb->sb_csum = md_csum_fold(disk_csum); |
1192 | #else |
1193 | sb->sb_csum = disk_csum; |
1194 | #endif |
1195 | return csum; |
1196 | } |
1197 | |
1198 | /* |
1199 | * Handle superblock details. |
1200 | * We want to be able to handle multiple superblock formats |
1201 | * so we have a common interface to them all, and an array of |
1202 | * different handlers. |
1203 | * We rely on user-space to write the initial superblock, and support |
1204 | * reading and updating of superblocks. |
1205 | * Interface methods are: |
1206 | * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) |
1207 | * loads and validates a superblock on dev. |
1208 | * if refdev != NULL, compare superblocks on both devices |
1209 | * Return: |
1210 | * 0 - dev has a superblock that is compatible with refdev |
1211 | * 1 - dev has a superblock that is compatible and newer than refdev |
1212 | * so dev should be used as the refdev in future |
1213 | * -EINVAL superblock incompatible or invalid |
1214 | * -othererror e.g. -EIO |
1215 | * |
1216 | * int validate_super(struct mddev *mddev, struct md_rdev *dev) |
1217 | * Verify that dev is acceptable into mddev. |
1218 | * The first time, mddev->raid_disks will be 0, and data from |
1219 | * dev should be merged in. Subsequent calls check that dev |
1220 | * is new enough. Return 0 or -EINVAL |
1221 | * |
1222 | * void sync_super(struct mddev *mddev, struct md_rdev *dev) |
1223 | * Update the superblock for rdev with data in mddev |
1224 | * This does not write to disc. |
1225 | * |
1226 | */ |
1227 | |
1228 | struct super_type { |
1229 | char *name; |
1230 | struct module *owner; |
1231 | int (*load_super)(struct md_rdev *rdev, |
1232 | struct md_rdev *refdev, |
1233 | int minor_version); |
1234 | int (*validate_super)(struct mddev *mddev, |
1235 | struct md_rdev *freshest, |
1236 | struct md_rdev *rdev); |
1237 | void (*sync_super)(struct mddev *mddev, |
1238 | struct md_rdev *rdev); |
1239 | unsigned long long (*rdev_size_change)(struct md_rdev *rdev, |
1240 | sector_t num_sectors); |
1241 | int (*allow_new_offset)(struct md_rdev *rdev, |
1242 | unsigned long long new_offset); |
1243 | }; |
1244 | |
1245 | /* |
1246 | * Check that the given mddev has no bitmap. |
1247 | * |
1248 | * This function is called from the run method of all personalities that do not |
1249 | * support bitmaps. It prints an error message and returns non-zero if mddev |
1250 | * has a bitmap. Otherwise, it returns 0. |
1251 | * |
1252 | */ |
1253 | int md_check_no_bitmap(struct mddev *mddev) |
1254 | { |
1255 | if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) |
1256 | return 0; |
1257 | pr_warn("%s: bitmaps are not supported for %s\n" , |
1258 | mdname(mddev), mddev->pers->name); |
1259 | return 1; |
1260 | } |
1261 | EXPORT_SYMBOL(md_check_no_bitmap); |
1262 | |
1263 | /* |
1264 | * load_super for 0.90.0 |
1265 | */ |
1266 | static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) |
1267 | { |
1268 | mdp_super_t *sb; |
1269 | int ret; |
1270 | bool spare_disk = true; |
1271 | |
1272 | /* |
1273 | * Calculate the position of the superblock (512byte sectors), |
1274 | * it's at the end of the disk. |
1275 | * |
1276 | * It also happens to be a multiple of 4Kb. |
1277 | */ |
1278 | rdev->sb_start = calc_dev_sboffset(rdev); |
1279 | |
1280 | ret = read_disk_sb(rdev, MD_SB_BYTES); |
1281 | if (ret) |
1282 | return ret; |
1283 | |
1284 | ret = -EINVAL; |
1285 | |
1286 | sb = page_address(rdev->sb_page); |
1287 | |
1288 | if (sb->md_magic != MD_SB_MAGIC) { |
1289 | pr_warn("md: invalid raid superblock magic on %pg\n" , |
1290 | rdev->bdev); |
1291 | goto abort; |
1292 | } |
1293 | |
1294 | if (sb->major_version != 0 || |
1295 | sb->minor_version < 90 || |
1296 | sb->minor_version > 91) { |
1297 | pr_warn("Bad version number %d.%d on %pg\n" , |
1298 | sb->major_version, sb->minor_version, rdev->bdev); |
1299 | goto abort; |
1300 | } |
1301 | |
1302 | if (sb->raid_disks <= 0) |
1303 | goto abort; |
1304 | |
if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
1307 | goto abort; |
1308 | } |
1309 | |
1310 | rdev->preferred_minor = sb->md_minor; |
1311 | rdev->data_offset = 0; |
1312 | rdev->new_data_offset = 0; |
1313 | rdev->sb_size = MD_SB_BYTES; |
1314 | rdev->badblocks.shift = -1; |
1315 | |
1316 | rdev->desc_nr = sb->this_disk.number; |
1317 | |
1318 | /* not spare disk */ |
1319 | if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && |
1320 | sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) |
1321 | spare_disk = false; |
1322 | |
1323 | if (!refdev) { |
1324 | if (!spare_disk) |
1325 | ret = 1; |
1326 | else |
1327 | ret = 0; |
1328 | } else { |
1329 | __u64 ev1, ev2; |
1330 | mdp_super_t *refsb = page_address(refdev->sb_page); |
if (!md_uuid_equal(refsb, sb)) {
pr_warn("md: %pg has different UUID to %pg\n",
rdev->bdev, refdev->bdev);
goto abort;
}
if (!md_sb_equal(refsb, sb)) {
pr_warn("md: %pg has same UUID but different superblock to %pg\n",
rdev->bdev, refdev->bdev);
goto abort;
}
ev1 = md_event(sb);
ev2 = md_event(refsb);
1343 | |
1344 | if (!spare_disk && ev1 > ev2) |
1345 | ret = 1; |
1346 | else |
1347 | ret = 0; |
1348 | } |
1349 | rdev->sectors = rdev->sb_start; |
1350 | /* Limit to 4TB as metadata cannot record more than that. |
1351 | * (not needed for Linear and RAID0 as metadata doesn't |
1352 | * record this size) |
1353 | */ |
1354 | if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) |
1355 | rdev->sectors = (sector_t)(2ULL << 32) - 2; |
1356 | |
1357 | if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) |
1358 | /* "this cannot possibly happen" ... */ |
1359 | ret = -EINVAL; |
1360 | |
1361 | abort: |
1362 | return ret; |
1363 | } |
1364 | |
1365 | /* |
1366 | * validate_super for 0.90.0 |
1367 | * note: we are not using "freshest" for 0.9 superblock |
1368 | */ |
1369 | static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) |
1370 | { |
1371 | mdp_disk_t *desc; |
1372 | mdp_super_t *sb = page_address(rdev->sb_page); |
1373 | __u64 ev1 = md_event(sb); |
1374 | |
1375 | rdev->raid_disk = -1; |
clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
clear_bit(Bitmap_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
1380 | |
1381 | if (mddev->raid_disks == 0) { |
1382 | mddev->major_version = 0; |
1383 | mddev->minor_version = sb->minor_version; |
1384 | mddev->patch_version = sb->patch_version; |
1385 | mddev->external = 0; |
1386 | mddev->chunk_sectors = sb->chunk_size >> 9; |
1387 | mddev->ctime = sb->ctime; |
1388 | mddev->utime = sb->utime; |
1389 | mddev->level = sb->level; |
1390 | mddev->clevel[0] = 0; |
1391 | mddev->layout = sb->layout; |
1392 | mddev->raid_disks = sb->raid_disks; |
1393 | mddev->dev_sectors = ((sector_t)sb->size) * 2; |
1394 | mddev->events = ev1; |
1395 | mddev->bitmap_info.offset = 0; |
1396 | mddev->bitmap_info.space = 0; |
1397 | /* bitmap can use 60 K after the 4K superblocks */ |
1398 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
1399 | mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); |
1400 | mddev->reshape_backwards = 0; |
1401 | |
1402 | if (mddev->minor_version >= 91) { |
1403 | mddev->reshape_position = sb->reshape_position; |
1404 | mddev->delta_disks = sb->delta_disks; |
1405 | mddev->new_level = sb->new_level; |
1406 | mddev->new_layout = sb->new_layout; |
1407 | mddev->new_chunk_sectors = sb->new_chunk >> 9; |
1408 | if (mddev->delta_disks < 0) |
1409 | mddev->reshape_backwards = 1; |
1410 | } else { |
1411 | mddev->reshape_position = MaxSector; |
1412 | mddev->delta_disks = 0; |
1413 | mddev->new_level = mddev->level; |
1414 | mddev->new_layout = mddev->layout; |
1415 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
1416 | } |
1417 | if (mddev->level == 0) |
1418 | mddev->layout = -1; |
1419 | |
1420 | if (sb->state & (1<<MD_SB_CLEAN)) |
1421 | mddev->recovery_cp = MaxSector; |
1422 | else { |
1423 | if (sb->events_hi == sb->cp_events_hi && |
1424 | sb->events_lo == sb->cp_events_lo) { |
1425 | mddev->recovery_cp = sb->recovery_cp; |
1426 | } else |
1427 | mddev->recovery_cp = 0; |
1428 | } |
1429 | |
1430 | memcpy(mddev->uuid+0, &sb->set_uuid0, 4); |
1431 | memcpy(mddev->uuid+4, &sb->set_uuid1, 4); |
1432 | memcpy(mddev->uuid+8, &sb->set_uuid2, 4); |
memcpy(mddev->uuid+12, &sb->set_uuid3, 4);
1434 | |
1435 | mddev->max_disks = MD_SB_DISKS; |
1436 | |
1437 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && |
1438 | mddev->bitmap_info.file == NULL) { |
1439 | mddev->bitmap_info.offset = |
1440 | mddev->bitmap_info.default_offset; |
1441 | mddev->bitmap_info.space = |
1442 | mddev->bitmap_info.default_space; |
1443 | } |
1444 | |
1445 | } else if (mddev->pers == NULL) { |
1446 | /* Insist on good event counter while assembling, except |
1447 | * for spares (which don't need an event count) */ |
1448 | ++ev1; |
1449 | if (sb->disks[rdev->desc_nr].state & ( |
1450 | (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) |
1451 | if (ev1 < mddev->events) |
1452 | return -EINVAL; |
1453 | } else if (mddev->bitmap) { |
1454 | /* if adding to array with a bitmap, then we can accept an |
1455 | * older device ... but not too old. |
1456 | */ |
1457 | if (ev1 < mddev->bitmap->events_cleared) |
1458 | return 0; |
1459 | if (ev1 < mddev->events) |
set_bit(Bitmap_sync, &rdev->flags);
1461 | } else { |
1462 | if (ev1 < mddev->events) |
1463 | /* just a hot-add of a new device, leave raid_disk at -1 */ |
1464 | return 0; |
1465 | } |
1466 | |
1467 | desc = sb->disks + rdev->desc_nr; |
1468 | |
1469 | if (desc->state & (1<<MD_DISK_FAULTY)) |
set_bit(Faulty, &rdev->flags);
else if (desc->state & (1<<MD_DISK_SYNC)) {
set_bit(In_sync, &rdev->flags);
1473 | rdev->raid_disk = desc->raid_disk; |
1474 | rdev->saved_raid_disk = desc->raid_disk; |
1475 | } else if (desc->state & (1<<MD_DISK_ACTIVE)) { |
1476 | /* active but not in sync implies recovery up to |
1477 | * reshape position. We don't know exactly where |
1478 | * that is, so set to zero for now |
1479 | */ |
1480 | if (mddev->minor_version >= 91) { |
1481 | rdev->recovery_offset = 0; |
1482 | rdev->raid_disk = desc->raid_disk; |
1483 | } |
1484 | } |
1485 | if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) |
set_bit(WriteMostly, &rdev->flags);
if (desc->state & (1<<MD_DISK_FAILFAST))
set_bit(FailFast, &rdev->flags);
1489 | return 0; |
1490 | } |
1491 | |
1492 | /* |
1493 | * sync_super for 0.90.0 |
1494 | */ |
1495 | static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) |
1496 | { |
1497 | mdp_super_t *sb; |
1498 | struct md_rdev *rdev2; |
1499 | int next_spare = mddev->raid_disks; |
1500 | |
1501 | /* make rdev->sb match mddev data.. |
1502 | * |
1503 | * 1/ zero out disks |
1504 | * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); |
1505 | * 3/ any empty disks < next_spare become removed |
1506 | * |
1507 | * disks[0] gets initialised to REMOVED because |
1508 | * we cannot be sure from other fields if it has |
1509 | * been initialised or not. |
1510 | */ |
1511 | int i; |
int active = 0, working = 0, failed = 0, spare = 0, nr_disks = 0;
1513 | |
1514 | rdev->sb_size = MD_SB_BYTES; |
1515 | |
1516 | sb = page_address(rdev->sb_page); |
1517 | |
1518 | memset(sb, 0, sizeof(*sb)); |
1519 | |
1520 | sb->md_magic = MD_SB_MAGIC; |
1521 | sb->major_version = mddev->major_version; |
1522 | sb->patch_version = mddev->patch_version; |
1523 | sb->gvalid_words = 0; /* ignored */ |
1524 | memcpy(&sb->set_uuid0, mddev->uuid+0, 4); |
1525 | memcpy(&sb->set_uuid1, mddev->uuid+4, 4); |
1526 | memcpy(&sb->set_uuid2, mddev->uuid+8, 4); |
memcpy(&sb->set_uuid3, mddev->uuid+12, 4);
1528 | |
1529 | sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); |
1530 | sb->level = mddev->level; |
1531 | sb->size = mddev->dev_sectors / 2; |
1532 | sb->raid_disks = mddev->raid_disks; |
1533 | sb->md_minor = mddev->md_minor; |
1534 | sb->not_persistent = 0; |
1535 | sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); |
1536 | sb->state = 0; |
1537 | sb->events_hi = (mddev->events>>32); |
1538 | sb->events_lo = (u32)mddev->events; |
1539 | |
1540 | if (mddev->reshape_position == MaxSector) |
1541 | sb->minor_version = 90; |
1542 | else { |
1543 | sb->minor_version = 91; |
1544 | sb->reshape_position = mddev->reshape_position; |
1545 | sb->new_level = mddev->new_level; |
1546 | sb->delta_disks = mddev->delta_disks; |
1547 | sb->new_layout = mddev->new_layout; |
1548 | sb->new_chunk = mddev->new_chunk_sectors << 9; |
1549 | } |
1550 | mddev->minor_version = sb->minor_version; |
if (mddev->in_sync) {
1553 | sb->recovery_cp = mddev->recovery_cp; |
1554 | sb->cp_events_hi = (mddev->events>>32); |
1555 | sb->cp_events_lo = (u32)mddev->events; |
1556 | if (mddev->recovery_cp == MaxSector) |
1557 | sb->state = (1<< MD_SB_CLEAN); |
1558 | } else |
1559 | sb->recovery_cp = 0; |
1560 | |
1561 | sb->layout = mddev->layout; |
1562 | sb->chunk_size = mddev->chunk_sectors << 9; |
1563 | |
1564 | if (mddev->bitmap && mddev->bitmap_info.file == NULL) |
1565 | sb->state |= (1<<MD_SB_BITMAP_PRESENT); |
1566 | |
1567 | sb->disks[0].state = (1<<MD_DISK_REMOVED); |
1568 | rdev_for_each(rdev2, mddev) { |
1569 | mdp_disk_t *d; |
1570 | int desc_nr; |
1571 | int is_active = test_bit(In_sync, &rdev2->flags); |
1572 | |
1573 | if (rdev2->raid_disk >= 0 && |
1574 | sb->minor_version >= 91) |
1575 | /* we have nowhere to store the recovery_offset, |
1576 | * but if it is not below the reshape_position, |
1577 | * we can piggy-back on that. |
1578 | */ |
1579 | is_active = 1; |
1580 | if (rdev2->raid_disk < 0 || |
1581 | test_bit(Faulty, &rdev2->flags)) |
1582 | is_active = 0; |
1583 | if (is_active) |
1584 | desc_nr = rdev2->raid_disk; |
1585 | else |
1586 | desc_nr = next_spare++; |
1587 | rdev2->desc_nr = desc_nr; |
1588 | d = &sb->disks[rdev2->desc_nr]; |
1589 | nr_disks++; |
1590 | d->number = rdev2->desc_nr; |
1591 | d->major = MAJOR(rdev2->bdev->bd_dev); |
1592 | d->minor = MINOR(rdev2->bdev->bd_dev); |
1593 | if (is_active) |
1594 | d->raid_disk = rdev2->raid_disk; |
1595 | else |
1596 | d->raid_disk = rdev2->desc_nr; /* compatibility */ |
1597 | if (test_bit(Faulty, &rdev2->flags)) |
1598 | d->state = (1<<MD_DISK_FAULTY); |
1599 | else if (is_active) { |
1600 | d->state = (1<<MD_DISK_ACTIVE); |
1601 | if (test_bit(In_sync, &rdev2->flags)) |
1602 | d->state |= (1<<MD_DISK_SYNC); |
1603 | active++; |
1604 | working++; |
1605 | } else { |
1606 | d->state = 0; |
1607 | spare++; |
1608 | working++; |
1609 | } |
1610 | if (test_bit(WriteMostly, &rdev2->flags)) |
1611 | d->state |= (1<<MD_DISK_WRITEMOSTLY); |
1612 | if (test_bit(FailFast, &rdev2->flags)) |
1613 | d->state |= (1<<MD_DISK_FAILFAST); |
1614 | } |
1615 | /* now set the "removed" and "faulty" bits on any missing devices */ |
for (i = 0; i < mddev->raid_disks; i++) {
1617 | mdp_disk_t *d = &sb->disks[i]; |
1618 | if (d->state == 0 && d->number == 0) { |
1619 | d->number = i; |
1620 | d->raid_disk = i; |
1621 | d->state = (1<<MD_DISK_REMOVED); |
1622 | d->state |= (1<<MD_DISK_FAULTY); |
1623 | failed++; |
1624 | } |
1625 | } |
1626 | sb->nr_disks = nr_disks; |
1627 | sb->active_disks = active; |
1628 | sb->working_disks = working; |
1629 | sb->failed_disks = failed; |
1630 | sb->spare_disks = spare; |
1631 | |
1632 | sb->this_disk = sb->disks[rdev->desc_nr]; |
1633 | sb->sb_csum = calc_sb_csum(sb); |
1634 | } |
1635 | |
1636 | /* |
1637 | * rdev_size_change for 0.90.0 |
1638 | */ |
1639 | static unsigned long long |
1640 | super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) |
1641 | { |
1642 | if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
1643 | return 0; /* component must fit device */ |
1644 | if (rdev->mddev->bitmap_info.offset) |
1645 | return 0; /* can't move bitmap */ |
1646 | rdev->sb_start = calc_dev_sboffset(rdev); |
1647 | if (!num_sectors || num_sectors > rdev->sb_start) |
1648 | num_sectors = rdev->sb_start; |
1649 | /* Limit to 4TB as metadata cannot record more than that. |
1650 | * 4TB == 2^32 KB, or 2*2^32 sectors. |
1651 | */ |
1652 | if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) |
1653 | num_sectors = (sector_t)(2ULL << 32) - 2; |
1654 | do { |
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
rdev->sb_page);
} while (md_super_wait(rdev->mddev) < 0);
1658 | return num_sectors; |
1659 | } |
1660 | |
1661 | static int |
1662 | super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) |
1663 | { |
1664 | /* non-zero offset changes not possible with v0.90 */ |
1665 | return new_offset == 0; |
1666 | } |
1667 | |
1668 | /* |
1669 | * version 1 superblock |
1670 | */ |
1671 | |
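/*
 * Checksum the 256-byte header plus the dev_roles table (2 bytes per
 * device slot): sum the buffer as little-endian 32-bit words, pick up a
 * trailing 16-bit word when max_dev is odd, then fold the 64-bit total
 * back into 32 bits.  sb_csum itself is zeroed for the computation and
 * restored before returning.
 */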
1672 | static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) |
1673 | { |
1674 | __le32 disk_csum; |
1675 | u32 csum; |
1676 | unsigned long long newcsum; |
1677 | int size = 256 + le32_to_cpu(sb->max_dev)*2; |
1678 | __le32 *isuper = (__le32*)sb; |
1679 | |
1680 | disk_csum = sb->sb_csum; |
1681 | sb->sb_csum = 0; |
1682 | newcsum = 0; |
1683 | for (; size >= 4; size -= 4) |
1684 | newcsum += le32_to_cpu(*isuper++); |
1685 | |
1686 | if (size == 2) |
1687 | newcsum += le16_to_cpu(*(__le16*) isuper); |
1688 | |
1689 | csum = (newcsum & 0xffffffff) + (newcsum >> 32); |
1690 | sb->sb_csum = disk_csum; |
1691 | return cpu_to_le32(csum); |
1692 | } |
1693 | |
1694 | static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) |
1695 | { |
1696 | struct mdp_superblock_1 *sb; |
1697 | int ret; |
1698 | sector_t sb_start; |
1699 | sector_t sectors; |
1700 | int bmask; |
1701 | bool spare_disk = true; |
1702 | |
1703 | /* |
1704 | * Calculate the position of the superblock in 512byte sectors. |
1705 | * It is always aligned to a 4K boundary and |
	 * depending on minor_version, it can be:
1707 | * 0: At least 8K, but less than 12K, from end of device |
1708 | * 1: At start of device |
1709 | * 2: 4K from start of device. |
1710 | */ |
1711 | switch(minor_version) { |
1712 | case 0: |
		sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
1714 | sb_start &= ~(sector_t)(4*2-1); |
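		/* e.g. a 2097153-sector device: 2097153 - 16 = 2097137,
		 * masked down to 2097136, which is 4K-aligned and lies
		 * between 8K and 12K from the end of the device.
		 */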
1715 | break; |
1716 | case 1: |
1717 | sb_start = 0; |
1718 | break; |
1719 | case 2: |
1720 | sb_start = 8; |
1721 | break; |
1722 | default: |
1723 | return -EINVAL; |
1724 | } |
1725 | rdev->sb_start = sb_start; |
1726 | |
1727 | /* superblock is rarely larger than 1K, but it can be larger, |
1728 | * and it is safe to read 4k, so we do that |
1729 | */ |
	ret = read_disk_sb(rdev, 4096);
1731 | if (ret) return ret; |
1732 | |
1733 | sb = page_address(rdev->sb_page); |
1734 | |
1735 | if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || |
1736 | sb->major_version != cpu_to_le32(1) || |
1737 | le32_to_cpu(sb->max_dev) > (4096-256)/2 || |
1738 | le64_to_cpu(sb->super_offset) != rdev->sb_start || |
1739 | (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) |
1740 | return -EINVAL; |
1741 | |
1742 | if (calc_sb_1_csum(sb) != sb->sb_csum) { |
1743 | pr_warn("md: invalid superblock checksum on %pg\n" , |
1744 | rdev->bdev); |
1745 | return -EINVAL; |
1746 | } |
1747 | if (le64_to_cpu(sb->data_size) < 10) { |
1748 | pr_warn("md: data_size too small on %pg\n" , |
1749 | rdev->bdev); |
1750 | return -EINVAL; |
1751 | } |
1752 | if (sb->pad0 || |
1753 | sb->pad3[0] || |
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1755 | /* Some padding is non-zero, might be a new feature */ |
1756 | return -EINVAL; |
1757 | |
1758 | rdev->preferred_minor = 0xffff; |
1759 | rdev->data_offset = le64_to_cpu(sb->data_offset); |
1760 | rdev->new_data_offset = rdev->data_offset; |
1761 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && |
1762 | (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) |
1763 | rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); |
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1765 | |
1766 | rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; |
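	/* Round sb_size up to a whole multiple of the device's logical
	 * block size (bmask is that block size minus one).
	 */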
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1768 | if (rdev->sb_size & bmask) |
1769 | rdev->sb_size = (rdev->sb_size | bmask) + 1; |
1770 | |
1771 | if (minor_version |
1772 | && rdev->data_offset < sb_start + (rdev->sb_size/512)) |
1773 | return -EINVAL; |
1774 | if (minor_version |
1775 | && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) |
1776 | return -EINVAL; |
1777 | |
1778 | rdev->desc_nr = le32_to_cpu(sb->dev_number); |
1779 | |
1780 | if (!rdev->bb_page) { |
1781 | rdev->bb_page = alloc_page(GFP_KERNEL); |
1782 | if (!rdev->bb_page) |
1783 | return -ENOMEM; |
1784 | } |
1785 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && |
1786 | rdev->badblocks.count == 0) { |
1787 | /* need to load the bad block list. |
1788 | * Currently we limit it to one page. |
1789 | */ |
1790 | s32 offset; |
1791 | sector_t bb_sector; |
1792 | __le64 *bbp; |
1793 | int i; |
1794 | int sectors = le16_to_cpu(sb->bblog_size); |
1795 | if (sectors > (PAGE_SIZE / 512)) |
1796 | return -EINVAL; |
1797 | offset = le32_to_cpu(sb->bblog_offset); |
1798 | if (offset == 0) |
1799 | return -EINVAL; |
1800 | bb_sector = (long long)offset; |
1801 | if (!sync_page_io(rdev, bb_sector, sectors << 9, |
1802 | rdev->bb_page, REQ_OP_READ, true)) |
1803 | return -EIO; |
1804 | bbp = (__le64 *)page_address(rdev->bb_page); |
1805 | rdev->badblocks.shift = sb->bblog_shift; |
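		/* Each 64-bit entry packs one range: the start sector in the
		 * high 54 bits and the length in the low 10 bits, both scaled
		 * down by bblog_shift; an all-ones entry ends the list.
		 */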
1806 | for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { |
1807 | u64 bb = le64_to_cpu(*bbp); |
1808 | int count = bb & (0x3ff); |
1809 | u64 sector = bb >> 10; |
1810 | sector <<= sb->bblog_shift; |
1811 | count <<= sb->bblog_shift; |
1812 | if (bb + 1 == 0) |
1813 | break; |
			if (badblocks_set(&rdev->badblocks, sector, count, 1))
1815 | return -EINVAL; |
1816 | } |
1817 | } else if (sb->bblog_offset != 0) |
1818 | rdev->badblocks.shift = 0; |
1819 | |
1820 | if ((le32_to_cpu(sb->feature_map) & |
1821 | (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { |
1822 | rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); |
1823 | rdev->ppl.size = le16_to_cpu(sb->ppl.size); |
1824 | rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; |
1825 | } |
1826 | |
1827 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && |
1828 | sb->level != 0) |
1829 | return -EINVAL; |
1830 | |
1831 | /* not spare disk */ |
1832 | if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && |
1833 | (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || |
1834 | le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) |
1835 | spare_disk = false; |
1836 | |
1837 | if (!refdev) { |
1838 | if (!spare_disk) |
1839 | ret = 1; |
1840 | else |
1841 | ret = 0; |
1842 | } else { |
1843 | __u64 ev1, ev2; |
1844 | struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); |
1845 | |
		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1847 | sb->level != refsb->level || |
1848 | sb->layout != refsb->layout || |
1849 | sb->chunksize != refsb->chunksize) { |
1850 | pr_warn("md: %pg has strangely different superblock to %pg\n" , |
1851 | rdev->bdev, |
1852 | refdev->bdev); |
1853 | return -EINVAL; |
1854 | } |
1855 | ev1 = le64_to_cpu(sb->events); |
1856 | ev2 = le64_to_cpu(refsb->events); |
1857 | |
1858 | if (!spare_disk && ev1 > ev2) |
1859 | ret = 1; |
1860 | else |
1861 | ret = 0; |
1862 | } |
1863 | if (minor_version) |
		sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
1865 | else |
1866 | sectors = rdev->sb_start; |
1867 | if (sectors < le64_to_cpu(sb->data_size)) |
1868 | return -EINVAL; |
1869 | rdev->sectors = le64_to_cpu(sb->data_size); |
1870 | return ret; |
1871 | } |
1872 | |
1873 | static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) |
1874 | { |
1875 | struct mdp_superblock_1 *sb = page_address(rdev->sb_page); |
1876 | __u64 ev1 = le64_to_cpu(sb->events); |
1877 | int role; |
1878 | |
1879 | rdev->raid_disk = -1; |
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);
1884 | |
1885 | if (mddev->raid_disks == 0) { |
1886 | mddev->major_version = 1; |
1887 | mddev->patch_version = 0; |
1888 | mddev->external = 0; |
1889 | mddev->chunk_sectors = le32_to_cpu(sb->chunksize); |
1890 | mddev->ctime = le64_to_cpu(sb->ctime); |
1891 | mddev->utime = le64_to_cpu(sb->utime); |
1892 | mddev->level = le32_to_cpu(sb->level); |
1893 | mddev->clevel[0] = 0; |
1894 | mddev->layout = le32_to_cpu(sb->layout); |
1895 | mddev->raid_disks = le32_to_cpu(sb->raid_disks); |
1896 | mddev->dev_sectors = le64_to_cpu(sb->size); |
1897 | mddev->events = ev1; |
1898 | mddev->bitmap_info.offset = 0; |
1899 | mddev->bitmap_info.space = 0; |
1900 | /* Default location for bitmap is 1K after superblock |
1901 | * using 3K - total of 4K |
1902 | */ |
1903 | mddev->bitmap_info.default_offset = 1024 >> 9; |
1904 | mddev->bitmap_info.default_space = (4096-1024) >> 9; |
1905 | mddev->reshape_backwards = 0; |
1906 | |
1907 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); |
1908 | memcpy(mddev->uuid, sb->set_uuid, 16); |
1909 | |
1910 | mddev->max_disks = (4096-256)/2; |
1911 | |
1912 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && |
1913 | mddev->bitmap_info.file == NULL) { |
1914 | mddev->bitmap_info.offset = |
1915 | (__s32)le32_to_cpu(sb->bitmap_offset); |
1916 | /* Metadata doesn't record how much space is available. |
1917 | * For 1.0, we assume we can use up to the superblock |
1918 | * if before, else to 4K beyond superblock. |
1919 | * For others, assume no change is possible. |
1920 | */ |
1921 | if (mddev->minor_version > 0) |
1922 | mddev->bitmap_info.space = 0; |
1923 | else if (mddev->bitmap_info.offset > 0) |
1924 | mddev->bitmap_info.space = |
1925 | 8 - mddev->bitmap_info.offset; |
1926 | else |
1927 | mddev->bitmap_info.space = |
1928 | -mddev->bitmap_info.offset; |
1929 | } |
1930 | |
1931 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { |
1932 | mddev->reshape_position = le64_to_cpu(sb->reshape_position); |
1933 | mddev->delta_disks = le32_to_cpu(sb->delta_disks); |
1934 | mddev->new_level = le32_to_cpu(sb->new_level); |
1935 | mddev->new_layout = le32_to_cpu(sb->new_layout); |
1936 | mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); |
1937 | if (mddev->delta_disks < 0 || |
1938 | (mddev->delta_disks == 0 && |
1939 | (le32_to_cpu(sb->feature_map) |
1940 | & MD_FEATURE_RESHAPE_BACKWARDS))) |
1941 | mddev->reshape_backwards = 1; |
1942 | } else { |
1943 | mddev->reshape_position = MaxSector; |
1944 | mddev->delta_disks = 0; |
1945 | mddev->new_level = mddev->level; |
1946 | mddev->new_layout = mddev->layout; |
1947 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
1948 | } |
1949 | |
1950 | if (mddev->level == 0 && |
1951 | !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) |
1952 | mddev->layout = -1; |
1953 | |
1954 | if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) |
			set_bit(MD_HAS_JOURNAL, &mddev->flags);
1956 | |
1957 | if (le32_to_cpu(sb->feature_map) & |
1958 | (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { |
1959 | if (le32_to_cpu(sb->feature_map) & |
1960 | (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) |
1961 | return -EINVAL; |
1962 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && |
1963 | (le32_to_cpu(sb->feature_map) & |
1964 | MD_FEATURE_MULTIPLE_PPLS)) |
1965 | return -EINVAL; |
			set_bit(MD_HAS_PPL, &mddev->flags);
1967 | } |
1968 | } else if (mddev->pers == NULL) { |
		/* Insist on a good event counter while assembling, except for
1970 | * spares (which don't need an event count). |
1971 | * Similar to mdadm, we allow event counter difference of 1 |
1972 | * from the freshest device. |
1973 | */ |
1974 | if (rdev->desc_nr >= 0 && |
1975 | rdev->desc_nr < le32_to_cpu(sb->max_dev) && |
1976 | (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || |
1977 | le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) |
1978 | if (ev1 + 1 < mddev->events) |
1979 | return -EINVAL; |
1980 | } else if (mddev->bitmap) { |
1981 | /* If adding to array with a bitmap, then we can accept an |
1982 | * older device, but not too old. |
1983 | */ |
1984 | if (ev1 < mddev->bitmap->events_cleared) |
1985 | return 0; |
1986 | if (ev1 < mddev->events) |
			set_bit(Bitmap_sync, &rdev->flags);
1988 | } else { |
1989 | if (ev1 < mddev->events) |
1990 | /* just a hot-add of a new device, leave raid_disk at -1 */ |
1991 | return 0; |
1992 | } |
1993 | |
1994 | if (rdev->desc_nr < 0 || |
1995 | rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { |
1996 | role = MD_DISK_ROLE_SPARE; |
1997 | rdev->desc_nr = -1; |
1998 | } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { |
1999 | /* |
2000 | * If we are assembling, and our event counter is smaller than the |
2001 | * highest event counter, we cannot trust our superblock about the role. |
2002 | * It could happen that our rdev was marked as Faulty, and all other |
2003 | * superblocks were updated with +1 event counter. |
2004 | * Then, before the next superblock update, which typically happens when |
2005 | * remove_and_add_spares() removes the device from the array, there was |
2006 | * a crash or reboot. |
2007 | * If we allow current rdev without consulting the freshest superblock, |
2008 | * we could cause data corruption. |
2009 | * Note that in this case our event counter is smaller by 1 than the |
		 * highest; otherwise, this rdev would not be allowed into the array;
2011 | * both kernel and mdadm allow event counter difference of 1. |
2012 | */ |
2013 | struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); |
2014 | u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); |
2015 | |
2016 | if (rdev->desc_nr >= freshest_max_dev) { |
2017 | /* this is unexpected, better not proceed */ |
2018 | pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n" , |
2019 | mdname(mddev), rdev->bdev, rdev->desc_nr, |
2020 | freshest->bdev, freshest_max_dev); |
2021 | return -EUCLEAN; |
2022 | } |
2023 | |
2024 | role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); |
2025 | pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n" , |
2026 | mdname(mddev), rdev->bdev, role, role, freshest->bdev); |
2027 | } else { |
2028 | role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); |
2029 | } |
2030 | switch (role) { |
2031 | case MD_DISK_ROLE_SPARE: /* spare */ |
2032 | break; |
2033 | case MD_DISK_ROLE_FAULTY: /* faulty */ |
		set_bit(Faulty, &rdev->flags);
2035 | break; |
2036 | case MD_DISK_ROLE_JOURNAL: /* journal device */ |
2037 | if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { |
2038 | /* journal device without journal feature */ |
2039 | pr_warn("md: journal device provided without journal feature, ignoring the device\n" ); |
2040 | return -EINVAL; |
2041 | } |
		set_bit(Journal, &rdev->flags);
2043 | rdev->journal_tail = le64_to_cpu(sb->journal_tail); |
2044 | rdev->raid_disk = 0; |
2045 | break; |
2046 | default: |
2047 | rdev->saved_raid_disk = role; |
2048 | if ((le32_to_cpu(sb->feature_map) & |
2049 | MD_FEATURE_RECOVERY_OFFSET)) { |
2050 | rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); |
2051 | if (!(le32_to_cpu(sb->feature_map) & |
2052 | MD_FEATURE_RECOVERY_BITMAP)) |
2053 | rdev->saved_raid_disk = -1; |
2054 | } else { |
2055 | /* |
2056 | * If the array is FROZEN, then the device can't |
2057 | * be in_sync with rest of array. |
2058 | */ |
2059 | if (!test_bit(MD_RECOVERY_FROZEN, |
2060 | &mddev->recovery)) |
				set_bit(In_sync, &rdev->flags);
2062 | } |
2063 | rdev->raid_disk = role; |
2064 | break; |
2065 | } |
	if (sb->devflags & WriteMostly1)
		set_bit(WriteMostly, &rdev->flags);
	if (sb->devflags & FailFast1)
		set_bit(FailFast, &rdev->flags);
	if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
		set_bit(Replacement, &rdev->flags);
2072 | |
2073 | return 0; |
2074 | } |
2075 | |
2076 | static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) |
2077 | { |
2078 | struct mdp_superblock_1 *sb; |
2079 | struct md_rdev *rdev2; |
2080 | int max_dev, i; |
2081 | /* make rdev->sb match mddev and rdev data. */ |
2082 | |
2083 | sb = page_address(rdev->sb_page); |
2084 | |
2085 | sb->feature_map = 0; |
2086 | sb->pad0 = 0; |
2087 | sb->recovery_offset = cpu_to_le64(0); |
2088 | memset(sb->pad3, 0, sizeof(sb->pad3)); |
2089 | |
2090 | sb->utime = cpu_to_le64((__u64)mddev->utime); |
2091 | sb->events = cpu_to_le64(mddev->events); |
2092 | if (mddev->in_sync) |
2093 | sb->resync_offset = cpu_to_le64(mddev->recovery_cp); |
2094 | else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) |
2095 | sb->resync_offset = cpu_to_le64(MaxSector); |
2096 | else |
2097 | sb->resync_offset = cpu_to_le64(0); |
2098 | |
2099 | sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); |
2100 | |
2101 | sb->raid_disks = cpu_to_le32(mddev->raid_disks); |
2102 | sb->size = cpu_to_le64(mddev->dev_sectors); |
2103 | sb->chunksize = cpu_to_le32(mddev->chunk_sectors); |
2104 | sb->level = cpu_to_le32(mddev->level); |
2105 | sb->layout = cpu_to_le32(mddev->layout); |
2106 | if (test_bit(FailFast, &rdev->flags)) |
2107 | sb->devflags |= FailFast1; |
2108 | else |
2109 | sb->devflags &= ~FailFast1; |
2110 | |
2111 | if (test_bit(WriteMostly, &rdev->flags)) |
2112 | sb->devflags |= WriteMostly1; |
2113 | else |
2114 | sb->devflags &= ~WriteMostly1; |
2115 | sb->data_offset = cpu_to_le64(rdev->data_offset); |
2116 | sb->data_size = cpu_to_le64(rdev->sectors); |
2117 | |
2118 | if (mddev->bitmap && mddev->bitmap_info.file == NULL) { |
2119 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); |
2120 | sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); |
2121 | } |
2122 | |
2123 | if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && |
2124 | !test_bit(In_sync, &rdev->flags)) { |
2125 | sb->feature_map |= |
2126 | cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); |
2127 | sb->recovery_offset = |
2128 | cpu_to_le64(rdev->recovery_offset); |
2129 | if (rdev->saved_raid_disk >= 0 && mddev->bitmap) |
2130 | sb->feature_map |= |
2131 | cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); |
2132 | } |
2133 | /* Note: recovery_offset and journal_tail share space */ |
2134 | if (test_bit(Journal, &rdev->flags)) |
2135 | sb->journal_tail = cpu_to_le64(rdev->journal_tail); |
2136 | if (test_bit(Replacement, &rdev->flags)) |
2137 | sb->feature_map |= |
2138 | cpu_to_le32(MD_FEATURE_REPLACEMENT); |
2139 | |
2140 | if (mddev->reshape_position != MaxSector) { |
2141 | sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); |
2142 | sb->reshape_position = cpu_to_le64(mddev->reshape_position); |
2143 | sb->new_layout = cpu_to_le32(mddev->new_layout); |
2144 | sb->delta_disks = cpu_to_le32(mddev->delta_disks); |
2145 | sb->new_level = cpu_to_le32(mddev->new_level); |
2146 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); |
2147 | if (mddev->delta_disks == 0 && |
2148 | mddev->reshape_backwards) |
2149 | sb->feature_map |
2150 | |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); |
2151 | if (rdev->new_data_offset != rdev->data_offset) { |
2152 | sb->feature_map |
2153 | |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); |
2154 | sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset |
2155 | - rdev->data_offset)); |
2156 | } |
2157 | } |
2158 | |
2159 | if (mddev_is_clustered(mddev)) |
2160 | sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); |
2161 | |
2162 | if (rdev->badblocks.count == 0) |
2163 | /* Nothing to do for bad blocks*/ ; |
2164 | else if (sb->bblog_offset == 0) |
2165 | /* Cannot record bad blocks on this device */ |
2166 | md_error(mddev, rdev); |
2167 | else { |
2168 | struct badblocks *bb = &rdev->badblocks; |
2169 | __le64 *bbp = (__le64 *)page_address(rdev->bb_page); |
2170 | u64 *p = bb->page; |
2171 | sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); |
2172 | if (bb->changed) { |
2173 | unsigned seq; |
2174 | |
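			/* Snapshot the table under a seqlock read section
			 * and re-encode it in the on-disk format (sector in
			 * the high 54 bits, length in the low 10 bits);
			 * retry if a writer raced with us.
			 */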
2175 | retry: |
			seq = read_seqbegin(&bb->lock);
2177 | |
2178 | memset(bbp, 0xff, PAGE_SIZE); |
2179 | |
2180 | for (i = 0 ; i < bb->count ; i++) { |
2181 | u64 internal_bb = p[i]; |
2182 | u64 store_bb = ((BB_OFFSET(internal_bb) << 10) |
2183 | | BB_LEN(internal_bb)); |
2184 | bbp[i] = cpu_to_le64(store_bb); |
2185 | } |
2186 | bb->changed = 0; |
			if (read_seqretry(&bb->lock, seq))
2188 | goto retry; |
2189 | |
2190 | bb->sector = (rdev->sb_start + |
2191 | (int)le32_to_cpu(sb->bblog_offset)); |
2192 | bb->size = le16_to_cpu(sb->bblog_size); |
2193 | } |
2194 | } |
2195 | |
2196 | max_dev = 0; |
2197 | rdev_for_each(rdev2, mddev) |
2198 | if (rdev2->desc_nr+1 > max_dev) |
2199 | max_dev = rdev2->desc_nr+1; |
2200 | |
2201 | if (max_dev > le32_to_cpu(sb->max_dev)) { |
2202 | int bmask; |
2203 | sb->max_dev = cpu_to_le32(max_dev); |
2204 | rdev->sb_size = max_dev * 2 + 256; |
		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
2206 | if (rdev->sb_size & bmask) |
2207 | rdev->sb_size = (rdev->sb_size | bmask) + 1; |
2208 | } else |
2209 | max_dev = le32_to_cpu(sb->max_dev); |
2210 | |
2211 | for (i=0; i<max_dev;i++) |
2212 | sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); |
2213 | |
2214 | if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) |
2215 | sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); |
2216 | |
2217 | if (test_bit(MD_HAS_PPL, &mddev->flags)) { |
2218 | if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) |
2219 | sb->feature_map |= |
2220 | cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); |
2221 | else |
2222 | sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); |
2223 | sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); |
2224 | sb->ppl.size = cpu_to_le16(rdev->ppl.size); |
2225 | } |
2226 | |
2227 | rdev_for_each(rdev2, mddev) { |
2228 | i = rdev2->desc_nr; |
2229 | if (test_bit(Faulty, &rdev2->flags)) |
2230 | sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); |
2231 | else if (test_bit(In_sync, &rdev2->flags)) |
2232 | sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); |
2233 | else if (test_bit(Journal, &rdev2->flags)) |
2234 | sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); |
2235 | else if (rdev2->raid_disk >= 0) |
2236 | sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); |
2237 | else |
2238 | sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); |
2239 | } |
2240 | |
2241 | sb->sb_csum = calc_sb_1_csum(sb); |
2242 | } |
2243 | |
2244 | static sector_t super_1_choose_bm_space(sector_t dev_size) |
2245 | { |
2246 | sector_t bm_space; |
2247 | |
2248 | /* if the device is bigger than 8Gig, save 64k for bitmap |
2249 | * usage, if bigger than 200Gig, save 128k |
2250 | */ |
2251 | if (dev_size < 64*2) |
2252 | bm_space = 0; |
2253 | else if (dev_size - 64*2 >= 200*1024*1024*2) |
2254 | bm_space = 128*2; |
2255 | else if (dev_size - 4*2 > 8*1024*1024*2) |
2256 | bm_space = 64*2; |
2257 | else |
2258 | bm_space = 4*2; |
2259 | return bm_space; |
2260 | } |
2261 | |
2262 | static unsigned long long |
2263 | super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) |
2264 | { |
2265 | struct mdp_superblock_1 *sb; |
2266 | sector_t max_sectors; |
2267 | if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
2268 | return 0; /* component must fit device */ |
2269 | if (rdev->data_offset != rdev->new_data_offset) |
2270 | return 0; /* too confusing */ |
2271 | if (rdev->sb_start < rdev->data_offset) { |
2272 | /* minor versions 1 and 2; superblock before data */ |
		max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
2274 | if (!num_sectors || num_sectors > max_sectors) |
2275 | num_sectors = max_sectors; |
2276 | } else if (rdev->mddev->bitmap_info.offset) { |
2277 | /* minor version 0 with bitmap we can't move */ |
2278 | return 0; |
2279 | } else { |
2280 | /* minor version 0; superblock after data */ |
2281 | sector_t sb_start, bm_space; |
		sector_t dev_size = bdev_nr_sectors(rdev->bdev);
2283 | |
2284 | /* 8K is for superblock */ |
2285 | sb_start = dev_size - 8*2; |
2286 | sb_start &= ~(sector_t)(4*2 - 1); |
2287 | |
2288 | bm_space = super_1_choose_bm_space(dev_size); |
2289 | |
		/* The space that can be used to store data needs to
		 * exclude the superblock, the bitmap space and the
		 * bad block space (4K).
		 */
2293 | max_sectors = sb_start - bm_space - 4*2; |
2294 | |
2295 | if (!num_sectors || num_sectors > max_sectors) |
2296 | num_sectors = max_sectors; |
2297 | rdev->sb_start = sb_start; |
2298 | } |
2299 | sb = page_address(rdev->sb_page); |
2300 | sb->data_size = cpu_to_le64(num_sectors); |
2301 | sb->super_offset = cpu_to_le64(rdev->sb_start); |
2302 | sb->sb_csum = calc_sb_1_csum(sb); |
2303 | do { |
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
			       rdev->sb_page);
	} while (md_super_wait(rdev->mddev) < 0);
2307 | return num_sectors; |
2308 | |
2309 | } |
2310 | |
2311 | static int |
2312 | super_1_allow_new_offset(struct md_rdev *rdev, |
2313 | unsigned long long new_offset) |
2314 | { |
2315 | /* All necessary checks on new >= old have been done */ |
2316 | struct bitmap *bitmap; |
2317 | if (new_offset >= rdev->data_offset) |
2318 | return 1; |
2319 | |
2320 | /* with 1.0 metadata, there is no metadata to tread on |
2321 | * so we can always move back */ |
2322 | if (rdev->mddev->minor_version == 0) |
2323 | return 1; |
2324 | |
2325 | /* otherwise we must be sure not to step on |
2326 | * any metadata, so stay: |
2327 | * 36K beyond start of superblock |
2328 | * beyond end of badblocks |
2329 | * beyond write-intent bitmap |
2330 | */ |
2331 | if (rdev->sb_start + (32+4)*2 > new_offset) |
2332 | return 0; |
2333 | bitmap = rdev->mddev->bitmap; |
2334 | if (bitmap && !rdev->mddev->bitmap_info.file && |
2335 | rdev->sb_start + rdev->mddev->bitmap_info.offset + |
2336 | bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) |
2337 | return 0; |
2338 | if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) |
2339 | return 0; |
2340 | |
2341 | return 1; |
2342 | } |
2343 | |
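/*
 * Table of on-disk metadata handlers, indexed by mddev->major_version
 * (see sync_super() below).
 */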
2344 | static struct super_type super_types[] = { |
2345 | [0] = { |
2346 | .name = "0.90.0" , |
2347 | .owner = THIS_MODULE, |
2348 | .load_super = super_90_load, |
2349 | .validate_super = super_90_validate, |
2350 | .sync_super = super_90_sync, |
2351 | .rdev_size_change = super_90_rdev_size_change, |
2352 | .allow_new_offset = super_90_allow_new_offset, |
2353 | }, |
2354 | [1] = { |
2355 | .name = "md-1" , |
2356 | .owner = THIS_MODULE, |
2357 | .load_super = super_1_load, |
2358 | .validate_super = super_1_validate, |
2359 | .sync_super = super_1_sync, |
2360 | .rdev_size_change = super_1_rdev_size_change, |
2361 | .allow_new_offset = super_1_allow_new_offset, |
2362 | }, |
2363 | }; |
2364 | |
2365 | static void sync_super(struct mddev *mddev, struct md_rdev *rdev) |
2366 | { |
2367 | if (mddev->sync_super) { |
2368 | mddev->sync_super(mddev, rdev); |
2369 | return; |
2370 | } |
2371 | |
2372 | BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); |
2373 | |
2374 | super_types[mddev->major_version].sync_super(mddev, rdev); |
2375 | } |
2376 | |
2377 | static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) |
2378 | { |
2379 | struct md_rdev *rdev, *rdev2; |
2380 | |
2381 | rcu_read_lock(); |
2382 | rdev_for_each_rcu(rdev, mddev1) { |
2383 | if (test_bit(Faulty, &rdev->flags) || |
2384 | test_bit(Journal, &rdev->flags) || |
2385 | rdev->raid_disk == -1) |
2386 | continue; |
2387 | rdev_for_each_rcu(rdev2, mddev2) { |
2388 | if (test_bit(Faulty, &rdev2->flags) || |
2389 | test_bit(Journal, &rdev2->flags) || |
2390 | rdev2->raid_disk == -1) |
2391 | continue; |
2392 | if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { |
2393 | rcu_read_unlock(); |
2394 | return 1; |
2395 | } |
2396 | } |
2397 | } |
2398 | rcu_read_unlock(); |
2399 | return 0; |
2400 | } |
2401 | |
2402 | static LIST_HEAD(pending_raid_disks); |
2403 | |
2404 | /* |
2405 | * Try to register data integrity profile for an mddev |
2406 | * |
2407 | * This is called when an array is started and after a disk has been kicked |
2408 | * from the array. It only succeeds if all working and active component devices |
2409 | * are integrity capable with matching profiles. |
2410 | */ |
2411 | int md_integrity_register(struct mddev *mddev) |
2412 | { |
2413 | struct md_rdev *rdev, *reference = NULL; |
2414 | |
	if (list_empty(&mddev->disks))
2416 | return 0; /* nothing to do */ |
	if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk))
2418 | return 0; /* shouldn't register, or already is */ |
2419 | rdev_for_each(rdev, mddev) { |
2420 | /* skip spares and non-functional disks */ |
2421 | if (test_bit(Faulty, &rdev->flags)) |
2422 | continue; |
2423 | if (rdev->raid_disk < 0) |
2424 | continue; |
2425 | if (!reference) { |
2426 | /* Use the first rdev as the reference */ |
2427 | reference = rdev; |
2428 | continue; |
2429 | } |
2430 | /* does this rdev's profile match the reference profile? */ |
2431 | if (blk_integrity_compare(reference->bdev->bd_disk, |
2432 | rdev->bdev->bd_disk) < 0) |
2433 | return -EINVAL; |
2434 | } |
	if (!reference || !bdev_get_integrity(reference->bdev))
2436 | return 0; |
2437 | /* |
2438 | * All component devices are integrity capable and have matching |
2439 | * profiles, register the common profile for the md device. |
2440 | */ |
2441 | blk_integrity_register(mddev->gendisk, |
			       bdev_get_integrity(reference->bdev));
2443 | |
2444 | pr_debug("md: data integrity enabled on %s\n" , mdname(mddev)); |
2445 | if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || |
2446 | (mddev->level != 1 && mddev->level != 10 && |
2447 | bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { |
2448 | /* |
		 * No need to handle the failure of bioset_integrity_create
		 * here: this function is called from md_run() -> pers->run(),
		 * and on failure md_run() calls bioset_exit(), which does the
		 * bioset_integrity_free() cleanup.
		 */
		pr_err("md: failed to create integrity pool for %s\n",
		       mdname(mddev));
2456 | return -EINVAL; |
2457 | } |
2458 | return 0; |
2459 | } |
2460 | EXPORT_SYMBOL(md_integrity_register); |
2461 | |
2462 | /* |
2463 | * Attempt to add an rdev, but only if it is consistent with the current |
2464 | * integrity profile |
2465 | */ |
2466 | int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) |
2467 | { |
2468 | struct blk_integrity *bi_mddev; |
2469 | |
2470 | if (mddev_is_dm(mddev)) |
2471 | return 0; |
2472 | |
	bi_mddev = blk_get_integrity(mddev->gendisk);
2474 | |
2475 | if (!bi_mddev) /* nothing to do */ |
2476 | return 0; |
2477 | |
2478 | if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { |
2479 | pr_err("%s: incompatible integrity profile for %pg\n" , |
2480 | mdname(mddev), rdev->bdev); |
2481 | return -ENXIO; |
2482 | } |
2483 | |
2484 | return 0; |
2485 | } |
2486 | EXPORT_SYMBOL(md_integrity_add_rdev); |
2487 | |
2488 | static bool rdev_read_only(struct md_rdev *rdev) |
2489 | { |
	return bdev_read_only(rdev->bdev) ||
	       (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev));
2492 | } |
2493 | |
2494 | static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) |
2495 | { |
2496 | char b[BDEVNAME_SIZE]; |
2497 | int err; |
2498 | |
2499 | /* prevent duplicates */ |
	if (find_rdev(mddev, rdev->bdev->bd_dev))
2501 | return -EEXIST; |
2502 | |
2503 | if (rdev_read_only(rdev) && mddev->pers) |
2504 | return -EROFS; |
2505 | |
2506 | /* make sure rdev->sectors exceeds mddev->dev_sectors */ |
2507 | if (!test_bit(Journal, &rdev->flags) && |
2508 | rdev->sectors && |
2509 | (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { |
2510 | if (mddev->pers) { |
2511 | /* Cannot change size, so fail |
2512 | * If mddev->level <= 0, then we don't care |
2513 | * about aligning sizes (e.g. linear) |
2514 | */ |
2515 | if (mddev->level > 0) |
2516 | return -ENOSPC; |
2517 | } else |
2518 | mddev->dev_sectors = rdev->sectors; |
2519 | } |
2520 | |
2521 | /* Verify rdev->desc_nr is unique. |
2522 | * If it is -1, assign a free number, else |
2523 | * check number is not in use |
2524 | */ |
2525 | rcu_read_lock(); |
2526 | if (rdev->desc_nr < 0) { |
2527 | int choice = 0; |
2528 | if (mddev->pers) |
2529 | choice = mddev->raid_disks; |
2530 | while (md_find_rdev_nr_rcu(mddev, choice)) |
2531 | choice++; |
2532 | rdev->desc_nr = choice; |
2533 | } else { |
2534 | if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { |
2535 | rcu_read_unlock(); |
2536 | return -EBUSY; |
2537 | } |
2538 | } |
2539 | rcu_read_unlock(); |
2540 | if (!test_bit(Journal, &rdev->flags) && |
2541 | mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { |
2542 | pr_warn("md: %s: array is limited to %d devices\n" , |
2543 | mdname(mddev), mddev->max_disks); |
2544 | return -EBUSY; |
2545 | } |
	snprintf(b, sizeof(b), "%pg", rdev->bdev);
	strreplace(b, '/', '!');
2548 | |
2549 | rdev->mddev = mddev; |
2550 | pr_debug("md: bind<%s>\n" , b); |
2551 | |
2552 | if (mddev->raid_disks) |
2553 | mddev_create_serial_pool(mddev, rdev); |
2554 | |
	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2556 | goto fail; |
2557 | |
2558 | /* failure here is OK */ |
	err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block");
	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
	rdev->sysfs_unack_badblocks =
		sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
	rdev->sysfs_badblocks =
		sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
2565 | |
	list_add_rcu(&rdev->same_set, &mddev->disks);
	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2568 | |
2569 | /* May as well allow recovery to be retried once */ |
2570 | mddev->recovery_disabled++; |
2571 | |
2572 | return 0; |
2573 | |
2574 | fail: |
2575 | pr_warn("md: failed to register dev-%s for %s\n" , |
2576 | b, mdname(mddev)); |
2577 | mddev_destroy_serial_pool(mddev, rdev); |
2578 | return err; |
2579 | } |
2580 | |
2581 | void md_autodetect_dev(dev_t dev); |
2582 | |
2583 | /* just for claiming the bdev */ |
2584 | static struct md_rdev claim_rdev; |
2585 | |
2586 | static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) |
2587 | { |
2588 | pr_debug("md: export_rdev(%pg)\n" , rdev->bdev); |
2589 | md_rdev_clear(rdev); |
2590 | #ifndef MODULE |
2591 | if (test_bit(AutoDetected, &rdev->flags)) |
		md_autodetect_dev(rdev->bdev->bd_dev);
2593 | #endif |
2594 | fput(rdev->bdev_file); |
2595 | rdev->bdev = NULL; |
	kobject_put(&rdev->kobj);
2597 | } |
2598 | |
2599 | static void md_kick_rdev_from_array(struct md_rdev *rdev) |
2600 | { |
2601 | struct mddev *mddev = rdev->mddev; |
2602 | |
	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
	list_del_rcu(&rdev->same_set);
	pr_debug("md: unbind<%pg>\n", rdev->bdev);
	mddev_destroy_serial_pool(rdev->mddev, rdev);
	WRITE_ONCE(rdev->mddev, NULL);
	sysfs_remove_link(&rdev->kobj, "block");
	sysfs_put(rdev->sysfs_state);
	sysfs_put(rdev->sysfs_unack_badblocks);
	sysfs_put(rdev->sysfs_badblocks);
2612 | rdev->sysfs_state = NULL; |
2613 | rdev->sysfs_unack_badblocks = NULL; |
2614 | rdev->sysfs_badblocks = NULL; |
2615 | rdev->badblocks.count = 0; |
2616 | |
2617 | synchronize_rcu(); |
2618 | |
2619 | /* |
2620 | * kobject_del() will wait for all in progress writers to be done, where |
2621 | * reconfig_mutex is held, hence it can't be called under |
2622 | * reconfig_mutex and it's delayed to mddev_unlock(). |
2623 | */ |
	list_add(&rdev->same_set, &mddev->deleting);
2625 | } |
2626 | |
2627 | static void export_array(struct mddev *mddev) |
2628 | { |
2629 | struct md_rdev *rdev; |
2630 | |
	while (!list_empty(&mddev->disks)) {
2632 | rdev = list_first_entry(&mddev->disks, struct md_rdev, |
2633 | same_set); |
2634 | md_kick_rdev_from_array(rdev); |
2635 | } |
2636 | mddev->raid_disks = 0; |
2637 | mddev->major_version = 0; |
2638 | } |
2639 | |
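/*
 * Try to mark the array clean.  Called with mddev->lock held; the lock is
 * dropped and re-taken around the synchronous switch of writes_pending to
 * atomic mode, and sync_checkers keeps that ref in atomic mode while any
 * checker is still running.  Returns the resulting in_sync state.
 */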
2640 | static bool set_in_sync(struct mddev *mddev) |
2641 | { |
2642 | lockdep_assert_held(&mddev->lock); |
2643 | if (!mddev->in_sync) { |
2644 | mddev->sync_checkers++; |
		spin_unlock(&mddev->lock);
		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
		spin_lock(&mddev->lock);
		if (!mddev->in_sync &&
		    percpu_ref_is_zero(&mddev->writes_pending)) {
2650 | mddev->in_sync = 1; |
2651 | /* |
2652 | * Ensure ->in_sync is visible before we clear |
2653 | * ->sync_checkers. |
2654 | */ |
2655 | smp_mb(); |
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify_dirent_safe(mddev->sysfs_state);
2658 | } |
2659 | if (--mddev->sync_checkers == 0) |
			percpu_ref_switch_to_percpu(&mddev->writes_pending);
2661 | } |
2662 | if (mddev->safemode == 1) |
2663 | mddev->safemode = 0; |
2664 | return mddev->in_sync; |
2665 | } |
2666 | |
2667 | static void sync_sbs(struct mddev *mddev, int nospares) |
2668 | { |
2669 | /* Update each superblock (in-memory image), but |
2670 | * if we are allowed to, skip spares which already |
2671 | * have the right event counter, or have one earlier |
2672 | * (which would mean they aren't being marked as dirty |
2673 | * with the rest of the array) |
2674 | */ |
2675 | struct md_rdev *rdev; |
2676 | rdev_for_each(rdev, mddev) { |
2677 | if (rdev->sb_events == mddev->events || |
2678 | (nospares && |
2679 | rdev->raid_disk < 0 && |
2680 | rdev->sb_events+1 == mddev->events)) { |
2681 | /* Don't update this superblock */ |
2682 | rdev->sb_loaded = 2; |
2683 | } else { |
2684 | sync_super(mddev, rdev); |
2685 | rdev->sb_loaded = 1; |
2686 | } |
2687 | } |
2688 | } |
2689 | |
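/*
 * Check whether the on-disk v1 superblock still matches the in-memory
 * state: compare each device's recorded role with its current state,
 * then the array-wide geometry fields.  Used in the clustered path to
 * skip metadata updates another node has already performed.
 */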
2690 | static bool does_sb_need_changing(struct mddev *mddev) |
2691 | { |
2692 | struct md_rdev *rdev = NULL, *iter; |
2693 | struct mdp_superblock_1 *sb; |
2694 | int role; |
2695 | |
2696 | /* Find a good rdev */ |
2697 | rdev_for_each(iter, mddev) |
2698 | if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { |
2699 | rdev = iter; |
2700 | break; |
2701 | } |
2702 | |
2703 | /* No good device found. */ |
2704 | if (!rdev) |
2705 | return false; |
2706 | |
2707 | sb = page_address(rdev->sb_page); |
2708 | /* Check if a device has become faulty or a spare become active */ |
2709 | rdev_for_each(rdev, mddev) { |
2710 | role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); |
2711 | /* Device activated? */ |
2712 | if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && |
2713 | !test_bit(Faulty, &rdev->flags)) |
2714 | return true; |
2715 | /* Device turned faulty? */ |
2716 | if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) |
2717 | return true; |
2718 | } |
2719 | |
2720 | /* Check if any mddev parameters have changed */ |
2721 | if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || |
2722 | (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || |
2723 | (mddev->layout != le32_to_cpu(sb->layout)) || |
2724 | (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || |
2725 | (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) |
2726 | return true; |
2727 | |
2728 | return false; |
2729 | } |
2730 | |
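/*
 * Write the in-memory superblocks out to all member devices, retrying
 * individual writes via the rewrite loop and repeating the whole pass if
 * further changes race in before MD_SB_CHANGE_PENDING can be cleared.
 */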
2731 | void md_update_sb(struct mddev *mddev, int force_change) |
2732 | { |
2733 | struct md_rdev *rdev; |
2734 | int sync_req; |
2735 | int nospares = 0; |
2736 | int any_badblocks_changed = 0; |
2737 | int ret = -1; |
2738 | |
2739 | if (!md_is_rdwr(mddev)) { |
2740 | if (force_change) |
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2742 | return; |
2743 | } |
2744 | |
2745 | repeat: |
2746 | if (mddev_is_clustered(mddev)) { |
		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
			force_change = 1;
		if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
			nospares = 1;
		ret = md_cluster_ops->metadata_update_start(mddev);
		/* Has someone else updated the sb? */
2753 | if (!does_sb_need_changing(mddev)) { |
2754 | if (ret == 0) |
2755 | md_cluster_ops->metadata_update_cancel(mddev); |
2756 | bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), |
2757 | BIT(MD_SB_CHANGE_DEVS) | |
2758 | BIT(MD_SB_CHANGE_CLEAN)); |
2759 | return; |
2760 | } |
2761 | } |
2762 | |
2763 | /* |
2764 | * First make sure individual recovery_offsets are correct |
2765 | * curr_resync_completed can only be used during recovery. |
2766 | * During reshape/resync it might use array-addresses rather |
	 * than device addresses.
2768 | */ |
2769 | rdev_for_each(rdev, mddev) { |
2770 | if (rdev->raid_disk >= 0 && |
2771 | mddev->delta_disks >= 0 && |
2772 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && |
2773 | test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && |
2774 | !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
2775 | !test_bit(Journal, &rdev->flags) && |
2776 | !test_bit(In_sync, &rdev->flags) && |
2777 | mddev->curr_resync_completed > rdev->recovery_offset) |
2778 | rdev->recovery_offset = mddev->curr_resync_completed; |
2779 | |
2780 | } |
2781 | if (!mddev->persistent) { |
		clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		if (!mddev->external) {
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			rdev_for_each(rdev, mddev) {
				if (rdev->badblocks.changed) {
					rdev->badblocks.changed = 0;
					ack_all_badblocks(&rdev->badblocks);
					md_error(mddev, rdev);
				}
				clear_bit(Blocked, &rdev->flags);
				clear_bit(BlockedBadBlocks, &rdev->flags);
2794 | wake_up(&rdev->blocked_wait); |
2795 | } |
2796 | } |
2797 | wake_up(&mddev->sb_wait); |
2798 | return; |
2799 | } |
2800 | |
	spin_lock(&mddev->lock);

	mddev->utime = ktime_get_real_seconds();

	if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
		force_change = 1;
	if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
		/* just a clean<->dirty transition, possibly leave spares alone,
		 * though if events isn't the right even/odd, we will have to do
		 * spares after all
		 */
		nospares = 1;
2813 | if (force_change) |
2814 | nospares = 0; |
2815 | if (mddev->degraded) |
2816 | /* If the array is degraded, then skipping spares is both |
2817 | * dangerous and fairly pointless. |
2818 | * Dangerous because a device that was removed from the array |
	 * might have an event_count that still looks up-to-date,
2820 | * so it can be re-added without a resync. |
2821 | * Pointless because if there are any spares to skip, |
2822 | * then a recovery will happen and soon that array won't |
2823 | * be degraded any more and the spare can go back to sleep then. |
2824 | */ |
2825 | nospares = 0; |
2826 | |
2827 | sync_req = mddev->in_sync; |
2828 | |
2829 | /* If this is just a dirty<->clean transition, and the array is clean |
2830 | * and 'events' is odd, we can roll back to the previous clean state */ |
2831 | if (nospares |
2832 | && (mddev->in_sync && mddev->recovery_cp == MaxSector) |
2833 | && mddev->can_decrease_events |
2834 | && mddev->events != 1) { |
2835 | mddev->events--; |
2836 | mddev->can_decrease_events = 0; |
2837 | } else { |
2838 | /* otherwise we have to go forward and ... */ |
2839 | mddev->events ++; |
2840 | mddev->can_decrease_events = nospares; |
2841 | } |
2842 | |
2843 | /* |
2844 | * This 64-bit counter should never wrap. |
2845 | * Either we are in around ~1 trillion A.C., assuming |
2846 | * 1 reboot per second, or we have a bug... |
2847 | */ |
2848 | WARN_ON(mddev->events == 0); |
2849 | |
2850 | rdev_for_each(rdev, mddev) { |
2851 | if (rdev->badblocks.changed) |
2852 | any_badblocks_changed++; |
2853 | if (test_bit(Faulty, &rdev->flags)) |
			set_bit(FaultRecorded, &rdev->flags);
2855 | } |
2856 | |
2857 | sync_sbs(mddev, nospares); |
	spin_unlock(&mddev->lock);
2859 | |
2860 | pr_debug("md: updating %s RAID superblock on device (in sync %d)\n" , |
2861 | mdname(mddev), mddev->in_sync); |
2862 | |
	mddev_add_trace_msg(mddev, "md md_update_sb");
2864 | rewrite: |
	md_bitmap_update_sb(mddev->bitmap);
2866 | rdev_for_each(rdev, mddev) { |
2867 | if (rdev->sb_loaded != 1) |
2868 | continue; /* no noise on spare devices */ |
2869 | |
2870 | if (!test_bit(Faulty, &rdev->flags)) { |
			md_super_write(mddev, rdev,
				       rdev->sb_start, rdev->sb_size,
				       rdev->sb_page);
			pr_debug("md: (write) %pg's sb offset: %llu\n",
2875 | rdev->bdev, |
2876 | (unsigned long long)rdev->sb_start); |
2877 | rdev->sb_events = mddev->events; |
2878 | if (rdev->badblocks.size) { |
				md_super_write(mddev, rdev,
					       rdev->badblocks.sector,
					       rdev->badblocks.size << 9,
					       rdev->bb_page);
2883 | rdev->badblocks.size = 0; |
2884 | } |
2885 | |
2886 | } else |
2887 | pr_debug("md: %pg (skipping faulty)\n" , |
2888 | rdev->bdev); |
2889 | } |
2890 | if (md_super_wait(mddev) < 0) |
2891 | goto rewrite; |
2892 | /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ |
2893 | |
2894 | if (mddev_is_clustered(mddev) && ret == 0) |
2895 | md_cluster_ops->metadata_update_finish(mddev); |
2896 | |
2897 | if (mddev->in_sync != sync_req || |
2898 | !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), |
2899 | BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) |
2900 | /* have to write it out again */ |
2901 | goto repeat; |
2902 | wake_up(&mddev->sb_wait); |
2903 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
		sysfs_notify_dirent_safe(mddev->sysfs_completed);
2905 | |
2906 | rdev_for_each(rdev, mddev) { |
		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
			clear_bit(Blocked, &rdev->flags);

		if (any_badblocks_changed)
			ack_all_badblocks(&rdev->badblocks);
		clear_bit(BlockedBadBlocks, &rdev->flags);
2913 | wake_up(&rdev->blocked_wait); |
2914 | } |
2915 | } |
2916 | EXPORT_SYMBOL(md_update_sb); |
2917 | |
2918 | static int add_bound_rdev(struct md_rdev *rdev) |
2919 | { |
2920 | struct mddev *mddev = rdev->mddev; |
2921 | int err = 0; |
2922 | bool add_journal = test_bit(Journal, &rdev->flags); |
2923 | |
2924 | if (!mddev->pers->hot_remove_disk || add_journal) { |
		/* If there is hot_add_disk but no hot_remove_disk,
		 * then added disks are for geometry changes and
		 * should be added immediately.
		 */
2929 | super_types[mddev->major_version]. |
2930 | validate_super(mddev, NULL/*freshest*/, rdev); |
2931 | err = mddev->pers->hot_add_disk(mddev, rdev); |
2932 | if (err) { |
2933 | md_kick_rdev_from_array(rdev); |
2934 | return err; |
2935 | } |
2936 | } |
	sysfs_notify_dirent_safe(rdev->sysfs_state);

	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	if (mddev->degraded)
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2943 | md_new_event(); |
2944 | return 0; |
2945 | } |
2946 | |
/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept in either case. For this we use cmd_match.
 */
2950 | static int cmd_match(const char *cmd, const char *str) |
2951 | { |
2952 | /* See if cmd, written into a sysfs file, matches |
2953 | * str. They must either be the same, or cmd can |
2954 | * have a trailing newline |
2955 | */ |
2956 | while (*cmd && *str && *cmd == *str) { |
2957 | cmd++; |
2958 | str++; |
2959 | } |
2960 | if (*cmd == '\n') |
2961 | cmd++; |
2962 | if (*str || *cmd) |
2963 | return 0; |
2964 | return 1; |
2965 | } |
2966 | |
2967 | struct rdev_sysfs_entry { |
2968 | struct attribute attr; |
2969 | ssize_t (*show)(struct md_rdev *, char *); |
2970 | ssize_t (*store)(struct md_rdev *, const char *, size_t); |
2971 | }; |
2972 | |
2973 | static ssize_t |
2974 | state_show(struct md_rdev *rdev, char *page) |
2975 | { |
2976 | char *sep = "," ; |
2977 | size_t len = 0; |
2978 | unsigned long flags = READ_ONCE(rdev->flags); |
2979 | |
2980 | if (test_bit(Faulty, &flags) || |
2981 | (!test_bit(ExternalBbl, &flags) && |
2982 | rdev->badblocks.unacked_exist)) |
2983 | len += sprintf(buf: page+len, fmt: "faulty%s" , sep); |
2984 | if (test_bit(In_sync, &flags)) |
2985 | len += sprintf(buf: page+len, fmt: "in_sync%s" , sep); |
2986 | if (test_bit(Journal, &flags)) |
2987 | len += sprintf(buf: page+len, fmt: "journal%s" , sep); |
2988 | if (test_bit(WriteMostly, &flags)) |
2989 | len += sprintf(buf: page+len, fmt: "write_mostly%s" , sep); |
2990 | if (test_bit(Blocked, &flags) || |
2991 | (rdev->badblocks.unacked_exist |
2992 | && !test_bit(Faulty, &flags))) |
2993 | len += sprintf(buf: page+len, fmt: "blocked%s" , sep); |
2994 | if (!test_bit(Faulty, &flags) && |
2995 | !test_bit(Journal, &flags) && |
2996 | !test_bit(In_sync, &flags)) |
2997 | len += sprintf(buf: page+len, fmt: "spare%s" , sep); |
2998 | if (test_bit(WriteErrorSeen, &flags)) |
2999 | len += sprintf(buf: page+len, fmt: "write_error%s" , sep); |
3000 | if (test_bit(WantReplacement, &flags)) |
3001 | len += sprintf(buf: page+len, fmt: "want_replacement%s" , sep); |
3002 | if (test_bit(Replacement, &flags)) |
3003 | len += sprintf(buf: page+len, fmt: "replacement%s" , sep); |
3004 | if (test_bit(ExternalBbl, &flags)) |
3005 | len += sprintf(buf: page+len, fmt: "external_bbl%s" , sep); |
3006 | if (test_bit(FailFast, &flags)) |
3007 | len += sprintf(buf: page+len, fmt: "failfast%s" , sep); |
3008 | |
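	/* drop the trailing separator */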
3009 | if (len) |
3010 | len -= strlen(sep); |
3011 | |
	return len+sprintf(page+len, "\n");
3013 | } |
3014 | |
3015 | static ssize_t |
3016 | state_store(struct md_rdev *rdev, const char *buf, size_t len) |
3017 | { |
3018 | /* can write |
3019 | * faulty - simulates an error |
3020 | * remove - disconnects the device |
3021 | * writemostly - sets write_mostly |
3022 | * -writemostly - clears write_mostly |
3023 | * blocked - sets the Blocked flags |
3024 | * -blocked - clears the Blocked and possibly simulates an error |
3025 | * insync - sets Insync providing device isn't active |
3026 | * -insync - clear Insync for a device with a slot assigned, |
3027 | * so that it gets rebuilt based on bitmap |
3028 | * write_error - sets WriteErrorSeen |
3029 | * -write_error - clears WriteErrorSeen |
3030 | * {,-}failfast - set/clear FailFast |
3031 | */ |
3032 | |
3033 | struct mddev *mddev = rdev->mddev; |
3034 | int err = -EINVAL; |
3035 | bool need_update_sb = false; |
3036 | |
	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
		md_error(rdev->mddev, rdev);

		if (test_bit(MD_BROKEN, &rdev->mddev->flags))
			err = -EBUSY;
		else
			err = 0;
	} else if (cmd_match(buf, "remove")) {
		if (rdev->mddev->pers) {
			clear_bit(Blocked, &rdev->flags);
			remove_and_add_spares(rdev->mddev, rdev);
3048 | } |
3049 | if (rdev->raid_disk >= 0) |
3050 | err = -EBUSY; |
3051 | else { |
3052 | err = 0; |
3053 | if (mddev_is_clustered(mddev)) |
3054 | err = md_cluster_ops->remove_disk(mddev, rdev); |
3055 | |
3056 | if (err == 0) { |
3057 | md_kick_rdev_from_array(rdev); |
3058 | if (mddev->pers) |
					set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
3060 | md_new_event(); |
3061 | } |
3062 | } |
3063 | } else if (cmd_match(cmd: buf, str: "writemostly" )) { |
3064 | set_bit(nr: WriteMostly, addr: &rdev->flags); |
3065 | mddev_create_serial_pool(mddev: rdev->mddev, rdev); |
3066 | need_update_sb = true; |
3067 | err = 0; |
3068 | } else if (cmd_match(cmd: buf, str: "-writemostly" )) { |
3069 | mddev_destroy_serial_pool(mddev: rdev->mddev, rdev); |
3070 | clear_bit(nr: WriteMostly, addr: &rdev->flags); |
3071 | need_update_sb = true; |
3072 | err = 0; |
3073 | } else if (cmd_match(cmd: buf, str: "blocked" )) { |
3074 | set_bit(nr: Blocked, addr: &rdev->flags); |
3075 | err = 0; |
3076 | } else if (cmd_match(cmd: buf, str: "-blocked" )) { |
3077 | if (!test_bit(Faulty, &rdev->flags) && |
3078 | !test_bit(ExternalBbl, &rdev->flags) && |
3079 | rdev->badblocks.unacked_exist) { |
3080 | /* metadata handler doesn't understand badblocks, |
3081 | * so we need to fail the device |
3082 | */ |
			md_error(rdev->mddev, rdev);
		}
		clear_bit(Blocked, &rdev->flags);
		clear_bit(BlockedBadBlocks, &rdev->flags);
		wake_up(&rdev->blocked_wait);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);

		err = 0;
	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
		set_bit(In_sync, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "failfast")) {
		set_bit(FailFast, &rdev->flags);
		need_update_sb = true;
		err = 0;
	} else if (cmd_match(buf, "-failfast")) {
		clear_bit(FailFast, &rdev->flags);
		need_update_sb = true;
		err = 0;
	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
		   !test_bit(Journal, &rdev->flags)) {
		if (rdev->mddev->pers == NULL) {
			clear_bit(In_sync, &rdev->flags);
3106 | rdev->saved_raid_disk = rdev->raid_disk; |
3107 | rdev->raid_disk = -1; |
3108 | err = 0; |
3109 | } |
3110 | } else if (cmd_match(cmd: buf, str: "write_error" )) { |
3111 | set_bit(nr: WriteErrorSeen, addr: &rdev->flags); |
3112 | err = 0; |
3113 | } else if (cmd_match(cmd: buf, str: "-write_error" )) { |
3114 | clear_bit(nr: WriteErrorSeen, addr: &rdev->flags); |
3115 | err = 0; |
3116 | } else if (cmd_match(cmd: buf, str: "want_replacement" )) { |
3117 | /* Any non-spare device that is not a replacement can |
3118 | * become want_replacement at any time, but we then need to |
3119 | * check if recovery is needed. |
3120 | */ |
3121 | if (rdev->raid_disk >= 0 && |
3122 | !test_bit(Journal, &rdev->flags) && |
3123 | !test_bit(Replacement, &rdev->flags)) |
			set_bit(WantReplacement, &rdev->flags);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		err = 0;
	} else if (cmd_match(buf, "-want_replacement")) {
3128 | /* Clearing 'want_replacement' is always allowed. |
3129 | * Once replacements starts it is too late though. |
3130 | */ |
3131 | err = 0; |
		clear_bit(WantReplacement, &rdev->flags);
	} else if (cmd_match(buf, "replacement")) {
3134 | /* Can only set a device as a replacement when array has not |
3135 | * yet been started. Once running, replacement is automatic |
3136 | * from spares, or by assigning 'slot'. |
3137 | */ |
3138 | if (rdev->mddev->pers) |
3139 | err = -EBUSY; |
3140 | else { |
			set_bit(Replacement, &rdev->flags);
3142 | err = 0; |
3143 | } |
3144 | } else if (cmd_match(cmd: buf, str: "-replacement" )) { |
3145 | /* Similarly, can only clear Replacement before start */ |
3146 | if (rdev->mddev->pers) |
3147 | err = -EBUSY; |
3148 | else { |
			clear_bit(Replacement, &rdev->flags);
3150 | err = 0; |
3151 | } |
3152 | } else if (cmd_match(cmd: buf, str: "re-add" )) { |
3153 | if (!rdev->mddev->pers) |
3154 | err = -EINVAL; |
3155 | else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && |
3156 | rdev->saved_raid_disk >= 0) { |
3157 | /* clear_bit is performed _after_ all the devices |
3158 | * have their local Faulty bit cleared. If any writes |
3159 | * happen in the meantime in the local node, they |
3160 | * will land in the local bitmap, which will be synced |
3161 | * by this node eventually |
3162 | */ |
			if (!mddev_is_clustered(rdev->mddev) ||
			    (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
				clear_bit(Faulty, &rdev->flags);
3166 | err = add_bound_rdev(rdev); |
3167 | } |
3168 | } else |
3169 | err = -EBUSY; |
3170 | } else if (cmd_match(cmd: buf, str: "external_bbl" ) && (rdev->mddev->external)) { |
3171 | set_bit(nr: ExternalBbl, addr: &rdev->flags); |
3172 | rdev->badblocks.shift = 0; |
3173 | err = 0; |
3174 | } else if (cmd_match(cmd: buf, str: "-external_bbl" ) && (rdev->mddev->external)) { |
3175 | clear_bit(nr: ExternalBbl, addr: &rdev->flags); |
3176 | err = 0; |
3177 | } |
3178 | if (need_update_sb) |
3179 | md_update_sb(mddev, 1); |
3180 | if (!err) |
sysfs_notify_dirent_safe(rdev->sysfs_state);
3182 | return err ? err : len; |
3183 | } |
3184 | static struct rdev_sysfs_entry rdev_state = |
3185 | __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); |
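
/*
 * Illustrative usage from userspace (assuming an array md0 with a
 * member device sda):
 *
 *   echo faulty > /sys/block/md0/md/dev-sda/state
 *   echo remove > /sys/block/md0/md/dev-sda/state
 */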
3186 | |
3187 | static ssize_t |
3188 | errors_show(struct md_rdev *rdev, char *page) |
3189 | { |
return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
3191 | } |
3192 | |
3193 | static ssize_t |
3194 | errors_store(struct md_rdev *rdev, const char *buf, size_t len) |
3195 | { |
3196 | unsigned int n; |
3197 | int rv; |
3198 | |
rv = kstrtouint(buf, 10, &n);
if (rv < 0)
return rv;
atomic_set(&rdev->corrected_errors, n);
3203 | return len; |
3204 | } |
3205 | static struct rdev_sysfs_entry rdev_errors = |
3206 | __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); |
3207 | |
3208 | static ssize_t |
3209 | slot_show(struct md_rdev *rdev, char *page) |
3210 | { |
3211 | if (test_bit(Journal, &rdev->flags)) |
return sprintf(page, "journal\n");
else if (rdev->raid_disk < 0)
return sprintf(page, "none\n");
else
return sprintf(page, "%d\n", rdev->raid_disk);
3217 | } |
3218 | |
3219 | static ssize_t |
3220 | slot_store(struct md_rdev *rdev, const char *buf, size_t len) |
3221 | { |
3222 | int slot; |
3223 | int err; |
3224 | |
3225 | if (test_bit(Journal, &rdev->flags)) |
3226 | return -EBUSY; |
if (strncmp(buf, "none", 4)==0)
3228 | slot = -1; |
3229 | else { |
err = kstrtouint(buf, 10, (unsigned int *)&slot);
3231 | if (err < 0) |
3232 | return err; |
3233 | if (slot < 0) |
3234 | /* overflow */ |
3235 | return -ENOSPC; |
3236 | } |
3237 | if (rdev->mddev->pers && slot == -1) { |
3238 | /* Setting 'slot' on an active array requires also |
3239 | * updating the 'rd%d' link, and communicating |
3240 | * with the personality with ->hot_*_disk. |
3241 | * For now we only support removing |
3242 | * failed/spare devices. This normally happens automatically, |
3243 | * but not when the metadata is externally managed. |
3244 | */ |
3245 | if (rdev->raid_disk == -1) |
3246 | return -EEXIST; |
3247 | /* personality does all needed checks */ |
3248 | if (rdev->mddev->pers->hot_remove_disk == NULL) |
3249 | return -EINVAL; |
clear_bit(Blocked, &rdev->flags);
remove_and_add_spares(rdev->mddev, rdev);
3252 | if (rdev->raid_disk >= 0) |
3253 | return -EBUSY; |
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3255 | } else if (rdev->mddev->pers) { |
3256 | /* Activating a spare .. or possibly reactivating |
3257 | * if we ever get bitmaps working here. |
3258 | */ |
3259 | int err; |
3260 | |
3261 | if (rdev->raid_disk != -1) |
3262 | return -EBUSY; |
3263 | |
3264 | if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) |
3265 | return -EBUSY; |
3266 | |
3267 | if (rdev->mddev->pers->hot_add_disk == NULL) |
3268 | return -EINVAL; |
3269 | |
3270 | if (slot >= rdev->mddev->raid_disks && |
3271 | slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) |
3272 | return -ENOSPC; |
3273 | |
3274 | rdev->raid_disk = slot; |
3275 | if (test_bit(In_sync, &rdev->flags)) |
3276 | rdev->saved_raid_disk = slot; |
3277 | else |
3278 | rdev->saved_raid_disk = -1; |
clear_bit(In_sync, &rdev->flags);
clear_bit(Bitmap_sync, &rdev->flags);
err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev);
if (err) {
rdev->raid_disk = -1;
return err;
} else
sysfs_notify_dirent_safe(rdev->sysfs_state);
/* failure here is OK */;
sysfs_link_rdev(rdev->mddev, rdev);
3289 | /* don't wakeup anyone, leave that to userspace. */ |
3290 | } else { |
3291 | if (slot >= rdev->mddev->raid_disks && |
3292 | slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) |
3293 | return -ENOSPC; |
3294 | rdev->raid_disk = slot; |
3295 | /* assume it is working */ |
clear_bit(Faulty, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
set_bit(In_sync, &rdev->flags);
sysfs_notify_dirent_safe(rdev->sysfs_state);
3300 | } |
3301 | return len; |
3302 | } |
3303 | |
3304 | static struct rdev_sysfs_entry rdev_slot = |
3305 | __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); |
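
/*
 * Illustrative usage (again assuming md0 with member sda):
 *
 *   echo 2 > /sys/block/md0/md/dev-sda/slot    # activate in raid slot 2
 *   echo none > /sys/block/md0/md/dev-sda/slot # remove from its slot
 */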
3306 | |
3307 | static ssize_t |
3308 | offset_show(struct md_rdev *rdev, char *page) |
3309 | { |
return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3311 | } |
3312 | |
3313 | static ssize_t |
3314 | offset_store(struct md_rdev *rdev, const char *buf, size_t len) |
3315 | { |
3316 | unsigned long long offset; |
if (kstrtoull(buf, 10, &offset) < 0)
3318 | return -EINVAL; |
3319 | if (rdev->mddev->pers && rdev->raid_disk >= 0) |
3320 | return -EBUSY; |
3321 | if (rdev->sectors && rdev->mddev->external) |
3322 | /* Must set offset before size, so overlap checks |
3323 | * can be sane */ |
3324 | return -EBUSY; |
3325 | rdev->data_offset = offset; |
3326 | rdev->new_data_offset = offset; |
3327 | return len; |
3328 | } |
3329 | |
3330 | static struct rdev_sysfs_entry rdev_offset = |
3331 | __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); |
3332 | |
3333 | static ssize_t new_offset_show(struct md_rdev *rdev, char *page) |
3334 | { |
return sprintf(page, "%llu\n",
3336 | (unsigned long long)rdev->new_data_offset); |
3337 | } |
3338 | |
3339 | static ssize_t new_offset_store(struct md_rdev *rdev, |
3340 | const char *buf, size_t len) |
3341 | { |
3342 | unsigned long long new_offset; |
3343 | struct mddev *mddev = rdev->mddev; |
3344 | |
if (kstrtoull(buf, 10, &new_offset) < 0)
3346 | return -EINVAL; |
3347 | |
3348 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
3349 | return -EBUSY; |
3350 | if (new_offset == rdev->data_offset) |
3351 | /* reset is always permitted */ |
3352 | ; |
3353 | else if (new_offset > rdev->data_offset) { |
3354 | /* must not push array size beyond rdev_sectors */ |
3355 | if (new_offset - rdev->data_offset |
3356 | + mddev->dev_sectors > rdev->sectors) |
3357 | return -E2BIG; |
3358 | } |
3359 | /* Metadata worries about other space details. */ |
3360 | |
3361 | /* decreasing the offset is inconsistent with a backwards |
3362 | * reshape. |
3363 | */ |
3364 | if (new_offset < rdev->data_offset && |
3365 | mddev->reshape_backwards) |
3366 | return -EINVAL; |
3367 | /* Increasing offset is inconsistent with forwards |
3368 | * reshape. reshape_direction should be set to |
3369 | * 'backwards' first. |
3370 | */ |
3371 | if (new_offset > rdev->data_offset && |
3372 | !mddev->reshape_backwards) |
3373 | return -EINVAL; |
3374 | |
3375 | if (mddev->pers && mddev->persistent && |
3376 | !super_types[mddev->major_version] |
3377 | .allow_new_offset(rdev, new_offset)) |
3378 | return -E2BIG; |
3379 | rdev->new_data_offset = new_offset; |
3380 | if (new_offset > rdev->data_offset) |
3381 | mddev->reshape_backwards = 1; |
3382 | else if (new_offset < rdev->data_offset) |
3383 | mddev->reshape_backwards = 0; |
3384 | |
3385 | return len; |
3386 | } |
3387 | static struct rdev_sysfs_entry rdev_new_offset = |
3388 | __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); |
3389 | |
3390 | static ssize_t |
3391 | rdev_size_show(struct md_rdev *rdev, char *page) |
3392 | { |
return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3394 | } |
3395 | |
static bool md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
3397 | { |
3398 | /* check if two start/length pairs overlap */ |
3399 | if (a->data_offset + a->sectors <= b->data_offset) |
3400 | return false; |
3401 | if (b->data_offset + b->sectors <= a->data_offset) |
3402 | return false; |
3403 | return true; |
3404 | } |
3405 | |
3406 | static bool md_rdev_overlaps(struct md_rdev *rdev) |
3407 | { |
3408 | struct mddev *mddev; |
3409 | struct md_rdev *rdev2; |
3410 | |
spin_lock(&all_mddevs_lock);
list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
if (test_bit(MD_DELETED, &mddev->flags))
continue;
rdev_for_each(rdev2, mddev) {
if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
md_rdevs_overlap(rdev, rdev2)) {
spin_unlock(&all_mddevs_lock);
return true;
}
}
}
spin_unlock(&all_mddevs_lock);
3424 | return false; |
3425 | } |
3426 | |
3427 | static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) |
3428 | { |
3429 | unsigned long long blocks; |
3430 | sector_t new; |
3431 | |
if (kstrtoull(buf, 10, &blocks) < 0)
3433 | return -EINVAL; |
3434 | |
3435 | if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) |
3436 | return -EINVAL; /* sector conversion overflow */ |
3437 | |
3438 | new = blocks * 2; |
3439 | if (new != blocks * 2) |
3440 | return -EINVAL; /* unsigned long long to sector_t overflow */ |
3441 | |
3442 | *sectors = new; |
3443 | return 0; |
3444 | } |
3445 | |
3446 | static ssize_t |
3447 | rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) |
3448 | { |
3449 | struct mddev *my_mddev = rdev->mddev; |
3450 | sector_t oldsectors = rdev->sectors; |
3451 | sector_t sectors; |
3452 | |
3453 | if (test_bit(Journal, &rdev->flags)) |
3454 | return -EBUSY; |
if (strict_blocks_to_sectors(buf, &sectors) < 0)
3456 | return -EINVAL; |
3457 | if (rdev->data_offset != rdev->new_data_offset) |
3458 | return -EINVAL; /* too confusing */ |
3459 | if (my_mddev->pers && rdev->raid_disk >= 0) { |
3460 | if (my_mddev->persistent) { |
3461 | sectors = super_types[my_mddev->major_version]. |
3462 | rdev_size_change(rdev, sectors); |
3463 | if (!sectors) |
3464 | return -EBUSY; |
3465 | } else if (!sectors) |
sectors = bdev_nr_sectors(rdev->bdev) -
3467 | rdev->data_offset; |
3468 | if (!my_mddev->pers->resize) |
3469 | /* Cannot change size for RAID0 or Linear etc */ |
3470 | return -EINVAL; |
3471 | } |
3472 | if (sectors < my_mddev->dev_sectors) |
3473 | return -EINVAL; /* component must fit device */ |
3474 | |
3475 | rdev->sectors = sectors; |
3476 | |
3477 | /* |
3478 | * Check that all other rdevs with the same bdev do not overlap. This |
3479 | * check does not provide a hard guarantee, it just helps avoid |
3480 | * dangerous mistakes. |
3481 | */ |
3482 | if (sectors > oldsectors && my_mddev->external && |
3483 | md_rdev_overlaps(rdev)) { |
3484 | /* |
3485 | * Someone else could have slipped in a size change here, but |
3486 | * doing so is just silly. We put oldsectors back because we |
3487 | * know it is safe, and trust userspace not to race with itself. |
3488 | */ |
3489 | rdev->sectors = oldsectors; |
3490 | return -EBUSY; |
3491 | } |
3492 | return len; |
3493 | } |
3494 | |
3495 | static struct rdev_sysfs_entry rdev_size = |
3496 | __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); |
3497 | |
3498 | static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) |
3499 | { |
3500 | unsigned long long recovery_start = rdev->recovery_offset; |
3501 | |
3502 | if (test_bit(In_sync, &rdev->flags) || |
3503 | recovery_start == MaxSector) |
return sprintf(page, "none\n");

return sprintf(page, "%llu\n", recovery_start);
3507 | } |
3508 | |
3509 | static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) |
3510 | { |
3511 | unsigned long long recovery_start; |
3512 | |
if (cmd_match(buf, "none"))
recovery_start = MaxSector;
else if (kstrtoull(buf, 10, &recovery_start))
3516 | return -EINVAL; |
3517 | |
3518 | if (rdev->mddev->pers && |
3519 | rdev->raid_disk >= 0) |
3520 | return -EBUSY; |
3521 | |
3522 | rdev->recovery_offset = recovery_start; |
3523 | if (recovery_start == MaxSector) |
set_bit(In_sync, &rdev->flags);
else
clear_bit(In_sync, &rdev->flags);
3527 | return len; |
3528 | } |
3529 | |
3530 | static struct rdev_sysfs_entry rdev_recovery_start = |
3531 | __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); |
3532 | |
3533 | /* sysfs access to bad-blocks list. |
3534 | * We present two files. |
3535 | * 'bad-blocks' lists sector numbers and lengths of ranges that |
3536 | * are recorded as bad. The list is truncated to fit within |
3537 | * the one-page limit of sysfs. |
* Writing "sector length" to this file adds an acknowledged
* bad block to the list.
3540 | * 'unacknowledged-bad-blocks' lists bad blocks that have not yet |
3541 | * been acknowledged. Writing to this file adds bad blocks |
3542 | * without acknowledging them. This is largely for testing. |
3543 | */ |
3544 | static ssize_t bb_show(struct md_rdev *rdev, char *page) |
3545 | { |
return badblocks_show(&rdev->badblocks, page, 0);
3547 | } |
3548 | static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) |
3549 | { |
int rv = badblocks_store(&rdev->badblocks, page, len, 0);
/* Maybe that ack was all we needed */
if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3553 | wake_up(&rdev->blocked_wait); |
3554 | return rv; |
3555 | } |
3556 | static struct rdev_sysfs_entry rdev_bad_blocks = |
3557 | __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); |
3558 | |
3559 | static ssize_t ubb_show(struct md_rdev *rdev, char *page) |
3560 | { |
return badblocks_show(&rdev->badblocks, page, 1);
3562 | } |
3563 | static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) |
3564 | { |
return badblocks_store(&rdev->badblocks, page, len, 1);
3566 | } |
3567 | static struct rdev_sysfs_entry rdev_unack_bad_blocks = |
3568 | __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); |
3569 | |
3570 | static ssize_t |
3571 | ppl_sector_show(struct md_rdev *rdev, char *page) |
3572 | { |
return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3574 | } |
3575 | |
3576 | static ssize_t |
3577 | ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) |
3578 | { |
3579 | unsigned long long sector; |
3580 | |
if (kstrtoull(buf, 10, &sector) < 0)
3582 | return -EINVAL; |
3583 | if (sector != (sector_t)sector) |
3584 | return -EINVAL; |
3585 | |
3586 | if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && |
3587 | rdev->raid_disk >= 0) |
3588 | return -EBUSY; |
3589 | |
3590 | if (rdev->mddev->persistent) { |
3591 | if (rdev->mddev->major_version == 0) |
3592 | return -EINVAL; |
3593 | if ((sector > rdev->sb_start && |
3594 | sector - rdev->sb_start > S16_MAX) || |
3595 | (sector < rdev->sb_start && |
3596 | rdev->sb_start - sector > -S16_MIN)) |
3597 | return -EINVAL; |
3598 | rdev->ppl.offset = sector - rdev->sb_start; |
3599 | } else if (!rdev->mddev->external) { |
3600 | return -EBUSY; |
3601 | } |
3602 | rdev->ppl.sector = sector; |
3603 | return len; |
3604 | } |
3605 | |
3606 | static struct rdev_sysfs_entry rdev_ppl_sector = |
3607 | __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); |
3608 | |
3609 | static ssize_t |
3610 | ppl_size_show(struct md_rdev *rdev, char *page) |
3611 | { |
return sprintf(page, "%u\n", rdev->ppl.size);
3613 | } |
3614 | |
3615 | static ssize_t |
3616 | ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) |
3617 | { |
3618 | unsigned int size; |
3619 | |
if (kstrtouint(buf, 10, &size) < 0)
3621 | return -EINVAL; |
3622 | |
3623 | if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && |
3624 | rdev->raid_disk >= 0) |
3625 | return -EBUSY; |
3626 | |
3627 | if (rdev->mddev->persistent) { |
3628 | if (rdev->mddev->major_version == 0) |
3629 | return -EINVAL; |
3630 | if (size > U16_MAX) |
3631 | return -EINVAL; |
3632 | } else if (!rdev->mddev->external) { |
3633 | return -EBUSY; |
3634 | } |
3635 | rdev->ppl.size = size; |
3636 | return len; |
3637 | } |
3638 | |
3639 | static struct rdev_sysfs_entry rdev_ppl_size = |
3640 | __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); |
3641 | |
3642 | static struct attribute *rdev_default_attrs[] = { |
3643 | &rdev_state.attr, |
3644 | &rdev_errors.attr, |
3645 | &rdev_slot.attr, |
3646 | &rdev_offset.attr, |
3647 | &rdev_new_offset.attr, |
3648 | &rdev_size.attr, |
3649 | &rdev_recovery_start.attr, |
3650 | &rdev_bad_blocks.attr, |
3651 | &rdev_unack_bad_blocks.attr, |
3652 | &rdev_ppl_sector.attr, |
3653 | &rdev_ppl_size.attr, |
3654 | NULL, |
3655 | }; |
3656 | ATTRIBUTE_GROUPS(rdev_default); |
3657 | static ssize_t |
3658 | rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) |
3659 | { |
3660 | struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); |
3661 | struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); |
3662 | |
3663 | if (!entry->show) |
3664 | return -EIO; |
3665 | if (!rdev->mddev) |
3666 | return -ENODEV; |
3667 | return entry->show(rdev, page); |
3668 | } |
3669 | |
3670 | static ssize_t |
3671 | rdev_attr_store(struct kobject *kobj, struct attribute *attr, |
3672 | const char *page, size_t length) |
3673 | { |
3674 | struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); |
3675 | struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); |
3676 | struct kernfs_node *kn = NULL; |
3677 | bool suspend = false; |
3678 | ssize_t rv; |
3679 | struct mddev *mddev = READ_ONCE(rdev->mddev); |
3680 | |
3681 | if (!entry->store) |
3682 | return -EIO; |
3683 | if (!capable(CAP_SYS_ADMIN)) |
3684 | return -EACCES; |
3685 | if (!mddev) |
3686 | return -ENODEV; |
3687 | |
3688 | if (entry->store == state_store) { |
if (cmd_match(page, "remove"))
kn = sysfs_break_active_protection(kobj, attr);
if (cmd_match(page, "remove") || cmd_match(page, "re-add") ||
cmd_match(page, "writemostly") ||
cmd_match(page, "-writemostly"))
3694 | suspend = true; |
3695 | } |
3696 | |
3697 | rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); |
3698 | if (!rv) { |
3699 | if (rdev->mddev == NULL) |
3700 | rv = -ENODEV; |
3701 | else |
3702 | rv = entry->store(rdev, page, length); |
3703 | suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); |
3704 | } |
3705 | |
3706 | if (kn) |
3707 | sysfs_unbreak_active_protection(kn); |
3708 | |
3709 | return rv; |
3710 | } |
3711 | |
3712 | static void rdev_free(struct kobject *ko) |
3713 | { |
3714 | struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); |
kfree(rdev);
3716 | } |
3717 | static const struct sysfs_ops rdev_sysfs_ops = { |
3718 | .show = rdev_attr_show, |
3719 | .store = rdev_attr_store, |
3720 | }; |
3721 | static const struct kobj_type rdev_ktype = { |
3722 | .release = rdev_free, |
3723 | .sysfs_ops = &rdev_sysfs_ops, |
3724 | .default_groups = rdev_default_groups, |
3725 | }; |
3726 | |
3727 | int md_rdev_init(struct md_rdev *rdev) |
3728 | { |
3729 | rdev->desc_nr = -1; |
3730 | rdev->saved_raid_disk = -1; |
3731 | rdev->raid_disk = -1; |
3732 | rdev->flags = 0; |
3733 | rdev->data_offset = 0; |
3734 | rdev->new_data_offset = 0; |
3735 | rdev->sb_events = 0; |
3736 | rdev->last_read_error = 0; |
3737 | rdev->sb_loaded = 0; |
3738 | rdev->bb_page = NULL; |
atomic_set(&rdev->nr_pending, 0);
atomic_set(&rdev->read_errors, 0);
atomic_set(&rdev->corrected_errors, 0);

INIT_LIST_HEAD(&rdev->same_set);
3744 | init_waitqueue_head(&rdev->blocked_wait); |
3745 | |
3746 | /* Add space to store bad block list. |
3747 | * This reserves the space even on arrays where it cannot |
3748 | * be used - I wonder if that matters |
3749 | */ |
return badblocks_init(&rdev->badblocks, 0);
3751 | } |
3752 | EXPORT_SYMBOL_GPL(md_rdev_init); |
3753 | |
3754 | /* |
3755 | * Import a device. If 'super_format' >= 0, then sanity check the superblock |
3756 | * |
3757 | * mark the device faulty if: |
3758 | * |
3759 | * - the device is nonexistent (zero size) |
3760 | * - the device has no valid superblock |
3761 | * |
3762 | * a faulty rdev _never_ has rdev->sb set. |
3763 | */ |
3764 | static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) |
3765 | { |
3766 | struct md_rdev *rdev; |
3767 | sector_t size; |
3768 | int err; |
3769 | |
rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
if (!rdev)
return ERR_PTR(-ENOMEM);
3773 | |
3774 | err = md_rdev_init(rdev); |
3775 | if (err) |
3776 | goto out_free_rdev; |
3777 | err = alloc_disk_sb(rdev); |
3778 | if (err) |
3779 | goto out_clear_rdev; |
3780 | |
rdev->bdev_file = bdev_file_open_by_dev(newdev,
BLK_OPEN_READ | BLK_OPEN_WRITE,
super_format == -2 ? &claim_rdev : rdev, NULL);
if (IS_ERR(rdev->bdev_file)) {
pr_warn("md: could not open device unknown-block(%u,%u).\n",
MAJOR(newdev), MINOR(newdev));
err = PTR_ERR(rdev->bdev_file);
3788 | goto out_clear_rdev; |
3789 | } |
rdev->bdev = file_bdev(rdev->bdev_file);

kobject_init(&rdev->kobj, &rdev_ktype);

size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS;
if (!size) {
pr_warn("md: %pg has zero or unknown size, marking faulty!\n",
3797 | rdev->bdev); |
3798 | err = -EINVAL; |
3799 | goto out_blkdev_put; |
3800 | } |
3801 | |
3802 | if (super_format >= 0) { |
3803 | err = super_types[super_format]. |
3804 | load_super(rdev, NULL, super_minor); |
3805 | if (err == -EINVAL) { |
pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n",
3807 | rdev->bdev, |
3808 | super_format, super_minor); |
3809 | goto out_blkdev_put; |
3810 | } |
3811 | if (err < 0) { |
pr_warn("md: could not read %pg's sb, not importing!\n",
3813 | rdev->bdev); |
3814 | goto out_blkdev_put; |
3815 | } |
3816 | } |
3817 | |
3818 | return rdev; |
3819 | |
3820 | out_blkdev_put: |
3821 | fput(rdev->bdev_file); |
3822 | out_clear_rdev: |
3823 | md_rdev_clear(rdev); |
3824 | out_free_rdev: |
kfree(rdev);
return ERR_PTR(err);
3827 | } |
3828 | |
3829 | /* |
3830 | * Check a full RAID array for plausibility |
3831 | */ |
3832 | |
3833 | static int analyze_sbs(struct mddev *mddev) |
3834 | { |
3835 | int i; |
3836 | struct md_rdev *rdev, *freshest, *tmp; |
3837 | |
3838 | freshest = NULL; |
3839 | rdev_for_each_safe(rdev, tmp, mddev) |
3840 | switch (super_types[mddev->major_version]. |
3841 | load_super(rdev, freshest, mddev->minor_version)) { |
3842 | case 1: |
3843 | freshest = rdev; |
3844 | break; |
3845 | case 0: |
3846 | break; |
3847 | default: |
pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n",
3849 | rdev->bdev); |
3850 | md_kick_rdev_from_array(rdev); |
3851 | } |
3852 | |
3853 | /* Cannot find a valid fresh disk */ |
3854 | if (!freshest) { |
pr_warn("md: cannot find a valid disk\n");
3856 | return -EINVAL; |
3857 | } |
3858 | |
3859 | super_types[mddev->major_version]. |
3860 | validate_super(mddev, NULL/*freshest*/, freshest); |
3861 | |
3862 | i = 0; |
3863 | rdev_for_each_safe(rdev, tmp, mddev) { |
3864 | if (mddev->max_disks && |
3865 | (rdev->desc_nr >= mddev->max_disks || |
3866 | i > mddev->max_disks)) { |
pr_warn("md: %s: %pg: only %d devices permitted\n",
3868 | mdname(mddev), rdev->bdev, |
3869 | mddev->max_disks); |
3870 | md_kick_rdev_from_array(rdev); |
3871 | continue; |
3872 | } |
3873 | if (rdev != freshest) { |
3874 | if (super_types[mddev->major_version]. |
3875 | validate_super(mddev, freshest, rdev)) { |
pr_warn("md: kicking non-fresh %pg from array!\n",
3877 | rdev->bdev); |
3878 | md_kick_rdev_from_array(rdev); |
3879 | continue; |
3880 | } |
3881 | } |
3882 | if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && |
3883 | !test_bit(Journal, &rdev->flags)) { |
3884 | rdev->raid_disk = -1; |
clear_bit(In_sync, &rdev->flags);
3886 | } |
3887 | } |
3888 | |
3889 | return 0; |
3890 | } |
3891 | |
3892 | /* Read a fixed-point number. |
3893 | * Numbers in sysfs attributes should be in "standard" units where |
3894 | * possible, so time should be in seconds. |
* However we internally use a much smaller unit such as
3896 | * milliseconds or jiffies. |
3897 | * This function takes a decimal number with a possible fractional |
3898 | * component, and produces an integer which is the result of |
3899 | * multiplying that number by 10^'scale'. |
3900 | * all without any floating-point arithmetic. |
3901 | */ |
3902 | int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) |
3903 | { |
3904 | unsigned long result = 0; |
3905 | long decimals = -1; |
while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3907 | if (*cp == '.') |
3908 | decimals = 0; |
3909 | else if (decimals < scale) { |
3910 | unsigned int value; |
3911 | value = *cp - '0'; |
3912 | result = result * 10 + value; |
3913 | if (decimals >= 0) |
3914 | decimals++; |
3915 | } |
3916 | cp++; |
3917 | } |
3918 | if (*cp == '\n') |
3919 | cp++; |
3920 | if (*cp) |
3921 | return -EINVAL; |
3922 | if (decimals < 0) |
3923 | decimals = 0; |
*res = result * int_pow(10, scale - decimals);
3925 | return 0; |
3926 | } |
3927 | |
3928 | static ssize_t |
3929 | safe_delay_show(struct mddev *mddev, char *page) |
3930 | { |
3931 | unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; |
3932 | |
return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
3934 | } |
3935 | static ssize_t |
3936 | safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) |
3937 | { |
3938 | unsigned long msec; |
3939 | |
3940 | if (mddev_is_clustered(mddev)) { |
pr_warn("md: Safemode is disabled for clustered mode\n");
return -EINVAL;
}

if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
3946 | return -EINVAL; |
3947 | if (msec == 0) |
3948 | mddev->safemode_delay = 0; |
3949 | else { |
3950 | unsigned long old_delay = mddev->safemode_delay; |
3951 | unsigned long new_delay = (msec*HZ)/1000; |
3952 | |
3953 | if (new_delay == 0) |
3954 | new_delay = 1; |
3955 | mddev->safemode_delay = new_delay; |
3956 | if (new_delay < old_delay || old_delay == 0) |
mod_timer(&mddev->safemode_timer, jiffies+1);
3958 | } |
3959 | return len; |
3960 | } |
3961 | static struct md_sysfs_entry md_safe_delay = |
__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR, safe_delay_show, safe_delay_store);
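
/*
 * Illustrative usage: the delay is written as (fractional) seconds and
 * parsed by strict_strtoul_scaled() above, e.g. 200 milliseconds:
 *
 *   echo 0.200 > /sys/block/md0/md/safe_mode_delay
 */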
3963 | |
3964 | static ssize_t |
3965 | level_show(struct mddev *mddev, char *page) |
3966 | { |
3967 | struct md_personality *p; |
3968 | int ret; |
spin_lock(&mddev->lock);
p = mddev->pers;
if (p)
ret = sprintf(page, "%s\n", p->name);
else if (mddev->clevel[0])
ret = sprintf(page, "%s\n", mddev->clevel);
else if (mddev->level != LEVEL_NONE)
ret = sprintf(page, "%d\n", mddev->level);
else
ret = 0;
spin_unlock(&mddev->lock);
3980 | return ret; |
3981 | } |
3982 | |
3983 | static ssize_t |
3984 | level_store(struct mddev *mddev, const char *buf, size_t len) |
3985 | { |
3986 | char clevel[16]; |
3987 | ssize_t rv; |
3988 | size_t slen = len; |
3989 | struct md_personality *pers, *oldpers; |
3990 | long level; |
3991 | void *priv, *oldpriv; |
3992 | struct md_rdev *rdev; |
3993 | |
3994 | if (slen == 0 || slen >= sizeof(clevel)) |
3995 | return -EINVAL; |
3996 | |
3997 | rv = mddev_suspend_and_lock(mddev); |
3998 | if (rv) |
3999 | return rv; |
4000 | |
4001 | if (mddev->pers == NULL) { |
4002 | memcpy(mddev->clevel, buf, slen); |
4003 | if (mddev->clevel[slen-1] == '\n') |
4004 | slen--; |
4005 | mddev->clevel[slen] = 0; |
4006 | mddev->level = LEVEL_NONE; |
4007 | rv = len; |
4008 | goto out_unlock; |
4009 | } |
4010 | rv = -EROFS; |
4011 | if (!md_is_rdwr(mddev)) |
4012 | goto out_unlock; |
4013 | |
/* request to change the personality. Need to ensure:
* - array is not engaged in resync/recovery/reshape
* - old personality can be suspended
* - new personality can take over the array.
*/
4019 | |
4020 | rv = -EBUSY; |
4021 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || |
4022 | mddev->reshape_position != MaxSector || |
4023 | mddev->sysfs_active) |
4024 | goto out_unlock; |
4025 | |
4026 | rv = -EINVAL; |
4027 | if (!mddev->pers->quiesce) { |
pr_warn("md: %s: %s does not support online personality change\n",
4029 | mdname(mddev), mddev->pers->name); |
4030 | goto out_unlock; |
4031 | } |
4032 | |
4033 | /* Now find the new personality */ |
4034 | memcpy(clevel, buf, slen); |
4035 | if (clevel[slen-1] == '\n') |
4036 | slen--; |
4037 | clevel[slen] = 0; |
if (kstrtol(clevel, 10, &level))
level = LEVEL_NONE;

if (request_module("md-%s", clevel) != 0)
request_module("md-level-%s", clevel);
spin_lock(&pers_lock);
pers = find_pers(level, clevel);
if (!pers || !try_module_get(pers->owner)) {
spin_unlock(&pers_lock);
pr_warn("md: personality %s not loaded\n", clevel);
4048 | rv = -EINVAL; |
4049 | goto out_unlock; |
4050 | } |
spin_unlock(&pers_lock);
4052 | |
4053 | if (pers == mddev->pers) { |
4054 | /* Nothing to do! */ |
module_put(pers->owner);
rv = len;
goto out_unlock;
}
if (!pers->takeover) {
module_put(pers->owner);
pr_warn("md: %s: %s does not support personality takeover\n",
4062 | mdname(mddev), clevel); |
4063 | rv = -EINVAL; |
4064 | goto out_unlock; |
4065 | } |
4066 | |
4067 | rdev_for_each(rdev, mddev) |
4068 | rdev->new_raid_disk = rdev->raid_disk; |
4069 | |
4070 | /* ->takeover must set new_* and/or delta_disks |
4071 | * if it succeeds, and may set them when it fails. |
4072 | */ |
4073 | priv = pers->takeover(mddev); |
if (IS_ERR(priv)) {
mddev->new_level = mddev->level;
mddev->new_layout = mddev->layout;
mddev->new_chunk_sectors = mddev->chunk_sectors;
mddev->raid_disks -= mddev->delta_disks;
mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
module_put(pers->owner);
pr_warn("md: %s: %s would not accept array\n",
mdname(mddev), clevel);
rv = PTR_ERR(priv);
4085 | goto out_unlock; |
4086 | } |
4087 | |
4088 | /* Looks like we have a winner */ |
4089 | mddev_detach(mddev); |
4090 | |
spin_lock(&mddev->lock);
oldpers = mddev->pers;
oldpriv = mddev->private;
mddev->pers = pers;
mddev->private = priv;
strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
mddev->level = mddev->new_level;
mddev->layout = mddev->new_layout;
mddev->chunk_sectors = mddev->new_chunk_sectors;
mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
mddev->degraded = 0;
spin_unlock(&mddev->lock);
4104 | |
4105 | if (oldpers->sync_request == NULL && |
4106 | mddev->external) { |
4107 | /* We are converting from a no-redundancy array |
4108 | * to a redundancy array and metadata is managed |
4109 | * externally so we need to be sure that writes |
4110 | * won't block due to a need to transition |
4111 | * clean->dirty |
4112 | * until external management is started. |
4113 | */ |
4114 | mddev->in_sync = 0; |
4115 | mddev->safemode_delay = 0; |
4116 | mddev->safemode = 0; |
4117 | } |
4118 | |
4119 | oldpers->free(mddev, oldpriv); |
4120 | |
4121 | if (oldpers->sync_request == NULL && |
4122 | pers->sync_request != NULL) { |
4123 | /* need to add the md_redundancy_group */ |
if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
pr_warn("md: cannot register extra attributes for %s\n",
mdname(mddev));
mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
4130 | } |
4131 | if (oldpers->sync_request != NULL && |
4132 | pers->sync_request == NULL) { |
4133 | /* need to remove the md_redundancy_group */ |
4134 | if (mddev->to_remove == NULL) |
4135 | mddev->to_remove = &md_redundancy_group; |
4136 | } |
4137 | |
module_put(oldpers->owner);
4139 | |
4140 | rdev_for_each(rdev, mddev) { |
4141 | if (rdev->raid_disk < 0) |
4142 | continue; |
4143 | if (rdev->new_raid_disk >= mddev->raid_disks) |
4144 | rdev->new_raid_disk = -1; |
4145 | if (rdev->new_raid_disk == rdev->raid_disk) |
4146 | continue; |
4147 | sysfs_unlink_rdev(mddev, rdev); |
4148 | } |
4149 | rdev_for_each(rdev, mddev) { |
4150 | if (rdev->raid_disk < 0) |
4151 | continue; |
4152 | if (rdev->new_raid_disk == rdev->raid_disk) |
4153 | continue; |
4154 | rdev->raid_disk = rdev->new_raid_disk; |
4155 | if (rdev->raid_disk < 0) |
clear_bit(In_sync, &rdev->flags);
else {
if (sysfs_link_rdev(mddev, rdev))
pr_warn("md: cannot register rd%d for %s after level change\n",
4160 | rdev->raid_disk, mdname(mddev)); |
4161 | } |
4162 | } |
4163 | |
4164 | if (pers->sync_request == NULL) { |
4165 | /* this is now an array without redundancy, so |
4166 | * it must always be in_sync |
4167 | */ |
4168 | mddev->in_sync = 1; |
del_timer_sync(&mddev->safemode_timer);
}
pers->run(mddev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
if (!mddev->thread)
md_update_sb(mddev, 1);
sysfs_notify_dirent_safe(mddev->sysfs_level);
4176 | md_new_event(); |
4177 | rv = len; |
4178 | out_unlock: |
4179 | mddev_unlock_and_resume(mddev); |
4180 | return rv; |
4181 | } |
4182 | |
4183 | static struct md_sysfs_entry md_level = |
4184 | __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); |
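
/*
 * Illustrative usage: an online takeover between levels that support
 * it, e.g. (assuming md0 is currently raid5 and the raid6 personality
 * module is available):
 *
 *   echo raid6 > /sys/block/md0/md/level
 */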
4185 | |
4186 | static ssize_t |
4187 | layout_show(struct mddev *mddev, char *page) |
4188 | { |
4189 | /* just a number, not meaningful for all levels */ |
4190 | if (mddev->reshape_position != MaxSector && |
4191 | mddev->layout != mddev->new_layout) |
return sprintf(page, "%d (%d)\n",
mddev->new_layout, mddev->layout);
return sprintf(page, "%d\n", mddev->layout);
4195 | } |
4196 | |
4197 | static ssize_t |
4198 | layout_store(struct mddev *mddev, const char *buf, size_t len) |
4199 | { |
4200 | unsigned int n; |
4201 | int err; |
4202 | |
err = kstrtouint(buf, 10, &n);
4204 | if (err < 0) |
4205 | return err; |
4206 | err = mddev_lock(mddev); |
4207 | if (err) |
4208 | return err; |
4209 | |
4210 | if (mddev->pers) { |
4211 | if (mddev->pers->check_reshape == NULL) |
4212 | err = -EBUSY; |
4213 | else if (!md_is_rdwr(mddev)) |
4214 | err = -EROFS; |
4215 | else { |
4216 | mddev->new_layout = n; |
4217 | err = mddev->pers->check_reshape(mddev); |
4218 | if (err) |
4219 | mddev->new_layout = mddev->layout; |
4220 | } |
4221 | } else { |
4222 | mddev->new_layout = n; |
4223 | if (mddev->reshape_position == MaxSector) |
4224 | mddev->layout = n; |
4225 | } |
4226 | mddev_unlock(mddev); |
4227 | return err ?: len; |
4228 | } |
4229 | static struct md_sysfs_entry md_layout = |
4230 | __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); |
4231 | |
4232 | static ssize_t |
4233 | raid_disks_show(struct mddev *mddev, char *page) |
4234 | { |
4235 | if (mddev->raid_disks == 0) |
4236 | return 0; |
4237 | if (mddev->reshape_position != MaxSector && |
4238 | mddev->delta_disks != 0) |
return sprintf(page, "%d (%d)\n", mddev->raid_disks,
mddev->raid_disks - mddev->delta_disks);
return sprintf(page, "%d\n", mddev->raid_disks);
4242 | } |
4243 | |
4244 | static int update_raid_disks(struct mddev *mddev, int raid_disks); |
4245 | |
4246 | static ssize_t |
4247 | raid_disks_store(struct mddev *mddev, const char *buf, size_t len) |
4248 | { |
4249 | unsigned int n; |
4250 | int err; |
4251 | |
err = kstrtouint(buf, 10, &n);
4253 | if (err < 0) |
4254 | return err; |
4255 | |
4256 | err = mddev_lock(mddev); |
4257 | if (err) |
4258 | return err; |
4259 | if (mddev->pers) |
err = update_raid_disks(mddev, n);
4261 | else if (mddev->reshape_position != MaxSector) { |
4262 | struct md_rdev *rdev; |
4263 | int olddisks = mddev->raid_disks - mddev->delta_disks; |
4264 | |
4265 | err = -EINVAL; |
4266 | rdev_for_each(rdev, mddev) { |
4267 | if (olddisks < n && |
4268 | rdev->data_offset < rdev->new_data_offset) |
4269 | goto out_unlock; |
4270 | if (olddisks > n && |
4271 | rdev->data_offset > rdev->new_data_offset) |
4272 | goto out_unlock; |
4273 | } |
4274 | err = 0; |
4275 | mddev->delta_disks = n - olddisks; |
4276 | mddev->raid_disks = n; |
4277 | mddev->reshape_backwards = (mddev->delta_disks < 0); |
4278 | } else |
4279 | mddev->raid_disks = n; |
4280 | out_unlock: |
4281 | mddev_unlock(mddev); |
4282 | return err ? err : len; |
4283 | } |
4284 | static struct md_sysfs_entry md_raid_disks = |
4285 | __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); |
4286 | |
4287 | static ssize_t |
4288 | uuid_show(struct mddev *mddev, char *page) |
4289 | { |
return sprintf(page, "%pU\n", mddev->uuid);
4291 | } |
4292 | static struct md_sysfs_entry md_uuid = |
4293 | __ATTR(uuid, S_IRUGO, uuid_show, NULL); |
4294 | |
4295 | static ssize_t |
4296 | chunk_size_show(struct mddev *mddev, char *page) |
4297 | { |
4298 | if (mddev->reshape_position != MaxSector && |
4299 | mddev->chunk_sectors != mddev->new_chunk_sectors) |
return sprintf(page, "%d (%d)\n",
mddev->new_chunk_sectors << 9,
mddev->chunk_sectors << 9);
return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4304 | } |
4305 | |
4306 | static ssize_t |
4307 | chunk_size_store(struct mddev *mddev, const char *buf, size_t len) |
4308 | { |
4309 | unsigned long n; |
4310 | int err; |
4311 | |
err = kstrtoul(buf, 10, &n);
4313 | if (err < 0) |
4314 | return err; |
4315 | |
4316 | err = mddev_lock(mddev); |
4317 | if (err) |
4318 | return err; |
4319 | if (mddev->pers) { |
4320 | if (mddev->pers->check_reshape == NULL) |
4321 | err = -EBUSY; |
4322 | else if (!md_is_rdwr(mddev)) |
4323 | err = -EROFS; |
4324 | else { |
4325 | mddev->new_chunk_sectors = n >> 9; |
4326 | err = mddev->pers->check_reshape(mddev); |
4327 | if (err) |
4328 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
4329 | } |
4330 | } else { |
4331 | mddev->new_chunk_sectors = n >> 9; |
4332 | if (mddev->reshape_position == MaxSector) |
4333 | mddev->chunk_sectors = n >> 9; |
4334 | } |
4335 | mddev_unlock(mddev); |
4336 | return err ?: len; |
4337 | } |
4338 | static struct md_sysfs_entry md_chunk_size = |
4339 | __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); |
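
/*
 * Illustrative usage: the value is in bytes (stored internally in
 * 512-byte sectors, hence the shifts by 9 above), e.g. a 512 KiB chunk:
 *
 *   echo 524288 > /sys/block/md0/md/chunk_size
 */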
4340 | |
4341 | static ssize_t |
4342 | resync_start_show(struct mddev *mddev, char *page) |
4343 | { |
4344 | if (mddev->recovery_cp == MaxSector) |
return sprintf(page, "none\n");
return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
4347 | } |
4348 | |
4349 | static ssize_t |
4350 | resync_start_store(struct mddev *mddev, const char *buf, size_t len) |
4351 | { |
4352 | unsigned long long n; |
4353 | int err; |
4354 | |
if (cmd_match(buf, "none"))
n = MaxSector;
else {
err = kstrtoull(buf, 10, &n);
4359 | if (err < 0) |
4360 | return err; |
4361 | if (n != (sector_t)n) |
4362 | return -EINVAL; |
4363 | } |
4364 | |
4365 | err = mddev_lock(mddev); |
4366 | if (err) |
4367 | return err; |
4368 | if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) |
4369 | err = -EBUSY; |
4370 | |
4371 | if (!err) { |
4372 | mddev->recovery_cp = n; |
4373 | if (mddev->pers) |
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4375 | } |
4376 | mddev_unlock(mddev); |
4377 | return err ?: len; |
4378 | } |
4379 | static struct md_sysfs_entry md_resync_start = |
4380 | __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, |
4381 | resync_start_show, resync_start_store); |
4382 | |
4383 | /* |
4384 | * The array state can be: |
4385 | * |
4386 | * clear |
4387 | * No devices, no size, no level |
4388 | * Equivalent to STOP_ARRAY ioctl |
4389 | * inactive |
4390 | * May have some settings, but array is not active |
4391 | * all IO results in error |
4392 | * When written, doesn't tear down array, but just stops it |
4393 | * suspended (not supported yet) |
4394 | * All IO requests will block. The array can be reconfigured. |
4395 | * Writing this, if accepted, will block until array is quiescent |
4396 | * readonly |
4397 | * no resync can happen. no superblocks get written. |
4398 | * write requests fail |
4399 | * read-auto |
4400 | * like readonly, but behaves like 'clean' on a write request. |
4401 | * |
4402 | * clean - no pending writes, but otherwise active. |
4403 | * When written to inactive array, starts without resync |
4404 | * If a write request arrives then |
4405 | * if metadata is known, mark 'dirty' and switch to 'active'. |
4406 | * if not known, block and switch to write-pending |
4407 | * If written to an active array that has pending writes, then fails. |
4408 | * active |
4409 | * fully active: IO and resync can be happening. |
4410 | * When written to inactive array, starts with resync |
4411 | * |
4412 | * write-pending |
4413 | * clean, but writes are blocked waiting for 'active' to be written. |
4414 | * |
4415 | * active-idle |
4416 | * like active, but no writes have been seen for a while (100msec). |
4417 | * |
4418 | * broken |
4419 | * Array is failed. It's useful because mounted-arrays aren't stopped |
4420 | * when array is failed, so this state will at least alert the user that |
4421 | * something is wrong. |
4422 | */ |
4423 | enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, |
4424 | write_pending, active_idle, broken, bad_word}; |
static char *array_states[] = {
"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
"write-pending", "active-idle", "broken", NULL };
4428 | |
4429 | static int match_word(const char *word, char **list) |
4430 | { |
4431 | int n; |
4432 | for (n=0; list[n]; n++) |
if (cmd_match(word, list[n]))
4434 | break; |
4435 | return n; |
4436 | } |
4437 | |
4438 | static ssize_t |
4439 | array_state_show(struct mddev *mddev, char *page) |
4440 | { |
4441 | enum array_state st = inactive; |
4442 | |
4443 | if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { |
4444 | switch(mddev->ro) { |
4445 | case MD_RDONLY: |
4446 | st = readonly; |
4447 | break; |
4448 | case MD_AUTO_READ: |
4449 | st = read_auto; |
4450 | break; |
4451 | case MD_RDWR: |
spin_lock(&mddev->lock);
if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
st = write_pending;
else if (mddev->in_sync)
st = clean;
else if (mddev->safemode)
st = active_idle;
else
st = active;
spin_unlock(&mddev->lock);
4462 | } |
4463 | |
4464 | if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) |
4465 | st = broken; |
4466 | } else { |
if (list_empty(&mddev->disks) &&
mddev->raid_disks == 0 &&
mddev->dev_sectors == 0)
st = clear;
else
st = inactive;
}
return sprintf(page, "%s\n", array_states[st]);
4475 | } |
4476 | |
4477 | static int do_md_stop(struct mddev *mddev, int ro); |
4478 | static int md_set_readonly(struct mddev *mddev); |
4479 | static int restart_array(struct mddev *mddev); |
4480 | |
4481 | static ssize_t |
4482 | array_state_store(struct mddev *mddev, const char *buf, size_t len) |
4483 | { |
4484 | int err = 0; |
enum array_state st = match_word(buf, array_states);
4486 | |
4487 | /* No lock dependent actions */ |
4488 | switch (st) { |
4489 | case suspended: /* not supported yet */ |
4490 | case write_pending: /* cannot be set */ |
4491 | case active_idle: /* cannot be set */ |
4492 | case broken: /* cannot be set */ |
4493 | case bad_word: |
4494 | return -EINVAL; |
4495 | case clear: |
4496 | case readonly: |
4497 | case inactive: |
4498 | case read_auto: |
4499 | if (!mddev->pers || !md_is_rdwr(mddev)) |
4500 | break; |
/* writing via sysfs does not open mddev, so the opener count should be 0 */
err = mddev_set_closing_and_sync_blockdev(mddev, 0);
4503 | if (err) |
4504 | return err; |
4505 | break; |
4506 | default: |
4507 | break; |
4508 | } |
4509 | |
4510 | if (mddev->pers && (st == active || st == clean) && |
4511 | mddev->ro != MD_RDONLY) { |
4512 | /* don't take reconfig_mutex when toggling between |
4513 | * clean and active |
4514 | */ |
spin_lock(&mddev->lock);
if (st == active) {
restart_array(mddev);
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
4520 | wake_up(&mddev->sb_wait); |
4521 | } else /* st == clean */ { |
4522 | restart_array(mddev); |
4523 | if (!set_in_sync(mddev)) |
4524 | err = -EBUSY; |
4525 | } |
4526 | if (!err) |
sysfs_notify_dirent_safe(mddev->sysfs_state);
spin_unlock(&mddev->lock);
4529 | return err ?: len; |
4530 | } |
4531 | err = mddev_lock(mddev); |
4532 | if (err) |
4533 | return err; |
4534 | |
4535 | switch (st) { |
4536 | case inactive: |
4537 | /* stop an active array, return 0 otherwise */ |
4538 | if (mddev->pers) |
err = do_md_stop(mddev, 2);
break;
case clear:
err = do_md_stop(mddev, 0);
4543 | break; |
4544 | case readonly: |
4545 | if (mddev->pers) |
4546 | err = md_set_readonly(mddev); |
4547 | else { |
4548 | mddev->ro = MD_RDONLY; |
set_disk_ro(mddev->gendisk, 1);
4550 | err = do_md_run(mddev); |
4551 | } |
4552 | break; |
4553 | case read_auto: |
4554 | if (mddev->pers) { |
4555 | if (md_is_rdwr(mddev)) |
4556 | err = md_set_readonly(mddev); |
4557 | else if (mddev->ro == MD_RDONLY) |
4558 | err = restart_array(mddev); |
4559 | if (err == 0) { |
4560 | mddev->ro = MD_AUTO_READ; |
set_disk_ro(mddev->gendisk, 0);
4562 | } |
4563 | } else { |
4564 | mddev->ro = MD_AUTO_READ; |
4565 | err = do_md_run(mddev); |
4566 | } |
4567 | break; |
4568 | case clean: |
4569 | if (mddev->pers) { |
4570 | err = restart_array(mddev); |
4571 | if (err) |
4572 | break; |
spin_lock(&mddev->lock);
if (!set_in_sync(mddev))
err = -EBUSY;
spin_unlock(&mddev->lock);
4577 | } else |
4578 | err = -EINVAL; |
4579 | break; |
4580 | case active: |
4581 | if (mddev->pers) { |
4582 | err = restart_array(mddev); |
4583 | if (err) |
4584 | break; |
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
wake_up(&mddev->sb_wait);
err = 0;
} else {
mddev->ro = MD_RDWR;
set_disk_ro(mddev->gendisk, 0);
4591 | err = do_md_run(mddev); |
4592 | } |
4593 | break; |
4594 | default: |
4595 | err = -EINVAL; |
4596 | break; |
4597 | } |
4598 | |
4599 | if (!err) { |
4600 | if (mddev->hold_active == UNTIL_IOCTL) |
4601 | mddev->hold_active = 0; |
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
mddev_unlock(mddev);

if (st == readonly || st == read_auto || st == inactive ||
(err && st == clear))
clear_bit(MD_CLOSING, &mddev->flags);
4609 | |
4610 | return err ?: len; |
4611 | } |
4612 | static struct md_sysfs_entry md_array_state = |
4613 | __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); |
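
/*
 * Illustrative usage: the states documented above are exchanged as
 * strings through this attribute, e.g.
 *
 *   cat /sys/block/md0/md/array_state        # e.g. "clean"
 *   echo readonly > /sys/block/md0/md/array_state
 */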
4614 | |
4615 | static ssize_t |
max_corrected_read_errors_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d\n",
atomic_read(&mddev->max_corr_read_errors));
4619 | } |
4620 | |
4621 | static ssize_t |
4622 | max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) |
4623 | { |
4624 | unsigned int n; |
4625 | int rv; |
4626 | |
rv = kstrtouint(buf, 10, &n);
if (rv < 0)
return rv;
if (n > INT_MAX)
return -EINVAL;
atomic_set(&mddev->max_corr_read_errors, n);
4633 | return len; |
4634 | } |
4635 | |
4636 | static struct md_sysfs_entry max_corr_read_errors = |
4637 | __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, |
4638 | max_corrected_read_errors_store); |
4639 | |
4640 | static ssize_t |
4641 | null_show(struct mddev *mddev, char *page) |
4642 | { |
4643 | return -EINVAL; |
4644 | } |
4645 | |
4646 | static ssize_t |
4647 | new_dev_store(struct mddev *mddev, const char *buf, size_t len) |
4648 | { |
4649 | /* buf must be %d:%d\n? giving major and minor numbers */ |
4650 | /* The new device is added to the array. |
4651 | * If the array has a persistent superblock, we read the |
4652 | * superblock to initialise info and check validity. |
4653 | * Otherwise, only checking done is that in bind_rdev_to_array, |
4654 | * which mainly checks size. |
4655 | */ |
4656 | char *e; |
4657 | int major = simple_strtoul(buf, &e, 10); |
4658 | int minor; |
4659 | dev_t dev; |
4660 | struct md_rdev *rdev; |
4661 | int err; |
4662 | |
4663 | if (!*buf || *e != ':' || !e[1] || e[1] == '\n') |
4664 | return -EINVAL; |
4665 | minor = simple_strtoul(e+1, &e, 10); |
4666 | if (*e && *e != '\n') |
4667 | return -EINVAL; |
4668 | dev = MKDEV(major, minor); |
4669 | if (major != MAJOR(dev) || |
4670 | minor != MINOR(dev)) |
4671 | return -EOVERFLOW; |
4672 | |
4673 | err = mddev_suspend_and_lock(mddev); |
4674 | if (err) |
4675 | return err; |
4676 | if (mddev->persistent) { |
rdev = md_import_device(dev, mddev->major_version,
mddev->minor_version);
if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4680 | struct md_rdev *rdev0 |
4681 | = list_entry(mddev->disks.next, |
4682 | struct md_rdev, same_set); |
4683 | err = super_types[mddev->major_version] |
4684 | .load_super(rdev, rdev0, mddev->minor_version); |
4685 | if (err < 0) |
4686 | goto out; |
4687 | } |
4688 | } else if (mddev->external) |
rdev = md_import_device(dev, -2, -1);
else
rdev = md_import_device(dev, -1, -1);

if (IS_ERR(rdev)) {
mddev_unlock_and_resume(mddev);
return PTR_ERR(rdev);
4696 | } |
4697 | err = bind_rdev_to_array(rdev, mddev); |
4698 | out: |
4699 | if (err) |
4700 | export_rdev(rdev, mddev); |
4701 | mddev_unlock_and_resume(mddev); |
4702 | if (!err) |
4703 | md_new_event(); |
4704 | return err ? err : len; |
4705 | } |
4706 | |
4707 | static struct md_sysfs_entry md_new_device = |
4708 | __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); |
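
/*
 * Illustrative usage: the device is named by "major:minor", e.g. adding
 * /dev/sdb (block device 8:16) to md0:
 *
 *   echo 8:16 > /sys/block/md0/md/new_dev
 */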
4709 | |
4710 | static ssize_t |
4711 | bitmap_store(struct mddev *mddev, const char *buf, size_t len) |
4712 | { |
4713 | char *end; |
4714 | unsigned long chunk, end_chunk; |
4715 | int err; |
4716 | |
4717 | err = mddev_lock(mddev); |
4718 | if (err) |
4719 | return err; |
4720 | if (!mddev->bitmap) |
4721 | goto out; |
4722 | /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ |
4723 | while (*buf) { |
4724 | chunk = end_chunk = simple_strtoul(buf, &end, 0); |
4725 | if (buf == end) break; |
4726 | if (*end == '-') { /* range */ |
4727 | buf = end + 1; |
4728 | end_chunk = simple_strtoul(buf, &end, 0); |
4729 | if (buf == end) break; |
4730 | } |
4731 | if (*end && !isspace(*end)) break; |
md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
buf = skip_spaces(end);
}
md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4736 | out: |
4737 | mddev_unlock(mddev); |
4738 | return len; |
4739 | } |
4740 | |
4741 | static struct md_sysfs_entry md_bitmap = |
4742 | __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); |
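
/*
 * Illustrative usage: chunks are given singly or as ranges, e.g.
 *
 *   echo "100-200 300" > /sys/block/md0/md/bitmap_set_bits
 *
 * dirties bitmap chunks 100 through 200 and chunk 300.
 */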
4743 | |
4744 | static ssize_t |
4745 | size_show(struct mddev *mddev, char *page) |
4746 | { |
return sprintf(page, "%llu\n",
4748 | (unsigned long long)mddev->dev_sectors / 2); |
4749 | } |
4750 | |
4751 | static int update_size(struct mddev *mddev, sector_t num_sectors); |
4752 | |
4753 | static ssize_t |
4754 | size_store(struct mddev *mddev, const char *buf, size_t len) |
4755 | { |
4756 | /* If array is inactive, we can reduce the component size, but |
4757 | * not increase it (except from 0). |
4758 | * If array is active, we can try an on-line resize |
4759 | */ |
4760 | sector_t sectors; |
int err = strict_blocks_to_sectors(buf, &sectors);
4762 | |
4763 | if (err < 0) |
4764 | return err; |
4765 | err = mddev_lock(mddev); |
4766 | if (err) |
4767 | return err; |
4768 | if (mddev->pers) { |
err = update_size(mddev, sectors);
4770 | if (err == 0) |
4771 | md_update_sb(mddev, 1); |
4772 | } else { |
4773 | if (mddev->dev_sectors == 0 || |
4774 | mddev->dev_sectors > sectors) |
4775 | mddev->dev_sectors = sectors; |
4776 | else |
4777 | err = -ENOSPC; |
4778 | } |
4779 | mddev_unlock(mddev); |
4780 | return err ? err : len; |
4781 | } |
4782 | |
4783 | static struct md_sysfs_entry md_size = |
4784 | __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); |
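
/*
 * Illustrative usage: the value is in 1 KiB blocks (hence the /2 from
 * 512-byte sectors above), e.g. clamping each member to 10 GiB:
 *
 *   echo 10485760 > /sys/block/md0/md/component_size
 */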
4785 | |
4786 | /* Metadata version. |
4787 | * This is one of |
4788 | * 'none' for arrays with no metadata (good luck...) |
4789 | * 'external' for arrays with externally managed metadata, |
4790 | * or N.M for internally known formats |
4791 | */ |
4792 | static ssize_t |
4793 | metadata_show(struct mddev *mddev, char *page) |
4794 | { |
if (mddev->persistent)
return sprintf(page, "%d.%d\n",
mddev->major_version, mddev->minor_version);
else if (mddev->external)
return sprintf(page, "external:%s\n", mddev->metadata_type);
else
return sprintf(page, "none\n");
4802 | } |
4803 | |
4804 | static ssize_t |
4805 | metadata_store(struct mddev *mddev, const char *buf, size_t len) |
4806 | { |
4807 | int major, minor; |
4808 | char *e; |
4809 | int err; |
4810 | /* Changing the details of 'external' metadata is |
4811 | * always permitted. Otherwise there must be |
4812 | * no devices attached to the array. |
4813 | */ |
4814 | |
4815 | err = mddev_lock(mddev); |
4816 | if (err) |
4817 | return err; |
4818 | err = -EBUSY; |
if (mddev->external && strncmp(buf, "external:", 9) == 0)
;
else if (!list_empty(&mddev->disks))
4822 | goto out_unlock; |
4823 | |
4824 | err = 0; |
if (cmd_match(buf, "none")) {
mddev->persistent = 0;
mddev->external = 0;
mddev->major_version = 0;
mddev->minor_version = 90;
goto out_unlock;
}
if (strncmp(buf, "external:", 9) == 0) {
4833 | size_t namelen = len-9; |
4834 | if (namelen >= sizeof(mddev->metadata_type)) |
4835 | namelen = sizeof(mddev->metadata_type)-1; |
4836 | memcpy(mddev->metadata_type, buf+9, namelen); |
4837 | mddev->metadata_type[namelen] = 0; |
4838 | if (namelen && mddev->metadata_type[namelen-1] == '\n') |
4839 | mddev->metadata_type[--namelen] = 0; |
4840 | mddev->persistent = 0; |
4841 | mddev->external = 1; |
4842 | mddev->major_version = 0; |
4843 | mddev->minor_version = 90; |
4844 | goto out_unlock; |
4845 | } |
4846 | major = simple_strtoul(buf, &e, 10); |
4847 | err = -EINVAL; |
4848 | if (e==buf || *e != '.') |
4849 | goto out_unlock; |
4850 | buf = e+1; |
4851 | minor = simple_strtoul(buf, &e, 10); |
4852 | if (e==buf || (*e && *e != '\n') ) |
4853 | goto out_unlock; |
4854 | err = -ENOENT; |
4855 | if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) |
4856 | goto out_unlock; |
4857 | mddev->major_version = major; |
4858 | mddev->minor_version = minor; |
4859 | mddev->persistent = 1; |
4860 | mddev->external = 0; |
4861 | err = 0; |
4862 | out_unlock: |
4863 | mddev_unlock(mddev); |
4864 | return err ?: len; |
4865 | } |
4866 | |
4867 | static struct md_sysfs_entry md_metadata = |
4868 | __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); |
4869 | |
4870 | static ssize_t |
4871 | action_show(struct mddev *mddev, char *page) |
4872 | { |
4873 | char *type = "idle" ; |
4874 | unsigned long recovery = mddev->recovery; |
4875 | if (test_bit(MD_RECOVERY_FROZEN, &recovery)) |
4876 | type = "frozen" ; |
4877 | else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || |
4878 | (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { |
4879 | if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) |
4880 | type = "reshape" ; |
4881 | else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { |
4882 | if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) |
4883 | type = "resync" ; |
4884 | else if (test_bit(MD_RECOVERY_CHECK, &recovery)) |
4885 | type = "check" ; |
4886 | else |
4887 | type = "repair" ; |
4888 | } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) |
4889 | type = "recover" ; |
4890 | else if (mddev->reshape_position != MaxSector) |
4891 | type = "reshape" ; |
4892 | } |
4893 | return sprintf(buf: page, fmt: "%s\n" , type); |
4894 | } |

/**
 * stop_sync_thread() - wait for sync_thread to stop if it's running.
 * @mddev:	the array.
 * @locked:	if set, reconfig_mutex will still be held after this function
 *		returns; if not set, reconfig_mutex will be released after this
 *		function returns.
 * @check_seq:	if set, only wait for the current running sync_thread to stop;
 *		note that a new sync_thread can still start.
 */
static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq)
{
	int sync_seq;

	if (check_seq)
		sync_seq = atomic_read(&mddev->sync_seq);

	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
		if (!locked)
			mddev_unlock(mddev);
		return;
	}

	mddev_unlock(mddev);

	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	/*
	 * Thread might be blocked waiting for metadata update which will now
	 * never happen
	 */
	md_wakeup_thread_directly(mddev->sync_thread);
	if (work_pending(&mddev->sync_work))
		flush_work(&mddev->sync_work);

	wait_event(resync_wait,
		   !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
		   (check_seq && sync_seq != atomic_read(&mddev->sync_seq)));

	if (locked)
		mddev_lock_nointr(mddev);
}

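/*
 * Clear MD_RECOVERY_FROZEN and wait for any running sync_thread to stop.
 * The caller must hold reconfig_mutex; it is still held on return.
 */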
void md_idle_sync_thread(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);

	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	stop_sync_thread(mddev, true, true);
}
EXPORT_SYMBOL_GPL(md_idle_sync_thread);

void md_frozen_sync_thread(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);

	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	stop_sync_thread(mddev, true, false);
}
EXPORT_SYMBOL_GPL(md_frozen_sync_thread);

void md_unfrozen_sync_thread(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);

	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
}
EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread);

static void idle_sync_thread(struct mddev *mddev)
{
	mutex_lock(&mddev->sync_mutex);
	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);

	if (mddev_lock(mddev)) {
		mutex_unlock(&mddev->sync_mutex);
		return;
	}

	stop_sync_thread(mddev, false, true);
	mutex_unlock(&mddev->sync_mutex);
}

static void frozen_sync_thread(struct mddev *mddev)
{
	mutex_lock(&mddev->sync_mutex);
	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);

	if (mddev_lock(mddev)) {
		mutex_unlock(&mddev->sync_mutex);
		return;
	}

	stop_sync_thread(mddev, false, false);
	mutex_unlock(&mddev->sync_mutex);
}

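/*
 * Keywords accepted below: "idle", "frozen", "resync", "recover",
 * "reshape", "check" and "repair".
 */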
static ssize_t
action_store(struct mddev *mddev, const char *page, size_t len)
{
	if (!mddev->pers || !mddev->pers->sync_request)
		return -EINVAL;

	if (cmd_match(page, "idle"))
		idle_sync_thread(mddev);
	else if (cmd_match(page, "frozen"))
		frozen_sync_thread(mddev);
	else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;
	else if (cmd_match(page, "resync"))
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	else if (cmd_match(page, "recover")) {
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	} else if (cmd_match(page, "reshape")) {
		int err;
		if (mddev->pers->start_reshape == NULL)
			return -EINVAL;
		err = mddev_lock(mddev);
		if (!err) {
			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
				err = -EBUSY;
			} else if (mddev->reshape_position == MaxSector ||
				   mddev->pers->check_reshape == NULL ||
				   mddev->pers->check_reshape(mddev)) {
				clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
				err = mddev->pers->start_reshape(mddev);
			} else {
				/*
				 * If reshape is still in progress, and
				 * md_check_recovery() can continue to reshape,
				 * don't restart reshape because data can be
				 * corrupted for raid456.
				 */
				clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
			}
			mddev_unlock(mddev);
		}
		if (err)
			return err;
		sysfs_notify_dirent_safe(mddev->sysfs_degraded);
	} else {
		if (cmd_match(page, "check"))
			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		else if (!cmd_match(page, "repair"))
			return -EINVAL;
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	}
	if (mddev->ro == MD_AUTO_READ) {
		/* A write to sync_action is enough to justify
		 * canceling read-auto mode
		 */
		flush_work(&mddev->sync_work);
		mddev->ro = MD_RDWR;
		md_wakeup_thread(mddev->sync_thread);
	}
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	return len;
}

static struct md_sysfs_entry md_scan_mode =
__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);

static ssize_t
last_sync_action_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%s\n", mddev->last_sync_action);
}

static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);

static ssize_t
mismatch_cnt_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)
		       atomic64_read(&mddev->resync_mismatches));
}

static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);

static ssize_t
sync_min_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%d (%s)\n", speed_min(mddev),
		       mddev->sync_speed_min ? "local" : "system");
}

static ssize_t
sync_min_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned int min;
	int rv;

	if (strncmp(buf, "system", 6)==0) {
		min = 0;
	} else {
		rv = kstrtouint(buf, 10, &min);
		if (rv < 0)
			return rv;
		if (min == 0)
			return -EINVAL;
	}
	mddev->sync_speed_min = min;
	return len;
}

static struct md_sysfs_entry md_sync_min =
__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);

static ssize_t
sync_max_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%d (%s)\n", speed_max(mddev),
		       mddev->sync_speed_max ? "local" : "system");
}

static ssize_t
sync_max_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned int max;
	int rv;

	if (strncmp(buf, "system", 6)==0) {
		max = 0;
	} else {
		rv = kstrtouint(buf, 10, &max);
		if (rv < 0)
			return rv;
		if (max == 0)
			return -EINVAL;
	}
	mddev->sync_speed_max = max;
	return len;
}

static struct md_sysfs_entry md_sync_max =
__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);

static ssize_t
degraded_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%d\n", mddev->degraded);
}
static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);

static ssize_t
sync_force_parallel_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%d\n", mddev->parallel_resync);
}

static ssize_t
sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
{
	long n;

	if (kstrtol(buf, 10, &n))
		return -EINVAL;

	if (n != 0 && n != 1)
		return -EINVAL;

	mddev->parallel_resync = n;

	if (mddev->sync_thread)
		wake_up(&resync_wait);

	return len;
}

/* force parallel resync, even with shared block devices */
static struct md_sysfs_entry md_sync_force_parallel =
__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
       sync_force_parallel_show, sync_force_parallel_store);

static ssize_t
sync_speed_show(struct mddev *mddev, char *page)
{
	unsigned long resync, dt, db;
	if (mddev->curr_resync == MD_RESYNC_NONE)
		return sprintf(page, "none\n");
	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
	dt = (jiffies - mddev->resync_mark) / HZ;
	if (!dt) dt++;
	db = resync - mddev->resync_mark_cnt;
	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
}

static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);

static ssize_t
sync_completed_show(struct mddev *mddev, char *page)
{
	unsigned long long max_sectors, resync;

	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return sprintf(page, "none\n");

	if (mddev->curr_resync == MD_RESYNC_YIELDED ||
	    mddev->curr_resync == MD_RESYNC_DELAYED)
		return sprintf(page, "delayed\n");

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		max_sectors = mddev->resync_max_sectors;
	else
		max_sectors = mddev->dev_sectors;

	resync = mddev->curr_resync_completed;
	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
}

static struct md_sysfs_entry md_sync_completed =
	__ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);

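/*
 * The sync_min/sync_max attributes below bound, in 512-byte sectors, the
 * region that a requested resync will cover; they are distinct from the
 * sync_speed_min/sync_speed_max throttling knobs above.
 */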
static ssize_t
min_sync_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)mddev->resync_min);
}
static ssize_t
min_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long long min;
	int err;

	if (kstrtoull(buf, 10, &min))
		return -EINVAL;

	spin_lock(&mddev->lock);
	err = -EINVAL;
	if (min > mddev->resync_max)
		goto out_unlock;

	err = -EBUSY;
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		goto out_unlock;

	/* Round down to multiple of 4K for safety */
	mddev->resync_min = round_down(min, 8);
	err = 0;

out_unlock:
	spin_unlock(&mddev->lock);
	return err ?: len;
}

static struct md_sysfs_entry md_min_sync =
__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);

static ssize_t
max_sync_show(struct mddev *mddev, char *page)
{
	if (mddev->resync_max == MaxSector)
		return sprintf(page, "max\n");
	else
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->resync_max);
}
static ssize_t
max_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
	int err;
	spin_lock(&mddev->lock);
	if (strncmp(buf, "max", 3) == 0)
		mddev->resync_max = MaxSector;
	else {
		unsigned long long max;
		int chunk;

		err = -EINVAL;
		if (kstrtoull(buf, 10, &max))
			goto out_unlock;
		if (max < mddev->resync_min)
			goto out_unlock;

		err = -EBUSY;
		if (max < mddev->resync_max && md_is_rdwr(mddev) &&
		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
			goto out_unlock;

		/* Must be a multiple of chunk_size */
		chunk = mddev->chunk_sectors;
		if (chunk) {
			sector_t temp = max;

			err = -EINVAL;
			if (sector_div(temp, chunk))
				goto out_unlock;
		}
		mddev->resync_max = max;
	}
	wake_up(&mddev->recovery_wait);
	err = 0;
out_unlock:
	spin_unlock(&mddev->lock);
	return err ?: len;
}

static struct md_sysfs_entry md_max_sync =
__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);

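/*
 * suspend_lo/suspend_hi delimit a suspended range of the array; the store
 * handlers below update the bound inside a mddev_suspend()/mddev_resume()
 * pair so in-flight IO drains before the new value takes effect.
 */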
static ssize_t
suspend_lo_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)READ_ONCE(mddev->suspend_lo));
}

static ssize_t
suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long long new;
	int err;

	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
		return -EINVAL;

	err = mddev_suspend(mddev, true);
	if (err)
		return err;

	WRITE_ONCE(mddev->suspend_lo, new);
	mddev_resume(mddev);

	return len;
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);

static ssize_t
suspend_hi_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)READ_ONCE(mddev->suspend_hi));
}

static ssize_t
suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long long new;
	int err;

	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
		return -EINVAL;

	err = mddev_suspend(mddev, true);
	if (err)
		return err;

	WRITE_ONCE(mddev->suspend_hi, new);
	mddev_resume(mddev);

	return len;
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);

static ssize_t
reshape_position_show(struct mddev *mddev, char *page)
{
	if (mddev->reshape_position != MaxSector)
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->reshape_position);
	strcpy(page, "none\n");
	return 5;
}

static ssize_t
reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{
	struct md_rdev *rdev;
	unsigned long long new;
	int err;

	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
		return -EINVAL;
	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EBUSY;
	if (mddev->pers)
		goto unlock;
	mddev->reshape_position = new;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
	mddev->new_level = mddev->level;
	mddev->new_layout = mddev->layout;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	rdev_for_each(rdev, mddev)
		rdev->new_data_offset = rdev->data_offset;
	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_reshape_position =
__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
       reshape_position_store);

static ssize_t
reshape_direction_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%s\n",
		       mddev->reshape_backwards ? "backwards" : "forwards");
}

static ssize_t
reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
{
	int backwards = 0;
	int err;

	if (cmd_match(buf, "forwards"))
		backwards = 0;
	else if (cmd_match(buf, "backwards"))
		backwards = 1;
	else
		return -EINVAL;
	if (mddev->reshape_backwards == backwards)
		return len;

	err = mddev_lock(mddev);
	if (err)
		return err;
	/* check if we are allowed to change */
	if (mddev->delta_disks)
		err = -EBUSY;
	else if (mddev->persistent &&
	    mddev->major_version == 0)
		err = -EINVAL;
	else
		mddev->reshape_backwards = backwards;
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_reshape_direction =
__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
       reshape_direction_store);

static ssize_t
array_size_show(struct mddev *mddev, char *page)
{
	if (mddev->external_size)
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->array_sectors/2);
	else
		return sprintf(page, "default\n");
}

static ssize_t
array_size_store(struct mddev *mddev, const char *buf, size_t len)
{
	sector_t sectors;
	int err;

	err = mddev_lock(mddev);
	if (err)
		return err;

	/* cluster raid doesn't support changing array_sectors */
	if (mddev_is_clustered(mddev)) {
		mddev_unlock(mddev);
		return -EINVAL;
	}

	if (strncmp(buf, "default", 7) == 0) {
		if (mddev->pers)
			sectors = mddev->pers->size(mddev, 0, 0);
		else
			sectors = mddev->array_sectors;

		mddev->external_size = 0;
	} else {
		if (strict_blocks_to_sectors(buf, &sectors) < 0)
			err = -EINVAL;
		else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
			err = -E2BIG;
		else
			mddev->external_size = 1;
	}

	if (!err) {
		mddev->array_sectors = sectors;
		if (mddev->pers)
			set_capacity_and_notify(mddev->gendisk,
						mddev->array_sectors);
	}
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_array_size =
__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
       array_size_store);

static ssize_t
consistency_policy_show(struct mddev *mddev, char *page)
{
	int ret;

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		ret = sprintf(page, "journal\n");
	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
		ret = sprintf(page, "ppl\n");
	} else if (mddev->bitmap) {
		ret = sprintf(page, "bitmap\n");
	} else if (mddev->pers) {
		if (mddev->pers->sync_request)
			ret = sprintf(page, "resync\n");
		else
			ret = sprintf(page, "none\n");
	} else {
		ret = sprintf(page, "unknown\n");
	}

	return ret;
}

static ssize_t
consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
{
	int err = 0;

	if (mddev->pers) {
		if (mddev->pers->change_consistency_policy)
			err = mddev->pers->change_consistency_policy(mddev, buf);
		else
			err = -EBUSY;
	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
		set_bit(MD_HAS_PPL, &mddev->flags);
	} else {
		err = -EINVAL;
	}

	return err ? err : len;
}

static struct md_sysfs_entry md_consistency_policy =
__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
       consistency_policy_store);

static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%d\n", mddev->fail_last_dev);
}

/*
 * Setting fail_last_dev to true allows the last device to be forcibly
 * removed from RAID1/RAID10.
 */
static ssize_t
fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
{
	int ret;
	bool value;

	ret = kstrtobool(buf, &value);
	if (ret)
		return ret;

	if (value != mddev->fail_last_dev)
		mddev->fail_last_dev = value;

	return len;
}
static struct md_sysfs_entry md_fail_last_dev =
__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
       fail_last_dev_store);

static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
{
	if (mddev->pers == NULL || (mddev->pers->level != 1))
		return sprintf(page, "n/a\n");
	else
		return sprintf(page, "%d\n", mddev->serialize_policy);
}

/*
 * Setting serialize_policy to true enforces that write IO is not reordered
 * for raid1.
 */
static ssize_t
serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
{
	int err;
	bool value;

	err = kstrtobool(buf, &value);
	if (err)
		return err;

	if (value == mddev->serialize_policy)
		return len;

	err = mddev_suspend_and_lock(mddev);
	if (err)
		return err;
	if (mddev->pers == NULL || (mddev->pers->level != 1)) {
		pr_err("md: serialize_policy is only effective for raid1\n");
		err = -EINVAL;
		goto unlock;
	}

	if (value)
		mddev_create_serial_pool(mddev, NULL);
	else
		mddev_destroy_serial_pool(mddev, NULL);
	mddev->serialize_policy = value;
unlock:
	mddev_unlock_and_resume(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_serialize_policy =
__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
       serialize_policy_store);

static struct attribute *md_default_attrs[] = {
	&md_level.attr,
	&md_layout.attr,
	&md_raid_disks.attr,
	&md_uuid.attr,
	&md_chunk_size.attr,
	&md_size.attr,
	&md_resync_start.attr,
	&md_metadata.attr,
	&md_new_device.attr,
	&md_safe_delay.attr,
	&md_array_state.attr,
	&md_reshape_position.attr,
	&md_reshape_direction.attr,
	&md_array_size.attr,
	&max_corr_read_errors.attr,
	&md_consistency_policy.attr,
	&md_fail_last_dev.attr,
	&md_serialize_policy.attr,
	NULL,
};

static const struct attribute_group md_default_group = {
	.attrs = md_default_attrs,
};

static struct attribute *md_redundancy_attrs[] = {
	&md_scan_mode.attr,
	&md_last_scan_mode.attr,
	&md_mismatches.attr,
	&md_sync_min.attr,
	&md_sync_max.attr,
	&md_sync_speed.attr,
	&md_sync_force_parallel.attr,
	&md_sync_completed.attr,
	&md_min_sync.attr,
	&md_max_sync.attr,
	&md_suspend_lo.attr,
	&md_suspend_hi.attr,
	&md_bitmap.attr,
	&md_degraded.attr,
	NULL,
};
static const struct attribute_group md_redundancy_group = {
	.name = NULL,
	.attrs = md_redundancy_attrs,
};

static const struct attribute_group *md_attr_groups[] = {
	&md_default_group,
	&md_bitmap_group,
	NULL,
};

static ssize_t
md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
	ssize_t rv;

	if (!entry->show)
		return -EIO;
	spin_lock(&all_mddevs_lock);
	if (!mddev_get(mddev)) {
		spin_unlock(&all_mddevs_lock);
		return -EBUSY;
	}
	spin_unlock(&all_mddevs_lock);

	rv = entry->show(mddev, page);
	mddev_put(mddev);
	return rv;
}

static ssize_t
md_attr_store(struct kobject *kobj, struct attribute *attr,
	      const char *page, size_t length)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
	ssize_t rv;

	if (!entry->store)
		return -EIO;
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	spin_lock(&all_mddevs_lock);
	if (!mddev_get(mddev)) {
		spin_unlock(&all_mddevs_lock);
		return -EBUSY;
	}
	spin_unlock(&all_mddevs_lock);
	rv = entry->store(mddev, page, length);
	mddev_put(mddev);
	return rv;
}

static void md_kobj_release(struct kobject *ko)
{
	struct mddev *mddev = container_of(ko, struct mddev, kobj);

	if (mddev->sysfs_state)
		sysfs_put(mddev->sysfs_state);
	if (mddev->sysfs_level)
		sysfs_put(mddev->sysfs_level);

	del_gendisk(mddev->gendisk);
	put_disk(mddev->gendisk);
}

static const struct sysfs_ops md_sysfs_ops = {
	.show = md_attr_show,
	.store = md_attr_store,
};
static const struct kobj_type md_ktype = {
	.release = md_kobj_release,
	.sysfs_ops = &md_sysfs_ops,
	.default_groups = md_attr_groups,
};

int mdp_major = 0;

/* stack the limit for all rdevs into lim */
void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset,
					mddev->gendisk->disk_name);
	}
}
EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);

/* apply the extra stacking limits from a new rdev into mddev */
int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	struct queue_limits lim;

	if (mddev_is_dm(mddev))
		return 0;

	lim = queue_limits_start_update(mddev->gendisk->queue);
	queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
				mddev->gendisk->disk_name);
	return queue_limits_commit_update(mddev->gendisk->queue, &lim);
}
EXPORT_SYMBOL_GPL(mddev_stack_new_rdev);

/* update the optimal I/O size after a reshape */
void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes)
{
	struct queue_limits lim;

	if (mddev_is_dm(mddev))
		return;

	/* don't bother updating io_opt if we can't suspend the array */
	if (mddev_suspend(mddev, false) < 0)
		return;
	lim = queue_limits_start_update(mddev->gendisk->queue);
	lim.io_opt = lim.io_min * nr_stripes;
	queue_limits_commit_update(mddev->gendisk->queue, &lim);
	mddev_resume(mddev);
}
EXPORT_SYMBOL_GPL(mddev_update_io_opt);

static void mddev_delayed_delete(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, del_work);

	kobject_put(&mddev->kobj);
}

struct mddev *md_alloc(dev_t dev, char *name)
{
	/*
	 * If dev is zero, name is the name of a device to allocate with
	 * an arbitrary minor number. It will be "md_???"
	 * If dev is non-zero it must be a device number with a MAJOR of
	 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then
	 * the device is being created by opening a node in /dev.
	 * If "name" is not NULL, the device is being created by
	 * writing to /sys/module/md_mod/parameters/new_array.
	 */
	static DEFINE_MUTEX(disks_mutex);
	struct mddev *mddev;
	struct gendisk *disk;
	int partitioned;
	int shift;
	int unit;
	int error;

	/*
	 * Wait for any previous instance of this device to be completely
	 * removed (mddev_delayed_delete).
	 */
	flush_workqueue(md_misc_wq);

	mutex_lock(&disks_mutex);
	mddev = mddev_alloc(dev);
	if (IS_ERR(mddev)) {
		error = PTR_ERR(mddev);
		goto out_unlock;
	}

	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
	shift = partitioned ? MdpMinorShift : 0;
	unit = MINOR(mddev->unit) >> shift;

	if (name && !dev) {
		/* Need to ensure that 'name' is not a duplicate.
		 */
		struct mddev *mddev2;
		spin_lock(&all_mddevs_lock);

		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
			if (mddev2->gendisk &&
			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
				spin_unlock(&all_mddevs_lock);
				error = -EEXIST;
				goto out_free_mddev;
			}
		spin_unlock(&all_mddevs_lock);
	}
	if (name && dev)
		/*
		 * Creating /dev/mdNNN via "newarray", so adjust hold_active.
		 */
		mddev->hold_active = UNTIL_STOP;

	disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
	if (IS_ERR(disk)) {
		error = PTR_ERR(disk);
		goto out_free_mddev;
	}

	disk->major = MAJOR(mddev->unit);
	disk->first_minor = unit << shift;
	disk->minors = 1 << shift;
	if (name)
		strcpy(disk->disk_name, name);
	else if (partitioned)
		sprintf(disk->disk_name, "md_d%d", unit);
	else
		sprintf(disk->disk_name, "md%d", unit);
	disk->fops = &md_fops;
	disk->private_data = mddev;

	blk_queue_write_cache(disk->queue, true, true);
	disk->events |= DISK_EVENT_MEDIA_CHANGE;
	mddev->gendisk = disk;
	error = add_disk(disk);
	if (error)
		goto out_put_disk;

	kobject_init(&mddev->kobj, &md_ktype);
	error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
	if (error) {
		/*
		 * The disk is already live at this point. Clear the hold flag
		 * and let mddev_put take care of the deletion, as it isn't any
		 * different from a normal close on last release now.
		 */
		mddev->hold_active = 0;
		mutex_unlock(&disks_mutex);
		mddev_put(mddev);
		return ERR_PTR(error);
	}

	kobject_uevent(&mddev->kobj, KOBJ_ADD);
	mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
	mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
	mutex_unlock(&disks_mutex);
	return mddev;

out_put_disk:
	put_disk(disk);
out_free_mddev:
	mddev_free(mddev);
out_unlock:
	mutex_unlock(&disks_mutex);
	return ERR_PTR(error);
}

static int md_alloc_and_put(dev_t dev, char *name)
{
	struct mddev *mddev = md_alloc(dev, name);

	if (IS_ERR(mddev))
		return PTR_ERR(mddev);
	mddev_put(mddev);
	return 0;
}

static void md_probe(dev_t dev)
{
	if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
		return;
	if (create_on_open)
		md_alloc_and_put(dev, NULL);
}

static int add_named_array(const char *val, const struct kernel_param *kp)
{
	/*
	 * val must be "md_*" or "mdNNN".
	 * For "md_*" we allocate an array with a large free minor number, and
	 * set the name to val. val must not already be an active name.
	 * For "mdNNN" we allocate an array with the minor number NNN
	 * which must not already be in use.
	 */
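	/* e.g. "echo md_test > /sys/module/md_mod/parameters/new_array",
	 * where "md_test" is an arbitrary example name.
	 */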
	int len = strlen(val);
	char buf[DISK_NAME_LEN];
	unsigned long devnum;

	while (len && val[len-1] == '\n')
		len--;
	if (len >= DISK_NAME_LEN)
		return -E2BIG;
	strscpy(buf, val, len+1);
	if (strncmp(buf, "md_", 3) == 0)
		return md_alloc_and_put(0, buf);
	if (strncmp(buf, "md", 2) == 0 &&
	    isdigit(buf[2]) &&
	    kstrtoul(buf+2, 10, &devnum) == 0 &&
	    devnum <= MINORMASK)
		return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL);

	return -EINVAL;
}

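/*
 * Safemode timer callback: flag the array for a safemode transition and
 * wake the main thread to do the actual work.
 */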
static void md_safemode_timeout(struct timer_list *t)
{
	struct mddev *mddev = from_timer(mddev, t, safemode_timer);

	mddev->safemode = 1;
	if (mddev->external)
		sysfs_notify_dirent_safe(mddev->sysfs_state);

	md_wakeup_thread(mddev->thread);
}

static int start_dirty_degraded;

int md_run(struct mddev *mddev)
{
	int err;
	struct md_rdev *rdev;
	struct md_personality *pers;
	bool nowait = true;

	if (list_empty(&mddev->disks))
		/* cannot run an array with no devices.. */
		return -EINVAL;

	if (mddev->pers)
		return -EBUSY;
	/* Cannot run until previous stop completes properly */
	if (mddev->sysfs_active)
		return -EBUSY;

	/*
	 * Analyze all RAID superblock(s)
	 */
	if (!mddev->raid_disks) {
		if (!mddev->persistent)
			return -EINVAL;
		err = analyze_sbs(mddev);
		if (err)
			return -EINVAL;
	}

	if (mddev->level != LEVEL_NONE)
		request_module("md-level-%d", mddev->level);
	else if (mddev->clevel[0])
		request_module("md-%s", mddev->clevel);

	/*
	 * Drop all container device buffers, from now on
	 * the only valid external interface is through the md
	 * device.
	 */
	mddev->has_superblocks = false;
	rdev_for_each(rdev, mddev) {
		if (test_bit(Faulty, &rdev->flags))
			continue;
		sync_blockdev(rdev->bdev);
		invalidate_bdev(rdev->bdev);
		if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
			mddev->ro = MD_RDONLY;
			if (!mddev_is_dm(mddev))
				set_disk_ro(mddev->gendisk, 1);
		}

		if (rdev->sb_page)
			mddev->has_superblocks = true;

		/* perform some consistency tests on the device.
		 * We don't want the data to overlap the metadata.
		 * Internal bitmap issues have been handled elsewhere.
		 */
		if (rdev->meta_bdev) {
			/* Nothing to check */;
		} else if (rdev->data_offset < rdev->sb_start) {
			if (mddev->dev_sectors &&
			    rdev->data_offset + mddev->dev_sectors
			    > rdev->sb_start) {
				pr_warn("md: %s: data overlaps metadata\n",
					mdname(mddev));
				return -EINVAL;
			}
		} else {
			if (rdev->sb_start + rdev->sb_size/512
			    > rdev->data_offset) {
				pr_warn("md: %s: metadata overlaps data\n",
					mdname(mddev));
				return -EINVAL;
			}
		}
		sysfs_notify_dirent_safe(rdev->sysfs_state);
		nowait = nowait && bdev_nowait(rdev->bdev);
	}

	if (!bioset_initialized(&mddev->bio_set)) {
		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
		if (err)
			return err;
	}
	if (!bioset_initialized(&mddev->sync_set)) {
		err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
		if (err)
			goto exit_bio_set;
	}

	if (!bioset_initialized(&mddev->io_clone_set)) {
		err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
				  offsetof(struct md_io_clone, bio_clone), 0);
		if (err)
			goto exit_sync_set;
	}

	spin_lock(&pers_lock);
	pers = find_pers(mddev->level, mddev->clevel);
	if (!pers || !try_module_get(pers->owner)) {
		spin_unlock(&pers_lock);
		if (mddev->level != LEVEL_NONE)
			pr_warn("md: personality for level %d is not loaded!\n",
				mddev->level);
		else
			pr_warn("md: personality for level %s is not loaded!\n",
				mddev->clevel);
		err = -EINVAL;
		goto abort;
	}
	spin_unlock(&pers_lock);
	if (mddev->level != pers->level) {
		mddev->level = pers->level;
		mddev->new_level = pers->level;
	}
	strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));

	if (mddev->reshape_position != MaxSector &&
	    pers->start_reshape == NULL) {
		/* This personality cannot handle reshaping... */
		module_put(pers->owner);
		err = -EINVAL;
		goto abort;
	}

	if (pers->sync_request) {
		/* Warn if this is a potentially silly
		 * configuration.
		 */
		struct md_rdev *rdev2;
		int warned = 0;

		rdev_for_each(rdev, mddev)
			rdev_for_each(rdev2, mddev) {
				if (rdev < rdev2 &&
				    rdev->bdev->bd_disk ==
				    rdev2->bdev->bd_disk) {
					pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n",
						mdname(mddev),
						rdev->bdev,
						rdev2->bdev);
					warned = 1;
				}
			}

		if (warned)
			pr_warn("True protection against single-disk failure might be compromised.\n");
	}

	/* dm-raid expects sync_thread to be frozen until resume */
	if (mddev->gendisk)
		mddev->recovery = 0;

	/* may be over-ridden by personality */
	mddev->resync_max_sectors = mddev->dev_sectors;

	mddev->ok_start_degraded = start_dirty_degraded;

	if (start_readonly && md_is_rdwr(mddev))
		mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */

	err = pers->run(mddev);
	if (err)
		pr_warn("md: pers->run() failed ...\n");
	else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
		WARN_ONCE(!mddev->external_size,
			  "%s: default size too small, but 'external_size' not in effect?\n",
			  __func__);
		pr_warn("md: invalid array_size %llu > default size %llu\n",
			(unsigned long long)mddev->array_sectors / 2,
			(unsigned long long)pers->size(mddev, 0, 0) / 2);
		err = -EINVAL;
	}
	if (err == 0 && pers->sync_request &&
	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
		struct bitmap *bitmap;

		bitmap = md_bitmap_create(mddev, -1);
		if (IS_ERR(bitmap)) {
			err = PTR_ERR(bitmap);
			pr_warn("%s: failed to create bitmap (%d)\n",
				mdname(mddev), err);
		} else
			mddev->bitmap = bitmap;

	}
	if (err)
		goto bitmap_abort;

	if (mddev->bitmap_info.max_write_behind > 0) {
		bool create_pool = false;

		rdev_for_each(rdev, mddev) {
			if (test_bit(WriteMostly, &rdev->flags) &&
			    rdev_init_serial(rdev))
				create_pool = true;
		}
		if (create_pool && mddev->serial_info_pool == NULL) {
			mddev->serial_info_pool =
				mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
						sizeof(struct serial_info));
			if (!mddev->serial_info_pool) {
				err = -ENOMEM;
				goto bitmap_abort;
			}
		}
	}

	if (!mddev_is_dm(mddev)) {
		struct request_queue *q = mddev->gendisk->queue;
		bool nonrot = true;

		rdev_for_each(rdev, mddev) {
			if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) {
				nonrot = false;
				break;
			}
		}
		if (mddev->degraded)
			nonrot = false;
		if (nonrot)
			blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
		else
			blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
		blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);

		/* Set the NOWAIT flags if all underlying devices support it */
		if (nowait)
			blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
	}
	if (pers->sync_request) {
		if (mddev->kobj.sd &&
		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
			pr_warn("md: cannot register extra attributes for %s\n",
				mdname(mddev));
		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
		mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
		mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
	} else if (mddev->ro == MD_AUTO_READ)
		mddev->ro = MD_RDWR;

	atomic_set(&mddev->max_corr_read_errors,
		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
	mddev->safemode = 0;
	if (mddev_is_clustered(mddev))
		mddev->safemode_delay = 0;
	else
		mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
	mddev->in_sync = 1;
	smp_wmb();
	spin_lock(&mddev->lock);
	mddev->pers = pers;
	spin_unlock(&mddev->lock);
	rdev_for_each(rdev, mddev)
		if (rdev->raid_disk >= 0)
			sysfs_link_rdev(mddev, rdev); /* failure here is OK */

	if (mddev->degraded && md_is_rdwr(mddev))
		/* This ensures that recovering status is reported immediately
		 * via sysfs - until a lack of spares is confirmed.
		 */
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);

	if (mddev->sb_flags)
		md_update_sb(mddev, 0);

	md_new_event();
	return 0;

bitmap_abort:
	mddev_detach(mddev);
	if (mddev->private)
		pers->free(mddev, mddev->private);
	mddev->private = NULL;
	module_put(pers->owner);
	md_bitmap_destroy(mddev);
abort:
	bioset_exit(&mddev->io_clone_set);
exit_sync_set:
	bioset_exit(&mddev->sync_set);
exit_bio_set:
	bioset_exit(&mddev->bio_set);
	return err;
}
EXPORT_SYMBOL_GPL(md_run);

int do_md_run(struct mddev *mddev)
{
	int err;

	set_bit(MD_NOT_READY, &mddev->flags);
	err = md_run(mddev);
	if (err)
		goto out;
	err = md_bitmap_load(mddev);
	if (err) {
		md_bitmap_destroy(mddev);
		goto out;
	}

	if (mddev_is_clustered(mddev))
		md_allow_write(mddev);

	/* run start up tasks that require md_thread */
	md_start(mddev);

	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */

	set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
	clear_bit(MD_NOT_READY, &mddev->flags);
	mddev->changed = 1;
	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
	sysfs_notify_dirent_safe(mddev->sysfs_state);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	sysfs_notify_dirent_safe(mddev->sysfs_degraded);
out:
	clear_bit(MD_NOT_READY, &mddev->flags);
	return err;
}

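/*
 * Run the personality's ->start() method, if any, with MD_RECOVERY_WAIT
 * set so resync is held off until the start-up work completes.
 */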
int md_start(struct mddev *mddev)
{
	int ret = 0;

	if (mddev->pers->start) {
		set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
		ret = mddev->pers->start(mddev);
		clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
		md_wakeup_thread(mddev->sync_thread);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(md_start);

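/* Transition a read-only or auto-read array back to read-write. */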
static int restart_array(struct mddev *mddev)
{
	struct gendisk *disk = mddev->gendisk;
	struct md_rdev *rdev;
	bool has_journal = false;
	bool has_readonly = false;

	/* Complain if it has no devices */
	if (list_empty(&mddev->disks))
		return -ENXIO;
	if (!mddev->pers)
		return -EINVAL;
	if (md_is_rdwr(mddev))
		return -EBUSY;

	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
		if (test_bit(Journal, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			has_journal = true;
		if (rdev_read_only(rdev))
			has_readonly = true;
	}
	rcu_read_unlock();
	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
		/* Don't restart rw with journal missing/faulty */
		return -EINVAL;
	if (has_readonly)
		return -EROFS;

	mddev->safemode = 0;
	mddev->ro = MD_RDWR;
	set_disk_ro(disk, 0);
	pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
	/* Kick recovery or resync if necessary */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->sync_thread);
	sysfs_notify_dirent_safe(mddev->sysfs_state);
	return 0;
}

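/* Reset (nearly) all mddev fields to their initial values. */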
static void md_clean(struct mddev *mddev)
{
	mddev->array_sectors = 0;
	mddev->external_size = 0;
	mddev->dev_sectors = 0;
	mddev->raid_disks = 0;
	mddev->recovery_cp = 0;
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->reshape_position = MaxSector;
	/* we still need mddev->external in export_rdev, do not clear it yet */
	mddev->persistent = 0;
	mddev->level = LEVEL_NONE;
	mddev->clevel[0] = 0;
	/*
	 * Don't clear MD_CLOSING, or mddev can be opened again.
	 * 'hold_active != 0' means mddev is still in the creation
	 * process and will be used later.
	 */
	if (mddev->hold_active)
		mddev->flags = 0;
	else
		mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
	mddev->sb_flags = 0;
	mddev->ro = MD_RDWR;
	mddev->metadata_type[0] = 0;
	mddev->chunk_sectors = 0;
	mddev->ctime = mddev->utime = 0;
	mddev->layout = 0;
	mddev->max_disks = 0;
	mddev->events = 0;
	mddev->can_decrease_events = 0;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
	mddev->new_level = LEVEL_NONE;
	mddev->new_layout = 0;
	mddev->new_chunk_sectors = 0;
	mddev->curr_resync = MD_RESYNC_NONE;
	atomic64_set(&mddev->resync_mismatches, 0);
	mddev->suspend_lo = mddev->suspend_hi = 0;
	mddev->sync_speed_min = mddev->sync_speed_max = 0;
	mddev->recovery = 0;
	mddev->in_sync = 0;
	mddev->changed = 0;
	mddev->degraded = 0;
	mddev->safemode = 0;
	mddev->private = NULL;
	mddev->cluster_info = NULL;
	mddev->bitmap_info.offset = 0;
	mddev->bitmap_info.default_offset = 0;
	mddev->bitmap_info.default_space = 0;
	mddev->bitmap_info.chunksize = 0;
	mddev->bitmap_info.daemon_sleep = 0;
	mddev->bitmap_info.max_write_behind = 0;
	mddev->bitmap_info.nodes = 0;
}

static void __md_stop_writes(struct mddev *mddev)
{
	del_timer_sync(&mddev->safemode_timer);

	if (mddev->pers && mddev->pers->quiesce) {
		mddev->pers->quiesce(mddev, 1);
		mddev->pers->quiesce(mddev, 0);
	}
	md_bitmap_flush(mddev);

	if (md_is_rdwr(mddev) &&
	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
	     mddev->sb_flags)) {
		/* mark array as shutdown cleanly */
		if (!mddev_is_clustered(mddev))
			mddev->in_sync = 1;
		md_update_sb(mddev, 1);
	}
	/* disable policy to guarantee rdevs free resources for serialization */
	mddev->serialize_policy = 0;
	mddev_destroy_serial_pool(mddev, NULL);
}

void md_stop_writes(struct mddev *mddev)
{
	mddev_lock_nointr(mddev);
	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	stop_sync_thread(mddev, true, false);
	__md_stop_writes(mddev);
	mddev_unlock(mddev);
}
EXPORT_SYMBOL_GPL(md_stop_writes);

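/* Quiesce the personality and stop the main md thread. */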
static void mddev_detach(struct mddev *mddev)
{
	md_bitmap_wait_behind_writes(mddev);
	if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
		mddev->pers->quiesce(mddev, 1);
		mddev->pers->quiesce(mddev, 0);
	}
	md_unregister_thread(mddev, &mddev->thread);

	/* the unplug fn references 'conf' */
	if (!mddev_is_dm(mddev))
		blk_sync_queue(mddev->gendisk->queue);
}

static void __md_stop(struct mddev *mddev)
{
	struct md_personality *pers = mddev->pers;
	md_bitmap_destroy(mddev);
	mddev_detach(mddev);
	spin_lock(&mddev->lock);
	mddev->pers = NULL;
	spin_unlock(&mddev->lock);
	if (mddev->private)
		pers->free(mddev, mddev->private);
	mddev->private = NULL;
	if (pers->sync_request && mddev->to_remove == NULL)
		mddev->to_remove = &md_redundancy_group;
	module_put(pers->owner);
	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);

	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);
	bioset_exit(&mddev->io_clone_set);
}

void md_stop(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);

	/* stop the array and free all attached data structures.
	 * This is called from dm-raid
	 */
	__md_stop_writes(mddev);
	__md_stop(mddev);
}
EXPORT_SYMBOL_GPL(md_stop);
6493 | |
6494 | /* ensure 'mddev->pers' exist before calling md_set_readonly() */ |
static int md_set_readonly(struct mddev *mddev)
{
	int err = 0;
	int did_freeze = 0;

	if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
		return -EBUSY;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	}

	stop_sync_thread(mddev, false, false);
	wait_event(mddev->sb_wait,
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
	mddev_lock_nointr(mddev);

	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
		pr_warn("md: %s still in use.\n", mdname(mddev));
		err = -EBUSY;
		goto out;
	}

	__md_stop_writes(mddev);

	if (mddev->ro == MD_RDONLY) {
		err = -ENXIO;
		goto out;
	}

	mddev->ro = MD_RDONLY;
	set_disk_ro(mddev->gendisk, 1);

out:
	if (!err || did_freeze) {
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	}

	return err;
}

/* mode:
 *   0 - completely stop and disassemble array
 *   2 - stop but do not disassemble array
 */
static int do_md_stop(struct mddev *mddev, int mode)
{
	struct gendisk *disk = mddev->gendisk;
	struct md_rdev *rdev;
	int did_freeze = 0;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	}

	stop_sync_thread(mddev, true, false);

	if (mddev->sysfs_active ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
		pr_warn("md: %s still in use.\n", mdname(mddev));
		if (did_freeze) {
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		}
		return -EBUSY;
	}
	if (mddev->pers) {
		if (!md_is_rdwr(mddev))
			set_disk_ro(disk, 0);

		__md_stop_writes(mddev);
		__md_stop(mddev);

		/* tell userspace to handle 'inactive' */
		sysfs_notify_dirent_safe(mddev->sysfs_state);

		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk >= 0)
				sysfs_unlink_rdev(mddev, rdev);

		set_capacity_and_notify(disk, 0);
		mddev->changed = 1;

		if (!md_is_rdwr(mddev))
			mddev->ro = MD_RDWR;
	}
	/*
	 * Free resources if final stop
	 */
	if (mode == 0) {
		pr_info("md: %s stopped.\n", mdname(mddev));

		if (mddev->bitmap_info.file) {
			struct file *f = mddev->bitmap_info.file;
			spin_lock(&mddev->lock);
			mddev->bitmap_info.file = NULL;
			spin_unlock(&mddev->lock);
			fput(f);
		}
		mddev->bitmap_info.offset = 0;

		export_array(mddev);

		md_clean(mddev);
		if (mddev->hold_active == UNTIL_STOP)
			mddev->hold_active = 0;
	}
	md_new_event();
	sysfs_notify_dirent_safe(mddev->sysfs_state);
	return 0;
}

#ifndef MODULE
static void autorun_array(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int err;

	if (list_empty(&mddev->disks))
		return;

	pr_info("md: running: ");

	rdev_for_each(rdev, mddev) {
		pr_cont("<%pg>", rdev->bdev);
	}
	pr_cont("\n");

	err = do_md_run(mddev);
	if (err) {
		pr_warn("md: do_md_run() returned %d\n", err);
		do_md_stop(mddev, 0);
	}
}

/*
 * Let's try to run arrays based on all disks that have arrived
 * until now. (those are in pending_raid_disks)
 *
 * the method: pick the first pending disk, collect all disks with
 * the same UUID, remove all from the pending list and put them into
 * the 'same_array' list. Then order this list based on superblock
 * update time (freshest comes first), kick out 'old' disks and
 * compare superblocks. If everything's fine then run it.
 *
 * If "unit" is allocated, then bump its reference count
 */
static void autorun_devices(int part)
{
	struct md_rdev *rdev0, *rdev, *tmp;
	struct mddev *mddev;

	pr_info("md: autorun ...\n");
	while (!list_empty(&pending_raid_disks)) {
		int unit;
		dev_t dev;
		LIST_HEAD(candidates);
		rdev0 = list_entry(pending_raid_disks.next,
				   struct md_rdev, same_set);

		pr_debug("md: considering %pg ...\n", rdev0->bdev);
		INIT_LIST_HEAD(&candidates);
		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
			if (super_90_load(rdev, rdev0, 0) >= 0) {
				pr_debug("md: adding %pg ...\n",
					 rdev->bdev);
				list_move(&rdev->same_set, &candidates);
			}
		/*
		 * now we have a set of devices, with all of them having
		 * mostly sane superblocks. It's time to allocate the
		 * mddev.
		 */
		if (part) {
			dev = MKDEV(mdp_major,
				    rdev0->preferred_minor << MdpMinorShift);
			unit = MINOR(dev) >> MdpMinorShift;
		} else {
			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
			unit = MINOR(dev);
		}
		if (rdev0->preferred_minor != unit) {
			pr_warn("md: unit number in %pg is bad: %d\n",
				rdev0->bdev, rdev0->preferred_minor);
			break;
		}

		mddev = md_alloc(dev, NULL);
		if (IS_ERR(mddev))
			break;

		if (mddev_suspend_and_lock(mddev))
			pr_warn("md: %s locked, cannot run\n", mdname(mddev));
		else if (mddev->raid_disks || mddev->major_version
			 || !list_empty(&mddev->disks)) {
			pr_warn("md: %s already running, cannot run %pg\n",
				mdname(mddev), rdev0->bdev);
			mddev_unlock_and_resume(mddev);
		} else {
			pr_debug("md: created %s\n", mdname(mddev));
			mddev->persistent = 1;
			rdev_for_each_list(rdev, tmp, &candidates) {
				list_del_init(&rdev->same_set);
				if (bind_rdev_to_array(rdev, mddev))
					export_rdev(rdev, mddev);
			}
			autorun_array(mddev);
			mddev_unlock_and_resume(mddev);
		}
		/* on success, candidates will be empty, on error
		 * it won't...
		 */
		rdev_for_each_list(rdev, tmp, &candidates) {
			list_del_init(&rdev->same_set);
			export_rdev(rdev, mddev);
		}
		mddev_put(mddev);
	}
	pr_info("md: ... autorun DONE.\n");
}
#endif /* !MODULE */

static int get_version(void __user *arg)
{
	mdu_version_t ver;

	ver.major = MD_MAJOR_VERSION;
	ver.minor = MD_MINOR_VERSION;
	ver.patchlevel = MD_PATCHLEVEL_VERSION;

	if (copy_to_user(arg, &ver, sizeof(ver)))
		return -EFAULT;

	return 0;
}
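
/*
 * Illustration (not part of the driver): RAID_VERSION is the simplest of
 * the md ioctls, and user space exercises the path above roughly as in
 * the hedged sketch below. This is a plausible stand-alone program, not
 * a quote from mdadm; the device path is made up.
 *
 *	#include <stdio.h>
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/raid/md_u.h>
 *
 *	int main(void)
 *	{
 *		mdu_version_t ver;
 *		int fd = open("/dev/md0", O_RDONLY);
 *
 *		if (fd < 0 || ioctl(fd, RAID_VERSION, &ver) < 0)
 *			return 1;
 *		printf("md driver %d.%d.%d\n",
 *		       ver.major, ver.minor, ver.patchlevel);
 *		return 0;
 *	}
 */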

static int get_array_info(struct mddev *mddev, void __user *arg)
{
	mdu_array_info_t info;
	int nr, working, insync, failed, spare;
	struct md_rdev *rdev;

	nr = working = insync = failed = spare = 0;
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
		nr++;
		if (test_bit(Faulty, &rdev->flags))
			failed++;
		else {
			working++;
			if (test_bit(In_sync, &rdev->flags))
				insync++;
			else if (test_bit(Journal, &rdev->flags))
				/* TODO: add journal count to md_u.h */
				;
			else
				spare++;
		}
	}
	rcu_read_unlock();

	info.major_version = mddev->major_version;
	info.minor_version = mddev->minor_version;
	info.patch_version = MD_PATCHLEVEL_VERSION;
	info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
	info.level = mddev->level;
	info.size = mddev->dev_sectors / 2;
	if (info.size != mddev->dev_sectors / 2) /* overflow */
		info.size = -1;
	info.nr_disks = nr;
	info.raid_disks = mddev->raid_disks;
	info.md_minor = mddev->md_minor;
	info.not_persistent = !mddev->persistent;

	info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
	info.state = 0;
	if (mddev->in_sync)
		info.state = (1<<MD_SB_CLEAN);
	if (mddev->bitmap && mddev->bitmap_info.offset)
		info.state |= (1<<MD_SB_BITMAP_PRESENT);
	if (mddev_is_clustered(mddev))
		info.state |= (1<<MD_SB_CLUSTERED);
	info.active_disks = insync;
	info.working_disks = working;
	info.failed_disks = failed;
	info.spare_disks = spare;

	info.layout = mddev->layout;
	info.chunk_size = mddev->chunk_sectors << 9;

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}

static int get_bitmap_file(struct mddev *mddev, void __user *arg)
{
	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
	char *ptr;
	int err;

	file = kzalloc(sizeof(*file), GFP_NOIO);
	if (!file)
		return -ENOMEM;

	err = 0;
	spin_lock(&mddev->lock);
	/* bitmap enabled */
	if (mddev->bitmap_info.file) {
		ptr = file_path(mddev->bitmap_info.file, file->pathname,
				sizeof(file->pathname));
		if (IS_ERR(ptr))
			err = PTR_ERR(ptr);
		else
			memmove(file->pathname, ptr,
				sizeof(file->pathname) - (ptr - file->pathname));
	}
	spin_unlock(&mddev->lock);

	if (err == 0 &&
	    copy_to_user(arg, file, sizeof(*file)))
		err = -EFAULT;

	kfree(file);
	return err;
}

static int get_disk_info(struct mddev *mddev, void __user *arg)
{
	mdu_disk_info_t info;
	struct md_rdev *rdev;

	if (copy_from_user(&info, arg, sizeof(info)))
		return -EFAULT;

	rcu_read_lock();
	rdev = md_find_rdev_nr_rcu(mddev, info.number);
	if (rdev) {
		info.major = MAJOR(rdev->bdev->bd_dev);
		info.minor = MINOR(rdev->bdev->bd_dev);
		info.raid_disk = rdev->raid_disk;
		info.state = 0;
		if (test_bit(Faulty, &rdev->flags))
			info.state |= (1<<MD_DISK_FAULTY);
		else if (test_bit(In_sync, &rdev->flags)) {
			info.state |= (1<<MD_DISK_ACTIVE);
			info.state |= (1<<MD_DISK_SYNC);
		}
		if (test_bit(Journal, &rdev->flags))
			info.state |= (1<<MD_DISK_JOURNAL);
		if (test_bit(WriteMostly, &rdev->flags))
			info.state |= (1<<MD_DISK_WRITEMOSTLY);
		if (test_bit(FailFast, &rdev->flags))
			info.state |= (1<<MD_DISK_FAILFAST);
	} else {
		info.major = info.minor = 0;
		info.raid_disk = -1;
		info.state = (1<<MD_DISK_REMOVED);
	}
	rcu_read_unlock();

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}

int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
{
	struct md_rdev *rdev;
	dev_t dev = MKDEV(info->major, info->minor);

	if (mddev_is_clustered(mddev) &&
	    !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
		pr_warn("%s: Cannot add to clustered mddev.\n",
			mdname(mddev));
		return -EINVAL;
	}

	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
		return -EOVERFLOW;

	if (!mddev->raid_disks) {
		int err;
		/* expecting a device which has a superblock */
		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
		if (IS_ERR(rdev)) {
			pr_warn("md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		if (!list_empty(&mddev->disks)) {
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0) {
				pr_warn("md: %pg has different UUID to %pg\n",
					rdev->bdev,
					rdev0->bdev);
				export_rdev(rdev, mddev);
				return -EINVAL;
			}
		}
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev, mddev);
		return err;
	}

	/*
	 * md_add_new_disk can be used once the array is assembled
	 * to add "hot spares". They must already have a superblock
	 * written
	 */
	if (mddev->pers) {
		int err;
		if (!mddev->pers->hot_add_disk) {
			pr_warn("%s: personality does not support diskops!\n",
				mdname(mddev));
			return -EINVAL;
		}
		if (mddev->persistent)
			rdev = md_import_device(dev, mddev->major_version,
						mddev->minor_version);
		else
			rdev = md_import_device(dev, -1, -1);
		if (IS_ERR(rdev)) {
			pr_warn("md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		/* set saved_raid_disk if appropriate */
		if (!mddev->persistent) {
			if (info->state & (1<<MD_DISK_SYNC) &&
			    info->raid_disk < mddev->raid_disks) {
				rdev->raid_disk = info->raid_disk;
				clear_bit(Bitmap_sync, &rdev->flags);
			} else
				rdev->raid_disk = -1;
			rdev->saved_raid_disk = rdev->raid_disk;
		} else
			super_types[mddev->major_version].
				validate_super(mddev, NULL/*freshest*/, rdev);
		if ((info->state & (1<<MD_DISK_SYNC)) &&
		    rdev->raid_disk != info->raid_disk) {
			/* This was a hot-add request, but the events don't
			 * match, so reject it.
			 */
			export_rdev(rdev, mddev);
			return -EINVAL;
		}

		clear_bit(In_sync, &rdev->flags); /* just to be sure */
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		else
			clear_bit(WriteMostly, &rdev->flags);
		if (info->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
		else
			clear_bit(FailFast, &rdev->flags);

		if (info->state & (1<<MD_DISK_JOURNAL)) {
			struct md_rdev *rdev2;
			bool has_journal = false;

			/* make sure no existing journal disk */
			rdev_for_each(rdev2, mddev) {
				if (test_bit(Journal, &rdev2->flags)) {
					has_journal = true;
					break;
				}
			}
			if (has_journal || mddev->bitmap) {
				export_rdev(rdev, mddev);
				return -EBUSY;
			}
			set_bit(Journal, &rdev->flags);
		}
		/*
		 * check whether the device shows up in other nodes
		 */
		if (mddev_is_clustered(mddev)) {
			if (info->state & (1 << MD_DISK_CANDIDATE))
				set_bit(Candidate, &rdev->flags);
			else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
				/* --add initiated by this node */
				err = md_cluster_ops->add_new_disk(mddev, rdev);
				if (err) {
					export_rdev(rdev, mddev);
					return err;
				}
			}
		}

		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);

		if (err)
			export_rdev(rdev, mddev);

		if (mddev_is_clustered(mddev)) {
			if (info->state & (1 << MD_DISK_CANDIDATE)) {
				if (!err) {
					err = md_cluster_ops->new_disk_ack(mddev,
									   err == 0);
					if (err)
						md_kick_rdev_from_array(rdev);
				}
			} else {
				if (err)
					md_cluster_ops->add_new_disk_cancel(mddev);
				else
					err = add_bound_rdev(rdev);
			}

		} else if (!err)
			err = add_bound_rdev(rdev);

		return err;
	}

	/* otherwise, md_add_new_disk is only allowed
	 * for major_version==0 superblocks
	 */
	if (mddev->major_version != 0) {
		pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
		return -EINVAL;
	}

	if (!(info->state & (1<<MD_DISK_FAULTY))) {
		int err;
		rdev = md_import_device(dev, -1, 0);
		if (IS_ERR(rdev)) {
			pr_warn("md: error, md_import_device() returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->desc_nr = info->number;
		if (info->raid_disk < mddev->raid_disks)
			rdev->raid_disk = info->raid_disk;
		else
			rdev->raid_disk = -1;

		if (rdev->raid_disk < mddev->raid_disks)
			if (info->state & (1<<MD_DISK_SYNC))
				set_bit(In_sync, &rdev->flags);

		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		if (info->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);

		if (!mddev->persistent) {
			pr_debug("md: nonpersistent superblock ...\n");
			rdev->sb_start = bdev_nr_sectors(rdev->bdev);
		} else
			rdev->sb_start = calc_dev_sboffset(rdev);
		rdev->sectors = rdev->sb_start;

		err = bind_rdev_to_array(rdev, mddev);
		if (err) {
			export_rdev(rdev, mddev);
			return err;
		}
	}

	return 0;
}
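
/*
 * Illustration (not part of the driver): the hot-spare branch above is
 * driven from user space with an mdu_disk_info_t. A hedged sketch of
 * adding a spare to an assembled array; the helper name and device
 * numbers are made up, this is not a quote from mdadm.
 *
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/sysmacros.h>
 *	#include <linux/raid/md_u.h>
 *
 *	static int add_spare(int md_fd, dev_t dev)
 *	{
 *		mdu_disk_info_t dinfo;
 *
 *		memset(&dinfo, 0, sizeof(dinfo));
 *		dinfo.major = major(dev);
 *		dinfo.minor = minor(dev);
 *		dinfo.raid_disk = -1;	// spare: no slot, no MD_DISK_SYNC
 *		return ioctl(md_fd, ADD_NEW_DISK, &dinfo);
 *	}
 */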

static int hot_remove_disk(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	if (!mddev->pers)
		return -ENODEV;

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENXIO;

	if (rdev->raid_disk < 0)
		goto kick_rdev;

	clear_bit(Blocked, &rdev->flags);
	remove_and_add_spares(mddev, rdev);

	if (rdev->raid_disk >= 0)
		goto busy;

kick_rdev:
	if (mddev_is_clustered(mddev)) {
		if (md_cluster_ops->remove_disk(mddev, rdev))
			goto busy;
	}

	md_kick_rdev_from_array(rdev);
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	if (!mddev->thread)
		md_update_sb(mddev, 1);
	md_new_event();

	return 0;
busy:
	pr_debug("md: cannot remove active disk %pg from %s ...\n",
		 rdev->bdev, mdname(mddev));
	return -EBUSY;
}

static int hot_add_disk(struct mddev *mddev, dev_t dev)
{
	int err;
	struct md_rdev *rdev;

	if (!mddev->pers)
		return -ENODEV;

	if (mddev->major_version != 0) {
		pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
			mdname(mddev));
		return -EINVAL;
	}
	if (!mddev->pers->hot_add_disk) {
		pr_warn("%s: personality does not support diskops!\n",
			mdname(mddev));
		return -EINVAL;
	}

	rdev = md_import_device(dev, -1, 0);
	if (IS_ERR(rdev)) {
		pr_warn("md: error, md_import_device() returned %ld\n",
			PTR_ERR(rdev));
		return -EINVAL;
	}

	if (mddev->persistent)
		rdev->sb_start = calc_dev_sboffset(rdev);
	else
		rdev->sb_start = bdev_nr_sectors(rdev->bdev);

	rdev->sectors = rdev->sb_start;

	if (test_bit(Faulty, &rdev->flags)) {
		pr_warn("md: can not hot-add faulty %pg disk to %s!\n",
			rdev->bdev, mdname(mddev));
		err = -EINVAL;
		goto abort_export;
	}

	clear_bit(In_sync, &rdev->flags);
	rdev->desc_nr = -1;
	rdev->saved_raid_disk = -1;
	err = bind_rdev_to_array(rdev, mddev);
	if (err)
		goto abort_export;

	/*
	 * The rest had better be atomic: disk failures can be
	 * noticed in interrupt contexts ...
	 */

	rdev->raid_disk = -1;

	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	if (!mddev->thread)
		md_update_sb(mddev, 1);
	/*
	 * If the new disk does not support REQ_NOWAIT,
	 * disable on the whole MD.
	 */
	if (!bdev_nowait(rdev->bdev)) {
		pr_info("%s: Disabling nowait because %pg does not support nowait\n",
			mdname(mddev), rdev->bdev);
		blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue);
	}
	/*
	 * Kick recovery, maybe this spare has to be added to the
	 * array immediately.
	 */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_new_event();
	return 0;

abort_export:
	export_rdev(rdev, mddev);
	return err;
}
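
/*
 * Illustration (not part of the driver): unlike ADD_NEW_DISK, the
 * HOT_ADD_DISK ioctl passes the device number itself as the argument
 * (decoded with new_decode_dev() in md_ioctl()), not a pointer. A
 * hedged user-space sketch, with a made-up helper name:
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/stat.h>
 *	#include <linux/raid/md_u.h>
 *
 *	static int hot_add(int md_fd, const char *path)
 *	{
 *		struct stat st;
 *
 *		if (stat(path, &st) < 0)
 *			return -1;
 *		return ioctl(md_fd, HOT_ADD_DISK, (unsigned long)st.st_rdev);
 *	}
 */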

static int set_bitmap_file(struct mddev *mddev, int fd)
{
	int err = 0;

	if (mddev->pers) {
		if (!mddev->pers->quiesce || !mddev->thread)
			return -EBUSY;
		if (mddev->recovery || mddev->sync_thread)
			return -EBUSY;
		/* we should be able to change the bitmap.. */
	}

	if (fd >= 0) {
		struct inode *inode;
		struct file *f;

		if (mddev->bitmap || mddev->bitmap_info.file)
			return -EEXIST; /* cannot add when bitmap is present */

		if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
			pr_warn("%s: bitmap files not supported by this kernel\n",
				mdname(mddev));
			return -EINVAL;
		}
		pr_warn("%s: using deprecated bitmap file support\n",
			mdname(mddev));

		f = fget(fd);

		if (f == NULL) {
			pr_warn("%s: error: failed to get bitmap file\n",
				mdname(mddev));
			return -EBADF;
		}

		inode = f->f_mapping->host;
		if (!S_ISREG(inode->i_mode)) {
			pr_warn("%s: error: bitmap file must be a regular file\n",
				mdname(mddev));
			err = -EBADF;
		} else if (!(f->f_mode & FMODE_WRITE)) {
			pr_warn("%s: error: bitmap file must open for write\n",
				mdname(mddev));
			err = -EBADF;
		} else if (atomic_read(&inode->i_writecount) != 1) {
			pr_warn("%s: error: bitmap file is already in use\n",
				mdname(mddev));
			err = -EBUSY;
		}
		if (err) {
			fput(f);
			return err;
		}
		mddev->bitmap_info.file = f;
		mddev->bitmap_info.offset = 0; /* file overrides offset */
	} else if (mddev->bitmap == NULL)
		return -ENOENT; /* cannot remove what isn't there */
	err = 0;
	if (mddev->pers) {
		if (fd >= 0) {
			struct bitmap *bitmap;

			bitmap = md_bitmap_create(mddev, -1);
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
				err = md_bitmap_load(mddev);
			} else
				err = PTR_ERR(bitmap);
			if (err) {
				md_bitmap_destroy(mddev);
				fd = -1;
			}
		} else if (fd < 0) {
			md_bitmap_destroy(mddev);
		}
	}
	if (fd < 0) {
		struct file *f = mddev->bitmap_info.file;
		if (f) {
			spin_lock(&mddev->lock);
			mddev->bitmap_info.file = NULL;
			spin_unlock(&mddev->lock);
			fput(f);
		}
	}

	return err;
}

/*
 * md_set_array_info is used in two different ways.
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it together with
 * level, size, not_persistent, layout and chunksize determines the
 * shape of the array.
 * This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 * In this case raid_disks will be 0, and the major_version field is
 * used to determine which style super-blocks are to be found on the devices.
 * The minor and patch _version numbers are also kept in case the
 * super_block handler wishes to interpret them.
 */
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
{
	if (info->raid_disks == 0) {
		/* just setting version number for superblock loading */
		if (info->major_version < 0 ||
		    info->major_version >= ARRAY_SIZE(super_types) ||
		    super_types[info->major_version].name == NULL) {
			/* maybe try to auto-load a module? */
			pr_warn("md: superblock version %d not known\n",
				info->major_version);
			return -EINVAL;
		}
		mddev->major_version = info->major_version;
		mddev->minor_version = info->minor_version;
		mddev->patch_version = info->patch_version;
		mddev->persistent = !info->not_persistent;
		/* ensure mddev_put doesn't delete this now that there
		 * is some minimal configuration.
		 */
		mddev->ctime = ktime_get_real_seconds();
		return 0;
	}
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->ctime = ktime_get_real_seconds();

	mddev->level = info->level;
	mddev->clevel[0] = 0;
	mddev->dev_sectors = 2 * (sector_t)info->size;
	mddev->raid_disks = info->raid_disks;
	/* don't set md_minor, it is determined by which /dev/md* was
	 * opened
	 */
	if (info->state & (1<<MD_SB_CLEAN))
		mddev->recovery_cp = MaxSector;
	else
		mddev->recovery_cp = 0;
	mddev->persistent = !info->not_persistent;
	mddev->external = 0;

	mddev->layout = info->layout;
	if (mddev->level == 0)
		/* Cannot trust RAID0 layout info here */
		mddev->layout = -1;
	mddev->chunk_sectors = info->chunk_size >> 9;

	if (mddev->persistent) {
		mddev->max_disks = MD_SB_DISKS;
		mddev->flags = 0;
		mddev->sb_flags = 0;
	}
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
	mddev->bitmap_info.offset = 0;

	mddev->reshape_position = MaxSector;

	/*
	 * Generate a 128 bit UUID
	 */
	get_random_bytes(mddev->uuid, 16);

	mddev->new_level = mddev->level;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->new_layout = mddev->layout;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;

	return 0;
}
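
/*
 * Illustration (not part of the driver): the two md_set_array_info()
 * usages described above, as they look from the SET_ARRAY_INFO ioctl.
 * A hedged sketch with made-up helper names and values, not a quote
 * from mdadm:
 *
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/raid/md_u.h>
 *
 *	// Assembly: raid_disks == 0, only the superblock version matters.
 *	static int start_assembly(int md_fd)
 *	{
 *		mdu_array_info_t info;
 *
 *		memset(&info, 0, sizeof(info));
 *		info.major_version = 0;	// look for 0.90 superblocks
 *		return ioctl(md_fd, SET_ARRAY_INFO, &info);
 *	}
 *
 *	// Creation: raid_disks > 0, shape fields filled in.
 *	static int create_raid1(int md_fd)
 *	{
 *		mdu_array_info_t info;
 *
 *		memset(&info, 0, sizeof(info));
 *		info.level = 1;
 *		info.raid_disks = 2;
 *		info.size = 1024 * 1024;	// per-device size in KiB
 *		return ioctl(md_fd, SET_ARRAY_INFO, &info);
 *	}
 */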

void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
{
	lockdep_assert_held(&mddev->reconfig_mutex);

	if (mddev->external_size)
		return;

	mddev->array_sectors = array_sectors;
}
EXPORT_SYMBOL(md_set_array_sectors);

static int update_size(struct mddev *mddev, sector_t num_sectors)
{
	struct md_rdev *rdev;
	int rv;
	int fit = (num_sectors == 0);
	sector_t old_dev_sectors = mddev->dev_sectors;

	if (mddev->pers->resize == NULL)
		return -EINVAL;
	/* The "num_sectors" is the number of sectors of each device that
	 * is used. This can only make sense for arrays with redundancy.
	 * linear and raid0 always use whatever space is available. We can only
	 * consider changing this number if no resync or reconstruction is
	 * happening, and if the new size is acceptable. It must fit before the
	 * sb_start or, if that is <data_offset, it must fit before the size
	 * of each device. If num_sectors is zero, we find the largest size
	 * that fits.
	 */
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;
	if (!md_is_rdwr(mddev))
		return -EROFS;

	rdev_for_each(rdev, mddev) {
		sector_t avail = rdev->sectors;

		if (fit && (num_sectors == 0 || num_sectors > avail))
			num_sectors = avail;
		if (avail < num_sectors)
			return -ENOSPC;
	}
	rv = mddev->pers->resize(mddev, num_sectors);
	if (!rv) {
		if (mddev_is_clustered(mddev))
			md_cluster_ops->update_size(mddev, old_dev_sectors);
		else if (!mddev_is_dm(mddev))
			set_capacity_and_notify(mddev->gendisk,
						mddev->array_sectors);
	}
	return rv;
}

static int update_raid_disks(struct mddev *mddev, int raid_disks)
{
	int rv;
	struct md_rdev *rdev;
	/* change the number of raid disks */
	if (mddev->pers->check_reshape == NULL)
		return -EINVAL;
	if (!md_is_rdwr(mddev))
		return -EROFS;
	if (raid_disks <= 0 ||
	    (mddev->max_disks && raid_disks >= mddev->max_disks))
		return -EINVAL;
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
	    mddev->reshape_position != MaxSector)
		return -EBUSY;

	rdev_for_each(rdev, mddev) {
		if (mddev->raid_disks < raid_disks &&
		    rdev->data_offset < rdev->new_data_offset)
			return -EINVAL;
		if (mddev->raid_disks > raid_disks &&
		    rdev->data_offset > rdev->new_data_offset)
			return -EINVAL;
	}

	mddev->delta_disks = raid_disks - mddev->raid_disks;
	if (mddev->delta_disks < 0)
		mddev->reshape_backwards = 1;
	else if (mddev->delta_disks > 0)
		mddev->reshape_backwards = 0;

	rv = mddev->pers->check_reshape(mddev);
	if (rv < 0) {
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
	return rv;
}

/*
 * update_array_info is used to change the configuration of an
 * on-line array.
 * The version, ctime, level, size, raid_disks, not_persistent, layout
 * and chunk_size fields in the info are checked against the array.
 * Any differences that cannot be handled will cause an error.
 * Normally, only one change can be managed at a time.
 */
static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
{
	int rv = 0;
	int cnt = 0;
	int state = 0;

	/* calculate expected state, ignoring low bits */
	if (mddev->bitmap && mddev->bitmap_info.offset)
		state |= (1 << MD_SB_BITMAP_PRESENT);

	if (mddev->major_version != info->major_version ||
	    mddev->minor_version != info->minor_version ||
	    /* mddev->patch_version != info->patch_version || */
	    mddev->ctime != info->ctime ||
	    mddev->level != info->level ||
	    /* mddev->layout != info->layout || */
	    mddev->persistent != !info->not_persistent ||
	    mddev->chunk_sectors != info->chunk_size >> 9 ||
	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
	    ((state^info->state) & 0xfffffe00))
		return -EINVAL;
	/* Check there is only one change */
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		cnt++;
	if (mddev->raid_disks != info->raid_disks)
		cnt++;
	if (mddev->layout != info->layout)
		cnt++;
	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
		cnt++;
	if (cnt == 0)
		return 0;
	if (cnt > 1)
		return -EINVAL;

	if (mddev->layout != info->layout) {
		/* Change layout
		 * we don't need to do anything at the md level, the
		 * personality will take care of it all.
		 */
		if (mddev->pers->check_reshape == NULL)
			return -EINVAL;
		else {
			mddev->new_layout = info->layout;
			rv = mddev->pers->check_reshape(mddev);
			if (rv)
				mddev->new_layout = mddev->layout;
			return rv;
		}
	}
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		rv = update_size(mddev, (sector_t)info->size * 2);

	if (mddev->raid_disks != info->raid_disks)
		rv = update_raid_disks(mddev, info->raid_disks);

	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
			rv = -EINVAL;
			goto err;
		}
		if (mddev->recovery || mddev->sync_thread) {
			rv = -EBUSY;
			goto err;
		}
		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
			struct bitmap *bitmap;
			/* add the bitmap */
			if (mddev->bitmap) {
				rv = -EEXIST;
				goto err;
			}
			if (mddev->bitmap_info.default_offset == 0) {
				rv = -EINVAL;
				goto err;
			}
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
			bitmap = md_bitmap_create(mddev, -1);
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
				rv = md_bitmap_load(mddev);
			} else
				rv = PTR_ERR(bitmap);
			if (rv)
				md_bitmap_destroy(mddev);
		} else {
			/* remove the bitmap */
			if (!mddev->bitmap) {
				rv = -ENOENT;
				goto err;
			}
			if (mddev->bitmap->storage.file) {
				rv = -EINVAL;
				goto err;
			}
			if (mddev->bitmap_info.nodes) {
				/* hold PW on all the bitmap lock */
				if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
					pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
					rv = -EPERM;
					md_cluster_ops->unlock_all_bitmaps(mddev);
					goto err;
				}

				mddev->bitmap_info.nodes = 0;
				md_cluster_ops->leave(mddev);
				module_put(md_cluster_mod);
				mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
			}
			md_bitmap_destroy(mddev);
			mddev->bitmap_info.offset = 0;
		}
	}
	md_update_sb(mddev, 1);
	return rv;
err:
	return rv;
}

static int set_disk_faulty(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;
	int err = 0;

	if (mddev->pers == NULL)
		return -ENODEV;

	rcu_read_lock();
	rdev = md_find_rdev_rcu(mddev, dev);
	if (!rdev)
		err = -ENODEV;
	else {
		md_error(mddev, rdev);
		if (test_bit(MD_BROKEN, &mddev->flags))
			err = -EBUSY;
	}
	rcu_read_unlock();
	return err;
}

/*
 * We have a problem here: there is no easy way to give a CHS
 * virtual geometry. We currently pretend that we have a 2 heads
 * 4 sectors (with a BIG number of cylinders...). This drives
 * dosfs just mad... ;-)
 */
static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mddev *mddev = bdev->bd_disk->private_data;

	geo->heads = 2;
	geo->sectors = 4;
	geo->cylinders = mddev->array_sectors / 8;
	return 0;
}
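
/*
 * Worked example (illustrative): with heads = 2 and sectors = 4, one
 * virtual "cylinder" covers 2 * 4 = 8 sectors, which is why cylinders
 * is array_sectors / 8. For a 1 TiB array (2147483648 512-byte
 * sectors) the reported geometry is 2/4/268435456.
 */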

static inline int md_ioctl_valid(unsigned int cmd)
{
	switch (cmd) {
	case GET_ARRAY_INFO:
	case GET_DISK_INFO:
	case RAID_VERSION:
		return 0;
	case ADD_NEW_DISK:
	case GET_BITMAP_FILE:
	case HOT_ADD_DISK:
	case HOT_REMOVE_DISK:
	case RESTART_ARRAY_RW:
	case RUN_ARRAY:
	case SET_ARRAY_INFO:
	case SET_BITMAP_FILE:
	case SET_DISK_FAULTY:
	case STOP_ARRAY:
	case STOP_ARRAY_RO:
	case CLUSTERED_DISK_NACK:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
		return 0;
	default:
		return -ENOTTY;
	}
}

static bool md_ioctl_need_suspend(unsigned int cmd)
{
	switch (cmd) {
	case ADD_NEW_DISK:
	case HOT_ADD_DISK:
	case HOT_REMOVE_DISK:
	case SET_BITMAP_FILE:
	case SET_ARRAY_INFO:
		return true;
	default:
		return false;
	}
}

static int __md_set_array_info(struct mddev *mddev, void __user *argp)
{
	mdu_array_info_t info;
	int err;

	if (!argp)
		memset(&info, 0, sizeof(info));
	else if (copy_from_user(&info, argp, sizeof(info)))
		return -EFAULT;

	if (mddev->pers) {
		err = update_array_info(mddev, &info);
		if (err)
			pr_warn("md: couldn't update array info. %d\n", err);
		return err;
	}

	if (!list_empty(&mddev->disks)) {
		pr_warn("md: array %s already has disks!\n", mdname(mddev));
		return -EBUSY;
	}

	if (mddev->raid_disks) {
		pr_warn("md: array %s already initialised!\n", mdname(mddev));
		return -EBUSY;
	}

	err = md_set_array_info(mddev, &info);
	if (err)
		pr_warn("md: couldn't set array info. %d\n", err);

	return err;
}

static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
		    unsigned int cmd, unsigned long arg)
{
	int err = 0;
	void __user *argp = (void __user *)arg;
	struct mddev *mddev = NULL;

	err = md_ioctl_valid(cmd);
	if (err)
		return err;

	/*
	 * Commands dealing with the RAID driver but not any
	 * particular array:
	 */
	if (cmd == RAID_VERSION)
		return get_version(argp);

	/*
	 * Commands creating/starting a new array:
	 */

	mddev = bdev->bd_disk->private_data;

	/* Some actions do not require the mutex */
	switch (cmd) {
	case GET_ARRAY_INFO:
		if (!mddev->raid_disks && !mddev->external)
			return -ENODEV;
		return get_array_info(mddev, argp);

	case GET_DISK_INFO:
		if (!mddev->raid_disks && !mddev->external)
			return -ENODEV;
		return get_disk_info(mddev, argp);

	case SET_DISK_FAULTY:
		return set_disk_faulty(mddev, new_decode_dev(arg));

	case GET_BITMAP_FILE:
		return get_bitmap_file(mddev, argp);
	}

	if (cmd == HOT_REMOVE_DISK)
		/* need to ensure recovery thread has run */
		wait_event_interruptible_timeout(mddev->sb_wait,
						 !test_bit(MD_RECOVERY_NEEDED,
							   &mddev->recovery),
						 msecs_to_jiffies(5000));
	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
		/* Need to flush page cache, and ensure no-one else opens
		 * and writes
		 */
		err = mddev_set_closing_and_sync_blockdev(mddev, 1);
		if (err)
			return err;
	}

	if (!md_is_rdwr(mddev))
		flush_work(&mddev->sync_work);

	err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
					   mddev_lock(mddev);
	if (err) {
		pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
			 err, cmd);
		goto out;
	}

	if (cmd == SET_ARRAY_INFO) {
		err = __md_set_array_info(mddev, argp);
		goto unlock;
	}

	/*
	 * Commands querying/configuring an existing array:
	 */
	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
	if ((!mddev->raid_disks && !mddev->external)
	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
	    && cmd != GET_BITMAP_FILE) {
		err = -ENODEV;
		goto unlock;
	}

	/*
	 * Commands even a read-only array can execute:
	 */
	switch (cmd) {
	case RESTART_ARRAY_RW:
		err = restart_array(mddev);
		goto unlock;

	case STOP_ARRAY:
		err = do_md_stop(mddev, 0);
		goto unlock;

	case STOP_ARRAY_RO:
		if (mddev->pers)
			err = md_set_readonly(mddev);
		goto unlock;

	case HOT_REMOVE_DISK:
		err = hot_remove_disk(mddev, new_decode_dev(arg));
		goto unlock;

	case ADD_NEW_DISK:
		/* We can support ADD_NEW_DISK on read-only arrays
		 * only if we are re-adding a preexisting device.
		 * So require mddev->pers and MD_DISK_SYNC.
		 */
		if (mddev->pers) {
			mdu_disk_info_t info;
			if (copy_from_user(&info, argp, sizeof(info)))
				err = -EFAULT;
			else if (!(info.state & (1<<MD_DISK_SYNC)))
				/* Need to clear read-only for this */
				break;
			else
				err = md_add_new_disk(mddev, &info);
			goto unlock;
		}
		break;
	}

	/*
	 * The remaining ioctls are changing the state of the
	 * superblock, so we do not allow them on read-only arrays.
	 */
	if (!md_is_rdwr(mddev) && mddev->pers) {
		if (mddev->ro != MD_AUTO_READ) {
			err = -EROFS;
			goto unlock;
		}
		mddev->ro = MD_RDWR;
		sysfs_notify_dirent_safe(mddev->sysfs_state);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		/* mddev_unlock will wake thread */
		/* If a device failed while we were read-only, we
		 * need to make sure the metadata is updated now.
		 */
		if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
			mddev_unlock(mddev);
			wait_event(mddev->sb_wait,
				   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
				   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
			mddev_lock_nointr(mddev);
		}
	}

	switch (cmd) {
	case ADD_NEW_DISK:
	{
		mdu_disk_info_t info;
		if (copy_from_user(&info, argp, sizeof(info)))
			err = -EFAULT;
		else
			err = md_add_new_disk(mddev, &info);
		goto unlock;
	}

	case CLUSTERED_DISK_NACK:
		if (mddev_is_clustered(mddev))
			md_cluster_ops->new_disk_ack(mddev, false);
		else
			err = -EINVAL;
		goto unlock;

	case HOT_ADD_DISK:
		err = hot_add_disk(mddev, new_decode_dev(arg));
		goto unlock;

	case RUN_ARRAY:
		err = do_md_run(mddev);
		goto unlock;

	case SET_BITMAP_FILE:
		err = set_bitmap_file(mddev, (int)arg);
		goto unlock;

	default:
		err = -EINVAL;
		goto unlock;
	}

unlock:
	if (mddev->hold_active == UNTIL_IOCTL &&
	    err != -EINVAL)
		mddev->hold_active = 0;

	md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
				     mddev_unlock(mddev);

out:
	if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
		clear_bit(MD_CLOSING, &mddev->flags);
	return err;
}
#ifdef CONFIG_COMPAT
static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case HOT_REMOVE_DISK:
	case HOT_ADD_DISK:
	case SET_DISK_FAULTY:
	case SET_BITMAP_FILE:
		/* These take in integer arg, do not convert */
		break;
	default:
		arg = (unsigned long)compat_ptr(arg);
		break;
	}

	return md_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static int md_set_read_only(struct block_device *bdev, bool ro)
{
	struct mddev *mddev = bdev->bd_disk->private_data;
	int err;

	err = mddev_lock(mddev);
	if (err)
		return err;

	if (!mddev->raid_disks && !mddev->external) {
		err = -ENODEV;
		goto out_unlock;
	}

	/*
	 * Transitioning to read-auto need only happen for arrays that call
	 * md_write_start and which are not ready for writes yet.
	 */
	if (!ro && mddev->ro == MD_RDONLY && mddev->pers) {
		err = restart_array(mddev);
		if (err)
			goto out_unlock;
		mddev->ro = MD_AUTO_READ;
	}

out_unlock:
	mddev_unlock(mddev);
	return err;
}

static int md_open(struct gendisk *disk, blk_mode_t mode)
{
	struct mddev *mddev;
	int err;

	spin_lock(&all_mddevs_lock);
	mddev = mddev_get(disk->private_data);
	spin_unlock(&all_mddevs_lock);
	if (!mddev)
		return -ENODEV;

	err = mutex_lock_interruptible(&mddev->open_mutex);
	if (err)
		goto out;

	err = -ENODEV;
	if (test_bit(MD_CLOSING, &mddev->flags))
		goto out_unlock;

	atomic_inc(&mddev->openers);
	mutex_unlock(&mddev->open_mutex);

	disk_check_media_change(disk);
	return 0;

out_unlock:
	mutex_unlock(&mddev->open_mutex);
out:
	mddev_put(mddev);
	return err;
}

static void md_release(struct gendisk *disk)
{
	struct mddev *mddev = disk->private_data;

	BUG_ON(!mddev);
	atomic_dec(&mddev->openers);
	mddev_put(mddev);
}

static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
{
	struct mddev *mddev = disk->private_data;
	unsigned int ret = 0;

	if (mddev->changed)
		ret = DISK_EVENT_MEDIA_CHANGE;
	mddev->changed = 0;
	return ret;
}

static void md_free_disk(struct gendisk *disk)
{
	struct mddev *mddev = disk->private_data;

	mddev_free(mddev);
}

const struct block_device_operations md_fops =
{
	.owner		= THIS_MODULE,
	.submit_bio	= md_submit_bio,
	.open		= md_open,
	.release	= md_release,
	.ioctl		= md_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= md_compat_ioctl,
#endif
	.getgeo		= md_getgeo,
	.check_events	= md_check_events,
	.set_read_only	= md_set_read_only,
	.free_disk	= md_free_disk,
};

static int md_thread(void *arg)
{
	struct md_thread *thread = arg;

	/*
	 * md_thread is a 'system-thread'; its priority should be very
	 * high. We avoid resource deadlocks individually in each
	 * raid personality. (RAID5 does preallocation) We also use RR and
	 * the very same RT priority as kswapd, thus we will never get
	 * into a priority inversion deadlock.
	 *
	 * we definitely have to have equal or higher priority than
	 * bdflush, otherwise bdflush will deadlock if there are too
	 * many dirty RAID5 blocks.
	 */
8042 | |
8043 | allow_signal(SIGKILL); |
8044 | while (!kthread_should_stop()) { |
8045 | |
8046 | /* We need to wait INTERRUPTIBLE so that |
8047 | * we don't add to the load-average. |
8048 | * That means we need to be sure no signals are |
8049 | * pending |
8050 | */ |
8051 | if (signal_pending(current)) |
8052 | flush_signals(current); |
8053 | |
8054 | wait_event_interruptible_timeout |
8055 | (thread->wqueue, |
8056 | test_bit(THREAD_WAKEUP, &thread->flags) |
8057 | || kthread_should_stop() || kthread_should_park(), |
8058 | thread->timeout); |
8059 | |
8060 | clear_bit(THREAD_WAKEUP, addr: &thread->flags); |
8061 | if (kthread_should_park()) |
8062 | kthread_parkme(); |
8063 | if (!kthread_should_stop()) |
8064 | thread->run(thread); |
8065 | } |
8066 | |
8067 | return 0; |
8068 | } |
8069 | |
8070 | static void md_wakeup_thread_directly(struct md_thread __rcu *thread) |
8071 | { |
8072 | struct md_thread *t; |
8073 | |
8074 | rcu_read_lock(); |
8075 | t = rcu_dereference(thread); |
8076 | if (t) |
8077 | wake_up_process(tsk: t->tsk); |
8078 | rcu_read_unlock(); |
8079 | } |
8080 | |
8081 | void md_wakeup_thread(struct md_thread __rcu *thread) |
8082 | { |
8083 | struct md_thread *t; |
8084 | |
8085 | rcu_read_lock(); |
8086 | t = rcu_dereference(thread); |
8087 | if (t) { |
8088 | pr_debug("md: waking up MD thread %s.\n" , t->tsk->comm); |
8089 | set_bit(THREAD_WAKEUP, addr: &t->flags); |
8090 | wake_up(&t->wqueue); |
8091 | } |
8092 | rcu_read_unlock(); |
8093 | } |
8094 | EXPORT_SYMBOL(md_wakeup_thread); |
8095 | |
8096 | struct md_thread *md_register_thread(void (*run) (struct md_thread *), |
8097 | struct mddev *mddev, const char *name) |
8098 | { |
8099 | struct md_thread *thread; |
8100 | |
8101 | thread = kzalloc(size: sizeof(struct md_thread), GFP_KERNEL); |
8102 | if (!thread) |
8103 | return NULL; |
8104 | |
8105 | init_waitqueue_head(&thread->wqueue); |
8106 | |
8107 | thread->run = run; |
8108 | thread->mddev = mddev; |
8109 | thread->timeout = MAX_SCHEDULE_TIMEOUT; |
8110 | thread->tsk = kthread_run(md_thread, thread, |
8111 | "%s_%s" , |
8112 | mdname(thread->mddev), |
8113 | name); |
8114 | if (IS_ERR(ptr: thread->tsk)) { |
8115 | kfree(objp: thread); |
8116 | return NULL; |
8117 | } |
8118 | return thread; |
8119 | } |
8120 | EXPORT_SYMBOL(md_register_thread); |
8121 | |
8122 | void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) |
8123 | { |
8124 | struct md_thread *thread = rcu_dereference_protected(*threadp, |
8125 | lockdep_is_held(&mddev->reconfig_mutex)); |
8126 | |
8127 | if (!thread) |
8128 | return; |
8129 | |
8130 | rcu_assign_pointer(*threadp, NULL); |
8131 | synchronize_rcu(); |
8132 | |
8133 | pr_debug("interrupting MD-thread pid %d\n" , task_pid_nr(thread->tsk)); |
8134 | kthread_stop(k: thread->tsk); |
8135 | kfree(objp: thread); |
8136 | } |
8137 | EXPORT_SYMBOL(md_unregister_thread); |
8138 | |
8139 | void md_error(struct mddev *mddev, struct md_rdev *rdev) |
8140 | { |
8141 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
8142 | return; |
8143 | |
8144 | if (!mddev->pers || !mddev->pers->error_handler) |
8145 | return; |
8146 | mddev->pers->error_handler(mddev, rdev); |
8147 | |
8148 | if (mddev->pers->level == 0) |
8149 | return; |
8150 | |
8151 | if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) |
8152 | set_bit(nr: MD_RECOVERY_RECOVER, addr: &mddev->recovery); |
8153 | sysfs_notify_dirent_safe(sd: rdev->sysfs_state); |
8154 | set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery); |
8155 | if (!test_bit(MD_BROKEN, &mddev->flags)) { |
8156 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
8157 | md_wakeup_thread(mddev->thread); |
8158 | } |
8159 | if (mddev->event_work.func) |
8160 | queue_work(wq: md_misc_wq, work: &mddev->event_work); |
8161 | md_new_event(); |
8162 | } |
8163 | EXPORT_SYMBOL(md_error); |
8164 | |
8165 | /* seq_file implementation /proc/mdstat */ |
8166 | |
8167 | static void status_unused(struct seq_file *seq) |
8168 | { |
8169 | int i = 0; |
8170 | struct md_rdev *rdev; |
8171 | |
8172 | seq_printf(m: seq, fmt: "unused devices: " ); |
8173 | |
8174 | list_for_each_entry(rdev, &pending_raid_disks, same_set) { |
8175 | i++; |
8176 | seq_printf(m: seq, fmt: "%pg " , rdev->bdev); |
8177 | } |
8178 | if (!i) |
8179 | seq_printf(m: seq, fmt: "<none>" ); |
8180 | |
8181 | seq_printf(m: seq, fmt: "\n" ); |
8182 | } |
8183 | |
8184 | static void status_personalities(struct seq_file *seq) |
8185 | { |
8186 | struct md_personality *pers; |
8187 | |
8188 | seq_puts(m: seq, s: "Personalities : " ); |
8189 | spin_lock(lock: &pers_lock); |
8190 | list_for_each_entry(pers, &pers_list, list) |
8191 | seq_printf(m: seq, fmt: "[%s] " , pers->name); |
8192 | |
8193 | spin_unlock(lock: &pers_lock); |
8194 | seq_puts(m: seq, s: "\n" ); |
8195 | } |
8196 | |
8197 | static int status_resync(struct seq_file *seq, struct mddev *mddev) |
8198 | { |
8199 | sector_t max_sectors, resync, res; |
8200 | unsigned long dt, db = 0; |
8201 | sector_t rt, curr_mark_cnt, resync_mark_cnt; |
8202 | int scale, recovery_active; |
8203 | unsigned int per_milli; |
8204 | |
8205 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || |
8206 | test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) |
8207 | max_sectors = mddev->resync_max_sectors; |
8208 | else |
8209 | max_sectors = mddev->dev_sectors; |
8210 | |
8211 | resync = mddev->curr_resync; |
8212 | if (resync < MD_RESYNC_ACTIVE) { |
8213 | if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) |
8214 | /* Still cleaning up */ |
8215 | resync = max_sectors; |
8216 | } else if (resync > max_sectors) { |
8217 | resync = max_sectors; |
8218 | } else { |
8219 | res = atomic_read(v: &mddev->recovery_active); |
8220 | /* |
8221 | * Resync has started, but the subtraction has overflowed or |
8222 | * yielded one of the special values. Force it to active to |
8223 | * ensure the status reports an active resync. |
8224 | */ |
8225 | if (resync < res || resync - res < MD_RESYNC_ACTIVE) |
8226 | resync = MD_RESYNC_ACTIVE; |
8227 | else |
8228 | resync -= res; |
8229 | } |
8230 | |
8231 | if (resync == MD_RESYNC_NONE) { |
8232 | if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { |
8233 | struct md_rdev *rdev; |
8234 | |
8235 | rdev_for_each(rdev, mddev) |
8236 | if (rdev->raid_disk >= 0 && |
8237 | !test_bit(Faulty, &rdev->flags) && |
8238 | rdev->recovery_offset != MaxSector && |
8239 | rdev->recovery_offset) { |
8240 | seq_printf(m: seq, fmt: "\trecover=REMOTE" ); |
8241 | return 1; |
8242 | } |
8243 | if (mddev->reshape_position != MaxSector) |
8244 | seq_printf(m: seq, fmt: "\treshape=REMOTE" ); |
8245 | else |
8246 | seq_printf(m: seq, fmt: "\tresync=REMOTE" ); |
8247 | return 1; |
8248 | } |
8249 | if (mddev->recovery_cp < MaxSector) { |
8250 | seq_printf(m: seq, fmt: "\tresync=PENDING" ); |
8251 | return 1; |
8252 | } |
8253 | return 0; |
8254 | } |
8255 | if (resync < MD_RESYNC_ACTIVE) { |
8256 | seq_printf(m: seq, fmt: "\tresync=DELAYED" ); |
8257 | return 1; |
8258 | } |
8259 | |
8260 | WARN_ON(max_sectors == 0); |
8261 | /* Pick 'scale' such that (resync>>scale)*1000 will fit |
8262 | * in a sector_t, and (max_sectors>>scale) will fit in a |
8263 | * u32, as those are the requirements for sector_div. |
8264 | * Thus 'scale' must be at least 10 |
8265 | */ |
8266 | scale = 10; |
8267 | if (sizeof(sector_t) > sizeof(unsigned long)) { |
8268 | while ( max_sectors/2 > (1ULL<<(scale+32))) |
8269 | scale++; |
8270 | } |
8271 | res = (resync>>scale)*1000; |
8272 | sector_div(res, (u32)((max_sectors>>scale)+1)); |
8273 | |
8274 | per_milli = res; |
8275 | { |
8276 | int i, x = per_milli/50, y = 20-x; |
8277 | seq_printf(seq, "["); |
8278 | for (i = 0; i < x; i++) |
8279 | seq_printf(seq, "="); |
8280 | seq_printf(seq, ">"); |
8281 | for (i = 0; i < y; i++) |
8282 | seq_printf(seq, "."); |
8283 | seq_printf(seq, "] "); |
8284 | } |
8285 | seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", |
8286 | (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? |
8287 | "reshape" : |
8288 | (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? |
8289 | "check" : |
8290 | (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? |
8291 | "resync" : "recovery"))), |
8292 | per_milli/10, per_milli % 10, |
8293 | (unsigned long long) resync/2, |
8294 | (unsigned long long) max_sectors/2); |
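     | /* |
     |  * Together with the bar drawn above, this renders the familiar |
     |  * /proc/mdstat progress line, e.g. (illustrative, not an exact |
     |  * excerpt): |
     |  * |
     |  *   [=====>...............]  recovery = 28.5% (123456/432100) |
     |  */ |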
8295 | |
8296 | /* |
8297 | * dt: time from mark until now |
8298 | * db: blocks written from mark until now |
8299 | * rt: remaining time |
8300 | * |
8301 | * rt is a sector_t, which is always 64bit now. We are keeping |
8302 | * the original algorithm, but it is not really necessary. |
8303 | * |
8304 | * Original algorithm: |
8305 | * So we divide before multiply in case it is 32bit and close |
8306 | * to the limit. |
8307 | * We scale the divisor (db) by 32 to avoid losing precision |
8308 | * near the end of resync when the number of remaining sectors |
8309 | * is close to 'db'. |
8310 | * We then divide rt by 32 after multiplying by db to compensate. |
8311 | * The '+1' avoids division by zero if db is very small. |
8312 | */ |
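     | /* |
     |  * Editor's note, a worked example of the formula below: with |
     |  * 1,000,000 sectors remaining and db = 20480 sectors written over |
     |  * dt = 10s, rt = (1000000 / (20480/32 + 1)) * 10 >> 5 = 487s, which |
     |  * matches the exact remaining/(db/dt) = 488s to within rounding. |
     |  */ |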
8313 | dt = ((jiffies - mddev->resync_mark) / HZ); |
8314 | if (!dt) dt++; |
8315 | |
8316 | curr_mark_cnt = mddev->curr_mark_cnt; |
8317 | recovery_active = atomic_read(&mddev->recovery_active); |
8318 | resync_mark_cnt = mddev->resync_mark_cnt; |
8319 | |
8320 | if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) |
8321 | db = curr_mark_cnt - (recovery_active + resync_mark_cnt); |
8322 | |
8323 | rt = max_sectors - resync; /* number of remaining sectors */ |
8324 | rt = div64_u64(rt, db/32+1); |
8325 | rt *= dt; |
8326 | rt >>= 5; |
8327 | |
8328 | seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, |
8329 | ((unsigned long)rt % 60)/6); |
8330 | |
8331 | seq_printf(seq, " speed=%ldK/sec", db/2/dt); |
8332 | return 1; |
8333 | } |
8334 | |
8335 | static void *md_seq_start(struct seq_file *seq, loff_t *pos) |
8336 | __acquires(&all_mddevs_lock) |
8337 | { |
8338 | seq->poll_event = atomic_read(&md_event_count); |
8339 | spin_lock(&all_mddevs_lock); |
8340 | |
8341 | return seq_list_start_head(&all_mddevs, *pos); |
8342 | } |
8343 | |
8344 | static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
8345 | { |
8346 | return seq_list_next(v, &all_mddevs, pos); |
8347 | } |
8348 | |
8349 | static void md_seq_stop(struct seq_file *seq, void *v) |
8350 | __releases(&all_mddevs_lock) |
8351 | { |
8352 | spin_unlock(&all_mddevs_lock); |
8353 | } |
8354 | |
8355 | static int md_seq_show(struct seq_file *seq, void *v) |
8356 | { |
8357 | struct mddev *mddev; |
8358 | sector_t sectors; |
8359 | struct md_rdev *rdev; |
8360 | |
8361 | if (v == &all_mddevs) { |
8362 | status_personalities(seq); |
8363 | if (list_empty(&all_mddevs)) |
8364 | status_unused(seq); |
8365 | return 0; |
8366 | } |
8367 | |
8368 | mddev = list_entry(v, struct mddev, all_mddevs); |
8369 | if (!mddev_get(mddev)) |
8370 | return 0; |
8371 | |
8372 | spin_unlock(&all_mddevs_lock); |
8373 | spin_lock(&mddev->lock); |
8374 | if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { |
8375 | seq_printf(seq, "%s : %sactive", mdname(mddev), |
8376 | mddev->pers ? "" : "in"); |
8377 | if (mddev->pers) { |
8378 | if (mddev->ro == MD_RDONLY) |
8379 | seq_printf(seq, " (read-only)"); |
8380 | if (mddev->ro == MD_AUTO_READ) |
8381 | seq_printf(seq, " (auto-read-only)"); |
8382 | seq_printf(seq, " %s", mddev->pers->name); |
8383 | } |
8384 | |
8385 | sectors = 0; |
8386 | rcu_read_lock(); |
8387 | rdev_for_each_rcu(rdev, mddev) { |
8388 | seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); |
8389 | |
8390 | if (test_bit(WriteMostly, &rdev->flags)) |
8391 | seq_printf(seq, "(W)"); |
8392 | if (test_bit(Journal, &rdev->flags)) |
8393 | seq_printf(seq, "(J)"); |
8394 | if (test_bit(Faulty, &rdev->flags)) { |
8395 | seq_printf(seq, "(F)"); |
8396 | continue; |
8397 | } |
8398 | if (rdev->raid_disk < 0) |
8399 | seq_printf(seq, "(S)"); /* spare */ |
8400 | if (test_bit(Replacement, &rdev->flags)) |
8401 | seq_printf(seq, "(R)"); |
8402 | sectors += rdev->sectors; |
8403 | } |
8404 | rcu_read_unlock(); |
8405 | |
8406 | if (!list_empty(&mddev->disks)) { |
8407 | if (mddev->pers) |
8408 | seq_printf(seq, "\n %llu blocks", |
8409 | (unsigned long long) |
8410 | mddev->array_sectors / 2); |
8411 | else |
8412 | seq_printf(seq, "\n %llu blocks", |
8413 | (unsigned long long)sectors / 2); |
8414 | } |
8415 | if (mddev->persistent) { |
8416 | if (mddev->major_version != 0 || |
8417 | mddev->minor_version != 90) { |
8418 | seq_printf(seq, " super %d.%d", |
8419 | mddev->major_version, |
8420 | mddev->minor_version); |
8421 | } |
8422 | } else if (mddev->external) |
8423 | seq_printf(seq, " super external:%s", |
8424 | mddev->metadata_type); |
8425 | else |
8426 | seq_printf(seq, " super non-persistent"); |
8427 | |
8428 | if (mddev->pers) { |
8429 | mddev->pers->status(seq, mddev); |
8430 | seq_printf(seq, "\n "); |
8431 | if (mddev->pers->sync_request) { |
8432 | if (status_resync(seq, mddev)) |
8433 | seq_printf(seq, "\n "); |
8434 | } |
8435 | } else |
8436 | seq_printf(seq, "\n "); |
8437 | |
8438 | md_bitmap_status(seq, mddev->bitmap); |
8439 | |
8440 | seq_printf(seq, "\n"); |
8441 | } |
8442 | spin_unlock(&mddev->lock); |
8443 | spin_lock(&all_mddevs_lock); |
8444 | |
8445 | if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) |
8446 | status_unused(seq); |
8447 | |
8448 | if (atomic_dec_and_test(&mddev->active)) |
8449 | __mddev_put(mddev); |
8450 | |
8451 | return 0; |
8452 | } |
8453 | |
8454 | static const struct seq_operations md_seq_ops = { |
8455 | .start = md_seq_start, |
8456 | .next = md_seq_next, |
8457 | .stop = md_seq_stop, |
8458 | .show = md_seq_show, |
8459 | }; |
8460 | |
8461 | static int md_seq_open(struct inode *inode, struct file *file) |
8462 | { |
8463 | struct seq_file *seq; |
8464 | int error; |
8465 | |
8466 | error = seq_open(file, &md_seq_ops); |
8467 | if (error) |
8468 | return error; |
8469 | |
8470 | seq = file->private_data; |
8471 | seq->poll_event = atomic_read(&md_event_count); |
8472 | return error; |
8473 | } |
8474 | |
8475 | static int md_unloading; |
8476 | static __poll_t mdstat_poll(struct file *filp, poll_table *wait) |
8477 | { |
8478 | struct seq_file *seq = filp->private_data; |
8479 | __poll_t mask; |
8480 | |
8481 | if (md_unloading) |
8482 | return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; |
8483 | poll_wait(filp, &md_event_waiters, wait); |
8484 | |
8485 | /* always allow read */ |
8486 | mask = EPOLLIN | EPOLLRDNORM; |
8487 | |
8488 | if (seq->poll_event != atomic_read(&md_event_count)) |
8489 | mask |= EPOLLERR | EPOLLPRI; |
8490 | return mask; |
8491 | } |
8492 | |
8493 | static const struct proc_ops mdstat_proc_ops = { |
8494 | .proc_open = md_seq_open, |
8495 | .proc_read = seq_read, |
8496 | .proc_lseek = seq_lseek, |
8497 | .proc_release = seq_release, |
8498 | .proc_poll = mdstat_poll, |
8499 | }; |
8500 | |
8501 | int register_md_personality(struct md_personality *p) |
8502 | { |
8503 | pr_debug("md: %s personality registered for level %d\n", |
8504 | p->name, p->level); |
8505 | spin_lock(&pers_lock); |
8506 | list_add_tail(&p->list, &pers_list); |
8507 | spin_unlock(&pers_lock); |
8508 | return 0; |
8509 | } |
8510 | EXPORT_SYMBOL(register_md_personality); |
8511 | |
8512 | int unregister_md_personality(struct md_personality *p) |
8513 | { |
8514 | pr_debug("md: %s personality unregistered\n", p->name); |
8515 | spin_lock(&pers_lock); |
8516 | list_del_init(&p->list); |
8517 | spin_unlock(&pers_lock); |
8518 | return 0; |
8519 | } |
8520 | EXPORT_SYMBOL(unregister_md_personality); |
8521 | |
8522 | int register_md_cluster_operations(struct md_cluster_operations *ops, |
8523 | struct module *module) |
8524 | { |
8525 | int ret = 0; |
8526 | spin_lock(&pers_lock); |
8527 | if (md_cluster_ops != NULL) |
8528 | ret = -EALREADY; |
8529 | else { |
8530 | md_cluster_ops = ops; |
8531 | md_cluster_mod = module; |
8532 | } |
8533 | spin_unlock(&pers_lock); |
8534 | return ret; |
8535 | } |
8536 | EXPORT_SYMBOL(register_md_cluster_operations); |
8537 | |
8538 | int unregister_md_cluster_operations(void) |
8539 | { |
8540 | spin_lock(&pers_lock); |
8541 | md_cluster_ops = NULL; |
8542 | spin_unlock(&pers_lock); |
8543 | return 0; |
8544 | } |
8545 | EXPORT_SYMBOL(unregister_md_cluster_operations); |
8546 | |
8547 | int md_setup_cluster(struct mddev *mddev, int nodes) |
8548 | { |
8549 | int ret; |
8550 | if (!md_cluster_ops) |
8551 | request_module("md-cluster"); |
8552 | spin_lock(&pers_lock); |
8553 | /* ensure module won't be unloaded */ |
8554 | if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { |
8555 | pr_warn("can't find md-cluster module or get its reference.\n"); |
8556 | spin_unlock(&pers_lock); |
8557 | return -ENOENT; |
8558 | } |
8559 | spin_unlock(&pers_lock); |
8560 | |
8561 | ret = md_cluster_ops->join(mddev, nodes); |
8562 | if (!ret) |
8563 | mddev->safemode_delay = 0; |
8564 | return ret; |
8565 | } |
8566 | |
8567 | void md_cluster_stop(struct mddev *mddev) |
8568 | { |
8569 | if (!md_cluster_ops) |
8570 | return; |
8571 | md_cluster_ops->leave(mddev); |
8572 | module_put(md_cluster_mod); |
8573 | } |
8574 | |
8575 | static int is_mddev_idle(struct mddev *mddev, int init) |
8576 | { |
8577 | struct md_rdev *rdev; |
8578 | int idle; |
8579 | int curr_events; |
8580 | |
8581 | idle = 1; |
8582 | rcu_read_lock(); |
8583 | rdev_for_each_rcu(rdev, mddev) { |
8584 | struct gendisk *disk = rdev->bdev->bd_disk; |
8585 | curr_events = (int)part_stat_read_accum(disk->part0, sectors) - |
8586 | atomic_read(&disk->sync_io); |
8587 | /* sync IO will cause sync_io to increase before the disk_stats |
8588 | * as sync_io is counted when a request starts, and |
8589 | * disk_stats is counted when it completes. |
8590 | * So resync activity will cause curr_events to be smaller than |
8591 | * when there was no such activity. |
8592 | * non-sync IO will cause disk_stat to increase without |
8593 | * increasing sync_io so curr_events will (eventually) |
8594 | * be larger than it was before. Once it becomes |
8595 | * substantially larger, the test below will cause |
8596 | * the array to appear non-idle, and resync will slow |
8597 | * down. |
8598 | * If there is a lot of outstanding resync activity when |
8599 | * we set last_event to curr_events, then all that activity |
8600 | * completing might cause the array to appear non-idle |
8601 | * and resync will be slowed down even though there might |
8602 | * not have been non-resync activity. This will only |
8603 | * happen once though. 'last_events' will soon reflect |
8604 | * the state where there is little or no outstanding |
8605 | * resync requests, and further resync activity will |
8606 | * always make curr_events less than last_events. |
8607 | * |
8608 | */ |
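     | /* Editor's note: the threshold below means that roughly 64 sectors |
     |  * (32KiB) of non-resync I/O between samples is enough to mark the |
     |  * array busy and slow the resync down. |
     |  */ |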
8609 | if (init || curr_events - rdev->last_events > 64) { |
8610 | rdev->last_events = curr_events; |
8611 | idle = 0; |
8612 | } |
8613 | } |
8614 | rcu_read_unlock(); |
8615 | return idle; |
8616 | } |
8617 | |
8618 | void md_done_sync(struct mddev *mddev, int blocks, int ok) |
8619 | { |
8620 | /* another "blocks" (512byte) blocks have been synced */ |
8621 | atomic_sub(blocks, &mddev->recovery_active); |
8622 | wake_up(&mddev->recovery_wait); |
8623 | if (!ok) { |
8624 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
8625 | set_bit(MD_RECOVERY_ERROR, &mddev->recovery); |
8626 | md_wakeup_thread(mddev->thread); |
8627 | // stop recovery, signal do_sync .... |
8628 | } |
8629 | } |
8630 | EXPORT_SYMBOL(md_done_sync); |
8631 | |
8632 | /* md_write_start(mddev, bi) |
8633 | * If we need to update some array metadata (e.g. 'active' flag |
8634 | * in superblock) before writing, schedule a superblock update |
8635 | * and wait for it to complete. |
8636 | * A return value of 'false' means that the write wasn't recorded |
8637 | * and cannot proceed because the array is being suspended. |
8638 | */ |
8639 | bool md_write_start(struct mddev *mddev, struct bio *bi) |
8640 | { |
8641 | int did_change = 0; |
8642 | |
8643 | if (bio_data_dir(bi) != WRITE) |
8644 | return true; |
8645 | |
8646 | BUG_ON(mddev->ro == MD_RDONLY); |
8647 | if (mddev->ro == MD_AUTO_READ) { |
8648 | /* need to switch to read/write */ |
8649 | flush_work(&mddev->sync_work); |
8650 | mddev->ro = MD_RDWR; |
8651 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
8652 | md_wakeup_thread(mddev->thread); |
8653 | md_wakeup_thread(mddev->sync_thread); |
8654 | did_change = 1; |
8655 | } |
8656 | rcu_read_lock(); |
8657 | percpu_ref_get(&mddev->writes_pending); |
8658 | smp_mb(); /* Match smp_mb in set_in_sync() */ |
8659 | if (mddev->safemode == 1) |
8660 | mddev->safemode = 0; |
8661 | /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ |
8662 | if (mddev->in_sync || mddev->sync_checkers) { |
8663 | spin_lock(&mddev->lock); |
8664 | if (mddev->in_sync) { |
8665 | mddev->in_sync = 0; |
8666 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
8667 | set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
8668 | md_wakeup_thread(mddev->thread); |
8669 | did_change = 1; |
8670 | } |
8671 | spin_unlock(&mddev->lock); |
8672 | } |
8673 | rcu_read_unlock(); |
8674 | if (did_change) |
8675 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
8676 | if (!mddev->has_superblocks) |
8677 | return true; |
8678 | wait_event(mddev->sb_wait, |
8679 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || |
8680 | is_md_suspended(mddev)); |
8681 | if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
8682 | percpu_ref_put(&mddev->writes_pending); |
8683 | return false; |
8684 | } |
8685 | return true; |
8686 | } |
8687 | EXPORT_SYMBOL(md_write_start); |
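     | /* |
     |  * A minimal usage sketch (editor's illustration, not a verbatim |
     |  * excerpt from a personality): in a make_request path a write is only |
     |  * issued once md_write_start() accepts it, and every accepted write |
     |  * is later balanced by md_write_end() from the completion path: |
     |  * |
     |  *	if (!md_write_start(mddev, bio)) |
     |  *		return false;	(array is suspended; retry the bio later) |
     |  */ |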
8688 | |
8689 | /* md_write_inc can only be called when md_write_start() has |
8690 | * already been called at least once for the current request. |
8691 | * It increments the counter and is useful when a single request |
8692 | * is split into several parts. Each part causes an increment and |
8693 | * so needs a matching md_write_end(). |
8694 | * Unlike md_write_start(), it is safe to call md_write_inc() inside |
8695 | * a spinlocked region. |
8696 | */ |
8697 | void md_write_inc(struct mddev *mddev, struct bio *bi) |
8698 | { |
8699 | if (bio_data_dir(bi) != WRITE) |
8700 | return; |
8701 | WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); |
8702 | percpu_ref_get(&mddev->writes_pending); |
8703 | } |
8704 | EXPORT_SYMBOL(md_write_inc); |
8705 | |
8706 | void md_write_end(struct mddev *mddev) |
8707 | { |
8708 | percpu_ref_put(&mddev->writes_pending); |
8709 | |
8710 | if (mddev->safemode == 2) |
8711 | md_wakeup_thread(mddev->thread); |
8712 | else if (mddev->safemode_delay) |
8713 | /* The roundup() ensures this only performs locking once |
8714 | * every ->safemode_delay jiffies |
8715 | */ |
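     | /* Editor's note: with safemode_delay == 200 jiffies, for example, |
     |  * every write completing within the same 200-jiffy window computes |
     |  * the same expiry, so mod_timer() is effectively a no-op for all but |
     |  * the first caller in that window. |
     |  */ |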
8716 | mod_timer(&mddev->safemode_timer, |
8717 | roundup(jiffies, mddev->safemode_delay) + |
8718 | mddev->safemode_delay); |
8719 | } |
8720 | |
8721 | EXPORT_SYMBOL(md_write_end); |
8722 | |
8723 | /* This is used by raid0 and raid10 */ |
8724 | void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, |
8725 | struct bio *bio, sector_t start, sector_t size) |
8726 | { |
8727 | struct bio *discard_bio = NULL; |
8728 | |
8729 | if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, |
8730 | &discard_bio) || !discard_bio) |
8731 | return; |
8732 | |
8733 | bio_chain(discard_bio, bio); |
8734 | bio_clone_blkg_association(discard_bio, bio); |
8735 | mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector); |
8736 | submit_bio_noacct(discard_bio); |
8737 | } |
8738 | EXPORT_SYMBOL_GPL(md_submit_discard_bio); |
8739 | |
8740 | static void md_end_clone_io(struct bio *bio) |
8741 | { |
8742 | struct md_io_clone *md_io_clone = bio->bi_private; |
8743 | struct bio *orig_bio = md_io_clone->orig_bio; |
8744 | struct mddev *mddev = md_io_clone->mddev; |
8745 | |
8746 | if (bio->bi_status && !orig_bio->bi_status) |
8747 | orig_bio->bi_status = bio->bi_status; |
8748 | |
8749 | if (md_io_clone->start_time) |
8750 | bio_end_io_acct(orig_bio, md_io_clone->start_time); |
8751 | |
8752 | bio_put(bio); |
8753 | bio_endio(orig_bio); |
8754 | percpu_ref_put(&mddev->active_io); |
8755 | } |
8756 | |
8757 | static void md_clone_bio(struct mddev *mddev, struct bio **bio) |
8758 | { |
8759 | struct block_device *bdev = (*bio)->bi_bdev; |
8760 | struct md_io_clone *md_io_clone; |
8761 | struct bio *clone = |
8762 | bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); |
8763 | |
8764 | md_io_clone = container_of(clone, struct md_io_clone, bio_clone); |
8765 | md_io_clone->orig_bio = *bio; |
8766 | md_io_clone->mddev = mddev; |
8767 | if (blk_queue_io_stat(bdev->bd_disk->queue)) |
8768 | md_io_clone->start_time = bio_start_io_acct(*bio); |
8769 | |
8770 | clone->bi_end_io = md_end_clone_io; |
8771 | clone->bi_private = md_io_clone; |
8772 | *bio = clone; |
8773 | } |
8774 | |
8775 | void md_account_bio(struct mddev *mddev, struct bio **bio) |
8776 | { |
8777 | percpu_ref_get(&mddev->active_io); |
8778 | md_clone_bio(mddev, bio); |
8779 | } |
8780 | EXPORT_SYMBOL_GPL(md_account_bio); |
8781 | |
8782 | void md_free_cloned_bio(struct bio *bio) |
8783 | { |
8784 | struct md_io_clone *md_io_clone = bio->bi_private; |
8785 | struct bio *orig_bio = md_io_clone->orig_bio; |
8786 | struct mddev *mddev = md_io_clone->mddev; |
8787 | |
8788 | if (bio->bi_status && !orig_bio->bi_status) |
8789 | orig_bio->bi_status = bio->bi_status; |
8790 | |
8791 | if (md_io_clone->start_time) |
8792 | bio_end_io_acct(orig_bio, md_io_clone->start_time); |
8793 | |
8794 | bio_put(bio); |
8795 | percpu_ref_put(&mddev->active_io); |
8796 | } |
8797 | EXPORT_SYMBOL_GPL(md_free_cloned_bio); |
8798 | |
8799 | /* md_allow_write(mddev) |
8800 | * Calling this ensures that the array is marked 'active' so that writes |
8801 | * may proceed without blocking. It is important to call this before |
8802 | * attempting a GFP_KERNEL allocation while holding the mddev lock. |
8803 | * Must be called with mddev_lock held. |
8804 | */ |
8805 | void md_allow_write(struct mddev *mddev) |
8806 | { |
8807 | if (!mddev->pers) |
8808 | return; |
8809 | if (!md_is_rdwr(mddev)) |
8810 | return; |
8811 | if (!mddev->pers->sync_request) |
8812 | return; |
8813 | |
8814 | spin_lock(&mddev->lock); |
8815 | if (mddev->in_sync) { |
8816 | mddev->in_sync = 0; |
8817 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
8818 | set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
8819 | if (mddev->safemode_delay && |
8820 | mddev->safemode == 0) |
8821 | mddev->safemode = 1; |
8822 | spin_unlock(&mddev->lock); |
8823 | md_update_sb(mddev, 0); |
8824 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
8825 | /* wait for the dirty state to be recorded in the metadata */ |
8826 | wait_event(mddev->sb_wait, |
8827 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); |
8828 | } else |
8829 | spin_unlock(&mddev->lock); |
8830 | } |
8831 | EXPORT_SYMBOL_GPL(md_allow_write); |
8832 | |
8833 | #define SYNC_MARKS 10 |
8834 | #define SYNC_MARK_STEP (3*HZ) |
8835 | #define UPDATE_FREQUENCY (5*60*HZ) |
8836 | void md_do_sync(struct md_thread *thread) |
8837 | { |
8838 | struct mddev *mddev = thread->mddev; |
8839 | struct mddev *mddev2; |
8840 | unsigned int currspeed = 0, window; |
8841 | sector_t max_sectors,j, io_sectors, recovery_done; |
8842 | unsigned long mark[SYNC_MARKS]; |
8843 | unsigned long update_time; |
8844 | sector_t mark_cnt[SYNC_MARKS]; |
8845 | int last_mark,m; |
8846 | sector_t last_check; |
8847 | int skipped = 0; |
8848 | struct md_rdev *rdev; |
8849 | char *desc, *action = NULL; |
8850 | struct blk_plug plug; |
8851 | int ret; |
8852 | |
8853 | /* just in case the thread restarts... */ |
8854 | if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) |
8855 | return; |
8856 | |
8857 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
8858 | goto skip; |
8859 | |
8860 | if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || |
8861 | !md_is_rdwr(mddev)) {/* never try to sync a read-only array */ |
8862 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
8863 | goto skip; |
8864 | } |
8865 | |
8866 | if (mddev_is_clustered(mddev)) { |
8867 | ret = md_cluster_ops->resync_start(mddev); |
8868 | if (ret) |
8869 | goto skip; |
8870 | |
8871 | set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); |
8872 | if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || |
8873 | test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || |
8874 | test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) |
8875 | && ((unsigned long long)mddev->curr_resync_completed |
8876 | < (unsigned long long)mddev->resync_max_sectors)) |
8877 | goto skip; |
8878 | } |
8879 | |
8880 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
8881 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { |
8882 | desc = "data-check"; |
8883 | action = "check"; |
8884 | } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { |
8885 | desc = "requested-resync"; |
8886 | action = "repair"; |
8887 | } else |
8888 | desc = "resync"; |
8889 | } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) |
8890 | desc = "reshape"; |
8891 | else |
8892 | desc = "recovery"; |
8893 | |
8894 | mddev->last_sync_action = action ?: desc; |
8895 | |
8896 | /* |
8897 | * Before starting a resync we must have set curr_resync to |
8898 | * MD_RESYNC_DELAYED, and then checked that every "conflicting" array |
8899 | * has curr_resync less than ours. When we find one that is the same or |
8900 | * higher we wait on resync_wait. To avoid deadlock, we reduce |
8901 | * curr_resync to MD_RESYNC_YIELDED if we choose to yield (based |
8902 | * arbitrarily on the address of the mddev structure). This will mean |
8903 | * we have to start checking from the beginning again. |
8904 | */ |
8905 | |
8906 | do { |
8907 | int mddev2_minor = -1; |
8908 | mddev->curr_resync = MD_RESYNC_DELAYED; |
8909 | |
8910 | try_again: |
8911 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
8912 | goto skip; |
8913 | spin_lock(&all_mddevs_lock); |
8914 | list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { |
8915 | if (test_bit(MD_DELETED, &mddev2->flags)) |
8916 | continue; |
8917 | if (mddev2 == mddev) |
8918 | continue; |
8919 | if (!mddev->parallel_resync |
8920 | && mddev2->curr_resync |
8921 | && match_mddev_units(mddev, mddev2)) { |
8922 | DEFINE_WAIT(wq); |
8923 | if (mddev < mddev2 && |
8924 | mddev->curr_resync == MD_RESYNC_DELAYED) { |
8925 | /* arbitrarily yield */ |
8926 | mddev->curr_resync = MD_RESYNC_YIELDED; |
8927 | wake_up(&resync_wait); |
8928 | } |
8929 | if (mddev > mddev2 && |
8930 | mddev->curr_resync == MD_RESYNC_YIELDED) |
8931 | /* no need to wait here, we can wait the next |
8932 | * time 'round when curr_resync == MD_RESYNC_DELAYED |
8933 | */ |
8934 | continue; |
8935 | /* We need to wait 'interruptible' so as not to |
8936 | * contribute to the load average, and not to |
8937 | * be caught by 'softlockup' |
8938 | */ |
8939 | prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); |
8940 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && |
8941 | mddev2->curr_resync >= mddev->curr_resync) { |
8942 | if (mddev2_minor != mddev2->md_minor) { |
8943 | mddev2_minor = mddev2->md_minor; |
8944 | pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", |
8945 | desc, mdname(mddev), |
8946 | mdname(mddev2)); |
8947 | } |
8948 | spin_unlock(&all_mddevs_lock); |
8949 | |
8950 | if (signal_pending(current)) |
8951 | flush_signals(current); |
8952 | schedule(); |
8953 | finish_wait(&resync_wait, &wq); |
8954 | goto try_again; |
8955 | } |
8956 | finish_wait(&resync_wait, &wq); |
8957 | } |
8958 | } |
8959 | spin_unlock(&all_mddevs_lock); |
8960 | } while (mddev->curr_resync < MD_RESYNC_DELAYED); |
8961 | |
8962 | j = 0; |
8963 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
8964 | /* resync follows the size requested by the personality, |
8965 | * which defaults to physical size, but can be virtual size |
8966 | */ |
8967 | max_sectors = mddev->resync_max_sectors; |
8968 | atomic64_set(&mddev->resync_mismatches, 0); |
8969 | /* we don't use the checkpoint if there's a bitmap */ |
8970 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) |
8971 | j = mddev->resync_min; |
8972 | else if (!mddev->bitmap) |
8973 | j = mddev->recovery_cp; |
8974 | |
8975 | } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { |
8976 | max_sectors = mddev->resync_max_sectors; |
8977 | /* |
8978 | * If the original node aborts reshaping then we continue the |
8979 | * reshaping, so set j again to avoid restarting the reshape |
8980 | * from the very beginning |
8981 | */ |
8982 | if (mddev_is_clustered(mddev) && |
8983 | mddev->reshape_position != MaxSector) |
8984 | j = mddev->reshape_position; |
8985 | } else { |
8986 | /* recovery follows the physical size of devices */ |
8987 | max_sectors = mddev->dev_sectors; |
8988 | j = MaxSector; |
8989 | rcu_read_lock(); |
8990 | rdev_for_each_rcu(rdev, mddev) |
8991 | if (rdev->raid_disk >= 0 && |
8992 | !test_bit(Journal, &rdev->flags) && |
8993 | !test_bit(Faulty, &rdev->flags) && |
8994 | !test_bit(In_sync, &rdev->flags) && |
8995 | rdev->recovery_offset < j) |
8996 | j = rdev->recovery_offset; |
8997 | rcu_read_unlock(); |
8998 | |
8999 | /* If there is a bitmap, we need to make sure all |
9000 | * writes that started before we added a spare |
9001 | * complete before we start doing a recovery. |
9002 | * Otherwise the write might complete and (via |
9003 | * bitmap_endwrite) set a bit in the bitmap after the |
9004 | * recovery has checked that bit and skipped that |
9005 | * region. |
9006 | */ |
9007 | if (mddev->bitmap) { |
9008 | mddev->pers->quiesce(mddev, 1); |
9009 | mddev->pers->quiesce(mddev, 0); |
9010 | } |
9011 | } |
9012 | |
9013 | pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); |
9014 | pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); |
9015 | pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", |
9016 | speed_max(mddev), desc); |
9017 | |
9018 | is_mddev_idle(mddev, 1); /* this initializes IO event counters */ |
9019 | |
9020 | io_sectors = 0; |
9021 | for (m = 0; m < SYNC_MARKS; m++) { |
9022 | mark[m] = jiffies; |
9023 | mark_cnt[m] = io_sectors; |
9024 | } |
9025 | last_mark = 0; |
9026 | mddev->resync_mark = mark[last_mark]; |
9027 | mddev->resync_mark_cnt = mark_cnt[last_mark]; |
9028 | |
9029 | /* |
9030 | * Tune reconstruction: |
9031 | */ |
9032 | window = 32 * (PAGE_SIZE / 512); |
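     | /* Editor's note: with 4KiB pages this is 32 * 8 = 256 sectors, i.e. |
     |  * the 128k window reported by the pr_debug() below. |
     |  */ |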
9033 | pr_debug("md: using %dk window, over a total of %lluk.\n", |
9034 | window/2, (unsigned long long)max_sectors/2); |
9035 | |
9036 | atomic_set(&mddev->recovery_active, 0); |
9037 | last_check = 0; |
9038 | |
9039 | if (j >= MD_RESYNC_ACTIVE) { |
9040 | pr_debug("md: resuming %s of %s from checkpoint.\n", |
9041 | desc, mdname(mddev)); |
9042 | mddev->curr_resync = j; |
9043 | } else |
9044 | mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ |
9045 | mddev->curr_resync_completed = j; |
9046 | sysfs_notify_dirent_safe(mddev->sysfs_completed); |
9047 | md_new_event(); |
9048 | update_time = jiffies; |
9049 | |
9050 | blk_start_plug(&plug); |
9051 | while (j < max_sectors) { |
9052 | sector_t sectors; |
9053 | |
9054 | skipped = 0; |
9055 | |
9056 | if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
9057 | ((mddev->curr_resync > mddev->curr_resync_completed && |
9058 | (mddev->curr_resync - mddev->curr_resync_completed) |
9059 | > (max_sectors >> 4)) || |
9060 | time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || |
9061 | (j - mddev->curr_resync_completed)*2 |
9062 | >= mddev->resync_max - mddev->curr_resync_completed || |
9063 | mddev->curr_resync_completed > mddev->resync_max |
9064 | )) { |
9065 | /* time to update curr_resync_completed */ |
9066 | wait_event(mddev->recovery_wait, |
9067 | atomic_read(&mddev->recovery_active) == 0); |
9068 | mddev->curr_resync_completed = j; |
9069 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && |
9070 | j > mddev->recovery_cp) |
9071 | mddev->recovery_cp = j; |
9072 | update_time = jiffies; |
9073 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
9074 | sysfs_notify_dirent_safe(mddev->sysfs_completed); |
9075 | } |
9076 | |
9077 | while (j >= mddev->resync_max && |
9078 | !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
9079 | /* As this condition is controlled by user-space, |
9080 | * we can block indefinitely, so use '_interruptible' |
9081 | * to avoid triggering warnings. |
9082 | */ |
9083 | flush_signals(current); /* just in case */ |
9084 | wait_event_interruptible(mddev->recovery_wait, |
9085 | mddev->resync_max > j |
9086 | || test_bit(MD_RECOVERY_INTR, |
9087 | &mddev->recovery)); |
9088 | } |
9089 | |
9090 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
9091 | break; |
9092 | |
9093 | sectors = mddev->pers->sync_request(mddev, j, &skipped); |
9094 | if (sectors == 0) { |
9095 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
9096 | break; |
9097 | } |
9098 | |
9099 | if (!skipped) { /* actual IO requested */ |
9100 | io_sectors += sectors; |
9101 | atomic_add(sectors, &mddev->recovery_active); |
9102 | } |
9103 | |
9104 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
9105 | break; |
9106 | |
9107 | j += sectors; |
9108 | if (j > max_sectors) |
9109 | /* when skipping, extra large numbers can be returned. */ |
9110 | j = max_sectors; |
9111 | if (j >= MD_RESYNC_ACTIVE) |
9112 | mddev->curr_resync = j; |
9113 | mddev->curr_mark_cnt = io_sectors; |
9114 | if (last_check == 0) |
9115 | /* this is the earliest that rebuild will be |
9116 | * visible in /proc/mdstat |
9117 | */ |
9118 | md_new_event(); |
9119 | |
9120 | if (last_check + window > io_sectors || j == max_sectors) |
9121 | continue; |
9122 | |
9123 | last_check = io_sectors; |
9124 | repeat: |
9125 | if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { |
9126 | /* step marks */ |
9127 | int next = (last_mark+1) % SYNC_MARKS; |
9128 | |
9129 | mddev->resync_mark = mark[next]; |
9130 | mddev->resync_mark_cnt = mark_cnt[next]; |
9131 | mark[next] = jiffies; |
9132 | mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); |
9133 | last_mark = next; |
9134 | } |
9135 | |
9136 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
9137 | break; |
9138 | |
9139 | /* |
9140 | * this loop exits only if we are slower than the 'hard' |
9141 | * speed limit, or if the system was IO-idle for a jiffy. |
9142 | * |
9143 | * the system might be non-idle CPU-wise, but we only care |
9144 | * about not overloading the IO subsystem. (things like an |
9145 | * e2fsck being done on the RAID array should execute fast) |
9146 | */ |
9147 | cond_resched(); |
9148 | |
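     | /* Editor's note: currspeed is in KiB/s -- sectors completed since |
     |  * the last mark, halved to KiB, divided by elapsed seconds (the |
     |  * divisor and the result each get +1 to avoid a division by zero |
     |  * and a zero reported rate). |
     |  */ |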
9149 | recovery_done = io_sectors - atomic_read(&mddev->recovery_active); |
9150 | currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 |
9151 | /((jiffies-mddev->resync_mark)/HZ +1) +1; |
9152 | |
9153 | if (currspeed > speed_min(mddev)) { |
9154 | if (currspeed > speed_max(mddev)) { |
9155 | msleep(500); |
9156 | goto repeat; |
9157 | } |
9158 | if (!is_mddev_idle(mddev, init: 0)) { |
9159 | /* |
9160 | * Give other IO more of a chance. |
9161 | * The faster the devices, the less we wait. |
9162 | */ |
9163 | wait_event(mddev->recovery_wait, |
9164 | !atomic_read(&mddev->recovery_active)); |
9165 | } |
9166 | } |
9167 | } |
9168 | pr_info("md: %s: %s %s.\n", mdname(mddev), desc, |
9169 | test_bit(MD_RECOVERY_INTR, &mddev->recovery) |
9170 | ? "interrupted" : "done"); |
9171 | /* |
9172 | * this also signals 'finished resyncing' to md_stop |
9173 | */ |
9174 | blk_finish_plug(&plug); |
9175 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); |
9176 | |
9177 | if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
9178 | !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && |
9179 | mddev->curr_resync >= MD_RESYNC_ACTIVE) { |
9180 | mddev->curr_resync_completed = mddev->curr_resync; |
9181 | sysfs_notify_dirent_safe(mddev->sysfs_completed); |
9182 | } |
9183 | mddev->pers->sync_request(mddev, max_sectors, &skipped); |
9184 | |
9185 | if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && |
9186 | mddev->curr_resync > MD_RESYNC_ACTIVE) { |
9187 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
9188 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
9189 | if (mddev->curr_resync >= mddev->recovery_cp) { |
9190 | pr_debug("md: checkpointing %s of %s.\n", |
9191 | desc, mdname(mddev)); |
9192 | if (test_bit(MD_RECOVERY_ERROR, |
9193 | &mddev->recovery)) |
9194 | mddev->recovery_cp = |
9195 | mddev->curr_resync_completed; |
9196 | else |
9197 | mddev->recovery_cp = |
9198 | mddev->curr_resync; |
9199 | } |
9200 | } else |
9201 | mddev->recovery_cp = MaxSector; |
9202 | } else { |
9203 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
9204 | mddev->curr_resync = MaxSector; |
9205 | if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
9206 | test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { |
9207 | rcu_read_lock(); |
9208 | rdev_for_each_rcu(rdev, mddev) |
9209 | if (rdev->raid_disk >= 0 && |
9210 | mddev->delta_disks >= 0 && |
9211 | !test_bit(Journal, &rdev->flags) && |
9212 | !test_bit(Faulty, &rdev->flags) && |
9213 | !test_bit(In_sync, &rdev->flags) && |
9214 | rdev->recovery_offset < mddev->curr_resync) |
9215 | rdev->recovery_offset = mddev->curr_resync; |
9216 | rcu_read_unlock(); |
9217 | } |
9218 | } |
9219 | } |
9220 | skip: |
9221 | /* set CHANGE_PENDING here since maybe another update is needed, |
9222 | * so other nodes are informed. It should be harmless for normal |
9223 | * raid */ |
9224 | set_mask_bits(&mddev->sb_flags, 0, |
9225 | BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); |
9226 | |
9227 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
9228 | !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && |
9229 | mddev->delta_disks > 0 && |
9230 | mddev->pers->finish_reshape && |
9231 | mddev->pers->size && |
9232 | !mddev_is_dm(mddev)) { |
9233 | mddev_lock_nointr(mddev); |
9234 | md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); |
9235 | mddev_unlock(mddev); |
9236 | if (!mddev_is_clustered(mddev)) |
9237 | set_capacity_and_notify(mddev->gendisk, |
9238 | mddev->array_sectors); |
9239 | } |
9240 | |
9241 | spin_lock(&mddev->lock); |
9242 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
9243 | /* We completed so min/max setting can be forgotten if used. */ |
9244 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) |
9245 | mddev->resync_min = 0; |
9246 | mddev->resync_max = MaxSector; |
9247 | } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) |
9248 | mddev->resync_min = mddev->curr_resync_completed; |
9249 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); |
9250 | mddev->curr_resync = MD_RESYNC_NONE; |
9251 | spin_unlock(&mddev->lock); |
9252 | |
9253 | wake_up(&resync_wait); |
9254 | md_wakeup_thread(mddev->thread); |
9255 | return; |
9256 | } |
9257 | EXPORT_SYMBOL_GPL(md_do_sync); |
9258 | |
9259 | static bool rdev_removeable(struct md_rdev *rdev) |
9260 | { |
9261 | /* rdev is not used. */ |
9262 | if (rdev->raid_disk < 0) |
9263 | return false; |
9264 | |
9265 | /* There are still inflight io, don't remove this rdev. */ |
9266 | if (atomic_read(&rdev->nr_pending)) |
9267 | return false; |
9268 | |
9269 | /* |
9270 | * An error occurred but has not yet been acknowledged by the metadata |
9271 | * handler, don't remove this rdev. |
9272 | */ |
9273 | if (test_bit(Blocked, &rdev->flags)) |
9274 | return false; |
9275 | |
9276 | /* A Faulty rdev is not used, so it's safe to remove it. */ |
9277 | if (test_bit(Faulty, &rdev->flags)) |
9278 | return true; |
9279 | |
9280 | /* Journal disk can only be removed if it's faulty. */ |
9281 | if (test_bit(Journal, &rdev->flags)) |
9282 | return false; |
9283 | |
9284 | /* |
9285 | * 'In_sync' is cleared while 'raid_disk' is valid, which means |
9286 | * replacement has just become active from pers->spare_active(), and |
9287 | * then pers->hot_remove_disk() will replace this rdev with replacement. |
9288 | */ |
9289 | if (!test_bit(In_sync, &rdev->flags)) |
9290 | return true; |
9291 | |
9292 | return false; |
9293 | } |
9294 | |
9295 | static bool rdev_is_spare(struct md_rdev *rdev) |
9296 | { |
9297 | return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && |
9298 | !test_bit(In_sync, &rdev->flags) && |
9299 | !test_bit(Journal, &rdev->flags) && |
9300 | !test_bit(Faulty, &rdev->flags); |
9301 | } |
9302 | |
9303 | static bool rdev_addable(struct md_rdev *rdev) |
9304 | { |
9305 | /* rdev is already used, don't add it again. */ |
9306 | if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || |
9307 | test_bit(Faulty, &rdev->flags)) |
9308 | return false; |
9309 | |
9310 | /* Allow to add journal disk. */ |
9311 | if (test_bit(Journal, &rdev->flags)) |
9312 | return true; |
9313 | |
9314 | /* Allow to add if array is read-write. */ |
9315 | if (md_is_rdwr(rdev->mddev)) |
9316 | return true; |
9317 | |
9318 | /* |
9319 | * For a read-only array, only allow re-adding an rdev. And if a |
9320 | * bitmap is used, don't allow re-adding an rdev that is too old. |
9321 | */ |
9322 | if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) |
9323 | return true; |
9324 | |
9325 | return false; |
9326 | } |
9327 | |
9328 | static bool md_spares_need_change(struct mddev *mddev) |
9329 | { |
9330 | struct md_rdev *rdev; |
9331 | |
9332 | rcu_read_lock(); |
9333 | rdev_for_each_rcu(rdev, mddev) { |
9334 | if (rdev_removeable(rdev) || rdev_addable(rdev)) { |
9335 | rcu_read_unlock(); |
9336 | return true; |
9337 | } |
9338 | } |
9339 | rcu_read_unlock(); |
9340 | return false; |
9341 | } |
9342 | |
9343 | static int remove_and_add_spares(struct mddev *mddev, |
9344 | struct md_rdev *this) |
9345 | { |
9346 | struct md_rdev *rdev; |
9347 | int spares = 0; |
9348 | int removed = 0; |
9349 | |
9350 | if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
9351 | /* Mustn't remove devices when resync thread is running */ |
9352 | return 0; |
9353 | |
9354 | rdev_for_each(rdev, mddev) { |
9355 | if ((this == NULL || rdev == this) && rdev_removeable(rdev) && |
9356 | !mddev->pers->hot_remove_disk(mddev, rdev)) { |
9357 | sysfs_unlink_rdev(mddev, rdev); |
9358 | rdev->saved_raid_disk = rdev->raid_disk; |
9359 | rdev->raid_disk = -1; |
9360 | removed++; |
9361 | } |
9362 | } |
9363 | |
9364 | if (removed && mddev->kobj.sd) |
9365 | sysfs_notify_dirent_safe(mddev->sysfs_degraded); |
9366 | |
9367 | if (this && removed) |
9368 | goto no_add; |
9369 | |
9370 | rdev_for_each(rdev, mddev) { |
9371 | if (this && this != rdev) |
9372 | continue; |
9373 | if (rdev_is_spare(rdev)) |
9374 | spares++; |
9375 | if (!rdev_addable(rdev)) |
9376 | continue; |
9377 | if (!test_bit(Journal, &rdev->flags)) |
9378 | rdev->recovery_offset = 0; |
9379 | if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { |
9380 | /* failure here is OK */ |
9381 | sysfs_link_rdev(mddev, rdev); |
9382 | if (!test_bit(Journal, &rdev->flags)) |
9383 | spares++; |
9384 | md_new_event(); |
9385 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
9386 | } |
9387 | } |
9388 | no_add: |
9389 | if (removed) |
9390 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
9391 | return spares; |
9392 | } |
9393 | |
9394 | static bool md_choose_sync_action(struct mddev *mddev, int *spares) |
9395 | { |
9396 | /* Check if reshape is in progress first. */ |
9397 | if (mddev->reshape_position != MaxSector) { |
9398 | if (mddev->pers->check_reshape == NULL || |
9399 | mddev->pers->check_reshape(mddev) != 0) |
9400 | return false; |
9401 | |
9402 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); |
9403 | clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
9404 | return true; |
9405 | } |
9406 | |
9407 | /* |
9408 | * Remove any failed drives, then add spares if possible. Spares are |
9409 | * also removed and re-added, to allow the personality to fail the |
9410 | * re-add. |
9411 | */ |
9412 | *spares = remove_and_add_spares(mddev, NULL); |
9413 | if (*spares) { |
9414 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
9415 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); |
9416 | clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); |
9417 | |
9418 | /* Start new recovery. */ |
9419 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
9420 | return true; |
9421 | } |
9422 | |
9423 | /* Check if recovery is in progress. */ |
9424 | if (mddev->recovery_cp < MaxSector) { |
9425 | set_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
9426 | clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
9427 | return true; |
9428 | } |
9429 | |
9430 | /* Delay to choose resync/check/repair in md_do_sync(). */ |
9431 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) |
9432 | return true; |
9433 | |
9434 | /* Nothing to be done */ |
9435 | return false; |
9436 | } |
9437 | |
9438 | static void md_start_sync(struct work_struct *ws) |
9439 | { |
9440 | struct mddev *mddev = container_of(ws, struct mddev, sync_work); |
9441 | int spares = 0; |
9442 | bool suspend = false; |
9443 | char *name; |
9444 | |
9445 | /* |
9446 | * If reshape is still in progress, spares won't be added or removed |
9447 | * from conf until reshape is done. |
9448 | */ |
9449 | if (mddev->reshape_position == MaxSector && |
9450 | md_spares_need_change(mddev)) { |
9451 | suspend = true; |
9452 | mddev_suspend(mddev, false); |
9453 | } |
9454 | |
9455 | mddev_lock_nointr(mddev); |
9456 | if (!md_is_rdwr(mddev)) { |
9457 | /* |
9458 | * On a read-only array we can: |
9459 | * - remove failed devices |
9460 | * - add already-in_sync devices if the array itself is in-sync. |
9461 | * As we only add devices that are already in-sync, we can |
9462 | * activate the spares immediately. |
9463 | */ |
9464 | remove_and_add_spares(mddev, NULL); |
9465 | goto not_running; |
9466 | } |
9467 | |
9468 | if (!md_choose_sync_action(mddev, &spares)) |
9469 | goto not_running; |
9470 | |
9471 | if (!mddev->pers->sync_request) |
9472 | goto not_running; |
9473 | |
9474 | /* |
9475 | * We are adding a device or devices to an array which has the bitmap |
9476 | * stored on all devices. So make sure all bitmap pages get written. |
9477 | */ |
9478 | if (spares) |
9479 | md_bitmap_write_all(mddev->bitmap); |
9480 | |
9481 | name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? |
9482 | "reshape" : "resync"; |
9483 | rcu_assign_pointer(mddev->sync_thread, |
9484 | md_register_thread(md_do_sync, mddev, name)); |
9485 | if (!mddev->sync_thread) { |
9486 | pr_warn("%s: could not start resync thread...\n", |
9487 | mdname(mddev)); |
9488 | /* leave the spares where they are, it shouldn't hurt */ |
9489 | goto not_running; |
9490 | } |
9491 | |
9492 | mddev_unlock(mddev); |
9493 | /* |
9494 | * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should |
9495 | * not set it again. Otherwise, we may cause issue like this one: |
9496 | * https://bugzilla.kernel.org/show_bug.cgi?id=218200 |
9497 | * Therefore, use __mddev_resume(mddev, false). |
9498 | */ |
9499 | if (suspend) |
9500 | __mddev_resume(mddev, false); |
9501 | md_wakeup_thread(mddev->sync_thread); |
9502 | sysfs_notify_dirent_safe(mddev->sysfs_action); |
9503 | md_new_event(); |
9504 | return; |
9505 | |
9506 | not_running: |
9507 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
9508 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); |
9509 | clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); |
9510 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); |
9511 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); |
9512 | mddev_unlock(mddev); |
9513 | /* |
9514 | * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should |
9515 | * not set it again. Otherwise, we may cause issue like this one: |
9516 | * https://bugzilla.kernel.org/show_bug.cgi?id=218200 |
9517 | * Therefore, use __mddev_resume(mddev, false). |
9518 | */ |
9519 | if (suspend) |
9520 | __mddev_resume(mddev, false); |
9521 | |
9522 | wake_up(&resync_wait); |
9523 | if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && |
9524 | mddev->sysfs_action) |
9525 | sysfs_notify_dirent_safe(mddev->sysfs_action); |
9526 | } |
9527 | |
9528 | static void unregister_sync_thread(struct mddev *mddev) |
9529 | { |
9530 | if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { |
9531 | /* resync/recovery still happening */ |
9532 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
9533 | return; |
9534 | } |
9535 | |
9536 | if (WARN_ON_ONCE(!mddev->sync_thread)) |
9537 | return; |
9538 | |
9539 | md_reap_sync_thread(mddev); |
9540 | } |
9541 | |
9542 | /* |
9543 | * This routine is regularly called by all per-raid-array threads to |
9544 | * deal with generic issues like resync and super-block update. |
9545 | * Raid personalities that don't have a thread (linear/raid0) do not |
9546 | * need this as they never do any recovery or update the superblock. |
9547 | * |
9548 | * It does not do any resync itself, but rather "forks" off other threads |
9549 | * to do that as needed. |
9550 | * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in |
9551 | * "->recovery" and create a thread at ->sync_thread. |
9552 | * When the thread finishes it sets MD_RECOVERY_DONE |
9553 | * and wakes up this thread, which will reap the thread and finish up. |
9554 | * This thread also removes any faulty devices (with nr_pending == 0). |
9555 | * |
9556 | * The overall approach is: |
9557 | * 1/ if the superblock needs updating, update it. |
9558 | * 2/ If a recovery thread is running, don't do anything else. |
9559 | * 3/ If recovery has finished, clean up, possibly marking spares active. |
9560 | * 4/ If there are any faulty devices, remove them. |
9561 | * 5/ If array is degraded, try to add spare devices |
9562 | * 6/ If array has spares or is not in-sync, start a resync thread. |
9563 | */ |
9564 | void md_check_recovery(struct mddev *mddev) |
9565 | { |
9566 | if (mddev->bitmap) |
9567 | md_bitmap_daemon_work(mddev); |
9568 | |
9569 | if (signal_pending(current)) { |
9570 | if (mddev->pers->sync_request && !mddev->external) { |
9571 | pr_debug("md: %s in immediate safe mode\n", |
9572 | mdname(mddev)); |
9573 | mddev->safemode = 2; |
9574 | } |
9575 | flush_signals(current); |
9576 | } |
9577 | |
9578 | if (!md_is_rdwr(mddev) && |
9579 | !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && |
9580 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) |
9581 | return; |
9582 | if ( ! ( |
9583 | (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || |
9584 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || |
9585 | test_bit(MD_RECOVERY_DONE, &mddev->recovery) || |
9586 | (mddev->external == 0 && mddev->safemode == 1) || |
9587 | (mddev->safemode == 2 |
9588 | && !mddev->in_sync && mddev->recovery_cp == MaxSector) |
9589 | )) |
9590 | return; |
9591 | |
9592 | if (mddev_trylock(mddev)) { |
9593 | bool try_set_sync = mddev->safemode != 0; |
9594 | |
9595 | if (!mddev->external && mddev->safemode == 1) |
9596 | mddev->safemode = 0; |
9597 | |
9598 | if (!md_is_rdwr(mddev)) { |
9599 | struct md_rdev *rdev; |
9600 | |
9601 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { |
9602 | unregister_sync_thread(mddev); |
9603 | goto unlock; |
9604 | } |
9605 | |
9606 | if (!mddev->external && mddev->in_sync) |
9607 | /* |
9608 | * 'Blocked' flag not needed as failed devices |
9609 | * will be recorded if array switched to read/write. |
9610 | * Leaving it set will prevent the device |
9611 | * from being removed. |
9612 | */ |
9613 | rdev_for_each(rdev, mddev) |
9614 | clear_bit(Blocked, &rdev->flags); |
9615 | |
9616 | /* |
9617 | * There is no thread, but we need to call |
9618 | * ->spare_active and clear saved_raid_disk |
9619 | */ |
9620 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
9621 | md_reap_sync_thread(mddev); |
9622 | |
9623 | /* |
9624 | * Let md_start_sync() to remove and add rdevs to the |
9625 | * array. |
9626 | */ |
9627 | if (md_spares_need_change(mddev)) { |
9628 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); |
9629 | queue_work(md_misc_wq, &mddev->sync_work); |
9630 | } |
9631 | |
9632 | clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
9633 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
9634 | clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
9635 | |
9636 | goto unlock; |
9637 | } |
9638 | |
9639 | if (mddev_is_clustered(mddev)) { |
9640 | struct md_rdev *rdev, *tmp; |
9641 | /* kick the device if another node issued a |
9642 | * remove disk. |
9643 | */ |
9644 | rdev_for_each_safe(rdev, tmp, mddev) { |
9645 | if (test_and_clear_bit(ClusterRemove, &rdev->flags) && |
9646 | rdev->raid_disk < 0) |
9647 | md_kick_rdev_from_array(rdev); |
9648 | } |
9649 | } |
9650 | |
9651 | if (try_set_sync && !mddev->external && !mddev->in_sync) { |
9652 | spin_lock(&mddev->lock); |
9653 | set_in_sync(mddev); |
9654 | spin_unlock(&mddev->lock); |
9655 | } |
9656 | |
9657 | if (mddev->sb_flags) |
9658 | md_update_sb(mddev, 0); |
9659 | |
9660 | /* |
9661 | * Never start a new sync thread if MD_RECOVERY_RUNNING is |
9662 | * still set. |
9663 | */ |
9664 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { |
9665 | unregister_sync_thread(mddev); |
9666 | goto unlock; |
9667 | } |
9668 | |
9669 | /* Set RUNNING before clearing NEEDED to avoid |
9670 | * any transients in the value of "sync_action". |
9671 | */ |
9672 | mddev->curr_resync_completed = 0; |
9673 | spin_lock(&mddev->lock); |
9674 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); |
9675 | spin_unlock(&mddev->lock); |
9676 | /* Clear some bits that don't mean anything, but |
9677 | * might be left set |
9678 | */ |
9679 | clear_bit(MD_RECOVERY_INTR, &mddev->recovery); |
9680 | clear_bit(MD_RECOVERY_DONE, &mddev->recovery); |
9681 | |
9682 | if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && |
9683 | !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { |
9684 | queue_work(md_misc_wq, &mddev->sync_work); |
9685 | } else { |
9686 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); |
9687 | wake_up(&resync_wait); |
9688 | } |
9689 | |
9690 | unlock: |
9691 | wake_up(&mddev->sb_wait); |
9692 | mddev_unlock(mddev); |
9693 | } |
9694 | } |
9695 | EXPORT_SYMBOL(md_check_recovery); |
9696 | |
9697 | void md_reap_sync_thread(struct mddev *mddev) |
9698 | { |
9699 | struct md_rdev *rdev; |
9700 | sector_t old_dev_sectors = mddev->dev_sectors; |
9701 | bool is_reshaped = false; |
9702 | |
9703 | /* resync has finished, collect result */ |
9704 | md_unregister_thread(mddev, &mddev->sync_thread); |
9705 | atomic_inc(&mddev->sync_seq); |
9706 | |
9707 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && |
9708 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && |
9709 | mddev->degraded != mddev->raid_disks) { |
9710 | /* success...*/ |
9711 | /* activate any spares */ |
9712 | if (mddev->pers->spare_active(mddev)) { |
9713 | sysfs_notify_dirent_safe(mddev->sysfs_degraded); |
9714 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
9715 | } |
9716 | } |
9717 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
9718 | mddev->pers->finish_reshape) { |
9719 | mddev->pers->finish_reshape(mddev); |
9720 | if (mddev_is_clustered(mddev)) |
9721 | is_reshaped = true; |
9722 | } |
9723 | |
9724 | /* If array is no-longer degraded, then any saved_raid_disk |
9725 | * information must be scrapped. |
9726 | */ |
9727 | if (!mddev->degraded) |
9728 | rdev_for_each(rdev, mddev) |
9729 | rdev->saved_raid_disk = -1; |
9730 | |
9731 | md_update_sb(mddev, 1); |
9732 | /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can |
9733 | * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by |
9734 | * clustered raid */ |
9735 | if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) |
9736 | md_cluster_ops->resync_finish(mddev); |
9737 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); |
9738 | clear_bit(MD_RECOVERY_DONE, &mddev->recovery); |
9739 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
9740 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); |
9741 | clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); |
9742 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); |
9743 | /* |
9744 | * We call md_cluster_ops->update_size here because sync_size could |
9745 | * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, |
9746 | * so it is time to update size across cluster. |
9747 | */ |
9748 | if (mddev_is_clustered(mddev) && is_reshaped |
9749 | && !test_bit(MD_CLOSING, &mddev->flags)) |
9750 | md_cluster_ops->update_size(mddev, old_dev_sectors); |
9751 | /* flag recovery needed just to double check */ |
9752 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
9753 | sysfs_notify_dirent_safe(mddev->sysfs_completed); |
9754 | sysfs_notify_dirent_safe(mddev->sysfs_action); |
9755 | md_new_event(); |
9756 | if (mddev->event_work.func) |
9757 | queue_work(md_misc_wq, &mddev->event_work); |
9758 | wake_up(&resync_wait); |
9759 | } |
9760 | EXPORT_SYMBOL(md_reap_sync_thread); |
9761 | |
9762 | void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) |
9763 | { |
9764 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
9765 | wait_event_timeout(rdev->blocked_wait, |
9766 | !test_bit(Blocked, &rdev->flags) && |
9767 | !test_bit(BlockedBadBlocks, &rdev->flags), |
9768 | msecs_to_jiffies(5000)); |
9769 | rdev_dec_pending(rdev, mddev); |
9770 | } |
9771 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); |
9772 | |
9773 | void md_finish_reshape(struct mddev *mddev) |
9774 | { |
9775 | /* called by the personality module when a reshape completes. */ |
9776 | struct md_rdev *rdev; |
9777 | |
9778 | rdev_for_each(rdev, mddev) { |
9779 | if (rdev->data_offset > rdev->new_data_offset) |
9780 | rdev->sectors += rdev->data_offset - rdev->new_data_offset; |
9781 | else |
9782 | rdev->sectors -= rdev->new_data_offset - rdev->data_offset; |
9783 | rdev->data_offset = rdev->new_data_offset; |
9784 | } |
9785 | } |
9786 | EXPORT_SYMBOL(md_finish_reshape); |
9787 | |
9788 | /* Bad block management */ |
9789 | |
9790 | /* Returns 1 on success, 0 on failure */ |
9791 | int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
9792 | int is_new) |
9793 | { |
9794 | struct mddev *mddev = rdev->mddev; |
9795 | int rv; |
9796 | if (is_new) |
9797 | s += rdev->new_data_offset; |
9798 | else |
9799 | s += rdev->data_offset; |
9800 | rv = badblocks_set(&rdev->badblocks, s, sectors, 0); |
9801 | if (rv == 0) { |
9802 | /* Make sure they get written out promptly */ |
9803 | if (test_bit(ExternalBbl, &rdev->flags)) |
9804 | sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); |
9805 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
9806 | set_mask_bits(&mddev->sb_flags, 0, |
9807 | BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); |
9808 | md_wakeup_thread(rdev->mddev->thread); |
9809 | return 1; |
9810 | } else |
9811 | return 0; |
9812 | } |
9813 | EXPORT_SYMBOL_GPL(rdev_set_badblocks); |
9814 | |
9815 | int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
9816 | int is_new) |
9817 | { |
9818 | int rv; |
9819 | if (is_new) |
9820 | s += rdev->new_data_offset; |
9821 | else |
9822 | s += rdev->data_offset; |
9823 | rv = badblocks_clear(&rdev->badblocks, s, sectors); |
9824 | if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) |
9825 | sysfs_notify_dirent_safe(rdev->sysfs_badblocks); |
9826 | return rv; |
9827 | } |
9828 | EXPORT_SYMBOL_GPL(rdev_clear_badblocks); |
9829 | |
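/*
 * Reboot notifier: stop writes on every array we can lock, and put
 * persistent arrays into immediate safemode, so that a reboot does not
 * leave arrays needlessly dirty.
 */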
static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct mddev *mddev, *n;
	int need_delay = 0;

	spin_lock(&all_mddevs_lock);
	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		if (mddev_trylock(mddev)) {
			if (mddev->pers)
				__md_stop_writes(mddev);
			if (mddev->persistent)
				mddev->safemode = 2;
			mddev_unlock(mddev);
		}
		need_delay = 1;
		mddev_put(mddev);
		spin_lock(&all_mddevs_lock);
	}
	spin_unlock(&all_mddevs_lock);

	/*
	 * Certain more exotic SCSI devices are known to be volatile with
	 * respect to overly early system reboots.  While the right place to
	 * handle this issue is the individual driver, we do want to have a
	 * safe RAID driver ...
	 */
	if (need_delay)
		msleep(1000);

	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
}

static int __init md_init(void)
{
	int ret = -ENOMEM;

	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
	if (!md_misc_wq)
		goto err_misc_wq;

	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
				       0);
	if (!md_bitmap_wq)
		goto err_bitmap_wq;

	ret = __register_blkdev(MD_MAJOR, "md", md_probe);
	if (ret < 0)
		goto err_md;

	ret = __register_blkdev(0, "mdp", md_probe);
	if (ret < 0)
		goto err_mdp;
	mdp_major = ret;

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl("dev/raid", raid_table);

	md_geninit();
	return 0;

err_mdp:
	unregister_blkdev(MD_MAJOR, "md");
err_md:
	destroy_workqueue(md_bitmap_wq);
err_bitmap_wq:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	return ret;
}

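/*
 * Called on a cluster node after re-reading another node's superblock:
 * propagate any size change, role changes, raid_disks change or reshape
 * state into the local mddev, then bring the local event count up to
 * date.
 */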
static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2, *tmp;
	int role, ret;

	/*
	 * If the size was changed on another node, we need to
	 * resize here as well.
	 */
	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
		if (ret)
			pr_info("md-cluster: resize failed\n");
		else
			md_bitmap_update_sb(mddev->bitmap);
	}

	/* Check for change of roles in the active devices */
	rdev_for_each_safe(rdev2, tmp, mddev) {
		if (test_bit(Faulty, &rdev2->flags))
			continue;

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == MD_DISK_ROLE_FAULTY) {
				pr_info("md: Removing Candidate device %pg because add failed\n",
					rdev2->bdev);
				md_kick_rdev_from_array(rdev2);
				continue;
			} else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role != rdev2->raid_disk) {
			/*
			 * The device got activated, unless a reshape is
			 * happening.
			 */
			if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
			    !(le32_to_cpu(sb->feature_map) &
			      MD_FEATURE_RESHAPE_ACTIVE)) {
				rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %pg\n",
					rdev2->bdev);
				/*
				 * Wake up mddev->thread here, so the array
				 * can resync with the newly activated disk.
				 */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
			/*
			 * The device became faulty.  We just want to do the
			 * minimum to mark the disk as faulty; the recovery
			 * is performed by the node that initiated the error.
			 */
			if (role == MD_DISK_ROLE_FAULTY ||
			    role == MD_DISK_ROLE_JOURNAL) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
		if (ret)
			pr_warn("md: updating array disks failed. %d\n", ret);
	}

	/*
	 * mddev->delta_disks has already been updated in update_raid_disks,
	 * so it is time to check the reshape state.
	 */
	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/*
		 * A reshape is happening on the remote node, so we need to
		 * update reshape_position and call start_reshape.
		 */
		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
		if (mddev->pers->start_reshape)
			mddev->pers->start_reshape(mddev);
	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		   mddev->reshape_position != MaxSector &&
		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/* The reshape has just finished on another node. */
		mddev->reshape_position = MaxSector;
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
	}

	/* Finally set the event count to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}

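/*
 * Re-read @rdev's superblock from disk into a freshly allocated page,
 * restoring the old page if the load fails.  On success, pick up a
 * recovery_offset recorded by another node and, if that node finished
 * recovery, let spare_active() mark the device In_sync.
 */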
static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/*
	 * Stash the rdev's current sb page in the swapout temporary
	 * variable, so that it can be restored if the reload fails.
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
			__func__, __LINE__, rdev->desc_nr, err);
		if (rdev->sb_page)
			put_page(rdev->sb_page);
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
	}

	sb = page_address(rdev->sb_page);
	/*
	 * Pick up the recovery offset only when MD_FEATURE_RECOVERY_OFFSET
	 * is set in the feature map.
	 */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/*
	 * The other node finished recovery; call spare_active to set the
	 * device In_sync and update mddev->degraded.
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify_dirent_safe(mddev->sysfs_degraded);

	put_page(swapout);
	return 0;
}

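/*
 * md_reload_sb() - reload superblocks after another cluster node updated
 * them.  @nr identifies the rdev whose superblock carries the changes;
 * that one is read first and used to drive check_sb_changes(), then all
 * non-faulty rdevs are re-read to refresh their recovery offsets.
 */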
void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev = NULL, *iter;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(iter, mddev) {
		if (iter->desc_nr == nr) {
			rdev = iter;
			break;
		}
	}

	if (!rdev) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Re-read all rdevs to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

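/*
 * Remember a partition that was flagged as a RAID autodetect member;
 * md_autostart_arrays() will import it later.  An allocation failure
 * here is tolerated silently; the device simply won't be autodetected.
 */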
void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		mutex_lock(&detected_devices_mutex);
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
		mutex_unlock(&detected_devices_mutex);
	}
}

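/*
 * Import every device queued by md_autodetect_dev() (as 0.90 format
 * superblocks), collect the usable ones on pending_raid_disks, and hand
 * them to autorun_devices() to be assembled into arrays.
 */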
void md_autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	pr_info("md: Autodetecting RAID arrays.\n");

	mutex_lock(&detected_devices_mutex);
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					       struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		mutex_unlock(&detected_devices_mutex);
		rdev = md_import_device(dev, 0, 90);
		mutex_lock(&detected_devices_mutex);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}
	mutex_unlock(&detected_devices_mutex);

	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

	autorun_devices(part);
}

#endif /* !MODULE */

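/*
 * Module unload: tear down block devices, notifiers and sysctls, wake
 * any pollers of /proc/mdstat so they drop their references, export all
 * remaining arrays, and finally drain the workqueues.
 */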
static __exit void md_exit(void)
{
	struct mddev *mddev, *n;
	int delay = 1;

	unregister_blkdev(MD_MAJOR, "md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/*
	 * We cannot unload the module while some process is waiting for
	 * us in select() or poll() - wake them up.
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
	remove_proc_entry("mdstat", NULL);

	spin_lock(&all_mddevs_lock);
	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		export_array(mddev);
		mddev->ctime = 0;
		mddev->hold_active = 0;
		/*
		 * As the mddev is now fully clear, mddev_put will schedule
		 * the mddev for destruction by a workqueue, and the
		 * destroy_workqueue() below will wait for that to complete.
		 */
		mddev_put(mddev);
		spin_lock(&all_mddevs_lock);
	}
	spin_unlock(&all_mddevs_lock);

	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_bitmap_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

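/*
 * Module parameters (summarised; see the parameter handlers): start_ro
 * makes arrays start read-only until the first write,
 * start_dirty_degraded allows starting arrays that are both dirty and
 * degraded, new_array creates a named array via add_named_array(), and
 * create_on_open controls whether opening a device node is enough to
 * create the array.
 */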
static int get_ro(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d\n", start_readonly);
}
static int set_ro(const char *val, const struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);