1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | md.c : Multiple Devices driver for Linux |
4 | Copyright (C) 1998, 1999, 2000 Ingo Molnar |
5 | |
6 | completely rewritten, based on the MD driver code from Marc Zyngier |
7 | |
8 | Changes: |
9 | |
10 | - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar |
11 | - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> |
12 | - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> |
13 | - kerneld support by Boris Tobotras <boris@xtalk.msk.su> |
14 | - kmod support by: Cyrus Durgin |
15 | - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> |
16 | - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> |
17 | |
18 | - lots of fixes and improvements to the RAID1/RAID5 and generic |
19 | RAID code (such as request based resynchronization): |
20 | |
21 | Neil Brown <neilb@cse.unsw.edu.au>. |
22 | |
23 | - persistent bitmap code |
24 | Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. |
25 | |
26 | |
27 | Errors, Warnings, etc. |
28 | Please use: |
29 | pr_crit() for error conditions that risk data loss |
30 | pr_err() for error conditions that are unexpected, like an IO error |
31 | or internal inconsistency |
32 | pr_warn() for error conditions that could have been predicted, like |
33 | adding a device to an array when it has incompatible metadata |
34 | pr_info() for interesting, very rare events, like an array starting |
35 | or stopping, or resync starting or stopping |
36 | pr_debug() for everything else. |
37 | |
38 | */ |
39 | |
40 | #include <linux/sched/mm.h> |
41 | #include <linux/sched/signal.h> |
42 | #include <linux/kthread.h> |
43 | #include <linux/blkdev.h> |
44 | #include <linux/blk-integrity.h> |
45 | #include <linux/badblocks.h> |
46 | #include <linux/sysctl.h> |
47 | #include <linux/seq_file.h> |
48 | #include <linux/fs.h> |
49 | #include <linux/poll.h> |
50 | #include <linux/ctype.h> |
51 | #include <linux/string.h> |
52 | #include <linux/hdreg.h> |
53 | #include <linux/proc_fs.h> |
54 | #include <linux/random.h> |
55 | #include <linux/major.h> |
56 | #include <linux/module.h> |
57 | #include <linux/reboot.h> |
58 | #include <linux/file.h> |
59 | #include <linux/compat.h> |
60 | #include <linux/delay.h> |
61 | #include <linux/raid/md_p.h> |
62 | #include <linux/raid/md_u.h> |
63 | #include <linux/raid/detect.h> |
64 | #include <linux/slab.h> |
65 | #include <linux/percpu-refcount.h> |
66 | #include <linux/part_stat.h> |
67 | |
68 | #include <trace/events/block.h> |
69 | #include "md.h" |
70 | #include "md-bitmap.h" |
71 | #include "md-cluster.h" |
72 | |
73 | /* pers_list is a list of registered personalities protected by pers_lock. */ |
74 | static LIST_HEAD(pers_list); |
75 | static DEFINE_SPINLOCK(pers_lock); |
76 | |
77 | static const struct kobj_type md_ktype; |
78 | |
79 | struct md_cluster_operations *md_cluster_ops; |
80 | EXPORT_SYMBOL(md_cluster_ops); |
81 | static struct module *md_cluster_mod; |
82 | |
83 | static DECLARE_WAIT_QUEUE_HEAD(resync_wait); |
84 | static struct workqueue_struct *md_wq; |
85 | static struct workqueue_struct *md_misc_wq; |
86 | struct workqueue_struct *md_bitmap_wq; |
87 | |
88 | static int remove_and_add_spares(struct mddev *mddev, |
89 | struct md_rdev *this); |
90 | static void mddev_detach(struct mddev *mddev); |
91 | static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); |
92 | static void md_wakeup_thread_directly(struct md_thread __rcu *thread); |
93 | |
/*
 * Values stored in mddev->ro.  MD_MAX_STATE is one past the last valid
 * state.
 */
enum md_ro_state {
	MD_RDWR = 0,
	MD_RDONLY,
	MD_AUTO_READ,
	MD_MAX_STATE
};
100 | |
101 | static bool md_is_rdwr(struct mddev *mddev) |
102 | { |
103 | return (mddev->ro == MD_RDWR); |
104 | } |
105 | |
/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS	20

/* Default safemode delay: 200 msec (expressed via HZ, plus one) */
#define DEFAULT_SAFEMODE_DELAY	((200 * HZ) / 1000 + 1)
114 | /* |
115 | * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' |
116 | * is 1000 KB/sec, so the extra system load does not show up that much. |
117 | * Increase it if you want to have more _guaranteed_ speed. Note that |
118 | * the RAID driver will use the maximum available bandwidth if the IO |
119 | * subsystem is idle. There is also an 'absolute maximum' reconstruction |
120 | * speed limit - in case reconstruction slows down your system despite |
121 | * idle IO detection. |
122 | * |
123 | * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. |
124 | * or /sys/block/mdX/md/sync_speed_{min,max} |
125 | */ |
126 | |
/* System-wide defaults, used when an array has no per-array speed set. */
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
129 | static inline int speed_min(struct mddev *mddev) |
130 | { |
131 | return mddev->sync_speed_min ? |
132 | mddev->sync_speed_min : sysctl_speed_limit_min; |
133 | } |
134 | |
135 | static inline int speed_max(struct mddev *mddev) |
136 | { |
137 | return mddev->sync_speed_max ? |
138 | mddev->sync_speed_max : sysctl_speed_limit_max; |
139 | } |
140 | |
141 | static void rdev_uninit_serial(struct md_rdev *rdev) |
142 | { |
143 | if (!test_and_clear_bit(nr: CollisionCheck, addr: &rdev->flags)) |
144 | return; |
145 | |
146 | kvfree(addr: rdev->serial); |
147 | rdev->serial = NULL; |
148 | } |
149 | |
150 | static void rdevs_uninit_serial(struct mddev *mddev) |
151 | { |
152 | struct md_rdev *rdev; |
153 | |
154 | rdev_for_each(rdev, mddev) |
155 | rdev_uninit_serial(rdev); |
156 | } |
157 | |
158 | static int rdev_init_serial(struct md_rdev *rdev) |
159 | { |
160 | /* serial_nums equals with BARRIER_BUCKETS_NR */ |
161 | int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); |
162 | struct serial_in_rdev *serial = NULL; |
163 | |
164 | if (test_bit(CollisionCheck, &rdev->flags)) |
165 | return 0; |
166 | |
167 | serial = kvmalloc(size: sizeof(struct serial_in_rdev) * serial_nums, |
168 | GFP_KERNEL); |
169 | if (!serial) |
170 | return -ENOMEM; |
171 | |
172 | for (i = 0; i < serial_nums; i++) { |
173 | struct serial_in_rdev *serial_tmp = &serial[i]; |
174 | |
175 | spin_lock_init(&serial_tmp->serial_lock); |
176 | serial_tmp->serial_rb = RB_ROOT_CACHED; |
177 | init_waitqueue_head(&serial_tmp->serial_io_wait); |
178 | } |
179 | |
180 | rdev->serial = serial; |
181 | set_bit(nr: CollisionCheck, addr: &rdev->flags); |
182 | |
183 | return 0; |
184 | } |
185 | |
186 | static int rdevs_init_serial(struct mddev *mddev) |
187 | { |
188 | struct md_rdev *rdev; |
189 | int ret = 0; |
190 | |
191 | rdev_for_each(rdev, mddev) { |
192 | ret = rdev_init_serial(rdev); |
193 | if (ret) |
194 | break; |
195 | } |
196 | |
197 | /* Free all resources if pool is not existed */ |
198 | if (ret && !mddev->serial_info_pool) |
199 | rdevs_uninit_serial(mddev); |
200 | |
201 | return ret; |
202 | } |
203 | |
204 | /* |
205 | * rdev needs to enable serial stuffs if it meets the conditions: |
206 | * 1. it is multi-queue device flaged with writemostly. |
207 | * 2. the write-behind mode is enabled. |
208 | */ |
209 | static int rdev_need_serial(struct md_rdev *rdev) |
210 | { |
211 | return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && |
212 | rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && |
213 | test_bit(WriteMostly, &rdev->flags)); |
214 | } |
215 | |
216 | /* |
217 | * Init resource for rdev(s), then create serial_info_pool if: |
218 | * 1. rdev is the first device which return true from rdev_enable_serial. |
219 | * 2. rdev is NULL, means we want to enable serialization for all rdevs. |
220 | */ |
221 | void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) |
222 | { |
223 | int ret = 0; |
224 | |
225 | if (rdev && !rdev_need_serial(rdev) && |
226 | !test_bit(CollisionCheck, &rdev->flags)) |
227 | return; |
228 | |
229 | if (!rdev) |
230 | ret = rdevs_init_serial(mddev); |
231 | else |
232 | ret = rdev_init_serial(rdev); |
233 | if (ret) |
234 | return; |
235 | |
236 | if (mddev->serial_info_pool == NULL) { |
237 | /* |
238 | * already in memalloc noio context by |
239 | * mddev_suspend() |
240 | */ |
241 | mddev->serial_info_pool = |
242 | mempool_create_kmalloc_pool(NR_SERIAL_INFOS, |
243 | size: sizeof(struct serial_info)); |
244 | if (!mddev->serial_info_pool) { |
245 | rdevs_uninit_serial(mddev); |
246 | pr_err("can't alloc memory pool for serialization\n" ); |
247 | } |
248 | } |
249 | } |
250 | |
251 | /* |
252 | * Free resource from rdev(s), and destroy serial_info_pool under conditions: |
253 | * 1. rdev is the last device flaged with CollisionCheck. |
254 | * 2. when bitmap is destroyed while policy is not enabled. |
255 | * 3. for disable policy, the pool is destroyed only when no rdev needs it. |
256 | */ |
257 | void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) |
258 | { |
259 | if (rdev && !test_bit(CollisionCheck, &rdev->flags)) |
260 | return; |
261 | |
262 | if (mddev->serial_info_pool) { |
263 | struct md_rdev *temp; |
264 | int num = 0; /* used to track if other rdevs need the pool */ |
265 | |
266 | rdev_for_each(temp, mddev) { |
267 | if (!rdev) { |
268 | if (!mddev->serialize_policy || |
269 | !rdev_need_serial(rdev: temp)) |
270 | rdev_uninit_serial(rdev: temp); |
271 | else |
272 | num++; |
273 | } else if (temp != rdev && |
274 | test_bit(CollisionCheck, &temp->flags)) |
275 | num++; |
276 | } |
277 | |
278 | if (rdev) |
279 | rdev_uninit_serial(rdev); |
280 | |
281 | if (num) |
282 | pr_info("The mempool could be used by other devices\n" ); |
283 | else { |
284 | mempool_destroy(pool: mddev->serial_info_pool); |
285 | mddev->serial_info_pool = NULL; |
286 | } |
287 | } |
288 | } |
289 | |
/* Handle for the registered raid sysctl table (see raid_table below). */
static struct ctl_table_header *raid_table_header;
291 | |
292 | static struct ctl_table raid_table[] = { |
293 | { |
294 | .procname = "speed_limit_min" , |
295 | .data = &sysctl_speed_limit_min, |
296 | .maxlen = sizeof(int), |
297 | .mode = S_IRUGO|S_IWUSR, |
298 | .proc_handler = proc_dointvec, |
299 | }, |
300 | { |
301 | .procname = "speed_limit_max" , |
302 | .data = &sysctl_speed_limit_max, |
303 | .maxlen = sizeof(int), |
304 | .mode = S_IRUGO|S_IWUSR, |
305 | .proc_handler = proc_dointvec, |
306 | }, |
307 | }; |
308 | |
309 | static int start_readonly; |
310 | |
311 | /* |
312 | * The original mechanism for creating an md device is to create |
313 | * a device node in /dev and to open it. This causes races with device-close. |
314 | * The preferred method is to write to the "new_array" module parameter. |
315 | * This can avoid races. |
316 | * Setting create_on_open to false disables the original mechanism |
317 | * so all the races disappear. |
318 | */ |
319 | static bool create_on_open = true; |
320 | |
321 | /* |
322 | * We have a system wide 'event count' that is incremented |
323 | * on any 'interesting' event, and readers of /proc/mdstat |
324 | * can use 'poll' or 'select' to find out when the event |
325 | * count increases. |
326 | * |
327 | * Events are: |
328 | * start array, stop array, error, add device, remove device, |
329 | * start build, activate spare |
330 | */ |
331 | static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); |
332 | static atomic_t md_event_count; |
333 | void md_new_event(void) |
334 | { |
335 | atomic_inc(v: &md_event_count); |
336 | wake_up(&md_event_waiters); |
337 | } |
338 | EXPORT_SYMBOL_GPL(md_new_event); |
339 | |
/*
 * List of all existing md arrays, used to iterate over them.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
346 | |
347 | static bool is_md_suspended(struct mddev *mddev) |
348 | { |
349 | return percpu_ref_is_dying(ref: &mddev->active_io); |
350 | } |
351 | /* Rather than calling directly into the personality make_request function, |
352 | * IO requests come here first so that we can check if the device is |
353 | * being suspended pending a reconfiguration. |
354 | * We hold a refcount over the call to ->make_request. By the time that |
355 | * call has finished, the bio has been linked into some internal structure |
356 | * and so is visible to ->quiesce(), so we don't need the refcount any more. |
357 | */ |
358 | static bool is_suspended(struct mddev *mddev, struct bio *bio) |
359 | { |
360 | if (is_md_suspended(mddev)) |
361 | return true; |
362 | if (bio_data_dir(bio) != WRITE) |
363 | return false; |
364 | if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) |
365 | return false; |
366 | if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) |
367 | return false; |
368 | if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) |
369 | return false; |
370 | return true; |
371 | } |
372 | |
373 | void md_handle_request(struct mddev *mddev, struct bio *bio) |
374 | { |
375 | check_suspended: |
376 | if (is_suspended(mddev, bio)) { |
377 | DEFINE_WAIT(__wait); |
378 | /* Bail out if REQ_NOWAIT is set for the bio */ |
379 | if (bio->bi_opf & REQ_NOWAIT) { |
380 | bio_wouldblock_error(bio); |
381 | return; |
382 | } |
383 | for (;;) { |
384 | prepare_to_wait(wq_head: &mddev->sb_wait, wq_entry: &__wait, |
385 | TASK_UNINTERRUPTIBLE); |
386 | if (!is_suspended(mddev, bio)) |
387 | break; |
388 | schedule(); |
389 | } |
390 | finish_wait(wq_head: &mddev->sb_wait, wq_entry: &__wait); |
391 | } |
392 | if (!percpu_ref_tryget_live(ref: &mddev->active_io)) |
393 | goto check_suspended; |
394 | |
395 | if (!mddev->pers->make_request(mddev, bio)) { |
396 | percpu_ref_put(ref: &mddev->active_io); |
397 | goto check_suspended; |
398 | } |
399 | |
400 | percpu_ref_put(ref: &mddev->active_io); |
401 | } |
402 | EXPORT_SYMBOL(md_handle_request); |
403 | |
404 | static void md_submit_bio(struct bio *bio) |
405 | { |
406 | const int rw = bio_data_dir(bio); |
407 | struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; |
408 | |
409 | if (mddev == NULL || mddev->pers == NULL) { |
410 | bio_io_error(bio); |
411 | return; |
412 | } |
413 | |
414 | if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { |
415 | bio_io_error(bio); |
416 | return; |
417 | } |
418 | |
419 | bio = bio_split_to_limits(bio); |
420 | if (!bio) |
421 | return; |
422 | |
423 | if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { |
424 | if (bio_sectors(bio) != 0) |
425 | bio->bi_status = BLK_STS_IOERR; |
426 | bio_endio(bio); |
427 | return; |
428 | } |
429 | |
430 | /* bio could be mergeable after passing to underlayer */ |
431 | bio->bi_opf &= ~REQ_NOMERGE; |
432 | |
433 | md_handle_request(mddev, bio); |
434 | } |
435 | |
436 | /* |
437 | * Make sure no new requests are submitted to the device, and any requests that |
438 | * have been submitted are completely handled. |
439 | */ |
440 | int mddev_suspend(struct mddev *mddev, bool interruptible) |
441 | { |
442 | int err = 0; |
443 | |
444 | /* |
445 | * hold reconfig_mutex to wait for normal io will deadlock, because |
446 | * other context can't update super_block, and normal io can rely on |
447 | * updating super_block. |
448 | */ |
449 | lockdep_assert_not_held(&mddev->reconfig_mutex); |
450 | |
451 | if (interruptible) |
452 | err = mutex_lock_interruptible(&mddev->suspend_mutex); |
453 | else |
454 | mutex_lock(&mddev->suspend_mutex); |
455 | if (err) |
456 | return err; |
457 | |
458 | if (mddev->suspended) { |
459 | WRITE_ONCE(mddev->suspended, mddev->suspended + 1); |
460 | mutex_unlock(lock: &mddev->suspend_mutex); |
461 | return 0; |
462 | } |
463 | |
464 | percpu_ref_kill(ref: &mddev->active_io); |
465 | if (interruptible) |
466 | err = wait_event_interruptible(mddev->sb_wait, |
467 | percpu_ref_is_zero(&mddev->active_io)); |
468 | else |
469 | wait_event(mddev->sb_wait, |
470 | percpu_ref_is_zero(&mddev->active_io)); |
471 | if (err) { |
472 | percpu_ref_resurrect(ref: &mddev->active_io); |
473 | mutex_unlock(lock: &mddev->suspend_mutex); |
474 | return err; |
475 | } |
476 | |
477 | /* |
478 | * For raid456, io might be waiting for reshape to make progress, |
479 | * allow new reshape to start while waiting for io to be done to |
480 | * prevent deadlock. |
481 | */ |
482 | WRITE_ONCE(mddev->suspended, mddev->suspended + 1); |
483 | |
484 | del_timer_sync(timer: &mddev->safemode_timer); |
485 | /* restrict memory reclaim I/O during raid array is suspend */ |
486 | mddev->noio_flag = memalloc_noio_save(); |
487 | |
488 | mutex_unlock(lock: &mddev->suspend_mutex); |
489 | return 0; |
490 | } |
491 | EXPORT_SYMBOL_GPL(mddev_suspend); |
492 | |
493 | void mddev_resume(struct mddev *mddev) |
494 | { |
495 | lockdep_assert_not_held(&mddev->reconfig_mutex); |
496 | |
497 | mutex_lock(&mddev->suspend_mutex); |
498 | WRITE_ONCE(mddev->suspended, mddev->suspended - 1); |
499 | if (mddev->suspended) { |
500 | mutex_unlock(lock: &mddev->suspend_mutex); |
501 | return; |
502 | } |
503 | |
504 | /* entred the memalloc scope from mddev_suspend() */ |
505 | memalloc_noio_restore(flags: mddev->noio_flag); |
506 | |
507 | percpu_ref_resurrect(ref: &mddev->active_io); |
508 | wake_up(&mddev->sb_wait); |
509 | |
510 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
511 | md_wakeup_thread(thread: mddev->thread); |
512 | md_wakeup_thread(thread: mddev->sync_thread); /* possibly kick off a reshape */ |
513 | |
514 | mutex_unlock(lock: &mddev->suspend_mutex); |
515 | } |
516 | EXPORT_SYMBOL_GPL(mddev_resume); |
517 | |
518 | /* |
519 | * Generic flush handling for md |
520 | */ |
521 | |
522 | static void md_end_flush(struct bio *bio) |
523 | { |
524 | struct md_rdev *rdev = bio->bi_private; |
525 | struct mddev *mddev = rdev->mddev; |
526 | |
527 | bio_put(bio); |
528 | |
529 | rdev_dec_pending(rdev, mddev); |
530 | |
531 | if (atomic_dec_and_test(v: &mddev->flush_pending)) { |
532 | /* The pre-request flush has finished */ |
533 | queue_work(wq: md_wq, work: &mddev->flush_work); |
534 | } |
535 | } |
536 | |
537 | static void md_submit_flush_data(struct work_struct *ws); |
538 | |
539 | static void submit_flushes(struct work_struct *ws) |
540 | { |
541 | struct mddev *mddev = container_of(ws, struct mddev, flush_work); |
542 | struct md_rdev *rdev; |
543 | |
544 | mddev->start_flush = ktime_get_boottime(); |
545 | INIT_WORK(&mddev->flush_work, md_submit_flush_data); |
546 | atomic_set(v: &mddev->flush_pending, i: 1); |
547 | rcu_read_lock(); |
548 | rdev_for_each_rcu(rdev, mddev) |
549 | if (rdev->raid_disk >= 0 && |
550 | !test_bit(Faulty, &rdev->flags)) { |
551 | /* Take two references, one is dropped |
552 | * when request finishes, one after |
553 | * we reclaim rcu_read_lock |
554 | */ |
555 | struct bio *bi; |
556 | atomic_inc(v: &rdev->nr_pending); |
557 | atomic_inc(v: &rdev->nr_pending); |
558 | rcu_read_unlock(); |
559 | bi = bio_alloc_bioset(bdev: rdev->bdev, nr_vecs: 0, |
560 | opf: REQ_OP_WRITE | REQ_PREFLUSH, |
561 | GFP_NOIO, bs: &mddev->bio_set); |
562 | bi->bi_end_io = md_end_flush; |
563 | bi->bi_private = rdev; |
564 | atomic_inc(v: &mddev->flush_pending); |
565 | submit_bio(bio: bi); |
566 | rcu_read_lock(); |
567 | rdev_dec_pending(rdev, mddev); |
568 | } |
569 | rcu_read_unlock(); |
570 | if (atomic_dec_and_test(v: &mddev->flush_pending)) |
571 | queue_work(wq: md_wq, work: &mddev->flush_work); |
572 | } |
573 | |
574 | static void md_submit_flush_data(struct work_struct *ws) |
575 | { |
576 | struct mddev *mddev = container_of(ws, struct mddev, flush_work); |
577 | struct bio *bio = mddev->flush_bio; |
578 | |
579 | /* |
580 | * must reset flush_bio before calling into md_handle_request to avoid a |
581 | * deadlock, because other bios passed md_handle_request suspend check |
582 | * could wait for this and below md_handle_request could wait for those |
583 | * bios because of suspend check |
584 | */ |
585 | spin_lock_irq(lock: &mddev->lock); |
586 | mddev->prev_flush_start = mddev->start_flush; |
587 | mddev->flush_bio = NULL; |
588 | spin_unlock_irq(lock: &mddev->lock); |
589 | wake_up(&mddev->sb_wait); |
590 | |
591 | if (bio->bi_iter.bi_size == 0) { |
592 | /* an empty barrier - all done */ |
593 | bio_endio(bio); |
594 | } else { |
595 | bio->bi_opf &= ~REQ_PREFLUSH; |
596 | md_handle_request(mddev, bio); |
597 | } |
598 | } |
599 | |
600 | /* |
601 | * Manages consolidation of flushes and submitting any flushes needed for |
602 | * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is |
603 | * being finished in another context. Returns false if the flushing is |
604 | * complete but still needs the I/O portion of the bio to be processed. |
605 | */ |
606 | bool md_flush_request(struct mddev *mddev, struct bio *bio) |
607 | { |
608 | ktime_t req_start = ktime_get_boottime(); |
609 | spin_lock_irq(lock: &mddev->lock); |
610 | /* flush requests wait until ongoing flush completes, |
611 | * hence coalescing all the pending requests. |
612 | */ |
613 | wait_event_lock_irq(mddev->sb_wait, |
614 | !mddev->flush_bio || |
615 | ktime_before(req_start, mddev->prev_flush_start), |
616 | mddev->lock); |
617 | /* new request after previous flush is completed */ |
618 | if (ktime_after(cmp1: req_start, cmp2: mddev->prev_flush_start)) { |
619 | WARN_ON(mddev->flush_bio); |
620 | mddev->flush_bio = bio; |
621 | bio = NULL; |
622 | } |
623 | spin_unlock_irq(lock: &mddev->lock); |
624 | |
625 | if (!bio) { |
626 | INIT_WORK(&mddev->flush_work, submit_flushes); |
627 | queue_work(wq: md_wq, work: &mddev->flush_work); |
628 | } else { |
629 | /* flush was performed for some other bio while we waited. */ |
630 | if (bio->bi_iter.bi_size == 0) |
631 | /* an empty barrier - all done */ |
632 | bio_endio(bio); |
633 | else { |
634 | bio->bi_opf &= ~REQ_PREFLUSH; |
635 | return false; |
636 | } |
637 | } |
638 | return true; |
639 | } |
640 | EXPORT_SYMBOL(md_flush_request); |
641 | |
642 | static inline struct mddev *mddev_get(struct mddev *mddev) |
643 | { |
644 | lockdep_assert_held(&all_mddevs_lock); |
645 | |
646 | if (test_bit(MD_DELETED, &mddev->flags)) |
647 | return NULL; |
648 | atomic_inc(v: &mddev->active); |
649 | return mddev; |
650 | } |
651 | |
652 | static void mddev_delayed_delete(struct work_struct *ws); |
653 | |
654 | static void __mddev_put(struct mddev *mddev) |
655 | { |
656 | if (mddev->raid_disks || !list_empty(head: &mddev->disks) || |
657 | mddev->ctime || mddev->hold_active) |
658 | return; |
659 | |
660 | /* Array is not configured at all, and not held active, so destroy it */ |
661 | set_bit(nr: MD_DELETED, addr: &mddev->flags); |
662 | |
663 | /* |
664 | * Call queue_work inside the spinlock so that flush_workqueue() after |
665 | * mddev_find will succeed in waiting for the work to be done. |
666 | */ |
667 | queue_work(wq: md_misc_wq, work: &mddev->del_work); |
668 | } |
669 | |
670 | void mddev_put(struct mddev *mddev) |
671 | { |
672 | if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) |
673 | return; |
674 | |
675 | __mddev_put(mddev); |
676 | spin_unlock(lock: &all_mddevs_lock); |
677 | } |
678 | |
/* Callbacks installed by mddev_init(); defined later in this file. */
static void md_safemode_timeout(struct timer_list *t);
static void md_start_sync(struct work_struct *ws);
681 | |
682 | static void active_io_release(struct percpu_ref *ref) |
683 | { |
684 | struct mddev *mddev = container_of(ref, struct mddev, active_io); |
685 | |
686 | wake_up(&mddev->sb_wait); |
687 | } |
688 | |
/* Release callback that does nothing; used for writes_pending. */
static void no_op(struct percpu_ref *r)
{
}
690 | |
691 | int mddev_init(struct mddev *mddev) |
692 | { |
693 | |
694 | if (percpu_ref_init(ref: &mddev->active_io, release: active_io_release, |
695 | flags: PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) |
696 | return -ENOMEM; |
697 | |
698 | if (percpu_ref_init(ref: &mddev->writes_pending, release: no_op, |
699 | flags: PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { |
700 | percpu_ref_exit(ref: &mddev->active_io); |
701 | return -ENOMEM; |
702 | } |
703 | |
704 | /* We want to start with the refcount at zero */ |
705 | percpu_ref_put(ref: &mddev->writes_pending); |
706 | |
707 | mutex_init(&mddev->open_mutex); |
708 | mutex_init(&mddev->reconfig_mutex); |
709 | mutex_init(&mddev->sync_mutex); |
710 | mutex_init(&mddev->suspend_mutex); |
711 | mutex_init(&mddev->bitmap_info.mutex); |
712 | INIT_LIST_HEAD(list: &mddev->disks); |
713 | INIT_LIST_HEAD(list: &mddev->all_mddevs); |
714 | INIT_LIST_HEAD(list: &mddev->deleting); |
715 | timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); |
716 | atomic_set(v: &mddev->active, i: 1); |
717 | atomic_set(v: &mddev->openers, i: 0); |
718 | atomic_set(v: &mddev->sync_seq, i: 0); |
719 | spin_lock_init(&mddev->lock); |
720 | atomic_set(v: &mddev->flush_pending, i: 0); |
721 | init_waitqueue_head(&mddev->sb_wait); |
722 | init_waitqueue_head(&mddev->recovery_wait); |
723 | mddev->reshape_position = MaxSector; |
724 | mddev->reshape_backwards = 0; |
725 | mddev->last_sync_action = "none" ; |
726 | mddev->resync_min = 0; |
727 | mddev->resync_max = MaxSector; |
728 | mddev->level = LEVEL_NONE; |
729 | |
730 | INIT_WORK(&mddev->sync_work, md_start_sync); |
731 | INIT_WORK(&mddev->del_work, mddev_delayed_delete); |
732 | |
733 | return 0; |
734 | } |
735 | EXPORT_SYMBOL_GPL(mddev_init); |
736 | |
737 | void mddev_destroy(struct mddev *mddev) |
738 | { |
739 | percpu_ref_exit(ref: &mddev->active_io); |
740 | percpu_ref_exit(ref: &mddev->writes_pending); |
741 | } |
742 | EXPORT_SYMBOL_GPL(mddev_destroy); |
743 | |
744 | static struct mddev *mddev_find_locked(dev_t unit) |
745 | { |
746 | struct mddev *mddev; |
747 | |
748 | list_for_each_entry(mddev, &all_mddevs, all_mddevs) |
749 | if (mddev->unit == unit) |
750 | return mddev; |
751 | |
752 | return NULL; |
753 | } |
754 | |
755 | /* find an unused unit number */ |
756 | static dev_t mddev_alloc_unit(void) |
757 | { |
758 | static int next_minor = 512; |
759 | int start = next_minor; |
760 | bool is_free = 0; |
761 | dev_t dev = 0; |
762 | |
763 | while (!is_free) { |
764 | dev = MKDEV(MD_MAJOR, next_minor); |
765 | next_minor++; |
766 | if (next_minor > MINORMASK) |
767 | next_minor = 0; |
768 | if (next_minor == start) |
769 | return 0; /* Oh dear, all in use. */ |
770 | is_free = !mddev_find_locked(unit: dev); |
771 | } |
772 | |
773 | return dev; |
774 | } |
775 | |
776 | static struct mddev *mddev_alloc(dev_t unit) |
777 | { |
778 | struct mddev *new; |
779 | int error; |
780 | |
781 | if (unit && MAJOR(unit) != MD_MAJOR) |
782 | unit &= ~((1 << MdpMinorShift) - 1); |
783 | |
784 | new = kzalloc(size: sizeof(*new), GFP_KERNEL); |
785 | if (!new) |
786 | return ERR_PTR(error: -ENOMEM); |
787 | |
788 | error = mddev_init(new); |
789 | if (error) |
790 | goto out_free_new; |
791 | |
792 | spin_lock(lock: &all_mddevs_lock); |
793 | if (unit) { |
794 | error = -EEXIST; |
795 | if (mddev_find_locked(unit)) |
796 | goto out_destroy_new; |
797 | new->unit = unit; |
798 | if (MAJOR(unit) == MD_MAJOR) |
799 | new->md_minor = MINOR(unit); |
800 | else |
801 | new->md_minor = MINOR(unit) >> MdpMinorShift; |
802 | new->hold_active = UNTIL_IOCTL; |
803 | } else { |
804 | error = -ENODEV; |
805 | new->unit = mddev_alloc_unit(); |
806 | if (!new->unit) |
807 | goto out_destroy_new; |
808 | new->md_minor = MINOR(new->unit); |
809 | new->hold_active = UNTIL_STOP; |
810 | } |
811 | |
812 | list_add(new: &new->all_mddevs, head: &all_mddevs); |
813 | spin_unlock(lock: &all_mddevs_lock); |
814 | return new; |
815 | |
816 | out_destroy_new: |
817 | spin_unlock(lock: &all_mddevs_lock); |
818 | mddev_destroy(new); |
819 | out_free_new: |
820 | kfree(objp: new); |
821 | return ERR_PTR(error); |
822 | } |
823 | |
824 | static void mddev_free(struct mddev *mddev) |
825 | { |
826 | spin_lock(lock: &all_mddevs_lock); |
827 | list_del(entry: &mddev->all_mddevs); |
828 | spin_unlock(lock: &all_mddevs_lock); |
829 | |
830 | mddev_destroy(mddev); |
831 | kfree(objp: mddev); |
832 | } |
833 | |
834 | static const struct attribute_group md_redundancy_group; |
835 | |
836 | void mddev_unlock(struct mddev *mddev) |
837 | { |
838 | struct md_rdev *rdev; |
839 | struct md_rdev *tmp; |
840 | LIST_HEAD(delete); |
841 | |
842 | if (!list_empty(head: &mddev->deleting)) |
843 | list_splice_init(list: &mddev->deleting, head: &delete); |
844 | |
845 | if (mddev->to_remove) { |
846 | /* These cannot be removed under reconfig_mutex as |
847 | * an access to the files will try to take reconfig_mutex |
848 | * while holding the file unremovable, which leads to |
849 | * a deadlock. |
850 | * So hold set sysfs_active while the remove in happeing, |
851 | * and anything else which might set ->to_remove or my |
852 | * otherwise change the sysfs namespace will fail with |
853 | * -EBUSY if sysfs_active is still set. |
854 | * We set sysfs_active under reconfig_mutex and elsewhere |
855 | * test it under the same mutex to ensure its correct value |
856 | * is seen. |
857 | */ |
858 | const struct attribute_group *to_remove = mddev->to_remove; |
859 | mddev->to_remove = NULL; |
860 | mddev->sysfs_active = 1; |
861 | mutex_unlock(lock: &mddev->reconfig_mutex); |
862 | |
863 | if (mddev->kobj.sd) { |
864 | if (to_remove != &md_redundancy_group) |
865 | sysfs_remove_group(kobj: &mddev->kobj, grp: to_remove); |
866 | if (mddev->pers == NULL || |
867 | mddev->pers->sync_request == NULL) { |
868 | sysfs_remove_group(kobj: &mddev->kobj, grp: &md_redundancy_group); |
869 | if (mddev->sysfs_action) |
870 | sysfs_put(kn: mddev->sysfs_action); |
871 | if (mddev->sysfs_completed) |
872 | sysfs_put(kn: mddev->sysfs_completed); |
873 | if (mddev->sysfs_degraded) |
874 | sysfs_put(kn: mddev->sysfs_degraded); |
875 | mddev->sysfs_action = NULL; |
876 | mddev->sysfs_completed = NULL; |
877 | mddev->sysfs_degraded = NULL; |
878 | } |
879 | } |
880 | mddev->sysfs_active = 0; |
881 | } else |
882 | mutex_unlock(lock: &mddev->reconfig_mutex); |
883 | |
884 | md_wakeup_thread(thread: mddev->thread); |
885 | wake_up(&mddev->sb_wait); |
886 | |
887 | list_for_each_entry_safe(rdev, tmp, &delete, same_set) { |
888 | list_del_init(entry: &rdev->same_set); |
889 | kobject_del(kobj: &rdev->kobj); |
890 | export_rdev(rdev, mddev); |
891 | } |
892 | } |
893 | EXPORT_SYMBOL_GPL(mddev_unlock); |
894 | |
895 | struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) |
896 | { |
897 | struct md_rdev *rdev; |
898 | |
899 | rdev_for_each_rcu(rdev, mddev) |
900 | if (rdev->desc_nr == nr) |
901 | return rdev; |
902 | |
903 | return NULL; |
904 | } |
905 | EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); |
906 | |
907 | static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) |
908 | { |
909 | struct md_rdev *rdev; |
910 | |
911 | rdev_for_each(rdev, mddev) |
912 | if (rdev->bdev->bd_dev == dev) |
913 | return rdev; |
914 | |
915 | return NULL; |
916 | } |
917 | |
918 | struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) |
919 | { |
920 | struct md_rdev *rdev; |
921 | |
922 | rdev_for_each_rcu(rdev, mddev) |
923 | if (rdev->bdev->bd_dev == dev) |
924 | return rdev; |
925 | |
926 | return NULL; |
927 | } |
928 | EXPORT_SYMBOL_GPL(md_find_rdev_rcu); |
929 | |
930 | static struct md_personality *find_pers(int level, char *clevel) |
931 | { |
932 | struct md_personality *pers; |
933 | list_for_each_entry(pers, &pers_list, list) { |
934 | if (level != LEVEL_NONE && pers->level == level) |
935 | return pers; |
936 | if (strcmp(pers->name, clevel)==0) |
937 | return pers; |
938 | } |
939 | return NULL; |
940 | } |
941 | |
942 | /* return the offset of the super block in 512byte sectors */ |
943 | static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) |
944 | { |
945 | return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); |
946 | } |
947 | |
948 | static int alloc_disk_sb(struct md_rdev *rdev) |
949 | { |
950 | rdev->sb_page = alloc_page(GFP_KERNEL); |
951 | if (!rdev->sb_page) |
952 | return -ENOMEM; |
953 | return 0; |
954 | } |
955 | |
956 | void md_rdev_clear(struct md_rdev *rdev) |
957 | { |
958 | if (rdev->sb_page) { |
959 | put_page(page: rdev->sb_page); |
960 | rdev->sb_loaded = 0; |
961 | rdev->sb_page = NULL; |
962 | rdev->sb_start = 0; |
963 | rdev->sectors = 0; |
964 | } |
965 | if (rdev->bb_page) { |
966 | put_page(page: rdev->bb_page); |
967 | rdev->bb_page = NULL; |
968 | } |
969 | badblocks_exit(bb: &rdev->badblocks); |
970 | } |
971 | EXPORT_SYMBOL_GPL(md_rdev_clear); |
972 | |
973 | static void super_written(struct bio *bio) |
974 | { |
975 | struct md_rdev *rdev = bio->bi_private; |
976 | struct mddev *mddev = rdev->mddev; |
977 | |
978 | if (bio->bi_status) { |
979 | pr_err("md: %s gets error=%d\n" , __func__, |
980 | blk_status_to_errno(bio->bi_status)); |
981 | md_error(mddev, rdev); |
982 | if (!test_bit(Faulty, &rdev->flags) |
983 | && (bio->bi_opf & MD_FAILFAST)) { |
984 | set_bit(nr: MD_SB_NEED_REWRITE, addr: &mddev->sb_flags); |
985 | set_bit(nr: LastDev, addr: &rdev->flags); |
986 | } |
987 | } else |
988 | clear_bit(nr: LastDev, addr: &rdev->flags); |
989 | |
990 | bio_put(bio); |
991 | |
992 | rdev_dec_pending(rdev, mddev); |
993 | |
994 | if (atomic_dec_and_test(v: &mddev->pending_writes)) |
995 | wake_up(&mddev->sb_wait); |
996 | } |
997 | |
998 | void md_super_write(struct mddev *mddev, struct md_rdev *rdev, |
999 | sector_t sector, int size, struct page *page) |
1000 | { |
1001 | /* write first size bytes of page to sector of rdev |
1002 | * Increment mddev->pending_writes before returning |
1003 | * and decrement it on completion, waking up sb_wait |
1004 | * if zero is reached. |
1005 | * If an error occurred, call md_error |
1006 | */ |
1007 | struct bio *bio; |
1008 | |
1009 | if (!page) |
1010 | return; |
1011 | |
1012 | if (test_bit(Faulty, &rdev->flags)) |
1013 | return; |
1014 | |
1015 | bio = bio_alloc_bioset(bdev: rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, |
1016 | nr_vecs: 1, |
1017 | opf: REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA, |
1018 | GFP_NOIO, bs: &mddev->sync_set); |
1019 | |
1020 | atomic_inc(v: &rdev->nr_pending); |
1021 | |
1022 | bio->bi_iter.bi_sector = sector; |
1023 | __bio_add_page(bio, page, len: size, off: 0); |
1024 | bio->bi_private = rdev; |
1025 | bio->bi_end_io = super_written; |
1026 | |
1027 | if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && |
1028 | test_bit(FailFast, &rdev->flags) && |
1029 | !test_bit(LastDev, &rdev->flags)) |
1030 | bio->bi_opf |= MD_FAILFAST; |
1031 | |
1032 | atomic_inc(v: &mddev->pending_writes); |
1033 | submit_bio(bio); |
1034 | } |
1035 | |
1036 | int md_super_wait(struct mddev *mddev) |
1037 | { |
1038 | /* wait for all superblock writes that were scheduled to complete */ |
1039 | wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); |
1040 | if (test_and_clear_bit(nr: MD_SB_NEED_REWRITE, addr: &mddev->sb_flags)) |
1041 | return -EAGAIN; |
1042 | return 0; |
1043 | } |
1044 | |
1045 | int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, |
1046 | struct page *page, blk_opf_t opf, bool metadata_op) |
1047 | { |
1048 | struct bio bio; |
1049 | struct bio_vec bvec; |
1050 | |
1051 | if (metadata_op && rdev->meta_bdev) |
1052 | bio_init(bio: &bio, bdev: rdev->meta_bdev, table: &bvec, max_vecs: 1, opf); |
1053 | else |
1054 | bio_init(bio: &bio, bdev: rdev->bdev, table: &bvec, max_vecs: 1, opf); |
1055 | |
1056 | if (metadata_op) |
1057 | bio.bi_iter.bi_sector = sector + rdev->sb_start; |
1058 | else if (rdev->mddev->reshape_position != MaxSector && |
1059 | (rdev->mddev->reshape_backwards == |
1060 | (sector >= rdev->mddev->reshape_position))) |
1061 | bio.bi_iter.bi_sector = sector + rdev->new_data_offset; |
1062 | else |
1063 | bio.bi_iter.bi_sector = sector + rdev->data_offset; |
1064 | __bio_add_page(bio: &bio, page, len: size, off: 0); |
1065 | |
1066 | submit_bio_wait(bio: &bio); |
1067 | |
1068 | return !bio.bi_status; |
1069 | } |
1070 | EXPORT_SYMBOL_GPL(sync_page_io); |
1071 | |
1072 | static int read_disk_sb(struct md_rdev *rdev, int size) |
1073 | { |
1074 | if (rdev->sb_loaded) |
1075 | return 0; |
1076 | |
1077 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) |
1078 | goto fail; |
1079 | rdev->sb_loaded = 1; |
1080 | return 0; |
1081 | |
1082 | fail: |
1083 | pr_err("md: disabled device %pg, could not read superblock.\n" , |
1084 | rdev->bdev); |
1085 | return -EINVAL; |
1086 | } |
1087 | |
1088 | static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) |
1089 | { |
1090 | return sb1->set_uuid0 == sb2->set_uuid0 && |
1091 | sb1->set_uuid1 == sb2->set_uuid1 && |
1092 | sb1->set_uuid2 == sb2->set_uuid2 && |
1093 | sb1->set_uuid3 == sb2->set_uuid3; |
1094 | } |
1095 | |
1096 | static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) |
1097 | { |
1098 | int ret; |
1099 | mdp_super_t *tmp1, *tmp2; |
1100 | |
1101 | tmp1 = kmalloc(size: sizeof(*tmp1),GFP_KERNEL); |
1102 | tmp2 = kmalloc(size: sizeof(*tmp2),GFP_KERNEL); |
1103 | |
1104 | if (!tmp1 || !tmp2) { |
1105 | ret = 0; |
1106 | goto abort; |
1107 | } |
1108 | |
1109 | *tmp1 = *sb1; |
1110 | *tmp2 = *sb2; |
1111 | |
1112 | /* |
1113 | * nr_disks is not constant |
1114 | */ |
1115 | tmp1->nr_disks = 0; |
1116 | tmp2->nr_disks = 0; |
1117 | |
1118 | ret = (memcmp(p: tmp1, q: tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); |
1119 | abort: |
1120 | kfree(objp: tmp1); |
1121 | kfree(objp: tmp2); |
1122 | return ret; |
1123 | } |
1124 | |
/*
 * Fold a 32-bit checksum towards 16 bits by adding its upper half to its
 * lower half, twice (the second pass absorbs the carry of the first).
 */
static u32 md_csum_fold(u32 csum)
{
	u32 once = (csum >> 16) + (csum & 0xffff);

	return (once >> 16) + (once & 0xffff);
}
1130 | |
1131 | static unsigned int calc_sb_csum(mdp_super_t *sb) |
1132 | { |
1133 | u64 newcsum = 0; |
1134 | u32 *sb32 = (u32*)sb; |
1135 | int i; |
1136 | unsigned int disk_csum, csum; |
1137 | |
1138 | disk_csum = sb->sb_csum; |
1139 | sb->sb_csum = 0; |
1140 | |
1141 | for (i = 0; i < MD_SB_BYTES/4 ; i++) |
1142 | newcsum += sb32[i]; |
1143 | csum = (newcsum & 0xffffffff) + (newcsum>>32); |
1144 | |
1145 | #ifdef CONFIG_ALPHA |
1146 | /* This used to use csum_partial, which was wrong for several |
1147 | * reasons including that different results are returned on |
1148 | * different architectures. It isn't critical that we get exactly |
1149 | * the same return value as before (we always csum_fold before |
1150 | * testing, and that removes any differences). However as we |
1151 | * know that csum_partial always returned a 16bit value on |
1152 | * alphas, do a fold to maximise conformity to previous behaviour. |
1153 | */ |
1154 | sb->sb_csum = md_csum_fold(disk_csum); |
1155 | #else |
1156 | sb->sb_csum = disk_csum; |
1157 | #endif |
1158 | return csum; |
1159 | } |
1160 | |
1161 | /* |
1162 | * Handle superblock details. |
1163 | * We want to be able to handle multiple superblock formats |
1164 | * so we have a common interface to them all, and an array of |
1165 | * different handlers. |
1166 | * We rely on user-space to write the initial superblock, and support |
1167 | * reading and updating of superblocks. |
1168 | * Interface methods are: |
1169 | * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) |
1170 | * loads and validates a superblock on dev. |
1171 | * if refdev != NULL, compare superblocks on both devices |
1172 | * Return: |
1173 | * 0 - dev has a superblock that is compatible with refdev |
1174 | * 1 - dev has a superblock that is compatible and newer than refdev |
1175 | * so dev should be used as the refdev in future |
1176 | * -EINVAL superblock incompatible or invalid |
1177 | * -othererror e.g. -EIO |
1178 | * |
1179 | * int validate_super(struct mddev *mddev, struct md_rdev *dev) |
1180 | * Verify that dev is acceptable into mddev. |
1181 | * The first time, mddev->raid_disks will be 0, and data from |
1182 | * dev should be merged in. Subsequent calls check that dev |
1183 | * is new enough. Return 0 or -EINVAL |
1184 | * |
1185 | * void sync_super(struct mddev *mddev, struct md_rdev *dev) |
1186 | * Update the superblock for rdev with data in mddev |
1187 | * This does not write to disc. |
1188 | * |
1189 | */ |
1190 | |
/*
 * Method table describing one on-disk superblock format.
 */
struct super_type {
	/* format name -- presumably only used for identification; TODO confirm */
	char		    *name;
	/* module providing this handler */
	struct module	    *owner;
	/*
	 * Load and validate the superblock on rdev, comparing it with refdev
	 * when refdev is non-NULL.  Returns 0 (compatible), 1 (compatible and
	 * newer, rdev should become the reference) or a negative errno.
	 */
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	/*
	 * Check that rdev is acceptable for mddev; on the first call
	 * (mddev->raid_disks == 0) the array description is taken from rdev.
	 * Returns 0 or -EINVAL.
	 */
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *rdev);
	/* Rebuild rdev's in-memory superblock from mddev; no disk I/O. */
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	/*
	 * Adapt the metadata to a new component size of num_sectors.  The
	 * 0.90 handler returns the resulting size in sectors, or 0 if the
	 * change is refused.
	 */
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	/* Non-zero if new_offset is an acceptable data offset for rdev. */
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};
1206 | |
1207 | /* |
1208 | * Check that the given mddev has no bitmap. |
1209 | * |
1210 | * This function is called from the run method of all personalities that do not |
1211 | * support bitmaps. It prints an error message and returns non-zero if mddev |
1212 | * has a bitmap. Otherwise, it returns 0. |
1213 | * |
1214 | */ |
1215 | int md_check_no_bitmap(struct mddev *mddev) |
1216 | { |
1217 | if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) |
1218 | return 0; |
1219 | pr_warn("%s: bitmaps are not supported for %s\n" , |
1220 | mdname(mddev), mddev->pers->name); |
1221 | return 1; |
1222 | } |
1223 | EXPORT_SYMBOL(md_check_no_bitmap); |
1224 | |
/*
 * load_super for 0.90.0
 *
 * Reads the 0.90 superblock of @rdev (located near the end of the device),
 * validates magic, version, raid_disks and checksum, and fills in the
 * format-dependent rdev fields (preferred_minor, data offsets, sb_size,
 * desc_nr, sectors).  @minor_version is not used by this format.
 *
 * If @refdev is given, the superblock must carry the same UUID and the
 * same constant generic section as the one on @refdev.
 *
 * Returns:
 *   1        - rdev is not a spare and either there is no refdev or its
 *              event count is higher than refdev's
 *   0        - superblock is acceptable, but rdev should not become the
 *              reference
 *   -EINVAL  - superblock invalid or incompatible with refdev
 *   other    - error returned by read_disk_sb()
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	mdp_super_t *sb;
	int ret;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret)
		return ret;

	/* every "goto abort" below reports -EINVAL */
	ret = -EINVAL;

	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		pr_warn("md: invalid raid superblock magic on %pg\n",
			rdev->bdev);
		goto abort;
	}

	/* only versions 0.90 and 0.91 are handled here */
	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		pr_warn("Bad version number %d.%d on %pg\n",
			sb->major_version, sb->minor_version, rdev->bdev);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	/* both sides are folded so only the folded values have to agree */
	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	/* with this format the data always starts at the beginning of the device */
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	/* NOTE(review): shift -1 presumably marks bad-block logging as unavailable -- confirm */
	rdev->badblocks.shift = -1;

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	/* not spare disk, or LEVEL_MULTIPATH */
	if (sb->level == LEVEL_MULTIPATH ||
		(rdev->desc_nr >= 0 &&
		rdev->desc_nr < MD_SB_DISKS &&
		sb->disks[rdev->desc_nr].state &
		((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
		spare_disk = false;

	if (!refdev) {
		/* no reference yet: a non-spare device is a candidate for it */
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);
		if (!md_uuid_equal(refsb, sb)) {
			pr_warn("md: %pg has different UUID to %pg\n",
				rdev->bdev, refdev->bdev);
			goto abort;
		}
		if (!md_sb_equal(refsb, sb)) {
			pr_warn("md: %pg has same UUID but different superblock to %pg\n",
				rdev->bdev, refdev->bdev);
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);

		/* a spare never replaces the reference, however new it is */
		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	/* usable space is everything in front of the superblock */
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
		rdev->sectors = (sector_t)(2ULL << 32) - 2;

	/* sb->size * 2 is the recorded component size in sectors */
	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

 abort:
	return ret;
}
1332 | |
/*
 * validate_super for 0.90.0
 *
 * Decides whether @rdev, whose 0.90 superblock is already loaded in
 * rdev->sb_page, is acceptable as a member of @mddev and derives the
 * rdev's role and state flags from its own disk descriptor.
 *
 * The first time (mddev->raid_disks == 0) the array description in @mddev
 * is populated from this superblock.  On later calls only the event count
 * is checked; which rule applies depends on whether the array is still
 * being assembled (no personality yet), has a bitmap, or neither.
 *
 * Returns 0 if the device is accepted (possibly with raid_disk left at -1)
 * or -EINVAL if an active/in-sync device is too old during assembly.
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	/* start from a clean slate; role and flags are re-derived below */
	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		/* first device: take the array description from its superblock */
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		/* ">> 9" and "* 2" convert the on-disk units to 512-byte sectors */
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
		mddev->reshape_backwards = 0;

		if (mddev->minor_version >= 91) {
			/* 0.91 superblocks carry reshape information */
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			/* no reshape recorded: "new" geometry equals current one */
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}
		if (mddev->level == 0)
			mddev->layout = -1;

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			/* the recorded checkpoint is only used if it was taken
			 * at the current event count */
			if (sb->events_hi == sb->cp_events_hi &&
				sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		/* internal bitmap: use the default location unless a bitmap
		 * file has been configured */
		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
		/* the increment lets a device that is one event behind pass */
		++ev1;
		/* NOTE(review): desc_nr is used as an index here without the
		 * range check that super_90_load() applies -- confirm callers
		 * guarantee 0 <= desc_nr < MD_SB_DISKS */
		if (sb->disks[rdev->desc_nr].state & (
			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		/* translate this device's descriptor state into rdev flags */
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
			rdev->saved_raid_disk = desc->raid_disk;
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		if (desc->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
	return 0;
}
1461 | |
/*
 * sync_super for 0.90.0
 *
 * Rebuilds the 0.90 superblock held in rdev->sb_page from the current
 * state of @mddev and its member devices.  Nothing is written to disk.
 *
 * Side effects beyond the superblock page: rdev->sb_size and
 * mddev->minor_version are updated, and desc_nr is reassigned for every
 * member device of the array.
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	/* inactive devices get descriptor slots after the raid_disks slots */
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	/* timestamps are clamped to the 0..U32_MAX range before being stored */
	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
	sb->level = mddev->level;
	/* dev_sectors is in 512-byte sectors; the superblock stores half of it */
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
	sb->state = 0;
	/* the 64-bit event counter is split into two 32-bit halves */
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	/* minor version 91 is used exactly while a reshape is recorded */
	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync)
	{
		/* record the checkpoint together with the event count at
		 * which it was taken */
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		/* a device without a slot, or a faulty one, is never active */
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		/* active devices use their raid slot as descriptor number,
		 * all others take the next free number after the raid slots */
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			/* neither faulty nor active: counted as a spare */
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
		if (test_bit(FailFast, &rdev2->flags))
			d->state |= (1<<MD_DISK_FAILFAST);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		/* still all-zero from the memset above: no device claimed this slot */
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	/* the checksum is computed last, once every other field is final */
	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}
1605 | |
1606 | /* |
1607 | * rdev_size_change for 0.90.0 |
1608 | */ |
1609 | static unsigned long long |
1610 | super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) |
1611 | { |
1612 | if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
1613 | return 0; /* component must fit device */ |
1614 | if (rdev->mddev->bitmap_info.offset) |
1615 | return 0; /* can't move bitmap */ |
1616 | rdev->sb_start = calc_dev_sboffset(rdev); |
1617 | if (!num_sectors || num_sectors > rdev->sb_start) |
1618 | num_sectors = rdev->sb_start; |
1619 | /* Limit to 4TB as metadata cannot record more than that. |
1620 | * 4TB == 2^32 KB, or 2*2^32 sectors. |
1621 | */ |
1622 | if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) |
1623 | num_sectors = (sector_t)(2ULL << 32) - 2; |
1624 | do { |
1625 | md_super_write(mddev: rdev->mddev, rdev, sector: rdev->sb_start, size: rdev->sb_size, |
1626 | page: rdev->sb_page); |
1627 | } while (md_super_wait(mddev: rdev->mddev) < 0); |
1628 | return num_sectors; |
1629 | } |
1630 | |
/*
 * allow_new_offset for 0.90.0
 *
 * The v0.90 format cannot express a non-zero data offset, so only a new
 * offset of 0 is accepted.  @rdev is not examined.
 *
 * Returns 1 if @new_offset is acceptable, 0 otherwise.
 */
static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	return new_offset ? 0 : 1;
}
1637 | |
1638 | /* |
1639 | * version 1 superblock |
1640 | */ |
1641 | |
1642 | static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) |
1643 | { |
1644 | __le32 disk_csum; |
1645 | u32 csum; |
1646 | unsigned long long newcsum; |
1647 | int size = 256 + le32_to_cpu(sb->max_dev)*2; |
1648 | __le32 *isuper = (__le32*)sb; |
1649 | |
1650 | disk_csum = sb->sb_csum; |
1651 | sb->sb_csum = 0; |
1652 | newcsum = 0; |
1653 | for (; size >= 4; size -= 4) |
1654 | newcsum += le32_to_cpu(*isuper++); |
1655 | |
1656 | if (size == 2) |
1657 | newcsum += le16_to_cpu(*(__le16*) isuper); |
1658 | |
1659 | csum = (newcsum & 0xffffffff) + (newcsum >> 32); |
1660 | sb->sb_csum = disk_csum; |
1661 | return cpu_to_le32(csum); |
1662 | } |
1663 | |
/*
 * load_super for version-1 superblocks.
 *
 * Locates the superblock of @rdev according to @minor_version, reads and
 * validates it, loads the bad-block log if the superblock advertises one,
 * and fills in the format-dependent rdev fields (offsets, sb_size,
 * desc_nr, ppl, sectors).
 *
 * If @refdev is given, UUID, level, layout and chunk size must match the
 * superblock on @refdev.
 *
 * Returns:
 *   1        - rdev is not a spare and either there is no refdev or its
 *              event count is higher than refdev's
 *   0        - superblock is acceptable, but rdev should not become the
 *              reference
 *   -EINVAL  - unknown minor version, or superblock invalid/incompatible
 *   -ENOMEM  - the bad-block page could not be allocated
 *   -EIO     - the bad-block log could not be read
 *   other    - error returned by read_disk_sb()
 */
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	int bmask;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
		/* round down to a multiple of 8 sectors (4K) */
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret) return ret;

	sb = page_address(rdev->sb_page);

	/* basic sanity: magic, major version, device table fits in the 4K
	 * that were read, superblock is where it claims to be, and no
	 * feature bits outside MD_FEATURE_ALL are set */
	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		pr_warn("md: invalid superblock checksum on %pg\n",
			rdev->bdev);
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		pr_warn("md: data_size too small on %pg\n",
			rdev->bdev);
		return -EINVAL;
	}
	/* the memcmp of pad3 against itself shifted by one element, together
	 * with pad3[0] == 0, verifies that the whole array is zero */
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	/* new_offset is a signed delta, applied only during a reshape that
	 * also changes the data offset */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

	/* superblock size, rounded up to the logical block size */
	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	/* for minor versions 1 and 2 the data must not start inside the
	 * superblock */
	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		__le64 *bbp;
		int i;
		/* note: shadows the function-level 'sectors' inside this block */
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		/* read as a metadata op, so the offset is relative to sb_start */
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, REQ_OP_READ, true))
			return -EIO;
		bbp = (__le64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		/* each 64-bit entry: low 10 bits = length, remaining bits =
		 * start, both scaled by bblog_shift */
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			/* an all-ones entry terminates the list */
			if (bb + 1 == 0)
				break;
			if (badblocks_set(&rdev->badblocks, sector, count, 1))
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if ((le32_to_cpu(sb->feature_map) &
	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
		/* the ppl offset is signed and relative to the superblock */
		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
	}

	/* the RAID0 layout feature is only valid on level 0 arrays */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
	    sb->level != 0)
		return -EINVAL;

	/* not spare disk, or LEVEL_MULTIPATH */
	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
		(rdev->desc_nr >= 0 &&
		rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
		spare_disk = false;

	if (!refdev) {
		/* no reference yet: a non-spare device is a candidate for it */
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			pr_warn("md: %pg has strangely different superblock to %pg\n",
				rdev->bdev,
				refdev->bdev);
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		/* a spare never replaces the reference, however new it is */
		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	/* space available for data: from data_offset to the end of the
	 * device, or (minor version 0) everything before the superblock */
	if (minor_version)
		sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
	else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}
1847 | |
1848 | static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) |
1849 | { |
1850 | struct mdp_superblock_1 *sb = page_address(rdev->sb_page); |
1851 | __u64 ev1 = le64_to_cpu(sb->events); |
1852 | |
1853 | rdev->raid_disk = -1; |
1854 | clear_bit(nr: Faulty, addr: &rdev->flags); |
1855 | clear_bit(nr: In_sync, addr: &rdev->flags); |
1856 | clear_bit(nr: Bitmap_sync, addr: &rdev->flags); |
1857 | clear_bit(nr: WriteMostly, addr: &rdev->flags); |
1858 | |
1859 | if (mddev->raid_disks == 0) { |
1860 | mddev->major_version = 1; |
1861 | mddev->patch_version = 0; |
1862 | mddev->external = 0; |
1863 | mddev->chunk_sectors = le32_to_cpu(sb->chunksize); |
1864 | mddev->ctime = le64_to_cpu(sb->ctime); |
1865 | mddev->utime = le64_to_cpu(sb->utime); |
1866 | mddev->level = le32_to_cpu(sb->level); |
1867 | mddev->clevel[0] = 0; |
1868 | mddev->layout = le32_to_cpu(sb->layout); |
1869 | mddev->raid_disks = le32_to_cpu(sb->raid_disks); |
1870 | mddev->dev_sectors = le64_to_cpu(sb->size); |
1871 | mddev->events = ev1; |
1872 | mddev->bitmap_info.offset = 0; |
1873 | mddev->bitmap_info.space = 0; |
1874 | /* Default location for bitmap is 1K after superblock |
1875 | * using 3K - total of 4K |
1876 | */ |
1877 | mddev->bitmap_info.default_offset = 1024 >> 9; |
1878 | mddev->bitmap_info.default_space = (4096-1024) >> 9; |
1879 | mddev->reshape_backwards = 0; |
1880 | |
1881 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); |
1882 | memcpy(mddev->uuid, sb->set_uuid, 16); |
1883 | |
1884 | mddev->max_disks = (4096-256)/2; |
1885 | |
1886 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && |
1887 | mddev->bitmap_info.file == NULL) { |
1888 | mddev->bitmap_info.offset = |
1889 | (__s32)le32_to_cpu(sb->bitmap_offset); |
1890 | /* Metadata doesn't record how much space is available. |
1891 | * For 1.0, we assume we can use up to the superblock |
1892 | * if before, else to 4K beyond superblock. |
1893 | * For others, assume no change is possible. |
1894 | */ |
1895 | if (mddev->minor_version > 0) |
1896 | mddev->bitmap_info.space = 0; |
1897 | else if (mddev->bitmap_info.offset > 0) |
1898 | mddev->bitmap_info.space = |
1899 | 8 - mddev->bitmap_info.offset; |
1900 | else |
1901 | mddev->bitmap_info.space = |
1902 | -mddev->bitmap_info.offset; |
1903 | } |
1904 | |
1905 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { |
1906 | mddev->reshape_position = le64_to_cpu(sb->reshape_position); |
1907 | mddev->delta_disks = le32_to_cpu(sb->delta_disks); |
1908 | mddev->new_level = le32_to_cpu(sb->new_level); |
1909 | mddev->new_layout = le32_to_cpu(sb->new_layout); |
1910 | mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); |
1911 | if (mddev->delta_disks < 0 || |
1912 | (mddev->delta_disks == 0 && |
1913 | (le32_to_cpu(sb->feature_map) |
1914 | & MD_FEATURE_RESHAPE_BACKWARDS))) |
1915 | mddev->reshape_backwards = 1; |
1916 | } else { |
1917 | mddev->reshape_position = MaxSector; |
1918 | mddev->delta_disks = 0; |
1919 | mddev->new_level = mddev->level; |
1920 | mddev->new_layout = mddev->layout; |
1921 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
1922 | } |
1923 | |
1924 | if (mddev->level == 0 && |
1925 | !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) |
1926 | mddev->layout = -1; |
1927 | |
1928 | if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) |
1929 | set_bit(nr: MD_HAS_JOURNAL, addr: &mddev->flags); |
1930 | |
1931 | if (le32_to_cpu(sb->feature_map) & |
1932 | (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { |
1933 | if (le32_to_cpu(sb->feature_map) & |
1934 | (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) |
1935 | return -EINVAL; |
1936 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && |
1937 | (le32_to_cpu(sb->feature_map) & |
1938 | MD_FEATURE_MULTIPLE_PPLS)) |
1939 | return -EINVAL; |
1940 | set_bit(nr: MD_HAS_PPL, addr: &mddev->flags); |
1941 | } |
1942 | } else if (mddev->pers == NULL) { |
1943 | /* Insist of good event counter while assembling, except for |
1944 | * spares (which don't need an event count) */ |
1945 | ++ev1; |
1946 | if (rdev->desc_nr >= 0 && |
1947 | rdev->desc_nr < le32_to_cpu(sb->max_dev) && |
1948 | (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || |
1949 | le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) |
1950 | if (ev1 < mddev->events) |
1951 | return -EINVAL; |
1952 | } else if (mddev->bitmap) { |
1953 | /* If adding to array with a bitmap, then we can accept an |
1954 | * older device, but not too old. |
1955 | */ |
1956 | if (ev1 < mddev->bitmap->events_cleared) |
1957 | return 0; |
1958 | if (ev1 < mddev->events) |
1959 | set_bit(nr: Bitmap_sync, addr: &rdev->flags); |
1960 | } else { |
1961 | if (ev1 < mddev->events) |
1962 | /* just a hot-add of a new device, leave raid_disk at -1 */ |
1963 | return 0; |
1964 | } |
1965 | if (mddev->level != LEVEL_MULTIPATH) { |
1966 | int role; |
1967 | if (rdev->desc_nr < 0 || |
1968 | rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { |
1969 | role = MD_DISK_ROLE_SPARE; |
1970 | rdev->desc_nr = -1; |
1971 | } else |
1972 | role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); |
1973 | switch(role) { |
1974 | case MD_DISK_ROLE_SPARE: /* spare */ |
1975 | break; |
1976 | case MD_DISK_ROLE_FAULTY: /* faulty */ |
1977 | set_bit(nr: Faulty, addr: &rdev->flags); |
1978 | break; |
1979 | case MD_DISK_ROLE_JOURNAL: /* journal device */ |
1980 | if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { |
1981 | /* journal device without journal feature */ |
1982 | pr_warn("md: journal device provided without journal feature, ignoring the device\n" ); |
1983 | return -EINVAL; |
1984 | } |
1985 | set_bit(nr: Journal, addr: &rdev->flags); |
1986 | rdev->journal_tail = le64_to_cpu(sb->journal_tail); |
1987 | rdev->raid_disk = 0; |
1988 | break; |
1989 | default: |
1990 | rdev->saved_raid_disk = role; |
1991 | if ((le32_to_cpu(sb->feature_map) & |
1992 | MD_FEATURE_RECOVERY_OFFSET)) { |
1993 | rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); |
1994 | if (!(le32_to_cpu(sb->feature_map) & |
1995 | MD_FEATURE_RECOVERY_BITMAP)) |
1996 | rdev->saved_raid_disk = -1; |
1997 | } else { |
1998 | /* |
1999 | * If the array is FROZEN, then the device can't |
2000 | * be in_sync with rest of array. |
2001 | */ |
2002 | if (!test_bit(MD_RECOVERY_FROZEN, |
2003 | &mddev->recovery)) |
2004 | set_bit(nr: In_sync, addr: &rdev->flags); |
2005 | } |
2006 | rdev->raid_disk = role; |
2007 | break; |
2008 | } |
2009 | if (sb->devflags & WriteMostly1) |
2010 | set_bit(nr: WriteMostly, addr: &rdev->flags); |
2011 | if (sb->devflags & FailFast1) |
2012 | set_bit(nr: FailFast, addr: &rdev->flags); |
2013 | if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) |
2014 | set_bit(nr: Replacement, addr: &rdev->flags); |
2015 | } else /* MULTIPATH are always insync */ |
2016 | set_bit(nr: In_sync, addr: &rdev->flags); |
2017 | |
2018 | return 0; |
2019 | } |
2020 | |
2021 | static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) |
2022 | { |
2023 | struct mdp_superblock_1 *sb; |
2024 | struct md_rdev *rdev2; |
2025 | int max_dev, i; |
2026 | /* make rdev->sb match mddev and rdev data. */ |
2027 | |
2028 | sb = page_address(rdev->sb_page); |
2029 | |
2030 | sb->feature_map = 0; |
2031 | sb->pad0 = 0; |
2032 | sb->recovery_offset = cpu_to_le64(0); |
2033 | memset(sb->pad3, 0, sizeof(sb->pad3)); |
2034 | |
2035 | sb->utime = cpu_to_le64((__u64)mddev->utime); |
2036 | sb->events = cpu_to_le64(mddev->events); |
2037 | if (mddev->in_sync) |
2038 | sb->resync_offset = cpu_to_le64(mddev->recovery_cp); |
2039 | else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) |
2040 | sb->resync_offset = cpu_to_le64(MaxSector); |
2041 | else |
2042 | sb->resync_offset = cpu_to_le64(0); |
2043 | |
2044 | sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); |
2045 | |
2046 | sb->raid_disks = cpu_to_le32(mddev->raid_disks); |
2047 | sb->size = cpu_to_le64(mddev->dev_sectors); |
2048 | sb->chunksize = cpu_to_le32(mddev->chunk_sectors); |
2049 | sb->level = cpu_to_le32(mddev->level); |
2050 | sb->layout = cpu_to_le32(mddev->layout); |
2051 | if (test_bit(FailFast, &rdev->flags)) |
2052 | sb->devflags |= FailFast1; |
2053 | else |
2054 | sb->devflags &= ~FailFast1; |
2055 | |
2056 | if (test_bit(WriteMostly, &rdev->flags)) |
2057 | sb->devflags |= WriteMostly1; |
2058 | else |
2059 | sb->devflags &= ~WriteMostly1; |
2060 | sb->data_offset = cpu_to_le64(rdev->data_offset); |
2061 | sb->data_size = cpu_to_le64(rdev->sectors); |
2062 | |
2063 | if (mddev->bitmap && mddev->bitmap_info.file == NULL) { |
2064 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); |
2065 | sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); |
2066 | } |
2067 | |
2068 | if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && |
2069 | !test_bit(In_sync, &rdev->flags)) { |
2070 | sb->feature_map |= |
2071 | cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); |
2072 | sb->recovery_offset = |
2073 | cpu_to_le64(rdev->recovery_offset); |
2074 | if (rdev->saved_raid_disk >= 0 && mddev->bitmap) |
2075 | sb->feature_map |= |
2076 | cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); |
2077 | } |
2078 | /* Note: recovery_offset and journal_tail share space */ |
2079 | if (test_bit(Journal, &rdev->flags)) |
2080 | sb->journal_tail = cpu_to_le64(rdev->journal_tail); |
2081 | if (test_bit(Replacement, &rdev->flags)) |
2082 | sb->feature_map |= |
2083 | cpu_to_le32(MD_FEATURE_REPLACEMENT); |
2084 | |
2085 | if (mddev->reshape_position != MaxSector) { |
2086 | sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); |
2087 | sb->reshape_position = cpu_to_le64(mddev->reshape_position); |
2088 | sb->new_layout = cpu_to_le32(mddev->new_layout); |
2089 | sb->delta_disks = cpu_to_le32(mddev->delta_disks); |
2090 | sb->new_level = cpu_to_le32(mddev->new_level); |
2091 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); |
2092 | if (mddev->delta_disks == 0 && |
2093 | mddev->reshape_backwards) |
2094 | sb->feature_map |
2095 | |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); |
2096 | if (rdev->new_data_offset != rdev->data_offset) { |
2097 | sb->feature_map |
2098 | |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); |
2099 | sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset |
2100 | - rdev->data_offset)); |
2101 | } |
2102 | } |
2103 | |
2104 | if (mddev_is_clustered(mddev)) |
2105 | sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); |
2106 | |
2107 | if (rdev->badblocks.count == 0) |
2108 | /* Nothing to do for bad blocks*/ ; |
2109 | else if (sb->bblog_offset == 0) |
2110 | /* Cannot record bad blocks on this device */ |
2111 | md_error(mddev, rdev); |
2112 | else { |
2113 | struct badblocks *bb = &rdev->badblocks; |
2114 | __le64 *bbp = (__le64 *)page_address(rdev->bb_page); |
2115 | u64 *p = bb->page; |
2116 | sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); |
2117 | if (bb->changed) { |
2118 | unsigned seq; |
2119 | |
2120 | retry: |
2121 | seq = read_seqbegin(sl: &bb->lock); |
2122 | |
2123 | memset(bbp, 0xff, PAGE_SIZE); |
2124 | |
2125 | for (i = 0 ; i < bb->count ; i++) { |
2126 | u64 internal_bb = p[i]; |
2127 | u64 store_bb = ((BB_OFFSET(internal_bb) << 10) |
2128 | | BB_LEN(internal_bb)); |
2129 | bbp[i] = cpu_to_le64(store_bb); |
2130 | } |
2131 | bb->changed = 0; |
2132 | if (read_seqretry(sl: &bb->lock, start: seq)) |
2133 | goto retry; |
2134 | |
2135 | bb->sector = (rdev->sb_start + |
2136 | (int)le32_to_cpu(sb->bblog_offset)); |
2137 | bb->size = le16_to_cpu(sb->bblog_size); |
2138 | } |
2139 | } |
2140 | |
2141 | max_dev = 0; |
2142 | rdev_for_each(rdev2, mddev) |
2143 | if (rdev2->desc_nr+1 > max_dev) |
2144 | max_dev = rdev2->desc_nr+1; |
2145 | |
2146 | if (max_dev > le32_to_cpu(sb->max_dev)) { |
2147 | int bmask; |
2148 | sb->max_dev = cpu_to_le32(max_dev); |
2149 | rdev->sb_size = max_dev * 2 + 256; |
2150 | bmask = queue_logical_block_size(q: rdev->bdev->bd_disk->queue)-1; |
2151 | if (rdev->sb_size & bmask) |
2152 | rdev->sb_size = (rdev->sb_size | bmask) + 1; |
2153 | } else |
2154 | max_dev = le32_to_cpu(sb->max_dev); |
2155 | |
2156 | for (i=0; i<max_dev;i++) |
2157 | sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); |
2158 | |
2159 | if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) |
2160 | sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); |
2161 | |
2162 | if (test_bit(MD_HAS_PPL, &mddev->flags)) { |
2163 | if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) |
2164 | sb->feature_map |= |
2165 | cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); |
2166 | else |
2167 | sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); |
2168 | sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); |
2169 | sb->ppl.size = cpu_to_le16(rdev->ppl.size); |
2170 | } |
2171 | |
2172 | rdev_for_each(rdev2, mddev) { |
2173 | i = rdev2->desc_nr; |
2174 | if (test_bit(Faulty, &rdev2->flags)) |
2175 | sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); |
2176 | else if (test_bit(In_sync, &rdev2->flags)) |
2177 | sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); |
2178 | else if (test_bit(Journal, &rdev2->flags)) |
2179 | sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); |
2180 | else if (rdev2->raid_disk >= 0) |
2181 | sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); |
2182 | else |
2183 | sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); |
2184 | } |
2185 | |
2186 | sb->sb_csum = calc_sb_1_csum(sb); |
2187 | } |
2188 | |
2189 | static sector_t super_1_choose_bm_space(sector_t dev_size) |
2190 | { |
2191 | sector_t bm_space; |
2192 | |
2193 | /* if the device is bigger than 8Gig, save 64k for bitmap |
2194 | * usage, if bigger than 200Gig, save 128k |
2195 | */ |
2196 | if (dev_size < 64*2) |
2197 | bm_space = 0; |
2198 | else if (dev_size - 64*2 >= 200*1024*1024*2) |
2199 | bm_space = 128*2; |
2200 | else if (dev_size - 4*2 > 8*1024*1024*2) |
2201 | bm_space = 64*2; |
2202 | else |
2203 | bm_space = 4*2; |
2204 | return bm_space; |
2205 | } |
2206 | |
2207 | static unsigned long long |
2208 | super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) |
2209 | { |
2210 | struct mdp_superblock_1 *sb; |
2211 | sector_t max_sectors; |
2212 | if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
2213 | return 0; /* component must fit device */ |
2214 | if (rdev->data_offset != rdev->new_data_offset) |
2215 | return 0; /* too confusing */ |
2216 | if (rdev->sb_start < rdev->data_offset) { |
2217 | /* minor versions 1 and 2; superblock before data */ |
2218 | max_sectors = bdev_nr_sectors(bdev: rdev->bdev) - rdev->data_offset; |
2219 | if (!num_sectors || num_sectors > max_sectors) |
2220 | num_sectors = max_sectors; |
2221 | } else if (rdev->mddev->bitmap_info.offset) { |
2222 | /* minor version 0 with bitmap we can't move */ |
2223 | return 0; |
2224 | } else { |
2225 | /* minor version 0; superblock after data */ |
2226 | sector_t sb_start, bm_space; |
2227 | sector_t dev_size = bdev_nr_sectors(bdev: rdev->bdev); |
2228 | |
2229 | /* 8K is for superblock */ |
2230 | sb_start = dev_size - 8*2; |
2231 | sb_start &= ~(sector_t)(4*2 - 1); |
2232 | |
2233 | bm_space = super_1_choose_bm_space(dev_size); |
2234 | |
2235 | /* Space that can be used to store date needs to decrease |
2236 | * superblock bitmap space and bad block space(4K) |
2237 | */ |
2238 | max_sectors = sb_start - bm_space - 4*2; |
2239 | |
2240 | if (!num_sectors || num_sectors > max_sectors) |
2241 | num_sectors = max_sectors; |
2242 | rdev->sb_start = sb_start; |
2243 | } |
2244 | sb = page_address(rdev->sb_page); |
2245 | sb->data_size = cpu_to_le64(num_sectors); |
2246 | sb->super_offset = cpu_to_le64(rdev->sb_start); |
2247 | sb->sb_csum = calc_sb_1_csum(sb); |
2248 | do { |
2249 | md_super_write(mddev: rdev->mddev, rdev, sector: rdev->sb_start, size: rdev->sb_size, |
2250 | page: rdev->sb_page); |
2251 | } while (md_super_wait(mddev: rdev->mddev) < 0); |
2252 | return num_sectors; |
2253 | |
2254 | } |
2255 | |
2256 | static int |
2257 | super_1_allow_new_offset(struct md_rdev *rdev, |
2258 | unsigned long long new_offset) |
2259 | { |
2260 | /* All necessary checks on new >= old have been done */ |
2261 | struct bitmap *bitmap; |
2262 | if (new_offset >= rdev->data_offset) |
2263 | return 1; |
2264 | |
2265 | /* with 1.0 metadata, there is no metadata to tread on |
2266 | * so we can always move back */ |
2267 | if (rdev->mddev->minor_version == 0) |
2268 | return 1; |
2269 | |
2270 | /* otherwise we must be sure not to step on |
2271 | * any metadata, so stay: |
2272 | * 36K beyond start of superblock |
2273 | * beyond end of badblocks |
2274 | * beyond write-intent bitmap |
2275 | */ |
2276 | if (rdev->sb_start + (32+4)*2 > new_offset) |
2277 | return 0; |
2278 | bitmap = rdev->mddev->bitmap; |
2279 | if (bitmap && !rdev->mddev->bitmap_info.file && |
2280 | rdev->sb_start + rdev->mddev->bitmap_info.offset + |
2281 | bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) |
2282 | return 0; |
2283 | if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) |
2284 | return 0; |
2285 | |
2286 | return 1; |
2287 | } |
2288 | |
2289 | static struct super_type super_types[] = { |
2290 | [0] = { |
2291 | .name = "0.90.0" , |
2292 | .owner = THIS_MODULE, |
2293 | .load_super = super_90_load, |
2294 | .validate_super = super_90_validate, |
2295 | .sync_super = super_90_sync, |
2296 | .rdev_size_change = super_90_rdev_size_change, |
2297 | .allow_new_offset = super_90_allow_new_offset, |
2298 | }, |
2299 | [1] = { |
2300 | .name = "md-1" , |
2301 | .owner = THIS_MODULE, |
2302 | .load_super = super_1_load, |
2303 | .validate_super = super_1_validate, |
2304 | .sync_super = super_1_sync, |
2305 | .rdev_size_change = super_1_rdev_size_change, |
2306 | .allow_new_offset = super_1_allow_new_offset, |
2307 | }, |
2308 | }; |
2309 | |
2310 | static void sync_super(struct mddev *mddev, struct md_rdev *rdev) |
2311 | { |
2312 | if (mddev->sync_super) { |
2313 | mddev->sync_super(mddev, rdev); |
2314 | return; |
2315 | } |
2316 | |
2317 | BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); |
2318 | |
2319 | super_types[mddev->major_version].sync_super(mddev, rdev); |
2320 | } |
2321 | |
2322 | static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) |
2323 | { |
2324 | struct md_rdev *rdev, *rdev2; |
2325 | |
2326 | rcu_read_lock(); |
2327 | rdev_for_each_rcu(rdev, mddev1) { |
2328 | if (test_bit(Faulty, &rdev->flags) || |
2329 | test_bit(Journal, &rdev->flags) || |
2330 | rdev->raid_disk == -1) |
2331 | continue; |
2332 | rdev_for_each_rcu(rdev2, mddev2) { |
2333 | if (test_bit(Faulty, &rdev2->flags) || |
2334 | test_bit(Journal, &rdev2->flags) || |
2335 | rdev2->raid_disk == -1) |
2336 | continue; |
2337 | if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { |
2338 | rcu_read_unlock(); |
2339 | return 1; |
2340 | } |
2341 | } |
2342 | } |
2343 | rcu_read_unlock(); |
2344 | return 0; |
2345 | } |
2346 | |
/* File-local list head; its users live outside this part of the file. */
static LIST_HEAD(pending_raid_disks);
2348 | |
2349 | /* |
2350 | * Try to register data integrity profile for an mddev |
2351 | * |
2352 | * This is called when an array is started and after a disk has been kicked |
2353 | * from the array. It only succeeds if all working and active component devices |
2354 | * are integrity capable with matching profiles. |
2355 | */ |
2356 | int md_integrity_register(struct mddev *mddev) |
2357 | { |
2358 | struct md_rdev *rdev, *reference = NULL; |
2359 | |
2360 | if (list_empty(head: &mddev->disks)) |
2361 | return 0; /* nothing to do */ |
2362 | if (!mddev->gendisk || blk_get_integrity(disk: mddev->gendisk)) |
2363 | return 0; /* shouldn't register, or already is */ |
2364 | rdev_for_each(rdev, mddev) { |
2365 | /* skip spares and non-functional disks */ |
2366 | if (test_bit(Faulty, &rdev->flags)) |
2367 | continue; |
2368 | if (rdev->raid_disk < 0) |
2369 | continue; |
2370 | if (!reference) { |
2371 | /* Use the first rdev as the reference */ |
2372 | reference = rdev; |
2373 | continue; |
2374 | } |
2375 | /* does this rdev's profile match the reference profile? */ |
2376 | if (blk_integrity_compare(reference->bdev->bd_disk, |
2377 | rdev->bdev->bd_disk) < 0) |
2378 | return -EINVAL; |
2379 | } |
2380 | if (!reference || !bdev_get_integrity(bdev: reference->bdev)) |
2381 | return 0; |
2382 | /* |
2383 | * All component devices are integrity capable and have matching |
2384 | * profiles, register the common profile for the md device. |
2385 | */ |
2386 | blk_integrity_register(mddev->gendisk, |
2387 | bdev_get_integrity(bdev: reference->bdev)); |
2388 | |
2389 | pr_debug("md: data integrity enabled on %s\n" , mdname(mddev)); |
2390 | if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || |
2391 | (mddev->level != 1 && mddev->level != 10 && |
2392 | bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { |
2393 | /* |
2394 | * No need to handle the failure of bioset_integrity_create, |
2395 | * because the function is called by md_run() -> pers->run(), |
2396 | * md_run calls bioset_exit -> bioset_integrity_free in case |
2397 | * of failure case. |
2398 | */ |
2399 | pr_err("md: failed to create integrity pool for %s\n" , |
2400 | mdname(mddev)); |
2401 | return -EINVAL; |
2402 | } |
2403 | return 0; |
2404 | } |
2405 | EXPORT_SYMBOL(md_integrity_register); |
2406 | |
2407 | /* |
2408 | * Attempt to add an rdev, but only if it is consistent with the current |
2409 | * integrity profile |
2410 | */ |
2411 | int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) |
2412 | { |
2413 | struct blk_integrity *bi_mddev; |
2414 | |
2415 | if (!mddev->gendisk) |
2416 | return 0; |
2417 | |
2418 | bi_mddev = blk_get_integrity(disk: mddev->gendisk); |
2419 | |
2420 | if (!bi_mddev) /* nothing to do */ |
2421 | return 0; |
2422 | |
2423 | if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { |
2424 | pr_err("%s: incompatible integrity profile for %pg\n" , |
2425 | mdname(mddev), rdev->bdev); |
2426 | return -ENXIO; |
2427 | } |
2428 | |
2429 | return 0; |
2430 | } |
2431 | EXPORT_SYMBOL(md_integrity_add_rdev); |
2432 | |
2433 | static bool rdev_read_only(struct md_rdev *rdev) |
2434 | { |
2435 | return bdev_read_only(bdev: rdev->bdev) || |
2436 | (rdev->meta_bdev && bdev_read_only(bdev: rdev->meta_bdev)); |
2437 | } |
2438 | |
2439 | static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) |
2440 | { |
2441 | char b[BDEVNAME_SIZE]; |
2442 | int err; |
2443 | |
2444 | /* prevent duplicates */ |
2445 | if (find_rdev(mddev, dev: rdev->bdev->bd_dev)) |
2446 | return -EEXIST; |
2447 | |
2448 | if (rdev_read_only(rdev) && mddev->pers) |
2449 | return -EROFS; |
2450 | |
2451 | /* make sure rdev->sectors exceeds mddev->dev_sectors */ |
2452 | if (!test_bit(Journal, &rdev->flags) && |
2453 | rdev->sectors && |
2454 | (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { |
2455 | if (mddev->pers) { |
2456 | /* Cannot change size, so fail |
2457 | * If mddev->level <= 0, then we don't care |
2458 | * about aligning sizes (e.g. linear) |
2459 | */ |
2460 | if (mddev->level > 0) |
2461 | return -ENOSPC; |
2462 | } else |
2463 | mddev->dev_sectors = rdev->sectors; |
2464 | } |
2465 | |
2466 | /* Verify rdev->desc_nr is unique. |
2467 | * If it is -1, assign a free number, else |
2468 | * check number is not in use |
2469 | */ |
2470 | rcu_read_lock(); |
2471 | if (rdev->desc_nr < 0) { |
2472 | int choice = 0; |
2473 | if (mddev->pers) |
2474 | choice = mddev->raid_disks; |
2475 | while (md_find_rdev_nr_rcu(mddev, choice)) |
2476 | choice++; |
2477 | rdev->desc_nr = choice; |
2478 | } else { |
2479 | if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { |
2480 | rcu_read_unlock(); |
2481 | return -EBUSY; |
2482 | } |
2483 | } |
2484 | rcu_read_unlock(); |
2485 | if (!test_bit(Journal, &rdev->flags) && |
2486 | mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { |
2487 | pr_warn("md: %s: array is limited to %d devices\n" , |
2488 | mdname(mddev), mddev->max_disks); |
2489 | return -EBUSY; |
2490 | } |
2491 | snprintf(buf: b, size: sizeof(b), fmt: "%pg" , rdev->bdev); |
2492 | strreplace(str: b, old: '/', new: '!'); |
2493 | |
2494 | rdev->mddev = mddev; |
2495 | pr_debug("md: bind<%s>\n" , b); |
2496 | |
2497 | if (mddev->raid_disks) |
2498 | mddev_create_serial_pool(mddev, rdev); |
2499 | |
2500 | if ((err = kobject_add(kobj: &rdev->kobj, parent: &mddev->kobj, fmt: "dev-%s" , b))) |
2501 | goto fail; |
2502 | |
2503 | /* failure here is OK */ |
2504 | err = sysfs_create_link(kobj: &rdev->kobj, bdev_kobj(rdev->bdev), name: "block" ); |
2505 | rdev->sysfs_state = sysfs_get_dirent_safe(sd: rdev->kobj.sd, name: "state" ); |
2506 | rdev->sysfs_unack_badblocks = |
2507 | sysfs_get_dirent_safe(sd: rdev->kobj.sd, name: "unacknowledged_bad_blocks" ); |
2508 | rdev->sysfs_badblocks = |
2509 | sysfs_get_dirent_safe(sd: rdev->kobj.sd, name: "bad_blocks" ); |
2510 | |
2511 | list_add_rcu(new: &rdev->same_set, head: &mddev->disks); |
2512 | bd_link_disk_holder(bdev: rdev->bdev, disk: mddev->gendisk); |
2513 | |
2514 | /* May as well allow recovery to be retried once */ |
2515 | mddev->recovery_disabled++; |
2516 | |
2517 | return 0; |
2518 | |
2519 | fail: |
2520 | pr_warn("md: failed to register dev-%s for %s\n" , |
2521 | b, mdname(mddev)); |
2522 | return err; |
2523 | } |
2524 | |
2525 | void md_autodetect_dev(dev_t dev); |
2526 | |
2527 | /* just for claiming the bdev */ |
2528 | static struct md_rdev claim_rdev; |
2529 | |
2530 | static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) |
2531 | { |
2532 | pr_debug("md: export_rdev(%pg)\n" , rdev->bdev); |
2533 | md_rdev_clear(rdev); |
2534 | #ifndef MODULE |
2535 | if (test_bit(AutoDetected, &rdev->flags)) |
2536 | md_autodetect_dev(dev: rdev->bdev->bd_dev); |
2537 | #endif |
2538 | bdev_release(handle: rdev->bdev_handle); |
2539 | rdev->bdev = NULL; |
2540 | kobject_put(kobj: &rdev->kobj); |
2541 | } |
2542 | |
2543 | static void md_kick_rdev_from_array(struct md_rdev *rdev) |
2544 | { |
2545 | struct mddev *mddev = rdev->mddev; |
2546 | |
2547 | bd_unlink_disk_holder(bdev: rdev->bdev, disk: rdev->mddev->gendisk); |
2548 | list_del_rcu(entry: &rdev->same_set); |
2549 | pr_debug("md: unbind<%pg>\n" , rdev->bdev); |
2550 | mddev_destroy_serial_pool(mddev: rdev->mddev, rdev); |
2551 | rdev->mddev = NULL; |
2552 | sysfs_remove_link(kobj: &rdev->kobj, name: "block" ); |
2553 | sysfs_put(kn: rdev->sysfs_state); |
2554 | sysfs_put(kn: rdev->sysfs_unack_badblocks); |
2555 | sysfs_put(kn: rdev->sysfs_badblocks); |
2556 | rdev->sysfs_state = NULL; |
2557 | rdev->sysfs_unack_badblocks = NULL; |
2558 | rdev->sysfs_badblocks = NULL; |
2559 | rdev->badblocks.count = 0; |
2560 | |
2561 | synchronize_rcu(); |
2562 | |
2563 | /* |
2564 | * kobject_del() will wait for all in progress writers to be done, where |
2565 | * reconfig_mutex is held, hence it can't be called under |
2566 | * reconfig_mutex and it's delayed to mddev_unlock(). |
2567 | */ |
2568 | list_add(new: &rdev->same_set, head: &mddev->deleting); |
2569 | } |
2570 | |
2571 | static void export_array(struct mddev *mddev) |
2572 | { |
2573 | struct md_rdev *rdev; |
2574 | |
2575 | while (!list_empty(head: &mddev->disks)) { |
2576 | rdev = list_first_entry(&mddev->disks, struct md_rdev, |
2577 | same_set); |
2578 | md_kick_rdev_from_array(rdev); |
2579 | } |
2580 | mddev->raid_disks = 0; |
2581 | mddev->major_version = 0; |
2582 | } |
2583 | |
2584 | static bool set_in_sync(struct mddev *mddev) |
2585 | { |
2586 | lockdep_assert_held(&mddev->lock); |
2587 | if (!mddev->in_sync) { |
2588 | mddev->sync_checkers++; |
2589 | spin_unlock(lock: &mddev->lock); |
2590 | percpu_ref_switch_to_atomic_sync(ref: &mddev->writes_pending); |
2591 | spin_lock(lock: &mddev->lock); |
2592 | if (!mddev->in_sync && |
2593 | percpu_ref_is_zero(ref: &mddev->writes_pending)) { |
2594 | mddev->in_sync = 1; |
2595 | /* |
2596 | * Ensure ->in_sync is visible before we clear |
2597 | * ->sync_checkers. |
2598 | */ |
2599 | smp_mb(); |
2600 | set_bit(nr: MD_SB_CHANGE_CLEAN, addr: &mddev->sb_flags); |
2601 | sysfs_notify_dirent_safe(sd: mddev->sysfs_state); |
2602 | } |
2603 | if (--mddev->sync_checkers == 0) |
2604 | percpu_ref_switch_to_percpu(ref: &mddev->writes_pending); |
2605 | } |
2606 | if (mddev->safemode == 1) |
2607 | mddev->safemode = 0; |
2608 | return mddev->in_sync; |
2609 | } |
2610 | |
2611 | static void sync_sbs(struct mddev *mddev, int nospares) |
2612 | { |
2613 | /* Update each superblock (in-memory image), but |
2614 | * if we are allowed to, skip spares which already |
2615 | * have the right event counter, or have one earlier |
2616 | * (which would mean they aren't being marked as dirty |
2617 | * with the rest of the array) |
2618 | */ |
2619 | struct md_rdev *rdev; |
2620 | rdev_for_each(rdev, mddev) { |
2621 | if (rdev->sb_events == mddev->events || |
2622 | (nospares && |
2623 | rdev->raid_disk < 0 && |
2624 | rdev->sb_events+1 == mddev->events)) { |
2625 | /* Don't update this superblock */ |
2626 | rdev->sb_loaded = 2; |
2627 | } else { |
2628 | sync_super(mddev, rdev); |
2629 | rdev->sb_loaded = 1; |
2630 | } |
2631 | } |
2632 | } |
2633 | |
2634 | static bool does_sb_need_changing(struct mddev *mddev) |
2635 | { |
2636 | struct md_rdev *rdev = NULL, *iter; |
2637 | struct mdp_superblock_1 *sb; |
2638 | int role; |
2639 | |
2640 | /* Find a good rdev */ |
2641 | rdev_for_each(iter, mddev) |
2642 | if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { |
2643 | rdev = iter; |
2644 | break; |
2645 | } |
2646 | |
2647 | /* No good device found. */ |
2648 | if (!rdev) |
2649 | return false; |
2650 | |
2651 | sb = page_address(rdev->sb_page); |
2652 | /* Check if a device has become faulty or a spare become active */ |
2653 | rdev_for_each(rdev, mddev) { |
2654 | role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); |
2655 | /* Device activated? */ |
2656 | if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && |
2657 | !test_bit(Faulty, &rdev->flags)) |
2658 | return true; |
2659 | /* Device turned faulty? */ |
2660 | if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) |
2661 | return true; |
2662 | } |
2663 | |
2664 | /* Check if any mddev parameters have changed */ |
2665 | if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || |
2666 | (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || |
2667 | (mddev->layout != le32_to_cpu(sb->layout)) || |
2668 | (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || |
2669 | (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) |
2670 | return true; |
2671 | |
2672 | return false; |
2673 | } |
2674 | |
2675 | void md_update_sb(struct mddev *mddev, int force_change) |
2676 | { |
2677 | struct md_rdev *rdev; |
2678 | int sync_req; |
2679 | int nospares = 0; |
2680 | int any_badblocks_changed = 0; |
2681 | int ret = -1; |
2682 | |
2683 | if (!md_is_rdwr(mddev)) { |
2684 | if (force_change) |
2685 | set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags); |
2686 | return; |
2687 | } |
2688 | |
2689 | repeat: |
2690 | if (mddev_is_clustered(mddev)) { |
2691 | if (test_and_clear_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags)) |
2692 | force_change = 1; |
2693 | if (test_and_clear_bit(nr: MD_SB_CHANGE_CLEAN, addr: &mddev->sb_flags)) |
2694 | nospares = 1; |
2695 | ret = md_cluster_ops->metadata_update_start(mddev); |
2696 | /* Has someone else has updated the sb */ |
2697 | if (!does_sb_need_changing(mddev)) { |
2698 | if (ret == 0) |
2699 | md_cluster_ops->metadata_update_cancel(mddev); |
2700 | bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), |
2701 | BIT(MD_SB_CHANGE_DEVS) | |
2702 | BIT(MD_SB_CHANGE_CLEAN)); |
2703 | return; |
2704 | } |
2705 | } |
2706 | |
2707 | /* |
2708 | * First make sure individual recovery_offsets are correct |
2709 | * curr_resync_completed can only be used during recovery. |
2710 | * During reshape/resync it might use array-addresses rather |
2711 | * that device addresses. |
2712 | */ |
2713 | rdev_for_each(rdev, mddev) { |
2714 | if (rdev->raid_disk >= 0 && |
2715 | mddev->delta_disks >= 0 && |
2716 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && |
2717 | test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && |
2718 | !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
2719 | !test_bit(Journal, &rdev->flags) && |
2720 | !test_bit(In_sync, &rdev->flags) && |
2721 | mddev->curr_resync_completed > rdev->recovery_offset) |
2722 | rdev->recovery_offset = mddev->curr_resync_completed; |
2723 | |
2724 | } |
2725 | if (!mddev->persistent) { |
2726 | clear_bit(nr: MD_SB_CHANGE_CLEAN, addr: &mddev->sb_flags); |
2727 | clear_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags); |
2728 | if (!mddev->external) { |
2729 | clear_bit(nr: MD_SB_CHANGE_PENDING, addr: &mddev->sb_flags); |
2730 | rdev_for_each(rdev, mddev) { |
2731 | if (rdev->badblocks.changed) { |
2732 | rdev->badblocks.changed = 0; |
2733 | ack_all_badblocks(bb: &rdev->badblocks); |
2734 | md_error(mddev, rdev); |
2735 | } |
2736 | clear_bit(nr: Blocked, addr: &rdev->flags); |
2737 | clear_bit(nr: BlockedBadBlocks, addr: &rdev->flags); |
2738 | wake_up(&rdev->blocked_wait); |
2739 | } |
2740 | } |
2741 | wake_up(&mddev->sb_wait); |
2742 | return; |
2743 | } |
2744 | |
2745 | spin_lock(lock: &mddev->lock); |
2746 | |
2747 | mddev->utime = ktime_get_real_seconds(); |
2748 | |
2749 | if (test_and_clear_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags)) |
2750 | force_change = 1; |
2751 | if (test_and_clear_bit(nr: MD_SB_CHANGE_CLEAN, addr: &mddev->sb_flags)) |
2752 | /* just a clean<-> dirty transition, possibly leave spares alone, |
2753 | * though if events isn't the right even/odd, we will have to do |
2754 | * spares after all |
2755 | */ |
2756 | nospares = 1; |
2757 | if (force_change) |
2758 | nospares = 0; |
2759 | if (mddev->degraded) |
2760 | /* If the array is degraded, then skipping spares is both |
2761 | * dangerous and fairly pointless. |
2762 | * Dangerous because a device that was removed from the array |
2763 | * might have a event_count that still looks up-to-date, |
2764 | * so it can be re-added without a resync. |
2765 | * Pointless because if there are any spares to skip, |
2766 | * then a recovery will happen and soon that array won't |
2767 | * be degraded any more and the spare can go back to sleep then. |
2768 | */ |
2769 | nospares = 0; |
2770 | |
2771 | sync_req = mddev->in_sync; |
2772 | |
2773 | /* If this is just a dirty<->clean transition, and the array is clean |
2774 | * and 'events' is odd, we can roll back to the previous clean state */ |
2775 | if (nospares |
2776 | && (mddev->in_sync && mddev->recovery_cp == MaxSector) |
2777 | && mddev->can_decrease_events |
2778 | && mddev->events != 1) { |
2779 | mddev->events--; |
2780 | mddev->can_decrease_events = 0; |
2781 | } else { |
2782 | /* otherwise we have to go forward and ... */ |
2783 | mddev->events ++; |
2784 | mddev->can_decrease_events = nospares; |
2785 | } |
2786 | |
2787 | /* |
2788 | * This 64-bit counter should never wrap. |
2789 | * Either we are in around ~1 trillion A.C., assuming |
2790 | * 1 reboot per second, or we have a bug... |
2791 | */ |
2792 | WARN_ON(mddev->events == 0); |
2793 | |
2794 | rdev_for_each(rdev, mddev) { |
2795 | if (rdev->badblocks.changed) |
2796 | any_badblocks_changed++; |
2797 | if (test_bit(Faulty, &rdev->flags)) |
2798 | set_bit(nr: FaultRecorded, addr: &rdev->flags); |
2799 | } |
2800 | |
2801 | sync_sbs(mddev, nospares); |
2802 | spin_unlock(lock: &mddev->lock); |
2803 | |
2804 | pr_debug("md: updating %s RAID superblock on device (in sync %d)\n" , |
2805 | mdname(mddev), mddev->in_sync); |
2806 | |
2807 | if (mddev->queue) |
2808 | blk_add_trace_msg(mddev->queue, "md md_update_sb" ); |
2809 | rewrite: |
2810 | md_bitmap_update_sb(bitmap: mddev->bitmap); |
2811 | rdev_for_each(rdev, mddev) { |
2812 | if (rdev->sb_loaded != 1) |
2813 | continue; /* no noise on spare devices */ |
2814 | |
2815 | if (!test_bit(Faulty, &rdev->flags)) { |
2816 | md_super_write(mddev,rdev, |
2817 | sector: rdev->sb_start, size: rdev->sb_size, |
2818 | page: rdev->sb_page); |
2819 | pr_debug("md: (write) %pg's sb offset: %llu\n" , |
2820 | rdev->bdev, |
2821 | (unsigned long long)rdev->sb_start); |
2822 | rdev->sb_events = mddev->events; |
2823 | if (rdev->badblocks.size) { |
2824 | md_super_write(mddev, rdev, |
2825 | sector: rdev->badblocks.sector, |
2826 | size: rdev->badblocks.size << 9, |
2827 | page: rdev->bb_page); |
2828 | rdev->badblocks.size = 0; |
2829 | } |
2830 | |
2831 | } else |
2832 | pr_debug("md: %pg (skipping faulty)\n" , |
2833 | rdev->bdev); |
2834 | |
2835 | if (mddev->level == LEVEL_MULTIPATH) |
2836 | /* only need to write one superblock... */ |
2837 | break; |
2838 | } |
2839 | if (md_super_wait(mddev) < 0) |
2840 | goto rewrite; |
2841 | /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ |
2842 | |
2843 | if (mddev_is_clustered(mddev) && ret == 0) |
2844 | md_cluster_ops->metadata_update_finish(mddev); |
2845 | |
2846 | if (mddev->in_sync != sync_req || |
2847 | !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), |
2848 | BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) |
2849 | /* have to write it out again */ |
2850 | goto repeat; |
2851 | wake_up(&mddev->sb_wait); |
2852 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
2853 | sysfs_notify_dirent_safe(sd: mddev->sysfs_completed); |
2854 | |
2855 | rdev_for_each(rdev, mddev) { |
2856 | if (test_and_clear_bit(nr: FaultRecorded, addr: &rdev->flags)) |
2857 | clear_bit(nr: Blocked, addr: &rdev->flags); |
2858 | |
2859 | if (any_badblocks_changed) |
2860 | ack_all_badblocks(bb: &rdev->badblocks); |
2861 | clear_bit(nr: BlockedBadBlocks, addr: &rdev->flags); |
2862 | wake_up(&rdev->blocked_wait); |
2863 | } |
2864 | } |
2865 | EXPORT_SYMBOL(md_update_sb); |
2866 | |
2867 | static int add_bound_rdev(struct md_rdev *rdev) |
2868 | { |
2869 | struct mddev *mddev = rdev->mddev; |
2870 | int err = 0; |
2871 | bool add_journal = test_bit(Journal, &rdev->flags); |
2872 | |
2873 | if (!mddev->pers->hot_remove_disk || add_journal) { |
2874 | /* If there is hot_add_disk but no hot_remove_disk |
2875 | * then added disks for geometry changes, |
2876 | * and should be added immediately. |
2877 | */ |
2878 | super_types[mddev->major_version]. |
2879 | validate_super(mddev, rdev); |
2880 | err = mddev->pers->hot_add_disk(mddev, rdev); |
2881 | if (err) { |
2882 | md_kick_rdev_from_array(rdev); |
2883 | return err; |
2884 | } |
2885 | } |
2886 | sysfs_notify_dirent_safe(sd: rdev->sysfs_state); |
2887 | |
2888 | set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags); |
2889 | if (mddev->degraded) |
2890 | set_bit(nr: MD_RECOVERY_RECOVER, addr: &mddev->recovery); |
2891 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
2892 | md_new_event(); |
2893 | md_wakeup_thread(thread: mddev->thread); |
2894 | return 0; |
2895 | } |
2896 | |
/* Words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case.  For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* Compare cmd, as written into a sysfs file, against str.
	 * Returns 1 when they are identical or when cmd is str
	 * followed by exactly one newline; returns 0 otherwise.
	 */
	const char *c = cmd;
	const char *s = str;

	/* Skip over the common prefix. */
	for (; *c != '\0' && *s != '\0'; c++, s++) {
		if (*c != *s)
			break;
	}

	/* Tolerate a single trailing newline on the written word. */
	if (*c == '\n')
		c++;

	/* Both strings must now be fully consumed. */
	return *s == '\0' && *c == '\0';
}
2916 | |
2917 | struct rdev_sysfs_entry { |
2918 | struct attribute attr; |
2919 | ssize_t (*show)(struct md_rdev *, char *); |
2920 | ssize_t (*store)(struct md_rdev *, const char *, size_t); |
2921 | }; |
2922 | |
2923 | static ssize_t |
2924 | state_show(struct md_rdev *rdev, char *page) |
2925 | { |
2926 | char *sep = "," ; |
2927 | size_t len = 0; |
2928 | unsigned long flags = READ_ONCE(rdev->flags); |
2929 | |
2930 | if (test_bit(Faulty, &flags) || |
2931 | (!test_bit(ExternalBbl, &flags) && |
2932 | rdev->badblocks.unacked_exist)) |
2933 | len += sprintf(buf: page+len, fmt: "faulty%s" , sep); |
2934 | if (test_bit(In_sync, &flags)) |
2935 | len += sprintf(buf: page+len, fmt: "in_sync%s" , sep); |
2936 | if (test_bit(Journal, &flags)) |
2937 | len += sprintf(buf: page+len, fmt: "journal%s" , sep); |
2938 | if (test_bit(WriteMostly, &flags)) |
2939 | len += sprintf(buf: page+len, fmt: "write_mostly%s" , sep); |
2940 | if (test_bit(Blocked, &flags) || |
2941 | (rdev->badblocks.unacked_exist |
2942 | && !test_bit(Faulty, &flags))) |
2943 | len += sprintf(buf: page+len, fmt: "blocked%s" , sep); |
2944 | if (!test_bit(Faulty, &flags) && |
2945 | !test_bit(Journal, &flags) && |
2946 | !test_bit(In_sync, &flags)) |
2947 | len += sprintf(buf: page+len, fmt: "spare%s" , sep); |
2948 | if (test_bit(WriteErrorSeen, &flags)) |
2949 | len += sprintf(buf: page+len, fmt: "write_error%s" , sep); |
2950 | if (test_bit(WantReplacement, &flags)) |
2951 | len += sprintf(buf: page+len, fmt: "want_replacement%s" , sep); |
2952 | if (test_bit(Replacement, &flags)) |
2953 | len += sprintf(buf: page+len, fmt: "replacement%s" , sep); |
2954 | if (test_bit(ExternalBbl, &flags)) |
2955 | len += sprintf(buf: page+len, fmt: "external_bbl%s" , sep); |
2956 | if (test_bit(FailFast, &flags)) |
2957 | len += sprintf(buf: page+len, fmt: "failfast%s" , sep); |
2958 | |
2959 | if (len) |
2960 | len -= strlen(sep); |
2961 | |
2962 | return len+sprintf(buf: page+len, fmt: "\n" ); |
2963 | } |
2964 | |
2965 | static ssize_t |
2966 | state_store(struct md_rdev *rdev, const char *buf, size_t len) |
2967 | { |
2968 | /* can write |
2969 | * faulty - simulates an error |
2970 | * remove - disconnects the device |
2971 | * writemostly - sets write_mostly |
2972 | * -writemostly - clears write_mostly |
2973 | * blocked - sets the Blocked flags |
2974 | * -blocked - clears the Blocked and possibly simulates an error |
2975 | * insync - sets Insync providing device isn't active |
2976 | * -insync - clear Insync for a device with a slot assigned, |
2977 | * so that it gets rebuilt based on bitmap |
2978 | * write_error - sets WriteErrorSeen |
2979 | * -write_error - clears WriteErrorSeen |
2980 | * {,-}failfast - set/clear FailFast |
2981 | */ |
2982 | |
2983 | struct mddev *mddev = rdev->mddev; |
2984 | int err = -EINVAL; |
2985 | bool need_update_sb = false; |
2986 | |
2987 | if (cmd_match(cmd: buf, str: "faulty" ) && rdev->mddev->pers) { |
2988 | md_error(mddev: rdev->mddev, rdev); |
2989 | |
2990 | if (test_bit(MD_BROKEN, &rdev->mddev->flags)) |
2991 | err = -EBUSY; |
2992 | else |
2993 | err = 0; |
2994 | } else if (cmd_match(cmd: buf, str: "remove" )) { |
2995 | if (rdev->mddev->pers) { |
2996 | clear_bit(nr: Blocked, addr: &rdev->flags); |
2997 | remove_and_add_spares(mddev: rdev->mddev, this: rdev); |
2998 | } |
2999 | if (rdev->raid_disk >= 0) |
3000 | err = -EBUSY; |
3001 | else { |
3002 | err = 0; |
3003 | if (mddev_is_clustered(mddev)) |
3004 | err = md_cluster_ops->remove_disk(mddev, rdev); |
3005 | |
3006 | if (err == 0) { |
3007 | md_kick_rdev_from_array(rdev); |
3008 | if (mddev->pers) { |
3009 | set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags); |
3010 | md_wakeup_thread(thread: mddev->thread); |
3011 | } |
3012 | md_new_event(); |
3013 | } |
3014 | } |
3015 | } else if (cmd_match(cmd: buf, str: "writemostly" )) { |
3016 | set_bit(nr: WriteMostly, addr: &rdev->flags); |
3017 | mddev_create_serial_pool(mddev: rdev->mddev, rdev); |
3018 | need_update_sb = true; |
3019 | err = 0; |
3020 | } else if (cmd_match(cmd: buf, str: "-writemostly" )) { |
3021 | mddev_destroy_serial_pool(mddev: rdev->mddev, rdev); |
3022 | clear_bit(nr: WriteMostly, addr: &rdev->flags); |
3023 | need_update_sb = true; |
3024 | err = 0; |
3025 | } else if (cmd_match(cmd: buf, str: "blocked" )) { |
3026 | set_bit(nr: Blocked, addr: &rdev->flags); |
3027 | err = 0; |
3028 | } else if (cmd_match(cmd: buf, str: "-blocked" )) { |
3029 | if (!test_bit(Faulty, &rdev->flags) && |
3030 | !test_bit(ExternalBbl, &rdev->flags) && |
3031 | rdev->badblocks.unacked_exist) { |
3032 | /* metadata handler doesn't understand badblocks, |
3033 | * so we need to fail the device |
3034 | */ |
3035 | md_error(mddev: rdev->mddev, rdev); |
3036 | } |
3037 | clear_bit(nr: Blocked, addr: &rdev->flags); |
3038 | clear_bit(nr: BlockedBadBlocks, addr: &rdev->flags); |
3039 | wake_up(&rdev->blocked_wait); |
3040 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &rdev->mddev->recovery); |
3041 | md_wakeup_thread(thread: rdev->mddev->thread); |
3042 | |
3043 | err = 0; |
3044 | } else if (cmd_match(cmd: buf, str: "insync" ) && rdev->raid_disk == -1) { |
3045 | set_bit(nr: In_sync, addr: &rdev->flags); |
3046 | err = 0; |
3047 | } else if (cmd_match(cmd: buf, str: "failfast" )) { |
3048 | set_bit(nr: FailFast, addr: &rdev->flags); |
3049 | need_update_sb = true; |
3050 | err = 0; |
3051 | } else if (cmd_match(cmd: buf, str: "-failfast" )) { |
3052 | clear_bit(nr: FailFast, addr: &rdev->flags); |
3053 | need_update_sb = true; |
3054 | err = 0; |
3055 | } else if (cmd_match(cmd: buf, str: "-insync" ) && rdev->raid_disk >= 0 && |
3056 | !test_bit(Journal, &rdev->flags)) { |
3057 | if (rdev->mddev->pers == NULL) { |
3058 | clear_bit(nr: In_sync, addr: &rdev->flags); |
3059 | rdev->saved_raid_disk = rdev->raid_disk; |
3060 | rdev->raid_disk = -1; |
3061 | err = 0; |
3062 | } |
3063 | } else if (cmd_match(cmd: buf, str: "write_error" )) { |
3064 | set_bit(nr: WriteErrorSeen, addr: &rdev->flags); |
3065 | err = 0; |
3066 | } else if (cmd_match(cmd: buf, str: "-write_error" )) { |
3067 | clear_bit(nr: WriteErrorSeen, addr: &rdev->flags); |
3068 | err = 0; |
3069 | } else if (cmd_match(cmd: buf, str: "want_replacement" )) { |
3070 | /* Any non-spare device that is not a replacement can |
3071 | * become want_replacement at any time, but we then need to |
3072 | * check if recovery is needed. |
3073 | */ |
3074 | if (rdev->raid_disk >= 0 && |
3075 | !test_bit(Journal, &rdev->flags) && |
3076 | !test_bit(Replacement, &rdev->flags)) |
3077 | set_bit(nr: WantReplacement, addr: &rdev->flags); |
3078 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &rdev->mddev->recovery); |
3079 | md_wakeup_thread(thread: rdev->mddev->thread); |
3080 | err = 0; |
3081 | } else if (cmd_match(cmd: buf, str: "-want_replacement" )) { |
3082 | /* Clearing 'want_replacement' is always allowed. |
3083 | * Once replacements starts it is too late though. |
3084 | */ |
3085 | err = 0; |
3086 | clear_bit(nr: WantReplacement, addr: &rdev->flags); |
3087 | } else if (cmd_match(cmd: buf, str: "replacement" )) { |
3088 | /* Can only set a device as a replacement when array has not |
3089 | * yet been started. Once running, replacement is automatic |
3090 | * from spares, or by assigning 'slot'. |
3091 | */ |
3092 | if (rdev->mddev->pers) |
3093 | err = -EBUSY; |
3094 | else { |
3095 | set_bit(nr: Replacement, addr: &rdev->flags); |
3096 | err = 0; |
3097 | } |
3098 | } else if (cmd_match(cmd: buf, str: "-replacement" )) { |
3099 | /* Similarly, can only clear Replacement before start */ |
3100 | if (rdev->mddev->pers) |
3101 | err = -EBUSY; |
3102 | else { |
3103 | clear_bit(nr: Replacement, addr: &rdev->flags); |
3104 | err = 0; |
3105 | } |
3106 | } else if (cmd_match(cmd: buf, str: "re-add" )) { |
3107 | if (!rdev->mddev->pers) |
3108 | err = -EINVAL; |
3109 | else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && |
3110 | rdev->saved_raid_disk >= 0) { |
3111 | /* clear_bit is performed _after_ all the devices |
3112 | * have their local Faulty bit cleared. If any writes |
3113 | * happen in the meantime in the local node, they |
3114 | * will land in the local bitmap, which will be synced |
3115 | * by this node eventually |
3116 | */ |
3117 | if (!mddev_is_clustered(mddev: rdev->mddev) || |
3118 | (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { |
3119 | clear_bit(nr: Faulty, addr: &rdev->flags); |
3120 | err = add_bound_rdev(rdev); |
3121 | } |
3122 | } else |
3123 | err = -EBUSY; |
3124 | } else if (cmd_match(cmd: buf, str: "external_bbl" ) && (rdev->mddev->external)) { |
3125 | set_bit(nr: ExternalBbl, addr: &rdev->flags); |
3126 | rdev->badblocks.shift = 0; |
3127 | err = 0; |
3128 | } else if (cmd_match(cmd: buf, str: "-external_bbl" ) && (rdev->mddev->external)) { |
3129 | clear_bit(nr: ExternalBbl, addr: &rdev->flags); |
3130 | err = 0; |
3131 | } |
3132 | if (need_update_sb) |
3133 | md_update_sb(mddev, 1); |
3134 | if (!err) |
3135 | sysfs_notify_dirent_safe(sd: rdev->sysfs_state); |
3136 | return err ? err : len; |
3137 | } |
3138 | static struct rdev_sysfs_entry rdev_state = |
3139 | __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); |
3140 | |
3141 | static ssize_t |
3142 | errors_show(struct md_rdev *rdev, char *page) |
3143 | { |
3144 | return sprintf(buf: page, fmt: "%d\n" , atomic_read(v: &rdev->corrected_errors)); |
3145 | } |
3146 | |
3147 | static ssize_t |
3148 | errors_store(struct md_rdev *rdev, const char *buf, size_t len) |
3149 | { |
3150 | unsigned int n; |
3151 | int rv; |
3152 | |
3153 | rv = kstrtouint(s: buf, base: 10, res: &n); |
3154 | if (rv < 0) |
3155 | return rv; |
3156 | atomic_set(v: &rdev->corrected_errors, i: n); |
3157 | return len; |
3158 | } |
3159 | static struct rdev_sysfs_entry rdev_errors = |
3160 | __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); |
3161 | |
3162 | static ssize_t |
3163 | slot_show(struct md_rdev *rdev, char *page) |
3164 | { |
3165 | if (test_bit(Journal, &rdev->flags)) |
3166 | return sprintf(buf: page, fmt: "journal\n" ); |
3167 | else if (rdev->raid_disk < 0) |
3168 | return sprintf(buf: page, fmt: "none\n" ); |
3169 | else |
3170 | return sprintf(buf: page, fmt: "%d\n" , rdev->raid_disk); |
3171 | } |
3172 | |
3173 | static ssize_t |
3174 | slot_store(struct md_rdev *rdev, const char *buf, size_t len) |
3175 | { |
3176 | int slot; |
3177 | int err; |
3178 | |
3179 | if (test_bit(Journal, &rdev->flags)) |
3180 | return -EBUSY; |
3181 | if (strncmp(buf, "none" , 4)==0) |
3182 | slot = -1; |
3183 | else { |
3184 | err = kstrtouint(s: buf, base: 10, res: (unsigned int *)&slot); |
3185 | if (err < 0) |
3186 | return err; |
3187 | if (slot < 0) |
3188 | /* overflow */ |
3189 | return -ENOSPC; |
3190 | } |
3191 | if (rdev->mddev->pers && slot == -1) { |
3192 | /* Setting 'slot' on an active array requires also |
3193 | * updating the 'rd%d' link, and communicating |
3194 | * with the personality with ->hot_*_disk. |
3195 | * For now we only support removing |
3196 | * failed/spare devices. This normally happens automatically, |
3197 | * but not when the metadata is externally managed. |
3198 | */ |
3199 | if (rdev->raid_disk == -1) |
3200 | return -EEXIST; |
3201 | /* personality does all needed checks */ |
3202 | if (rdev->mddev->pers->hot_remove_disk == NULL) |
3203 | return -EINVAL; |
3204 | clear_bit(nr: Blocked, addr: &rdev->flags); |
3205 | remove_and_add_spares(mddev: rdev->mddev, this: rdev); |
3206 | if (rdev->raid_disk >= 0) |
3207 | return -EBUSY; |
3208 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &rdev->mddev->recovery); |
3209 | md_wakeup_thread(thread: rdev->mddev->thread); |
3210 | } else if (rdev->mddev->pers) { |
3211 | /* Activating a spare .. or possibly reactivating |
3212 | * if we ever get bitmaps working here. |
3213 | */ |
3214 | int err; |
3215 | |
3216 | if (rdev->raid_disk != -1) |
3217 | return -EBUSY; |
3218 | |
3219 | if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) |
3220 | return -EBUSY; |
3221 | |
3222 | if (rdev->mddev->pers->hot_add_disk == NULL) |
3223 | return -EINVAL; |
3224 | |
3225 | if (slot >= rdev->mddev->raid_disks && |
3226 | slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) |
3227 | return -ENOSPC; |
3228 | |
3229 | rdev->raid_disk = slot; |
3230 | if (test_bit(In_sync, &rdev->flags)) |
3231 | rdev->saved_raid_disk = slot; |
3232 | else |
3233 | rdev->saved_raid_disk = -1; |
3234 | clear_bit(nr: In_sync, addr: &rdev->flags); |
3235 | clear_bit(nr: Bitmap_sync, addr: &rdev->flags); |
3236 | err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); |
3237 | if (err) { |
3238 | rdev->raid_disk = -1; |
3239 | return err; |
3240 | } else |
3241 | sysfs_notify_dirent_safe(sd: rdev->sysfs_state); |
3242 | /* failure here is OK */; |
3243 | sysfs_link_rdev(mddev: rdev->mddev, rdev); |
3244 | /* don't wakeup anyone, leave that to userspace. */ |
3245 | } else { |
3246 | if (slot >= rdev->mddev->raid_disks && |
3247 | slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) |
3248 | return -ENOSPC; |
3249 | rdev->raid_disk = slot; |
3250 | /* assume it is working */ |
3251 | clear_bit(nr: Faulty, addr: &rdev->flags); |
3252 | clear_bit(nr: WriteMostly, addr: &rdev->flags); |
3253 | set_bit(nr: In_sync, addr: &rdev->flags); |
3254 | sysfs_notify_dirent_safe(sd: rdev->sysfs_state); |
3255 | } |
3256 | return len; |
3257 | } |
3258 | |
3259 | static struct rdev_sysfs_entry rdev_slot = |
3260 | __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); |
3261 | |
3262 | static ssize_t |
3263 | offset_show(struct md_rdev *rdev, char *page) |
3264 | { |
3265 | return sprintf(buf: page, fmt: "%llu\n" , (unsigned long long)rdev->data_offset); |
3266 | } |
3267 | |
3268 | static ssize_t |
3269 | offset_store(struct md_rdev *rdev, const char *buf, size_t len) |
3270 | { |
3271 | unsigned long long offset; |
3272 | if (kstrtoull(s: buf, base: 10, res: &offset) < 0) |
3273 | return -EINVAL; |
3274 | if (rdev->mddev->pers && rdev->raid_disk >= 0) |
3275 | return -EBUSY; |
3276 | if (rdev->sectors && rdev->mddev->external) |
3277 | /* Must set offset before size, so overlap checks |
3278 | * can be sane */ |
3279 | return -EBUSY; |
3280 | rdev->data_offset = offset; |
3281 | rdev->new_data_offset = offset; |
3282 | return len; |
3283 | } |
3284 | |
3285 | static struct rdev_sysfs_entry rdev_offset = |
3286 | __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); |
3287 | |
3288 | static ssize_t new_offset_show(struct md_rdev *rdev, char *page) |
3289 | { |
3290 | return sprintf(buf: page, fmt: "%llu\n" , |
3291 | (unsigned long long)rdev->new_data_offset); |
3292 | } |
3293 | |
3294 | static ssize_t new_offset_store(struct md_rdev *rdev, |
3295 | const char *buf, size_t len) |
3296 | { |
3297 | unsigned long long new_offset; |
3298 | struct mddev *mddev = rdev->mddev; |
3299 | |
3300 | if (kstrtoull(s: buf, base: 10, res: &new_offset) < 0) |
3301 | return -EINVAL; |
3302 | |
3303 | if (mddev->sync_thread || |
3304 | test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) |
3305 | return -EBUSY; |
3306 | if (new_offset == rdev->data_offset) |
3307 | /* reset is always permitted */ |
3308 | ; |
3309 | else if (new_offset > rdev->data_offset) { |
3310 | /* must not push array size beyond rdev_sectors */ |
3311 | if (new_offset - rdev->data_offset |
3312 | + mddev->dev_sectors > rdev->sectors) |
3313 | return -E2BIG; |
3314 | } |
3315 | /* Metadata worries about other space details. */ |
3316 | |
3317 | /* decreasing the offset is inconsistent with a backwards |
3318 | * reshape. |
3319 | */ |
3320 | if (new_offset < rdev->data_offset && |
3321 | mddev->reshape_backwards) |
3322 | return -EINVAL; |
3323 | /* Increasing offset is inconsistent with forwards |
3324 | * reshape. reshape_direction should be set to |
3325 | * 'backwards' first. |
3326 | */ |
3327 | if (new_offset > rdev->data_offset && |
3328 | !mddev->reshape_backwards) |
3329 | return -EINVAL; |
3330 | |
3331 | if (mddev->pers && mddev->persistent && |
3332 | !super_types[mddev->major_version] |
3333 | .allow_new_offset(rdev, new_offset)) |
3334 | return -E2BIG; |
3335 | rdev->new_data_offset = new_offset; |
3336 | if (new_offset > rdev->data_offset) |
3337 | mddev->reshape_backwards = 1; |
3338 | else if (new_offset < rdev->data_offset) |
3339 | mddev->reshape_backwards = 0; |
3340 | |
3341 | return len; |
3342 | } |
3343 | static struct rdev_sysfs_entry rdev_new_offset = |
3344 | __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); |
3345 | |
3346 | static ssize_t |
3347 | rdev_size_show(struct md_rdev *rdev, char *page) |
3348 | { |
3349 | return sprintf(buf: page, fmt: "%llu\n" , (unsigned long long)rdev->sectors / 2); |
3350 | } |
3351 | |
3352 | static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) |
3353 | { |
3354 | /* check if two start/length pairs overlap */ |
3355 | if (a->data_offset + a->sectors <= b->data_offset) |
3356 | return false; |
3357 | if (b->data_offset + b->sectors <= a->data_offset) |
3358 | return false; |
3359 | return true; |
3360 | } |
3361 | |
3362 | static bool md_rdev_overlaps(struct md_rdev *rdev) |
3363 | { |
3364 | struct mddev *mddev; |
3365 | struct md_rdev *rdev2; |
3366 | |
3367 | spin_lock(lock: &all_mddevs_lock); |
3368 | list_for_each_entry(mddev, &all_mddevs, all_mddevs) { |
3369 | if (test_bit(MD_DELETED, &mddev->flags)) |
3370 | continue; |
3371 | rdev_for_each(rdev2, mddev) { |
3372 | if (rdev != rdev2 && rdev->bdev == rdev2->bdev && |
3373 | md_rdevs_overlap(a: rdev, b: rdev2)) { |
3374 | spin_unlock(lock: &all_mddevs_lock); |
3375 | return true; |
3376 | } |
3377 | } |
3378 | } |
3379 | spin_unlock(lock: &all_mddevs_lock); |
3380 | return false; |
3381 | } |
3382 | |
3383 | static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) |
3384 | { |
3385 | unsigned long long blocks; |
3386 | sector_t new; |
3387 | |
3388 | if (kstrtoull(s: buf, base: 10, res: &blocks) < 0) |
3389 | return -EINVAL; |
3390 | |
3391 | if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) |
3392 | return -EINVAL; /* sector conversion overflow */ |
3393 | |
3394 | new = blocks * 2; |
3395 | if (new != blocks * 2) |
3396 | return -EINVAL; /* unsigned long long to sector_t overflow */ |
3397 | |
3398 | *sectors = new; |
3399 | return 0; |
3400 | } |
3401 | |
3402 | static ssize_t |
3403 | rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) |
3404 | { |
3405 | struct mddev *my_mddev = rdev->mddev; |
3406 | sector_t oldsectors = rdev->sectors; |
3407 | sector_t sectors; |
3408 | |
3409 | if (test_bit(Journal, &rdev->flags)) |
3410 | return -EBUSY; |
3411 | if (strict_blocks_to_sectors(buf, sectors: §ors) < 0) |
3412 | return -EINVAL; |
3413 | if (rdev->data_offset != rdev->new_data_offset) |
3414 | return -EINVAL; /* too confusing */ |
3415 | if (my_mddev->pers && rdev->raid_disk >= 0) { |
3416 | if (my_mddev->persistent) { |
3417 | sectors = super_types[my_mddev->major_version]. |
3418 | rdev_size_change(rdev, sectors); |
3419 | if (!sectors) |
3420 | return -EBUSY; |
3421 | } else if (!sectors) |
3422 | sectors = bdev_nr_sectors(bdev: rdev->bdev) - |
3423 | rdev->data_offset; |
3424 | if (!my_mddev->pers->resize) |
3425 | /* Cannot change size for RAID0 or Linear etc */ |
3426 | return -EINVAL; |
3427 | } |
3428 | if (sectors < my_mddev->dev_sectors) |
3429 | return -EINVAL; /* component must fit device */ |
3430 | |
3431 | rdev->sectors = sectors; |
3432 | |
3433 | /* |
3434 | * Check that all other rdevs with the same bdev do not overlap. This |
3435 | * check does not provide a hard guarantee, it just helps avoid |
3436 | * dangerous mistakes. |
3437 | */ |
3438 | if (sectors > oldsectors && my_mddev->external && |
3439 | md_rdev_overlaps(rdev)) { |
3440 | /* |
3441 | * Someone else could have slipped in a size change here, but |
3442 | * doing so is just silly. We put oldsectors back because we |
3443 | * know it is safe, and trust userspace not to race with itself. |
3444 | */ |
3445 | rdev->sectors = oldsectors; |
3446 | return -EBUSY; |
3447 | } |
3448 | return len; |
3449 | } |
3450 | |
3451 | static struct rdev_sysfs_entry rdev_size = |
3452 | __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); |
3453 | |
3454 | static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) |
3455 | { |
3456 | unsigned long long recovery_start = rdev->recovery_offset; |
3457 | |
3458 | if (test_bit(In_sync, &rdev->flags) || |
3459 | recovery_start == MaxSector) |
3460 | return sprintf(buf: page, fmt: "none\n" ); |
3461 | |
3462 | return sprintf(buf: page, fmt: "%llu\n" , recovery_start); |
3463 | } |
3464 | |
3465 | static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) |
3466 | { |
3467 | unsigned long long recovery_start; |
3468 | |
3469 | if (cmd_match(cmd: buf, str: "none" )) |
3470 | recovery_start = MaxSector; |
3471 | else if (kstrtoull(s: buf, base: 10, res: &recovery_start)) |
3472 | return -EINVAL; |
3473 | |
3474 | if (rdev->mddev->pers && |
3475 | rdev->raid_disk >= 0) |
3476 | return -EBUSY; |
3477 | |
3478 | rdev->recovery_offset = recovery_start; |
3479 | if (recovery_start == MaxSector) |
3480 | set_bit(nr: In_sync, addr: &rdev->flags); |
3481 | else |
3482 | clear_bit(nr: In_sync, addr: &rdev->flags); |
3483 | return len; |
3484 | } |
3485 | |
3486 | static struct rdev_sysfs_entry rdev_recovery_start = |
3487 | __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); |
3488 | |
3489 | /* sysfs access to bad-blocks list. |
3490 | * We present two files. |
3491 | * 'bad-blocks' lists sector numbers and lengths of ranges that |
3492 | * are recorded as bad. The list is truncated to fit within |
3493 | * the one-page limit of sysfs. |
3494 | * Writing "sector length" to this file adds an acknowledged |
3495 | * bad block list. |
3496 | * 'unacknowledged-bad-blocks' lists bad blocks that have not yet |
3497 | * been acknowledged. Writing to this file adds bad blocks |
3498 | * without acknowledging them. This is largely for testing. |
3499 | */ |
3500 | static ssize_t bb_show(struct md_rdev *rdev, char *page) |
3501 | { |
3502 | return badblocks_show(bb: &rdev->badblocks, page, unack: 0); |
3503 | } |
3504 | static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) |
3505 | { |
3506 | int rv = badblocks_store(bb: &rdev->badblocks, page, len, unack: 0); |
3507 | /* Maybe that ack was all we needed */ |
3508 | if (test_and_clear_bit(nr: BlockedBadBlocks, addr: &rdev->flags)) |
3509 | wake_up(&rdev->blocked_wait); |
3510 | return rv; |
3511 | } |
3512 | static struct rdev_sysfs_entry rdev_bad_blocks = |
3513 | __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); |
3514 | |
3515 | static ssize_t ubb_show(struct md_rdev *rdev, char *page) |
3516 | { |
3517 | return badblocks_show(bb: &rdev->badblocks, page, unack: 1); |
3518 | } |
3519 | static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) |
3520 | { |
3521 | return badblocks_store(bb: &rdev->badblocks, page, len, unack: 1); |
3522 | } |
3523 | static struct rdev_sysfs_entry rdev_unack_bad_blocks = |
3524 | __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); |
3525 | |
3526 | static ssize_t |
3527 | ppl_sector_show(struct md_rdev *rdev, char *page) |
3528 | { |
3529 | return sprintf(buf: page, fmt: "%llu\n" , (unsigned long long)rdev->ppl.sector); |
3530 | } |
3531 | |
3532 | static ssize_t |
3533 | ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) |
3534 | { |
3535 | unsigned long long sector; |
3536 | |
3537 | if (kstrtoull(s: buf, base: 10, res: §or) < 0) |
3538 | return -EINVAL; |
3539 | if (sector != (sector_t)sector) |
3540 | return -EINVAL; |
3541 | |
3542 | if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && |
3543 | rdev->raid_disk >= 0) |
3544 | return -EBUSY; |
3545 | |
3546 | if (rdev->mddev->persistent) { |
3547 | if (rdev->mddev->major_version == 0) |
3548 | return -EINVAL; |
3549 | if ((sector > rdev->sb_start && |
3550 | sector - rdev->sb_start > S16_MAX) || |
3551 | (sector < rdev->sb_start && |
3552 | rdev->sb_start - sector > -S16_MIN)) |
3553 | return -EINVAL; |
3554 | rdev->ppl.offset = sector - rdev->sb_start; |
3555 | } else if (!rdev->mddev->external) { |
3556 | return -EBUSY; |
3557 | } |
3558 | rdev->ppl.sector = sector; |
3559 | return len; |
3560 | } |
3561 | |
3562 | static struct rdev_sysfs_entry rdev_ppl_sector = |
3563 | __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); |
3564 | |
3565 | static ssize_t |
3566 | ppl_size_show(struct md_rdev *rdev, char *page) |
3567 | { |
3568 | return sprintf(buf: page, fmt: "%u\n" , rdev->ppl.size); |
3569 | } |
3570 | |
3571 | static ssize_t |
3572 | ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) |
3573 | { |
3574 | unsigned int size; |
3575 | |
3576 | if (kstrtouint(s: buf, base: 10, res: &size) < 0) |
3577 | return -EINVAL; |
3578 | |
3579 | if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && |
3580 | rdev->raid_disk >= 0) |
3581 | return -EBUSY; |
3582 | |
3583 | if (rdev->mddev->persistent) { |
3584 | if (rdev->mddev->major_version == 0) |
3585 | return -EINVAL; |
3586 | if (size > U16_MAX) |
3587 | return -EINVAL; |
3588 | } else if (!rdev->mddev->external) { |
3589 | return -EBUSY; |
3590 | } |
3591 | rdev->ppl.size = size; |
3592 | return len; |
3593 | } |
3594 | |
3595 | static struct rdev_sysfs_entry rdev_ppl_size = |
3596 | __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); |
3597 | |
3598 | static struct attribute *rdev_default_attrs[] = { |
3599 | &rdev_state.attr, |
3600 | &rdev_errors.attr, |
3601 | &rdev_slot.attr, |
3602 | &rdev_offset.attr, |
3603 | &rdev_new_offset.attr, |
3604 | &rdev_size.attr, |
3605 | &rdev_recovery_start.attr, |
3606 | &rdev_bad_blocks.attr, |
3607 | &rdev_unack_bad_blocks.attr, |
3608 | &rdev_ppl_sector.attr, |
3609 | &rdev_ppl_size.attr, |
3610 | NULL, |
3611 | }; |
3612 | ATTRIBUTE_GROUPS(rdev_default); |
3613 | static ssize_t |
3614 | rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) |
3615 | { |
3616 | struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); |
3617 | struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); |
3618 | |
3619 | if (!entry->show) |
3620 | return -EIO; |
3621 | if (!rdev->mddev) |
3622 | return -ENODEV; |
3623 | return entry->show(rdev, page); |
3624 | } |
3625 | |
3626 | static ssize_t |
3627 | rdev_attr_store(struct kobject *kobj, struct attribute *attr, |
3628 | const char *page, size_t length) |
3629 | { |
3630 | struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); |
3631 | struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); |
3632 | struct kernfs_node *kn = NULL; |
3633 | bool suspend = false; |
3634 | ssize_t rv; |
3635 | struct mddev *mddev = rdev->mddev; |
3636 | |
3637 | if (!entry->store) |
3638 | return -EIO; |
3639 | if (!capable(CAP_SYS_ADMIN)) |
3640 | return -EACCES; |
3641 | if (!mddev) |
3642 | return -ENODEV; |
3643 | |
3644 | if (entry->store == state_store) { |
3645 | if (cmd_match(cmd: page, str: "remove" )) |
3646 | kn = sysfs_break_active_protection(kobj, attr); |
3647 | if (cmd_match(cmd: page, str: "remove" ) || cmd_match(cmd: page, str: "re-add" ) || |
3648 | cmd_match(cmd: page, str: "writemostly" ) || |
3649 | cmd_match(cmd: page, str: "-writemostly" )) |
3650 | suspend = true; |
3651 | } |
3652 | |
3653 | rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); |
3654 | if (!rv) { |
3655 | if (rdev->mddev == NULL) |
3656 | rv = -ENODEV; |
3657 | else |
3658 | rv = entry->store(rdev, page, length); |
3659 | suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); |
3660 | } |
3661 | |
3662 | if (kn) |
3663 | sysfs_unbreak_active_protection(kn); |
3664 | |
3665 | return rv; |
3666 | } |
3667 | |
3668 | static void rdev_free(struct kobject *ko) |
3669 | { |
3670 | struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); |
3671 | kfree(objp: rdev); |
3672 | } |
3673 | static const struct sysfs_ops rdev_sysfs_ops = { |
3674 | .show = rdev_attr_show, |
3675 | .store = rdev_attr_store, |
3676 | }; |
3677 | static const struct kobj_type rdev_ktype = { |
3678 | .release = rdev_free, |
3679 | .sysfs_ops = &rdev_sysfs_ops, |
3680 | .default_groups = rdev_default_groups, |
3681 | }; |
3682 | |
3683 | int md_rdev_init(struct md_rdev *rdev) |
3684 | { |
3685 | rdev->desc_nr = -1; |
3686 | rdev->saved_raid_disk = -1; |
3687 | rdev->raid_disk = -1; |
3688 | rdev->flags = 0; |
3689 | rdev->data_offset = 0; |
3690 | rdev->new_data_offset = 0; |
3691 | rdev->sb_events = 0; |
3692 | rdev->last_read_error = 0; |
3693 | rdev->sb_loaded = 0; |
3694 | rdev->bb_page = NULL; |
3695 | atomic_set(v: &rdev->nr_pending, i: 0); |
3696 | atomic_set(v: &rdev->read_errors, i: 0); |
3697 | atomic_set(v: &rdev->corrected_errors, i: 0); |
3698 | |
3699 | INIT_LIST_HEAD(list: &rdev->same_set); |
3700 | init_waitqueue_head(&rdev->blocked_wait); |
3701 | |
3702 | /* Add space to store bad block list. |
3703 | * This reserves the space even on arrays where it cannot |
3704 | * be used - I wonder if that matters |
3705 | */ |
3706 | return badblocks_init(bb: &rdev->badblocks, enable: 0); |
3707 | } |
3708 | EXPORT_SYMBOL_GPL(md_rdev_init); |
3709 | |
3710 | /* |
3711 | * Import a device. If 'super_format' >= 0, then sanity check the superblock |
3712 | * |
3713 | * mark the device faulty if: |
3714 | * |
3715 | * - the device is nonexistent (zero size) |
3716 | * - the device has no valid superblock |
3717 | * |
3718 | * a faulty rdev _never_ has rdev->sb set. |
3719 | */ |
3720 | static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) |
3721 | { |
3722 | struct md_rdev *rdev; |
3723 | sector_t size; |
3724 | int err; |
3725 | |
3726 | rdev = kzalloc(size: sizeof(*rdev), GFP_KERNEL); |
3727 | if (!rdev) |
3728 | return ERR_PTR(error: -ENOMEM); |
3729 | |
3730 | err = md_rdev_init(rdev); |
3731 | if (err) |
3732 | goto out_free_rdev; |
3733 | err = alloc_disk_sb(rdev); |
3734 | if (err) |
3735 | goto out_clear_rdev; |
3736 | |
3737 | rdev->bdev_handle = bdev_open_by_dev(dev: newdev, |
3738 | BLK_OPEN_READ | BLK_OPEN_WRITE, |
3739 | holder: super_format == -2 ? &claim_rdev : rdev, NULL); |
3740 | if (IS_ERR(ptr: rdev->bdev_handle)) { |
3741 | pr_warn("md: could not open device unknown-block(%u,%u).\n" , |
3742 | MAJOR(newdev), MINOR(newdev)); |
3743 | err = PTR_ERR(ptr: rdev->bdev_handle); |
3744 | goto out_clear_rdev; |
3745 | } |
3746 | rdev->bdev = rdev->bdev_handle->bdev; |
3747 | |
3748 | kobject_init(kobj: &rdev->kobj, ktype: &rdev_ktype); |
3749 | |
3750 | size = bdev_nr_bytes(bdev: rdev->bdev) >> BLOCK_SIZE_BITS; |
3751 | if (!size) { |
3752 | pr_warn("md: %pg has zero or unknown size, marking faulty!\n" , |
3753 | rdev->bdev); |
3754 | err = -EINVAL; |
3755 | goto out_blkdev_put; |
3756 | } |
3757 | |
3758 | if (super_format >= 0) { |
3759 | err = super_types[super_format]. |
3760 | load_super(rdev, NULL, super_minor); |
3761 | if (err == -EINVAL) { |
3762 | pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n" , |
3763 | rdev->bdev, |
3764 | super_format, super_minor); |
3765 | goto out_blkdev_put; |
3766 | } |
3767 | if (err < 0) { |
3768 | pr_warn("md: could not read %pg's sb, not importing!\n" , |
3769 | rdev->bdev); |
3770 | goto out_blkdev_put; |
3771 | } |
3772 | } |
3773 | |
3774 | return rdev; |
3775 | |
3776 | out_blkdev_put: |
3777 | bdev_release(handle: rdev->bdev_handle); |
3778 | out_clear_rdev: |
3779 | md_rdev_clear(rdev); |
3780 | out_free_rdev: |
3781 | kfree(objp: rdev); |
3782 | return ERR_PTR(error: err); |
3783 | } |
3784 | |
3785 | /* |
3786 | * Check a full RAID array for plausibility |
3787 | */ |
3788 | |
3789 | static int analyze_sbs(struct mddev *mddev) |
3790 | { |
3791 | int i; |
3792 | struct md_rdev *rdev, *freshest, *tmp; |
3793 | |
3794 | freshest = NULL; |
3795 | rdev_for_each_safe(rdev, tmp, mddev) |
3796 | switch (super_types[mddev->major_version]. |
3797 | load_super(rdev, freshest, mddev->minor_version)) { |
3798 | case 1: |
3799 | freshest = rdev; |
3800 | break; |
3801 | case 0: |
3802 | break; |
3803 | default: |
3804 | pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n" , |
3805 | rdev->bdev); |
3806 | md_kick_rdev_from_array(rdev); |
3807 | } |
3808 | |
3809 | /* Cannot find a valid fresh disk */ |
3810 | if (!freshest) { |
3811 | pr_warn("md: cannot find a valid disk\n" ); |
3812 | return -EINVAL; |
3813 | } |
3814 | |
3815 | super_types[mddev->major_version]. |
3816 | validate_super(mddev, freshest); |
3817 | |
3818 | i = 0; |
3819 | rdev_for_each_safe(rdev, tmp, mddev) { |
3820 | if (mddev->max_disks && |
3821 | (rdev->desc_nr >= mddev->max_disks || |
3822 | i > mddev->max_disks)) { |
3823 | pr_warn("md: %s: %pg: only %d devices permitted\n" , |
3824 | mdname(mddev), rdev->bdev, |
3825 | mddev->max_disks); |
3826 | md_kick_rdev_from_array(rdev); |
3827 | continue; |
3828 | } |
3829 | if (rdev != freshest) { |
3830 | if (super_types[mddev->major_version]. |
3831 | validate_super(mddev, rdev)) { |
3832 | pr_warn("md: kicking non-fresh %pg from array!\n" , |
3833 | rdev->bdev); |
3834 | md_kick_rdev_from_array(rdev); |
3835 | continue; |
3836 | } |
3837 | } |
3838 | if (mddev->level == LEVEL_MULTIPATH) { |
3839 | rdev->desc_nr = i++; |
3840 | rdev->raid_disk = rdev->desc_nr; |
3841 | set_bit(nr: In_sync, addr: &rdev->flags); |
3842 | } else if (rdev->raid_disk >= |
3843 | (mddev->raid_disks - min(0, mddev->delta_disks)) && |
3844 | !test_bit(Journal, &rdev->flags)) { |
3845 | rdev->raid_disk = -1; |
3846 | clear_bit(nr: In_sync, addr: &rdev->flags); |
3847 | } |
3848 | } |
3849 | |
3850 | return 0; |
3851 | } |
3852 | |
/* Read a fixed-point number.
 * Numbers in sysfs attributes should be in "standard" units where
 * possible, so time should be in seconds.
 * However we internally use a much smaller unit such as
 * milliseconds or jiffies.
 * This function takes a decimal number with a possible fractional
 * component, and produces an integer which is the result of
 * multiplying that number by 10^'scale'.
 * all without any floating-point arithmetic.
 *
 * @cp:    NUL-terminated input; one trailing newline is accepted
 * @res:   receives the scaled value on success, untouched on error
 * @scale: number of fractional digits to keep; further digits are ignored
 *
 * Returns 0 on success, -EINVAL if anything other than digits, a single
 * '.' and an optional trailing newline is present, and -ERANGE if the
 * scaled value does not fit in an unsigned long.
 */
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long result = 0;
	long decimals = -1;	/* -1 until a '.' has been seen */

	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
		if (*cp == '.')
			decimals = 0;
		else if (decimals < scale) {
			unsigned int value = *cp - '0';

			/* refuse to wrap instead of yielding a bogus value */
			if (result > (ULONG_MAX - value) / 10)
				return -ERANGE;
			result = result * 10 + value;
			if (decimals >= 0)
				decimals++;
		}
		cp++;
	}
	if (*cp == '\n')
		cp++;
	if (*cp)
		return -EINVAL;
	if (decimals < 0)
		decimals = 0;
	/* Pad with zeros for the fractional digits that were not supplied. */
	for (; decimals < scale; decimals++) {
		if (result > ULONG_MAX / 10)
			return -ERANGE;
		result *= 10;
	}
	*res = result;
	return 0;
}
3888 | |
3889 | static ssize_t |
3890 | safe_delay_show(struct mddev *mddev, char *page) |
3891 | { |
3892 | unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; |
3893 | |
3894 | return sprintf(buf: page, fmt: "%u.%03u\n" , msec/1000, msec%1000); |
3895 | } |
3896 | static ssize_t |
3897 | safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) |
3898 | { |
3899 | unsigned long msec; |
3900 | |
3901 | if (mddev_is_clustered(mddev)) { |
3902 | pr_warn("md: Safemode is disabled for clustered mode\n" ); |
3903 | return -EINVAL; |
3904 | } |
3905 | |
3906 | if (strict_strtoul_scaled(cp: cbuf, res: &msec, scale: 3) < 0 || msec > UINT_MAX / HZ) |
3907 | return -EINVAL; |
3908 | if (msec == 0) |
3909 | mddev->safemode_delay = 0; |
3910 | else { |
3911 | unsigned long old_delay = mddev->safemode_delay; |
3912 | unsigned long new_delay = (msec*HZ)/1000; |
3913 | |
3914 | if (new_delay == 0) |
3915 | new_delay = 1; |
3916 | mddev->safemode_delay = new_delay; |
3917 | if (new_delay < old_delay || old_delay == 0) |
3918 | mod_timer(timer: &mddev->safemode_timer, expires: jiffies+1); |
3919 | } |
3920 | return len; |
3921 | } |
3922 | static struct md_sysfs_entry md_safe_delay = |
3923 | __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); |
3924 | |
3925 | static ssize_t |
3926 | level_show(struct mddev *mddev, char *page) |
3927 | { |
3928 | struct md_personality *p; |
3929 | int ret; |
3930 | spin_lock(lock: &mddev->lock); |
3931 | p = mddev->pers; |
3932 | if (p) |
3933 | ret = sprintf(buf: page, fmt: "%s\n" , p->name); |
3934 | else if (mddev->clevel[0]) |
3935 | ret = sprintf(buf: page, fmt: "%s\n" , mddev->clevel); |
3936 | else if (mddev->level != LEVEL_NONE) |
3937 | ret = sprintf(buf: page, fmt: "%d\n" , mddev->level); |
3938 | else |
3939 | ret = 0; |
3940 | spin_unlock(lock: &mddev->lock); |
3941 | return ret; |
3942 | } |
3943 | |
3944 | static ssize_t |
3945 | level_store(struct mddev *mddev, const char *buf, size_t len) |
3946 | { |
3947 | char clevel[16]; |
3948 | ssize_t rv; |
3949 | size_t slen = len; |
3950 | struct md_personality *pers, *oldpers; |
3951 | long level; |
3952 | void *priv, *oldpriv; |
3953 | struct md_rdev *rdev; |
3954 | |
3955 | if (slen == 0 || slen >= sizeof(clevel)) |
3956 | return -EINVAL; |
3957 | |
3958 | rv = mddev_suspend_and_lock(mddev); |
3959 | if (rv) |
3960 | return rv; |
3961 | |
3962 | if (mddev->pers == NULL) { |
3963 | memcpy(mddev->clevel, buf, slen); |
3964 | if (mddev->clevel[slen-1] == '\n') |
3965 | slen--; |
3966 | mddev->clevel[slen] = 0; |
3967 | mddev->level = LEVEL_NONE; |
3968 | rv = len; |
3969 | goto out_unlock; |
3970 | } |
3971 | rv = -EROFS; |
3972 | if (!md_is_rdwr(mddev)) |
3973 | goto out_unlock; |
3974 | |
3975 | /* request to change the personality. Need to ensure: |
3976 | * - array is not engaged in resync/recovery/reshape |
3977 | * - old personality can be suspended |
3978 | * - new personality will access other array. |
3979 | */ |
3980 | |
3981 | rv = -EBUSY; |
3982 | if (mddev->sync_thread || |
3983 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || |
3984 | mddev->reshape_position != MaxSector || |
3985 | mddev->sysfs_active) |
3986 | goto out_unlock; |
3987 | |
3988 | rv = -EINVAL; |
3989 | if (!mddev->pers->quiesce) { |
3990 | pr_warn("md: %s: %s does not support online personality change\n" , |
3991 | mdname(mddev), mddev->pers->name); |
3992 | goto out_unlock; |
3993 | } |
3994 | |
3995 | /* Now find the new personality */ |
3996 | memcpy(clevel, buf, slen); |
3997 | if (clevel[slen-1] == '\n') |
3998 | slen--; |
3999 | clevel[slen] = 0; |
4000 | if (kstrtol(s: clevel, base: 10, res: &level)) |
4001 | level = LEVEL_NONE; |
4002 | |
4003 | if (request_module("md-%s" , clevel) != 0) |
4004 | request_module("md-level-%s" , clevel); |
4005 | spin_lock(lock: &pers_lock); |
4006 | pers = find_pers(level, clevel); |
4007 | if (!pers || !try_module_get(module: pers->owner)) { |
4008 | spin_unlock(lock: &pers_lock); |
4009 | pr_warn("md: personality %s not loaded\n" , clevel); |
4010 | rv = -EINVAL; |
4011 | goto out_unlock; |
4012 | } |
4013 | spin_unlock(lock: &pers_lock); |
4014 | |
4015 | if (pers == mddev->pers) { |
4016 | /* Nothing to do! */ |
4017 | module_put(module: pers->owner); |
4018 | rv = len; |
4019 | goto out_unlock; |
4020 | } |
4021 | if (!pers->takeover) { |
4022 | module_put(module: pers->owner); |
4023 | pr_warn("md: %s: %s does not support personality takeover\n" , |
4024 | mdname(mddev), clevel); |
4025 | rv = -EINVAL; |
4026 | goto out_unlock; |
4027 | } |
4028 | |
4029 | rdev_for_each(rdev, mddev) |
4030 | rdev->new_raid_disk = rdev->raid_disk; |
4031 | |
4032 | /* ->takeover must set new_* and/or delta_disks |
4033 | * if it succeeds, and may set them when it fails. |
4034 | */ |
4035 | priv = pers->takeover(mddev); |
4036 | if (IS_ERR(ptr: priv)) { |
4037 | mddev->new_level = mddev->level; |
4038 | mddev->new_layout = mddev->layout; |
4039 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
4040 | mddev->raid_disks -= mddev->delta_disks; |
4041 | mddev->delta_disks = 0; |
4042 | mddev->reshape_backwards = 0; |
4043 | module_put(module: pers->owner); |
4044 | pr_warn("md: %s: %s would not accept array\n" , |
4045 | mdname(mddev), clevel); |
4046 | rv = PTR_ERR(ptr: priv); |
4047 | goto out_unlock; |
4048 | } |
4049 | |
4050 | /* Looks like we have a winner */ |
4051 | mddev_detach(mddev); |
4052 | |
4053 | spin_lock(lock: &mddev->lock); |
4054 | oldpers = mddev->pers; |
4055 | oldpriv = mddev->private; |
4056 | mddev->pers = pers; |
4057 | mddev->private = priv; |
4058 | strscpy(p: mddev->clevel, q: pers->name, size: sizeof(mddev->clevel)); |
4059 | mddev->level = mddev->new_level; |
4060 | mddev->layout = mddev->new_layout; |
4061 | mddev->chunk_sectors = mddev->new_chunk_sectors; |
4062 | mddev->delta_disks = 0; |
4063 | mddev->reshape_backwards = 0; |
4064 | mddev->degraded = 0; |
4065 | spin_unlock(lock: &mddev->lock); |
4066 | |
4067 | if (oldpers->sync_request == NULL && |
4068 | mddev->external) { |
4069 | /* We are converting from a no-redundancy array |
4070 | * to a redundancy array and metadata is managed |
4071 | * externally so we need to be sure that writes |
4072 | * won't block due to a need to transition |
4073 | * clean->dirty |
4074 | * until external management is started. |
4075 | */ |
4076 | mddev->in_sync = 0; |
4077 | mddev->safemode_delay = 0; |
4078 | mddev->safemode = 0; |
4079 | } |
4080 | |
4081 | oldpers->free(mddev, oldpriv); |
4082 | |
4083 | if (oldpers->sync_request == NULL && |
4084 | pers->sync_request != NULL) { |
4085 | /* need to add the md_redundancy_group */ |
4086 | if (sysfs_create_group(kobj: &mddev->kobj, grp: &md_redundancy_group)) |
4087 | pr_warn("md: cannot register extra attributes for %s\n" , |
4088 | mdname(mddev)); |
4089 | mddev->sysfs_action = sysfs_get_dirent(parent: mddev->kobj.sd, name: "sync_action" ); |
4090 | mddev->sysfs_completed = sysfs_get_dirent_safe(sd: mddev->kobj.sd, name: "sync_completed" ); |
4091 | mddev->sysfs_degraded = sysfs_get_dirent_safe(sd: mddev->kobj.sd, name: "degraded" ); |
4092 | } |
4093 | if (oldpers->sync_request != NULL && |
4094 | pers->sync_request == NULL) { |
4095 | /* need to remove the md_redundancy_group */ |
4096 | if (mddev->to_remove == NULL) |
4097 | mddev->to_remove = &md_redundancy_group; |
4098 | } |
4099 | |
4100 | module_put(module: oldpers->owner); |
4101 | |
4102 | rdev_for_each(rdev, mddev) { |
4103 | if (rdev->raid_disk < 0) |
4104 | continue; |
4105 | if (rdev->new_raid_disk >= mddev->raid_disks) |
4106 | rdev->new_raid_disk = -1; |
4107 | if (rdev->new_raid_disk == rdev->raid_disk) |
4108 | continue; |
4109 | sysfs_unlink_rdev(mddev, rdev); |
4110 | } |
4111 | rdev_for_each(rdev, mddev) { |
4112 | if (rdev->raid_disk < 0) |
4113 | continue; |
4114 | if (rdev->new_raid_disk == rdev->raid_disk) |
4115 | continue; |
4116 | rdev->raid_disk = rdev->new_raid_disk; |
4117 | if (rdev->raid_disk < 0) |
4118 | clear_bit(nr: In_sync, addr: &rdev->flags); |
4119 | else { |
4120 | if (sysfs_link_rdev(mddev, rdev)) |
4121 | pr_warn("md: cannot register rd%d for %s after level change\n" , |
4122 | rdev->raid_disk, mdname(mddev)); |
4123 | } |
4124 | } |
4125 | |
4126 | if (pers->sync_request == NULL) { |
4127 | /* this is now an array without redundancy, so |
4128 | * it must always be in_sync |
4129 | */ |
4130 | mddev->in_sync = 1; |
4131 | del_timer_sync(timer: &mddev->safemode_timer); |
4132 | } |
4133 | blk_set_stacking_limits(lim: &mddev->queue->limits); |
4134 | pers->run(mddev); |
4135 | set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags); |
4136 | if (!mddev->thread) |
4137 | md_update_sb(mddev, 1); |
4138 | sysfs_notify_dirent_safe(sd: mddev->sysfs_level); |
4139 | md_new_event(); |
4140 | rv = len; |
4141 | out_unlock: |
4142 | mddev_unlock_and_resume(mddev); |
4143 | return rv; |
4144 | } |
4145 | |
4146 | static struct md_sysfs_entry md_level = |
4147 | __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); |
4148 | |
4149 | static ssize_t |
4150 | layout_show(struct mddev *mddev, char *page) |
4151 | { |
4152 | /* just a number, not meaningful for all levels */ |
4153 | if (mddev->reshape_position != MaxSector && |
4154 | mddev->layout != mddev->new_layout) |
4155 | return sprintf(buf: page, fmt: "%d (%d)\n" , |
4156 | mddev->new_layout, mddev->layout); |
4157 | return sprintf(buf: page, fmt: "%d\n" , mddev->layout); |
4158 | } |
4159 | |
4160 | static ssize_t |
4161 | layout_store(struct mddev *mddev, const char *buf, size_t len) |
4162 | { |
4163 | unsigned int n; |
4164 | int err; |
4165 | |
4166 | err = kstrtouint(s: buf, base: 10, res: &n); |
4167 | if (err < 0) |
4168 | return err; |
4169 | err = mddev_lock(mddev); |
4170 | if (err) |
4171 | return err; |
4172 | |
4173 | if (mddev->pers) { |
4174 | if (mddev->pers->check_reshape == NULL) |
4175 | err = -EBUSY; |
4176 | else if (!md_is_rdwr(mddev)) |
4177 | err = -EROFS; |
4178 | else { |
4179 | mddev->new_layout = n; |
4180 | err = mddev->pers->check_reshape(mddev); |
4181 | if (err) |
4182 | mddev->new_layout = mddev->layout; |
4183 | } |
4184 | } else { |
4185 | mddev->new_layout = n; |
4186 | if (mddev->reshape_position == MaxSector) |
4187 | mddev->layout = n; |
4188 | } |
4189 | mddev_unlock(mddev); |
4190 | return err ?: len; |
4191 | } |
4192 | static struct md_sysfs_entry md_layout = |
4193 | __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); |
4194 | |
4195 | static ssize_t |
4196 | raid_disks_show(struct mddev *mddev, char *page) |
4197 | { |
4198 | if (mddev->raid_disks == 0) |
4199 | return 0; |
4200 | if (mddev->reshape_position != MaxSector && |
4201 | mddev->delta_disks != 0) |
4202 | return sprintf(buf: page, fmt: "%d (%d)\n" , mddev->raid_disks, |
4203 | mddev->raid_disks - mddev->delta_disks); |
4204 | return sprintf(buf: page, fmt: "%d\n" , mddev->raid_disks); |
4205 | } |
4206 | |
4207 | static int update_raid_disks(struct mddev *mddev, int raid_disks); |
4208 | |
4209 | static ssize_t |
4210 | raid_disks_store(struct mddev *mddev, const char *buf, size_t len) |
4211 | { |
4212 | unsigned int n; |
4213 | int err; |
4214 | |
4215 | err = kstrtouint(s: buf, base: 10, res: &n); |
4216 | if (err < 0) |
4217 | return err; |
4218 | |
4219 | err = mddev_lock(mddev); |
4220 | if (err) |
4221 | return err; |
4222 | if (mddev->pers) |
4223 | err = update_raid_disks(mddev, raid_disks: n); |
4224 | else if (mddev->reshape_position != MaxSector) { |
4225 | struct md_rdev *rdev; |
4226 | int olddisks = mddev->raid_disks - mddev->delta_disks; |
4227 | |
4228 | err = -EINVAL; |
4229 | rdev_for_each(rdev, mddev) { |
4230 | if (olddisks < n && |
4231 | rdev->data_offset < rdev->new_data_offset) |
4232 | goto out_unlock; |
4233 | if (olddisks > n && |
4234 | rdev->data_offset > rdev->new_data_offset) |
4235 | goto out_unlock; |
4236 | } |
4237 | err = 0; |
4238 | mddev->delta_disks = n - olddisks; |
4239 | mddev->raid_disks = n; |
4240 | mddev->reshape_backwards = (mddev->delta_disks < 0); |
4241 | } else |
4242 | mddev->raid_disks = n; |
4243 | out_unlock: |
4244 | mddev_unlock(mddev); |
4245 | return err ? err : len; |
4246 | } |
4247 | static struct md_sysfs_entry md_raid_disks = |
4248 | __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); |
4249 | |
4250 | static ssize_t |
4251 | uuid_show(struct mddev *mddev, char *page) |
4252 | { |
4253 | return sprintf(buf: page, fmt: "%pU\n" , mddev->uuid); |
4254 | } |
4255 | static struct md_sysfs_entry md_uuid = |
4256 | __ATTR(uuid, S_IRUGO, uuid_show, NULL); |
4257 | |
4258 | static ssize_t |
4259 | chunk_size_show(struct mddev *mddev, char *page) |
4260 | { |
4261 | if (mddev->reshape_position != MaxSector && |
4262 | mddev->chunk_sectors != mddev->new_chunk_sectors) |
4263 | return sprintf(buf: page, fmt: "%d (%d)\n" , |
4264 | mddev->new_chunk_sectors << 9, |
4265 | mddev->chunk_sectors << 9); |
4266 | return sprintf(buf: page, fmt: "%d\n" , mddev->chunk_sectors << 9); |
4267 | } |
4268 | |
4269 | static ssize_t |
4270 | chunk_size_store(struct mddev *mddev, const char *buf, size_t len) |
4271 | { |
4272 | unsigned long n; |
4273 | int err; |
4274 | |
4275 | err = kstrtoul(s: buf, base: 10, res: &n); |
4276 | if (err < 0) |
4277 | return err; |
4278 | |
4279 | err = mddev_lock(mddev); |
4280 | if (err) |
4281 | return err; |
4282 | if (mddev->pers) { |
4283 | if (mddev->pers->check_reshape == NULL) |
4284 | err = -EBUSY; |
4285 | else if (!md_is_rdwr(mddev)) |
4286 | err = -EROFS; |
4287 | else { |
4288 | mddev->new_chunk_sectors = n >> 9; |
4289 | err = mddev->pers->check_reshape(mddev); |
4290 | if (err) |
4291 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
4292 | } |
4293 | } else { |
4294 | mddev->new_chunk_sectors = n >> 9; |
4295 | if (mddev->reshape_position == MaxSector) |
4296 | mddev->chunk_sectors = n >> 9; |
4297 | } |
4298 | mddev_unlock(mddev); |
4299 | return err ?: len; |
4300 | } |
4301 | static struct md_sysfs_entry md_chunk_size = |
4302 | __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); |
4303 | |
4304 | static ssize_t |
4305 | resync_start_show(struct mddev *mddev, char *page) |
4306 | { |
4307 | if (mddev->recovery_cp == MaxSector) |
4308 | return sprintf(buf: page, fmt: "none\n" ); |
4309 | return sprintf(buf: page, fmt: "%llu\n" , (unsigned long long)mddev->recovery_cp); |
4310 | } |
4311 | |
4312 | static ssize_t |
4313 | resync_start_store(struct mddev *mddev, const char *buf, size_t len) |
4314 | { |
4315 | unsigned long long n; |
4316 | int err; |
4317 | |
4318 | if (cmd_match(cmd: buf, str: "none" )) |
4319 | n = MaxSector; |
4320 | else { |
4321 | err = kstrtoull(s: buf, base: 10, res: &n); |
4322 | if (err < 0) |
4323 | return err; |
4324 | if (n != (sector_t)n) |
4325 | return -EINVAL; |
4326 | } |
4327 | |
4328 | err = mddev_lock(mddev); |
4329 | if (err) |
4330 | return err; |
4331 | if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) |
4332 | err = -EBUSY; |
4333 | |
4334 | if (!err) { |
4335 | mddev->recovery_cp = n; |
4336 | if (mddev->pers) |
4337 | set_bit(nr: MD_SB_CHANGE_CLEAN, addr: &mddev->sb_flags); |
4338 | } |
4339 | mddev_unlock(mddev); |
4340 | return err ?: len; |
4341 | } |
4342 | static struct md_sysfs_entry md_resync_start = |
4343 | __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, |
4344 | resync_start_show, resync_start_store); |
4345 | |
4346 | /* |
4347 | * The array state can be: |
4348 | * |
4349 | * clear |
4350 | * No devices, no size, no level |
4351 | * Equivalent to STOP_ARRAY ioctl |
4352 | * inactive |
4353 | * May have some settings, but array is not active |
4354 | * all IO results in error |
4355 | * When written, doesn't tear down array, but just stops it |
4356 | * suspended (not supported yet) |
4357 | * All IO requests will block. The array can be reconfigured. |
4358 | * Writing this, if accepted, will block until array is quiescent |
4359 | * readonly |
4360 | * no resync can happen. no superblocks get written. |
4361 | * write requests fail |
4362 | * read-auto |
4363 | * like readonly, but behaves like 'clean' on a write request. |
4364 | * |
4365 | * clean - no pending writes, but otherwise active. |
4366 | * When written to inactive array, starts without resync |
4367 | * If a write request arrives then |
4368 | * if metadata is known, mark 'dirty' and switch to 'active'. |
4369 | * if not known, block and switch to write-pending |
4370 | * If written to an active array that has pending writes, then fails. |
4371 | * active |
4372 | * fully active: IO and resync can be happening. |
4373 | * When written to inactive array, starts with resync |
4374 | * |
4375 | * write-pending |
4376 | * clean, but writes are blocked waiting for 'active' to be written. |
4377 | * |
4378 | * active-idle |
4379 | * like active, but no writes have been seen for a while (100msec). |
4380 | * |
4381 | * broken |
4382 | * Array is failed. It's useful because mounted-arrays aren't stopped |
4383 | * when array is failed, so this state will at least alert the user that |
4384 | * something is wrong. |
4385 | */ |
/*
 * Numeric array states.  The order must match array_states[] below;
 * bad_word lines up with the NULL terminator of that table, which is the
 * index match_word() returns when no entry matches.
 */
enum array_state {
	clear,
	inactive,
	suspended,
	readonly,
	read_auto,
	clean,
	active,
	write_pending,
	active_idle,
	broken,
	bad_word
};

/* sysfs spelling of each array_state, indexed by the enum value. */
static char *array_states[] = {
	"clear",
	"inactive",
	"suspended",
	"readonly",
	"read-auto",
	"clean",
	"active",
	"write-pending",
	"active-idle",
	"broken",
	NULL
};
4391 | |
/*
 * Return the index of the first entry in the NULL-terminated @list that
 * cmd_match() accepts for @word, or the index of the terminator if none
 * does.
 */
static int match_word(const char *word, char **list)
{
	int n;

	for (n = 0; list[n]; n++)
		if (cmd_match(word, list[n]))
			break;
	return n;
}
4400 | |
4401 | static ssize_t |
4402 | array_state_show(struct mddev *mddev, char *page) |
4403 | { |
4404 | enum array_state st = inactive; |
4405 | |
4406 | if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { |
4407 | switch(mddev->ro) { |
4408 | case MD_RDONLY: |
4409 | st = readonly; |
4410 | break; |
4411 | case MD_AUTO_READ: |
4412 | st = read_auto; |
4413 | break; |
4414 | case MD_RDWR: |
4415 | spin_lock(lock: &mddev->lock); |
4416 | if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) |
4417 | st = write_pending; |
4418 | else if (mddev->in_sync) |
4419 | st = clean; |
4420 | else if (mddev->safemode) |
4421 | st = active_idle; |
4422 | else |
4423 | st = active; |
4424 | spin_unlock(lock: &mddev->lock); |
4425 | } |
4426 | |
4427 | if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) |
4428 | st = broken; |
4429 | } else { |
4430 | if (list_empty(head: &mddev->disks) && |
4431 | mddev->raid_disks == 0 && |
4432 | mddev->dev_sectors == 0) |
4433 | st = clear; |
4434 | else |
4435 | st = inactive; |
4436 | } |
4437 | return sprintf(buf: page, fmt: "%s\n" , array_states[st]); |
4438 | } |
4439 | |
4440 | static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); |
4441 | static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); |
4442 | static int restart_array(struct mddev *mddev); |
4443 | |
4444 | static ssize_t |
4445 | array_state_store(struct mddev *mddev, const char *buf, size_t len) |
4446 | { |
4447 | int err = 0; |
4448 | enum array_state st = match_word(word: buf, list: array_states); |
4449 | |
4450 | /* No lock dependent actions */ |
4451 | switch (st) { |
4452 | case suspended: /* not supported yet */ |
4453 | case write_pending: /* cannot be set */ |
4454 | case active_idle: /* cannot be set */ |
4455 | case broken: /* cannot be set */ |
4456 | case bad_word: |
4457 | return -EINVAL; |
4458 | default: |
4459 | break; |
4460 | } |
4461 | |
4462 | if (mddev->pers && (st == active || st == clean) && |
4463 | mddev->ro != MD_RDONLY) { |
4464 | /* don't take reconfig_mutex when toggling between |
4465 | * clean and active |
4466 | */ |
4467 | spin_lock(lock: &mddev->lock); |
4468 | if (st == active) { |
4469 | restart_array(mddev); |
4470 | clear_bit(nr: MD_SB_CHANGE_PENDING, addr: &mddev->sb_flags); |
4471 | md_wakeup_thread(thread: mddev->thread); |
4472 | wake_up(&mddev->sb_wait); |
4473 | } else /* st == clean */ { |
4474 | restart_array(mddev); |
4475 | if (!set_in_sync(mddev)) |
4476 | err = -EBUSY; |
4477 | } |
4478 | if (!err) |
4479 | sysfs_notify_dirent_safe(sd: mddev->sysfs_state); |
4480 | spin_unlock(lock: &mddev->lock); |
4481 | return err ?: len; |
4482 | } |
4483 | err = mddev_lock(mddev); |
4484 | if (err) |
4485 | return err; |
4486 | |
4487 | switch (st) { |
4488 | case inactive: |
4489 | /* stop an active array, return 0 otherwise */ |
4490 | if (mddev->pers) |
4491 | err = do_md_stop(mddev, ro: 2, NULL); |
4492 | break; |
4493 | case clear: |
4494 | err = do_md_stop(mddev, ro: 0, NULL); |
4495 | break; |
4496 | case readonly: |
4497 | if (mddev->pers) |
4498 | err = md_set_readonly(mddev, NULL); |
4499 | else { |
4500 | mddev->ro = MD_RDONLY; |
4501 | set_disk_ro(disk: mddev->gendisk, read_only: 1); |
4502 | err = do_md_run(mddev); |
4503 | } |
4504 | break; |
4505 | case read_auto: |
4506 | if (mddev->pers) { |
4507 | if (md_is_rdwr(mddev)) |
4508 | err = md_set_readonly(mddev, NULL); |
4509 | else if (mddev->ro == MD_RDONLY) |
4510 | err = restart_array(mddev); |
4511 | if (err == 0) { |
4512 | mddev->ro = MD_AUTO_READ; |
4513 | set_disk_ro(disk: mddev->gendisk, read_only: 0); |
4514 | } |
4515 | } else { |
4516 | mddev->ro = MD_AUTO_READ; |
4517 | err = do_md_run(mddev); |
4518 | } |
4519 | break; |
4520 | case clean: |
4521 | if (mddev->pers) { |
4522 | err = restart_array(mddev); |
4523 | if (err) |
4524 | break; |
4525 | spin_lock(lock: &mddev->lock); |
4526 | if (!set_in_sync(mddev)) |
4527 | err = -EBUSY; |
4528 | spin_unlock(lock: &mddev->lock); |
4529 | } else |
4530 | err = -EINVAL; |
4531 | break; |
4532 | case active: |
4533 | if (mddev->pers) { |
4534 | err = restart_array(mddev); |
4535 | if (err) |
4536 | break; |
4537 | clear_bit(nr: MD_SB_CHANGE_PENDING, addr: &mddev->sb_flags); |
4538 | wake_up(&mddev->sb_wait); |
4539 | err = 0; |
4540 | } else { |
4541 | mddev->ro = MD_RDWR; |
4542 | set_disk_ro(disk: mddev->gendisk, read_only: 0); |
4543 | err = do_md_run(mddev); |
4544 | } |
4545 | break; |
4546 | default: |
4547 | err = -EINVAL; |
4548 | break; |
4549 | } |
4550 | |
4551 | if (!err) { |
4552 | if (mddev->hold_active == UNTIL_IOCTL) |
4553 | mddev->hold_active = 0; |
4554 | sysfs_notify_dirent_safe(sd: mddev->sysfs_state); |
4555 | } |
4556 | mddev_unlock(mddev); |
4557 | return err ?: len; |
4558 | } |
4559 | static struct md_sysfs_entry md_array_state = |
4560 | __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); |
4561 | |
4562 | static ssize_t |
4563 | max_corrected_read_errors_show(struct mddev *mddev, char *page) { |
4564 | return sprintf(buf: page, fmt: "%d\n" , |
4565 | atomic_read(v: &mddev->max_corr_read_errors)); |
4566 | } |
4567 | |
4568 | static ssize_t |
4569 | max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) |
4570 | { |
4571 | unsigned int n; |
4572 | int rv; |
4573 | |
4574 | rv = kstrtouint(s: buf, base: 10, res: &n); |
4575 | if (rv < 0) |
4576 | return rv; |
4577 | if (n > INT_MAX) |
4578 | return -EINVAL; |
4579 | atomic_set(v: &mddev->max_corr_read_errors, i: n); |
4580 | return len; |
4581 | } |
4582 | |
4583 | static struct md_sysfs_entry max_corr_read_errors = |
4584 | __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, |
4585 | max_corrected_read_errors_store); |
4586 | |
4587 | static ssize_t |
4588 | null_show(struct mddev *mddev, char *page) |
4589 | { |
4590 | return -EINVAL; |
4591 | } |
4592 | |
4593 | static ssize_t |
4594 | new_dev_store(struct mddev *mddev, const char *buf, size_t len) |
4595 | { |
4596 | /* buf must be %d:%d\n? giving major and minor numbers */ |
4597 | /* The new device is added to the array. |
4598 | * If the array has a persistent superblock, we read the |
4599 | * superblock to initialise info and check validity. |
4600 | * Otherwise, only checking done is that in bind_rdev_to_array, |
4601 | * which mainly checks size. |
4602 | */ |
4603 | char *e; |
4604 | int major = simple_strtoul(buf, &e, 10); |
4605 | int minor; |
4606 | dev_t dev; |
4607 | struct md_rdev *rdev; |
4608 | int err; |
4609 | |
4610 | if (!*buf || *e != ':' || !e[1] || e[1] == '\n') |
4611 | return -EINVAL; |
4612 | minor = simple_strtoul(e+1, &e, 10); |
4613 | if (*e && *e != '\n') |
4614 | return -EINVAL; |
4615 | dev = MKDEV(major, minor); |
4616 | if (major != MAJOR(dev) || |
4617 | minor != MINOR(dev)) |
4618 | return -EOVERFLOW; |
4619 | |
4620 | err = mddev_suspend_and_lock(mddev); |
4621 | if (err) |
4622 | return err; |
4623 | if (mddev->persistent) { |
4624 | rdev = md_import_device(newdev: dev, super_format: mddev->major_version, |
4625 | super_minor: mddev->minor_version); |
4626 | if (!IS_ERR(ptr: rdev) && !list_empty(head: &mddev->disks)) { |
4627 | struct md_rdev *rdev0 |
4628 | = list_entry(mddev->disks.next, |
4629 | struct md_rdev, same_set); |
4630 | err = super_types[mddev->major_version] |
4631 | .load_super(rdev, rdev0, mddev->minor_version); |
4632 | if (err < 0) |
4633 | goto out; |
4634 | } |
4635 | } else if (mddev->external) |
4636 | rdev = md_import_device(newdev: dev, super_format: -2, super_minor: -1); |
4637 | else |
4638 | rdev = md_import_device(newdev: dev, super_format: -1, super_minor: -1); |
4639 | |
4640 | if (IS_ERR(ptr: rdev)) { |
4641 | mddev_unlock_and_resume(mddev); |
4642 | return PTR_ERR(ptr: rdev); |
4643 | } |
4644 | err = bind_rdev_to_array(rdev, mddev); |
4645 | out: |
4646 | if (err) |
4647 | export_rdev(rdev, mddev); |
4648 | mddev_unlock_and_resume(mddev); |
4649 | if (!err) |
4650 | md_new_event(); |
4651 | return err ? err : len; |
4652 | } |
4653 | |
4654 | static struct md_sysfs_entry md_new_device = |
4655 | __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); |
4656 | |
4657 | static ssize_t |
4658 | bitmap_store(struct mddev *mddev, const char *buf, size_t len) |
4659 | { |
4660 | char *end; |
4661 | unsigned long chunk, end_chunk; |
4662 | int err; |
4663 | |
4664 | err = mddev_lock(mddev); |
4665 | if (err) |
4666 | return err; |
4667 | if (!mddev->bitmap) |
4668 | goto out; |
4669 | /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ |
4670 | while (*buf) { |
4671 | chunk = end_chunk = simple_strtoul(buf, &end, 0); |
4672 | if (buf == end) break; |
4673 | if (*end == '-') { /* range */ |
4674 | buf = end + 1; |
4675 | end_chunk = simple_strtoul(buf, &end, 0); |
4676 | if (buf == end) break; |
4677 | } |
4678 | if (*end && !isspace(*end)) break; |
4679 | md_bitmap_dirty_bits(bitmap: mddev->bitmap, s: chunk, e: end_chunk); |
4680 | buf = skip_spaces(end); |
4681 | } |
4682 | md_bitmap_unplug(bitmap: mddev->bitmap); /* flush the bits to disk */ |
4683 | out: |
4684 | mddev_unlock(mddev); |
4685 | return len; |
4686 | } |
4687 | |
4688 | static struct md_sysfs_entry md_bitmap = |
4689 | __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); |
4690 | |
4691 | static ssize_t |
4692 | size_show(struct mddev *mddev, char *page) |
4693 | { |
4694 | return sprintf(buf: page, fmt: "%llu\n" , |
4695 | (unsigned long long)mddev->dev_sectors / 2); |
4696 | } |
4697 | |
4698 | static int update_size(struct mddev *mddev, sector_t num_sectors); |
4699 | |
4700 | static ssize_t |
4701 | size_store(struct mddev *mddev, const char *buf, size_t len) |
4702 | { |
4703 | /* If array is inactive, we can reduce the component size, but |
4704 | * not increase it (except from 0). |
4705 | * If array is active, we can try an on-line resize |
4706 | */ |
4707 | sector_t sectors; |
4708 | int err = strict_blocks_to_sectors(buf, sectors: §ors); |
4709 | |
4710 | if (err < 0) |
4711 | return err; |
4712 | err = mddev_lock(mddev); |
4713 | if (err) |
4714 | return err; |
4715 | if (mddev->pers) { |
4716 | err = update_size(mddev, num_sectors: sectors); |
4717 | if (err == 0) |
4718 | md_update_sb(mddev, 1); |
4719 | } else { |
4720 | if (mddev->dev_sectors == 0 || |
4721 | mddev->dev_sectors > sectors) |
4722 | mddev->dev_sectors = sectors; |
4723 | else |
4724 | err = -ENOSPC; |
4725 | } |
4726 | mddev_unlock(mddev); |
4727 | return err ? err : len; |
4728 | } |
4729 | |
4730 | static struct md_sysfs_entry md_size = |
4731 | __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); |
4732 | |
4733 | /* Metadata version. |
4734 | * This is one of |
4735 | * 'none' for arrays with no metadata (good luck...) |
4736 | * 'external' for arrays with externally managed metadata, |
4737 | * or N.M for internally known formats |
4738 | */ |
4739 | static ssize_t |
4740 | metadata_show(struct mddev *mddev, char *page) |
4741 | { |
4742 | if (mddev->persistent) |
4743 | return sprintf(buf: page, fmt: "%d.%d\n" , |
4744 | mddev->major_version, mddev->minor_version); |
4745 | else if (mddev->external) |
4746 | return sprintf(buf: page, fmt: "external:%s\n" , mddev->metadata_type); |
4747 | else |
4748 | return sprintf(buf: page, fmt: "none\n" ); |
4749 | } |
4750 | |
4751 | static ssize_t |
4752 | metadata_store(struct mddev *mddev, const char *buf, size_t len) |
4753 | { |
4754 | int major, minor; |
4755 | char *e; |
4756 | int err; |
4757 | /* Changing the details of 'external' metadata is |
4758 | * always permitted. Otherwise there must be |
4759 | * no devices attached to the array. |
4760 | */ |
4761 | |
4762 | err = mddev_lock(mddev); |
4763 | if (err) |
4764 | return err; |
4765 | err = -EBUSY; |
4766 | if (mddev->external && strncmp(buf, "external:" , 9) == 0) |
4767 | ; |
4768 | else if (!list_empty(head: &mddev->disks)) |
4769 | goto out_unlock; |
4770 | |
4771 | err = 0; |
4772 | if (cmd_match(cmd: buf, str: "none" )) { |
4773 | mddev->persistent = 0; |
4774 | mddev->external = 0; |
4775 | mddev->major_version = 0; |
4776 | mddev->minor_version = 90; |
4777 | goto out_unlock; |
4778 | } |
4779 | if (strncmp(buf, "external:" , 9) == 0) { |
4780 | size_t namelen = len-9; |
4781 | if (namelen >= sizeof(mddev->metadata_type)) |
4782 | namelen = sizeof(mddev->metadata_type)-1; |
4783 | memcpy(mddev->metadata_type, buf+9, namelen); |
4784 | mddev->metadata_type[namelen] = 0; |
4785 | if (namelen && mddev->metadata_type[namelen-1] == '\n') |
4786 | mddev->metadata_type[--namelen] = 0; |
4787 | mddev->persistent = 0; |
4788 | mddev->external = 1; |
4789 | mddev->major_version = 0; |
4790 | mddev->minor_version = 90; |
4791 | goto out_unlock; |
4792 | } |
4793 | major = simple_strtoul(buf, &e, 10); |
4794 | err = -EINVAL; |
4795 | if (e==buf || *e != '.') |
4796 | goto out_unlock; |
4797 | buf = e+1; |
4798 | minor = simple_strtoul(buf, &e, 10); |
4799 | if (e==buf || (*e && *e != '\n') ) |
4800 | goto out_unlock; |
4801 | err = -ENOENT; |
4802 | if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) |
4803 | goto out_unlock; |
4804 | mddev->major_version = major; |
4805 | mddev->minor_version = minor; |
4806 | mddev->persistent = 1; |
4807 | mddev->external = 0; |
4808 | err = 0; |
4809 | out_unlock: |
4810 | mddev_unlock(mddev); |
4811 | return err ?: len; |
4812 | } |
4813 | |
4814 | static struct md_sysfs_entry md_metadata = |
4815 | __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); |
4816 | |
4817 | static ssize_t |
4818 | action_show(struct mddev *mddev, char *page) |
4819 | { |
4820 | char *type = "idle" ; |
4821 | unsigned long recovery = mddev->recovery; |
4822 | if (test_bit(MD_RECOVERY_FROZEN, &recovery)) |
4823 | type = "frozen" ; |
4824 | else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || |
4825 | (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { |
4826 | if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) |
4827 | type = "reshape" ; |
4828 | else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { |
4829 | if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) |
4830 | type = "resync" ; |
4831 | else if (test_bit(MD_RECOVERY_CHECK, &recovery)) |
4832 | type = "check" ; |
4833 | else |
4834 | type = "repair" ; |
4835 | } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) |
4836 | type = "recover" ; |
4837 | else if (mddev->reshape_position != MaxSector) |
4838 | type = "reshape" ; |
4839 | } |
4840 | return sprintf(buf: page, fmt: "%s\n" , type); |
4841 | } |
4842 | |
4843 | static void stop_sync_thread(struct mddev *mddev) |
4844 | { |
4845 | if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
4846 | return; |
4847 | |
4848 | if (mddev_lock(mddev)) |
4849 | return; |
4850 | |
4851 | /* |
4852 | * Check again in case MD_RECOVERY_RUNNING is cleared before lock is |
4853 | * held. |
4854 | */ |
4855 | if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { |
4856 | mddev_unlock(mddev); |
4857 | return; |
4858 | } |
4859 | |
4860 | if (work_pending(&mddev->del_work)) |
4861 | flush_workqueue(md_misc_wq); |
4862 | |
4863 | set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery); |
4864 | /* |
4865 | * Thread might be blocked waiting for metadata update which will now |
4866 | * never happen |
4867 | */ |
4868 | md_wakeup_thread_directly(thread: mddev->sync_thread); |
4869 | |
4870 | mddev_unlock(mddev); |
4871 | } |
4872 | |
4873 | static void idle_sync_thread(struct mddev *mddev) |
4874 | { |
4875 | int sync_seq = atomic_read(v: &mddev->sync_seq); |
4876 | |
4877 | mutex_lock(&mddev->sync_mutex); |
4878 | clear_bit(nr: MD_RECOVERY_FROZEN, addr: &mddev->recovery); |
4879 | stop_sync_thread(mddev); |
4880 | |
4881 | wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || |
4882 | !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); |
4883 | |
4884 | mutex_unlock(lock: &mddev->sync_mutex); |
4885 | } |
4886 | |
4887 | static void frozen_sync_thread(struct mddev *mddev) |
4888 | { |
4889 | mutex_lock(&mddev->sync_mutex); |
4890 | set_bit(nr: MD_RECOVERY_FROZEN, addr: &mddev->recovery); |
4891 | stop_sync_thread(mddev); |
4892 | |
4893 | wait_event(resync_wait, mddev->sync_thread == NULL && |
4894 | !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); |
4895 | |
4896 | mutex_unlock(lock: &mddev->sync_mutex); |
4897 | } |
4898 | |
4899 | static ssize_t |
4900 | action_store(struct mddev *mddev, const char *page, size_t len) |
4901 | { |
4902 | if (!mddev->pers || !mddev->pers->sync_request) |
4903 | return -EINVAL; |
4904 | |
4905 | |
4906 | if (cmd_match(cmd: page, str: "idle" )) |
4907 | idle_sync_thread(mddev); |
4908 | else if (cmd_match(cmd: page, str: "frozen" )) |
4909 | frozen_sync_thread(mddev); |
4910 | else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
4911 | return -EBUSY; |
4912 | else if (cmd_match(cmd: page, str: "resync" )) |
4913 | clear_bit(nr: MD_RECOVERY_FROZEN, addr: &mddev->recovery); |
4914 | else if (cmd_match(cmd: page, str: "recover" )) { |
4915 | clear_bit(nr: MD_RECOVERY_FROZEN, addr: &mddev->recovery); |
4916 | set_bit(nr: MD_RECOVERY_RECOVER, addr: &mddev->recovery); |
4917 | } else if (cmd_match(cmd: page, str: "reshape" )) { |
4918 | int err; |
4919 | if (mddev->pers->start_reshape == NULL) |
4920 | return -EINVAL; |
4921 | err = mddev_lock(mddev); |
4922 | if (!err) { |
4923 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { |
4924 | err = -EBUSY; |
4925 | } else if (mddev->reshape_position == MaxSector || |
4926 | mddev->pers->check_reshape == NULL || |
4927 | mddev->pers->check_reshape(mddev)) { |
4928 | clear_bit(nr: MD_RECOVERY_FROZEN, addr: &mddev->recovery); |
4929 | err = mddev->pers->start_reshape(mddev); |
4930 | } else { |
4931 | /* |
4932 | * If reshape is still in progress, and |
4933 | * md_check_recovery() can continue to reshape, |
4934 | * don't restart reshape because data can be |
4935 | * corrupted for raid456. |
4936 | */ |
4937 | clear_bit(nr: MD_RECOVERY_FROZEN, addr: &mddev->recovery); |
4938 | } |
4939 | mddev_unlock(mddev); |
4940 | } |
4941 | if (err) |
4942 | return err; |
4943 | sysfs_notify_dirent_safe(sd: mddev->sysfs_degraded); |
4944 | } else { |
4945 | if (cmd_match(cmd: page, str: "check" )) |
4946 | set_bit(nr: MD_RECOVERY_CHECK, addr: &mddev->recovery); |
4947 | else if (!cmd_match(cmd: page, str: "repair" )) |
4948 | return -EINVAL; |
4949 | clear_bit(nr: MD_RECOVERY_FROZEN, addr: &mddev->recovery); |
4950 | set_bit(nr: MD_RECOVERY_REQUESTED, addr: &mddev->recovery); |
4951 | set_bit(nr: MD_RECOVERY_SYNC, addr: &mddev->recovery); |
4952 | } |
4953 | if (mddev->ro == MD_AUTO_READ) { |
4954 | /* A write to sync_action is enough to justify |
4955 | * canceling read-auto mode |
4956 | */ |
4957 | flush_work(work: &mddev->sync_work); |
4958 | mddev->ro = MD_RDWR; |
4959 | md_wakeup_thread(thread: mddev->sync_thread); |
4960 | } |
4961 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
4962 | md_wakeup_thread(thread: mddev->thread); |
4963 | sysfs_notify_dirent_safe(sd: mddev->sysfs_action); |
4964 | return len; |
4965 | } |
4966 | |
4967 | static struct md_sysfs_entry md_scan_mode = |
4968 | __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); |
4969 | |
4970 | static ssize_t |
4971 | last_sync_action_show(struct mddev *mddev, char *page) |
4972 | { |
4973 | return sprintf(buf: page, fmt: "%s\n" , mddev->last_sync_action); |
4974 | } |
4975 | |
4976 | static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); |
4977 | |
4978 | static ssize_t |
4979 | mismatch_cnt_show(struct mddev *mddev, char *page) |
4980 | { |
4981 | return sprintf(buf: page, fmt: "%llu\n" , |
4982 | (unsigned long long) |
4983 | atomic64_read(v: &mddev->resync_mismatches)); |
4984 | } |
4985 | |
4986 | static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); |
4987 | |
4988 | static ssize_t |
4989 | sync_min_show(struct mddev *mddev, char *page) |
4990 | { |
4991 | return sprintf(buf: page, fmt: "%d (%s)\n" , speed_min(mddev), |
4992 | mddev->sync_speed_min ? "local" : "system" ); |
4993 | } |
4994 | |
4995 | static ssize_t |
4996 | sync_min_store(struct mddev *mddev, const char *buf, size_t len) |
4997 | { |
4998 | unsigned int min; |
4999 | int rv; |
5000 | |
5001 | if (strncmp(buf, "system" , 6)==0) { |
5002 | min = 0; |
5003 | } else { |
5004 | rv = kstrtouint(s: buf, base: 10, res: &min); |
5005 | if (rv < 0) |
5006 | return rv; |
5007 | if (min == 0) |
5008 | return -EINVAL; |
5009 | } |
5010 | mddev->sync_speed_min = min; |
5011 | return len; |
5012 | } |
5013 | |
5014 | static struct md_sysfs_entry md_sync_min = |
5015 | __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); |
5016 | |
5017 | static ssize_t |
5018 | sync_max_show(struct mddev *mddev, char *page) |
5019 | { |
5020 | return sprintf(buf: page, fmt: "%d (%s)\n" , speed_max(mddev), |
5021 | mddev->sync_speed_max ? "local" : "system" ); |
5022 | } |
5023 | |
5024 | static ssize_t |
5025 | sync_max_store(struct mddev *mddev, const char *buf, size_t len) |
5026 | { |
5027 | unsigned int max; |
5028 | int rv; |
5029 | |
5030 | if (strncmp(buf, "system" , 6)==0) { |
5031 | max = 0; |
5032 | } else { |
5033 | rv = kstrtouint(s: buf, base: 10, res: &max); |
5034 | if (rv < 0) |
5035 | return rv; |
5036 | if (max == 0) |
5037 | return -EINVAL; |
5038 | } |
5039 | mddev->sync_speed_max = max; |
5040 | return len; |
5041 | } |
5042 | |
5043 | static struct md_sysfs_entry md_sync_max = |
5044 | __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); |
5045 | |
5046 | static ssize_t |
5047 | degraded_show(struct mddev *mddev, char *page) |
5048 | { |
5049 | return sprintf(buf: page, fmt: "%d\n" , mddev->degraded); |
5050 | } |
5051 | static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); |
5052 | |
5053 | static ssize_t |
5054 | sync_force_parallel_show(struct mddev *mddev, char *page) |
5055 | { |
5056 | return sprintf(buf: page, fmt: "%d\n" , mddev->parallel_resync); |
5057 | } |
5058 | |
5059 | static ssize_t |
5060 | sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) |
5061 | { |
5062 | long n; |
5063 | |
5064 | if (kstrtol(s: buf, base: 10, res: &n)) |
5065 | return -EINVAL; |
5066 | |
5067 | if (n != 0 && n != 1) |
5068 | return -EINVAL; |
5069 | |
5070 | mddev->parallel_resync = n; |
5071 | |
5072 | if (mddev->sync_thread) |
5073 | wake_up(&resync_wait); |
5074 | |
5075 | return len; |
5076 | } |
5077 | |
5078 | /* force parallel resync, even with shared block devices */ |
5079 | static struct md_sysfs_entry md_sync_force_parallel = |
5080 | __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, |
5081 | sync_force_parallel_show, sync_force_parallel_store); |
5082 | |
5083 | static ssize_t |
5084 | sync_speed_show(struct mddev *mddev, char *page) |
5085 | { |
5086 | unsigned long resync, dt, db; |
5087 | if (mddev->curr_resync == MD_RESYNC_NONE) |
5088 | return sprintf(buf: page, fmt: "none\n" ); |
5089 | resync = mddev->curr_mark_cnt - atomic_read(v: &mddev->recovery_active); |
5090 | dt = (jiffies - mddev->resync_mark) / HZ; |
5091 | if (!dt) dt++; |
5092 | db = resync - mddev->resync_mark_cnt; |
5093 | return sprintf(buf: page, fmt: "%lu\n" , db/dt/2); /* K/sec */ |
5094 | } |
5095 | |
5096 | static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); |
5097 | |
5098 | static ssize_t |
5099 | sync_completed_show(struct mddev *mddev, char *page) |
5100 | { |
5101 | unsigned long long max_sectors, resync; |
5102 | |
5103 | if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
5104 | return sprintf(buf: page, fmt: "none\n" ); |
5105 | |
5106 | if (mddev->curr_resync == MD_RESYNC_YIELDED || |
5107 | mddev->curr_resync == MD_RESYNC_DELAYED) |
5108 | return sprintf(buf: page, fmt: "delayed\n" ); |
5109 | |
5110 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || |
5111 | test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) |
5112 | max_sectors = mddev->resync_max_sectors; |
5113 | else |
5114 | max_sectors = mddev->dev_sectors; |
5115 | |
5116 | resync = mddev->curr_resync_completed; |
5117 | return sprintf(buf: page, fmt: "%llu / %llu\n" , resync, max_sectors); |
5118 | } |
5119 | |
5120 | static struct md_sysfs_entry md_sync_completed = |
5121 | __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); |
5122 | |
5123 | static ssize_t |
5124 | min_sync_show(struct mddev *mddev, char *page) |
5125 | { |
5126 | return sprintf(buf: page, fmt: "%llu\n" , |
5127 | (unsigned long long)mddev->resync_min); |
5128 | } |
5129 | static ssize_t |
5130 | min_sync_store(struct mddev *mddev, const char *buf, size_t len) |
5131 | { |
5132 | unsigned long long min; |
5133 | int err; |
5134 | |
5135 | if (kstrtoull(s: buf, base: 10, res: &min)) |
5136 | return -EINVAL; |
5137 | |
5138 | spin_lock(lock: &mddev->lock); |
5139 | err = -EINVAL; |
5140 | if (min > mddev->resync_max) |
5141 | goto out_unlock; |
5142 | |
5143 | err = -EBUSY; |
5144 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
5145 | goto out_unlock; |
5146 | |
5147 | /* Round down to multiple of 4K for safety */ |
5148 | mddev->resync_min = round_down(min, 8); |
5149 | err = 0; |
5150 | |
5151 | out_unlock: |
5152 | spin_unlock(lock: &mddev->lock); |
5153 | return err ?: len; |
5154 | } |
5155 | |
5156 | static struct md_sysfs_entry md_min_sync = |
5157 | __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); |
5158 | |
5159 | static ssize_t |
5160 | max_sync_show(struct mddev *mddev, char *page) |
5161 | { |
5162 | if (mddev->resync_max == MaxSector) |
5163 | return sprintf(buf: page, fmt: "max\n" ); |
5164 | else |
5165 | return sprintf(buf: page, fmt: "%llu\n" , |
5166 | (unsigned long long)mddev->resync_max); |
5167 | } |
5168 | static ssize_t |
5169 | max_sync_store(struct mddev *mddev, const char *buf, size_t len) |
5170 | { |
5171 | int err; |
5172 | spin_lock(lock: &mddev->lock); |
5173 | if (strncmp(buf, "max" , 3) == 0) |
5174 | mddev->resync_max = MaxSector; |
5175 | else { |
5176 | unsigned long long max; |
5177 | int chunk; |
5178 | |
5179 | err = -EINVAL; |
5180 | if (kstrtoull(s: buf, base: 10, res: &max)) |
5181 | goto out_unlock; |
5182 | if (max < mddev->resync_min) |
5183 | goto out_unlock; |
5184 | |
5185 | err = -EBUSY; |
5186 | if (max < mddev->resync_max && md_is_rdwr(mddev) && |
5187 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
5188 | goto out_unlock; |
5189 | |
5190 | /* Must be a multiple of chunk_size */ |
5191 | chunk = mddev->chunk_sectors; |
5192 | if (chunk) { |
5193 | sector_t temp = max; |
5194 | |
5195 | err = -EINVAL; |
5196 | if (sector_div(temp, chunk)) |
5197 | goto out_unlock; |
5198 | } |
5199 | mddev->resync_max = max; |
5200 | } |
5201 | wake_up(&mddev->recovery_wait); |
5202 | err = 0; |
5203 | out_unlock: |
5204 | spin_unlock(lock: &mddev->lock); |
5205 | return err ?: len; |
5206 | } |
5207 | |
5208 | static struct md_sysfs_entry md_max_sync = |
5209 | __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); |
5210 | |
5211 | static ssize_t |
5212 | suspend_lo_show(struct mddev *mddev, char *page) |
5213 | { |
5214 | return sprintf(buf: page, fmt: "%llu\n" , |
5215 | (unsigned long long)READ_ONCE(mddev->suspend_lo)); |
5216 | } |
5217 | |
5218 | static ssize_t |
5219 | suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) |
5220 | { |
5221 | unsigned long long new; |
5222 | int err; |
5223 | |
5224 | err = kstrtoull(s: buf, base: 10, res: &new); |
5225 | if (err < 0) |
5226 | return err; |
5227 | if (new != (sector_t)new) |
5228 | return -EINVAL; |
5229 | |
5230 | err = mddev_suspend(mddev, true); |
5231 | if (err) |
5232 | return err; |
5233 | |
5234 | WRITE_ONCE(mddev->suspend_lo, new); |
5235 | mddev_resume(mddev); |
5236 | |
5237 | return len; |
5238 | } |
5239 | static struct md_sysfs_entry md_suspend_lo = |
5240 | __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); |
5241 | |
5242 | static ssize_t |
5243 | suspend_hi_show(struct mddev *mddev, char *page) |
5244 | { |
5245 | return sprintf(buf: page, fmt: "%llu\n" , |
5246 | (unsigned long long)READ_ONCE(mddev->suspend_hi)); |
5247 | } |
5248 | |
5249 | static ssize_t |
5250 | suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) |
5251 | { |
5252 | unsigned long long new; |
5253 | int err; |
5254 | |
5255 | err = kstrtoull(s: buf, base: 10, res: &new); |
5256 | if (err < 0) |
5257 | return err; |
5258 | if (new != (sector_t)new) |
5259 | return -EINVAL; |
5260 | |
5261 | err = mddev_suspend(mddev, true); |
5262 | if (err) |
5263 | return err; |
5264 | |
5265 | WRITE_ONCE(mddev->suspend_hi, new); |
5266 | mddev_resume(mddev); |
5267 | |
5268 | return len; |
5269 | } |
5270 | static struct md_sysfs_entry md_suspend_hi = |
5271 | __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); |
5272 | |
5273 | static ssize_t |
5274 | reshape_position_show(struct mddev *mddev, char *page) |
5275 | { |
5276 | if (mddev->reshape_position != MaxSector) |
5277 | return sprintf(buf: page, fmt: "%llu\n" , |
5278 | (unsigned long long)mddev->reshape_position); |
5279 | strcpy(p: page, q: "none\n" ); |
5280 | return 5; |
5281 | } |
5282 | |
5283 | static ssize_t |
5284 | reshape_position_store(struct mddev *mddev, const char *buf, size_t len) |
5285 | { |
5286 | struct md_rdev *rdev; |
5287 | unsigned long long new; |
5288 | int err; |
5289 | |
5290 | err = kstrtoull(s: buf, base: 10, res: &new); |
5291 | if (err < 0) |
5292 | return err; |
5293 | if (new != (sector_t)new) |
5294 | return -EINVAL; |
5295 | err = mddev_lock(mddev); |
5296 | if (err) |
5297 | return err; |
5298 | err = -EBUSY; |
5299 | if (mddev->pers) |
5300 | goto unlock; |
5301 | mddev->reshape_position = new; |
5302 | mddev->delta_disks = 0; |
5303 | mddev->reshape_backwards = 0; |
5304 | mddev->new_level = mddev->level; |
5305 | mddev->new_layout = mddev->layout; |
5306 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
5307 | rdev_for_each(rdev, mddev) |
5308 | rdev->new_data_offset = rdev->data_offset; |
5309 | err = 0; |
5310 | unlock: |
5311 | mddev_unlock(mddev); |
5312 | return err ?: len; |
5313 | } |
5314 | |
5315 | static struct md_sysfs_entry md_reshape_position = |
5316 | __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, |
5317 | reshape_position_store); |
5318 | |
5319 | static ssize_t |
5320 | reshape_direction_show(struct mddev *mddev, char *page) |
5321 | { |
5322 | return sprintf(buf: page, fmt: "%s\n" , |
5323 | mddev->reshape_backwards ? "backwards" : "forwards" ); |
5324 | } |
5325 | |
5326 | static ssize_t |
5327 | reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) |
5328 | { |
5329 | int backwards = 0; |
5330 | int err; |
5331 | |
5332 | if (cmd_match(cmd: buf, str: "forwards" )) |
5333 | backwards = 0; |
5334 | else if (cmd_match(cmd: buf, str: "backwards" )) |
5335 | backwards = 1; |
5336 | else |
5337 | return -EINVAL; |
5338 | if (mddev->reshape_backwards == backwards) |
5339 | return len; |
5340 | |
5341 | err = mddev_lock(mddev); |
5342 | if (err) |
5343 | return err; |
5344 | /* check if we are allowed to change */ |
5345 | if (mddev->delta_disks) |
5346 | err = -EBUSY; |
5347 | else if (mddev->persistent && |
5348 | mddev->major_version == 0) |
5349 | err = -EINVAL; |
5350 | else |
5351 | mddev->reshape_backwards = backwards; |
5352 | mddev_unlock(mddev); |
5353 | return err ?: len; |
5354 | } |
5355 | |
5356 | static struct md_sysfs_entry md_reshape_direction = |
5357 | __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, |
5358 | reshape_direction_store); |
5359 | |
5360 | static ssize_t |
5361 | array_size_show(struct mddev *mddev, char *page) |
5362 | { |
5363 | if (mddev->external_size) |
5364 | return sprintf(buf: page, fmt: "%llu\n" , |
5365 | (unsigned long long)mddev->array_sectors/2); |
5366 | else |
5367 | return sprintf(buf: page, fmt: "default\n" ); |
5368 | } |
5369 | |
5370 | static ssize_t |
5371 | array_size_store(struct mddev *mddev, const char *buf, size_t len) |
5372 | { |
5373 | sector_t sectors; |
5374 | int err; |
5375 | |
5376 | err = mddev_lock(mddev); |
5377 | if (err) |
5378 | return err; |
5379 | |
5380 | /* cluster raid doesn't support change array_sectors */ |
5381 | if (mddev_is_clustered(mddev)) { |
5382 | mddev_unlock(mddev); |
5383 | return -EINVAL; |
5384 | } |
5385 | |
5386 | if (strncmp(buf, "default" , 7) == 0) { |
5387 | if (mddev->pers) |
5388 | sectors = mddev->pers->size(mddev, 0, 0); |
5389 | else |
5390 | sectors = mddev->array_sectors; |
5391 | |
5392 | mddev->external_size = 0; |
5393 | } else { |
5394 | if (strict_blocks_to_sectors(buf, sectors: §ors) < 0) |
5395 | err = -EINVAL; |
5396 | else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) |
5397 | err = -E2BIG; |
5398 | else |
5399 | mddev->external_size = 1; |
5400 | } |
5401 | |
5402 | if (!err) { |
5403 | mddev->array_sectors = sectors; |
5404 | if (mddev->pers) |
5405 | set_capacity_and_notify(disk: mddev->gendisk, |
5406 | size: mddev->array_sectors); |
5407 | } |
5408 | mddev_unlock(mddev); |
5409 | return err ?: len; |
5410 | } |
5411 | |
5412 | static struct md_sysfs_entry md_array_size = |
5413 | __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, |
5414 | array_size_store); |
5415 | |
5416 | static ssize_t |
5417 | consistency_policy_show(struct mddev *mddev, char *page) |
5418 | { |
5419 | int ret; |
5420 | |
5421 | if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { |
5422 | ret = sprintf(buf: page, fmt: "journal\n" ); |
5423 | } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { |
5424 | ret = sprintf(buf: page, fmt: "ppl\n" ); |
5425 | } else if (mddev->bitmap) { |
5426 | ret = sprintf(buf: page, fmt: "bitmap\n" ); |
5427 | } else if (mddev->pers) { |
5428 | if (mddev->pers->sync_request) |
5429 | ret = sprintf(buf: page, fmt: "resync\n" ); |
5430 | else |
5431 | ret = sprintf(buf: page, fmt: "none\n" ); |
5432 | } else { |
5433 | ret = sprintf(buf: page, fmt: "unknown\n" ); |
5434 | } |
5435 | |
5436 | return ret; |
5437 | } |
5438 | |
5439 | static ssize_t |
5440 | consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) |
5441 | { |
5442 | int err = 0; |
5443 | |
5444 | if (mddev->pers) { |
5445 | if (mddev->pers->change_consistency_policy) |
5446 | err = mddev->pers->change_consistency_policy(mddev, buf); |
5447 | else |
5448 | err = -EBUSY; |
5449 | } else if (mddev->external && strncmp(buf, "ppl" , 3) == 0) { |
5450 | set_bit(nr: MD_HAS_PPL, addr: &mddev->flags); |
5451 | } else { |
5452 | err = -EINVAL; |
5453 | } |
5454 | |
5455 | return err ? err : len; |
5456 | } |
5457 | |
5458 | static struct md_sysfs_entry md_consistency_policy = |
5459 | __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, |
5460 | consistency_policy_store); |
5461 | |
5462 | static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) |
5463 | { |
5464 | return sprintf(buf: page, fmt: "%d\n" , mddev->fail_last_dev); |
5465 | } |
5466 | |
5467 | /* |
5468 | * Setting fail_last_dev to true to allow last device to be forcibly removed |
5469 | * from RAID1/RAID10. |
5470 | */ |
5471 | static ssize_t |
5472 | fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) |
5473 | { |
5474 | int ret; |
5475 | bool value; |
5476 | |
5477 | ret = kstrtobool(s: buf, res: &value); |
5478 | if (ret) |
5479 | return ret; |
5480 | |
5481 | if (value != mddev->fail_last_dev) |
5482 | mddev->fail_last_dev = value; |
5483 | |
5484 | return len; |
5485 | } |
5486 | static struct md_sysfs_entry md_fail_last_dev = |
5487 | __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, |
5488 | fail_last_dev_store); |
5489 | |
5490 | static ssize_t serialize_policy_show(struct mddev *mddev, char *page) |
5491 | { |
5492 | if (mddev->pers == NULL || (mddev->pers->level != 1)) |
5493 | return sprintf(buf: page, fmt: "n/a\n" ); |
5494 | else |
5495 | return sprintf(buf: page, fmt: "%d\n" , mddev->serialize_policy); |
5496 | } |
5497 | |
5498 | /* |
5499 | * Setting serialize_policy to true to enforce write IO is not reordered |
5500 | * for raid1. |
5501 | */ |
5502 | static ssize_t |
5503 | serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) |
5504 | { |
5505 | int err; |
5506 | bool value; |
5507 | |
5508 | err = kstrtobool(s: buf, res: &value); |
5509 | if (err) |
5510 | return err; |
5511 | |
5512 | if (value == mddev->serialize_policy) |
5513 | return len; |
5514 | |
5515 | err = mddev_suspend_and_lock(mddev); |
5516 | if (err) |
5517 | return err; |
5518 | if (mddev->pers == NULL || (mddev->pers->level != 1)) { |
5519 | pr_err("md: serialize_policy is only effective for raid1\n" ); |
5520 | err = -EINVAL; |
5521 | goto unlock; |
5522 | } |
5523 | |
5524 | if (value) |
5525 | mddev_create_serial_pool(mddev, NULL); |
5526 | else |
5527 | mddev_destroy_serial_pool(mddev, NULL); |
5528 | mddev->serialize_policy = value; |
5529 | unlock: |
5530 | mddev_unlock_and_resume(mddev); |
5531 | return err ?: len; |
5532 | } |
5533 | |
5534 | static struct md_sysfs_entry md_serialize_policy = |
5535 | __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, |
5536 | serialize_policy_store); |
5537 | |
5538 | |
5539 | static struct attribute *md_default_attrs[] = { |
5540 | &md_level.attr, |
5541 | &md_layout.attr, |
5542 | &md_raid_disks.attr, |
5543 | &md_uuid.attr, |
5544 | &md_chunk_size.attr, |
5545 | &md_size.attr, |
5546 | &md_resync_start.attr, |
5547 | &md_metadata.attr, |
5548 | &md_new_device.attr, |
5549 | &md_safe_delay.attr, |
5550 | &md_array_state.attr, |
5551 | &md_reshape_position.attr, |
5552 | &md_reshape_direction.attr, |
5553 | &md_array_size.attr, |
5554 | &max_corr_read_errors.attr, |
5555 | &md_consistency_policy.attr, |
5556 | &md_fail_last_dev.attr, |
5557 | &md_serialize_policy.attr, |
5558 | NULL, |
5559 | }; |
5560 | |
5561 | static const struct attribute_group md_default_group = { |
5562 | .attrs = md_default_attrs, |
5563 | }; |
5564 | |
5565 | static struct attribute *md_redundancy_attrs[] = { |
5566 | &md_scan_mode.attr, |
5567 | &md_last_scan_mode.attr, |
5568 | &md_mismatches.attr, |
5569 | &md_sync_min.attr, |
5570 | &md_sync_max.attr, |
5571 | &md_sync_speed.attr, |
5572 | &md_sync_force_parallel.attr, |
5573 | &md_sync_completed.attr, |
5574 | &md_min_sync.attr, |
5575 | &md_max_sync.attr, |
5576 | &md_suspend_lo.attr, |
5577 | &md_suspend_hi.attr, |
5578 | &md_bitmap.attr, |
5579 | &md_degraded.attr, |
5580 | NULL, |
5581 | }; |
5582 | static const struct attribute_group md_redundancy_group = { |
5583 | .name = NULL, |
5584 | .attrs = md_redundancy_attrs, |
5585 | }; |
5586 | |
5587 | static const struct attribute_group *md_attr_groups[] = { |
5588 | &md_default_group, |
5589 | &md_bitmap_group, |
5590 | NULL, |
5591 | }; |
5592 | |
5593 | static ssize_t |
5594 | md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) |
5595 | { |
5596 | struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); |
5597 | struct mddev *mddev = container_of(kobj, struct mddev, kobj); |
5598 | ssize_t rv; |
5599 | |
5600 | if (!entry->show) |
5601 | return -EIO; |
5602 | spin_lock(lock: &all_mddevs_lock); |
5603 | if (!mddev_get(mddev)) { |
5604 | spin_unlock(lock: &all_mddevs_lock); |
5605 | return -EBUSY; |
5606 | } |
5607 | spin_unlock(lock: &all_mddevs_lock); |
5608 | |
5609 | rv = entry->show(mddev, page); |
5610 | mddev_put(mddev); |
5611 | return rv; |
5612 | } |
5613 | |
5614 | static ssize_t |
5615 | md_attr_store(struct kobject *kobj, struct attribute *attr, |
5616 | const char *page, size_t length) |
5617 | { |
5618 | struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); |
5619 | struct mddev *mddev = container_of(kobj, struct mddev, kobj); |
5620 | ssize_t rv; |
5621 | |
5622 | if (!entry->store) |
5623 | return -EIO; |
5624 | if (!capable(CAP_SYS_ADMIN)) |
5625 | return -EACCES; |
5626 | spin_lock(lock: &all_mddevs_lock); |
5627 | if (!mddev_get(mddev)) { |
5628 | spin_unlock(lock: &all_mddevs_lock); |
5629 | return -EBUSY; |
5630 | } |
5631 | spin_unlock(lock: &all_mddevs_lock); |
5632 | rv = entry->store(mddev, page, length); |
5633 | mddev_put(mddev); |
5634 | return rv; |
5635 | } |
5636 | |
5637 | static void md_kobj_release(struct kobject *ko) |
5638 | { |
5639 | struct mddev *mddev = container_of(ko, struct mddev, kobj); |
5640 | |
5641 | if (mddev->sysfs_state) |
5642 | sysfs_put(kn: mddev->sysfs_state); |
5643 | if (mddev->sysfs_level) |
5644 | sysfs_put(kn: mddev->sysfs_level); |
5645 | |
5646 | del_gendisk(gp: mddev->gendisk); |
5647 | put_disk(disk: mddev->gendisk); |
5648 | } |
5649 | |
5650 | static const struct sysfs_ops md_sysfs_ops = { |
5651 | .show = md_attr_show, |
5652 | .store = md_attr_store, |
5653 | }; |
5654 | static const struct kobj_type md_ktype = { |
5655 | .release = md_kobj_release, |
5656 | .sysfs_ops = &md_sysfs_ops, |
5657 | .default_groups = md_attr_groups, |
5658 | }; |
5659 | |
/*
 * Second block major accepted by md_alloc() besides MD_MAJOR.  Starts out
 * as zero; presumably assigned where the major is registered - TODO confirm.
 */
int mdp_major;
5661 | |
5662 | static void mddev_delayed_delete(struct work_struct *ws) |
5663 | { |
5664 | struct mddev *mddev = container_of(ws, struct mddev, del_work); |
5665 | |
5666 | kobject_put(kobj: &mddev->kobj); |
5667 | } |
5668 | |
5669 | struct mddev *md_alloc(dev_t dev, char *name) |
5670 | { |
5671 | /* |
5672 | * If dev is zero, name is the name of a device to allocate with |
5673 | * an arbitrary minor number. It will be "md_???" |
5674 | * If dev is non-zero it must be a device number with a MAJOR of |
5675 | * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then |
5676 | * the device is being created by opening a node in /dev. |
5677 | * If "name" is not NULL, the device is being created by |
5678 | * writing to /sys/module/md_mod/parameters/new_array. |
5679 | */ |
5680 | static DEFINE_MUTEX(disks_mutex); |
5681 | struct mddev *mddev; |
5682 | struct gendisk *disk; |
5683 | int partitioned; |
5684 | int shift; |
5685 | int unit; |
5686 | int error ; |
5687 | |
5688 | /* |
5689 | * Wait for any previous instance of this device to be completely |
5690 | * removed (mddev_delayed_delete). |
5691 | */ |
5692 | flush_workqueue(md_misc_wq); |
5693 | |
5694 | mutex_lock(&disks_mutex); |
5695 | mddev = mddev_alloc(unit: dev); |
5696 | if (IS_ERR(ptr: mddev)) { |
5697 | error = PTR_ERR(ptr: mddev); |
5698 | goto out_unlock; |
5699 | } |
5700 | |
5701 | partitioned = (MAJOR(mddev->unit) != MD_MAJOR); |
5702 | shift = partitioned ? MdpMinorShift : 0; |
5703 | unit = MINOR(mddev->unit) >> shift; |
5704 | |
5705 | if (name && !dev) { |
5706 | /* Need to ensure that 'name' is not a duplicate. |
5707 | */ |
5708 | struct mddev *mddev2; |
5709 | spin_lock(lock: &all_mddevs_lock); |
5710 | |
5711 | list_for_each_entry(mddev2, &all_mddevs, all_mddevs) |
5712 | if (mddev2->gendisk && |
5713 | strcmp(mddev2->gendisk->disk_name, name) == 0) { |
5714 | spin_unlock(lock: &all_mddevs_lock); |
5715 | error = -EEXIST; |
5716 | goto out_free_mddev; |
5717 | } |
5718 | spin_unlock(lock: &all_mddevs_lock); |
5719 | } |
5720 | if (name && dev) |
5721 | /* |
5722 | * Creating /dev/mdNNN via "newarray", so adjust hold_active. |
5723 | */ |
5724 | mddev->hold_active = UNTIL_STOP; |
5725 | |
5726 | error = -ENOMEM; |
5727 | disk = blk_alloc_disk(NUMA_NO_NODE); |
5728 | if (!disk) |
5729 | goto out_free_mddev; |
5730 | |
5731 | disk->major = MAJOR(mddev->unit); |
5732 | disk->first_minor = unit << shift; |
5733 | disk->minors = 1 << shift; |
5734 | if (name) |
5735 | strcpy(p: disk->disk_name, q: name); |
5736 | else if (partitioned) |
5737 | sprintf(buf: disk->disk_name, fmt: "md_d%d" , unit); |
5738 | else |
5739 | sprintf(buf: disk->disk_name, fmt: "md%d" , unit); |
5740 | disk->fops = &md_fops; |
5741 | disk->private_data = mddev; |
5742 | |
5743 | mddev->queue = disk->queue; |
5744 | blk_set_stacking_limits(lim: &mddev->queue->limits); |
5745 | blk_queue_write_cache(q: mddev->queue, enabled: true, fua: true); |
5746 | disk->events |= DISK_EVENT_MEDIA_CHANGE; |
5747 | mddev->gendisk = disk; |
5748 | error = add_disk(disk); |
5749 | if (error) |
5750 | goto out_put_disk; |
5751 | |
5752 | kobject_init(kobj: &mddev->kobj, ktype: &md_ktype); |
5753 | error = kobject_add(kobj: &mddev->kobj, parent: &disk_to_dev(disk)->kobj, fmt: "%s" , "md" ); |
5754 | if (error) { |
5755 | /* |
5756 | * The disk is already live at this point. Clear the hold flag |
5757 | * and let mddev_put take care of the deletion, as it isn't any |
5758 | * different from a normal close on last release now. |
5759 | */ |
5760 | mddev->hold_active = 0; |
5761 | mutex_unlock(lock: &disks_mutex); |
5762 | mddev_put(mddev); |
5763 | return ERR_PTR(error); |
5764 | } |
5765 | |
5766 | kobject_uevent(kobj: &mddev->kobj, action: KOBJ_ADD); |
5767 | mddev->sysfs_state = sysfs_get_dirent_safe(sd: mddev->kobj.sd, name: "array_state" ); |
5768 | mddev->sysfs_level = sysfs_get_dirent_safe(sd: mddev->kobj.sd, name: "level" ); |
5769 | mutex_unlock(lock: &disks_mutex); |
5770 | return mddev; |
5771 | |
5772 | out_put_disk: |
5773 | put_disk(disk); |
5774 | out_free_mddev: |
5775 | mddev_free(mddev); |
5776 | out_unlock: |
5777 | mutex_unlock(lock: &disks_mutex); |
5778 | return ERR_PTR(error); |
5779 | } |
5780 | |
5781 | static int md_alloc_and_put(dev_t dev, char *name) |
5782 | { |
5783 | struct mddev *mddev = md_alloc(dev, name); |
5784 | |
5785 | if (IS_ERR(ptr: mddev)) |
5786 | return PTR_ERR(ptr: mddev); |
5787 | mddev_put(mddev); |
5788 | return 0; |
5789 | } |
5790 | |
5791 | static void md_probe(dev_t dev) |
5792 | { |
5793 | if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) |
5794 | return; |
5795 | if (create_on_open) |
5796 | md_alloc_and_put(dev, NULL); |
5797 | } |
5798 | |
5799 | static int add_named_array(const char *val, const struct kernel_param *kp) |
5800 | { |
5801 | /* |
5802 | * val must be "md_*" or "mdNNN". |
5803 | * For "md_*" we allocate an array with a large free minor number, and |
5804 | * set the name to val. val must not already be an active name. |
5805 | * For "mdNNN" we allocate an array with the minor number NNN |
5806 | * which must not already be in use. |
5807 | */ |
5808 | int len = strlen(val); |
5809 | char buf[DISK_NAME_LEN]; |
5810 | unsigned long devnum; |
5811 | |
5812 | while (len && val[len-1] == '\n') |
5813 | len--; |
5814 | if (len >= DISK_NAME_LEN) |
5815 | return -E2BIG; |
5816 | strscpy(p: buf, q: val, size: len+1); |
5817 | if (strncmp(buf, "md_" , 3) == 0) |
5818 | return md_alloc_and_put(dev: 0, name: buf); |
5819 | if (strncmp(buf, "md" , 2) == 0 && |
5820 | isdigit(c: buf[2]) && |
5821 | kstrtoul(s: buf+2, base: 10, res: &devnum) == 0 && |
5822 | devnum <= MINORMASK) |
5823 | return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); |
5824 | |
5825 | return -EINVAL; |
5826 | } |
5827 | |
5828 | static void md_safemode_timeout(struct timer_list *t) |
5829 | { |
5830 | struct mddev *mddev = from_timer(mddev, t, safemode_timer); |
5831 | |
5832 | mddev->safemode = 1; |
5833 | if (mddev->external) |
5834 | sysfs_notify_dirent_safe(sd: mddev->sysfs_state); |
5835 | |
5836 | md_wakeup_thread(thread: mddev->thread); |
5837 | } |
5838 | |
/* Copied into mddev->ok_start_degraded by md_run() before pers->run(). */
static int start_dirty_degraded;
5840 | |
5841 | int md_run(struct mddev *mddev) |
5842 | { |
5843 | int err; |
5844 | struct md_rdev *rdev; |
5845 | struct md_personality *pers; |
5846 | bool nowait = true; |
5847 | |
5848 | if (list_empty(head: &mddev->disks)) |
5849 | /* cannot run an array with no devices.. */ |
5850 | return -EINVAL; |
5851 | |
5852 | if (mddev->pers) |
5853 | return -EBUSY; |
5854 | /* Cannot run until previous stop completes properly */ |
5855 | if (mddev->sysfs_active) |
5856 | return -EBUSY; |
5857 | |
5858 | /* |
5859 | * Analyze all RAID superblock(s) |
5860 | */ |
5861 | if (!mddev->raid_disks) { |
5862 | if (!mddev->persistent) |
5863 | return -EINVAL; |
5864 | err = analyze_sbs(mddev); |
5865 | if (err) |
5866 | return -EINVAL; |
5867 | } |
5868 | |
5869 | if (mddev->level != LEVEL_NONE) |
5870 | request_module("md-level-%d" , mddev->level); |
5871 | else if (mddev->clevel[0]) |
5872 | request_module("md-%s" , mddev->clevel); |
5873 | |
5874 | /* |
5875 | * Drop all container device buffers, from now on |
5876 | * the only valid external interface is through the md |
5877 | * device. |
5878 | */ |
5879 | mddev->has_superblocks = false; |
5880 | rdev_for_each(rdev, mddev) { |
5881 | if (test_bit(Faulty, &rdev->flags)) |
5882 | continue; |
5883 | sync_blockdev(bdev: rdev->bdev); |
5884 | invalidate_bdev(bdev: rdev->bdev); |
5885 | if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { |
5886 | mddev->ro = MD_RDONLY; |
5887 | if (mddev->gendisk) |
5888 | set_disk_ro(disk: mddev->gendisk, read_only: 1); |
5889 | } |
5890 | |
5891 | if (rdev->sb_page) |
5892 | mddev->has_superblocks = true; |
5893 | |
5894 | /* perform some consistency tests on the device. |
5895 | * We don't want the data to overlap the metadata, |
5896 | * Internal Bitmap issues have been handled elsewhere. |
5897 | */ |
5898 | if (rdev->meta_bdev) { |
5899 | /* Nothing to check */; |
5900 | } else if (rdev->data_offset < rdev->sb_start) { |
5901 | if (mddev->dev_sectors && |
5902 | rdev->data_offset + mddev->dev_sectors |
5903 | > rdev->sb_start) { |
5904 | pr_warn("md: %s: data overlaps metadata\n" , |
5905 | mdname(mddev)); |
5906 | return -EINVAL; |
5907 | } |
5908 | } else { |
5909 | if (rdev->sb_start + rdev->sb_size/512 |
5910 | > rdev->data_offset) { |
5911 | pr_warn("md: %s: metadata overlaps data\n" , |
5912 | mdname(mddev)); |
5913 | return -EINVAL; |
5914 | } |
5915 | } |
5916 | sysfs_notify_dirent_safe(sd: rdev->sysfs_state); |
5917 | nowait = nowait && bdev_nowait(bdev: rdev->bdev); |
5918 | } |
5919 | |
5920 | if (!bioset_initialized(bs: &mddev->bio_set)) { |
5921 | err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, flags: BIOSET_NEED_BVECS); |
5922 | if (err) |
5923 | return err; |
5924 | } |
5925 | if (!bioset_initialized(bs: &mddev->sync_set)) { |
5926 | err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, flags: BIOSET_NEED_BVECS); |
5927 | if (err) |
5928 | goto exit_bio_set; |
5929 | } |
5930 | |
5931 | if (!bioset_initialized(bs: &mddev->io_clone_set)) { |
5932 | err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, |
5933 | offsetof(struct md_io_clone, bio_clone), flags: 0); |
5934 | if (err) |
5935 | goto exit_sync_set; |
5936 | } |
5937 | |
5938 | spin_lock(lock: &pers_lock); |
5939 | pers = find_pers(level: mddev->level, clevel: mddev->clevel); |
5940 | if (!pers || !try_module_get(module: pers->owner)) { |
5941 | spin_unlock(lock: &pers_lock); |
5942 | if (mddev->level != LEVEL_NONE) |
5943 | pr_warn("md: personality for level %d is not loaded!\n" , |
5944 | mddev->level); |
5945 | else |
5946 | pr_warn("md: personality for level %s is not loaded!\n" , |
5947 | mddev->clevel); |
5948 | err = -EINVAL; |
5949 | goto abort; |
5950 | } |
5951 | spin_unlock(lock: &pers_lock); |
5952 | if (mddev->level != pers->level) { |
5953 | mddev->level = pers->level; |
5954 | mddev->new_level = pers->level; |
5955 | } |
5956 | strscpy(p: mddev->clevel, q: pers->name, size: sizeof(mddev->clevel)); |
5957 | |
5958 | if (mddev->reshape_position != MaxSector && |
5959 | pers->start_reshape == NULL) { |
5960 | /* This personality cannot handle reshaping... */ |
5961 | module_put(module: pers->owner); |
5962 | err = -EINVAL; |
5963 | goto abort; |
5964 | } |
5965 | |
5966 | if (pers->sync_request) { |
5967 | /* Warn if this is a potentially silly |
5968 | * configuration. |
5969 | */ |
5970 | struct md_rdev *rdev2; |
5971 | int warned = 0; |
5972 | |
5973 | rdev_for_each(rdev, mddev) |
5974 | rdev_for_each(rdev2, mddev) { |
5975 | if (rdev < rdev2 && |
5976 | rdev->bdev->bd_disk == |
5977 | rdev2->bdev->bd_disk) { |
5978 | pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n" , |
5979 | mdname(mddev), |
5980 | rdev->bdev, |
5981 | rdev2->bdev); |
5982 | warned = 1; |
5983 | } |
5984 | } |
5985 | |
5986 | if (warned) |
5987 | pr_warn("True protection against single-disk failure might be compromised.\n" ); |
5988 | } |
5989 | |
5990 | mddev->recovery = 0; |
5991 | /* may be over-ridden by personality */ |
5992 | mddev->resync_max_sectors = mddev->dev_sectors; |
5993 | |
5994 | mddev->ok_start_degraded = start_dirty_degraded; |
5995 | |
5996 | if (start_readonly && md_is_rdwr(mddev)) |
5997 | mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ |
5998 | |
5999 | err = pers->run(mddev); |
6000 | if (err) |
6001 | pr_warn("md: pers->run() failed ...\n" ); |
6002 | else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { |
6003 | WARN_ONCE(!mddev->external_size, |
6004 | "%s: default size too small, but 'external_size' not in effect?\n" , |
6005 | __func__); |
6006 | pr_warn("md: invalid array_size %llu > default size %llu\n" , |
6007 | (unsigned long long)mddev->array_sectors / 2, |
6008 | (unsigned long long)pers->size(mddev, 0, 0) / 2); |
6009 | err = -EINVAL; |
6010 | } |
6011 | if (err == 0 && pers->sync_request && |
6012 | (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { |
6013 | struct bitmap *bitmap; |
6014 | |
6015 | bitmap = md_bitmap_create(mddev, slot: -1); |
6016 | if (IS_ERR(ptr: bitmap)) { |
6017 | err = PTR_ERR(ptr: bitmap); |
6018 | pr_warn("%s: failed to create bitmap (%d)\n" , |
6019 | mdname(mddev), err); |
6020 | } else |
6021 | mddev->bitmap = bitmap; |
6022 | |
6023 | } |
6024 | if (err) |
6025 | goto bitmap_abort; |
6026 | |
6027 | if (mddev->bitmap_info.max_write_behind > 0) { |
6028 | bool create_pool = false; |
6029 | |
6030 | rdev_for_each(rdev, mddev) { |
6031 | if (test_bit(WriteMostly, &rdev->flags) && |
6032 | rdev_init_serial(rdev)) |
6033 | create_pool = true; |
6034 | } |
6035 | if (create_pool && mddev->serial_info_pool == NULL) { |
6036 | mddev->serial_info_pool = |
6037 | mempool_create_kmalloc_pool(NR_SERIAL_INFOS, |
6038 | size: sizeof(struct serial_info)); |
6039 | if (!mddev->serial_info_pool) { |
6040 | err = -ENOMEM; |
6041 | goto bitmap_abort; |
6042 | } |
6043 | } |
6044 | } |
6045 | |
6046 | if (mddev->queue) { |
6047 | bool nonrot = true; |
6048 | |
6049 | rdev_for_each(rdev, mddev) { |
6050 | if (rdev->raid_disk >= 0 && !bdev_nonrot(bdev: rdev->bdev)) { |
6051 | nonrot = false; |
6052 | break; |
6053 | } |
6054 | } |
6055 | if (mddev->degraded) |
6056 | nonrot = false; |
6057 | if (nonrot) |
6058 | blk_queue_flag_set(QUEUE_FLAG_NONROT, q: mddev->queue); |
6059 | else |
6060 | blk_queue_flag_clear(QUEUE_FLAG_NONROT, q: mddev->queue); |
6061 | blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q: mddev->queue); |
6062 | |
6063 | /* Set the NOWAIT flags if all underlying devices support it */ |
6064 | if (nowait) |
6065 | blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q: mddev->queue); |
6066 | } |
6067 | if (pers->sync_request) { |
6068 | if (mddev->kobj.sd && |
6069 | sysfs_create_group(kobj: &mddev->kobj, grp: &md_redundancy_group)) |
6070 | pr_warn("md: cannot register extra attributes for %s\n" , |
6071 | mdname(mddev)); |
6072 | mddev->sysfs_action = sysfs_get_dirent_safe(sd: mddev->kobj.sd, name: "sync_action" ); |
6073 | mddev->sysfs_completed = sysfs_get_dirent_safe(sd: mddev->kobj.sd, name: "sync_completed" ); |
6074 | mddev->sysfs_degraded = sysfs_get_dirent_safe(sd: mddev->kobj.sd, name: "degraded" ); |
6075 | } else if (mddev->ro == MD_AUTO_READ) |
6076 | mddev->ro = MD_RDWR; |
6077 | |
6078 | atomic_set(v: &mddev->max_corr_read_errors, |
6079 | MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); |
6080 | mddev->safemode = 0; |
6081 | if (mddev_is_clustered(mddev)) |
6082 | mddev->safemode_delay = 0; |
6083 | else |
6084 | mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; |
6085 | mddev->in_sync = 1; |
6086 | smp_wmb(); |
6087 | spin_lock(lock: &mddev->lock); |
6088 | mddev->pers = pers; |
6089 | spin_unlock(lock: &mddev->lock); |
6090 | rdev_for_each(rdev, mddev) |
6091 | if (rdev->raid_disk >= 0) |
6092 | sysfs_link_rdev(mddev, rdev); /* failure here is OK */ |
6093 | |
6094 | if (mddev->degraded && md_is_rdwr(mddev)) |
6095 | /* This ensures that recovering status is reported immediately |
6096 | * via sysfs - until a lack of spares is confirmed. |
6097 | */ |
6098 | set_bit(nr: MD_RECOVERY_RECOVER, addr: &mddev->recovery); |
6099 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
6100 | |
6101 | if (mddev->sb_flags) |
6102 | md_update_sb(mddev, 0); |
6103 | |
6104 | md_new_event(); |
6105 | return 0; |
6106 | |
6107 | bitmap_abort: |
6108 | mddev_detach(mddev); |
6109 | if (mddev->private) |
6110 | pers->free(mddev, mddev->private); |
6111 | mddev->private = NULL; |
6112 | module_put(module: pers->owner); |
6113 | md_bitmap_destroy(mddev); |
6114 | abort: |
6115 | bioset_exit(&mddev->io_clone_set); |
6116 | exit_sync_set: |
6117 | bioset_exit(&mddev->sync_set); |
6118 | exit_bio_set: |
6119 | bioset_exit(&mddev->bio_set); |
6120 | return err; |
6121 | } |
6122 | EXPORT_SYMBOL_GPL(md_run); |
6123 | |
6124 | int do_md_run(struct mddev *mddev) |
6125 | { |
6126 | int err; |
6127 | |
6128 | set_bit(nr: MD_NOT_READY, addr: &mddev->flags); |
6129 | err = md_run(mddev); |
6130 | if (err) |
6131 | goto out; |
6132 | err = md_bitmap_load(mddev); |
6133 | if (err) { |
6134 | md_bitmap_destroy(mddev); |
6135 | goto out; |
6136 | } |
6137 | |
6138 | if (mddev_is_clustered(mddev)) |
6139 | md_allow_write(mddev); |
6140 | |
6141 | /* run start up tasks that require md_thread */ |
6142 | md_start(mddev); |
6143 | |
6144 | md_wakeup_thread(thread: mddev->thread); |
6145 | md_wakeup_thread(thread: mddev->sync_thread); /* possibly kick off a reshape */ |
6146 | |
6147 | set_capacity_and_notify(disk: mddev->gendisk, size: mddev->array_sectors); |
6148 | clear_bit(nr: MD_NOT_READY, addr: &mddev->flags); |
6149 | mddev->changed = 1; |
6150 | kobject_uevent(kobj: &disk_to_dev(mddev->gendisk)->kobj, action: KOBJ_CHANGE); |
6151 | sysfs_notify_dirent_safe(sd: mddev->sysfs_state); |
6152 | sysfs_notify_dirent_safe(sd: mddev->sysfs_action); |
6153 | sysfs_notify_dirent_safe(sd: mddev->sysfs_degraded); |
6154 | out: |
6155 | clear_bit(nr: MD_NOT_READY, addr: &mddev->flags); |
6156 | return err; |
6157 | } |
6158 | |
6159 | int md_start(struct mddev *mddev) |
6160 | { |
6161 | int ret = 0; |
6162 | |
6163 | if (mddev->pers->start) { |
6164 | set_bit(nr: MD_RECOVERY_WAIT, addr: &mddev->recovery); |
6165 | md_wakeup_thread(thread: mddev->thread); |
6166 | ret = mddev->pers->start(mddev); |
6167 | clear_bit(nr: MD_RECOVERY_WAIT, addr: &mddev->recovery); |
6168 | md_wakeup_thread(thread: mddev->sync_thread); |
6169 | } |
6170 | return ret; |
6171 | } |
6172 | EXPORT_SYMBOL_GPL(md_start); |
6173 | |
6174 | static int restart_array(struct mddev *mddev) |
6175 | { |
6176 | struct gendisk *disk = mddev->gendisk; |
6177 | struct md_rdev *rdev; |
6178 | bool has_journal = false; |
6179 | bool has_readonly = false; |
6180 | |
6181 | /* Complain if it has no devices */ |
6182 | if (list_empty(head: &mddev->disks)) |
6183 | return -ENXIO; |
6184 | if (!mddev->pers) |
6185 | return -EINVAL; |
6186 | if (md_is_rdwr(mddev)) |
6187 | return -EBUSY; |
6188 | |
6189 | rcu_read_lock(); |
6190 | rdev_for_each_rcu(rdev, mddev) { |
6191 | if (test_bit(Journal, &rdev->flags) && |
6192 | !test_bit(Faulty, &rdev->flags)) |
6193 | has_journal = true; |
6194 | if (rdev_read_only(rdev)) |
6195 | has_readonly = true; |
6196 | } |
6197 | rcu_read_unlock(); |
6198 | if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) |
6199 | /* Don't restart rw with journal missing/faulty */ |
6200 | return -EINVAL; |
6201 | if (has_readonly) |
6202 | return -EROFS; |
6203 | |
6204 | mddev->safemode = 0; |
6205 | mddev->ro = MD_RDWR; |
6206 | set_disk_ro(disk, read_only: 0); |
6207 | pr_debug("md: %s switched to read-write mode.\n" , mdname(mddev)); |
6208 | /* Kick recovery or resync if necessary */ |
6209 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
6210 | md_wakeup_thread(thread: mddev->thread); |
6211 | md_wakeup_thread(thread: mddev->sync_thread); |
6212 | sysfs_notify_dirent_safe(sd: mddev->sysfs_state); |
6213 | return 0; |
6214 | } |
6215 | |
6216 | static void md_clean(struct mddev *mddev) |
6217 | { |
6218 | mddev->array_sectors = 0; |
6219 | mddev->external_size = 0; |
6220 | mddev->dev_sectors = 0; |
6221 | mddev->raid_disks = 0; |
6222 | mddev->recovery_cp = 0; |
6223 | mddev->resync_min = 0; |
6224 | mddev->resync_max = MaxSector; |
6225 | mddev->reshape_position = MaxSector; |
6226 | /* we still need mddev->external in export_rdev, do not clear it yet */ |
6227 | mddev->persistent = 0; |
6228 | mddev->level = LEVEL_NONE; |
6229 | mddev->clevel[0] = 0; |
6230 | mddev->flags = 0; |
6231 | mddev->sb_flags = 0; |
6232 | mddev->ro = MD_RDWR; |
6233 | mddev->metadata_type[0] = 0; |
6234 | mddev->chunk_sectors = 0; |
6235 | mddev->ctime = mddev->utime = 0; |
6236 | mddev->layout = 0; |
6237 | mddev->max_disks = 0; |
6238 | mddev->events = 0; |
6239 | mddev->can_decrease_events = 0; |
6240 | mddev->delta_disks = 0; |
6241 | mddev->reshape_backwards = 0; |
6242 | mddev->new_level = LEVEL_NONE; |
6243 | mddev->new_layout = 0; |
6244 | mddev->new_chunk_sectors = 0; |
6245 | mddev->curr_resync = MD_RESYNC_NONE; |
6246 | atomic64_set(v: &mddev->resync_mismatches, i: 0); |
6247 | mddev->suspend_lo = mddev->suspend_hi = 0; |
6248 | mddev->sync_speed_min = mddev->sync_speed_max = 0; |
6249 | mddev->recovery = 0; |
6250 | mddev->in_sync = 0; |
6251 | mddev->changed = 0; |
6252 | mddev->degraded = 0; |
6253 | mddev->safemode = 0; |
6254 | mddev->private = NULL; |
6255 | mddev->cluster_info = NULL; |
6256 | mddev->bitmap_info.offset = 0; |
6257 | mddev->bitmap_info.default_offset = 0; |
6258 | mddev->bitmap_info.default_space = 0; |
6259 | mddev->bitmap_info.chunksize = 0; |
6260 | mddev->bitmap_info.daemon_sleep = 0; |
6261 | mddev->bitmap_info.max_write_behind = 0; |
6262 | mddev->bitmap_info.nodes = 0; |
6263 | } |
6264 | |
6265 | static void __md_stop_writes(struct mddev *mddev) |
6266 | { |
6267 | set_bit(nr: MD_RECOVERY_FROZEN, addr: &mddev->recovery); |
6268 | if (work_pending(&mddev->del_work)) |
6269 | flush_workqueue(md_misc_wq); |
6270 | if (mddev->sync_thread) { |
6271 | set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery); |
6272 | md_reap_sync_thread(mddev); |
6273 | } |
6274 | |
6275 | del_timer_sync(timer: &mddev->safemode_timer); |
6276 | |
6277 | if (mddev->pers && mddev->pers->quiesce) { |
6278 | mddev->pers->quiesce(mddev, 1); |
6279 | mddev->pers->quiesce(mddev, 0); |
6280 | } |
6281 | md_bitmap_flush(mddev); |
6282 | |
6283 | if (md_is_rdwr(mddev) && |
6284 | ((!mddev->in_sync && !mddev_is_clustered(mddev)) || |
6285 | mddev->sb_flags)) { |
6286 | /* mark array as shutdown cleanly */ |
6287 | if (!mddev_is_clustered(mddev)) |
6288 | mddev->in_sync = 1; |
6289 | md_update_sb(mddev, 1); |
6290 | } |
6291 | /* disable policy to guarantee rdevs free resources for serialization */ |
6292 | mddev->serialize_policy = 0; |
6293 | mddev_destroy_serial_pool(mddev, NULL); |
6294 | } |
6295 | |
/*
 * Locked wrapper around __md_stop_writes(): takes the mddev lock
 * (non-interruptibly), stops writes, and releases the lock again.
 */
void md_stop_writes(struct mddev *mddev)
{
	mddev_lock_nointr(mddev);
	__md_stop_writes(mddev);
	mddev_unlock(mddev);
}
EXPORT_SYMBOL_GPL(md_stop_writes);
6303 | |
6304 | static void mddev_detach(struct mddev *mddev) |
6305 | { |
6306 | md_bitmap_wait_behind_writes(mddev); |
6307 | if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { |
6308 | mddev->pers->quiesce(mddev, 1); |
6309 | mddev->pers->quiesce(mddev, 0); |
6310 | } |
6311 | md_unregister_thread(mddev, threadp: &mddev->thread); |
6312 | if (mddev->queue) |
6313 | blk_sync_queue(q: mddev->queue); /* the unplug fn references 'conf'*/ |
6314 | } |
6315 | |
6316 | static void __md_stop(struct mddev *mddev) |
6317 | { |
6318 | struct md_personality *pers = mddev->pers; |
6319 | md_bitmap_destroy(mddev); |
6320 | mddev_detach(mddev); |
6321 | /* Ensure ->event_work is done */ |
6322 | if (mddev->event_work.func) |
6323 | flush_workqueue(md_misc_wq); |
6324 | spin_lock(lock: &mddev->lock); |
6325 | mddev->pers = NULL; |
6326 | spin_unlock(lock: &mddev->lock); |
6327 | if (mddev->private) |
6328 | pers->free(mddev, mddev->private); |
6329 | mddev->private = NULL; |
6330 | if (pers->sync_request && mddev->to_remove == NULL) |
6331 | mddev->to_remove = &md_redundancy_group; |
6332 | module_put(module: pers->owner); |
6333 | clear_bit(nr: MD_RECOVERY_FROZEN, addr: &mddev->recovery); |
6334 | |
6335 | bioset_exit(&mddev->bio_set); |
6336 | bioset_exit(&mddev->sync_set); |
6337 | bioset_exit(&mddev->io_clone_set); |
6338 | } |
6339 | |
6340 | void md_stop(struct mddev *mddev) |
6341 | { |
6342 | lockdep_assert_held(&mddev->reconfig_mutex); |
6343 | |
6344 | /* stop the array and free an attached data structures. |
6345 | * This is called from dm-raid |
6346 | */ |
6347 | __md_stop_writes(mddev); |
6348 | __md_stop(mddev); |
6349 | } |
6350 | |
6351 | EXPORT_SYMBOL_GPL(md_stop); |
6352 | |
6353 | static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) |
6354 | { |
6355 | int err = 0; |
6356 | int did_freeze = 0; |
6357 | |
6358 | if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { |
6359 | did_freeze = 1; |
6360 | set_bit(nr: MD_RECOVERY_FROZEN, addr: &mddev->recovery); |
6361 | md_wakeup_thread(thread: mddev->thread); |
6362 | } |
6363 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
6364 | set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery); |
6365 | |
6366 | /* |
6367 | * Thread might be blocked waiting for metadata update which will now |
6368 | * never happen |
6369 | */ |
6370 | md_wakeup_thread_directly(thread: mddev->sync_thread); |
6371 | |
6372 | if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) |
6373 | return -EBUSY; |
6374 | mddev_unlock(mddev); |
6375 | wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, |
6376 | &mddev->recovery)); |
6377 | wait_event(mddev->sb_wait, |
6378 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); |
6379 | mddev_lock_nointr(mddev); |
6380 | |
6381 | mutex_lock(&mddev->open_mutex); |
6382 | if ((mddev->pers && atomic_read(v: &mddev->openers) > !!bdev) || |
6383 | mddev->sync_thread || |
6384 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { |
6385 | pr_warn("md: %s still in use.\n" ,mdname(mddev)); |
6386 | if (did_freeze) { |
6387 | clear_bit(nr: MD_RECOVERY_FROZEN, addr: &mddev->recovery); |
6388 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
6389 | md_wakeup_thread(thread: mddev->thread); |
6390 | } |
6391 | err = -EBUSY; |
6392 | goto out; |
6393 | } |
6394 | if (mddev->pers) { |
6395 | __md_stop_writes(mddev); |
6396 | |
6397 | err = -ENXIO; |
6398 | if (mddev->ro == MD_RDONLY) |
6399 | goto out; |
6400 | mddev->ro = MD_RDONLY; |
6401 | set_disk_ro(disk: mddev->gendisk, read_only: 1); |
6402 | clear_bit(nr: MD_RECOVERY_FROZEN, addr: &mddev->recovery); |
6403 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
6404 | md_wakeup_thread(thread: mddev->thread); |
6405 | sysfs_notify_dirent_safe(sd: mddev->sysfs_state); |
6406 | err = 0; |
6407 | } |
6408 | out: |
6409 | mutex_unlock(lock: &mddev->open_mutex); |
6410 | return err; |
6411 | } |
6412 | |
6413 | /* mode: |
6414 | * 0 - completely stop and dis-assemble array |
6415 | * 2 - stop but do not disassemble array |
6416 | */ |
6417 | static int do_md_stop(struct mddev *mddev, int mode, |
6418 | struct block_device *bdev) |
6419 | { |
6420 | struct gendisk *disk = mddev->gendisk; |
6421 | struct md_rdev *rdev; |
6422 | int did_freeze = 0; |
6423 | |
6424 | if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { |
6425 | did_freeze = 1; |
6426 | set_bit(nr: MD_RECOVERY_FROZEN, addr: &mddev->recovery); |
6427 | md_wakeup_thread(thread: mddev->thread); |
6428 | } |
6429 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
6430 | set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery); |
6431 | |
6432 | /* |
6433 | * Thread might be blocked waiting for metadata update which will now |
6434 | * never happen |
6435 | */ |
6436 | md_wakeup_thread_directly(thread: mddev->sync_thread); |
6437 | |
6438 | mddev_unlock(mddev); |
6439 | wait_event(resync_wait, (mddev->sync_thread == NULL && |
6440 | !test_bit(MD_RECOVERY_RUNNING, |
6441 | &mddev->recovery))); |
6442 | mddev_lock_nointr(mddev); |
6443 | |
6444 | mutex_lock(&mddev->open_mutex); |
6445 | if ((mddev->pers && atomic_read(v: &mddev->openers) > !!bdev) || |
6446 | mddev->sysfs_active || |
6447 | mddev->sync_thread || |
6448 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { |
6449 | pr_warn("md: %s still in use.\n" ,mdname(mddev)); |
6450 | mutex_unlock(lock: &mddev->open_mutex); |
6451 | if (did_freeze) { |
6452 | clear_bit(nr: MD_RECOVERY_FROZEN, addr: &mddev->recovery); |
6453 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
6454 | md_wakeup_thread(thread: mddev->thread); |
6455 | } |
6456 | return -EBUSY; |
6457 | } |
6458 | if (mddev->pers) { |
6459 | if (!md_is_rdwr(mddev)) |
6460 | set_disk_ro(disk, read_only: 0); |
6461 | |
6462 | __md_stop_writes(mddev); |
6463 | __md_stop(mddev); |
6464 | |
6465 | /* tell userspace to handle 'inactive' */ |
6466 | sysfs_notify_dirent_safe(sd: mddev->sysfs_state); |
6467 | |
6468 | rdev_for_each(rdev, mddev) |
6469 | if (rdev->raid_disk >= 0) |
6470 | sysfs_unlink_rdev(mddev, rdev); |
6471 | |
6472 | set_capacity_and_notify(disk, size: 0); |
6473 | mutex_unlock(lock: &mddev->open_mutex); |
6474 | mddev->changed = 1; |
6475 | |
6476 | if (!md_is_rdwr(mddev)) |
6477 | mddev->ro = MD_RDWR; |
6478 | } else |
6479 | mutex_unlock(lock: &mddev->open_mutex); |
6480 | /* |
6481 | * Free resources if final stop |
6482 | */ |
6483 | if (mode == 0) { |
6484 | pr_info("md: %s stopped.\n" , mdname(mddev)); |
6485 | |
6486 | if (mddev->bitmap_info.file) { |
6487 | struct file *f = mddev->bitmap_info.file; |
6488 | spin_lock(lock: &mddev->lock); |
6489 | mddev->bitmap_info.file = NULL; |
6490 | spin_unlock(lock: &mddev->lock); |
6491 | fput(f); |
6492 | } |
6493 | mddev->bitmap_info.offset = 0; |
6494 | |
6495 | export_array(mddev); |
6496 | |
6497 | md_clean(mddev); |
6498 | if (mddev->hold_active == UNTIL_STOP) |
6499 | mddev->hold_active = 0; |
6500 | } |
6501 | md_new_event(); |
6502 | sysfs_notify_dirent_safe(sd: mddev->sysfs_state); |
6503 | return 0; |
6504 | } |
6505 | |
6506 | #ifndef MODULE |
6507 | static void autorun_array(struct mddev *mddev) |
6508 | { |
6509 | struct md_rdev *rdev; |
6510 | int err; |
6511 | |
6512 | if (list_empty(head: &mddev->disks)) |
6513 | return; |
6514 | |
6515 | pr_info("md: running: " ); |
6516 | |
6517 | rdev_for_each(rdev, mddev) { |
6518 | pr_cont("<%pg>" , rdev->bdev); |
6519 | } |
6520 | pr_cont("\n" ); |
6521 | |
6522 | err = do_md_run(mddev); |
6523 | if (err) { |
6524 | pr_warn("md: do_md_run() returned %d\n" , err); |
6525 | do_md_stop(mddev, mode: 0, NULL); |
6526 | } |
6527 | } |
6528 | |
6529 | /* |
6530 | * lets try to run arrays based on all disks that have arrived |
6531 | * until now. (those are in pending_raid_disks) |
6532 | * |
6533 | * the method: pick the first pending disk, collect all disks with |
6534 | * the same UUID, remove all from the pending list and put them into |
6535 | * the 'same_array' list. Then order this list based on superblock |
6536 | * update time (freshest comes first), kick out 'old' disks and |
6537 | * compare superblocks. If everything's fine then run it. |
6538 | * |
6539 | * If "unit" is allocated, then bump its reference count |
6540 | */ |
6541 | static void autorun_devices(int part) |
6542 | { |
6543 | struct md_rdev *rdev0, *rdev, *tmp; |
6544 | struct mddev *mddev; |
6545 | |
6546 | pr_info("md: autorun ...\n" ); |
6547 | while (!list_empty(head: &pending_raid_disks)) { |
6548 | int unit; |
6549 | dev_t dev; |
6550 | LIST_HEAD(candidates); |
6551 | rdev0 = list_entry(pending_raid_disks.next, |
6552 | struct md_rdev, same_set); |
6553 | |
6554 | pr_debug("md: considering %pg ...\n" , rdev0->bdev); |
6555 | INIT_LIST_HEAD(list: &candidates); |
6556 | rdev_for_each_list(rdev, tmp, &pending_raid_disks) |
6557 | if (super_90_load(rdev, refdev: rdev0, minor_version: 0) >= 0) { |
6558 | pr_debug("md: adding %pg ...\n" , |
6559 | rdev->bdev); |
6560 | list_move(list: &rdev->same_set, head: &candidates); |
6561 | } |
6562 | /* |
6563 | * now we have a set of devices, with all of them having |
6564 | * mostly sane superblocks. It's time to allocate the |
6565 | * mddev. |
6566 | */ |
6567 | if (part) { |
6568 | dev = MKDEV(mdp_major, |
6569 | rdev0->preferred_minor << MdpMinorShift); |
6570 | unit = MINOR(dev) >> MdpMinorShift; |
6571 | } else { |
6572 | dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); |
6573 | unit = MINOR(dev); |
6574 | } |
6575 | if (rdev0->preferred_minor != unit) { |
6576 | pr_warn("md: unit number in %pg is bad: %d\n" , |
6577 | rdev0->bdev, rdev0->preferred_minor); |
6578 | break; |
6579 | } |
6580 | |
6581 | mddev = md_alloc(dev, NULL); |
6582 | if (IS_ERR(ptr: mddev)) |
6583 | break; |
6584 | |
6585 | if (mddev_suspend_and_lock(mddev)) |
6586 | pr_warn("md: %s locked, cannot run\n" , mdname(mddev)); |
6587 | else if (mddev->raid_disks || mddev->major_version |
6588 | || !list_empty(head: &mddev->disks)) { |
6589 | pr_warn("md: %s already running, cannot run %pg\n" , |
6590 | mdname(mddev), rdev0->bdev); |
6591 | mddev_unlock_and_resume(mddev); |
6592 | } else { |
6593 | pr_debug("md: created %s\n" , mdname(mddev)); |
6594 | mddev->persistent = 1; |
6595 | rdev_for_each_list(rdev, tmp, &candidates) { |
6596 | list_del_init(entry: &rdev->same_set); |
6597 | if (bind_rdev_to_array(rdev, mddev)) |
6598 | export_rdev(rdev, mddev); |
6599 | } |
6600 | autorun_array(mddev); |
6601 | mddev_unlock_and_resume(mddev); |
6602 | } |
6603 | /* on success, candidates will be empty, on error |
6604 | * it won't... |
6605 | */ |
6606 | rdev_for_each_list(rdev, tmp, &candidates) { |
6607 | list_del_init(entry: &rdev->same_set); |
6608 | export_rdev(rdev, mddev); |
6609 | } |
6610 | mddev_put(mddev); |
6611 | } |
6612 | pr_info("md: ... autorun DONE.\n" ); |
6613 | } |
6614 | #endif /* !MODULE */ |
6615 | |
6616 | static int get_version(void __user *arg) |
6617 | { |
6618 | mdu_version_t ver; |
6619 | |
6620 | ver.major = MD_MAJOR_VERSION; |
6621 | ver.minor = MD_MINOR_VERSION; |
6622 | ver.patchlevel = MD_PATCHLEVEL_VERSION; |
6623 | |
6624 | if (copy_to_user(to: arg, from: &ver, n: sizeof(ver))) |
6625 | return -EFAULT; |
6626 | |
6627 | return 0; |
6628 | } |
6629 | |
6630 | static int get_array_info(struct mddev *mddev, void __user *arg) |
6631 | { |
6632 | mdu_array_info_t info; |
6633 | int nr,working,insync,failed,spare; |
6634 | struct md_rdev *rdev; |
6635 | |
6636 | nr = working = insync = failed = spare = 0; |
6637 | rcu_read_lock(); |
6638 | rdev_for_each_rcu(rdev, mddev) { |
6639 | nr++; |
6640 | if (test_bit(Faulty, &rdev->flags)) |
6641 | failed++; |
6642 | else { |
6643 | working++; |
6644 | if (test_bit(In_sync, &rdev->flags)) |
6645 | insync++; |
6646 | else if (test_bit(Journal, &rdev->flags)) |
6647 | /* TODO: add journal count to md_u.h */ |
6648 | ; |
6649 | else |
6650 | spare++; |
6651 | } |
6652 | } |
6653 | rcu_read_unlock(); |
6654 | |
6655 | info.major_version = mddev->major_version; |
6656 | info.minor_version = mddev->minor_version; |
6657 | info.patch_version = MD_PATCHLEVEL_VERSION; |
6658 | info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); |
6659 | info.level = mddev->level; |
6660 | info.size = mddev->dev_sectors / 2; |
6661 | if (info.size != mddev->dev_sectors / 2) /* overflow */ |
6662 | info.size = -1; |
6663 | info.nr_disks = nr; |
6664 | info.raid_disks = mddev->raid_disks; |
6665 | info.md_minor = mddev->md_minor; |
6666 | info.not_persistent= !mddev->persistent; |
6667 | |
6668 | info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); |
6669 | info.state = 0; |
6670 | if (mddev->in_sync) |
6671 | info.state = (1<<MD_SB_CLEAN); |
6672 | if (mddev->bitmap && mddev->bitmap_info.offset) |
6673 | info.state |= (1<<MD_SB_BITMAP_PRESENT); |
6674 | if (mddev_is_clustered(mddev)) |
6675 | info.state |= (1<<MD_SB_CLUSTERED); |
6676 | info.active_disks = insync; |
6677 | info.working_disks = working; |
6678 | info.failed_disks = failed; |
6679 | info.spare_disks = spare; |
6680 | |
6681 | info.layout = mddev->layout; |
6682 | info.chunk_size = mddev->chunk_sectors << 9; |
6683 | |
6684 | if (copy_to_user(to: arg, from: &info, n: sizeof(info))) |
6685 | return -EFAULT; |
6686 | |
6687 | return 0; |
6688 | } |
6689 | |
6690 | static int get_bitmap_file(struct mddev *mddev, void __user * arg) |
6691 | { |
6692 | mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ |
6693 | char *ptr; |
6694 | int err; |
6695 | |
6696 | file = kzalloc(size: sizeof(*file), GFP_NOIO); |
6697 | if (!file) |
6698 | return -ENOMEM; |
6699 | |
6700 | err = 0; |
6701 | spin_lock(lock: &mddev->lock); |
6702 | /* bitmap enabled */ |
6703 | if (mddev->bitmap_info.file) { |
6704 | ptr = file_path(mddev->bitmap_info.file, file->pathname, |
6705 | sizeof(file->pathname)); |
6706 | if (IS_ERR(ptr)) |
6707 | err = PTR_ERR(ptr); |
6708 | else |
6709 | memmove(file->pathname, ptr, |
6710 | sizeof(file->pathname)-(ptr-file->pathname)); |
6711 | } |
6712 | spin_unlock(lock: &mddev->lock); |
6713 | |
6714 | if (err == 0 && |
6715 | copy_to_user(to: arg, from: file, n: sizeof(*file))) |
6716 | err = -EFAULT; |
6717 | |
6718 | kfree(objp: file); |
6719 | return err; |
6720 | } |
6721 | |
6722 | static int get_disk_info(struct mddev *mddev, void __user * arg) |
6723 | { |
6724 | mdu_disk_info_t info; |
6725 | struct md_rdev *rdev; |
6726 | |
6727 | if (copy_from_user(to: &info, from: arg, n: sizeof(info))) |
6728 | return -EFAULT; |
6729 | |
6730 | rcu_read_lock(); |
6731 | rdev = md_find_rdev_nr_rcu(mddev, info.number); |
6732 | if (rdev) { |
6733 | info.major = MAJOR(rdev->bdev->bd_dev); |
6734 | info.minor = MINOR(rdev->bdev->bd_dev); |
6735 | info.raid_disk = rdev->raid_disk; |
6736 | info.state = 0; |
6737 | if (test_bit(Faulty, &rdev->flags)) |
6738 | info.state |= (1<<MD_DISK_FAULTY); |
6739 | else if (test_bit(In_sync, &rdev->flags)) { |
6740 | info.state |= (1<<MD_DISK_ACTIVE); |
6741 | info.state |= (1<<MD_DISK_SYNC); |
6742 | } |
6743 | if (test_bit(Journal, &rdev->flags)) |
6744 | info.state |= (1<<MD_DISK_JOURNAL); |
6745 | if (test_bit(WriteMostly, &rdev->flags)) |
6746 | info.state |= (1<<MD_DISK_WRITEMOSTLY); |
6747 | if (test_bit(FailFast, &rdev->flags)) |
6748 | info.state |= (1<<MD_DISK_FAILFAST); |
6749 | } else { |
6750 | info.major = info.minor = 0; |
6751 | info.raid_disk = -1; |
6752 | info.state = (1<<MD_DISK_REMOVED); |
6753 | } |
6754 | rcu_read_unlock(); |
6755 | |
6756 | if (copy_to_user(to: arg, from: &info, n: sizeof(info))) |
6757 | return -EFAULT; |
6758 | |
6759 | return 0; |
6760 | } |
6761 | |
6762 | int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) |
6763 | { |
6764 | struct md_rdev *rdev; |
6765 | dev_t dev = MKDEV(info->major,info->minor); |
6766 | |
6767 | if (mddev_is_clustered(mddev) && |
6768 | !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { |
6769 | pr_warn("%s: Cannot add to clustered mddev.\n" , |
6770 | mdname(mddev)); |
6771 | return -EINVAL; |
6772 | } |
6773 | |
6774 | if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) |
6775 | return -EOVERFLOW; |
6776 | |
6777 | if (!mddev->raid_disks) { |
6778 | int err; |
6779 | /* expecting a device which has a superblock */ |
6780 | rdev = md_import_device(newdev: dev, super_format: mddev->major_version, super_minor: mddev->minor_version); |
6781 | if (IS_ERR(ptr: rdev)) { |
6782 | pr_warn("md: md_import_device returned %ld\n" , |
6783 | PTR_ERR(rdev)); |
6784 | return PTR_ERR(ptr: rdev); |
6785 | } |
6786 | if (!list_empty(head: &mddev->disks)) { |
6787 | struct md_rdev *rdev0 |
6788 | = list_entry(mddev->disks.next, |
6789 | struct md_rdev, same_set); |
6790 | err = super_types[mddev->major_version] |
6791 | .load_super(rdev, rdev0, mddev->minor_version); |
6792 | if (err < 0) { |
6793 | pr_warn("md: %pg has different UUID to %pg\n" , |
6794 | rdev->bdev, |
6795 | rdev0->bdev); |
6796 | export_rdev(rdev, mddev); |
6797 | return -EINVAL; |
6798 | } |
6799 | } |
6800 | err = bind_rdev_to_array(rdev, mddev); |
6801 | if (err) |
6802 | export_rdev(rdev, mddev); |
6803 | return err; |
6804 | } |
6805 | |
6806 | /* |
6807 | * md_add_new_disk can be used once the array is assembled |
6808 | * to add "hot spares". They must already have a superblock |
6809 | * written |
6810 | */ |
6811 | if (mddev->pers) { |
6812 | int err; |
6813 | if (!mddev->pers->hot_add_disk) { |
6814 | pr_warn("%s: personality does not support diskops!\n" , |
6815 | mdname(mddev)); |
6816 | return -EINVAL; |
6817 | } |
6818 | if (mddev->persistent) |
6819 | rdev = md_import_device(newdev: dev, super_format: mddev->major_version, |
6820 | super_minor: mddev->minor_version); |
6821 | else |
6822 | rdev = md_import_device(newdev: dev, super_format: -1, super_minor: -1); |
6823 | if (IS_ERR(ptr: rdev)) { |
6824 | pr_warn("md: md_import_device returned %ld\n" , |
6825 | PTR_ERR(rdev)); |
6826 | return PTR_ERR(ptr: rdev); |
6827 | } |
6828 | /* set saved_raid_disk if appropriate */ |
6829 | if (!mddev->persistent) { |
6830 | if (info->state & (1<<MD_DISK_SYNC) && |
6831 | info->raid_disk < mddev->raid_disks) { |
6832 | rdev->raid_disk = info->raid_disk; |
6833 | clear_bit(nr: Bitmap_sync, addr: &rdev->flags); |
6834 | } else |
6835 | rdev->raid_disk = -1; |
6836 | rdev->saved_raid_disk = rdev->raid_disk; |
6837 | } else |
6838 | super_types[mddev->major_version]. |
6839 | validate_super(mddev, rdev); |
6840 | if ((info->state & (1<<MD_DISK_SYNC)) && |
6841 | rdev->raid_disk != info->raid_disk) { |
6842 | /* This was a hot-add request, but events doesn't |
6843 | * match, so reject it. |
6844 | */ |
6845 | export_rdev(rdev, mddev); |
6846 | return -EINVAL; |
6847 | } |
6848 | |
6849 | clear_bit(nr: In_sync, addr: &rdev->flags); /* just to be sure */ |
6850 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) |
6851 | set_bit(nr: WriteMostly, addr: &rdev->flags); |
6852 | else |
6853 | clear_bit(nr: WriteMostly, addr: &rdev->flags); |
6854 | if (info->state & (1<<MD_DISK_FAILFAST)) |
6855 | set_bit(nr: FailFast, addr: &rdev->flags); |
6856 | else |
6857 | clear_bit(nr: FailFast, addr: &rdev->flags); |
6858 | |
6859 | if (info->state & (1<<MD_DISK_JOURNAL)) { |
6860 | struct md_rdev *rdev2; |
6861 | bool has_journal = false; |
6862 | |
6863 | /* make sure no existing journal disk */ |
6864 | rdev_for_each(rdev2, mddev) { |
6865 | if (test_bit(Journal, &rdev2->flags)) { |
6866 | has_journal = true; |
6867 | break; |
6868 | } |
6869 | } |
6870 | if (has_journal || mddev->bitmap) { |
6871 | export_rdev(rdev, mddev); |
6872 | return -EBUSY; |
6873 | } |
6874 | set_bit(nr: Journal, addr: &rdev->flags); |
6875 | } |
6876 | /* |
6877 | * check whether the device shows up in other nodes |
6878 | */ |
6879 | if (mddev_is_clustered(mddev)) { |
6880 | if (info->state & (1 << MD_DISK_CANDIDATE)) |
6881 | set_bit(nr: Candidate, addr: &rdev->flags); |
6882 | else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { |
6883 | /* --add initiated by this node */ |
6884 | err = md_cluster_ops->add_new_disk(mddev, rdev); |
6885 | if (err) { |
6886 | export_rdev(rdev, mddev); |
6887 | return err; |
6888 | } |
6889 | } |
6890 | } |
6891 | |
6892 | rdev->raid_disk = -1; |
6893 | err = bind_rdev_to_array(rdev, mddev); |
6894 | |
6895 | if (err) |
6896 | export_rdev(rdev, mddev); |
6897 | |
6898 | if (mddev_is_clustered(mddev)) { |
6899 | if (info->state & (1 << MD_DISK_CANDIDATE)) { |
6900 | if (!err) { |
6901 | err = md_cluster_ops->new_disk_ack(mddev, |
6902 | err == 0); |
6903 | if (err) |
6904 | md_kick_rdev_from_array(rdev); |
6905 | } |
6906 | } else { |
6907 | if (err) |
6908 | md_cluster_ops->add_new_disk_cancel(mddev); |
6909 | else |
6910 | err = add_bound_rdev(rdev); |
6911 | } |
6912 | |
6913 | } else if (!err) |
6914 | err = add_bound_rdev(rdev); |
6915 | |
6916 | return err; |
6917 | } |
6918 | |
6919 | /* otherwise, md_add_new_disk is only allowed |
6920 | * for major_version==0 superblocks |
6921 | */ |
6922 | if (mddev->major_version != 0) { |
6923 | pr_warn("%s: ADD_NEW_DISK not supported\n" , mdname(mddev)); |
6924 | return -EINVAL; |
6925 | } |
6926 | |
6927 | if (!(info->state & (1<<MD_DISK_FAULTY))) { |
6928 | int err; |
6929 | rdev = md_import_device(newdev: dev, super_format: -1, super_minor: 0); |
6930 | if (IS_ERR(ptr: rdev)) { |
6931 | pr_warn("md: error, md_import_device() returned %ld\n" , |
6932 | PTR_ERR(rdev)); |
6933 | return PTR_ERR(ptr: rdev); |
6934 | } |
6935 | rdev->desc_nr = info->number; |
6936 | if (info->raid_disk < mddev->raid_disks) |
6937 | rdev->raid_disk = info->raid_disk; |
6938 | else |
6939 | rdev->raid_disk = -1; |
6940 | |
6941 | if (rdev->raid_disk < mddev->raid_disks) |
6942 | if (info->state & (1<<MD_DISK_SYNC)) |
6943 | set_bit(nr: In_sync, addr: &rdev->flags); |
6944 | |
6945 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) |
6946 | set_bit(nr: WriteMostly, addr: &rdev->flags); |
6947 | if (info->state & (1<<MD_DISK_FAILFAST)) |
6948 | set_bit(nr: FailFast, addr: &rdev->flags); |
6949 | |
6950 | if (!mddev->persistent) { |
6951 | pr_debug("md: nonpersistent superblock ...\n" ); |
6952 | rdev->sb_start = bdev_nr_sectors(bdev: rdev->bdev); |
6953 | } else |
6954 | rdev->sb_start = calc_dev_sboffset(rdev); |
6955 | rdev->sectors = rdev->sb_start; |
6956 | |
6957 | err = bind_rdev_to_array(rdev, mddev); |
6958 | if (err) { |
6959 | export_rdev(rdev, mddev); |
6960 | return err; |
6961 | } |
6962 | } |
6963 | |
6964 | return 0; |
6965 | } |
6966 | |
6967 | static int hot_remove_disk(struct mddev *mddev, dev_t dev) |
6968 | { |
6969 | struct md_rdev *rdev; |
6970 | |
6971 | if (!mddev->pers) |
6972 | return -ENODEV; |
6973 | |
6974 | rdev = find_rdev(mddev, dev); |
6975 | if (!rdev) |
6976 | return -ENXIO; |
6977 | |
6978 | if (rdev->raid_disk < 0) |
6979 | goto kick_rdev; |
6980 | |
6981 | clear_bit(nr: Blocked, addr: &rdev->flags); |
6982 | remove_and_add_spares(mddev, this: rdev); |
6983 | |
6984 | if (rdev->raid_disk >= 0) |
6985 | goto busy; |
6986 | |
6987 | kick_rdev: |
6988 | if (mddev_is_clustered(mddev)) { |
6989 | if (md_cluster_ops->remove_disk(mddev, rdev)) |
6990 | goto busy; |
6991 | } |
6992 | |
6993 | md_kick_rdev_from_array(rdev); |
6994 | set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags); |
6995 | if (mddev->thread) |
6996 | md_wakeup_thread(thread: mddev->thread); |
6997 | else |
6998 | md_update_sb(mddev, 1); |
6999 | md_new_event(); |
7000 | |
7001 | return 0; |
7002 | busy: |
7003 | pr_debug("md: cannot remove active disk %pg from %s ...\n" , |
7004 | rdev->bdev, mdname(mddev)); |
7005 | return -EBUSY; |
7006 | } |
7007 | |
7008 | static int hot_add_disk(struct mddev *mddev, dev_t dev) |
7009 | { |
7010 | int err; |
7011 | struct md_rdev *rdev; |
7012 | |
7013 | if (!mddev->pers) |
7014 | return -ENODEV; |
7015 | |
7016 | if (mddev->major_version != 0) { |
7017 | pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n" , |
7018 | mdname(mddev)); |
7019 | return -EINVAL; |
7020 | } |
7021 | if (!mddev->pers->hot_add_disk) { |
7022 | pr_warn("%s: personality does not support diskops!\n" , |
7023 | mdname(mddev)); |
7024 | return -EINVAL; |
7025 | } |
7026 | |
7027 | rdev = md_import_device(newdev: dev, super_format: -1, super_minor: 0); |
7028 | if (IS_ERR(ptr: rdev)) { |
7029 | pr_warn("md: error, md_import_device() returned %ld\n" , |
7030 | PTR_ERR(rdev)); |
7031 | return -EINVAL; |
7032 | } |
7033 | |
7034 | if (mddev->persistent) |
7035 | rdev->sb_start = calc_dev_sboffset(rdev); |
7036 | else |
7037 | rdev->sb_start = bdev_nr_sectors(bdev: rdev->bdev); |
7038 | |
7039 | rdev->sectors = rdev->sb_start; |
7040 | |
7041 | if (test_bit(Faulty, &rdev->flags)) { |
7042 | pr_warn("md: can not hot-add faulty %pg disk to %s!\n" , |
7043 | rdev->bdev, mdname(mddev)); |
7044 | err = -EINVAL; |
7045 | goto abort_export; |
7046 | } |
7047 | |
7048 | clear_bit(nr: In_sync, addr: &rdev->flags); |
7049 | rdev->desc_nr = -1; |
7050 | rdev->saved_raid_disk = -1; |
7051 | err = bind_rdev_to_array(rdev, mddev); |
7052 | if (err) |
7053 | goto abort_export; |
7054 | |
7055 | /* |
7056 | * The rest should better be atomic, we can have disk failures |
7057 | * noticed in interrupt contexts ... |
7058 | */ |
7059 | |
7060 | rdev->raid_disk = -1; |
7061 | |
7062 | set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags); |
7063 | if (!mddev->thread) |
7064 | md_update_sb(mddev, 1); |
7065 | /* |
7066 | * If the new disk does not support REQ_NOWAIT, |
7067 | * disable on the whole MD. |
7068 | */ |
7069 | if (!bdev_nowait(bdev: rdev->bdev)) { |
7070 | pr_info("%s: Disabling nowait because %pg does not support nowait\n" , |
7071 | mdname(mddev), rdev->bdev); |
7072 | blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q: mddev->queue); |
7073 | } |
7074 | /* |
7075 | * Kick recovery, maybe this spare has to be added to the |
7076 | * array immediately. |
7077 | */ |
7078 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
7079 | md_wakeup_thread(thread: mddev->thread); |
7080 | md_new_event(); |
7081 | return 0; |
7082 | |
7083 | abort_export: |
7084 | export_rdev(rdev, mddev); |
7085 | return err; |
7086 | } |
7087 | |
7088 | static int set_bitmap_file(struct mddev *mddev, int fd) |
7089 | { |
7090 | int err = 0; |
7091 | |
7092 | if (mddev->pers) { |
7093 | if (!mddev->pers->quiesce || !mddev->thread) |
7094 | return -EBUSY; |
7095 | if (mddev->recovery || mddev->sync_thread) |
7096 | return -EBUSY; |
7097 | /* we should be able to change the bitmap.. */ |
7098 | } |
7099 | |
7100 | if (fd >= 0) { |
7101 | struct inode *inode; |
7102 | struct file *f; |
7103 | |
7104 | if (mddev->bitmap || mddev->bitmap_info.file) |
7105 | return -EEXIST; /* cannot add when bitmap is present */ |
7106 | |
7107 | if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { |
7108 | pr_warn("%s: bitmap files not supported by this kernel\n" , |
7109 | mdname(mddev)); |
7110 | return -EINVAL; |
7111 | } |
7112 | pr_warn("%s: using deprecated bitmap file support\n" , |
7113 | mdname(mddev)); |
7114 | |
7115 | f = fget(fd); |
7116 | |
7117 | if (f == NULL) { |
7118 | pr_warn("%s: error: failed to get bitmap file\n" , |
7119 | mdname(mddev)); |
7120 | return -EBADF; |
7121 | } |
7122 | |
7123 | inode = f->f_mapping->host; |
7124 | if (!S_ISREG(inode->i_mode)) { |
7125 | pr_warn("%s: error: bitmap file must be a regular file\n" , |
7126 | mdname(mddev)); |
7127 | err = -EBADF; |
7128 | } else if (!(f->f_mode & FMODE_WRITE)) { |
7129 | pr_warn("%s: error: bitmap file must open for write\n" , |
7130 | mdname(mddev)); |
7131 | err = -EBADF; |
7132 | } else if (atomic_read(v: &inode->i_writecount) != 1) { |
7133 | pr_warn("%s: error: bitmap file is already in use\n" , |
7134 | mdname(mddev)); |
7135 | err = -EBUSY; |
7136 | } |
7137 | if (err) { |
7138 | fput(f); |
7139 | return err; |
7140 | } |
7141 | mddev->bitmap_info.file = f; |
7142 | mddev->bitmap_info.offset = 0; /* file overrides offset */ |
7143 | } else if (mddev->bitmap == NULL) |
7144 | return -ENOENT; /* cannot remove what isn't there */ |
7145 | err = 0; |
7146 | if (mddev->pers) { |
7147 | if (fd >= 0) { |
7148 | struct bitmap *bitmap; |
7149 | |
7150 | bitmap = md_bitmap_create(mddev, slot: -1); |
7151 | if (!IS_ERR(ptr: bitmap)) { |
7152 | mddev->bitmap = bitmap; |
7153 | err = md_bitmap_load(mddev); |
7154 | } else |
7155 | err = PTR_ERR(ptr: bitmap); |
7156 | if (err) { |
7157 | md_bitmap_destroy(mddev); |
7158 | fd = -1; |
7159 | } |
7160 | } else if (fd < 0) { |
7161 | md_bitmap_destroy(mddev); |
7162 | } |
7163 | } |
7164 | if (fd < 0) { |
7165 | struct file *f = mddev->bitmap_info.file; |
7166 | if (f) { |
7167 | spin_lock(lock: &mddev->lock); |
7168 | mddev->bitmap_info.file = NULL; |
7169 | spin_unlock(lock: &mddev->lock); |
7170 | fput(f); |
7171 | } |
7172 | } |
7173 | |
7174 | return err; |
7175 | } |
7176 | |
7177 | /* |
7178 | * md_set_array_info is used two different ways |
7179 | * The original usage is when creating a new array. |
7180 | * In this usage, raid_disks is > 0 and it together with |
7181 | * level, size, not_persistent,layout,chunksize determine the |
7182 | * shape of the array. |
7183 | * This will always create an array with a type-0.90.0 superblock. |
7184 | * The newer usage is when assembling an array. |
7185 | * In this case raid_disks will be 0, and the major_version field is |
7186 | * use to determine which style super-blocks are to be found on the devices. |
7187 | * The minor and patch _version numbers are also kept incase the |
7188 | * super_block handler wishes to interpret them. |
7189 | */ |
7190 | int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) |
7191 | { |
7192 | if (info->raid_disks == 0) { |
7193 | /* just setting version number for superblock loading */ |
7194 | if (info->major_version < 0 || |
7195 | info->major_version >= ARRAY_SIZE(super_types) || |
7196 | super_types[info->major_version].name == NULL) { |
7197 | /* maybe try to auto-load a module? */ |
7198 | pr_warn("md: superblock version %d not known\n" , |
7199 | info->major_version); |
7200 | return -EINVAL; |
7201 | } |
7202 | mddev->major_version = info->major_version; |
7203 | mddev->minor_version = info->minor_version; |
7204 | mddev->patch_version = info->patch_version; |
7205 | mddev->persistent = !info->not_persistent; |
7206 | /* ensure mddev_put doesn't delete this now that there |
7207 | * is some minimal configuration. |
7208 | */ |
7209 | mddev->ctime = ktime_get_real_seconds(); |
7210 | return 0; |
7211 | } |
7212 | mddev->major_version = MD_MAJOR_VERSION; |
7213 | mddev->minor_version = MD_MINOR_VERSION; |
7214 | mddev->patch_version = MD_PATCHLEVEL_VERSION; |
7215 | mddev->ctime = ktime_get_real_seconds(); |
7216 | |
7217 | mddev->level = info->level; |
7218 | mddev->clevel[0] = 0; |
7219 | mddev->dev_sectors = 2 * (sector_t)info->size; |
7220 | mddev->raid_disks = info->raid_disks; |
7221 | /* don't set md_minor, it is determined by which /dev/md* was |
7222 | * openned |
7223 | */ |
7224 | if (info->state & (1<<MD_SB_CLEAN)) |
7225 | mddev->recovery_cp = MaxSector; |
7226 | else |
7227 | mddev->recovery_cp = 0; |
7228 | mddev->persistent = ! info->not_persistent; |
7229 | mddev->external = 0; |
7230 | |
7231 | mddev->layout = info->layout; |
7232 | if (mddev->level == 0) |
7233 | /* Cannot trust RAID0 layout info here */ |
7234 | mddev->layout = -1; |
7235 | mddev->chunk_sectors = info->chunk_size >> 9; |
7236 | |
7237 | if (mddev->persistent) { |
7238 | mddev->max_disks = MD_SB_DISKS; |
7239 | mddev->flags = 0; |
7240 | mddev->sb_flags = 0; |
7241 | } |
7242 | set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags); |
7243 | |
7244 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
7245 | mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); |
7246 | mddev->bitmap_info.offset = 0; |
7247 | |
7248 | mddev->reshape_position = MaxSector; |
7249 | |
7250 | /* |
7251 | * Generate a 128 bit UUID |
7252 | */ |
7253 | get_random_bytes(buf: mddev->uuid, len: 16); |
7254 | |
7255 | mddev->new_level = mddev->level; |
7256 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
7257 | mddev->new_layout = mddev->layout; |
7258 | mddev->delta_disks = 0; |
7259 | mddev->reshape_backwards = 0; |
7260 | |
7261 | return 0; |
7262 | } |
7263 | |
7264 | void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) |
7265 | { |
7266 | lockdep_assert_held(&mddev->reconfig_mutex); |
7267 | |
7268 | if (mddev->external_size) |
7269 | return; |
7270 | |
7271 | mddev->array_sectors = array_sectors; |
7272 | } |
7273 | EXPORT_SYMBOL(md_set_array_sectors); |
7274 | |
7275 | static int update_size(struct mddev *mddev, sector_t num_sectors) |
7276 | { |
7277 | struct md_rdev *rdev; |
7278 | int rv; |
7279 | int fit = (num_sectors == 0); |
7280 | sector_t old_dev_sectors = mddev->dev_sectors; |
7281 | |
7282 | if (mddev->pers->resize == NULL) |
7283 | return -EINVAL; |
7284 | /* The "num_sectors" is the number of sectors of each device that |
7285 | * is used. This can only make sense for arrays with redundancy. |
7286 | * linear and raid0 always use whatever space is available. We can only |
7287 | * consider changing this number if no resync or reconstruction is |
7288 | * happening, and if the new size is acceptable. It must fit before the |
7289 | * sb_start or, if that is <data_offset, it must fit before the size |
7290 | * of each device. If num_sectors is zero, we find the largest size |
7291 | * that fits. |
7292 | */ |
7293 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || |
7294 | mddev->sync_thread) |
7295 | return -EBUSY; |
7296 | if (!md_is_rdwr(mddev)) |
7297 | return -EROFS; |
7298 | |
7299 | rdev_for_each(rdev, mddev) { |
7300 | sector_t avail = rdev->sectors; |
7301 | |
7302 | if (fit && (num_sectors == 0 || num_sectors > avail)) |
7303 | num_sectors = avail; |
7304 | if (avail < num_sectors) |
7305 | return -ENOSPC; |
7306 | } |
7307 | rv = mddev->pers->resize(mddev, num_sectors); |
7308 | if (!rv) { |
7309 | if (mddev_is_clustered(mddev)) |
7310 | md_cluster_ops->update_size(mddev, old_dev_sectors); |
7311 | else if (mddev->queue) { |
7312 | set_capacity_and_notify(disk: mddev->gendisk, |
7313 | size: mddev->array_sectors); |
7314 | } |
7315 | } |
7316 | return rv; |
7317 | } |
7318 | |
7319 | static int update_raid_disks(struct mddev *mddev, int raid_disks) |
7320 | { |
7321 | int rv; |
7322 | struct md_rdev *rdev; |
7323 | /* change the number of raid disks */ |
7324 | if (mddev->pers->check_reshape == NULL) |
7325 | return -EINVAL; |
7326 | if (!md_is_rdwr(mddev)) |
7327 | return -EROFS; |
7328 | if (raid_disks <= 0 || |
7329 | (mddev->max_disks && raid_disks >= mddev->max_disks)) |
7330 | return -EINVAL; |
7331 | if (mddev->sync_thread || |
7332 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || |
7333 | test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || |
7334 | mddev->reshape_position != MaxSector) |
7335 | return -EBUSY; |
7336 | |
7337 | rdev_for_each(rdev, mddev) { |
7338 | if (mddev->raid_disks < raid_disks && |
7339 | rdev->data_offset < rdev->new_data_offset) |
7340 | return -EINVAL; |
7341 | if (mddev->raid_disks > raid_disks && |
7342 | rdev->data_offset > rdev->new_data_offset) |
7343 | return -EINVAL; |
7344 | } |
7345 | |
7346 | mddev->delta_disks = raid_disks - mddev->raid_disks; |
7347 | if (mddev->delta_disks < 0) |
7348 | mddev->reshape_backwards = 1; |
7349 | else if (mddev->delta_disks > 0) |
7350 | mddev->reshape_backwards = 0; |
7351 | |
7352 | rv = mddev->pers->check_reshape(mddev); |
7353 | if (rv < 0) { |
7354 | mddev->delta_disks = 0; |
7355 | mddev->reshape_backwards = 0; |
7356 | } |
7357 | return rv; |
7358 | } |
7359 | |
7360 | /* |
7361 | * update_array_info is used to change the configuration of an |
7362 | * on-line array. |
7363 | * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size |
7364 | * fields in the info are checked against the array. |
7365 | * Any differences that cannot be handled will cause an error. |
7366 | * Normally, only one change can be managed at a time. |
7367 | */ |
7368 | static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) |
7369 | { |
7370 | int rv = 0; |
7371 | int cnt = 0; |
7372 | int state = 0; |
7373 | |
7374 | /* calculate expected state,ignoring low bits */ |
7375 | if (mddev->bitmap && mddev->bitmap_info.offset) |
7376 | state |= (1 << MD_SB_BITMAP_PRESENT); |
7377 | |
7378 | if (mddev->major_version != info->major_version || |
7379 | mddev->minor_version != info->minor_version || |
7380 | /* mddev->patch_version != info->patch_version || */ |
7381 | mddev->ctime != info->ctime || |
7382 | mddev->level != info->level || |
7383 | /* mddev->layout != info->layout || */ |
7384 | mddev->persistent != !info->not_persistent || |
7385 | mddev->chunk_sectors != info->chunk_size >> 9 || |
7386 | /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ |
7387 | ((state^info->state) & 0xfffffe00) |
7388 | ) |
7389 | return -EINVAL; |
7390 | /* Check there is only one change */ |
7391 | if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) |
7392 | cnt++; |
7393 | if (mddev->raid_disks != info->raid_disks) |
7394 | cnt++; |
7395 | if (mddev->layout != info->layout) |
7396 | cnt++; |
7397 | if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) |
7398 | cnt++; |
7399 | if (cnt == 0) |
7400 | return 0; |
7401 | if (cnt > 1) |
7402 | return -EINVAL; |
7403 | |
7404 | if (mddev->layout != info->layout) { |
7405 | /* Change layout |
7406 | * we don't need to do anything at the md level, the |
7407 | * personality will take care of it all. |
7408 | */ |
7409 | if (mddev->pers->check_reshape == NULL) |
7410 | return -EINVAL; |
7411 | else { |
7412 | mddev->new_layout = info->layout; |
7413 | rv = mddev->pers->check_reshape(mddev); |
7414 | if (rv) |
7415 | mddev->new_layout = mddev->layout; |
7416 | return rv; |
7417 | } |
7418 | } |
7419 | if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) |
7420 | rv = update_size(mddev, num_sectors: (sector_t)info->size * 2); |
7421 | |
7422 | if (mddev->raid_disks != info->raid_disks) |
7423 | rv = update_raid_disks(mddev, raid_disks: info->raid_disks); |
7424 | |
7425 | if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { |
7426 | if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { |
7427 | rv = -EINVAL; |
7428 | goto err; |
7429 | } |
7430 | if (mddev->recovery || mddev->sync_thread) { |
7431 | rv = -EBUSY; |
7432 | goto err; |
7433 | } |
7434 | if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { |
7435 | struct bitmap *bitmap; |
7436 | /* add the bitmap */ |
7437 | if (mddev->bitmap) { |
7438 | rv = -EEXIST; |
7439 | goto err; |
7440 | } |
7441 | if (mddev->bitmap_info.default_offset == 0) { |
7442 | rv = -EINVAL; |
7443 | goto err; |
7444 | } |
7445 | mddev->bitmap_info.offset = |
7446 | mddev->bitmap_info.default_offset; |
7447 | mddev->bitmap_info.space = |
7448 | mddev->bitmap_info.default_space; |
7449 | bitmap = md_bitmap_create(mddev, slot: -1); |
7450 | if (!IS_ERR(ptr: bitmap)) { |
7451 | mddev->bitmap = bitmap; |
7452 | rv = md_bitmap_load(mddev); |
7453 | } else |
7454 | rv = PTR_ERR(ptr: bitmap); |
7455 | if (rv) |
7456 | md_bitmap_destroy(mddev); |
7457 | } else { |
7458 | /* remove the bitmap */ |
7459 | if (!mddev->bitmap) { |
7460 | rv = -ENOENT; |
7461 | goto err; |
7462 | } |
7463 | if (mddev->bitmap->storage.file) { |
7464 | rv = -EINVAL; |
7465 | goto err; |
7466 | } |
7467 | if (mddev->bitmap_info.nodes) { |
7468 | /* hold PW on all the bitmap lock */ |
7469 | if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { |
7470 | pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n" ); |
7471 | rv = -EPERM; |
7472 | md_cluster_ops->unlock_all_bitmaps(mddev); |
7473 | goto err; |
7474 | } |
7475 | |
7476 | mddev->bitmap_info.nodes = 0; |
7477 | md_cluster_ops->leave(mddev); |
7478 | module_put(module: md_cluster_mod); |
7479 | mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; |
7480 | } |
7481 | md_bitmap_destroy(mddev); |
7482 | mddev->bitmap_info.offset = 0; |
7483 | } |
7484 | } |
7485 | md_update_sb(mddev, 1); |
7486 | return rv; |
7487 | err: |
7488 | return rv; |
7489 | } |
7490 | |
7491 | static int set_disk_faulty(struct mddev *mddev, dev_t dev) |
7492 | { |
7493 | struct md_rdev *rdev; |
7494 | int err = 0; |
7495 | |
7496 | if (mddev->pers == NULL) |
7497 | return -ENODEV; |
7498 | |
7499 | rcu_read_lock(); |
7500 | rdev = md_find_rdev_rcu(mddev, dev); |
7501 | if (!rdev) |
7502 | err = -ENODEV; |
7503 | else { |
7504 | md_error(mddev, rdev); |
7505 | if (test_bit(MD_BROKEN, &mddev->flags)) |
7506 | err = -EBUSY; |
7507 | } |
7508 | rcu_read_unlock(); |
7509 | return err; |
7510 | } |
7511 | |
7512 | /* |
7513 | * We have a problem here : there is no easy way to give a CHS |
7514 | * virtual geometry. We currently pretend that we have a 2 heads |
7515 | * 4 sectors (with a BIG number of cylinders...). This drives |
7516 | * dosfs just mad... ;-) |
7517 | */ |
7518 | static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) |
7519 | { |
7520 | struct mddev *mddev = bdev->bd_disk->private_data; |
7521 | |
7522 | geo->heads = 2; |
7523 | geo->sectors = 4; |
7524 | geo->cylinders = mddev->array_sectors / 8; |
7525 | return 0; |
7526 | } |
7527 | |
7528 | static inline bool md_ioctl_valid(unsigned int cmd) |
7529 | { |
7530 | switch (cmd) { |
7531 | case ADD_NEW_DISK: |
7532 | case GET_ARRAY_INFO: |
7533 | case GET_BITMAP_FILE: |
7534 | case GET_DISK_INFO: |
7535 | case HOT_ADD_DISK: |
7536 | case HOT_REMOVE_DISK: |
7537 | case RAID_VERSION: |
7538 | case RESTART_ARRAY_RW: |
7539 | case RUN_ARRAY: |
7540 | case SET_ARRAY_INFO: |
7541 | case SET_BITMAP_FILE: |
7542 | case SET_DISK_FAULTY: |
7543 | case STOP_ARRAY: |
7544 | case STOP_ARRAY_RO: |
7545 | case CLUSTERED_DISK_NACK: |
7546 | return true; |
7547 | default: |
7548 | return false; |
7549 | } |
7550 | } |
7551 | |
7552 | static bool md_ioctl_need_suspend(unsigned int cmd) |
7553 | { |
7554 | switch (cmd) { |
7555 | case ADD_NEW_DISK: |
7556 | case HOT_ADD_DISK: |
7557 | case HOT_REMOVE_DISK: |
7558 | case SET_BITMAP_FILE: |
7559 | case SET_ARRAY_INFO: |
7560 | return true; |
7561 | default: |
7562 | return false; |
7563 | } |
7564 | } |
7565 | |
7566 | static int __md_set_array_info(struct mddev *mddev, void __user *argp) |
7567 | { |
7568 | mdu_array_info_t info; |
7569 | int err; |
7570 | |
7571 | if (!argp) |
7572 | memset(&info, 0, sizeof(info)); |
7573 | else if (copy_from_user(to: &info, from: argp, n: sizeof(info))) |
7574 | return -EFAULT; |
7575 | |
7576 | if (mddev->pers) { |
7577 | err = update_array_info(mddev, info: &info); |
7578 | if (err) |
7579 | pr_warn("md: couldn't update array info. %d\n" , err); |
7580 | return err; |
7581 | } |
7582 | |
7583 | if (!list_empty(head: &mddev->disks)) { |
7584 | pr_warn("md: array %s already has disks!\n" , mdname(mddev)); |
7585 | return -EBUSY; |
7586 | } |
7587 | |
7588 | if (mddev->raid_disks) { |
7589 | pr_warn("md: array %s already initialised!\n" , mdname(mddev)); |
7590 | return -EBUSY; |
7591 | } |
7592 | |
7593 | err = md_set_array_info(mddev, info: &info); |
7594 | if (err) |
7595 | pr_warn("md: couldn't set array info. %d\n" , err); |
7596 | |
7597 | return err; |
7598 | } |
7599 | |
7600 | static int md_ioctl(struct block_device *bdev, blk_mode_t mode, |
7601 | unsigned int cmd, unsigned long arg) |
7602 | { |
7603 | int err = 0; |
7604 | void __user *argp = (void __user *)arg; |
7605 | struct mddev *mddev = NULL; |
7606 | bool did_set_md_closing = false; |
7607 | |
7608 | if (!md_ioctl_valid(cmd)) |
7609 | return -ENOTTY; |
7610 | |
7611 | switch (cmd) { |
7612 | case RAID_VERSION: |
7613 | case GET_ARRAY_INFO: |
7614 | case GET_DISK_INFO: |
7615 | break; |
7616 | default: |
7617 | if (!capable(CAP_SYS_ADMIN)) |
7618 | return -EACCES; |
7619 | } |
7620 | |
7621 | /* |
7622 | * Commands dealing with the RAID driver but not any |
7623 | * particular array: |
7624 | */ |
7625 | switch (cmd) { |
7626 | case RAID_VERSION: |
7627 | err = get_version(arg: argp); |
7628 | goto out; |
7629 | default:; |
7630 | } |
7631 | |
7632 | /* |
7633 | * Commands creating/starting a new array: |
7634 | */ |
7635 | |
7636 | mddev = bdev->bd_disk->private_data; |
7637 | |
7638 | if (!mddev) { |
7639 | BUG(); |
7640 | goto out; |
7641 | } |
7642 | |
7643 | /* Some actions do not requires the mutex */ |
7644 | switch (cmd) { |
7645 | case GET_ARRAY_INFO: |
7646 | if (!mddev->raid_disks && !mddev->external) |
7647 | err = -ENODEV; |
7648 | else |
7649 | err = get_array_info(mddev, arg: argp); |
7650 | goto out; |
7651 | |
7652 | case GET_DISK_INFO: |
7653 | if (!mddev->raid_disks && !mddev->external) |
7654 | err = -ENODEV; |
7655 | else |
7656 | err = get_disk_info(mddev, arg: argp); |
7657 | goto out; |
7658 | |
7659 | case SET_DISK_FAULTY: |
7660 | err = set_disk_faulty(mddev, dev: new_decode_dev(dev: arg)); |
7661 | goto out; |
7662 | |
7663 | case GET_BITMAP_FILE: |
7664 | err = get_bitmap_file(mddev, arg: argp); |
7665 | goto out; |
7666 | |
7667 | } |
7668 | |
7669 | if (cmd == HOT_REMOVE_DISK) |
7670 | /* need to ensure recovery thread has run */ |
7671 | wait_event_interruptible_timeout(mddev->sb_wait, |
7672 | !test_bit(MD_RECOVERY_NEEDED, |
7673 | &mddev->recovery), |
7674 | msecs_to_jiffies(5000)); |
7675 | if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { |
7676 | /* Need to flush page cache, and ensure no-one else opens |
7677 | * and writes |
7678 | */ |
7679 | mutex_lock(&mddev->open_mutex); |
7680 | if (mddev->pers && atomic_read(v: &mddev->openers) > 1) { |
7681 | mutex_unlock(lock: &mddev->open_mutex); |
7682 | err = -EBUSY; |
7683 | goto out; |
7684 | } |
7685 | if (test_and_set_bit(nr: MD_CLOSING, addr: &mddev->flags)) { |
7686 | mutex_unlock(lock: &mddev->open_mutex); |
7687 | err = -EBUSY; |
7688 | goto out; |
7689 | } |
7690 | did_set_md_closing = true; |
7691 | mutex_unlock(lock: &mddev->open_mutex); |
7692 | sync_blockdev(bdev); |
7693 | } |
7694 | |
7695 | if (!md_is_rdwr(mddev)) |
7696 | flush_work(work: &mddev->sync_work); |
7697 | |
7698 | err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : |
7699 | mddev_lock(mddev); |
7700 | if (err) { |
7701 | pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n" , |
7702 | err, cmd); |
7703 | goto out; |
7704 | } |
7705 | |
7706 | if (cmd == SET_ARRAY_INFO) { |
7707 | err = __md_set_array_info(mddev, argp); |
7708 | goto unlock; |
7709 | } |
7710 | |
7711 | /* |
7712 | * Commands querying/configuring an existing array: |
7713 | */ |
7714 | /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, |
7715 | * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ |
7716 | if ((!mddev->raid_disks && !mddev->external) |
7717 | && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY |
7718 | && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE |
7719 | && cmd != GET_BITMAP_FILE) { |
7720 | err = -ENODEV; |
7721 | goto unlock; |
7722 | } |
7723 | |
7724 | /* |
7725 | * Commands even a read-only array can execute: |
7726 | */ |
7727 | switch (cmd) { |
7728 | case RESTART_ARRAY_RW: |
7729 | err = restart_array(mddev); |
7730 | goto unlock; |
7731 | |
7732 | case STOP_ARRAY: |
7733 | err = do_md_stop(mddev, mode: 0, bdev); |
7734 | goto unlock; |
7735 | |
7736 | case STOP_ARRAY_RO: |
7737 | err = md_set_readonly(mddev, bdev); |
7738 | goto unlock; |
7739 | |
7740 | case HOT_REMOVE_DISK: |
7741 | err = hot_remove_disk(mddev, dev: new_decode_dev(dev: arg)); |
7742 | goto unlock; |
7743 | |
7744 | case ADD_NEW_DISK: |
7745 | /* We can support ADD_NEW_DISK on read-only arrays |
7746 | * only if we are re-adding a preexisting device. |
7747 | * So require mddev->pers and MD_DISK_SYNC. |
7748 | */ |
7749 | if (mddev->pers) { |
7750 | mdu_disk_info_t info; |
7751 | if (copy_from_user(to: &info, from: argp, n: sizeof(info))) |
7752 | err = -EFAULT; |
7753 | else if (!(info.state & (1<<MD_DISK_SYNC))) |
7754 | /* Need to clear read-only for this */ |
7755 | break; |
7756 | else |
7757 | err = md_add_new_disk(mddev, info: &info); |
7758 | goto unlock; |
7759 | } |
7760 | break; |
7761 | } |
7762 | |
7763 | /* |
7764 | * The remaining ioctls are changing the state of the |
7765 | * superblock, so we do not allow them on read-only arrays. |
7766 | */ |
7767 | if (!md_is_rdwr(mddev) && mddev->pers) { |
7768 | if (mddev->ro != MD_AUTO_READ) { |
7769 | err = -EROFS; |
7770 | goto unlock; |
7771 | } |
7772 | mddev->ro = MD_RDWR; |
7773 | sysfs_notify_dirent_safe(sd: mddev->sysfs_state); |
7774 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
7775 | /* mddev_unlock will wake thread */ |
7776 | /* If a device failed while we were read-only, we |
7777 | * need to make sure the metadata is updated now. |
7778 | */ |
7779 | if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { |
7780 | mddev_unlock(mddev); |
7781 | wait_event(mddev->sb_wait, |
7782 | !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && |
7783 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); |
7784 | mddev_lock_nointr(mddev); |
7785 | } |
7786 | } |
7787 | |
7788 | switch (cmd) { |
7789 | case ADD_NEW_DISK: |
7790 | { |
7791 | mdu_disk_info_t info; |
7792 | if (copy_from_user(to: &info, from: argp, n: sizeof(info))) |
7793 | err = -EFAULT; |
7794 | else |
7795 | err = md_add_new_disk(mddev, info: &info); |
7796 | goto unlock; |
7797 | } |
7798 | |
7799 | case CLUSTERED_DISK_NACK: |
7800 | if (mddev_is_clustered(mddev)) |
7801 | md_cluster_ops->new_disk_ack(mddev, false); |
7802 | else |
7803 | err = -EINVAL; |
7804 | goto unlock; |
7805 | |
7806 | case HOT_ADD_DISK: |
7807 | err = hot_add_disk(mddev, dev: new_decode_dev(dev: arg)); |
7808 | goto unlock; |
7809 | |
7810 | case RUN_ARRAY: |
7811 | err = do_md_run(mddev); |
7812 | goto unlock; |
7813 | |
7814 | case SET_BITMAP_FILE: |
7815 | err = set_bitmap_file(mddev, fd: (int)arg); |
7816 | goto unlock; |
7817 | |
7818 | default: |
7819 | err = -EINVAL; |
7820 | goto unlock; |
7821 | } |
7822 | |
7823 | unlock: |
7824 | if (mddev->hold_active == UNTIL_IOCTL && |
7825 | err != -EINVAL) |
7826 | mddev->hold_active = 0; |
7827 | |
7828 | md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : |
7829 | mddev_unlock(mddev); |
7830 | |
7831 | out: |
7832 | if(did_set_md_closing) |
7833 | clear_bit(nr: MD_CLOSING, addr: &mddev->flags); |
7834 | return err; |
7835 | } |
7836 | #ifdef CONFIG_COMPAT |
7837 | static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, |
7838 | unsigned int cmd, unsigned long arg) |
7839 | { |
7840 | switch (cmd) { |
7841 | case HOT_REMOVE_DISK: |
7842 | case HOT_ADD_DISK: |
7843 | case SET_DISK_FAULTY: |
7844 | case SET_BITMAP_FILE: |
7845 | /* These take in integer arg, do not convert */ |
7846 | break; |
7847 | default: |
7848 | arg = (unsigned long)compat_ptr(uptr: arg); |
7849 | break; |
7850 | } |
7851 | |
7852 | return md_ioctl(bdev, mode, cmd, arg); |
7853 | } |
7854 | #endif /* CONFIG_COMPAT */ |
7855 | |
7856 | static int md_set_read_only(struct block_device *bdev, bool ro) |
7857 | { |
7858 | struct mddev *mddev = bdev->bd_disk->private_data; |
7859 | int err; |
7860 | |
7861 | err = mddev_lock(mddev); |
7862 | if (err) |
7863 | return err; |
7864 | |
7865 | if (!mddev->raid_disks && !mddev->external) { |
7866 | err = -ENODEV; |
7867 | goto out_unlock; |
7868 | } |
7869 | |
7870 | /* |
7871 | * Transitioning to read-auto need only happen for arrays that call |
7872 | * md_write_start and which are not ready for writes yet. |
7873 | */ |
7874 | if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { |
7875 | err = restart_array(mddev); |
7876 | if (err) |
7877 | goto out_unlock; |
7878 | mddev->ro = MD_AUTO_READ; |
7879 | } |
7880 | |
7881 | out_unlock: |
7882 | mddev_unlock(mddev); |
7883 | return err; |
7884 | } |
7885 | |
7886 | static int md_open(struct gendisk *disk, blk_mode_t mode) |
7887 | { |
7888 | struct mddev *mddev; |
7889 | int err; |
7890 | |
7891 | spin_lock(lock: &all_mddevs_lock); |
7892 | mddev = mddev_get(mddev: disk->private_data); |
7893 | spin_unlock(lock: &all_mddevs_lock); |
7894 | if (!mddev) |
7895 | return -ENODEV; |
7896 | |
7897 | err = mutex_lock_interruptible(&mddev->open_mutex); |
7898 | if (err) |
7899 | goto out; |
7900 | |
7901 | err = -ENODEV; |
7902 | if (test_bit(MD_CLOSING, &mddev->flags)) |
7903 | goto out_unlock; |
7904 | |
7905 | atomic_inc(v: &mddev->openers); |
7906 | mutex_unlock(lock: &mddev->open_mutex); |
7907 | |
7908 | disk_check_media_change(disk); |
7909 | return 0; |
7910 | |
7911 | out_unlock: |
7912 | mutex_unlock(lock: &mddev->open_mutex); |
7913 | out: |
7914 | mddev_put(mddev); |
7915 | return err; |
7916 | } |
7917 | |
7918 | static void md_release(struct gendisk *disk) |
7919 | { |
7920 | struct mddev *mddev = disk->private_data; |
7921 | |
7922 | BUG_ON(!mddev); |
7923 | atomic_dec(v: &mddev->openers); |
7924 | mddev_put(mddev); |
7925 | } |
7926 | |
7927 | static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) |
7928 | { |
7929 | struct mddev *mddev = disk->private_data; |
7930 | unsigned int ret = 0; |
7931 | |
7932 | if (mddev->changed) |
7933 | ret = DISK_EVENT_MEDIA_CHANGE; |
7934 | mddev->changed = 0; |
7935 | return ret; |
7936 | } |
7937 | |
7938 | static void md_free_disk(struct gendisk *disk) |
7939 | { |
7940 | struct mddev *mddev = disk->private_data; |
7941 | |
7942 | mddev_free(mddev); |
7943 | } |
7944 | |
7945 | const struct block_device_operations md_fops = |
7946 | { |
7947 | .owner = THIS_MODULE, |
7948 | .submit_bio = md_submit_bio, |
7949 | .open = md_open, |
7950 | .release = md_release, |
7951 | .ioctl = md_ioctl, |
7952 | #ifdef CONFIG_COMPAT |
7953 | .compat_ioctl = md_compat_ioctl, |
7954 | #endif |
7955 | .getgeo = md_getgeo, |
7956 | .check_events = md_check_events, |
7957 | .set_read_only = md_set_read_only, |
7958 | .free_disk = md_free_disk, |
7959 | }; |
7960 | |
7961 | static int md_thread(void *arg) |
7962 | { |
7963 | struct md_thread *thread = arg; |
7964 | |
7965 | /* |
7966 | * md_thread is a 'system-thread', it's priority should be very |
7967 | * high. We avoid resource deadlocks individually in each |
7968 | * raid personality. (RAID5 does preallocation) We also use RR and |
7969 | * the very same RT priority as kswapd, thus we will never get |
7970 | * into a priority inversion deadlock. |
7971 | * |
7972 | * we definitely have to have equal or higher priority than |
7973 | * bdflush, otherwise bdflush will deadlock if there are too |
7974 | * many dirty RAID5 blocks. |
7975 | */ |
7976 | |
7977 | allow_signal(SIGKILL); |
7978 | while (!kthread_should_stop()) { |
7979 | |
7980 | /* We need to wait INTERRUPTIBLE so that |
7981 | * we don't add to the load-average. |
7982 | * That means we need to be sure no signals are |
7983 | * pending |
7984 | */ |
7985 | if (signal_pending(current)) |
7986 | flush_signals(current); |
7987 | |
7988 | wait_event_interruptible_timeout |
7989 | (thread->wqueue, |
7990 | test_bit(THREAD_WAKEUP, &thread->flags) |
7991 | || kthread_should_stop() || kthread_should_park(), |
7992 | thread->timeout); |
7993 | |
7994 | clear_bit(THREAD_WAKEUP, addr: &thread->flags); |
7995 | if (kthread_should_park()) |
7996 | kthread_parkme(); |
7997 | if (!kthread_should_stop()) |
7998 | thread->run(thread); |
7999 | } |
8000 | |
8001 | return 0; |
8002 | } |
8003 | |
8004 | static void md_wakeup_thread_directly(struct md_thread __rcu *thread) |
8005 | { |
8006 | struct md_thread *t; |
8007 | |
8008 | rcu_read_lock(); |
8009 | t = rcu_dereference(thread); |
8010 | if (t) |
8011 | wake_up_process(tsk: t->tsk); |
8012 | rcu_read_unlock(); |
8013 | } |
8014 | |
8015 | void md_wakeup_thread(struct md_thread __rcu *thread) |
8016 | { |
8017 | struct md_thread *t; |
8018 | |
8019 | rcu_read_lock(); |
8020 | t = rcu_dereference(thread); |
8021 | if (t) { |
8022 | pr_debug("md: waking up MD thread %s.\n" , t->tsk->comm); |
8023 | set_bit(THREAD_WAKEUP, addr: &t->flags); |
8024 | wake_up(&t->wqueue); |
8025 | } |
8026 | rcu_read_unlock(); |
8027 | } |
8028 | EXPORT_SYMBOL(md_wakeup_thread); |
8029 | |
8030 | struct md_thread *md_register_thread(void (*run) (struct md_thread *), |
8031 | struct mddev *mddev, const char *name) |
8032 | { |
8033 | struct md_thread *thread; |
8034 | |
8035 | thread = kzalloc(size: sizeof(struct md_thread), GFP_KERNEL); |
8036 | if (!thread) |
8037 | return NULL; |
8038 | |
8039 | init_waitqueue_head(&thread->wqueue); |
8040 | |
8041 | thread->run = run; |
8042 | thread->mddev = mddev; |
8043 | thread->timeout = MAX_SCHEDULE_TIMEOUT; |
8044 | thread->tsk = kthread_run(md_thread, thread, |
8045 | "%s_%s" , |
8046 | mdname(thread->mddev), |
8047 | name); |
8048 | if (IS_ERR(ptr: thread->tsk)) { |
8049 | kfree(objp: thread); |
8050 | return NULL; |
8051 | } |
8052 | return thread; |
8053 | } |
8054 | EXPORT_SYMBOL(md_register_thread); |
8055 | |
8056 | void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) |
8057 | { |
8058 | struct md_thread *thread = rcu_dereference_protected(*threadp, |
8059 | lockdep_is_held(&mddev->reconfig_mutex)); |
8060 | |
8061 | if (!thread) |
8062 | return; |
8063 | |
8064 | rcu_assign_pointer(*threadp, NULL); |
8065 | synchronize_rcu(); |
8066 | |
8067 | pr_debug("interrupting MD-thread pid %d\n" , task_pid_nr(thread->tsk)); |
8068 | kthread_stop(k: thread->tsk); |
8069 | kfree(objp: thread); |
8070 | } |
8071 | EXPORT_SYMBOL(md_unregister_thread); |
8072 | |
8073 | void md_error(struct mddev *mddev, struct md_rdev *rdev) |
8074 | { |
8075 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
8076 | return; |
8077 | |
8078 | if (!mddev->pers || !mddev->pers->error_handler) |
8079 | return; |
8080 | mddev->pers->error_handler(mddev, rdev); |
8081 | |
8082 | if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) |
8083 | return; |
8084 | |
8085 | if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) |
8086 | set_bit(nr: MD_RECOVERY_RECOVER, addr: &mddev->recovery); |
8087 | sysfs_notify_dirent_safe(sd: rdev->sysfs_state); |
8088 | set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery); |
8089 | if (!test_bit(MD_BROKEN, &mddev->flags)) { |
8090 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
8091 | md_wakeup_thread(mddev->thread); |
8092 | } |
8093 | if (mddev->event_work.func) |
8094 | queue_work(wq: md_misc_wq, work: &mddev->event_work); |
8095 | md_new_event(); |
8096 | } |
8097 | EXPORT_SYMBOL(md_error); |
8098 | |
/* seq_file implementation of /proc/mdstat */
8100 | |
8101 | static void status_unused(struct seq_file *seq) |
8102 | { |
8103 | int i = 0; |
8104 | struct md_rdev *rdev; |
8105 | |
8106 | seq_printf(m: seq, fmt: "unused devices: " ); |
8107 | |
8108 | list_for_each_entry(rdev, &pending_raid_disks, same_set) { |
8109 | i++; |
8110 | seq_printf(m: seq, fmt: "%pg " , rdev->bdev); |
8111 | } |
8112 | if (!i) |
8113 | seq_printf(m: seq, fmt: "<none>" ); |
8114 | |
8115 | seq_printf(m: seq, fmt: "\n" ); |
8116 | } |
8117 | |
8118 | static int status_resync(struct seq_file *seq, struct mddev *mddev) |
8119 | { |
8120 | sector_t max_sectors, resync, res; |
8121 | unsigned long dt, db = 0; |
8122 | sector_t rt, curr_mark_cnt, resync_mark_cnt; |
8123 | int scale, recovery_active; |
8124 | unsigned int per_milli; |
8125 | |
8126 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || |
8127 | test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) |
8128 | max_sectors = mddev->resync_max_sectors; |
8129 | else |
8130 | max_sectors = mddev->dev_sectors; |
8131 | |
8132 | resync = mddev->curr_resync; |
8133 | if (resync < MD_RESYNC_ACTIVE) { |
8134 | if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) |
8135 | /* Still cleaning up */ |
8136 | resync = max_sectors; |
8137 | } else if (resync > max_sectors) { |
8138 | resync = max_sectors; |
8139 | } else { |
8140 | res = atomic_read(v: &mddev->recovery_active); |
8141 | /* |
8142 | * Resync has started, but the subtraction has overflowed or |
8143 | * yielded one of the special values. Force it to active to |
8144 | * ensure the status reports an active resync. |
8145 | */ |
8146 | if (resync < res || resync - res < MD_RESYNC_ACTIVE) |
8147 | resync = MD_RESYNC_ACTIVE; |
8148 | else |
8149 | resync -= res; |
8150 | } |
8151 | |
8152 | if (resync == MD_RESYNC_NONE) { |
8153 | if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { |
8154 | struct md_rdev *rdev; |
8155 | |
8156 | rdev_for_each(rdev, mddev) |
8157 | if (rdev->raid_disk >= 0 && |
8158 | !test_bit(Faulty, &rdev->flags) && |
8159 | rdev->recovery_offset != MaxSector && |
8160 | rdev->recovery_offset) { |
8161 | seq_printf(m: seq, fmt: "\trecover=REMOTE" ); |
8162 | return 1; |
8163 | } |
8164 | if (mddev->reshape_position != MaxSector) |
8165 | seq_printf(m: seq, fmt: "\treshape=REMOTE" ); |
8166 | else |
8167 | seq_printf(m: seq, fmt: "\tresync=REMOTE" ); |
8168 | return 1; |
8169 | } |
8170 | if (mddev->recovery_cp < MaxSector) { |
8171 | seq_printf(m: seq, fmt: "\tresync=PENDING" ); |
8172 | return 1; |
8173 | } |
8174 | return 0; |
8175 | } |
8176 | if (resync < MD_RESYNC_ACTIVE) { |
8177 | seq_printf(m: seq, fmt: "\tresync=DELAYED" ); |
8178 | return 1; |
8179 | } |
8180 | |
8181 | WARN_ON(max_sectors == 0); |
8182 | /* Pick 'scale' such that (resync>>scale)*1000 will fit |
8183 | * in a sector_t, and (max_sectors>>scale) will fit in a |
8184 | * u32, as those are the requirements for sector_div. |
8185 | * Thus 'scale' must be at least 10 |
8186 | */ |
8187 | scale = 10; |
8188 | if (sizeof(sector_t) > sizeof(unsigned long)) { |
8189 | while ( max_sectors/2 > (1ULL<<(scale+32))) |
8190 | scale++; |
8191 | } |
8192 | res = (resync>>scale)*1000; |
8193 | sector_div(res, (u32)((max_sectors>>scale)+1)); |
8194 | |
8195 | per_milli = res; |
8196 | { |
8197 | int i, x = per_milli/50, y = 20-x; |
8198 | seq_printf(m: seq, fmt: "[" ); |
8199 | for (i = 0; i < x; i++) |
8200 | seq_printf(m: seq, fmt: "=" ); |
8201 | seq_printf(m: seq, fmt: ">" ); |
8202 | for (i = 0; i < y; i++) |
8203 | seq_printf(m: seq, fmt: "." ); |
8204 | seq_printf(m: seq, fmt: "] " ); |
8205 | } |
8206 | seq_printf(m: seq, fmt: " %s =%3u.%u%% (%llu/%llu)" , |
8207 | (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? |
8208 | "reshape" : |
8209 | (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? |
8210 | "check" : |
8211 | (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? |
8212 | "resync" : "recovery" ))), |
8213 | per_milli/10, per_milli % 10, |
8214 | (unsigned long long) resync/2, |
8215 | (unsigned long long) max_sectors/2); |
8216 | |
8217 | /* |
8218 | * dt: time from mark until now |
8219 | * db: blocks written from mark until now |
8220 | * rt: remaining time |
8221 | * |
8222 | * rt is a sector_t, which is always 64bit now. We are keeping |
8223 | * the original algorithm, but it is not really necessary. |
8224 | * |
8225 | * Original algorithm: |
8226 | * So we divide before multiply in case it is 32bit and close |
8227 | * to the limit. |
8228 | * We scale the divisor (db) by 32 to avoid losing precision |
8229 | * near the end of resync when the number of remaining sectors |
8230 | * is close to 'db'. |
8231 | * We then divide rt by 32 after multiplying by db to compensate. |
8232 | * The '+1' avoids division by zero if db is very small. |
8233 | */ |
8234 | dt = ((jiffies - mddev->resync_mark) / HZ); |
8235 | if (!dt) dt++; |
8236 | |
8237 | curr_mark_cnt = mddev->curr_mark_cnt; |
8238 | recovery_active = atomic_read(v: &mddev->recovery_active); |
8239 | resync_mark_cnt = mddev->resync_mark_cnt; |
8240 | |
8241 | if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) |
8242 | db = curr_mark_cnt - (recovery_active + resync_mark_cnt); |
8243 | |
8244 | rt = max_sectors - resync; /* number of remaining sectors */ |
8245 | rt = div64_u64(dividend: rt, divisor: db/32+1); |
8246 | rt *= dt; |
8247 | rt >>= 5; |
8248 | |
8249 | seq_printf(m: seq, fmt: " finish=%lu.%lumin" , (unsigned long)rt / 60, |
8250 | ((unsigned long)rt % 60)/6); |
8251 | |
8252 | seq_printf(m: seq, fmt: " speed=%ldK/sec" , db/2/dt); |
8253 | return 1; |
8254 | } |
8255 | |
8256 | static void *md_seq_start(struct seq_file *seq, loff_t *pos) |
8257 | __acquires(&all_mddevs_lock) |
8258 | { |
8259 | struct md_personality *pers; |
8260 | |
8261 | seq_puts(m: seq, s: "Personalities : " ); |
8262 | spin_lock(lock: &pers_lock); |
8263 | list_for_each_entry(pers, &pers_list, list) |
8264 | seq_printf(m: seq, fmt: "[%s] " , pers->name); |
8265 | |
8266 | spin_unlock(lock: &pers_lock); |
8267 | seq_puts(m: seq, s: "\n" ); |
8268 | seq->poll_event = atomic_read(v: &md_event_count); |
8269 | |
8270 | spin_lock(lock: &all_mddevs_lock); |
8271 | |
8272 | return seq_list_start(head: &all_mddevs, pos: *pos); |
8273 | } |
8274 | |
8275 | static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
8276 | { |
8277 | return seq_list_next(v, head: &all_mddevs, ppos: pos); |
8278 | } |
8279 | |
8280 | static void md_seq_stop(struct seq_file *seq, void *v) |
8281 | __releases(&all_mddevs_lock) |
8282 | { |
8283 | status_unused(seq); |
8284 | spin_unlock(lock: &all_mddevs_lock); |
8285 | } |
8286 | |
8287 | static int md_seq_show(struct seq_file *seq, void *v) |
8288 | { |
8289 | struct mddev *mddev = list_entry(v, struct mddev, all_mddevs); |
8290 | sector_t sectors; |
8291 | struct md_rdev *rdev; |
8292 | |
8293 | if (!mddev_get(mddev)) |
8294 | return 0; |
8295 | |
8296 | spin_unlock(lock: &all_mddevs_lock); |
8297 | spin_lock(lock: &mddev->lock); |
8298 | if (mddev->pers || mddev->raid_disks || !list_empty(head: &mddev->disks)) { |
8299 | seq_printf(m: seq, fmt: "%s : %sactive" , mdname(mddev), |
8300 | mddev->pers ? "" : "in" ); |
8301 | if (mddev->pers) { |
8302 | if (mddev->ro == MD_RDONLY) |
8303 | seq_printf(m: seq, fmt: " (read-only)" ); |
8304 | if (mddev->ro == MD_AUTO_READ) |
8305 | seq_printf(m: seq, fmt: " (auto-read-only)" ); |
8306 | seq_printf(m: seq, fmt: " %s" , mddev->pers->name); |
8307 | } |
8308 | |
8309 | sectors = 0; |
8310 | rcu_read_lock(); |
8311 | rdev_for_each_rcu(rdev, mddev) { |
8312 | seq_printf(m: seq, fmt: " %pg[%d]" , rdev->bdev, rdev->desc_nr); |
8313 | |
8314 | if (test_bit(WriteMostly, &rdev->flags)) |
8315 | seq_printf(m: seq, fmt: "(W)" ); |
8316 | if (test_bit(Journal, &rdev->flags)) |
8317 | seq_printf(m: seq, fmt: "(J)" ); |
8318 | if (test_bit(Faulty, &rdev->flags)) { |
8319 | seq_printf(m: seq, fmt: "(F)" ); |
8320 | continue; |
8321 | } |
8322 | if (rdev->raid_disk < 0) |
8323 | seq_printf(m: seq, fmt: "(S)" ); /* spare */ |
8324 | if (test_bit(Replacement, &rdev->flags)) |
8325 | seq_printf(m: seq, fmt: "(R)" ); |
8326 | sectors += rdev->sectors; |
8327 | } |
8328 | rcu_read_unlock(); |
8329 | |
8330 | if (!list_empty(head: &mddev->disks)) { |
8331 | if (mddev->pers) |
8332 | seq_printf(m: seq, fmt: "\n %llu blocks" , |
8333 | (unsigned long long) |
8334 | mddev->array_sectors / 2); |
8335 | else |
8336 | seq_printf(m: seq, fmt: "\n %llu blocks" , |
8337 | (unsigned long long)sectors / 2); |
8338 | } |
8339 | if (mddev->persistent) { |
8340 | if (mddev->major_version != 0 || |
8341 | mddev->minor_version != 90) { |
8342 | seq_printf(m: seq,fmt: " super %d.%d" , |
8343 | mddev->major_version, |
8344 | mddev->minor_version); |
8345 | } |
8346 | } else if (mddev->external) |
8347 | seq_printf(m: seq, fmt: " super external:%s" , |
8348 | mddev->metadata_type); |
8349 | else |
8350 | seq_printf(m: seq, fmt: " super non-persistent" ); |
8351 | |
8352 | if (mddev->pers) { |
8353 | mddev->pers->status(seq, mddev); |
8354 | seq_printf(m: seq, fmt: "\n " ); |
8355 | if (mddev->pers->sync_request) { |
8356 | if (status_resync(seq, mddev)) |
8357 | seq_printf(m: seq, fmt: "\n " ); |
8358 | } |
8359 | } else |
8360 | seq_printf(m: seq, fmt: "\n " ); |
8361 | |
8362 | md_bitmap_status(seq, bitmap: mddev->bitmap); |
8363 | |
8364 | seq_printf(m: seq, fmt: "\n" ); |
8365 | } |
8366 | spin_unlock(lock: &mddev->lock); |
8367 | spin_lock(lock: &all_mddevs_lock); |
8368 | if (atomic_dec_and_test(v: &mddev->active)) |
8369 | __mddev_put(mddev); |
8370 | |
8371 | return 0; |
8372 | } |
8373 | |
8374 | static const struct seq_operations md_seq_ops = { |
8375 | .start = md_seq_start, |
8376 | .next = md_seq_next, |
8377 | .stop = md_seq_stop, |
8378 | .show = md_seq_show, |
8379 | }; |
8380 | |
8381 | static int md_seq_open(struct inode *inode, struct file *file) |
8382 | { |
8383 | struct seq_file *seq; |
8384 | int error; |
8385 | |
8386 | error = seq_open(file, &md_seq_ops); |
8387 | if (error) |
8388 | return error; |
8389 | |
8390 | seq = file->private_data; |
8391 | seq->poll_event = atomic_read(v: &md_event_count); |
8392 | return error; |
8393 | } |
8394 | |
8395 | static int md_unloading; |
8396 | static __poll_t mdstat_poll(struct file *filp, poll_table *wait) |
8397 | { |
8398 | struct seq_file *seq = filp->private_data; |
8399 | __poll_t mask; |
8400 | |
8401 | if (md_unloading) |
8402 | return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; |
8403 | poll_wait(filp, wait_address: &md_event_waiters, p: wait); |
8404 | |
8405 | /* always allow read */ |
8406 | mask = EPOLLIN | EPOLLRDNORM; |
8407 | |
8408 | if (seq->poll_event != atomic_read(v: &md_event_count)) |
8409 | mask |= EPOLLERR | EPOLLPRI; |
8410 | return mask; |
8411 | } |
8412 | |
8413 | static const struct proc_ops mdstat_proc_ops = { |
8414 | .proc_open = md_seq_open, |
8415 | .proc_read = seq_read, |
8416 | .proc_lseek = seq_lseek, |
8417 | .proc_release = seq_release, |
8418 | .proc_poll = mdstat_poll, |
8419 | }; |
8420 | |
8421 | int register_md_personality(struct md_personality *p) |
8422 | { |
8423 | pr_debug("md: %s personality registered for level %d\n" , |
8424 | p->name, p->level); |
8425 | spin_lock(lock: &pers_lock); |
8426 | list_add_tail(new: &p->list, head: &pers_list); |
8427 | spin_unlock(lock: &pers_lock); |
8428 | return 0; |
8429 | } |
8430 | EXPORT_SYMBOL(register_md_personality); |
8431 | |
8432 | int unregister_md_personality(struct md_personality *p) |
8433 | { |
8434 | pr_debug("md: %s personality unregistered\n" , p->name); |
8435 | spin_lock(lock: &pers_lock); |
8436 | list_del_init(entry: &p->list); |
8437 | spin_unlock(lock: &pers_lock); |
8438 | return 0; |
8439 | } |
8440 | EXPORT_SYMBOL(unregister_md_personality); |
8441 | |
8442 | int register_md_cluster_operations(struct md_cluster_operations *ops, |
8443 | struct module *module) |
8444 | { |
8445 | int ret = 0; |
8446 | spin_lock(lock: &pers_lock); |
8447 | if (md_cluster_ops != NULL) |
8448 | ret = -EALREADY; |
8449 | else { |
8450 | md_cluster_ops = ops; |
8451 | md_cluster_mod = module; |
8452 | } |
8453 | spin_unlock(lock: &pers_lock); |
8454 | return ret; |
8455 | } |
8456 | EXPORT_SYMBOL(register_md_cluster_operations); |
8457 | |
8458 | int unregister_md_cluster_operations(void) |
8459 | { |
8460 | spin_lock(lock: &pers_lock); |
8461 | md_cluster_ops = NULL; |
8462 | spin_unlock(lock: &pers_lock); |
8463 | return 0; |
8464 | } |
8465 | EXPORT_SYMBOL(unregister_md_cluster_operations); |
8466 | |
8467 | int md_setup_cluster(struct mddev *mddev, int nodes) |
8468 | { |
8469 | int ret; |
8470 | if (!md_cluster_ops) |
8471 | request_module("md-cluster" ); |
8472 | spin_lock(lock: &pers_lock); |
8473 | /* ensure module won't be unloaded */ |
8474 | if (!md_cluster_ops || !try_module_get(module: md_cluster_mod)) { |
8475 | pr_warn("can't find md-cluster module or get its reference.\n" ); |
8476 | spin_unlock(lock: &pers_lock); |
8477 | return -ENOENT; |
8478 | } |
8479 | spin_unlock(lock: &pers_lock); |
8480 | |
8481 | ret = md_cluster_ops->join(mddev, nodes); |
8482 | if (!ret) |
8483 | mddev->safemode_delay = 0; |
8484 | return ret; |
8485 | } |
8486 | |
8487 | void md_cluster_stop(struct mddev *mddev) |
8488 | { |
8489 | if (!md_cluster_ops) |
8490 | return; |
8491 | md_cluster_ops->leave(mddev); |
8492 | module_put(module: md_cluster_mod); |
8493 | } |
8494 | |
8495 | static int is_mddev_idle(struct mddev *mddev, int init) |
8496 | { |
8497 | struct md_rdev *rdev; |
8498 | int idle; |
8499 | int curr_events; |
8500 | |
8501 | idle = 1; |
8502 | rcu_read_lock(); |
8503 | rdev_for_each_rcu(rdev, mddev) { |
8504 | struct gendisk *disk = rdev->bdev->bd_disk; |
8505 | curr_events = (int)part_stat_read_accum(disk->part0, sectors) - |
8506 | atomic_read(v: &disk->sync_io); |
8507 | /* sync IO will cause sync_io to increase before the disk_stats |
8508 | * as sync_io is counted when a request starts, and |
8509 | * disk_stats is counted when it completes. |
8510 | * So resync activity will cause curr_events to be smaller than |
8511 | * when there was no such activity. |
8512 | * non-sync IO will cause disk_stat to increase without |
8513 | * increasing sync_io so curr_events will (eventually) |
8514 | * be larger than it was before. Once it becomes |
8515 | * substantially larger, the test below will cause |
8516 | * the array to appear non-idle, and resync will slow |
8517 | * down. |
8518 | * If there is a lot of outstanding resync activity when |
8519 | * we set last_event to curr_events, then all that activity |
8520 | * completing might cause the array to appear non-idle |
8521 | * and resync will be slowed down even though there might |
8522 | * not have been non-resync activity. This will only |
8523 | * happen once though. 'last_events' will soon reflect |
8524 | * the state where there is little or no outstanding |
8525 | * resync requests, and further resync activity will |
8526 | * always make curr_events less than last_events. |
8527 | * |
8528 | */ |
8529 | if (init || curr_events - rdev->last_events > 64) { |
8530 | rdev->last_events = curr_events; |
8531 | idle = 0; |
8532 | } |
8533 | } |
8534 | rcu_read_unlock(); |
8535 | return idle; |
8536 | } |
8537 | |
8538 | void md_done_sync(struct mddev *mddev, int blocks, int ok) |
8539 | { |
8540 | /* another "blocks" (512byte) blocks have been synced */ |
8541 | atomic_sub(i: blocks, v: &mddev->recovery_active); |
8542 | wake_up(&mddev->recovery_wait); |
8543 | if (!ok) { |
8544 | set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery); |
8545 | set_bit(nr: MD_RECOVERY_ERROR, addr: &mddev->recovery); |
8546 | md_wakeup_thread(mddev->thread); |
8547 | // stop recovery, signal do_sync .... |
8548 | } |
8549 | } |
8550 | EXPORT_SYMBOL(md_done_sync); |
8551 | |
8552 | /* md_write_start(mddev, bi) |
8553 | * If we need to update some array metadata (e.g. 'active' flag |
8554 | * in superblock) before writing, schedule a superblock update |
8555 | * and wait for it to complete. |
8556 | * A return value of 'false' means that the write wasn't recorded |
8557 | * and cannot proceed as the array is being suspend. |
8558 | */ |
8559 | bool md_write_start(struct mddev *mddev, struct bio *bi) |
8560 | { |
8561 | int did_change = 0; |
8562 | |
8563 | if (bio_data_dir(bi) != WRITE) |
8564 | return true; |
8565 | |
8566 | BUG_ON(mddev->ro == MD_RDONLY); |
8567 | if (mddev->ro == MD_AUTO_READ) { |
8568 | /* need to switch to read/write */ |
8569 | flush_work(work: &mddev->sync_work); |
8570 | mddev->ro = MD_RDWR; |
8571 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
8572 | md_wakeup_thread(mddev->thread); |
8573 | md_wakeup_thread(mddev->sync_thread); |
8574 | did_change = 1; |
8575 | } |
8576 | rcu_read_lock(); |
8577 | percpu_ref_get(ref: &mddev->writes_pending); |
8578 | smp_mb(); /* Match smp_mb in set_in_sync() */ |
8579 | if (mddev->safemode == 1) |
8580 | mddev->safemode = 0; |
8581 | /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ |
8582 | if (mddev->in_sync || mddev->sync_checkers) { |
8583 | spin_lock(lock: &mddev->lock); |
8584 | if (mddev->in_sync) { |
8585 | mddev->in_sync = 0; |
8586 | set_bit(nr: MD_SB_CHANGE_CLEAN, addr: &mddev->sb_flags); |
8587 | set_bit(nr: MD_SB_CHANGE_PENDING, addr: &mddev->sb_flags); |
8588 | md_wakeup_thread(mddev->thread); |
8589 | did_change = 1; |
8590 | } |
8591 | spin_unlock(lock: &mddev->lock); |
8592 | } |
8593 | rcu_read_unlock(); |
8594 | if (did_change) |
8595 | sysfs_notify_dirent_safe(sd: mddev->sysfs_state); |
8596 | if (!mddev->has_superblocks) |
8597 | return true; |
8598 | wait_event(mddev->sb_wait, |
8599 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || |
8600 | is_md_suspended(mddev)); |
8601 | if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
8602 | percpu_ref_put(ref: &mddev->writes_pending); |
8603 | return false; |
8604 | } |
8605 | return true; |
8606 | } |
8607 | EXPORT_SYMBOL(md_write_start); |
8608 | |
8609 | /* md_write_inc can only be called when md_write_start() has |
8610 | * already been called at least once of the current request. |
8611 | * It increments the counter and is useful when a single request |
8612 | * is split into several parts. Each part causes an increment and |
8613 | * so needs a matching md_write_end(). |
8614 | * Unlike md_write_start(), it is safe to call md_write_inc() inside |
8615 | * a spinlocked region. |
8616 | */ |
8617 | void md_write_inc(struct mddev *mddev, struct bio *bi) |
8618 | { |
8619 | if (bio_data_dir(bi) != WRITE) |
8620 | return; |
8621 | WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); |
8622 | percpu_ref_get(ref: &mddev->writes_pending); |
8623 | } |
8624 | EXPORT_SYMBOL(md_write_inc); |
8625 | |
8626 | void md_write_end(struct mddev *mddev) |
8627 | { |
8628 | percpu_ref_put(ref: &mddev->writes_pending); |
8629 | |
8630 | if (mddev->safemode == 2) |
8631 | md_wakeup_thread(mddev->thread); |
8632 | else if (mddev->safemode_delay) |
8633 | /* The roundup() ensures this only performs locking once |
8634 | * every ->safemode_delay jiffies |
8635 | */ |
8636 | mod_timer(timer: &mddev->safemode_timer, |
8637 | roundup(jiffies, mddev->safemode_delay) + |
8638 | mddev->safemode_delay); |
8639 | } |
8640 | |
8641 | EXPORT_SYMBOL(md_write_end); |
8642 | |
8643 | /* This is used by raid0 and raid10 */ |
8644 | void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, |
8645 | struct bio *bio, sector_t start, sector_t size) |
8646 | { |
8647 | struct bio *discard_bio = NULL; |
8648 | |
8649 | if (__blkdev_issue_discard(bdev: rdev->bdev, sector: start, nr_sects: size, GFP_NOIO, |
8650 | biop: &discard_bio) || !discard_bio) |
8651 | return; |
8652 | |
8653 | bio_chain(discard_bio, bio); |
8654 | bio_clone_blkg_association(dst: discard_bio, src: bio); |
8655 | if (mddev->gendisk) |
8656 | trace_block_bio_remap(bio: discard_bio, |
8657 | dev: disk_devt(disk: mddev->gendisk), |
8658 | from: bio->bi_iter.bi_sector); |
8659 | submit_bio_noacct(bio: discard_bio); |
8660 | } |
8661 | EXPORT_SYMBOL_GPL(md_submit_discard_bio); |
8662 | |
8663 | static void md_end_clone_io(struct bio *bio) |
8664 | { |
8665 | struct md_io_clone *md_io_clone = bio->bi_private; |
8666 | struct bio *orig_bio = md_io_clone->orig_bio; |
8667 | struct mddev *mddev = md_io_clone->mddev; |
8668 | |
8669 | orig_bio->bi_status = bio->bi_status; |
8670 | |
8671 | if (md_io_clone->start_time) |
8672 | bio_end_io_acct(bio: orig_bio, start_time: md_io_clone->start_time); |
8673 | |
8674 | bio_put(bio); |
8675 | bio_endio(orig_bio); |
8676 | percpu_ref_put(ref: &mddev->active_io); |
8677 | } |
8678 | |
8679 | static void md_clone_bio(struct mddev *mddev, struct bio **bio) |
8680 | { |
8681 | struct block_device *bdev = (*bio)->bi_bdev; |
8682 | struct md_io_clone *md_io_clone; |
8683 | struct bio *clone = |
8684 | bio_alloc_clone(bdev, bio_src: *bio, GFP_NOIO, bs: &mddev->io_clone_set); |
8685 | |
8686 | md_io_clone = container_of(clone, struct md_io_clone, bio_clone); |
8687 | md_io_clone->orig_bio = *bio; |
8688 | md_io_clone->mddev = mddev; |
8689 | if (blk_queue_io_stat(bdev->bd_disk->queue)) |
8690 | md_io_clone->start_time = bio_start_io_acct(bio: *bio); |
8691 | |
8692 | clone->bi_end_io = md_end_clone_io; |
8693 | clone->bi_private = md_io_clone; |
8694 | *bio = clone; |
8695 | } |
8696 | |
8697 | void md_account_bio(struct mddev *mddev, struct bio **bio) |
8698 | { |
8699 | percpu_ref_get(ref: &mddev->active_io); |
8700 | md_clone_bio(mddev, bio); |
8701 | } |
8702 | EXPORT_SYMBOL_GPL(md_account_bio); |
8703 | |
8704 | /* md_allow_write(mddev) |
8705 | * Calling this ensures that the array is marked 'active' so that writes |
8706 | * may proceed without blocking. It is important to call this before |
8707 | * attempting a GFP_KERNEL allocation while holding the mddev lock. |
8708 | * Must be called with mddev_lock held. |
8709 | */ |
8710 | void md_allow_write(struct mddev *mddev) |
8711 | { |
8712 | if (!mddev->pers) |
8713 | return; |
8714 | if (!md_is_rdwr(mddev)) |
8715 | return; |
8716 | if (!mddev->pers->sync_request) |
8717 | return; |
8718 | |
8719 | spin_lock(lock: &mddev->lock); |
8720 | if (mddev->in_sync) { |
8721 | mddev->in_sync = 0; |
8722 | set_bit(nr: MD_SB_CHANGE_CLEAN, addr: &mddev->sb_flags); |
8723 | set_bit(nr: MD_SB_CHANGE_PENDING, addr: &mddev->sb_flags); |
8724 | if (mddev->safemode_delay && |
8725 | mddev->safemode == 0) |
8726 | mddev->safemode = 1; |
8727 | spin_unlock(lock: &mddev->lock); |
8728 | md_update_sb(mddev, 0); |
8729 | sysfs_notify_dirent_safe(sd: mddev->sysfs_state); |
8730 | /* wait for the dirty state to be recorded in the metadata */ |
8731 | wait_event(mddev->sb_wait, |
8732 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); |
8733 | } else |
8734 | spin_unlock(lock: &mddev->lock); |
8735 | } |
8736 | EXPORT_SYMBOL_GPL(md_allow_write); |
8737 | |
/*
 * Tunables for md_do_sync():
 *  SYNC_MARKS       - number of (time, sector-count) progress marks kept
 *  SYNC_MARK_STEP   - minimum jiffies between stepping to the next mark
 *  UPDATE_FREQUENCY - jiffies after which curr_resync_completed is
 *                     refreshed even if little progress was made
 */
8738 | #define SYNC_MARKS 10 |
8739 | #define SYNC_MARK_STEP (3*HZ) |
8740 | #define UPDATE_FREQUENCY (5*60*HZ) |
/*
 * md_do_sync() - run one resync / recovery / reshape / check pass for the
 * array behind @thread (thread->mddev).
 *
 * Outline, as far as this function itself shows:
 *  - return at once if MD_RECOVERY_DONE or MD_RECOVERY_WAIT is set, or
 *    (after setting MD_RECOVERY_INTR) if the array is not read-write;
 *  - on a clustered array call md_cluster_ops->resync_start() and jump to
 *    "skip" if that fails, or if none of SYNC/RESHAPE/RECOVER is set
 *    while curr_resync_completed is below resync_max_sectors;
 *  - derive the description string from the MD_RECOVERY_* bits and store
 *    it (or the shorter action name, when one was chosen) in
 *    ->last_sync_action;
 *  - unless ->parallel_resync is set, wait on resync_wait for any other
 *    array that match_mddev_units() reports and that is at least as far
 *    along as we are;
 *  - choose the start sector j and the limit max_sectors, then loop
 *    calling ->pers->sync_request(), throttled between speed_min() and
 *    speed_max();
 *  - afterwards update recovery_cp / rdev->recovery_offset, set
 *    MD_SB_CHANGE_PENDING and MD_SB_CHANGE_DEVS, optionally update the
 *    array size after a growing reshape, set MD_RECOVERY_DONE, and wake
 *    resync_wait waiters and mddev->thread.
 */
8741 | void md_do_sync(struct md_thread *thread) |
8742 | { |
8743 | struct mddev *mddev = thread->mddev; |
8744 | struct mddev *mddev2; |
8745 | unsigned int currspeed = 0, window; |
8746 | sector_t max_sectors,j, io_sectors, recovery_done; |
8747 | unsigned long mark[SYNC_MARKS]; |
8748 | unsigned long update_time; |
8749 | sector_t mark_cnt[SYNC_MARKS]; |
8750 | int last_mark,m; |
8751 | sector_t last_check; |
8752 | int skipped = 0; |
8753 | struct md_rdev *rdev; |
8754 | char *desc, *action = NULL; |
8755 | struct blk_plug plug; |
8756 | int ret; |
8757 | |
8758 | /* just in case thread restarts... */ |
8759 | if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || |
8760 | test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) |
8761 | return; |
8762 | if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ |
8763 | set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery); |
8764 | return; |
8765 | } |
8766 | |
8767 | if (mddev_is_clustered(mddev)) { |
8768 | ret = md_cluster_ops->resync_start(mddev); |
8769 | if (ret) |
8770 | goto skip; |
8771 | |
8772 | set_bit(nr: MD_CLUSTER_RESYNC_LOCKED, addr: &mddev->flags); |
8773 | if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || |
8774 | test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || |
8775 | test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) |
8776 | && ((unsigned long long)mddev->curr_resync_completed |
8777 | < (unsigned long long)mddev->resync_max_sectors)) |
8778 | goto skip; |
8779 | } |
8780 | |
/* Pick the human-readable description (and action name) for this pass. */
8781 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
8782 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { |
8783 | desc = "data-check" ; |
8784 | action = "check" ; |
8785 | } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { |
8786 | desc = "requested-resync" ; |
8787 | action = "repair" ; |
8788 | } else |
8789 | desc = "resync" ; |
8790 | } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) |
8791 | desc = "reshape" ; |
8792 | else |
8793 | desc = "recovery" ; |
8794 | |
8795 | mddev->last_sync_action = action ?: desc; |
8796 | |
8797 | /* |
8798 | * Before starting a resync we must have set curr_resync to |
8799 | * 2, and then checked that every "conflicting" array has curr_resync |
8800 | * less than ours. When we find one that is the same or higher |
8801 | * we wait on resync_wait. To avoid deadlock, we reduce curr_resync |
8802 | * to 1 if we choose to yield (based arbitrarily on address of mddev structure). |
8803 | * This will mean we have to start checking from the beginning again. |
8804 | * |
8805 | */ |
8806 | |
8807 | do { |
8808 | int mddev2_minor = -1; |
8809 | mddev->curr_resync = MD_RESYNC_DELAYED; |
8810 | |
8811 | try_again: |
8812 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
8813 | goto skip; |
8814 | spin_lock(lock: &all_mddevs_lock); |
8815 | list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { |
8816 | if (test_bit(MD_DELETED, &mddev2->flags)) |
8817 | continue; |
8818 | if (mddev2 == mddev) |
8819 | continue; |
8820 | if (!mddev->parallel_resync |
8821 | && mddev2->curr_resync |
8822 | && match_mddev_units(mddev1: mddev, mddev2)) { |
8823 | DEFINE_WAIT(wq); |
8824 | if (mddev < mddev2 && |
8825 | mddev->curr_resync == MD_RESYNC_DELAYED) { |
8826 | /* arbitrarily yield */ |
8827 | mddev->curr_resync = MD_RESYNC_YIELDED; |
8828 | wake_up(&resync_wait); |
8829 | } |
8830 | if (mddev > mddev2 && |
8831 | mddev->curr_resync == MD_RESYNC_YIELDED) |
8832 | /* no need to wait here, we can wait the next |
8833 | * time 'round when curr_resync == 2 |
8834 | */ |
8835 | continue; |
8836 | /* We need to wait 'interruptible' so as not to |
8837 | * contribute to the load average, and not to |
8838 | * be caught by 'softlockup' |
8839 | */ |
8840 | prepare_to_wait(wq_head: &resync_wait, wq_entry: &wq, TASK_INTERRUPTIBLE); |
8841 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && |
8842 | mddev2->curr_resync >= mddev->curr_resync) { |
8843 | if (mddev2_minor != mddev2->md_minor) { |
8844 | mddev2_minor = mddev2->md_minor; |
8845 | pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n" , |
8846 | desc, mdname(mddev), |
8847 | mdname(mddev2)); |
8848 | } |
8849 | spin_unlock(lock: &all_mddevs_lock); |
8850 | |
8851 | if (signal_pending(current)) |
8852 | flush_signals(current); |
8853 | schedule(); |
8854 | finish_wait(wq_head: &resync_wait, wq_entry: &wq); |
8855 | goto try_again; |
8856 | } |
8857 | finish_wait(wq_head: &resync_wait, wq_entry: &wq); |
8858 | } |
8859 | } |
8860 | spin_unlock(lock: &all_mddevs_lock); |
8861 | } while (mddev->curr_resync < MD_RESYNC_DELAYED); |
8862 | |
/* Choose the starting sector (j) and the end of the range (max_sectors). */
8863 | j = 0; |
8864 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
8865 | /* resync follows the size requested by the personality, |
8866 | * which defaults to physical size, but can be virtual size |
8867 | */ |
8868 | max_sectors = mddev->resync_max_sectors; |
8869 | atomic64_set(v: &mddev->resync_mismatches, i: 0); |
8870 | /* we don't use the checkpoint if there's a bitmap */ |
8871 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) |
8872 | j = mddev->resync_min; |
8873 | else if (!mddev->bitmap) |
8874 | j = mddev->recovery_cp; |
8875 | |
8876 | } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { |
8877 | max_sectors = mddev->resync_max_sectors; |
8878 | /* |
8879 | * If the original node aborts reshaping then we continue the |
8880 | * reshaping, so set j again to avoid restart reshape from the |
8881 | * first beginning |
8882 | */ |
8883 | if (mddev_is_clustered(mddev) && |
8884 | mddev->reshape_position != MaxSector) |
8885 | j = mddev->reshape_position; |
8886 | } else { |
8887 | /* recovery follows the physical size of devices */ |
8888 | max_sectors = mddev->dev_sectors; |
8889 | j = MaxSector; |
8890 | rcu_read_lock(); |
8891 | rdev_for_each_rcu(rdev, mddev) |
8892 | if (rdev->raid_disk >= 0 && |
8893 | !test_bit(Journal, &rdev->flags) && |
8894 | !test_bit(Faulty, &rdev->flags) && |
8895 | !test_bit(In_sync, &rdev->flags) && |
8896 | rdev->recovery_offset < j) |
8897 | j = rdev->recovery_offset; |
8898 | rcu_read_unlock(); |
8899 | |
8900 | /* If there is a bitmap, we need to make sure all |
8901 | * writes that started before we added a spare |
8902 | * complete before we start doing a recovery. |
8903 | * Otherwise the write might complete and (via |
8904 | * bitmap_endwrite) set a bit in the bitmap after the |
8905 | * recovery has checked that bit and skipped that |
8906 | * region. |
8907 | */ |
8908 | if (mddev->bitmap) { |
8909 | mddev->pers->quiesce(mddev, 1); |
8910 | mddev->pers->quiesce(mddev, 0); |
8911 | } |
8912 | } |
8913 | |
8914 | pr_info("md: %s of RAID array %s\n" , desc, mdname(mddev)); |
8915 | pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n" , speed_min(mddev)); |
8916 | pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n" , |
8917 | speed_max(mddev), desc); |
8918 | |
8919 | is_mddev_idle(mddev, init: 1); /* this initializes IO event counters */ |
8920 | |
/* Seed every progress mark with "now" and zero sectors done. */
8921 | io_sectors = 0; |
8922 | for (m = 0; m < SYNC_MARKS; m++) { |
8923 | mark[m] = jiffies; |
8924 | mark_cnt[m] = io_sectors; |
8925 | } |
8926 | last_mark = 0; |
8927 | mddev->resync_mark = mark[last_mark]; |
8928 | mddev->resync_mark_cnt = mark_cnt[last_mark]; |
8929 | |
8930 | /* |
8931 | * Tune reconstruction: |
8932 | */ |
8933 | window = 32 * (PAGE_SIZE / 512); |
8934 | pr_debug("md: using %dk window, over a total of %lluk.\n" , |
8935 | window/2, (unsigned long long)max_sectors/2); |
8936 | |
8937 | atomic_set(v: &mddev->recovery_active, i: 0); |
8938 | last_check = 0; |
8939 | |
8940 | if (j >= MD_RESYNC_ACTIVE) { |
8941 | pr_debug("md: resuming %s of %s from checkpoint.\n" , |
8942 | desc, mdname(mddev)); |
8943 | mddev->curr_resync = j; |
8944 | } else |
8945 | mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ |
8946 | mddev->curr_resync_completed = j; |
8947 | sysfs_notify_dirent_safe(sd: mddev->sysfs_completed); |
8948 | md_new_event(); |
8949 | update_time = jiffies; |
8950 | |
/* Main loop: one ->sync_request() call per iteration until max_sectors. */
8951 | blk_start_plug(&plug); |
8952 | while (j < max_sectors) { |
8953 | sector_t sectors; |
8954 | |
8955 | skipped = 0; |
8956 | |
8957 | if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
8958 | ((mddev->curr_resync > mddev->curr_resync_completed && |
8959 | (mddev->curr_resync - mddev->curr_resync_completed) |
8960 | > (max_sectors >> 4)) || |
8961 | time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || |
8962 | (j - mddev->curr_resync_completed)*2 |
8963 | >= mddev->resync_max - mddev->curr_resync_completed || |
8964 | mddev->curr_resync_completed > mddev->resync_max |
8965 | )) { |
8966 | /* time to update curr_resync_completed */ |
8967 | wait_event(mddev->recovery_wait, |
8968 | atomic_read(&mddev->recovery_active) == 0); |
8969 | mddev->curr_resync_completed = j; |
8970 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && |
8971 | j > mddev->recovery_cp) |
8972 | mddev->recovery_cp = j; |
8973 | update_time = jiffies; |
8974 | set_bit(nr: MD_SB_CHANGE_CLEAN, addr: &mddev->sb_flags); |
8975 | sysfs_notify_dirent_safe(sd: mddev->sysfs_completed); |
8976 | } |
8977 | |
8978 | while (j >= mddev->resync_max && |
8979 | !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
8980 | /* As this condition is controlled by user-space, |
8981 | * we can block indefinitely, so use '_interruptible' |
8982 | * to avoid triggering warnings. |
8983 | */ |
8984 | flush_signals(current); /* just in case */ |
8985 | wait_event_interruptible(mddev->recovery_wait, |
8986 | mddev->resync_max > j |
8987 | || test_bit(MD_RECOVERY_INTR, |
8988 | &mddev->recovery)); |
8989 | } |
8990 | |
8991 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
8992 | break; |
8993 | |
8994 | sectors = mddev->pers->sync_request(mddev, j, &skipped); |
8995 | if (sectors == 0) { |
8996 | set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery); |
8997 | break; |
8998 | } |
8999 | |
9000 | if (!skipped) { /* actual IO requested */ |
9001 | io_sectors += sectors; |
9002 | atomic_add(i: sectors, v: &mddev->recovery_active); |
9003 | } |
9004 | |
9005 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
9006 | break; |
9007 | |
9008 | j += sectors; |
9009 | if (j > max_sectors) |
9010 | /* when skipping, extra large numbers can be returned. */ |
9011 | j = max_sectors; |
9012 | if (j >= MD_RESYNC_ACTIVE) |
9013 | mddev->curr_resync = j; |
9014 | mddev->curr_mark_cnt = io_sectors; |
9015 | if (last_check == 0) |
9016 | /* this is the earliest that rebuild will be |
9017 | * visible in /proc/mdstat |
9018 | */ |
9019 | md_new_event(); |
9020 | |
/* Only run the throttling logic below once per 'window' sectors of IO. */
9021 | if (last_check + window > io_sectors || j == max_sectors) |
9022 | continue; |
9023 | |
9024 | last_check = io_sectors; |
9025 | repeat: |
9026 | if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { |
9027 | /* step marks */ |
9028 | int next = (last_mark+1) % SYNC_MARKS; |
9029 | |
9030 | mddev->resync_mark = mark[next]; |
9031 | mddev->resync_mark_cnt = mark_cnt[next]; |
9032 | mark[next] = jiffies; |
9033 | mark_cnt[next] = io_sectors - atomic_read(v: &mddev->recovery_active); |
9034 | last_mark = next; |
9035 | } |
9036 | |
9037 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
9038 | break; |
9039 | |
9040 | /* |
9041 | * this loop exits only if either when we are slower than |
9042 | * the 'hard' speed limit, or the system was IO-idle for |
9043 | * a jiffy. |
9044 | * the system might be non-idle CPU-wise, but we only care |
9045 | * about not overloading the IO subsystem. (things like an |
9046 | * e2fsck being done on the RAID array should execute fast) |
9047 | */ |
9048 | cond_resched(); |
9049 | |
/*
 * currspeed: completed sectors since the reference mark, halved, divided
 * by whole seconds elapsed since that mark; the "+1"s keep both the
 * divisor and the result non-zero.
 */
9050 | recovery_done = io_sectors - atomic_read(v: &mddev->recovery_active); |
9051 | currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 |
9052 | /((jiffies-mddev->resync_mark)/HZ +1) +1; |
9053 | |
9054 | if (currspeed > speed_min(mddev)) { |
9055 | if (currspeed > speed_max(mddev)) { |
9056 | msleep(msecs: 500); |
9057 | goto repeat; |
9058 | } |
9059 | if (!is_mddev_idle(mddev, init: 0)) { |
9060 | /* |
9061 | * Give other IO more of a chance. |
9062 | * The faster the devices, the less we wait. |
9063 | */ |
9064 | wait_event(mddev->recovery_wait, |
9065 | !atomic_read(&mddev->recovery_active)); |
9066 | } |
9067 | } |
9068 | } |
9069 | pr_info("md: %s: %s %s.\n" ,mdname(mddev), desc, |
9070 | test_bit(MD_RECOVERY_INTR, &mddev->recovery) |
9071 | ? "interrupted" : "done" ); |
9072 | /* |
9073 | * this also signals 'finished resyncing' to md_stop |
9074 | */ |
9075 | blk_finish_plug(&plug); |
9076 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); |
9077 | |
9078 | if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
9079 | !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && |
9080 | mddev->curr_resync >= MD_RESYNC_ACTIVE) { |
9081 | mddev->curr_resync_completed = mddev->curr_resync; |
9082 | sysfs_notify_dirent_safe(sd: mddev->sysfs_completed); |
9083 | } |
/* NOTE(review): final call at max_sectors; presumably lets the personality finish up - confirm. */
9084 | mddev->pers->sync_request(mddev, max_sectors, &skipped); |
9085 | |
/* Record how far we got: recovery_cp for a resync, recovery_offset otherwise. */
9086 | if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && |
9087 | mddev->curr_resync > MD_RESYNC_ACTIVE) { |
9088 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
9089 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
9090 | if (mddev->curr_resync >= mddev->recovery_cp) { |
9091 | pr_debug("md: checkpointing %s of %s.\n" , |
9092 | desc, mdname(mddev)); |
9093 | if (test_bit(MD_RECOVERY_ERROR, |
9094 | &mddev->recovery)) |
9095 | mddev->recovery_cp = |
9096 | mddev->curr_resync_completed; |
9097 | else |
9098 | mddev->recovery_cp = |
9099 | mddev->curr_resync; |
9100 | } |
9101 | } else |
9102 | mddev->recovery_cp = MaxSector; |
9103 | } else { |
9104 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
9105 | mddev->curr_resync = MaxSector; |
9106 | if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
9107 | test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { |
9108 | rcu_read_lock(); |
9109 | rdev_for_each_rcu(rdev, mddev) |
9110 | if (rdev->raid_disk >= 0 && |
9111 | mddev->delta_disks >= 0 && |
9112 | !test_bit(Journal, &rdev->flags) && |
9113 | !test_bit(Faulty, &rdev->flags) && |
9114 | !test_bit(In_sync, &rdev->flags) && |
9115 | rdev->recovery_offset < mddev->curr_resync) |
9116 | rdev->recovery_offset = mddev->curr_resync; |
9117 | rcu_read_unlock(); |
9118 | } |
9119 | } |
9120 | } |
9121 | skip: |
9122 | /* set CHANGE_PENDING here since maybe another update is needed, |
9123 | * so other nodes are informed. It should be harmless for normal |
9124 | * raid */ |
9125 | set_mask_bits(&mddev->sb_flags, 0, |
9126 | BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); |
9127 | |
/* After an uninterrupted reshape that added disks, refresh the array size. */
9128 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
9129 | !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && |
9130 | mddev->delta_disks > 0 && |
9131 | mddev->pers->finish_reshape && |
9132 | mddev->pers->size && |
9133 | mddev->queue) { |
9134 | mddev_lock_nointr(mddev); |
9135 | md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); |
9136 | mddev_unlock(mddev); |
9137 | if (!mddev_is_clustered(mddev)) |
9138 | set_capacity_and_notify(disk: mddev->gendisk, |
9139 | size: mddev->array_sectors); |
9140 | } |
9141 | |
9142 | spin_lock(lock: &mddev->lock); |
9143 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
9144 | /* We completed so min/max setting can be forgotten if used. */ |
9145 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) |
9146 | mddev->resync_min = 0; |
9147 | mddev->resync_max = MaxSector; |
9148 | } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) |
9149 | mddev->resync_min = mddev->curr_resync_completed; |
9150 | set_bit(nr: MD_RECOVERY_DONE, addr: &mddev->recovery); |
9151 | mddev->curr_resync = MD_RESYNC_NONE; |
9152 | spin_unlock(lock: &mddev->lock); |
9153 | |
9154 | wake_up(&resync_wait); |
9155 | md_wakeup_thread(mddev->thread); |
9156 | return; |
9157 | } |
9158 | EXPORT_SYMBOL_GPL(md_do_sync); |
9159 | |
9160 | static bool rdev_removeable(struct md_rdev *rdev) |
9161 | { |
9162 | /* rdev is not used. */ |
9163 | if (rdev->raid_disk < 0) |
9164 | return false; |
9165 | |
9166 | /* There are still inflight io, don't remove this rdev. */ |
9167 | if (atomic_read(v: &rdev->nr_pending)) |
9168 | return false; |
9169 | |
9170 | /* |
9171 | * An error occurred but has not yet been acknowledged by the metadata |
9172 | * handler, don't remove this rdev. |
9173 | */ |
9174 | if (test_bit(Blocked, &rdev->flags)) |
9175 | return false; |
9176 | |
9177 | /* Fautly rdev is not used, it's safe to remove it. */ |
9178 | if (test_bit(Faulty, &rdev->flags)) |
9179 | return true; |
9180 | |
9181 | /* Journal disk can only be removed if it's faulty. */ |
9182 | if (test_bit(Journal, &rdev->flags)) |
9183 | return false; |
9184 | |
9185 | /* |
9186 | * 'In_sync' is cleared while 'raid_disk' is valid, which means |
9187 | * replacement has just become active from pers->spare_active(), and |
9188 | * then pers->hot_remove_disk() will replace this rdev with replacement. |
9189 | */ |
9190 | if (!test_bit(In_sync, &rdev->flags)) |
9191 | return true; |
9192 | |
9193 | return false; |
9194 | } |
9195 | |
9196 | static bool rdev_is_spare(struct md_rdev *rdev) |
9197 | { |
9198 | return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && |
9199 | !test_bit(In_sync, &rdev->flags) && |
9200 | !test_bit(Journal, &rdev->flags) && |
9201 | !test_bit(Faulty, &rdev->flags); |
9202 | } |
9203 | |
9204 | static bool rdev_addable(struct md_rdev *rdev) |
9205 | { |
9206 | /* rdev is already used, don't add it again. */ |
9207 | if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || |
9208 | test_bit(Faulty, &rdev->flags)) |
9209 | return false; |
9210 | |
9211 | /* Allow to add journal disk. */ |
9212 | if (test_bit(Journal, &rdev->flags)) |
9213 | return true; |
9214 | |
9215 | /* Allow to add if array is read-write. */ |
9216 | if (md_is_rdwr(mddev: rdev->mddev)) |
9217 | return true; |
9218 | |
9219 | /* |
9220 | * For read-only array, only allow to readd a rdev. And if bitmap is |
9221 | * used, don't allow to readd a rdev that is too old. |
9222 | */ |
9223 | if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) |
9224 | return true; |
9225 | |
9226 | return false; |
9227 | } |
9228 | |
9229 | static bool md_spares_need_change(struct mddev *mddev) |
9230 | { |
9231 | struct md_rdev *rdev; |
9232 | |
9233 | rdev_for_each(rdev, mddev) |
9234 | if (rdev_removeable(rdev) || rdev_addable(rdev)) |
9235 | return true; |
9236 | return false; |
9237 | } |
9238 | |
9239 | static int remove_and_add_spares(struct mddev *mddev, |
9240 | struct md_rdev *this) |
9241 | { |
9242 | struct md_rdev *rdev; |
9243 | int spares = 0; |
9244 | int removed = 0; |
9245 | bool remove_some = false; |
9246 | |
9247 | if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
9248 | /* Mustn't remove devices when resync thread is running */ |
9249 | return 0; |
9250 | |
9251 | rdev_for_each(rdev, mddev) { |
9252 | if ((this == NULL || rdev == this) && |
9253 | rdev->raid_disk >= 0 && |
9254 | !test_bit(Blocked, &rdev->flags) && |
9255 | test_bit(Faulty, &rdev->flags) && |
9256 | atomic_read(v: &rdev->nr_pending)==0) { |
9257 | /* Faulty non-Blocked devices with nr_pending == 0 |
9258 | * never get nr_pending incremented, |
9259 | * never get Faulty cleared, and never get Blocked set. |
9260 | * So we can synchronize_rcu now rather than once per device |
9261 | */ |
9262 | remove_some = true; |
9263 | set_bit(nr: RemoveSynchronized, addr: &rdev->flags); |
9264 | } |
9265 | } |
9266 | |
9267 | if (remove_some) |
9268 | synchronize_rcu(); |
9269 | rdev_for_each(rdev, mddev) { |
9270 | if ((this == NULL || rdev == this) && |
9271 | (test_bit(RemoveSynchronized, &rdev->flags) || |
9272 | rdev_removeable(rdev))) { |
9273 | if (mddev->pers->hot_remove_disk( |
9274 | mddev, rdev) == 0) { |
9275 | sysfs_unlink_rdev(mddev, rdev); |
9276 | rdev->saved_raid_disk = rdev->raid_disk; |
9277 | rdev->raid_disk = -1; |
9278 | removed++; |
9279 | } |
9280 | } |
9281 | if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) |
9282 | clear_bit(nr: RemoveSynchronized, addr: &rdev->flags); |
9283 | } |
9284 | |
9285 | if (removed && mddev->kobj.sd) |
9286 | sysfs_notify_dirent_safe(sd: mddev->sysfs_degraded); |
9287 | |
9288 | if (this && removed) |
9289 | goto no_add; |
9290 | |
9291 | rdev_for_each(rdev, mddev) { |
9292 | if (this && this != rdev) |
9293 | continue; |
9294 | if (rdev_is_spare(rdev)) |
9295 | spares++; |
9296 | if (!rdev_addable(rdev)) |
9297 | continue; |
9298 | if (!test_bit(Journal, &rdev->flags)) |
9299 | rdev->recovery_offset = 0; |
9300 | if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { |
9301 | /* failure here is OK */ |
9302 | sysfs_link_rdev(mddev, rdev); |
9303 | if (!test_bit(Journal, &rdev->flags)) |
9304 | spares++; |
9305 | md_new_event(); |
9306 | set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags); |
9307 | } |
9308 | } |
9309 | no_add: |
9310 | if (removed) |
9311 | set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags); |
9312 | return spares; |
9313 | } |
9314 | |
9315 | static bool md_choose_sync_action(struct mddev *mddev, int *spares) |
9316 | { |
9317 | /* Check if reshape is in progress first. */ |
9318 | if (mddev->reshape_position != MaxSector) { |
9319 | if (mddev->pers->check_reshape == NULL || |
9320 | mddev->pers->check_reshape(mddev) != 0) |
9321 | return false; |
9322 | |
9323 | set_bit(nr: MD_RECOVERY_RESHAPE, addr: &mddev->recovery); |
9324 | clear_bit(nr: MD_RECOVERY_RECOVER, addr: &mddev->recovery); |
9325 | return true; |
9326 | } |
9327 | |
9328 | /* |
9329 | * Remove any failed drives, then add spares if possible. Spares are |
9330 | * also removed and re-added, to allow the personality to fail the |
9331 | * re-add. |
9332 | */ |
9333 | *spares = remove_and_add_spares(mddev, NULL); |
9334 | if (*spares) { |
9335 | clear_bit(nr: MD_RECOVERY_SYNC, addr: &mddev->recovery); |
9336 | clear_bit(nr: MD_RECOVERY_CHECK, addr: &mddev->recovery); |
9337 | clear_bit(nr: MD_RECOVERY_REQUESTED, addr: &mddev->recovery); |
9338 | |
9339 | /* Start new recovery. */ |
9340 | set_bit(nr: MD_RECOVERY_RECOVER, addr: &mddev->recovery); |
9341 | return true; |
9342 | } |
9343 | |
9344 | /* Check if recovery is in progress. */ |
9345 | if (mddev->recovery_cp < MaxSector) { |
9346 | set_bit(nr: MD_RECOVERY_SYNC, addr: &mddev->recovery); |
9347 | clear_bit(nr: MD_RECOVERY_RECOVER, addr: &mddev->recovery); |
9348 | return true; |
9349 | } |
9350 | |
9351 | /* Delay to choose resync/check/repair in md_do_sync(). */ |
9352 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) |
9353 | return true; |
9354 | |
9355 | /* Nothing to be done */ |
9356 | return false; |
9357 | } |
9358 | |
9359 | static void md_start_sync(struct work_struct *ws) |
9360 | { |
9361 | struct mddev *mddev = container_of(ws, struct mddev, sync_work); |
9362 | int spares = 0; |
9363 | bool suspend = false; |
9364 | |
9365 | if (md_spares_need_change(mddev)) |
9366 | suspend = true; |
9367 | |
9368 | suspend ? mddev_suspend_and_lock_nointr(mddev) : |
9369 | mddev_lock_nointr(mddev); |
9370 | |
9371 | if (!md_is_rdwr(mddev)) { |
9372 | /* |
9373 | * On a read-only array we can: |
9374 | * - remove failed devices |
9375 | * - add already-in_sync devices if the array itself is in-sync. |
9376 | * As we only add devices that are already in-sync, we can |
9377 | * activate the spares immediately. |
9378 | */ |
9379 | remove_and_add_spares(mddev, NULL); |
9380 | goto not_running; |
9381 | } |
9382 | |
9383 | if (!md_choose_sync_action(mddev, spares: &spares)) |
9384 | goto not_running; |
9385 | |
9386 | if (!mddev->pers->sync_request) |
9387 | goto not_running; |
9388 | |
9389 | /* |
9390 | * We are adding a device or devices to an array which has the bitmap |
9391 | * stored on all devices. So make sure all bitmap pages get written. |
9392 | */ |
9393 | if (spares) |
9394 | md_bitmap_write_all(bitmap: mddev->bitmap); |
9395 | |
9396 | rcu_assign_pointer(mddev->sync_thread, |
9397 | md_register_thread(md_do_sync, mddev, "resync" )); |
9398 | if (!mddev->sync_thread) { |
9399 | pr_warn("%s: could not start resync thread...\n" , |
9400 | mdname(mddev)); |
9401 | /* leave the spares where they are, it shouldn't hurt */ |
9402 | goto not_running; |
9403 | } |
9404 | |
9405 | suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); |
9406 | md_wakeup_thread(mddev->sync_thread); |
9407 | sysfs_notify_dirent_safe(sd: mddev->sysfs_action); |
9408 | md_new_event(); |
9409 | return; |
9410 | |
9411 | not_running: |
9412 | clear_bit(nr: MD_RECOVERY_SYNC, addr: &mddev->recovery); |
9413 | clear_bit(nr: MD_RECOVERY_RESHAPE, addr: &mddev->recovery); |
9414 | clear_bit(nr: MD_RECOVERY_REQUESTED, addr: &mddev->recovery); |
9415 | clear_bit(nr: MD_RECOVERY_CHECK, addr: &mddev->recovery); |
9416 | clear_bit(nr: MD_RECOVERY_RUNNING, addr: &mddev->recovery); |
9417 | suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); |
9418 | |
9419 | wake_up(&resync_wait); |
9420 | if (test_and_clear_bit(nr: MD_RECOVERY_RECOVER, addr: &mddev->recovery) && |
9421 | mddev->sysfs_action) |
9422 | sysfs_notify_dirent_safe(sd: mddev->sysfs_action); |
9423 | } |
9424 | |
9425 | /* |
9426 | * This routine is regularly called by all per-raid-array threads to |
9427 | * deal with generic issues like resync and super-block update. |
9428 | * Raid personalities that don't have a thread (linear/raid0) do not |
9429 | * need this as they never do any recovery or update the superblock. |
9430 | * |
9431 | * It does not do any resync itself, but rather "forks" off other threads |
9432 | * to do that as needed. |
9433 | * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in |
9434 | * "->recovery" and create a thread at ->sync_thread. |
9435 | * When the thread finishes it sets MD_RECOVERY_DONE |
9436 | * and wakeups up this thread which will reap the thread and finish up. |
9437 | * This thread also removes any faulty devices (with nr_pending == 0). |
9438 | * |
9439 | * The overall approach is: |
9440 | * 1/ if the superblock needs updating, update it. |
9441 | * 2/ If a recovery thread is running, don't do anything else. |
9442 | * 3/ If recovery has finished, clean up, possibly marking spares active. |
9443 | * 4/ If there are any faulty devices, remove them. |
9444 | * 5/ If array is degraded, try to add spares devices |
9445 | * 6/ If array has spares or is not in-sync, start a resync thread. |
9446 | */ |
9447 | void md_check_recovery(struct mddev *mddev) |
9448 | { |
9449 | if (READ_ONCE(mddev->suspended)) |
9450 | return; |
9451 | |
9452 | if (mddev->bitmap) |
9453 | md_bitmap_daemon_work(mddev); |
9454 | |
9455 | if (signal_pending(current)) { |
9456 | if (mddev->pers->sync_request && !mddev->external) { |
9457 | pr_debug("md: %s in immediate safe mode\n" , |
9458 | mdname(mddev)); |
9459 | mddev->safemode = 2; |
9460 | } |
9461 | flush_signals(current); |
9462 | } |
9463 | |
9464 | if (!md_is_rdwr(mddev) && |
9465 | !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) |
9466 | return; |
9467 | if ( ! ( |
9468 | (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || |
9469 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || |
9470 | test_bit(MD_RECOVERY_DONE, &mddev->recovery) || |
9471 | (mddev->external == 0 && mddev->safemode == 1) || |
9472 | (mddev->safemode == 2 |
9473 | && !mddev->in_sync && mddev->recovery_cp == MaxSector) |
9474 | )) |
9475 | return; |
9476 | |
9477 | if (mddev_trylock(mddev)) { |
9478 | bool try_set_sync = mddev->safemode != 0; |
9479 | |
9480 | if (!mddev->external && mddev->safemode == 1) |
9481 | mddev->safemode = 0; |
9482 | |
9483 | if (!md_is_rdwr(mddev)) { |
9484 | struct md_rdev *rdev; |
9485 | |
9486 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { |
9487 | /* sync_work already queued. */ |
9488 | clear_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
9489 | goto unlock; |
9490 | } |
9491 | |
9492 | if (!mddev->external && mddev->in_sync) |
9493 | /* |
9494 | * 'Blocked' flag not needed as failed devices |
9495 | * will be recorded if array switched to read/write. |
9496 | * Leaving it set will prevent the device |
9497 | * from being removed. |
9498 | */ |
9499 | rdev_for_each(rdev, mddev) |
9500 | clear_bit(nr: Blocked, addr: &rdev->flags); |
9501 | |
9502 | /* |
9503 | * There is no thread, but we need to call |
9504 | * ->spare_active and clear saved_raid_disk |
9505 | */ |
9506 | set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery); |
9507 | md_reap_sync_thread(mddev); |
9508 | |
9509 | /* |
9510 | * Let md_start_sync() to remove and add rdevs to the |
9511 | * array. |
9512 | */ |
9513 | if (md_spares_need_change(mddev)) { |
9514 | set_bit(nr: MD_RECOVERY_RUNNING, addr: &mddev->recovery); |
9515 | queue_work(wq: md_misc_wq, work: &mddev->sync_work); |
9516 | } |
9517 | |
9518 | clear_bit(nr: MD_RECOVERY_RECOVER, addr: &mddev->recovery); |
9519 | clear_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
9520 | clear_bit(nr: MD_SB_CHANGE_PENDING, addr: &mddev->sb_flags); |
9521 | |
9522 | goto unlock; |
9523 | } |
9524 | |
9525 | if (mddev_is_clustered(mddev)) { |
9526 | struct md_rdev *rdev, *tmp; |
9527 | /* kick the device if another node issued a |
9528 | * remove disk. |
9529 | */ |
9530 | rdev_for_each_safe(rdev, tmp, mddev) { |
9531 | if (test_and_clear_bit(nr: ClusterRemove, addr: &rdev->flags) && |
9532 | rdev->raid_disk < 0) |
9533 | md_kick_rdev_from_array(rdev); |
9534 | } |
9535 | } |
9536 | |
9537 | if (try_set_sync && !mddev->external && !mddev->in_sync) { |
9538 | spin_lock(lock: &mddev->lock); |
9539 | set_in_sync(mddev); |
9540 | spin_unlock(lock: &mddev->lock); |
9541 | } |
9542 | |
9543 | if (mddev->sb_flags) |
9544 | md_update_sb(mddev, 0); |
9545 | |
9546 | /* |
9547 | * Never start a new sync thread if MD_RECOVERY_RUNNING is |
9548 | * still set. |
9549 | */ |
9550 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { |
9551 | if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { |
9552 | /* resync/recovery still happening */ |
9553 | clear_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
9554 | goto unlock; |
9555 | } |
9556 | |
9557 | if (WARN_ON_ONCE(!mddev->sync_thread)) |
9558 | goto unlock; |
9559 | |
9560 | md_reap_sync_thread(mddev); |
9561 | goto unlock; |
9562 | } |
9563 | |
9564 | /* Set RUNNING before clearing NEEDED to avoid |
9565 | * any transients in the value of "sync_action". |
9566 | */ |
9567 | mddev->curr_resync_completed = 0; |
9568 | spin_lock(lock: &mddev->lock); |
9569 | set_bit(nr: MD_RECOVERY_RUNNING, addr: &mddev->recovery); |
9570 | spin_unlock(lock: &mddev->lock); |
9571 | /* Clear some bits that don't mean anything, but |
9572 | * might be left set |
9573 | */ |
9574 | clear_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery); |
9575 | clear_bit(nr: MD_RECOVERY_DONE, addr: &mddev->recovery); |
9576 | |
9577 | if (test_and_clear_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery) && |
9578 | !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { |
9579 | queue_work(wq: md_misc_wq, work: &mddev->sync_work); |
9580 | } else { |
9581 | clear_bit(nr: MD_RECOVERY_RUNNING, addr: &mddev->recovery); |
9582 | wake_up(&resync_wait); |
9583 | } |
9584 | |
9585 | unlock: |
9586 | wake_up(&mddev->sb_wait); |
9587 | mddev_unlock(mddev); |
9588 | } |
9589 | } |
9590 | EXPORT_SYMBOL(md_check_recovery); |
9591 | |
9592 | void md_reap_sync_thread(struct mddev *mddev) |
9593 | { |
9594 | struct md_rdev *rdev; |
9595 | sector_t old_dev_sectors = mddev->dev_sectors; |
9596 | bool is_reshaped = false; |
9597 | |
9598 | /* resync has finished, collect result */ |
9599 | md_unregister_thread(mddev, &mddev->sync_thread); |
9600 | atomic_inc(v: &mddev->sync_seq); |
9601 | |
9602 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && |
9603 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && |
9604 | mddev->degraded != mddev->raid_disks) { |
9605 | /* success...*/ |
9606 | /* activate any spares */ |
9607 | if (mddev->pers->spare_active(mddev)) { |
9608 | sysfs_notify_dirent_safe(sd: mddev->sysfs_degraded); |
9609 | set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags); |
9610 | } |
9611 | } |
9612 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
9613 | mddev->pers->finish_reshape) { |
9614 | mddev->pers->finish_reshape(mddev); |
9615 | if (mddev_is_clustered(mddev)) |
9616 | is_reshaped = true; |
9617 | } |
9618 | |
9619 | /* If array is no-longer degraded, then any saved_raid_disk |
9620 | * information must be scrapped. |
9621 | */ |
9622 | if (!mddev->degraded) |
9623 | rdev_for_each(rdev, mddev) |
9624 | rdev->saved_raid_disk = -1; |
9625 | |
9626 | md_update_sb(mddev, 1); |
9627 | /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can |
9628 | * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by |
9629 | * clustered raid */ |
9630 | if (test_and_clear_bit(nr: MD_CLUSTER_RESYNC_LOCKED, addr: &mddev->flags)) |
9631 | md_cluster_ops->resync_finish(mddev); |
9632 | clear_bit(nr: MD_RECOVERY_RUNNING, addr: &mddev->recovery); |
9633 | clear_bit(nr: MD_RECOVERY_DONE, addr: &mddev->recovery); |
9634 | clear_bit(nr: MD_RECOVERY_SYNC, addr: &mddev->recovery); |
9635 | clear_bit(nr: MD_RECOVERY_RESHAPE, addr: &mddev->recovery); |
9636 | clear_bit(nr: MD_RECOVERY_REQUESTED, addr: &mddev->recovery); |
9637 | clear_bit(nr: MD_RECOVERY_CHECK, addr: &mddev->recovery); |
9638 | /* |
9639 | * We call md_cluster_ops->update_size here because sync_size could |
9640 | * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, |
9641 | * so it is time to update size across cluster. |
9642 | */ |
9643 | if (mddev_is_clustered(mddev) && is_reshaped |
9644 | && !test_bit(MD_CLOSING, &mddev->flags)) |
9645 | md_cluster_ops->update_size(mddev, old_dev_sectors); |
9646 | /* flag recovery needed just to double check */ |
9647 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
9648 | sysfs_notify_dirent_safe(sd: mddev->sysfs_completed); |
9649 | sysfs_notify_dirent_safe(sd: mddev->sysfs_action); |
9650 | md_new_event(); |
9651 | if (mddev->event_work.func) |
9652 | queue_work(wq: md_misc_wq, work: &mddev->event_work); |
9653 | wake_up(&resync_wait); |
9654 | } |
9655 | EXPORT_SYMBOL(md_reap_sync_thread); |
9656 | |
9657 | void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) |
9658 | { |
9659 | sysfs_notify_dirent_safe(sd: rdev->sysfs_state); |
9660 | wait_event_timeout(rdev->blocked_wait, |
9661 | !test_bit(Blocked, &rdev->flags) && |
9662 | !test_bit(BlockedBadBlocks, &rdev->flags), |
9663 | msecs_to_jiffies(5000)); |
9664 | rdev_dec_pending(rdev, mddev); |
9665 | } |
9666 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); |
9667 | |
9668 | void md_finish_reshape(struct mddev *mddev) |
9669 | { |
9670 | /* called be personality module when reshape completes. */ |
9671 | struct md_rdev *rdev; |
9672 | |
9673 | rdev_for_each(rdev, mddev) { |
9674 | if (rdev->data_offset > rdev->new_data_offset) |
9675 | rdev->sectors += rdev->data_offset - rdev->new_data_offset; |
9676 | else |
9677 | rdev->sectors -= rdev->new_data_offset - rdev->data_offset; |
9678 | rdev->data_offset = rdev->new_data_offset; |
9679 | } |
9680 | } |
9681 | EXPORT_SYMBOL(md_finish_reshape); |
9682 | |
9683 | /* Bad block management */ |
9684 | |
9685 | /* Returns 1 on success, 0 on failure */ |
9686 | int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
9687 | int is_new) |
9688 | { |
9689 | struct mddev *mddev = rdev->mddev; |
9690 | int rv; |
9691 | if (is_new) |
9692 | s += rdev->new_data_offset; |
9693 | else |
9694 | s += rdev->data_offset; |
9695 | rv = badblocks_set(bb: &rdev->badblocks, s, sectors, acknowledged: 0); |
9696 | if (rv == 0) { |
9697 | /* Make sure they get written out promptly */ |
9698 | if (test_bit(ExternalBbl, &rdev->flags)) |
9699 | sysfs_notify_dirent_safe(sd: rdev->sysfs_unack_badblocks); |
9700 | sysfs_notify_dirent_safe(sd: rdev->sysfs_state); |
9701 | set_mask_bits(&mddev->sb_flags, 0, |
9702 | BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); |
9703 | md_wakeup_thread(rdev->mddev->thread); |
9704 | return 1; |
9705 | } else |
9706 | return 0; |
9707 | } |
9708 | EXPORT_SYMBOL_GPL(rdev_set_badblocks); |
9709 | |
9710 | int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
9711 | int is_new) |
9712 | { |
9713 | int rv; |
9714 | if (is_new) |
9715 | s += rdev->new_data_offset; |
9716 | else |
9717 | s += rdev->data_offset; |
9718 | rv = badblocks_clear(bb: &rdev->badblocks, s, sectors); |
9719 | if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) |
9720 | sysfs_notify_dirent_safe(sd: rdev->sysfs_badblocks); |
9721 | return rv; |
9722 | } |
9723 | EXPORT_SYMBOL_GPL(rdev_clear_badblocks); |
9724 | |
9725 | static int md_notify_reboot(struct notifier_block *this, |
9726 | unsigned long code, void *x) |
9727 | { |
9728 | struct mddev *mddev, *n; |
9729 | int need_delay = 0; |
9730 | |
9731 | spin_lock(lock: &all_mddevs_lock); |
9732 | list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { |
9733 | if (!mddev_get(mddev)) |
9734 | continue; |
9735 | spin_unlock(lock: &all_mddevs_lock); |
9736 | if (mddev_trylock(mddev)) { |
9737 | if (mddev->pers) |
9738 | __md_stop_writes(mddev); |
9739 | if (mddev->persistent) |
9740 | mddev->safemode = 2; |
9741 | mddev_unlock(mddev); |
9742 | } |
9743 | need_delay = 1; |
9744 | mddev_put(mddev); |
9745 | spin_lock(lock: &all_mddevs_lock); |
9746 | } |
9747 | spin_unlock(lock: &all_mddevs_lock); |
9748 | |
9749 | /* |
9750 | * certain more exotic SCSI devices are known to be |
9751 | * volatile wrt too early system reboots. While the |
9752 | * right place to handle this issue is the given |
9753 | * driver, we do want to have a safe RAID driver ... |
9754 | */ |
9755 | if (need_delay) |
9756 | msleep(msecs: 1000); |
9757 | |
9758 | return NOTIFY_DONE; |
9759 | } |
9760 | |
9761 | static struct notifier_block md_notifier = { |
9762 | .notifier_call = md_notify_reboot, |
9763 | .next = NULL, |
9764 | .priority = INT_MAX, /* before any real devices */ |
9765 | }; |
9766 | |
9767 | static void md_geninit(void) |
9768 | { |
9769 | pr_debug("md: sizeof(mdp_super_t) = %d\n" , (int)sizeof(mdp_super_t)); |
9770 | |
9771 | proc_create(name: "mdstat" , S_IRUGO, NULL, proc_ops: &mdstat_proc_ops); |
9772 | } |
9773 | |
9774 | static int __init md_init(void) |
9775 | { |
9776 | int ret = -ENOMEM; |
9777 | |
9778 | md_wq = alloc_workqueue(fmt: "md" , flags: WQ_MEM_RECLAIM, max_active: 0); |
9779 | if (!md_wq) |
9780 | goto err_wq; |
9781 | |
9782 | md_misc_wq = alloc_workqueue(fmt: "md_misc" , flags: 0, max_active: 0); |
9783 | if (!md_misc_wq) |
9784 | goto err_misc_wq; |
9785 | |
9786 | md_bitmap_wq = alloc_workqueue(fmt: "md_bitmap" , flags: WQ_MEM_RECLAIM | WQ_UNBOUND, |
9787 | max_active: 0); |
9788 | if (!md_bitmap_wq) |
9789 | goto err_bitmap_wq; |
9790 | |
9791 | ret = __register_blkdev(MD_MAJOR, name: "md" , probe: md_probe); |
9792 | if (ret < 0) |
9793 | goto err_md; |
9794 | |
9795 | ret = __register_blkdev(major: 0, name: "mdp" , probe: md_probe); |
9796 | if (ret < 0) |
9797 | goto err_mdp; |
9798 | mdp_major = ret; |
9799 | |
9800 | register_reboot_notifier(&md_notifier); |
9801 | raid_table_header = register_sysctl("dev/raid" , raid_table); |
9802 | |
9803 | md_geninit(); |
9804 | return 0; |
9805 | |
9806 | err_mdp: |
9807 | unregister_blkdev(MD_MAJOR, name: "md" ); |
9808 | err_md: |
9809 | destroy_workqueue(wq: md_bitmap_wq); |
9810 | err_bitmap_wq: |
9811 | destroy_workqueue(wq: md_misc_wq); |
9812 | err_misc_wq: |
9813 | destroy_workqueue(wq: md_wq); |
9814 | err_wq: |
9815 | return ret; |
9816 | } |
9817 | |
9818 | static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) |
9819 | { |
9820 | struct mdp_superblock_1 *sb = page_address(rdev->sb_page); |
9821 | struct md_rdev *rdev2, *tmp; |
9822 | int role, ret; |
9823 | |
9824 | /* |
9825 | * If size is changed in another node then we need to |
9826 | * do resize as well. |
9827 | */ |
9828 | if (mddev->dev_sectors != le64_to_cpu(sb->size)) { |
9829 | ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); |
9830 | if (ret) |
9831 | pr_info("md-cluster: resize failed\n" ); |
9832 | else |
9833 | md_bitmap_update_sb(bitmap: mddev->bitmap); |
9834 | } |
9835 | |
9836 | /* Check for change of roles in the active devices */ |
9837 | rdev_for_each_safe(rdev2, tmp, mddev) { |
9838 | if (test_bit(Faulty, &rdev2->flags)) |
9839 | continue; |
9840 | |
9841 | /* Check if the roles changed */ |
9842 | role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); |
9843 | |
9844 | if (test_bit(Candidate, &rdev2->flags)) { |
9845 | if (role == MD_DISK_ROLE_FAULTY) { |
9846 | pr_info("md: Removing Candidate device %pg because add failed\n" , |
9847 | rdev2->bdev); |
9848 | md_kick_rdev_from_array(rdev: rdev2); |
9849 | continue; |
9850 | } |
9851 | else |
9852 | clear_bit(nr: Candidate, addr: &rdev2->flags); |
9853 | } |
9854 | |
9855 | if (role != rdev2->raid_disk) { |
9856 | /* |
9857 | * got activated except reshape is happening. |
9858 | */ |
9859 | if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && |
9860 | !(le32_to_cpu(sb->feature_map) & |
9861 | MD_FEATURE_RESHAPE_ACTIVE)) { |
9862 | rdev2->saved_raid_disk = role; |
9863 | ret = remove_and_add_spares(mddev, this: rdev2); |
9864 | pr_info("Activated spare: %pg\n" , |
9865 | rdev2->bdev); |
9866 | /* wakeup mddev->thread here, so array could |
9867 | * perform resync with the new activated disk */ |
9868 | set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery); |
9869 | md_wakeup_thread(mddev->thread); |
9870 | } |
9871 | /* device faulty |
9872 | * We just want to do the minimum to mark the disk |
9873 | * as faulty. The recovery is performed by the |
9874 | * one who initiated the error. |
9875 | */ |
9876 | if (role == MD_DISK_ROLE_FAULTY || |
9877 | role == MD_DISK_ROLE_JOURNAL) { |
9878 | md_error(mddev, rdev2); |
9879 | clear_bit(nr: Blocked, addr: &rdev2->flags); |
9880 | } |
9881 | } |
9882 | } |
9883 | |
9884 | if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) { |
9885 | ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); |
9886 | if (ret) |
9887 | pr_warn("md: updating array disks failed. %d\n" , ret); |
9888 | } |
9889 | |
9890 | /* |
9891 | * Since mddev->delta_disks has already updated in update_raid_disks, |
9892 | * so it is time to check reshape. |
9893 | */ |
9894 | if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && |
9895 | (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { |
9896 | /* |
9897 | * reshape is happening in the remote node, we need to |
9898 | * update reshape_position and call start_reshape. |
9899 | */ |
9900 | mddev->reshape_position = le64_to_cpu(sb->reshape_position); |
9901 | if (mddev->pers->update_reshape_pos) |
9902 | mddev->pers->update_reshape_pos(mddev); |
9903 | if (mddev->pers->start_reshape) |
9904 | mddev->pers->start_reshape(mddev); |
9905 | } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && |
9906 | mddev->reshape_position != MaxSector && |
9907 | !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { |
9908 | /* reshape is just done in another node. */ |
9909 | mddev->reshape_position = MaxSector; |
9910 | if (mddev->pers->update_reshape_pos) |
9911 | mddev->pers->update_reshape_pos(mddev); |
9912 | } |
9913 | |
9914 | /* Finally set the event to be up to date */ |
9915 | mddev->events = le64_to_cpu(sb->events); |
9916 | } |
9917 | |
9918 | static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) |
9919 | { |
9920 | int err; |
9921 | struct page *swapout = rdev->sb_page; |
9922 | struct mdp_superblock_1 *sb; |
9923 | |
9924 | /* Store the sb page of the rdev in the swapout temporary |
9925 | * variable in case we err in the future |
9926 | */ |
9927 | rdev->sb_page = NULL; |
9928 | err = alloc_disk_sb(rdev); |
9929 | if (err == 0) { |
9930 | ClearPageUptodate(page: rdev->sb_page); |
9931 | rdev->sb_loaded = 0; |
9932 | err = super_types[mddev->major_version]. |
9933 | load_super(rdev, NULL, mddev->minor_version); |
9934 | } |
9935 | if (err < 0) { |
9936 | pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n" , |
9937 | __func__, __LINE__, rdev->desc_nr, err); |
9938 | if (rdev->sb_page) |
9939 | put_page(page: rdev->sb_page); |
9940 | rdev->sb_page = swapout; |
9941 | rdev->sb_loaded = 1; |
9942 | return err; |
9943 | } |
9944 | |
9945 | sb = page_address(rdev->sb_page); |
9946 | /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET |
9947 | * is not set |
9948 | */ |
9949 | |
9950 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) |
9951 | rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); |
9952 | |
9953 | /* The other node finished recovery, call spare_active to set |
9954 | * device In_sync and mddev->degraded |
9955 | */ |
9956 | if (rdev->recovery_offset == MaxSector && |
9957 | !test_bit(In_sync, &rdev->flags) && |
9958 | mddev->pers->spare_active(mddev)) |
9959 | sysfs_notify_dirent_safe(sd: mddev->sysfs_degraded); |
9960 | |
9961 | put_page(page: swapout); |
9962 | return 0; |
9963 | } |
9964 | |
9965 | void md_reload_sb(struct mddev *mddev, int nr) |
9966 | { |
9967 | struct md_rdev *rdev = NULL, *iter; |
9968 | int err; |
9969 | |
9970 | /* Find the rdev */ |
9971 | rdev_for_each_rcu(iter, mddev) { |
9972 | if (iter->desc_nr == nr) { |
9973 | rdev = iter; |
9974 | break; |
9975 | } |
9976 | } |
9977 | |
9978 | if (!rdev) { |
9979 | pr_warn("%s: %d Could not find rdev with nr %d\n" , __func__, __LINE__, nr); |
9980 | return; |
9981 | } |
9982 | |
9983 | err = read_rdev(mddev, rdev); |
9984 | if (err < 0) |
9985 | return; |
9986 | |
9987 | check_sb_changes(mddev, rdev); |
9988 | |
9989 | /* Read all rdev's to update recovery_offset */ |
9990 | rdev_for_each_rcu(rdev, mddev) { |
9991 | if (!test_bit(Faulty, &rdev->flags)) |
9992 | read_rdev(mddev, rdev); |
9993 | } |
9994 | } |
9995 | EXPORT_SYMBOL(md_reload_sb); |
9996 | |
9997 | #ifndef MODULE |
9998 | |
9999 | /* |
10000 | * Searches all registered partitions for autorun RAID arrays |
10001 | * at boot time. |
10002 | */ |
10003 | |
10004 | static DEFINE_MUTEX(detected_devices_mutex); |
10005 | static LIST_HEAD(all_detected_devices); |
10006 | struct detected_devices_node { |
10007 | struct list_head list; |
10008 | dev_t dev; |
10009 | }; |
10010 | |
10011 | void md_autodetect_dev(dev_t dev) |
10012 | { |
10013 | struct detected_devices_node *node_detected_dev; |
10014 | |
10015 | node_detected_dev = kzalloc(size: sizeof(*node_detected_dev), GFP_KERNEL); |
10016 | if (node_detected_dev) { |
10017 | node_detected_dev->dev = dev; |
10018 | mutex_lock(&detected_devices_mutex); |
10019 | list_add_tail(new: &node_detected_dev->list, head: &all_detected_devices); |
10020 | mutex_unlock(lock: &detected_devices_mutex); |
10021 | } |
10022 | } |
10023 | |
10024 | void md_autostart_arrays(int part) |
10025 | { |
10026 | struct md_rdev *rdev; |
10027 | struct detected_devices_node *node_detected_dev; |
10028 | dev_t dev; |
10029 | int i_scanned, i_passed; |
10030 | |
10031 | i_scanned = 0; |
10032 | i_passed = 0; |
10033 | |
10034 | pr_info("md: Autodetecting RAID arrays.\n" ); |
10035 | |
10036 | mutex_lock(&detected_devices_mutex); |
10037 | while (!list_empty(head: &all_detected_devices) && i_scanned < INT_MAX) { |
10038 | i_scanned++; |
10039 | node_detected_dev = list_entry(all_detected_devices.next, |
10040 | struct detected_devices_node, list); |
10041 | list_del(entry: &node_detected_dev->list); |
10042 | dev = node_detected_dev->dev; |
10043 | kfree(objp: node_detected_dev); |
10044 | mutex_unlock(lock: &detected_devices_mutex); |
10045 | rdev = md_import_device(newdev: dev,super_format: 0, super_minor: 90); |
10046 | mutex_lock(&detected_devices_mutex); |
10047 | if (IS_ERR(ptr: rdev)) |
10048 | continue; |
10049 | |
10050 | if (test_bit(Faulty, &rdev->flags)) |
10051 | continue; |
10052 | |
10053 | set_bit(nr: AutoDetected, addr: &rdev->flags); |
10054 | list_add(new: &rdev->same_set, head: &pending_raid_disks); |
10055 | i_passed++; |
10056 | } |
10057 | mutex_unlock(lock: &detected_devices_mutex); |
10058 | |
10059 | pr_debug("md: Scanned %d and added %d devices.\n" , i_scanned, i_passed); |
10060 | |
10061 | autorun_devices(part); |
10062 | } |
10063 | |
10064 | #endif /* !MODULE */ |
10065 | |
10066 | static __exit void md_exit(void) |
10067 | { |
10068 | struct mddev *mddev, *n; |
10069 | int delay = 1; |
10070 | |
10071 | unregister_blkdev(MD_MAJOR,name: "md" ); |
10072 | unregister_blkdev(major: mdp_major, name: "mdp" ); |
10073 | unregister_reboot_notifier(&md_notifier); |
10074 | unregister_sysctl_table(table: raid_table_header); |
10075 | |
10076 | /* We cannot unload the modules while some process is |
10077 | * waiting for us in select() or poll() - wake them up |
10078 | */ |
10079 | md_unloading = 1; |
10080 | while (waitqueue_active(wq_head: &md_event_waiters)) { |
10081 | /* not safe to leave yet */ |
10082 | wake_up(&md_event_waiters); |
10083 | msleep(msecs: delay); |
10084 | delay += delay; |
10085 | } |
10086 | remove_proc_entry("mdstat" , NULL); |
10087 | |
10088 | spin_lock(lock: &all_mddevs_lock); |
10089 | list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { |
10090 | if (!mddev_get(mddev)) |
10091 | continue; |
10092 | spin_unlock(lock: &all_mddevs_lock); |
10093 | export_array(mddev); |
10094 | mddev->ctime = 0; |
10095 | mddev->hold_active = 0; |
10096 | /* |
10097 | * As the mddev is now fully clear, mddev_put will schedule |
10098 | * the mddev for destruction by a workqueue, and the |
10099 | * destroy_workqueue() below will wait for that to complete. |
10100 | */ |
10101 | mddev_put(mddev); |
10102 | spin_lock(lock: &all_mddevs_lock); |
10103 | } |
10104 | spin_unlock(lock: &all_mddevs_lock); |
10105 | |
10106 | destroy_workqueue(wq: md_misc_wq); |
10107 | destroy_workqueue(wq: md_bitmap_wq); |
10108 | destroy_workqueue(wq: md_wq); |
10109 | } |
10110 | |
/* Hook md_init() in as a subsys initcall and md_exit() as the module exit. */
subsys_initcall(md_init);
module_exit(md_exit)
10113 | |
10114 | static int get_ro(char *buffer, const struct kernel_param *kp) |
10115 | { |
10116 | return sprintf(buf: buffer, fmt: "%d\n" , start_readonly); |
10117 | } |
10118 | static int set_ro(const char *val, const struct kernel_param *kp) |
10119 | { |
10120 | return kstrtouint(s: val, base: 10, res: (unsigned int *)&start_readonly); |
10121 | } |
10122 | |
/* "start_ro" is read and written through the get_ro()/set_ro() handlers. */
module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
/* "new_array" only has a set handler (add_named_array); no get handler. */
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
10132 | |