1/*
2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5 completely rewritten, based on the MD driver code from Marc Zyngier
6
7 Changes:
8
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13 - kmod support by: Cyrus Durgin
14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17 - lots of fixes and improvements to the RAID1/RAID5 and generic
18 RAID code (such as request based resynchronization):
19
20 Neil Brown <neilb@cse.unsw.edu.au>.
21
22 - persistent bitmap code
23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
25 This program is free software; you can redistribute it and/or modify
26 it under the terms of the GNU General Public License as published by
27 the Free Software Foundation; either version 2, or (at your option)
28 any later version.
29
30 You should have received a copy of the GNU General Public License
31 (for example /usr/src/linux/COPYING); if not, write to the Free
32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33
34 Errors, Warnings, etc.
35 Please use:
36 pr_crit() for error conditions that risk data loss
37 pr_err() for error conditions that are unexpected, like an IO error
38 or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
43 pr_debug() for everything else.
44
45*/
46
47#include <linux/sched/signal.h>
48#include <linux/kthread.h>
49#include <linux/blkdev.h>
50#include <linux/badblocks.h>
51#include <linux/sysctl.h>
52#include <linux/seq_file.h>
53#include <linux/fs.h>
54#include <linux/poll.h>
55#include <linux/ctype.h>
56#include <linux/string.h>
57#include <linux/hdreg.h>
58#include <linux/proc_fs.h>
59#include <linux/random.h>
60#include <linux/module.h>
61#include <linux/reboot.h>
62#include <linux/file.h>
63#include <linux/compat.h>
64#include <linux/delay.h>
65#include <linux/raid/md_p.h>
66#include <linux/raid/md_u.h>
67#include <linux/slab.h>
68#include <linux/percpu-refcount.h>
69
70#include <trace/events/block.h>
71#include "md.h"
72#include "md-bitmap.h"
73#include "md-cluster.h"
74
75#ifndef MODULE
76static void autostart_arrays(int part);
77#endif
78
79/* pers_list is a list of registered personalities protected
80 * by pers_lock.
81 * pers_lock does extra service to protect accesses to
82 * mddev->thread when the mutex cannot be held.
83 */
84static LIST_HEAD(pers_list);
85static DEFINE_SPINLOCK(pers_lock);
86
87static struct kobj_type md_ktype;
88
89struct md_cluster_operations *md_cluster_ops;
90EXPORT_SYMBOL(md_cluster_ops);
91struct module *md_cluster_mod;
92EXPORT_SYMBOL(md_cluster_mod);
93
94static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
95static struct workqueue_struct *md_wq;
96static struct workqueue_struct *md_misc_wq;
97
98static int remove_and_add_spares(struct mddev *mddev,
99 struct md_rdev *this);
100static void mddev_detach(struct mddev *mddev);
101
102/*
103 * Default number of read corrections we'll attempt on an rdev
104 * before ejecting it from the array. We divide the read error
105 * count by 2 for every hour elapsed between read errors.
106 */
107#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
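
/*
 * Worked example of the policy above (added for clarity): with the default
 * of 20, an rdev that has accumulated 20 corrected read errors is at the
 * ejection threshold, but if two quiet hours pass before the next error the
 * stored count is decayed to 20 / 2 / 2 = 5.
 */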
108/*
109 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
110 * is 1000 KB/sec, so the extra system load does not show up that much.
111 * Increase it if you want to have more _guaranteed_ speed. Note that
112 * the RAID driver will use the maximum available bandwidth if the IO
113 * subsystem is idle. There is also an 'absolute maximum' reconstruction
114 * speed limit - in case reconstruction slows down your system despite
115 * idle IO detection.
116 *
117 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
118 * or /sys/block/mdX/md/sync_speed_{min,max}
119 */
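/*
 * Illustrative tuning from userspace (values are examples; limits are in
 * KB/sec as noted above, and "md0" is a placeholder array name):
 *
 *	echo 50000  > /proc/sys/dev/raid/speed_limit_min
 *	echo 100000 > /sys/block/md0/md/sync_speed_max
 */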
120
121static int sysctl_speed_limit_min = 1000;
122static int sysctl_speed_limit_max = 200000;
123static inline int speed_min(struct mddev *mddev)
124{
125 return mddev->sync_speed_min ?
126 mddev->sync_speed_min : sysctl_speed_limit_min;
127}
128
129static inline int speed_max(struct mddev *mddev)
130{
131 return mddev->sync_speed_max ?
132 mddev->sync_speed_max : sysctl_speed_limit_max;
133}
134
135static void * flush_info_alloc(gfp_t gfp_flags, void *data)
136{
137 return kzalloc(sizeof(struct flush_info), gfp_flags);
138}
139static void flush_info_free(void *flush_info, void *data)
140{
141 kfree(flush_info);
142}
143
144static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
145{
146 return kzalloc(sizeof(struct flush_bio), gfp_flags);
147}
148static void flush_bio_free(void *flush_bio, void *data)
149{
150 kfree(flush_bio);
151}
152
153static struct ctl_table_header *raid_table_header;
154
155static struct ctl_table raid_table[] = {
156 {
157 .procname = "speed_limit_min",
158 .data = &sysctl_speed_limit_min,
159 .maxlen = sizeof(int),
160 .mode = S_IRUGO|S_IWUSR,
161 .proc_handler = proc_dointvec,
162 },
163 {
164 .procname = "speed_limit_max",
165 .data = &sysctl_speed_limit_max,
166 .maxlen = sizeof(int),
167 .mode = S_IRUGO|S_IWUSR,
168 .proc_handler = proc_dointvec,
169 },
170 { }
171};
172
173static struct ctl_table raid_dir_table[] = {
174 {
175 .procname = "raid",
176 .maxlen = 0,
177 .mode = S_IRUGO|S_IXUGO,
178 .child = raid_table,
179 },
180 { }
181};
182
183static struct ctl_table raid_root_table[] = {
184 {
185 .procname = "dev",
186 .maxlen = 0,
187 .mode = 0555,
188 .child = raid_dir_table,
189 },
190 { }
191};
192
193static const struct block_device_operations md_fops;
194
195static int start_readonly;
196
197/*
198 * The original mechanism for creating an md device is to create
199 * a device node in /dev and to open it. This causes races with device-close.
200 * The preferred method is to write to the "new_array" module parameter.
201 * This can avoid races.
202 * Setting create_on_open to false disables the original mechanism
203 * so all the races disappear.
204 */
205static bool create_on_open = true;
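
/*
 * Illustrative usage of the preferred mechanism (hypothetical array name):
 *
 *	echo md_test > /sys/module/md_mod/parameters/new_array
 *
 * creates the array before any open(), avoiding the device-close race.
 */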
206
207struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
208 struct mddev *mddev)
209{
210 if (!mddev || !bioset_initialized(&mddev->bio_set))
211 return bio_alloc(gfp_mask, nr_iovecs);
212
213 return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
214}
215EXPORT_SYMBOL_GPL(bio_alloc_mddev);
216
217static struct bio *md_bio_alloc_sync(struct mddev *mddev)
218{
219 if (!mddev || !bioset_initialized(&mddev->sync_set))
220 return bio_alloc(GFP_NOIO, 1);
221
222 return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
223}
224
225/*
226 * We have a system wide 'event count' that is incremented
227 * on any 'interesting' event, and readers of /proc/mdstat
228 * can use 'poll' or 'select' to find out when the event
229 * count increases.
230 *
231 * Events are:
232 * start array, stop array, error, add device, remove device,
233 * start build, activate spare
234 */
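/*
 * Sketch of the userspace side (illustrative, not taken from any tool):
 * open /proc/mdstat, read it once to consume the current state, then
 * poll() blocks until md_event_count moves and the file can be re-read:
 *
 *	char buf[4096];
 *	int fd = open("/proc/mdstat", O_RDONLY);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };
 *
 *	read(fd, buf, sizeof(buf));
 *	poll(&pfd, 1, -1);
 *	lseek(fd, 0, SEEK_SET);
 *	read(fd, buf, sizeof(buf));
 */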
235static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
236static atomic_t md_event_count;
237void md_new_event(struct mddev *mddev)
238{
239 atomic_inc(&md_event_count);
240 wake_up(&md_event_waiters);
241}
242EXPORT_SYMBOL_GPL(md_new_event);
243
244/*
 * Enables iteration over all existing md arrays.
246 * all_mddevs_lock protects this list.
247 */
248static LIST_HEAD(all_mddevs);
249static DEFINE_SPINLOCK(all_mddevs_lock);
250
251/*
252 * iterates through all used mddevs in the system.
253 * We take care to grab the all_mddevs_lock whenever navigating
254 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put() it.
257 */
258#define for_each_mddev(_mddev,_tmp) \
259 \
260 for (({ spin_lock(&all_mddevs_lock); \
261 _tmp = all_mddevs.next; \
262 _mddev = NULL;}); \
263 ({ if (_tmp != &all_mddevs) \
264 mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
265 spin_unlock(&all_mddevs_lock); \
266 if (_mddev) mddev_put(_mddev); \
267 _mddev = list_entry(_tmp, struct mddev, all_mddevs); \
268 _tmp != &all_mddevs;}); \
269 ({ spin_lock(&all_mddevs_lock); \
270 _tmp = _tmp->next;}) \
271 )
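
/*
 * Typical use of the iterator (illustrative sketch; wants_attention() and
 * handle() are hypothetical helpers): the body runs with a reference held,
 * and a caller that breaks out early owns that reference and must drop it:
 *
 *	struct mddev *mddev;
 *	struct list_head *tmp;
 *
 *	for_each_mddev(mddev, tmp) {
 *		if (wants_attention(mddev)) {
 *			handle(mddev);
 *			mddev_put(mddev);
 *			break;
 *		}
 *	}
 */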
272
273/* Rather than calling directly into the personality make_request function,
274 * IO requests come here first so that we can check if the device is
275 * being suspended pending a reconfiguration.
276 * We hold a refcount over the call to ->make_request. By the time that
277 * call has finished, the bio has been linked into some internal structure
278 * and so is visible to ->quiesce(), so we don't need the refcount any more.
279 */
280static bool is_suspended(struct mddev *mddev, struct bio *bio)
281{
282 if (mddev->suspended)
283 return true;
284 if (bio_data_dir(bio) != WRITE)
285 return false;
286 if (mddev->suspend_lo >= mddev->suspend_hi)
287 return false;
288 if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
289 return false;
290 if (bio_end_sector(bio) < mddev->suspend_lo)
291 return false;
292 return true;
293}
294
295void md_handle_request(struct mddev *mddev, struct bio *bio)
296{
297check_suspended:
298 rcu_read_lock();
299 if (is_suspended(mddev, bio)) {
300 DEFINE_WAIT(__wait);
301 for (;;) {
302 prepare_to_wait(&mddev->sb_wait, &__wait,
303 TASK_UNINTERRUPTIBLE);
304 if (!is_suspended(mddev, bio))
305 break;
306 rcu_read_unlock();
307 schedule();
308 rcu_read_lock();
309 }
310 finish_wait(&mddev->sb_wait, &__wait);
311 }
312 atomic_inc(&mddev->active_io);
313 rcu_read_unlock();
314
315 if (!mddev->pers->make_request(mddev, bio)) {
316 atomic_dec(&mddev->active_io);
317 wake_up(&mddev->sb_wait);
318 goto check_suspended;
319 }
320
321 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
322 wake_up(&mddev->sb_wait);
323}
324EXPORT_SYMBOL(md_handle_request);
325
326static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
327{
328 const int rw = bio_data_dir(bio);
329 const int sgrp = op_stat_group(bio_op(bio));
330 struct mddev *mddev = q->queuedata;
331 unsigned int sectors;
332
333 blk_queue_split(q, &bio);
334
335 if (mddev == NULL || mddev->pers == NULL) {
336 bio_io_error(bio);
337 return BLK_QC_T_NONE;
338 }
339 if (mddev->ro == 1 && unlikely(rw == WRITE)) {
340 if (bio_sectors(bio) != 0)
341 bio->bi_status = BLK_STS_IOERR;
342 bio_endio(bio);
343 return BLK_QC_T_NONE;
344 }
345
346 /*
347 * save the sectors now since our bio can
348 * go away inside make_request
349 */
350 sectors = bio_sectors(bio);
351 /* bio could be mergeable after passing to underlayer */
352 bio->bi_opf &= ~REQ_NOMERGE;
353
354 md_handle_request(mddev, bio);
355
356 part_stat_lock();
357 part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
358 part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
359 part_stat_unlock();
360
361 return BLK_QC_T_NONE;
362}
363
364/* mddev_suspend makes sure no new requests are submitted
365 * to the device, and that any requests that have been submitted
366 * are completely handled.
367 * Once mddev_detach() is called and completes, the module will be
368 * completely unused.
369 */
370void mddev_suspend(struct mddev *mddev)
371{
372 WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
373 lockdep_assert_held(&mddev->reconfig_mutex);
374 if (mddev->suspended++)
375 return;
376 synchronize_rcu();
377 wake_up(&mddev->sb_wait);
378 set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
379 smp_mb__after_atomic();
380 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
381 mddev->pers->quiesce(mddev, 1);
382 clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
383 wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
384
385 del_timer_sync(&mddev->safemode_timer);
386}
387EXPORT_SYMBOL_GPL(mddev_suspend);
388
389void mddev_resume(struct mddev *mddev)
390{
391 lockdep_assert_held(&mddev->reconfig_mutex);
392 if (--mddev->suspended)
393 return;
394 wake_up(&mddev->sb_wait);
395 mddev->pers->quiesce(mddev, 0);
396
397 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
398 md_wakeup_thread(mddev->thread);
399 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
400}
401EXPORT_SYMBOL_GPL(mddev_resume);
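
/*
 * Illustrative pairing (sketch, not a real caller): both calls must be made
 * under reconfig_mutex, and suspends nest, so every mddev_suspend() needs a
 * matching mddev_resume():
 *
 *	mddev_suspend(mddev);
 *	... reconfigure while no new IO can enter the personality ...
 *	mddev_resume(mddev);
 */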
402
403int mddev_congested(struct mddev *mddev, int bits)
404{
405 struct md_personality *pers = mddev->pers;
406 int ret = 0;
407
408 rcu_read_lock();
409 if (mddev->suspended)
410 ret = 1;
411 else if (pers && pers->congested)
412 ret = pers->congested(mddev, bits);
413 rcu_read_unlock();
414 return ret;
415}
416EXPORT_SYMBOL_GPL(mddev_congested);
417static int md_congested(void *data, int bits)
418{
419 struct mddev *mddev = data;
420 return mddev_congested(mddev, bits);
421}
422
423/*
424 * Generic flush handling for md
425 */
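/*
 * Overview (added for clarity): md_flush_request() below sends an empty
 * REQ_PREFLUSH bio to every active, non-faulty member device.  When the
 * last of those completes, md_end_flush() either ends an empty barrier
 * directly or queues submit_flushes(), which strips REQ_PREFLUSH from the
 * original bio and resubmits it so the data portion is handled normally.
 */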
426static void submit_flushes(struct work_struct *ws)
427{
428 struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
429 struct mddev *mddev = fi->mddev;
430 struct bio *bio = fi->bio;
431
432 bio->bi_opf &= ~REQ_PREFLUSH;
433 md_handle_request(mddev, bio);
434
435 mempool_free(fi, mddev->flush_pool);
436}
437
438static void md_end_flush(struct bio *fbio)
439{
440 struct flush_bio *fb = fbio->bi_private;
441 struct md_rdev *rdev = fb->rdev;
442 struct flush_info *fi = fb->fi;
443 struct bio *bio = fi->bio;
444 struct mddev *mddev = fi->mddev;
445
446 rdev_dec_pending(rdev, mddev);
447
448 if (atomic_dec_and_test(&fi->flush_pending)) {
449 if (bio->bi_iter.bi_size == 0) {
450 /* an empty barrier - all done */
451 bio_endio(bio);
452 mempool_free(fi, mddev->flush_pool);
453 } else {
454 INIT_WORK(&fi->flush_work, submit_flushes);
455 queue_work(md_wq, &fi->flush_work);
456 }
457 }
458
459 mempool_free(fb, mddev->flush_bio_pool);
460 bio_put(fbio);
461}
462
463void md_flush_request(struct mddev *mddev, struct bio *bio)
464{
465 struct md_rdev *rdev;
466 struct flush_info *fi;
467
468 fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
469
470 fi->bio = bio;
471 fi->mddev = mddev;
472 atomic_set(&fi->flush_pending, 1);
473
474 rcu_read_lock();
475 rdev_for_each_rcu(rdev, mddev)
476 if (rdev->raid_disk >= 0 &&
477 !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when the request finishes, one after
			 * we re-take rcu_read_lock
			 */
482 struct bio *bi;
483 struct flush_bio *fb;
484 atomic_inc(&rdev->nr_pending);
485 atomic_inc(&rdev->nr_pending);
486 rcu_read_unlock();
487
488 fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
489 fb->fi = fi;
490 fb->rdev = rdev;
491
492 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
493 bio_set_dev(bi, rdev->bdev);
494 bi->bi_end_io = md_end_flush;
495 bi->bi_private = fb;
496 bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
497
498 atomic_inc(&fi->flush_pending);
499 submit_bio(bi);
500
501 rcu_read_lock();
502 rdev_dec_pending(rdev, mddev);
503 }
504 rcu_read_unlock();
505
506 if (atomic_dec_and_test(&fi->flush_pending)) {
507 if (bio->bi_iter.bi_size == 0) {
508 /* an empty barrier - all done */
509 bio_endio(bio);
510 mempool_free(fi, mddev->flush_pool);
511 } else {
512 INIT_WORK(&fi->flush_work, submit_flushes);
513 queue_work(md_wq, &fi->flush_work);
514 }
515 }
516}
517EXPORT_SYMBOL(md_flush_request);
518
519static inline struct mddev *mddev_get(struct mddev *mddev)
520{
521 atomic_inc(&mddev->active);
522 return mddev;
523}
524
525static void mddev_delayed_delete(struct work_struct *ws);
526
527static void mddev_put(struct mddev *mddev)
528{
529 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
530 return;
531 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
532 mddev->ctime == 0 && !mddev->hold_active) {
533 /* Array is not configured at all, and not held active,
534 * so destroy it */
535 list_del_init(&mddev->all_mddevs);
536
537 /*
538 * Call queue_work inside the spinlock so that
539 * flush_workqueue() after mddev_find will succeed in waiting
540 * for the work to be done.
541 */
542 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
543 queue_work(md_misc_wq, &mddev->del_work);
544 }
545 spin_unlock(&all_mddevs_lock);
546}
547
548static void md_safemode_timeout(struct timer_list *t);
549
550void mddev_init(struct mddev *mddev)
551{
552 kobject_init(&mddev->kobj, &md_ktype);
553 mutex_init(&mddev->open_mutex);
554 mutex_init(&mddev->reconfig_mutex);
555 mutex_init(&mddev->bitmap_info.mutex);
556 INIT_LIST_HEAD(&mddev->disks);
557 INIT_LIST_HEAD(&mddev->all_mddevs);
558 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
559 atomic_set(&mddev->active, 1);
560 atomic_set(&mddev->openers, 0);
561 atomic_set(&mddev->active_io, 0);
562 spin_lock_init(&mddev->lock);
563 init_waitqueue_head(&mddev->sb_wait);
564 init_waitqueue_head(&mddev->recovery_wait);
565 mddev->reshape_position = MaxSector;
566 mddev->reshape_backwards = 0;
567 mddev->last_sync_action = "none";
568 mddev->resync_min = 0;
569 mddev->resync_max = MaxSector;
570 mddev->level = LEVEL_NONE;
571}
572EXPORT_SYMBOL_GPL(mddev_init);
573
574static struct mddev *mddev_find(dev_t unit)
575{
576 struct mddev *mddev, *new = NULL;
577
578 if (unit && MAJOR(unit) != MD_MAJOR)
579 unit &= ~((1<<MdpMinorShift)-1);
580
581 retry:
582 spin_lock(&all_mddevs_lock);
583
584 if (unit) {
585 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
586 if (mddev->unit == unit) {
587 mddev_get(mddev);
588 spin_unlock(&all_mddevs_lock);
589 kfree(new);
590 return mddev;
591 }
592
593 if (new) {
594 list_add(&new->all_mddevs, &all_mddevs);
595 spin_unlock(&all_mddevs_lock);
596 new->hold_active = UNTIL_IOCTL;
597 return new;
598 }
599 } else if (new) {
600 /* find an unused unit number */
601 static int next_minor = 512;
602 int start = next_minor;
603 int is_free = 0;
604 int dev = 0;
605 while (!is_free) {
606 dev = MKDEV(MD_MAJOR, next_minor);
607 next_minor++;
608 if (next_minor > MINORMASK)
609 next_minor = 0;
610 if (next_minor == start) {
611 /* Oh dear, all in use. */
612 spin_unlock(&all_mddevs_lock);
613 kfree(new);
614 return NULL;
615 }
616
617 is_free = 1;
618 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
619 if (mddev->unit == dev) {
620 is_free = 0;
621 break;
622 }
623 }
624 new->unit = dev;
625 new->md_minor = MINOR(dev);
626 new->hold_active = UNTIL_STOP;
627 list_add(&new->all_mddevs, &all_mddevs);
628 spin_unlock(&all_mddevs_lock);
629 return new;
630 }
631 spin_unlock(&all_mddevs_lock);
632
633 new = kzalloc(sizeof(*new), GFP_KERNEL);
634 if (!new)
635 return NULL;
636
637 new->unit = unit;
638 if (MAJOR(unit) == MD_MAJOR)
639 new->md_minor = MINOR(unit);
640 else
641 new->md_minor = MINOR(unit) >> MdpMinorShift;
642
643 mddev_init(new);
644
645 goto retry;
646}
647
648static struct attribute_group md_redundancy_group;
649
650void mddev_unlock(struct mddev *mddev)
651{
652 if (mddev->to_remove) {
653 /* These cannot be removed under reconfig_mutex as
654 * an access to the files will try to take reconfig_mutex
655 * while holding the file unremovable, which leads to
656 * a deadlock.
		 * So hold sysfs_active set while the removal is happening,
		 * and anything else which might set ->to_remove or may
659 * otherwise change the sysfs namespace will fail with
660 * -EBUSY if sysfs_active is still set.
661 * We set sysfs_active under reconfig_mutex and elsewhere
662 * test it under the same mutex to ensure its correct value
663 * is seen.
664 */
665 struct attribute_group *to_remove = mddev->to_remove;
666 mddev->to_remove = NULL;
667 mddev->sysfs_active = 1;
668 mutex_unlock(&mddev->reconfig_mutex);
669
670 if (mddev->kobj.sd) {
671 if (to_remove != &md_redundancy_group)
672 sysfs_remove_group(&mddev->kobj, to_remove);
673 if (mddev->pers == NULL ||
674 mddev->pers->sync_request == NULL) {
675 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
676 if (mddev->sysfs_action)
677 sysfs_put(mddev->sysfs_action);
678 mddev->sysfs_action = NULL;
679 }
680 }
681 mddev->sysfs_active = 0;
682 } else
683 mutex_unlock(&mddev->reconfig_mutex);
684
685 /* As we've dropped the mutex we need a spinlock to
686 * make sure the thread doesn't disappear
687 */
688 spin_lock(&pers_lock);
689 md_wakeup_thread(mddev->thread);
690 wake_up(&mddev->sb_wait);
691 spin_unlock(&pers_lock);
692}
693EXPORT_SYMBOL_GPL(mddev_unlock);
694
695struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
696{
697 struct md_rdev *rdev;
698
699 rdev_for_each_rcu(rdev, mddev)
700 if (rdev->desc_nr == nr)
701 return rdev;
702
703 return NULL;
704}
705EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
706
707static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
708{
709 struct md_rdev *rdev;
710
711 rdev_for_each(rdev, mddev)
712 if (rdev->bdev->bd_dev == dev)
713 return rdev;
714
715 return NULL;
716}
717
718struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
719{
720 struct md_rdev *rdev;
721
722 rdev_for_each_rcu(rdev, mddev)
723 if (rdev->bdev->bd_dev == dev)
724 return rdev;
725
726 return NULL;
727}
728EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
729
730static struct md_personality *find_pers(int level, char *clevel)
731{
732 struct md_personality *pers;
733 list_for_each_entry(pers, &pers_list, list) {
734 if (level != LEVEL_NONE && pers->level == level)
735 return pers;
736 if (strcmp(pers->name, clevel)==0)
737 return pers;
738 }
739 return NULL;
740}
741
/* return the offset of the super block in 512-byte sectors */
743static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
744{
745 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
746 return MD_NEW_SIZE_SECTORS(num_sectors);
747}
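
/*
 * Note (added for clarity, based on MD_NEW_SIZE_SECTORS() in md_p.h): the
 * 0.90 superblock is placed in the last 64KiB-aligned 64KiB block of the
 * device, which is why calc_dev_sboffset() depends only on the device size.
 */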
748
749static int alloc_disk_sb(struct md_rdev *rdev)
750{
751 rdev->sb_page = alloc_page(GFP_KERNEL);
752 if (!rdev->sb_page)
753 return -ENOMEM;
754 return 0;
755}
756
757void md_rdev_clear(struct md_rdev *rdev)
758{
759 if (rdev->sb_page) {
760 put_page(rdev->sb_page);
761 rdev->sb_loaded = 0;
762 rdev->sb_page = NULL;
763 rdev->sb_start = 0;
764 rdev->sectors = 0;
765 }
766 if (rdev->bb_page) {
767 put_page(rdev->bb_page);
768 rdev->bb_page = NULL;
769 }
770 badblocks_exit(&rdev->badblocks);
771}
772EXPORT_SYMBOL_GPL(md_rdev_clear);
773
774static void super_written(struct bio *bio)
775{
776 struct md_rdev *rdev = bio->bi_private;
777 struct mddev *mddev = rdev->mddev;
778
779 if (bio->bi_status) {
780 pr_err("md: super_written gets error=%d\n", bio->bi_status);
781 md_error(mddev, rdev);
782 if (!test_bit(Faulty, &rdev->flags)
783 && (bio->bi_opf & MD_FAILFAST)) {
784 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
785 set_bit(LastDev, &rdev->flags);
786 }
787 } else
788 clear_bit(LastDev, &rdev->flags);
789
790 if (atomic_dec_and_test(&mddev->pending_writes))
791 wake_up(&mddev->sb_wait);
792 rdev_dec_pending(rdev, mddev);
793 bio_put(bio);
794}
795
796void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
797 sector_t sector, int size, struct page *page)
798{
799 /* write first size bytes of page to sector of rdev
800 * Increment mddev->pending_writes before returning
801 * and decrement it on completion, waking up sb_wait
802 * if zero is reached.
803 * If an error occurred, call md_error
804 */
805 struct bio *bio;
806 int ff = 0;
807
808 if (!page)
809 return;
810
811 if (test_bit(Faulty, &rdev->flags))
812 return;
813
814 bio = md_bio_alloc_sync(mddev);
815
816 atomic_inc(&rdev->nr_pending);
817
818 bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
819 bio->bi_iter.bi_sector = sector;
820 bio_add_page(bio, page, size, 0);
821 bio->bi_private = rdev;
822 bio->bi_end_io = super_written;
823
824 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
825 test_bit(FailFast, &rdev->flags) &&
826 !test_bit(LastDev, &rdev->flags))
827 ff = MD_FAILFAST;
828 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;
829
830 atomic_inc(&mddev->pending_writes);
831 submit_bio(bio);
832}
833
834int md_super_wait(struct mddev *mddev)
835{
836 /* wait for all superblock writes that were scheduled to complete */
837 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
838 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
839 return -EAGAIN;
840 return 0;
841}
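
/*
 * Typical call pattern (illustrative; the same shape appears in the
 * rdev_size_change handlers below): schedule the write, then wait, and
 * retry if a failfast write asked for a rewrite:
 *
 *	do {
 *		md_super_write(mddev, rdev, rdev->sb_start,
 *			       rdev->sb_size, rdev->sb_page);
 *	} while (md_super_wait(mddev) < 0);
 */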
842
843int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
844 struct page *page, int op, int op_flags, bool metadata_op)
845{
846 struct bio *bio = md_bio_alloc_sync(rdev->mddev);
847 int ret;
848
849 if (metadata_op && rdev->meta_bdev)
850 bio_set_dev(bio, rdev->meta_bdev);
851 else
852 bio_set_dev(bio, rdev->bdev);
853 bio_set_op_attrs(bio, op, op_flags);
854 if (metadata_op)
855 bio->bi_iter.bi_sector = sector + rdev->sb_start;
856 else if (rdev->mddev->reshape_position != MaxSector &&
857 (rdev->mddev->reshape_backwards ==
858 (sector >= rdev->mddev->reshape_position)))
859 bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
860 else
861 bio->bi_iter.bi_sector = sector + rdev->data_offset;
862 bio_add_page(bio, page, size, 0);
863
864 submit_bio_wait(bio);
865
866 ret = !bio->bi_status;
867 bio_put(bio);
868 return ret;
869}
870EXPORT_SYMBOL_GPL(sync_page_io);
871
872static int read_disk_sb(struct md_rdev *rdev, int size)
873{
874 char b[BDEVNAME_SIZE];
875
876 if (rdev->sb_loaded)
877 return 0;
878
879 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true))
880 goto fail;
881 rdev->sb_loaded = 1;
882 return 0;
883
884fail:
885 pr_err("md: disabled device %s, could not read superblock.\n",
886 bdevname(rdev->bdev,b));
887 return -EINVAL;
888}
889
890static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
891{
892 return sb1->set_uuid0 == sb2->set_uuid0 &&
893 sb1->set_uuid1 == sb2->set_uuid1 &&
894 sb1->set_uuid2 == sb2->set_uuid2 &&
895 sb1->set_uuid3 == sb2->set_uuid3;
896}
897
898static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
899{
900 int ret;
901 mdp_super_t *tmp1, *tmp2;
902
903 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
904 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
905
906 if (!tmp1 || !tmp2) {
907 ret = 0;
908 goto abort;
909 }
910
911 *tmp1 = *sb1;
912 *tmp2 = *sb2;
913
914 /*
915 * nr_disks is not constant
916 */
917 tmp1->nr_disks = 0;
918 tmp2->nr_disks = 0;
919
920 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
921abort:
922 kfree(tmp1);
923 kfree(tmp2);
924 return ret;
925}
926
927static u32 md_csum_fold(u32 csum)
928{
929 csum = (csum & 0xffff) + (csum >> 16);
930 return (csum & 0xffff) + (csum >> 16);
931}
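
/*
 * Worked example (added for clarity): md_csum_fold(0x0001ffff) first gives
 * 0xffff + 0x0001 = 0x10000, and the second fold reduces that to 0x0001,
 * so the result always fits in 16 bits.
 */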
932
933static unsigned int calc_sb_csum(mdp_super_t *sb)
934{
935 u64 newcsum = 0;
936 u32 *sb32 = (u32*)sb;
937 int i;
938 unsigned int disk_csum, csum;
939
940 disk_csum = sb->sb_csum;
941 sb->sb_csum = 0;
942
943 for (i = 0; i < MD_SB_BYTES/4 ; i++)
944 newcsum += sb32[i];
945 csum = (newcsum & 0xffffffff) + (newcsum>>32);
946
947#ifdef CONFIG_ALPHA
948 /* This used to use csum_partial, which was wrong for several
949 * reasons including that different results are returned on
950 * different architectures. It isn't critical that we get exactly
951 * the same return value as before (we always csum_fold before
952 * testing, and that removes any differences). However as we
953 * know that csum_partial always returned a 16bit value on
954 * alphas, do a fold to maximise conformity to previous behaviour.
955 */
956 sb->sb_csum = md_csum_fold(disk_csum);
957#else
958 sb->sb_csum = disk_csum;
959#endif
960 return csum;
961}
962
963/*
964 * Handle superblock details.
965 * We want to be able to handle multiple superblock formats
966 * so we have a common interface to them all, and an array of
967 * different handlers.
968 * We rely on user-space to write the initial superblock, and support
969 * reading and updating of superblocks.
970 * Interface methods are:
971 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
972 * loads and validates a superblock on dev.
973 * if refdev != NULL, compare superblocks on both devices
974 * Return:
975 * 0 - dev has a superblock that is compatible with refdev
976 * 1 - dev has a superblock that is compatible and newer than refdev
977 * so dev should be used as the refdev in future
978 * -EINVAL superblock incompatible or invalid
979 * -othererror e.g. -EIO
980 *
981 * int validate_super(struct mddev *mddev, struct md_rdev *dev)
982 * Verify that dev is acceptable into mddev.
983 * The first time, mddev->raid_disks will be 0, and data from
984 * dev should be merged in. Subsequent calls check that dev
985 * is new enough. Return 0 or -EINVAL
986 *
987 * void sync_super(struct mddev *mddev, struct md_rdev *dev)
988 * Update the superblock for rdev with data in mddev
989 * This does not write to disc.
990 *
991 */
992
993struct super_type {
994 char *name;
995 struct module *owner;
996 int (*load_super)(struct md_rdev *rdev,
997 struct md_rdev *refdev,
998 int minor_version);
999 int (*validate_super)(struct mddev *mddev,
1000 struct md_rdev *rdev);
1001 void (*sync_super)(struct mddev *mddev,
1002 struct md_rdev *rdev);
1003 unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
1004 sector_t num_sectors);
1005 int (*allow_new_offset)(struct md_rdev *rdev,
1006 unsigned long long new_offset);
1007};
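
/*
 * Illustrative wiring of one handler (sketch only; the functions named here
 * are the 0.90 implementations defined below):
 *
 *	static struct super_type example_super = {
 *		.name			= "0.90.0",
 *		.owner			= THIS_MODULE,
 *		.load_super		= super_90_load,
 *		.validate_super		= super_90_validate,
 *		.sync_super		= super_90_sync,
 *		.rdev_size_change	= super_90_rdev_size_change,
 *		.allow_new_offset	= super_90_allow_new_offset,
 *	};
 */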
1008
1009/*
1010 * Check that the given mddev has no bitmap.
1011 *
1012 * This function is called from the run method of all personalities that do not
1013 * support bitmaps. It prints an error message and returns non-zero if mddev
1014 * has a bitmap. Otherwise, it returns 0.
1015 *
1016 */
1017int md_check_no_bitmap(struct mddev *mddev)
1018{
1019 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1020 return 0;
1021 pr_warn("%s: bitmaps are not supported for %s\n",
1022 mdname(mddev), mddev->pers->name);
1023 return 1;
1024}
1025EXPORT_SYMBOL(md_check_no_bitmap);
1026
1027/*
1028 * load_super for 0.90.0
1029 */
1030static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1031{
1032 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1033 mdp_super_t *sb;
1034 int ret;
1035
1036 /*
	 * Calculate the position of the superblock (512-byte sectors);
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4KB.
1041 */
1042 rdev->sb_start = calc_dev_sboffset(rdev);
1043
1044 ret = read_disk_sb(rdev, MD_SB_BYTES);
1045 if (ret)
1046 return ret;
1047
1048 ret = -EINVAL;
1049
1050 bdevname(rdev->bdev, b);
1051 sb = page_address(rdev->sb_page);
1052
1053 if (sb->md_magic != MD_SB_MAGIC) {
1054 pr_warn("md: invalid raid superblock magic on %s\n", b);
1055 goto abort;
1056 }
1057
1058 if (sb->major_version != 0 ||
1059 sb->minor_version < 90 ||
1060 sb->minor_version > 91) {
1061 pr_warn("Bad version number %d.%d on %s\n",
1062 sb->major_version, sb->minor_version, b);
1063 goto abort;
1064 }
1065
1066 if (sb->raid_disks <= 0)
1067 goto abort;
1068
1069 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1070 pr_warn("md: invalid superblock checksum on %s\n", b);
1071 goto abort;
1072 }
1073
1074 rdev->preferred_minor = sb->md_minor;
1075 rdev->data_offset = 0;
1076 rdev->new_data_offset = 0;
1077 rdev->sb_size = MD_SB_BYTES;
1078 rdev->badblocks.shift = -1;
1079
1080 if (sb->level == LEVEL_MULTIPATH)
1081 rdev->desc_nr = -1;
1082 else
1083 rdev->desc_nr = sb->this_disk.number;
1084
1085 if (!refdev) {
1086 ret = 1;
1087 } else {
1088 __u64 ev1, ev2;
1089 mdp_super_t *refsb = page_address(refdev->sb_page);
1090 if (!md_uuid_equal(refsb, sb)) {
1091 pr_warn("md: %s has different UUID to %s\n",
1092 b, bdevname(refdev->bdev,b2));
1093 goto abort;
1094 }
1095 if (!md_sb_equal(refsb, sb)) {
1096 pr_warn("md: %s has same UUID but different superblock to %s\n",
1097 b, bdevname(refdev->bdev, b2));
1098 goto abort;
1099 }
1100 ev1 = md_event(sb);
1101 ev2 = md_event(refsb);
1102 if (ev1 > ev2)
1103 ret = 1;
1104 else
1105 ret = 0;
1106 }
1107 rdev->sectors = rdev->sb_start;
1108 /* Limit to 4TB as metadata cannot record more than that.
1109 * (not needed for Linear and RAID0 as metadata doesn't
1110 * record this size)
1111 */
1112 if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
1113 sb->level >= 1)
1114 rdev->sectors = (sector_t)(2ULL << 32) - 2;
1115
1116 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1117 /* "this cannot possibly happen" ... */
1118 ret = -EINVAL;
1119
1120 abort:
1121 return ret;
1122}
1123
1124/*
1125 * validate_super for 0.90.0
1126 */
1127static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1128{
1129 mdp_disk_t *desc;
1130 mdp_super_t *sb = page_address(rdev->sb_page);
1131 __u64 ev1 = md_event(sb);
1132
1133 rdev->raid_disk = -1;
1134 clear_bit(Faulty, &rdev->flags);
1135 clear_bit(In_sync, &rdev->flags);
1136 clear_bit(Bitmap_sync, &rdev->flags);
1137 clear_bit(WriteMostly, &rdev->flags);
1138
1139 if (mddev->raid_disks == 0) {
1140 mddev->major_version = 0;
1141 mddev->minor_version = sb->minor_version;
1142 mddev->patch_version = sb->patch_version;
1143 mddev->external = 0;
1144 mddev->chunk_sectors = sb->chunk_size >> 9;
1145 mddev->ctime = sb->ctime;
1146 mddev->utime = sb->utime;
1147 mddev->level = sb->level;
1148 mddev->clevel[0] = 0;
1149 mddev->layout = sb->layout;
1150 mddev->raid_disks = sb->raid_disks;
1151 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1152 mddev->events = ev1;
1153 mddev->bitmap_info.offset = 0;
1154 mddev->bitmap_info.space = 0;
1155 /* bitmap can use 60 K after the 4K superblocks */
1156 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1157 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1158 mddev->reshape_backwards = 0;
1159
1160 if (mddev->minor_version >= 91) {
1161 mddev->reshape_position = sb->reshape_position;
1162 mddev->delta_disks = sb->delta_disks;
1163 mddev->new_level = sb->new_level;
1164 mddev->new_layout = sb->new_layout;
1165 mddev->new_chunk_sectors = sb->new_chunk >> 9;
1166 if (mddev->delta_disks < 0)
1167 mddev->reshape_backwards = 1;
1168 } else {
1169 mddev->reshape_position = MaxSector;
1170 mddev->delta_disks = 0;
1171 mddev->new_level = mddev->level;
1172 mddev->new_layout = mddev->layout;
1173 mddev->new_chunk_sectors = mddev->chunk_sectors;
1174 }
1175
1176 if (sb->state & (1<<MD_SB_CLEAN))
1177 mddev->recovery_cp = MaxSector;
1178 else {
1179 if (sb->events_hi == sb->cp_events_hi &&
1180 sb->events_lo == sb->cp_events_lo) {
1181 mddev->recovery_cp = sb->recovery_cp;
1182 } else
1183 mddev->recovery_cp = 0;
1184 }
1185
1186 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1187 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1188 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1189 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1190
1191 mddev->max_disks = MD_SB_DISKS;
1192
1193 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1194 mddev->bitmap_info.file == NULL) {
1195 mddev->bitmap_info.offset =
1196 mddev->bitmap_info.default_offset;
1197 mddev->bitmap_info.space =
1198 mddev->bitmap_info.default_space;
1199 }
1200
1201 } else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except
		 * for spares (which don't need an event count) */
1204 ++ev1;
1205 if (sb->disks[rdev->desc_nr].state & (
1206 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1207 if (ev1 < mddev->events)
1208 return -EINVAL;
1209 } else if (mddev->bitmap) {
1210 /* if adding to array with a bitmap, then we can accept an
1211 * older device ... but not too old.
1212 */
1213 if (ev1 < mddev->bitmap->events_cleared)
1214 return 0;
1215 if (ev1 < mddev->events)
1216 set_bit(Bitmap_sync, &rdev->flags);
1217 } else {
1218 if (ev1 < mddev->events)
1219 /* just a hot-add of a new device, leave raid_disk at -1 */
1220 return 0;
1221 }
1222
1223 if (mddev->level != LEVEL_MULTIPATH) {
1224 desc = sb->disks + rdev->desc_nr;
1225
1226 if (desc->state & (1<<MD_DISK_FAULTY))
1227 set_bit(Faulty, &rdev->flags);
1228 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1229 desc->raid_disk < mddev->raid_disks */) {
1230 set_bit(In_sync, &rdev->flags);
1231 rdev->raid_disk = desc->raid_disk;
1232 rdev->saved_raid_disk = desc->raid_disk;
1233 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1234 /* active but not in sync implies recovery up to
1235 * reshape position. We don't know exactly where
1236 * that is, so set to zero for now */
1237 if (mddev->minor_version >= 91) {
1238 rdev->recovery_offset = 0;
1239 rdev->raid_disk = desc->raid_disk;
1240 }
1241 }
1242 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1243 set_bit(WriteMostly, &rdev->flags);
1244 if (desc->state & (1<<MD_DISK_FAILFAST))
1245 set_bit(FailFast, &rdev->flags);
1246 } else /* MULTIPATH are always insync */
1247 set_bit(In_sync, &rdev->flags);
1248 return 0;
1249}
1250
1251/*
1252 * sync_super for 0.90.0
1253 */
1254static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1255{
1256 mdp_super_t *sb;
1257 struct md_rdev *rdev2;
1258 int next_spare = mddev->raid_disks;
1259
1260 /* make rdev->sb match mddev data..
1261 *
1262 * 1/ zero out disks
1263 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1264 * 3/ any empty disks < next_spare become removed
1265 *
1266 * disks[0] gets initialised to REMOVED because
1267 * we cannot be sure from other fields if it has
1268 * been initialised or not.
1269 */
1270 int i;
1271 int active=0, working=0,failed=0,spare=0,nr_disks=0;
1272
1273 rdev->sb_size = MD_SB_BYTES;
1274
1275 sb = page_address(rdev->sb_page);
1276
1277 memset(sb, 0, sizeof(*sb));
1278
1279 sb->md_magic = MD_SB_MAGIC;
1280 sb->major_version = mddev->major_version;
1281 sb->patch_version = mddev->patch_version;
1282 sb->gvalid_words = 0; /* ignored */
1283 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1284 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1285 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1286 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1287
1288 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1289 sb->level = mddev->level;
1290 sb->size = mddev->dev_sectors / 2;
1291 sb->raid_disks = mddev->raid_disks;
1292 sb->md_minor = mddev->md_minor;
1293 sb->not_persistent = 0;
1294 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1295 sb->state = 0;
1296 sb->events_hi = (mddev->events>>32);
1297 sb->events_lo = (u32)mddev->events;
1298
1299 if (mddev->reshape_position == MaxSector)
1300 sb->minor_version = 90;
1301 else {
1302 sb->minor_version = 91;
1303 sb->reshape_position = mddev->reshape_position;
1304 sb->new_level = mddev->new_level;
1305 sb->delta_disks = mddev->delta_disks;
1306 sb->new_layout = mddev->new_layout;
1307 sb->new_chunk = mddev->new_chunk_sectors << 9;
1308 }
1309 mddev->minor_version = sb->minor_version;
1310 if (mddev->in_sync)
1311 {
1312 sb->recovery_cp = mddev->recovery_cp;
1313 sb->cp_events_hi = (mddev->events>>32);
1314 sb->cp_events_lo = (u32)mddev->events;
1315 if (mddev->recovery_cp == MaxSector)
1316 sb->state = (1<< MD_SB_CLEAN);
1317 } else
1318 sb->recovery_cp = 0;
1319
1320 sb->layout = mddev->layout;
1321 sb->chunk_size = mddev->chunk_sectors << 9;
1322
1323 if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1324 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1325
1326 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1327 rdev_for_each(rdev2, mddev) {
1328 mdp_disk_t *d;
1329 int desc_nr;
1330 int is_active = test_bit(In_sync, &rdev2->flags);
1331
1332 if (rdev2->raid_disk >= 0 &&
1333 sb->minor_version >= 91)
1334 /* we have nowhere to store the recovery_offset,
1335 * but if it is not below the reshape_position,
1336 * we can piggy-back on that.
1337 */
1338 is_active = 1;
1339 if (rdev2->raid_disk < 0 ||
1340 test_bit(Faulty, &rdev2->flags))
1341 is_active = 0;
1342 if (is_active)
1343 desc_nr = rdev2->raid_disk;
1344 else
1345 desc_nr = next_spare++;
1346 rdev2->desc_nr = desc_nr;
1347 d = &sb->disks[rdev2->desc_nr];
1348 nr_disks++;
1349 d->number = rdev2->desc_nr;
1350 d->major = MAJOR(rdev2->bdev->bd_dev);
1351 d->minor = MINOR(rdev2->bdev->bd_dev);
1352 if (is_active)
1353 d->raid_disk = rdev2->raid_disk;
1354 else
1355 d->raid_disk = rdev2->desc_nr; /* compatibility */
1356 if (test_bit(Faulty, &rdev2->flags))
1357 d->state = (1<<MD_DISK_FAULTY);
1358 else if (is_active) {
1359 d->state = (1<<MD_DISK_ACTIVE);
1360 if (test_bit(In_sync, &rdev2->flags))
1361 d->state |= (1<<MD_DISK_SYNC);
1362 active++;
1363 working++;
1364 } else {
1365 d->state = 0;
1366 spare++;
1367 working++;
1368 }
1369 if (test_bit(WriteMostly, &rdev2->flags))
1370 d->state |= (1<<MD_DISK_WRITEMOSTLY);
1371 if (test_bit(FailFast, &rdev2->flags))
1372 d->state |= (1<<MD_DISK_FAILFAST);
1373 }
1374 /* now set the "removed" and "faulty" bits on any missing devices */
1375 for (i=0 ; i < mddev->raid_disks ; i++) {
1376 mdp_disk_t *d = &sb->disks[i];
1377 if (d->state == 0 && d->number == 0) {
1378 d->number = i;
1379 d->raid_disk = i;
1380 d->state = (1<<MD_DISK_REMOVED);
1381 d->state |= (1<<MD_DISK_FAULTY);
1382 failed++;
1383 }
1384 }
1385 sb->nr_disks = nr_disks;
1386 sb->active_disks = active;
1387 sb->working_disks = working;
1388 sb->failed_disks = failed;
1389 sb->spare_disks = spare;
1390
1391 sb->this_disk = sb->disks[rdev->desc_nr];
1392 sb->sb_csum = calc_sb_csum(sb);
1393}
1394
1395/*
1396 * rdev_size_change for 0.90.0
1397 */
1398static unsigned long long
1399super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1400{
1401 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1402 return 0; /* component must fit device */
1403 if (rdev->mddev->bitmap_info.offset)
1404 return 0; /* can't move bitmap */
1405 rdev->sb_start = calc_dev_sboffset(rdev);
1406 if (!num_sectors || num_sectors > rdev->sb_start)
1407 num_sectors = rdev->sb_start;
1408 /* Limit to 4TB as metadata cannot record more than that.
1409 * 4TB == 2^32 KB, or 2*2^32 sectors.
1410 */
1411 if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
1412 rdev->mddev->level >= 1)
1413 num_sectors = (sector_t)(2ULL << 32) - 2;
1414 do {
1415 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1416 rdev->sb_page);
1417 } while (md_super_wait(rdev->mddev) < 0);
1418 return num_sectors;
1419}
1420
1421static int
1422super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1423{
1424 /* non-zero offset changes not possible with v0.90 */
1425 return new_offset == 0;
1426}
1427
1428/*
1429 * version 1 superblock
1430 */
1431
1432static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1433{
1434 __le32 disk_csum;
1435 u32 csum;
1436 unsigned long long newcsum;
1437 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1438 __le32 *isuper = (__le32*)sb;
1439
1440 disk_csum = sb->sb_csum;
1441 sb->sb_csum = 0;
1442 newcsum = 0;
1443 for (; size >= 4; size -= 4)
1444 newcsum += le32_to_cpu(*isuper++);
1445
1446 if (size == 2)
1447 newcsum += le16_to_cpu(*(__le16*) isuper);
1448
1449 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1450 sb->sb_csum = disk_csum;
1451 return cpu_to_le32(csum);
1452}
1453
1454static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1455{
1456 struct mdp_superblock_1 *sb;
1457 int ret;
1458 sector_t sb_start;
1459 sector_t sectors;
1460 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1461 int bmask;
1462
1463 /*
	 * Calculate the position of the superblock in 512-byte sectors.
	 * It is always aligned to a 4K boundary and,
	 * depending on minor_version, it can be:
1467 * 0: At least 8K, but less than 12K, from end of device
1468 * 1: At start of device
1469 * 2: 4K from start of device.
1470 */
1471 switch(minor_version) {
1472 case 0:
1473 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1474 sb_start -= 8*2;
1475 sb_start &= ~(sector_t)(4*2-1);
1476 break;
1477 case 1:
1478 sb_start = 0;
1479 break;
1480 case 2:
1481 sb_start = 8;
1482 break;
1483 default:
1484 return -EINVAL;
1485 }
1486 rdev->sb_start = sb_start;
1487
1488 /* superblock is rarely larger than 1K, but it can be larger,
1489 * and it is safe to read 4k, so we do that
1490 */
1491 ret = read_disk_sb(rdev, 4096);
1492 if (ret) return ret;
1493
1494 sb = page_address(rdev->sb_page);
1495
1496 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1497 sb->major_version != cpu_to_le32(1) ||
1498 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1499 le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1500 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1501 return -EINVAL;
1502
1503 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1504 pr_warn("md: invalid superblock checksum on %s\n",
1505 bdevname(rdev->bdev,b));
1506 return -EINVAL;
1507 }
1508 if (le64_to_cpu(sb->data_size) < 10) {
1509 pr_warn("md: data_size too small on %s\n",
1510 bdevname(rdev->bdev,b));
1511 return -EINVAL;
1512 }
1513 if (sb->pad0 ||
1514 sb->pad3[0] ||
1515 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1516 /* Some padding is non-zero, might be a new feature */
1517 return -EINVAL;
1518
1519 rdev->preferred_minor = 0xffff;
1520 rdev->data_offset = le64_to_cpu(sb->data_offset);
1521 rdev->new_data_offset = rdev->data_offset;
1522 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1523 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1524 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1525 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1526
1527 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1528 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1529 if (rdev->sb_size & bmask)
1530 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1531
1532 if (minor_version
1533 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1534 return -EINVAL;
1535 if (minor_version
1536 && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1537 return -EINVAL;
1538
1539 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1540 rdev->desc_nr = -1;
1541 else
1542 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1543
1544 if (!rdev->bb_page) {
1545 rdev->bb_page = alloc_page(GFP_KERNEL);
1546 if (!rdev->bb_page)
1547 return -ENOMEM;
1548 }
1549 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1550 rdev->badblocks.count == 0) {
1551 /* need to load the bad block list.
1552 * Currently we limit it to one page.
1553 */
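		/*
		 * On-disk entry format, as decoded in the loop below: each
		 * 64-bit word packs the start sector in the high 54 bits and
		 * the length in the low 10 bits, both scaled by bblog_shift;
		 * an all-ones word terminates the list early.
		 */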
1554 s32 offset;
1555 sector_t bb_sector;
1556 u64 *bbp;
1557 int i;
1558 int sectors = le16_to_cpu(sb->bblog_size);
1559 if (sectors > (PAGE_SIZE / 512))
1560 return -EINVAL;
1561 offset = le32_to_cpu(sb->bblog_offset);
1562 if (offset == 0)
1563 return -EINVAL;
1564 bb_sector = (long long)offset;
1565 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1566 rdev->bb_page, REQ_OP_READ, 0, true))
1567 return -EIO;
1568 bbp = (u64 *)page_address(rdev->bb_page);
1569 rdev->badblocks.shift = sb->bblog_shift;
1570 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1571 u64 bb = le64_to_cpu(*bbp);
1572 int count = bb & (0x3ff);
1573 u64 sector = bb >> 10;
1574 sector <<= sb->bblog_shift;
1575 count <<= sb->bblog_shift;
1576 if (bb + 1 == 0)
1577 break;
1578 if (badblocks_set(&rdev->badblocks, sector, count, 1))
1579 return -EINVAL;
1580 }
1581 } else if (sb->bblog_offset != 0)
1582 rdev->badblocks.shift = 0;
1583
1584 if ((le32_to_cpu(sb->feature_map) &
1585 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1586 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1587 rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1588 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1589 }
1590
1591 if (!refdev) {
1592 ret = 1;
1593 } else {
1594 __u64 ev1, ev2;
1595 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1596
1597 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1598 sb->level != refsb->level ||
1599 sb->layout != refsb->layout ||
1600 sb->chunksize != refsb->chunksize) {
1601 pr_warn("md: %s has strangely different superblock to %s\n",
1602 bdevname(rdev->bdev,b),
1603 bdevname(refdev->bdev,b2));
1604 return -EINVAL;
1605 }
1606 ev1 = le64_to_cpu(sb->events);
1607 ev2 = le64_to_cpu(refsb->events);
1608
1609 if (ev1 > ev2)
1610 ret = 1;
1611 else
1612 ret = 0;
1613 }
1614 if (minor_version) {
1615 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1616 sectors -= rdev->data_offset;
1617 } else
1618 sectors = rdev->sb_start;
1619 if (sectors < le64_to_cpu(sb->data_size))
1620 return -EINVAL;
1621 rdev->sectors = le64_to_cpu(sb->data_size);
1622 return ret;
1623}
1624
1625static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1626{
1627 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1628 __u64 ev1 = le64_to_cpu(sb->events);
1629
1630 rdev->raid_disk = -1;
1631 clear_bit(Faulty, &rdev->flags);
1632 clear_bit(In_sync, &rdev->flags);
1633 clear_bit(Bitmap_sync, &rdev->flags);
1634 clear_bit(WriteMostly, &rdev->flags);
1635
1636 if (mddev->raid_disks == 0) {
1637 mddev->major_version = 1;
1638 mddev->patch_version = 0;
1639 mddev->external = 0;
1640 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1641 mddev->ctime = le64_to_cpu(sb->ctime);
1642 mddev->utime = le64_to_cpu(sb->utime);
1643 mddev->level = le32_to_cpu(sb->level);
1644 mddev->clevel[0] = 0;
1645 mddev->layout = le32_to_cpu(sb->layout);
1646 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1647 mddev->dev_sectors = le64_to_cpu(sb->size);
1648 mddev->events = ev1;
1649 mddev->bitmap_info.offset = 0;
1650 mddev->bitmap_info.space = 0;
1651 /* Default location for bitmap is 1K after superblock
1652 * using 3K - total of 4K
1653 */
1654 mddev->bitmap_info.default_offset = 1024 >> 9;
1655 mddev->bitmap_info.default_space = (4096-1024) >> 9;
1656 mddev->reshape_backwards = 0;
1657
1658 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1659 memcpy(mddev->uuid, sb->set_uuid, 16);
1660
1661 mddev->max_disks = (4096-256)/2;
1662
1663 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1664 mddev->bitmap_info.file == NULL) {
1665 mddev->bitmap_info.offset =
1666 (__s32)le32_to_cpu(sb->bitmap_offset);
1667 /* Metadata doesn't record how much space is available.
1668 * For 1.0, we assume we can use up to the superblock
1669 * if before, else to 4K beyond superblock.
1670 * For others, assume no change is possible.
1671 */
1672 if (mddev->minor_version > 0)
1673 mddev->bitmap_info.space = 0;
1674 else if (mddev->bitmap_info.offset > 0)
1675 mddev->bitmap_info.space =
1676 8 - mddev->bitmap_info.offset;
1677 else
1678 mddev->bitmap_info.space =
1679 -mddev->bitmap_info.offset;
1680 }
1681
1682 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1683 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1684 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1685 mddev->new_level = le32_to_cpu(sb->new_level);
1686 mddev->new_layout = le32_to_cpu(sb->new_layout);
1687 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1688 if (mddev->delta_disks < 0 ||
1689 (mddev->delta_disks == 0 &&
1690 (le32_to_cpu(sb->feature_map)
1691 & MD_FEATURE_RESHAPE_BACKWARDS)))
1692 mddev->reshape_backwards = 1;
1693 } else {
1694 mddev->reshape_position = MaxSector;
1695 mddev->delta_disks = 0;
1696 mddev->new_level = mddev->level;
1697 mddev->new_layout = mddev->layout;
1698 mddev->new_chunk_sectors = mddev->chunk_sectors;
1699 }
1700
1701 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1702 set_bit(MD_HAS_JOURNAL, &mddev->flags);
1703
1704 if (le32_to_cpu(sb->feature_map) &
1705 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1706 if (le32_to_cpu(sb->feature_map) &
1707 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1708 return -EINVAL;
1709 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1710 (le32_to_cpu(sb->feature_map) &
1711 MD_FEATURE_MULTIPLE_PPLS))
1712 return -EINVAL;
1713 set_bit(MD_HAS_PPL, &mddev->flags);
1714 }
1715 } else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except for
		 * spares (which don't need an event count) */
1718 ++ev1;
1719 if (rdev->desc_nr >= 0 &&
1720 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1721 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1722 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1723 if (ev1 < mddev->events)
1724 return -EINVAL;
1725 } else if (mddev->bitmap) {
1726 /* If adding to array with a bitmap, then we can accept an
1727 * older device, but not too old.
1728 */
1729 if (ev1 < mddev->bitmap->events_cleared)
1730 return 0;
1731 if (ev1 < mddev->events)
1732 set_bit(Bitmap_sync, &rdev->flags);
1733 } else {
1734 if (ev1 < mddev->events)
1735 /* just a hot-add of a new device, leave raid_disk at -1 */
1736 return 0;
1737 }
1738 if (mddev->level != LEVEL_MULTIPATH) {
1739 int role;
1740 if (rdev->desc_nr < 0 ||
1741 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1742 role = MD_DISK_ROLE_SPARE;
1743 rdev->desc_nr = -1;
1744 } else
1745 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1746 switch(role) {
1747 case MD_DISK_ROLE_SPARE: /* spare */
1748 break;
1749 case MD_DISK_ROLE_FAULTY: /* faulty */
1750 set_bit(Faulty, &rdev->flags);
1751 break;
1752 case MD_DISK_ROLE_JOURNAL: /* journal device */
1753 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1754 /* journal device without journal feature */
1755 pr_warn("md: journal device provided without journal feature, ignoring the device\n");
1756 return -EINVAL;
1757 }
1758 set_bit(Journal, &rdev->flags);
1759 rdev->journal_tail = le64_to_cpu(sb->journal_tail);
1760 rdev->raid_disk = 0;
1761 break;
1762 default:
1763 rdev->saved_raid_disk = role;
1764 if ((le32_to_cpu(sb->feature_map) &
1765 MD_FEATURE_RECOVERY_OFFSET)) {
1766 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1767 if (!(le32_to_cpu(sb->feature_map) &
1768 MD_FEATURE_RECOVERY_BITMAP))
1769 rdev->saved_raid_disk = -1;
1770 } else
1771 set_bit(In_sync, &rdev->flags);
1772 rdev->raid_disk = role;
1773 break;
1774 }
1775 if (sb->devflags & WriteMostly1)
1776 set_bit(WriteMostly, &rdev->flags);
1777 if (sb->devflags & FailFast1)
1778 set_bit(FailFast, &rdev->flags);
1779 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1780 set_bit(Replacement, &rdev->flags);
1781 } else /* MULTIPATH are always insync */
1782 set_bit(In_sync, &rdev->flags);
1783
1784 return 0;
1785}
1786
1787static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1788{
1789 struct mdp_superblock_1 *sb;
1790 struct md_rdev *rdev2;
1791 int max_dev, i;
1792 /* make rdev->sb match mddev and rdev data. */
1793
1794 sb = page_address(rdev->sb_page);
1795
1796 sb->feature_map = 0;
1797 sb->pad0 = 0;
1798 sb->recovery_offset = cpu_to_le64(0);
1799 memset(sb->pad3, 0, sizeof(sb->pad3));
1800
1801 sb->utime = cpu_to_le64((__u64)mddev->utime);
1802 sb->events = cpu_to_le64(mddev->events);
1803 if (mddev->in_sync)
1804 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1805 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
1806 sb->resync_offset = cpu_to_le64(MaxSector);
1807 else
1808 sb->resync_offset = cpu_to_le64(0);
1809
1810 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1811
1812 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1813 sb->size = cpu_to_le64(mddev->dev_sectors);
1814 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1815 sb->level = cpu_to_le32(mddev->level);
1816 sb->layout = cpu_to_le32(mddev->layout);
1817 if (test_bit(FailFast, &rdev->flags))
1818 sb->devflags |= FailFast1;
1819 else
1820 sb->devflags &= ~FailFast1;
1821
1822 if (test_bit(WriteMostly, &rdev->flags))
1823 sb->devflags |= WriteMostly1;
1824 else
1825 sb->devflags &= ~WriteMostly1;
1826 sb->data_offset = cpu_to_le64(rdev->data_offset);
1827 sb->data_size = cpu_to_le64(rdev->sectors);
1828
1829 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1830 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1831 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1832 }
1833
1834 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
1835 !test_bit(In_sync, &rdev->flags)) {
1836 sb->feature_map |=
1837 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1838 sb->recovery_offset =
1839 cpu_to_le64(rdev->recovery_offset);
1840 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1841 sb->feature_map |=
1842 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1843 }
1844 /* Note: recovery_offset and journal_tail share space */
1845 if (test_bit(Journal, &rdev->flags))
1846 sb->journal_tail = cpu_to_le64(rdev->journal_tail);
1847 if (test_bit(Replacement, &rdev->flags))
1848 sb->feature_map |=
1849 cpu_to_le32(MD_FEATURE_REPLACEMENT);
1850
1851 if (mddev->reshape_position != MaxSector) {
1852 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1853 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1854 sb->new_layout = cpu_to_le32(mddev->new_layout);
1855 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1856 sb->new_level = cpu_to_le32(mddev->new_level);
1857 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1858 if (mddev->delta_disks == 0 &&
1859 mddev->reshape_backwards)
1860 sb->feature_map
1861 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1862 if (rdev->new_data_offset != rdev->data_offset) {
1863 sb->feature_map
1864 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1865 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1866 - rdev->data_offset));
1867 }
1868 }
1869
1870 if (mddev_is_clustered(mddev))
1871 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
1872
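	/*
	 * Copy the in-memory bad block list into rdev->bb_page for the
	 * on-disk log: each range is packed into a 64-bit entry as
	 * (start sector << 10) | length.  The copy is retried under the
	 * badblocks seqlock if the list changes while it is being read.
	 */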
1873 if (rdev->badblocks.count == 0)
		/* Nothing to do for bad blocks */ ;
1875 else if (sb->bblog_offset == 0)
1876 /* Cannot record bad blocks on this device */
1877 md_error(mddev, rdev);
1878 else {
1879 struct badblocks *bb = &rdev->badblocks;
1880 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1881 u64 *p = bb->page;
1882 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1883 if (bb->changed) {
1884 unsigned seq;
1885
1886retry:
1887 seq = read_seqbegin(&bb->lock);
1888
1889 memset(bbp, 0xff, PAGE_SIZE);
1890
1891 for (i = 0 ; i < bb->count ; i++) {
1892 u64 internal_bb = p[i];
1893 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1894 | BB_LEN(internal_bb));
1895 bbp[i] = cpu_to_le64(store_bb);
1896 }
1897 bb->changed = 0;
1898 if (read_seqretry(&bb->lock, seq))
1899 goto retry;
1900
1901 bb->sector = (rdev->sb_start +
1902 (int)le32_to_cpu(sb->bblog_offset));
1903 bb->size = le16_to_cpu(sb->bblog_size);
1904 }
1905 }
1906
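	/*
	 * Recompute max_dev and, if it grew, the superblock size:
	 * 256 bytes of header plus a 16-bit role entry per device,
	 * rounded up to the bdev's logical block size.
	 */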
1907 max_dev = 0;
1908 rdev_for_each(rdev2, mddev)
1909 if (rdev2->desc_nr+1 > max_dev)
1910 max_dev = rdev2->desc_nr+1;
1911
1912 if (max_dev > le32_to_cpu(sb->max_dev)) {
1913 int bmask;
1914 sb->max_dev = cpu_to_le32(max_dev);
1915 rdev->sb_size = max_dev * 2 + 256;
1916 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1917 if (rdev->sb_size & bmask)
1918 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1919 } else
1920 max_dev = le32_to_cpu(sb->max_dev);
1921
	for (i = 0; i < max_dev; i++)
1923 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1924
1925 if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
1926 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
1927
1928 if (test_bit(MD_HAS_PPL, &mddev->flags)) {
1929 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
1930 sb->feature_map |=
1931 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
1932 else
1933 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
1934 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
1935 sb->ppl.size = cpu_to_le16(rdev->ppl.size);
1936 }
1937
1938 rdev_for_each(rdev2, mddev) {
1939 i = rdev2->desc_nr;
1940 if (test_bit(Faulty, &rdev2->flags))
1941 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
1942 else if (test_bit(In_sync, &rdev2->flags))
1943 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1944 else if (test_bit(Journal, &rdev2->flags))
1945 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
1946 else if (rdev2->raid_disk >= 0)
1947 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1948 else
1949 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1950 }
1951
1952 sb->sb_csum = calc_sb_1_csum(sb);
1953}
1954
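/*
 * Work out how many sectors of @rdev can be used if the component size
 * changes, and write an updated superblock reflecting that size.
 * Returns the new size in sectors, or 0 if the change is not possible
 * (for example 1.0 metadata whose bitmap cannot be moved).
 */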
1955static unsigned long long
1956super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1957{
1958 struct mdp_superblock_1 *sb;
1959 sector_t max_sectors;
1960 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1961 return 0; /* component must fit device */
1962 if (rdev->data_offset != rdev->new_data_offset)
1963 return 0; /* too confusing */
1964 if (rdev->sb_start < rdev->data_offset) {
1965 /* minor versions 1 and 2; superblock before data */
1966 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
1967 max_sectors -= rdev->data_offset;
1968 if (!num_sectors || num_sectors > max_sectors)
1969 num_sectors = max_sectors;
1970 } else if (rdev->mddev->bitmap_info.offset) {
1971 /* minor version 0 with bitmap we can't move */
1972 return 0;
1973 } else {
1974 /* minor version 0; superblock after data */
1975 sector_t sb_start;
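		/* the 1.0 superblock starts 8K before the end of the
		 * device, aligned down to a 4K boundary */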
1976 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1977 sb_start &= ~(sector_t)(4*2 - 1);
1978 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1979 if (!num_sectors || num_sectors > max_sectors)
1980 num_sectors = max_sectors;
1981 rdev->sb_start = sb_start;
1982 }
1983 sb = page_address(rdev->sb_page);
1984 sb->data_size = cpu_to_le64(num_sectors);
1985 sb->super_offset = cpu_to_le64(rdev->sb_start);
1986 sb->sb_csum = calc_sb_1_csum(sb);
1987 do {
1988 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1989 rdev->sb_page);
1990 } while (md_super_wait(rdev->mddev) < 0);
1991 return num_sectors;
1992
1993}
1994
1995static int
1996super_1_allow_new_offset(struct md_rdev *rdev,
1997 unsigned long long new_offset)
1998{
1999 /* All necessary checks on new >= old have been done */
2000 struct bitmap *bitmap;
2001 if (new_offset >= rdev->data_offset)
2002 return 1;
2003
2004 /* with 1.0 metadata, there is no metadata to tread on
2005 * so we can always move back */
2006 if (rdev->mddev->minor_version == 0)
2007 return 1;
2008
2009 /* otherwise we must be sure not to step on
2010 * any metadata, so stay:
2011 * 36K beyond start of superblock
2012 * beyond end of badblocks
2013 * beyond write-intent bitmap
2014 */
2015 if (rdev->sb_start + (32+4)*2 > new_offset)
2016 return 0;
2017 bitmap = rdev->mddev->bitmap;
2018 if (bitmap && !rdev->mddev->bitmap_info.file &&
2019 rdev->sb_start + rdev->mddev->bitmap_info.offset +
2020 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
2021 return 0;
2022 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2023 return 0;
2024
2025 return 1;
2026}
2027
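/*
 * Metadata handlers, indexed by mddev->major_version: 0 is the original
 * 0.90 format, 1 covers the v1.x superblocks.
 */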
2028static struct super_type super_types[] = {
2029 [0] = {
2030 .name = "0.90.0",
2031 .owner = THIS_MODULE,
2032 .load_super = super_90_load,
2033 .validate_super = super_90_validate,
2034 .sync_super = super_90_sync,
2035 .rdev_size_change = super_90_rdev_size_change,
2036 .allow_new_offset = super_90_allow_new_offset,
2037 },
2038 [1] = {
2039 .name = "md-1",
2040 .owner = THIS_MODULE,
2041 .load_super = super_1_load,
2042 .validate_super = super_1_validate,
2043 .sync_super = super_1_sync,
2044 .rdev_size_change = super_1_rdev_size_change,
2045 .allow_new_offset = super_1_allow_new_offset,
2046 },
2047};
2048
2049static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2050{
2051 if (mddev->sync_super) {
2052 mddev->sync_super(mddev, rdev);
2053 return;
2054 }
2055
2056 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2057
2058 super_types[mddev->major_version].sync_super(mddev, rdev);
2059}
2060
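/*
 * Return 1 if the two arrays have component devices that live on the
 * same underlying whole disk (same bd_contains), 0 otherwise.
 */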
2061static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2062{
2063 struct md_rdev *rdev, *rdev2;
2064
2065 rcu_read_lock();
2066 rdev_for_each_rcu(rdev, mddev1) {
2067 if (test_bit(Faulty, &rdev->flags) ||
2068 test_bit(Journal, &rdev->flags) ||
2069 rdev->raid_disk == -1)
2070 continue;
2071 rdev_for_each_rcu(rdev2, mddev2) {
2072 if (test_bit(Faulty, &rdev2->flags) ||
2073 test_bit(Journal, &rdev2->flags) ||
2074 rdev2->raid_disk == -1)
2075 continue;
2076 if (rdev->bdev->bd_contains ==
2077 rdev2->bdev->bd_contains) {
2078 rcu_read_unlock();
2079 return 1;
2080 }
2081 }
2082 }
2083 rcu_read_unlock();
2084 return 0;
2085}
2086
2087static LIST_HEAD(pending_raid_disks);
2088
2089/*
2090 * Try to register data integrity profile for an mddev
2091 *
2092 * This is called when an array is started and after a disk has been kicked
2093 * from the array. It only succeeds if all working and active component devices
2094 * are integrity capable with matching profiles.
2095 */
2096int md_integrity_register(struct mddev *mddev)
2097{
2098 struct md_rdev *rdev, *reference = NULL;
2099
2100 if (list_empty(&mddev->disks))
2101 return 0; /* nothing to do */
2102 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2103 return 0; /* shouldn't register, or already is */
2104 rdev_for_each(rdev, mddev) {
2105 /* skip spares and non-functional disks */
2106 if (test_bit(Faulty, &rdev->flags))
2107 continue;
2108 if (rdev->raid_disk < 0)
2109 continue;
2110 if (!reference) {
2111 /* Use the first rdev as the reference */
2112 reference = rdev;
2113 continue;
2114 }
2115 /* does this rdev's profile match the reference profile? */
2116 if (blk_integrity_compare(reference->bdev->bd_disk,
2117 rdev->bdev->bd_disk) < 0)
2118 return -EINVAL;
2119 }
2120 if (!reference || !bdev_get_integrity(reference->bdev))
2121 return 0;
2122 /*
2123 * All component devices are integrity capable and have matching
2124 * profiles, register the common profile for the md device.
2125 */
2126 blk_integrity_register(mddev->gendisk,
2127 bdev_get_integrity(reference->bdev));
2128
2129 pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2130 if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) {
2131 pr_err("md: failed to create integrity pool for %s\n",
2132 mdname(mddev));
2133 return -EINVAL;
2134 }
2135 return 0;
2136}
2137EXPORT_SYMBOL(md_integrity_register);
2138
2139/*
2140 * Attempt to add an rdev, but only if it is consistent with the current
2141 * integrity profile
2142 */
2143int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2144{
2145 struct blk_integrity *bi_mddev;
2146 char name[BDEVNAME_SIZE];
2147
2148 if (!mddev->gendisk)
2149 return 0;
2150
2151 bi_mddev = blk_get_integrity(mddev->gendisk);
2152
2153 if (!bi_mddev) /* nothing to do */
2154 return 0;
2155
2156 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2157 pr_err("%s: incompatible integrity profile for %s\n",
2158 mdname(mddev), bdevname(rdev->bdev, name));
2159 return -ENXIO;
2160 }
2161
2162 return 0;
2163}
2164EXPORT_SYMBOL(md_integrity_add_rdev);
2165
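/*
 * Attach @rdev to @mddev: check size limits, pick a unique desc_nr,
 * create the "dev-<name>" kobject and "block" symlink in sysfs, and add
 * the device to mddev->disks under RCU protection.
 */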
2166static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2167{
2168 char b[BDEVNAME_SIZE];
2169 struct kobject *ko;
2170 int err;
2171
2172 /* prevent duplicates */
2173 if (find_rdev(mddev, rdev->bdev->bd_dev))
2174 return -EEXIST;
2175
2176 if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
2177 mddev->pers)
2178 return -EROFS;
2179
2180 /* make sure rdev->sectors exceeds mddev->dev_sectors */
2181 if (!test_bit(Journal, &rdev->flags) &&
2182 rdev->sectors &&
2183 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2184 if (mddev->pers) {
2185 /* Cannot change size, so fail
2186 * If mddev->level <= 0, then we don't care
2187 * about aligning sizes (e.g. linear)
2188 */
2189 if (mddev->level > 0)
2190 return -ENOSPC;
2191 } else
2192 mddev->dev_sectors = rdev->sectors;
2193 }
2194
2195 /* Verify rdev->desc_nr is unique.
2196 * If it is -1, assign a free number, else
	 * check that the number is not already in use
2198 */
2199 rcu_read_lock();
2200 if (rdev->desc_nr < 0) {
2201 int choice = 0;
2202 if (mddev->pers)
2203 choice = mddev->raid_disks;
2204 while (md_find_rdev_nr_rcu(mddev, choice))
2205 choice++;
2206 rdev->desc_nr = choice;
2207 } else {
2208 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2209 rcu_read_unlock();
2210 return -EBUSY;
2211 }
2212 }
2213 rcu_read_unlock();
2214 if (!test_bit(Journal, &rdev->flags) &&
2215 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2216 pr_warn("md: %s: array is limited to %d devices\n",
2217 mdname(mddev), mddev->max_disks);
2218 return -EBUSY;
2219 }
	bdevname(rdev->bdev, b);
2221 strreplace(b, '/', '!');
2222
2223 rdev->mddev = mddev;
2224 pr_debug("md: bind<%s>\n", b);
2225
2226 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2227 goto fail;
2228
2229 ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2230 if (sysfs_create_link(&rdev->kobj, ko, "block"))
2231 /* failure here is OK */;
2232 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2233
2234 list_add_rcu(&rdev->same_set, &mddev->disks);
2235 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2236
2237 /* May as well allow recovery to be retried once */
2238 mddev->recovery_disabled++;
2239
2240 return 0;
2241
2242 fail:
2243 pr_warn("md: failed to register dev-%s for %s\n",
2244 b, mdname(mddev));
2245 return err;
2246}
2247
2248static void md_delayed_delete(struct work_struct *ws)
2249{
2250 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2251 kobject_del(&rdev->kobj);
2252 kobject_put(&rdev->kobj);
2253}
2254
2255static void unbind_rdev_from_array(struct md_rdev *rdev)
2256{
2257 char b[BDEVNAME_SIZE];
2258
2259 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2260 list_del_rcu(&rdev->same_set);
2261 pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
2262 rdev->mddev = NULL;
2263 sysfs_remove_link(&rdev->kobj, "block");
2264 sysfs_put(rdev->sysfs_state);
2265 rdev->sysfs_state = NULL;
2266 rdev->badblocks.count = 0;
2267 /* We need to delay this, otherwise we can deadlock when
	 * writing 'remove' to "dev/state". We also need
2269 * to delay it due to rcu usage.
2270 */
2271 synchronize_rcu();
2272 INIT_WORK(&rdev->del_work, md_delayed_delete);
2273 kobject_get(&rdev->kobj);
2274 queue_work(md_misc_wq, &rdev->del_work);
2275}
2276
2277/*
2278 * prevent the device from being mounted, repartitioned or
2279 * otherwise reused by a RAID array (or any other kernel
2280 * subsystem), by bd_claiming the device.
2281 */
2282static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2283{
2284 int err = 0;
2285 struct block_device *bdev;
2286 char b[BDEVNAME_SIZE];
2287
2288 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2289 shared ? (struct md_rdev *)lock_rdev : rdev);
2290 if (IS_ERR(bdev)) {
2291 pr_warn("md: could not open %s.\n", __bdevname(dev, b));
2292 return PTR_ERR(bdev);
2293 }
2294 rdev->bdev = bdev;
2295 return err;
2296}
2297
2298static void unlock_rdev(struct md_rdev *rdev)
2299{
2300 struct block_device *bdev = rdev->bdev;
2301 rdev->bdev = NULL;
2302 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2303}
2304
2305void md_autodetect_dev(dev_t dev);
2306
2307static void export_rdev(struct md_rdev *rdev)
2308{
2309 char b[BDEVNAME_SIZE];
2310
2311 pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
2312 md_rdev_clear(rdev);
2313#ifndef MODULE
2314 if (test_bit(AutoDetected, &rdev->flags))
2315 md_autodetect_dev(rdev->bdev->bd_dev);
2316#endif
2317 unlock_rdev(rdev);
2318 kobject_put(&rdev->kobj);
2319}
2320
2321void md_kick_rdev_from_array(struct md_rdev *rdev)
2322{
2323 unbind_rdev_from_array(rdev);
2324 export_rdev(rdev);
2325}
2326EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
2327
2328static void export_array(struct mddev *mddev)
2329{
2330 struct md_rdev *rdev;
2331
2332 while (!list_empty(&mddev->disks)) {
2333 rdev = list_first_entry(&mddev->disks, struct md_rdev,
2334 same_set);
2335 md_kick_rdev_from_array(rdev);
2336 }
2337 mddev->raid_disks = 0;
2338 mddev->major_version = 0;
2339}
2340
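/*
 * Try to mark the array clean.  Called with mddev->lock held; the lock
 * is dropped temporarily while writes_pending is switched to atomic
 * mode so that the zero check is reliable.  Returns the resulting
 * in_sync state.
 */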
2341static bool set_in_sync(struct mddev *mddev)
2342{
2343 lockdep_assert_held(&mddev->lock);
2344 if (!mddev->in_sync) {
2345 mddev->sync_checkers++;
2346 spin_unlock(&mddev->lock);
2347 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
2348 spin_lock(&mddev->lock);
2349 if (!mddev->in_sync &&
2350 percpu_ref_is_zero(&mddev->writes_pending)) {
2351 mddev->in_sync = 1;
2352 /*
2353 * Ensure ->in_sync is visible before we clear
2354 * ->sync_checkers.
2355 */
2356 smp_mb();
2357 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2358 sysfs_notify_dirent_safe(mddev->sysfs_state);
2359 }
2360 if (--mddev->sync_checkers == 0)
2361 percpu_ref_switch_to_percpu(&mddev->writes_pending);
2362 }
2363 if (mddev->safemode == 1)
2364 mddev->safemode = 0;
2365 return mddev->in_sync;
2366}
2367
2368static void sync_sbs(struct mddev *mddev, int nospares)
2369{
2370 /* Update each superblock (in-memory image), but
2371 * if we are allowed to, skip spares which already
2372 * have the right event counter, or have one earlier
2373 * (which would mean they aren't being marked as dirty
2374 * with the rest of the array)
2375 */
2376 struct md_rdev *rdev;
2377 rdev_for_each(rdev, mddev) {
2378 if (rdev->sb_events == mddev->events ||
2379 (nospares &&
2380 rdev->raid_disk < 0 &&
2381 rdev->sb_events+1 == mddev->events)) {
2382 /* Don't update this superblock */
2383 rdev->sb_loaded = 2;
2384 } else {
2385 sync_super(mddev, rdev);
2386 rdev->sb_loaded = 1;
2387 }
2388 }
2389}
2390
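/*
 * Decide whether the superblock actually needs rewriting by comparing
 * the roles and geometry recorded in an up-to-date member superblock
 * with the current mddev state; used on clustered arrays to skip
 * updates another node has already made.
 */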
2391static bool does_sb_need_changing(struct mddev *mddev)
2392{
2393 struct md_rdev *rdev;
2394 struct mdp_superblock_1 *sb;
2395 int role;
2396
2397 /* Find a good rdev */
2398 rdev_for_each(rdev, mddev)
2399 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
2400 break;
2401
2402 /* No good device found. */
2403 if (!rdev)
2404 return false;
2405
2406 sb = page_address(rdev->sb_page);
	/* Check if a device has become faulty or a spare has become active */
2408 rdev_for_each(rdev, mddev) {
2409 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2410 /* Device activated? */
		if (role == 0xffff && rdev->raid_disk >= 0 &&
2412 !test_bit(Faulty, &rdev->flags))
2413 return true;
2414 /* Device turned faulty? */
2415 if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
2416 return true;
2417 }
2418
2419 /* Check if any mddev parameters have changed */
2420 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2421 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2422 (mddev->layout != le32_to_cpu(sb->layout)) ||
2423 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2424 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2425 return true;
2426
2427 return false;
2428}
2429
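/*
 * Write the array metadata out to every member device: refresh each
 * in-memory superblock with sync_sbs(), update the bitmap superblock,
 * then write them all and wait, retrying from the top if a write fails
 * or the state changes while we are writing.
 */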
2430void md_update_sb(struct mddev *mddev, int force_change)
2431{
2432 struct md_rdev *rdev;
2433 int sync_req;
2434 int nospares = 0;
2435 int any_badblocks_changed = 0;
2436 int ret = -1;
2437
2438 if (mddev->ro) {
2439 if (force_change)
2440 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2441 return;
2442 }
2443
2444repeat:
2445 if (mddev_is_clustered(mddev)) {
2446 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2447 force_change = 1;
2448 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2449 nospares = 1;
2450 ret = md_cluster_ops->metadata_update_start(mddev);
		/* Has someone else updated the sb? */
2452 if (!does_sb_need_changing(mddev)) {
2453 if (ret == 0)
2454 md_cluster_ops->metadata_update_cancel(mddev);
2455 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2456 BIT(MD_SB_CHANGE_DEVS) |
2457 BIT(MD_SB_CHANGE_CLEAN));
2458 return;
2459 }
2460 }
2461
2462 /*
	 * First make sure individual recovery_offsets are correct.
	 * curr_resync_completed can only be used during recovery;
	 * during reshape/resync it might use array addresses rather
	 * than device addresses.
2467 */
2468 rdev_for_each(rdev, mddev) {
2469 if (rdev->raid_disk >= 0 &&
2470 mddev->delta_disks >= 0 &&
2471 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2472 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2473 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2474 !test_bit(Journal, &rdev->flags) &&
2475 !test_bit(In_sync, &rdev->flags) &&
2476 mddev->curr_resync_completed > rdev->recovery_offset)
2477 rdev->recovery_offset = mddev->curr_resync_completed;
2478
2479 }
2480 if (!mddev->persistent) {
2481 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2482 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2483 if (!mddev->external) {
2484 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2485 rdev_for_each(rdev, mddev) {
2486 if (rdev->badblocks.changed) {
2487 rdev->badblocks.changed = 0;
2488 ack_all_badblocks(&rdev->badblocks);
2489 md_error(mddev, rdev);
2490 }
2491 clear_bit(Blocked, &rdev->flags);
2492 clear_bit(BlockedBadBlocks, &rdev->flags);
2493 wake_up(&rdev->blocked_wait);
2494 }
2495 }
2496 wake_up(&mddev->sb_wait);
2497 return;
2498 }
2499
2500 spin_lock(&mddev->lock);
2501
2502 mddev->utime = ktime_get_real_seconds();
2503
2504 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2505 force_change = 1;
2506 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
		/* just a clean <-> dirty transition, so possibly leave spares alone,
		 * though if 'events' doesn't have the right even/odd parity we
		 * will have to update the spares after all
2510 */
2511 nospares = 1;
2512 if (force_change)
2513 nospares = 0;
2514 if (mddev->degraded)
2515 /* If the array is degraded, then skipping spares is both
2516 * dangerous and fairly pointless.
2517 * Dangerous because a device that was removed from the array
		 * might have an event_count that still looks up-to-date,
		 * so it can be re-added without a resync.
		 * Pointless because if there are any spares to skip,
		 * then a recovery will happen and soon the array won't
		 * be degraded any more, and the spare can go back to sleep.
2523 */
2524 nospares = 0;
2525
2526 sync_req = mddev->in_sync;
2527
2528 /* If this is just a dirty<->clean transition, and the array is clean
2529 * and 'events' is odd, we can roll back to the previous clean state */
2530 if (nospares
2531 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2532 && mddev->can_decrease_events
2533 && mddev->events != 1) {
2534 mddev->events--;
2535 mddev->can_decrease_events = 0;
2536 } else {
2537 /* otherwise we have to go forward and ... */
		mddev->events++;
2539 mddev->can_decrease_events = nospares;
2540 }
2541
2542 /*
2543 * This 64-bit counter should never wrap.
2544 * Either we are in around ~1 trillion A.C., assuming
2545 * 1 reboot per second, or we have a bug...
2546 */
2547 WARN_ON(mddev->events == 0);
2548
2549 rdev_for_each(rdev, mddev) {
2550 if (rdev->badblocks.changed)
2551 any_badblocks_changed++;
2552 if (test_bit(Faulty, &rdev->flags))
2553 set_bit(FaultRecorded, &rdev->flags);
2554 }
2555
2556 sync_sbs(mddev, nospares);
2557 spin_unlock(&mddev->lock);
2558
2559 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2560 mdname(mddev), mddev->in_sync);
2561
2562 if (mddev->queue)
2563 blk_add_trace_msg(mddev->queue, "md md_update_sb");
2564rewrite:
2565 md_bitmap_update_sb(mddev->bitmap);
2566 rdev_for_each(rdev, mddev) {
2567 char b[BDEVNAME_SIZE];
2568
2569 if (rdev->sb_loaded != 1)
2570 continue; /* no noise on spare devices */
2571
2572 if (!test_bit(Faulty, &rdev->flags)) {
			md_super_write(mddev, rdev,
2574 rdev->sb_start, rdev->sb_size,
2575 rdev->sb_page);
2576 pr_debug("md: (write) %s's sb offset: %llu\n",
2577 bdevname(rdev->bdev, b),
2578 (unsigned long long)rdev->sb_start);
2579 rdev->sb_events = mddev->events;
2580 if (rdev->badblocks.size) {
2581 md_super_write(mddev, rdev,
2582 rdev->badblocks.sector,
2583 rdev->badblocks.size << 9,
2584 rdev->bb_page);
2585 rdev->badblocks.size = 0;
2586 }
2587
2588 } else
2589 pr_debug("md: %s (skipping faulty)\n",
2590 bdevname(rdev->bdev, b));
2591
2592 if (mddev->level == LEVEL_MULTIPATH)
2593 /* only need to write one superblock... */
2594 break;
2595 }
2596 if (md_super_wait(mddev) < 0)
2597 goto rewrite;
2598 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2599
2600 if (mddev_is_clustered(mddev) && ret == 0)
2601 md_cluster_ops->metadata_update_finish(mddev);
2602
2603 if (mddev->in_sync != sync_req ||
2604 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2605 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2606 /* have to write it out again */
2607 goto repeat;
2608 wake_up(&mddev->sb_wait);
2609 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2610 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2611
2612 rdev_for_each(rdev, mddev) {
2613 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2614 clear_bit(Blocked, &rdev->flags);
2615
2616 if (any_badblocks_changed)
2617 ack_all_badblocks(&rdev->badblocks);
2618 clear_bit(BlockedBadBlocks, &rdev->flags);
2619 wake_up(&rdev->blocked_wait);
2620 }
2621}
2622EXPORT_SYMBOL(md_update_sb);
2623
2624static int add_bound_rdev(struct md_rdev *rdev)
2625{
2626 struct mddev *mddev = rdev->mddev;
2627 int err = 0;
2628 bool add_journal = test_bit(Journal, &rdev->flags);
2629
2630 if (!mddev->pers->hot_remove_disk || add_journal) {
2631 /* If there is hot_add_disk but no hot_remove_disk
		 * then added disks are for geometry changes,
2633 * and should be added immediately.
2634 */
2635 super_types[mddev->major_version].
2636 validate_super(mddev, rdev);
2637 if (add_journal)
2638 mddev_suspend(mddev);
2639 err = mddev->pers->hot_add_disk(mddev, rdev);
2640 if (add_journal)
2641 mddev_resume(mddev);
2642 if (err) {
2643 md_kick_rdev_from_array(rdev);
2644 return err;
2645 }
2646 }
2647 sysfs_notify_dirent_safe(rdev->sysfs_state);
2648
2649 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2650 if (mddev->degraded)
2651 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2652 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2653 md_new_event(mddev);
2654 md_wakeup_thread(mddev->thread);
2655 return 0;
2656}
2657
2658/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either form. For this we use cmd_match.
2660 */
2661static int cmd_match(const char *cmd, const char *str)
2662{
2663 /* See if cmd, written into a sysfs file, matches
2664 * str. They must either be the same, or cmd can
2665 * have a trailing newline
2666 */
2667 while (*cmd && *str && *cmd == *str) {
2668 cmd++;
2669 str++;
2670 }
2671 if (*cmd == '\n')
2672 cmd++;
2673 if (*str || *cmd)
2674 return 0;
2675 return 1;
2676}
2677
2678struct rdev_sysfs_entry {
2679 struct attribute attr;
2680 ssize_t (*show)(struct md_rdev *, char *);
2681 ssize_t (*store)(struct md_rdev *, const char *, size_t);
2682};
2683
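/*
 * Report the rdev state flags as a comma separated list,
 * e.g. "in_sync" or "spare,write_mostly".
 */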
2684static ssize_t
2685state_show(struct md_rdev *rdev, char *page)
2686{
2687 char *sep = ",";
2688 size_t len = 0;
2689 unsigned long flags = READ_ONCE(rdev->flags);
2690
2691 if (test_bit(Faulty, &flags) ||
2692 (!test_bit(ExternalBbl, &flags) &&
2693 rdev->badblocks.unacked_exist))
2694 len += sprintf(page+len, "faulty%s", sep);
2695 if (test_bit(In_sync, &flags))
2696 len += sprintf(page+len, "in_sync%s", sep);
2697 if (test_bit(Journal, &flags))
2698 len += sprintf(page+len, "journal%s", sep);
2699 if (test_bit(WriteMostly, &flags))
2700 len += sprintf(page+len, "write_mostly%s", sep);
2701 if (test_bit(Blocked, &flags) ||
2702 (rdev->badblocks.unacked_exist
2703 && !test_bit(Faulty, &flags)))
2704 len += sprintf(page+len, "blocked%s", sep);
2705 if (!test_bit(Faulty, &flags) &&
2706 !test_bit(Journal, &flags) &&
2707 !test_bit(In_sync, &flags))
2708 len += sprintf(page+len, "spare%s", sep);
2709 if (test_bit(WriteErrorSeen, &flags))
2710 len += sprintf(page+len, "write_error%s", sep);
2711 if (test_bit(WantReplacement, &flags))
2712 len += sprintf(page+len, "want_replacement%s", sep);
2713 if (test_bit(Replacement, &flags))
2714 len += sprintf(page+len, "replacement%s", sep);
2715 if (test_bit(ExternalBbl, &flags))
2716 len += sprintf(page+len, "external_bbl%s", sep);
2717 if (test_bit(FailFast, &flags))
2718 len += sprintf(page+len, "failfast%s", sep);
2719
2720 if (len)
2721 len -= strlen(sep);
2722
2723 return len+sprintf(page+len, "\n");
2724}
2725
2726static ssize_t
2727state_store(struct md_rdev *rdev, const char *buf, size_t len)
2728{
2729 /* can write
2730 * faulty - simulates an error
2731 * remove - disconnects the device
2732 * writemostly - sets write_mostly
2733 * -writemostly - clears write_mostly
	 * blocked - sets the Blocked flag
	 * -blocked - clears the Blocked flag and possibly simulates an error
	 * insync - sets In_sync provided the device isn't active
	 * -insync - clears In_sync for a device with a slot assigned,
	 *           so that it gets rebuilt based on the bitmap
2739 * write_error - sets WriteErrorSeen
2740 * -write_error - clears WriteErrorSeen
2741 * {,-}failfast - set/clear FailFast
2742 */
2743 int err = -EINVAL;
2744 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2745 md_error(rdev->mddev, rdev);
2746 if (test_bit(Faulty, &rdev->flags))
2747 err = 0;
2748 else
2749 err = -EBUSY;
2750 } else if (cmd_match(buf, "remove")) {
2751 if (rdev->mddev->pers) {
2752 clear_bit(Blocked, &rdev->flags);
2753 remove_and_add_spares(rdev->mddev, rdev);
2754 }
2755 if (rdev->raid_disk >= 0)
2756 err = -EBUSY;
2757 else {
2758 struct mddev *mddev = rdev->mddev;
2759 err = 0;
2760 if (mddev_is_clustered(mddev))
2761 err = md_cluster_ops->remove_disk(mddev, rdev);
2762
2763 if (err == 0) {
2764 md_kick_rdev_from_array(rdev);
2765 if (mddev->pers) {
2766 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2767 md_wakeup_thread(mddev->thread);
2768 }
2769 md_new_event(mddev);
2770 }
2771 }
2772 } else if (cmd_match(buf, "writemostly")) {
2773 set_bit(WriteMostly, &rdev->flags);
2774 err = 0;
2775 } else if (cmd_match(buf, "-writemostly")) {
2776 clear_bit(WriteMostly, &rdev->flags);
2777 err = 0;
2778 } else if (cmd_match(buf, "blocked")) {
2779 set_bit(Blocked, &rdev->flags);
2780 err = 0;
2781 } else if (cmd_match(buf, "-blocked")) {
2782 if (!test_bit(Faulty, &rdev->flags) &&
2783 !test_bit(ExternalBbl, &rdev->flags) &&
2784 rdev->badblocks.unacked_exist) {
2785 /* metadata handler doesn't understand badblocks,
2786 * so we need to fail the device
2787 */
2788 md_error(rdev->mddev, rdev);
2789 }
2790 clear_bit(Blocked, &rdev->flags);
2791 clear_bit(BlockedBadBlocks, &rdev->flags);
2792 wake_up(&rdev->blocked_wait);
2793 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2794 md_wakeup_thread(rdev->mddev->thread);
2795
2796 err = 0;
2797 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2798 set_bit(In_sync, &rdev->flags);
2799 err = 0;
2800 } else if (cmd_match(buf, "failfast")) {
2801 set_bit(FailFast, &rdev->flags);
2802 err = 0;
2803 } else if (cmd_match(buf, "-failfast")) {
2804 clear_bit(FailFast, &rdev->flags);
2805 err = 0;
2806 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
2807 !test_bit(Journal, &rdev->flags)) {
2808 if (rdev->mddev->pers == NULL) {
2809 clear_bit(In_sync, &rdev->flags);
2810 rdev->saved_raid_disk = rdev->raid_disk;
2811 rdev->raid_disk = -1;
2812 err = 0;
2813 }
2814 } else if (cmd_match(buf, "write_error")) {
2815 set_bit(WriteErrorSeen, &rdev->flags);
2816 err = 0;
2817 } else if (cmd_match(buf, "-write_error")) {
2818 clear_bit(WriteErrorSeen, &rdev->flags);
2819 err = 0;
2820 } else if (cmd_match(buf, "want_replacement")) {
2821 /* Any non-spare device that is not a replacement can
2822 * become want_replacement at any time, but we then need to
2823 * check if recovery is needed.
2824 */
2825 if (rdev->raid_disk >= 0 &&
2826 !test_bit(Journal, &rdev->flags) &&
2827 !test_bit(Replacement, &rdev->flags))
2828 set_bit(WantReplacement, &rdev->flags);
2829 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2830 md_wakeup_thread(rdev->mddev->thread);
2831 err = 0;
2832 } else if (cmd_match(buf, "-want_replacement")) {
2833 /* Clearing 'want_replacement' is always allowed.
	 * Once replacement starts it is too late though.
2835 */
2836 err = 0;
2837 clear_bit(WantReplacement, &rdev->flags);
2838 } else if (cmd_match(buf, "replacement")) {
2839 /* Can only set a device as a replacement when array has not
2840 * yet been started. Once running, replacement is automatic
2841 * from spares, or by assigning 'slot'.
2842 */
2843 if (rdev->mddev->pers)
2844 err = -EBUSY;
2845 else {
2846 set_bit(Replacement, &rdev->flags);
2847 err = 0;
2848 }
2849 } else if (cmd_match(buf, "-replacement")) {
2850 /* Similarly, can only clear Replacement before start */
2851 if (rdev->mddev->pers)
2852 err = -EBUSY;
2853 else {
2854 clear_bit(Replacement, &rdev->flags);
2855 err = 0;
2856 }
2857 } else if (cmd_match(buf, "re-add")) {
2858 if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
2859 rdev->saved_raid_disk >= 0) {
2860 /* clear_bit is performed _after_ all the devices
2861 * have their local Faulty bit cleared. If any writes
		 * happen on the local node in the meantime, they
		 * will land in the local bitmap, which will be synced
		 * by this node eventually.
2865 */
2866 if (!mddev_is_clustered(rdev->mddev) ||
2867 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
2868 clear_bit(Faulty, &rdev->flags);
2869 err = add_bound_rdev(rdev);
2870 }
2871 } else
2872 err = -EBUSY;
2873 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
2874 set_bit(ExternalBbl, &rdev->flags);
2875 rdev->badblocks.shift = 0;
2876 err = 0;
2877 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
2878 clear_bit(ExternalBbl, &rdev->flags);
2879 err = 0;
2880 }
2881 if (!err)
2882 sysfs_notify_dirent_safe(rdev->sysfs_state);
2883 return err ? err : len;
2884}
2885static struct rdev_sysfs_entry rdev_state =
2886__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
2887
2888static ssize_t
2889errors_show(struct md_rdev *rdev, char *page)
2890{
2891 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2892}
2893
2894static ssize_t
2895errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2896{
2897 unsigned int n;
2898 int rv;
2899
2900 rv = kstrtouint(buf, 10, &n);
2901 if (rv < 0)
2902 return rv;
2903 atomic_set(&rdev->corrected_errors, n);
2904 return len;
2905}
2906static struct rdev_sysfs_entry rdev_errors =
2907__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2908
2909static ssize_t
2910slot_show(struct md_rdev *rdev, char *page)
2911{
2912 if (test_bit(Journal, &rdev->flags))
2913 return sprintf(page, "journal\n");
2914 else if (rdev->raid_disk < 0)
2915 return sprintf(page, "none\n");
2916 else
2917 return sprintf(page, "%d\n", rdev->raid_disk);
2918}
2919
2920static ssize_t
2921slot_store(struct md_rdev *rdev, const char *buf,