1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* |
3 | md.h : kernel internal structure of the Linux MD driver |
4 | Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman |
5 | |
6 | */ |
7 | |
8 | #ifndef _MD_MD_H |
9 | #define _MD_MD_H |
10 | |
11 | #include <linux/blkdev.h> |
12 | #include <linux/backing-dev.h> |
13 | #include <linux/badblocks.h> |
14 | #include <linux/kobject.h> |
15 | #include <linux/list.h> |
16 | #include <linux/mm.h> |
17 | #include <linux/mutex.h> |
18 | #include <linux/timer.h> |
19 | #include <linux/wait.h> |
20 | #include <linux/workqueue.h> |
21 | #include <trace/events/block.h> |
22 | #include "md-cluster.h" |
23 | |
24 | #define MaxSector (~(sector_t)0) |
25 | |
26 | /* |
27 | * These flags should really be called "NO_RETRY" rather than |
28 | * "FAILFAST" because they don't make any promise about time lapse, |
29 | * only about the number of retries, which will be zero. |
30 | * REQ_FAILFAST_DRIVER is not included because |
31 | * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.") |
32 | * seems to suggest that the errors it avoids retrying should usually |
33 | * be retried. |
34 | */ |
35 | #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) |
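
/*
 * Illustrative sketch (not lifted from any personality): a read path would
 * typically OR MD_FAILFAST into a bio only when the device opted in via the
 * FailFast flag and another copy of the data exists to fall back on:
 *
 *	if (test_bit(FailFast, &rdev->flags) && have_another_copy)
 *		read_bio->bi_opf |= MD_FAILFAST;
 *
 * 'have_another_copy' is a hypothetical condition, not a real variable.
 */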
36 | |
37 | /* |
38 | * The struct embedded in rdev is used to serialize IO. |
39 | */ |
40 | struct serial_in_rdev { |
41 | struct rb_root_cached serial_rb; |
42 | spinlock_t serial_lock; |
43 | wait_queue_head_t serial_io_wait; |
44 | }; |
45 | |
46 | /* |
47 | * MD's 'extended' device |
48 | */ |
49 | struct md_rdev { |
50 | struct list_head same_set; /* RAID devices within the same set */ |
51 | |
	sector_t sectors;		/* Device size (in 512-byte sectors) */
53 | struct mddev *mddev; /* RAID array if running */ |
54 | int last_events; /* IO event timestamp */ |
55 | |
56 | /* |
57 | * If meta_bdev is non-NULL, it means that a separate device is |
58 | * being used to store the metadata (superblock/bitmap) which |
59 | * would otherwise be contained on the same device as the data (bdev). |
60 | */ |
61 | struct block_device *meta_bdev; |
62 | struct block_device *bdev; /* block device handle */ |
63 | struct file *bdev_file; /* Handle from open for bdev */ |
64 | |
65 | struct page *sb_page, *bb_page; |
66 | int sb_loaded; |
67 | __u64 sb_events; |
68 | sector_t data_offset; /* start of data in array */ |
69 | sector_t new_data_offset;/* only relevant while reshaping */ |
	sector_t sb_start;	/* offset of the super block (in 512-byte sectors) */
71 | int sb_size; /* bytes in the superblock */ |
72 | int preferred_minor; /* autorun support */ |
73 | |
74 | struct kobject kobj; |
75 | |
	/* A device can be in one of three states based on two flags:
	 * Not working:			faulty==1 in_sync==0
	 * Fully working:		faulty==0 in_sync==1
	 * Working, not in sync:	faulty==0 in_sync==0
	 *
	 * It can never have faulty==1 && in_sync==1; this reduces the
	 * burden of testing multiple flags in many cases.
	 */
86 | |
87 | unsigned long flags; /* bit set of 'enum flag_bits' bits. */ |
88 | wait_queue_head_t blocked_wait; |
89 | |
90 | int desc_nr; /* descriptor index in the superblock */ |
91 | int raid_disk; /* role of device in array */ |
92 | int new_raid_disk; /* role that the device will have in |
93 | * the array after a level-change completes. |
94 | */ |
95 | int saved_raid_disk; /* role that device used to have in the |
96 | * array and could again if we did a partial |
97 | * resync from the bitmap |
98 | */ |
99 | union { |
100 | sector_t recovery_offset;/* If this device has been partially |
101 | * recovered, this is where we were |
102 | * up to. |
103 | */ |
104 | sector_t journal_tail; /* If this device is a journal device, |
105 | * this is the journal tail (journal |
106 | * recovery start point) |
107 | */ |
108 | }; |
109 | |
110 | atomic_t nr_pending; /* number of pending requests. |
111 | * only maintained for arrays that |
112 | * support hot removal |
113 | */ |
114 | atomic_t read_errors; /* number of consecutive read errors that |
115 | * we have tried to ignore. |
116 | */ |
	time64_t last_read_error;	/* monotonic time of our
					 * last read error
					 */
120 | atomic_t corrected_errors; /* number of corrected read errors, |
121 | * for reporting to userspace and storing |
122 | * in superblock. |
123 | */ |
124 | |
125 | struct serial_in_rdev *serial; /* used for raid1 io serialization */ |
126 | |
127 | struct kernfs_node *sysfs_state; /* handle for 'state' |
128 | * sysfs entry */ |
129 | /* handle for 'unacknowledged_bad_blocks' sysfs dentry */ |
130 | struct kernfs_node *sysfs_unack_badblocks; |
131 | /* handle for 'bad_blocks' sysfs dentry */ |
132 | struct kernfs_node *sysfs_badblocks; |
133 | struct badblocks badblocks; |
134 | |
135 | struct { |
136 | short offset; /* Offset from superblock to start of PPL. |
137 | * Not used by external metadata. */ |
138 | unsigned int size; /* Size in sectors of the PPL space */ |
139 | sector_t sector; /* First sector of the PPL space */ |
140 | } ppl; |
141 | }; |

enum flag_bits {
143 | Faulty, /* device is known to have a fault */ |
144 | In_sync, /* device is in_sync with rest of array */ |
145 | Bitmap_sync, /* ..actually, not quite In_sync. Need a |
146 | * bitmap-based recovery to get fully in sync. |
147 | * The bit is only meaningful before device |
148 | * has been passed to pers->hot_add_disk. |
149 | */ |
150 | WriteMostly, /* Avoid reading if at all possible */ |
151 | AutoDetected, /* added by auto-detect */ |
152 | Blocked, /* An error occurred but has not yet |
153 | * been acknowledged by the metadata |
154 | * handler, so don't allow writes |
155 | * until it is cleared */ |
156 | WriteErrorSeen, /* A write error has been seen on this |
157 | * device |
158 | */ |
159 | FaultRecorded, /* Intermediate state for clearing |
160 | * Blocked. The Fault is/will-be |
161 | * recorded in the metadata, but that |
162 | * metadata hasn't been stored safely |
163 | * on disk yet. |
164 | */ |
165 | BlockedBadBlocks, /* A writer is blocked because they |
166 | * found an unacknowledged bad-block. |
167 | * This can safely be cleared at any |
168 | * time, and the writer will re-check. |
169 | * It may be set at any time, and at |
170 | * worst the writer will timeout and |
171 | * re-check. So setting it as |
172 | * accurately as possible is good, but |
173 | * not absolutely critical. |
174 | */ |
175 | WantReplacement, /* This device is a candidate to be |
176 | * hot-replaced, either because it has |
177 | * reported some faults, or because |
178 | * of explicit request. |
179 | */ |
180 | Replacement, /* This device is a replacement for |
181 | * a want_replacement device with same |
182 | * raid_disk number. |
183 | */ |
184 | Candidate, /* For clustered environments only: |
185 | * This device is seen locally but not |
186 | * by the whole cluster |
187 | */ |
188 | Journal, /* This device is used as journal for |
189 | * raid-5/6. |
190 | * Usually, this device should be faster |
191 | * than other devices in the array |
192 | */ |
193 | ClusterRemove, |
194 | ExternalBbl, /* External metadata provides bad |
195 | * block management for a disk |
196 | */ |
197 | FailFast, /* Minimal retries should be attempted on |
198 | * this device, so use REQ_FAILFAST_DEV. |
199 | * Also don't try to repair failed reads. |
				 * It is expected that no bad-block log
201 | * is present. |
202 | */ |
203 | LastDev, /* Seems to be the last working dev as |
204 | * it didn't fail, so don't use FailFast |
205 | * any more for metadata |
206 | */ |
	CollisionCheck,		/*
				 * check whether there is a collision between
				 * raid1 serial bios.
				 */
211 | Nonrot, /* non-rotational device (SSD) */ |
212 | }; |
213 | |
214 | static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, |
215 | sector_t *first_bad, int *bad_sectors) |
216 | { |
217 | if (unlikely(rdev->badblocks.count)) { |
		int rv = badblocks_check(&rdev->badblocks,
					 rdev->data_offset + s,
					 sectors, first_bad, bad_sectors);
221 | if (rv) |
222 | *first_bad -= rdev->data_offset; |
223 | return rv; |
224 | } |
225 | return 0; |
226 | } |
227 | |
228 | static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s, |
229 | int sectors) |
230 | { |
231 | sector_t first_bad; |
232 | int bad_sectors; |
233 | |
	return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors);
235 | } |
236 | |
237 | extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
238 | int is_new); |
239 | extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
240 | int is_new); |
241 | struct md_cluster_info; |
242 | |
243 | /** |
244 | * enum mddev_flags - md device flags. |
245 | * @MD_ARRAY_FIRST_USE: First use of array, needs initialization. |
246 | * @MD_CLOSING: If set, we are closing the array, do not open it then. |
247 | * @MD_JOURNAL_CLEAN: A raid with journal is already clean. |
248 | * @MD_HAS_JOURNAL: The raid array has journal feature set. |
 * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only; this node has already taken
 *			      the resync lock, which must be released later.
251 | * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as |
252 | * calls to md_error() will never cause the array to |
253 | * become failed. |
254 | * @MD_HAS_PPL: The raid array has PPL feature set. |
255 | * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set. |
 * @MD_NOT_READY: do_md_run() is active, so 'array_state' must not report
 *		  that the array is ready yet.
258 | * @MD_BROKEN: This is used to stop writes and mark array as failed. |
259 | * @MD_DELETED: This device is being deleted |
260 | * |
261 | * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added |
262 | */ |
263 | enum mddev_flags { |
264 | MD_ARRAY_FIRST_USE, |
265 | MD_CLOSING, |
266 | MD_JOURNAL_CLEAN, |
267 | MD_HAS_JOURNAL, |
268 | MD_CLUSTER_RESYNC_LOCKED, |
269 | MD_FAILFAST_SUPPORTED, |
270 | MD_HAS_PPL, |
271 | MD_HAS_MULTIPLE_PPLS, |
272 | MD_NOT_READY, |
273 | MD_BROKEN, |
274 | MD_DELETED, |
275 | }; |
276 | |
277 | enum mddev_sb_flags { |
278 | MD_SB_CHANGE_DEVS, /* Some device status has changed */ |
279 | MD_SB_CHANGE_CLEAN, /* transition to or from 'clean' */ |
280 | MD_SB_CHANGE_PENDING, /* switch from 'clean' to 'active' in progress */ |
281 | MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ |
282 | }; |
283 | |
284 | #define NR_SERIAL_INFOS 8 |
285 | /* record current range of serialize IOs */ |
286 | struct serial_info { |
287 | struct rb_node node; |
288 | sector_t start; /* start sector of rb node */ |
289 | sector_t last; /* end sector of rb node */ |
290 | sector_t _subtree_last; /* highest sector in subtree of rb node */ |
291 | }; |
292 | |
293 | /* |
294 | * mddev->curr_resync stores the current sector of the resync but |
295 | * also has some overloaded values. |
296 | */ |
297 | enum { |
298 | /* No resync in progress */ |
299 | MD_RESYNC_NONE = 0, |
300 | /* Yielded to allow another conflicting resync to commence */ |
301 | MD_RESYNC_YIELDED = 1, |
302 | /* Delayed to check that there is no conflict with another sync */ |
303 | MD_RESYNC_DELAYED = 2, |
304 | /* Any value greater than or equal to this is in an active resync */ |
305 | MD_RESYNC_ACTIVE = 3, |
306 | }; |
307 | |
308 | struct mddev { |
309 | void *private; |
310 | struct md_personality *pers; |
311 | dev_t unit; |
312 | int md_minor; |
313 | struct list_head disks; |
314 | unsigned long flags; |
315 | unsigned long sb_flags; |
316 | |
317 | int suspended; |
318 | struct mutex suspend_mutex; |
319 | struct percpu_ref active_io; |
320 | int ro; |
321 | int sysfs_active; /* set when sysfs deletes |
322 | * are happening, so run/ |
323 | * takeover/stop are not safe |
324 | */ |
325 | struct gendisk *gendisk; |
326 | |
327 | struct kobject kobj; |
328 | int hold_active; |
329 | #define UNTIL_IOCTL 1 |
330 | #define UNTIL_STOP 2 |
331 | |
332 | /* Superblock information */ |
333 | int major_version, |
334 | minor_version, |
335 | patch_version; |
336 | int persistent; |
337 | int external; /* metadata is |
338 | * managed externally */ |
	char metadata_type[17]; /* externally set */
340 | int chunk_sectors; |
341 | time64_t ctime, utime; |
342 | int level, layout; |
343 | char clevel[16]; |
344 | int raid_disks; |
345 | int max_disks; |
346 | sector_t dev_sectors; /* used size of |
347 | * component devices */ |
348 | sector_t array_sectors; /* exported array size */ |
349 | int external_size; /* size managed |
350 | * externally */ |
351 | __u64 events; |
352 | /* If the last 'event' was simply a clean->dirty transition, and |
353 | * we didn't write it to the spares, then it is safe and simple |
354 | * to just decrement the event count on a dirty->clean transition. |
355 | * So we record that possibility here. |
356 | */ |
357 | int can_decrease_events; |
358 | |
359 | char uuid[16]; |
360 | |
361 | /* If the array is being reshaped, we need to record the |
362 | * new shape and an indication of where we are up to. |
363 | * This is written to the superblock. |
364 | * If reshape_position is MaxSector, then no reshape is happening (yet). |
365 | */ |
366 | sector_t reshape_position; |
367 | int delta_disks, new_level, new_layout; |
368 | int new_chunk_sectors; |
369 | int reshape_backwards; |
370 | |
371 | struct md_thread __rcu *thread; /* management thread */ |
372 | struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */ |
373 | |
374 | /* 'last_sync_action' is initialized to "none". It is set when a |
	 * sync operation (i.e. "data-check", "requested-resync", "resync",
376 | * "recovery", or "reshape") is started. It holds this value even |
377 | * when the sync thread is "frozen" (interrupted) or "idle" (stopped |
378 | * or finished). It is overwritten when a new sync operation is begun. |
379 | */ |
380 | char *last_sync_action; |
381 | sector_t curr_resync; /* last block scheduled */ |
382 | /* As resync requests can complete out of order, we cannot easily track |
383 | * how much resync has been completed. So we occasionally pause until |
384 | * everything completes, then set curr_resync_completed to curr_resync. |
385 | * As such it may be well behind the real resync mark, but it is a value |
386 | * we are certain of. |
387 | */ |
388 | sector_t curr_resync_completed; |
389 | unsigned long resync_mark; /* a recent timestamp */ |
390 | sector_t resync_mark_cnt;/* blocks written at resync_mark */ |
391 | sector_t curr_mark_cnt; /* blocks scheduled now */ |
392 | |
393 | sector_t resync_max_sectors; /* may be set by personality */ |
394 | |
395 | atomic64_t resync_mismatches; /* count of sectors where |
396 | * parity/replica mismatch found |
397 | */ |
398 | |
399 | /* allow user-space to request suspension of IO to regions of the array */ |
400 | sector_t suspend_lo; |
401 | sector_t suspend_hi; |
402 | /* if zero, use the system-wide default */ |
403 | int sync_speed_min; |
404 | int sync_speed_max; |
405 | |
406 | /* resync even though the same disks are shared among md-devices */ |
407 | int parallel_resync; |
408 | |
409 | int ok_start_degraded; |
410 | |
411 | unsigned long recovery; |
412 | /* If a RAID personality determines that recovery (of a particular |
413 | * device) will fail due to a read error on the source device, it |
414 | * takes a copy of this number and does not attempt recovery again |
415 | * until this number changes. |
416 | */ |
417 | int recovery_disabled; |
418 | |
419 | int in_sync; /* know to not need resync */ |
420 | /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so |
421 | * that we are never stopping an array while it is open. |
422 | * 'reconfig_mutex' protects all other reconfiguration. |
423 | * These locks are separate due to conflicting interactions |
424 | * with disk->open_mutex. |
425 | * Lock ordering is: |
426 | * reconfig_mutex -> disk->open_mutex |
427 | * disk->open_mutex -> open_mutex: e.g. __blkdev_get -> md_open |
428 | */ |
429 | struct mutex open_mutex; |
430 | struct mutex reconfig_mutex; |
431 | atomic_t active; /* general refcount */ |
432 | atomic_t openers; /* number of active opens */ |
433 | |
434 | int changed; /* True if we might need to |
435 | * reread partition info */ |
436 | int degraded; /* whether md should consider |
437 | * adding a spare |
438 | */ |
439 | |
440 | atomic_t recovery_active; /* blocks scheduled, but not written */ |
441 | wait_queue_head_t recovery_wait; |
442 | sector_t recovery_cp; |
443 | sector_t resync_min; /* user requested sync |
444 | * starts here */ |
445 | sector_t resync_max; /* resync should pause |
446 | * when it gets here */ |
447 | |
448 | struct kernfs_node *sysfs_state; /* handle for 'array_state' |
449 | * file in sysfs. |
450 | */ |
451 | struct kernfs_node *sysfs_action; /* handle for 'sync_action' */ |
	struct kernfs_node *sysfs_completed;	/* handle for 'sync_completed' */
	struct kernfs_node *sysfs_degraded;	/* handle for 'degraded' */
	struct kernfs_node *sysfs_level;	/* handle for 'level' */
455 | |
456 | /* used for delayed sysfs removal */ |
457 | struct work_struct del_work; |
458 | /* used for register new sync thread */ |
459 | struct work_struct sync_work; |
460 | |
461 | /* "lock" protects: |
462 | * flush_bio transition from NULL to !NULL |
463 | * rdev superblocks, events |
464 | * clearing MD_CHANGE_* |
465 | * in_sync - and related safemode and MD_CHANGE changes |
466 | * pers (also protected by reconfig_mutex and pending IO). |
467 | * clearing ->bitmap |
468 | * clearing ->bitmap_info.file |
469 | * changing ->resync_{min,max} |
470 | * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max}) |
471 | */ |
472 | spinlock_t lock; |
473 | wait_queue_head_t sb_wait; /* for waiting on superblock updates */ |
474 | atomic_t pending_writes; /* number of active superblock writes */ |
475 | |
476 | unsigned int safemode; /* if set, update "clean" superblock |
477 | * when no writes pending. |
478 | */ |
479 | unsigned int safemode_delay; |
480 | struct timer_list safemode_timer; |
481 | struct percpu_ref writes_pending; |
482 | int sync_checkers; /* # of threads checking writes_pending */ |
483 | |
484 | struct bitmap *bitmap; /* the bitmap for the device */ |
485 | struct { |
486 | struct file *file; /* the bitmap file */ |
487 | loff_t offset; /* offset from superblock of |
488 | * start of bitmap. May be |
489 | * negative, but not '0' |
490 | * For external metadata, offset |
491 | * from start of device. |
492 | */ |
493 | unsigned long space; /* space available at this offset */ |
494 | loff_t default_offset; /* this is the offset to use when |
495 | * hot-adding a bitmap. It should |
496 | * eventually be settable by sysfs. |
497 | */ |
498 | unsigned long default_space; /* space available at |
499 | * default offset */ |
500 | struct mutex mutex; |
501 | unsigned long chunksize; |
502 | unsigned long daemon_sleep; /* how many jiffies between updates? */ |
503 | unsigned long max_write_behind; /* write-behind mode */ |
504 | int external; |
505 | int nodes; /* Maximum number of nodes in the cluster */ |
506 | char cluster_name[64]; /* Name of the cluster */ |
507 | } bitmap_info; |
508 | |
509 | atomic_t max_corr_read_errors; /* max read retries */ |
510 | struct list_head all_mddevs; |
511 | |
512 | const struct attribute_group *to_remove; |
513 | |
514 | struct bio_set bio_set; |
515 | struct bio_set sync_set; /* for sync operations like |
516 | * metadata and bitmap writes |
517 | */ |
518 | struct bio_set io_clone_set; |
519 | |
520 | /* Generic flush handling. |
521 | * The last to finish preflush schedules a worker to submit |
522 | * the rest of the request (without the REQ_PREFLUSH flag). |
523 | */ |
524 | struct bio *flush_bio; |
525 | atomic_t flush_pending; |
526 | ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed |
527 | * flush was started. |
528 | */ |
529 | struct work_struct flush_work; |
530 | struct work_struct event_work; /* used by dm to report failure event */ |
531 | mempool_t *serial_info_pool; |
532 | void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); |
533 | struct md_cluster_info *cluster_info; |
534 | unsigned int good_device_nr; /* good device num within cluster raid */ |
535 | unsigned int noio_flag; /* for memalloc scope API */ |
536 | |
537 | /* |
538 | * Temporarily store rdev that will be finally removed when |
539 | * reconfig_mutex is unlocked, protected by reconfig_mutex. |
540 | */ |
541 | struct list_head deleting; |
542 | |
543 | /* Used to synchronize idle and frozen for action_store() */ |
544 | struct mutex sync_mutex; |
545 | /* The sequence number for sync thread */ |
546 | atomic_t sync_seq; |
547 | |
548 | bool has_superblocks:1; |
549 | bool fail_last_dev:1; |
550 | bool serialize_policy:1; |
551 | }; |
552 | |
553 | enum recovery_flags { |
554 | /* |
	 * If neither SYNC nor RESHAPE is set, then it is a recovery.
556 | */ |
557 | MD_RECOVERY_RUNNING, /* a thread is running, or about to be started */ |
558 | MD_RECOVERY_SYNC, /* actually doing a resync, not a recovery */ |
559 | MD_RECOVERY_RECOVER, /* doing recovery, or need to try it. */ |
560 | MD_RECOVERY_INTR, /* resync needs to be aborted for some reason */ |
561 | MD_RECOVERY_DONE, /* thread is done and is waiting to be reaped */ |
562 | MD_RECOVERY_NEEDED, /* we might need to start a resync/recover */ |
563 | MD_RECOVERY_REQUESTED, /* user-space has requested a sync (used with SYNC) */ |
564 | MD_RECOVERY_CHECK, /* user-space request for check-only, no repair */ |
565 | MD_RECOVERY_RESHAPE, /* A reshape is happening */ |
566 | MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */ |
	MD_RECOVERY_ERROR,	/* sync-action interrupted because of an IO error */
568 | MD_RECOVERY_WAIT, /* waiting for pers->start() to finish */ |
569 | MD_RESYNCING_REMOTE, /* remote node is running resync thread */ |
570 | }; |
571 | |
572 | enum md_ro_state { |
573 | MD_RDWR, |
574 | MD_RDONLY, |
575 | MD_AUTO_READ, |
576 | MD_MAX_STATE |
577 | }; |
578 | |
579 | static inline bool md_is_rdwr(struct mddev *mddev) |
580 | { |
581 | return (mddev->ro == MD_RDWR); |
582 | } |
583 | |
584 | static inline bool reshape_interrupted(struct mddev *mddev) |
585 | { |
	/* reshape never started */
587 | if (mddev->reshape_position == MaxSector) |
588 | return false; |
589 | |
590 | /* interrupted */ |
591 | if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
592 | return true; |
593 | |
594 | /* running reshape will be interrupted soon. */ |
595 | if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || |
596 | test_bit(MD_RECOVERY_INTR, &mddev->recovery) || |
597 | test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) |
598 | return true; |
599 | |
600 | return false; |
601 | } |
602 | |
603 | static inline int __must_check mddev_lock(struct mddev *mddev) |
604 | { |
605 | return mutex_lock_interruptible(&mddev->reconfig_mutex); |
606 | } |
607 | |
608 | /* Sometimes we need to take the lock in a situation where |
609 | * failure due to interrupts is not acceptable. |
610 | */ |
611 | static inline void mddev_lock_nointr(struct mddev *mddev) |
612 | { |
613 | mutex_lock(&mddev->reconfig_mutex); |
614 | } |
615 | |
616 | static inline int mddev_trylock(struct mddev *mddev) |
617 | { |
	return mutex_trylock(&mddev->reconfig_mutex);
619 | } |
620 | extern void mddev_unlock(struct mddev *mddev); |
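
/*
 * Typical reconfiguration pattern (sketch): take reconfig_mutex
 * interruptibly, bail out if a signal arrives, and always drop it via
 * mddev_unlock() so deferred rdev deletion and sysfs updates run:
 *
 *	int err = mddev_lock(mddev);
 *
 *	if (err)
 *		return err;
 *	... modify the array configuration ...
 *	mddev_unlock(mddev);
 */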
621 | |
622 | static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) |
623 | { |
	atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
625 | } |
626 | |
627 | static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) |
628 | { |
	md_sync_acct(bio->bi_bdev, nr_sectors);
630 | } |
631 | |
632 | struct md_personality |
633 | { |
634 | char *name; |
635 | int level; |
636 | struct list_head list; |
637 | struct module *owner; |
638 | bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio); |
639 | /* |
	 * start up work that does NOT require an md_thread; tasks that
	 * require an md_thread should go into start()
642 | */ |
643 | int (*run)(struct mddev *mddev); |
644 | /* start up works that require md threads */ |
645 | int (*start)(struct mddev *mddev); |
646 | void (*free)(struct mddev *mddev, void *priv); |
647 | void (*status)(struct seq_file *seq, struct mddev *mddev); |
648 | /* error_handler must set ->faulty and clear ->in_sync |
649 | * if appropriate, and should abort recovery if needed |
650 | */ |
651 | void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); |
652 | int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); |
653 | int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); |
654 | int (*spare_active) (struct mddev *mddev); |
655 | sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped); |
656 | int (*resize) (struct mddev *mddev, sector_t sectors); |
657 | sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); |
658 | int (*check_reshape) (struct mddev *mddev); |
659 | int (*start_reshape) (struct mddev *mddev); |
660 | void (*finish_reshape) (struct mddev *mddev); |
661 | void (*update_reshape_pos) (struct mddev *mddev); |
662 | void (*prepare_suspend) (struct mddev *mddev); |
663 | /* quiesce suspends or resumes internal processing. |
664 | * 1 - stop new actions and wait for action io to complete |
665 | * 0 - return to normal behaviour |
666 | */ |
667 | void (*quiesce) (struct mddev *mddev, int quiesce); |
668 | /* takeover is used to transition an array from one |
669 | * personality to another. The new personality must be able |
670 | * to handle the data in the current layout. |
671 | * e.g. 2drive raid1 -> 2drive raid5 |
672 | * ndrive raid5 -> degraded n+1drive raid6 with special layout |
673 | * If the takeover succeeds, a new 'private' structure is returned. |
674 | * This needs to be installed and then ->run used to activate the |
675 | * array. |
676 | */ |
677 | void *(*takeover) (struct mddev *mddev); |
678 | /* Changes the consistency policy of an active array. */ |
679 | int (*change_consistency_policy)(struct mddev *mddev, const char *buf); |
680 | }; |
681 | |
682 | struct md_sysfs_entry { |
683 | struct attribute attr; |
684 | ssize_t (*show)(struct mddev *, char *); |
685 | ssize_t (*store)(struct mddev *, const char *, size_t); |
686 | }; |
687 | extern const struct attribute_group md_bitmap_group; |
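
/*
 * Definition sketch (hypothetical attribute name): md.c builds its sysfs
 * attributes from this struct, e.g. with the generic __ATTR_RO() helper:
 *
 *	static ssize_t example_show(struct mddev *mddev, char *page)
 *	{
 *		return sysfs_emit(page, "%d\n", mddev->ro);
 *	}
 *
 *	static struct md_sysfs_entry md_example = __ATTR_RO(example);
 */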
688 | |
689 | static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) |
690 | { |
691 | if (sd) |
		return sysfs_get_dirent(sd, name);
693 | return sd; |
694 | } |
695 | static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd) |
696 | { |
697 | if (sd) |
		sysfs_notify_dirent(sd);
699 | } |
700 | |
static inline char *mdname(struct mddev *mddev)
{
	return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
}
705 | |
706 | static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) |
707 | { |
708 | char nm[20]; |
709 | if (!test_bit(Replacement, &rdev->flags) && |
710 | !test_bit(Journal, &rdev->flags) && |
711 | mddev->kobj.sd) { |
		sprintf(nm, "rd%d", rdev->raid_disk);
		return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
714 | } else |
715 | return 0; |
716 | } |
717 | |
718 | static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) |
719 | { |
720 | char nm[20]; |
721 | if (!test_bit(Replacement, &rdev->flags) && |
722 | !test_bit(Journal, &rdev->flags) && |
723 | mddev->kobj.sd) { |
		sprintf(nm, "rd%d", rdev->raid_disk);
		sysfs_remove_link(&mddev->kobj, nm);
726 | } |
727 | } |
728 | |
729 | /* |
730 | * iterates through some rdev ringlist. It's safe to remove the |
 * current 'rdev'. Don't touch 'tmp' though.
732 | */ |
733 | #define rdev_for_each_list(rdev, tmp, head) \ |
734 | list_for_each_entry_safe(rdev, tmp, head, same_set) |
735 | |
736 | /* |
737 | * iterates through the 'same array disks' ringlist |
738 | */ |
739 | #define rdev_for_each(rdev, mddev) \ |
740 | list_for_each_entry(rdev, &((mddev)->disks), same_set) |
741 | |
742 | #define rdev_for_each_safe(rdev, tmp, mddev) \ |
743 | list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) |
744 | |
745 | #define rdev_for_each_rcu(rdev, mddev) \ |
746 | list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) |
747 | |
748 | struct md_thread { |
749 | void (*run) (struct md_thread *thread); |
750 | struct mddev *mddev; |
751 | wait_queue_head_t wqueue; |
752 | unsigned long flags; |
753 | struct task_struct *tsk; |
754 | unsigned long timeout; |
755 | void *private; |
756 | }; |
757 | |
758 | struct md_io_clone { |
759 | struct mddev *mddev; |
760 | struct bio *orig_bio; |
761 | unsigned long start_time; |
762 | struct bio bio_clone; |
763 | }; |
764 | |
765 | #define THREAD_WAKEUP 0 |
766 | |
767 | static inline void safe_put_page(struct page *p) |
768 | { |
	if (p) put_page(p);
770 | } |
771 | |
772 | extern int register_md_personality(struct md_personality *p); |
773 | extern int unregister_md_personality(struct md_personality *p); |
774 | extern int register_md_cluster_operations(struct md_cluster_operations *ops, |
775 | struct module *module); |
776 | extern int unregister_md_cluster_operations(void); |
777 | extern int md_setup_cluster(struct mddev *mddev, int nodes); |
778 | extern void md_cluster_stop(struct mddev *mddev); |
779 | extern struct md_thread *md_register_thread( |
780 | void (*run)(struct md_thread *thread), |
781 | struct mddev *mddev, |
782 | const char *name); |
783 | extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp); |
784 | extern void md_wakeup_thread(struct md_thread __rcu *thread); |
785 | extern void md_check_recovery(struct mddev *mddev); |
786 | extern void md_reap_sync_thread(struct mddev *mddev); |
787 | extern bool md_write_start(struct mddev *mddev, struct bio *bi); |
788 | extern void md_write_inc(struct mddev *mddev, struct bio *bi); |
789 | extern void md_write_end(struct mddev *mddev); |
790 | extern void md_done_sync(struct mddev *mddev, int blocks, int ok); |
791 | extern void md_error(struct mddev *mddev, struct md_rdev *rdev); |
792 | extern void md_finish_reshape(struct mddev *mddev); |
793 | void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, |
794 | struct bio *bio, sector_t start, sector_t size); |
795 | void md_account_bio(struct mddev *mddev, struct bio **bio); |
796 | void md_free_cloned_bio(struct bio *bio); |
797 | |
798 | extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); |
799 | extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, |
800 | sector_t sector, int size, struct page *page); |
801 | extern int md_super_wait(struct mddev *mddev); |
802 | extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, |
803 | struct page *page, blk_opf_t opf, bool metadata_op); |
804 | extern void md_do_sync(struct md_thread *thread); |
805 | extern void md_new_event(void); |
806 | extern void md_allow_write(struct mddev *mddev); |
807 | extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); |
808 | extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); |
809 | extern int md_check_no_bitmap(struct mddev *mddev); |
810 | extern int md_integrity_register(struct mddev *mddev); |
811 | extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); |
812 | extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); |
813 | |
814 | extern int mddev_init(struct mddev *mddev); |
815 | extern void mddev_destroy(struct mddev *mddev); |
816 | struct mddev *md_alloc(dev_t dev, char *name); |
817 | void mddev_put(struct mddev *mddev); |
818 | extern int md_run(struct mddev *mddev); |
819 | extern int md_start(struct mddev *mddev); |
820 | extern void md_stop(struct mddev *mddev); |
821 | extern void md_stop_writes(struct mddev *mddev); |
822 | extern int md_rdev_init(struct md_rdev *rdev); |
823 | extern void md_rdev_clear(struct md_rdev *rdev); |
824 | |
825 | extern bool md_handle_request(struct mddev *mddev, struct bio *bio); |
826 | extern int mddev_suspend(struct mddev *mddev, bool interruptible); |
827 | extern void mddev_resume(struct mddev *mddev); |
828 | extern void md_idle_sync_thread(struct mddev *mddev); |
829 | extern void md_frozen_sync_thread(struct mddev *mddev); |
830 | extern void md_unfrozen_sync_thread(struct mddev *mddev); |
831 | |
832 | extern void md_reload_sb(struct mddev *mddev, int raid_disk); |
833 | extern void md_update_sb(struct mddev *mddev, int force); |
834 | extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev); |
835 | extern void mddev_destroy_serial_pool(struct mddev *mddev, |
836 | struct md_rdev *rdev); |
837 | struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); |
838 | struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); |
839 | |
840 | static inline bool is_rdev_broken(struct md_rdev *rdev) |
841 | { |
	return !disk_live(rdev->bdev->bd_disk);
843 | } |
844 | |
845 | static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) |
846 | { |
847 | int faulty = test_bit(Faulty, &rdev->flags); |
	if (atomic_dec_and_test(&rdev->nr_pending) && faulty) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
851 | } |
852 | } |
853 | |
854 | extern struct md_cluster_operations *md_cluster_ops; |
855 | static inline int mddev_is_clustered(struct mddev *mddev) |
856 | { |
857 | return mddev->cluster_info && mddev->bitmap_info.nodes > 1; |
858 | } |
859 | |
860 | /* clear unsupported mddev_flags */ |
861 | static inline void mddev_clear_unsupported_flags(struct mddev *mddev, |
862 | unsigned long unsupported_flags) |
863 | { |
864 | mddev->flags &= ~unsupported_flags; |
865 | } |
866 | |
867 | static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio) |
868 | { |
869 | if (bio_op(bio) == REQ_OP_WRITE_ZEROES && |
870 | !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors) |
871 | mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0; |
872 | } |
873 | |
874 | static inline int mddev_suspend_and_lock(struct mddev *mddev) |
875 | { |
876 | int ret; |
877 | |
	ret = mddev_suspend(mddev, true);
879 | if (ret) |
880 | return ret; |
881 | |
882 | ret = mddev_lock(mddev); |
883 | if (ret) |
884 | mddev_resume(mddev); |
885 | |
886 | return ret; |
887 | } |
888 | |
889 | static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev) |
890 | { |
	mddev_suspend(mddev, false);
892 | mutex_lock(&mddev->reconfig_mutex); |
893 | } |
894 | |
895 | static inline void mddev_unlock_and_resume(struct mddev *mddev) |
896 | { |
897 | mddev_unlock(mddev); |
898 | mddev_resume(mddev); |
899 | } |
900 | |
901 | struct mdu_array_info_s; |
902 | struct mdu_disk_info_s; |
903 | |
904 | extern int mdp_major; |
905 | extern struct workqueue_struct *md_bitmap_wq; |
906 | void md_autostart_arrays(int part); |
907 | int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); |
908 | int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); |
909 | int do_md_run(struct mddev *mddev); |
910 | void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim); |
911 | int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev); |
912 | void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes); |
913 | |
914 | extern const struct block_device_operations md_fops; |
915 | |
916 | /* |
 * MD devices can be used underneath by DM, in which case ->gendisk is NULL.
918 | */ |
919 | static inline bool mddev_is_dm(struct mddev *mddev) |
920 | { |
921 | return !mddev->gendisk; |
922 | } |
923 | |
924 | static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio, |
925 | sector_t sector) |
926 | { |
927 | if (!mddev_is_dm(mddev)) |
		trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector);
929 | } |
930 | |
931 | #define mddev_add_trace_msg(mddev, fmt, args...) \ |
932 | do { \ |
933 | if (!mddev_is_dm(mddev)) \ |
934 | blk_add_trace_msg((mddev)->gendisk->queue, fmt, ##args); \ |
935 | } while (0) |
936 | |
937 | #endif /* _MD_MD_H */ |
938 | |