raid1.c source code [linux/drivers/md/raid1.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* raid1.c : Multiple Devices driver for Linux
4	*
5	* Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
6	*
7	* Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
8	*
9	* RAID-1 management functions.
10	*
11	* Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
12	*
13	* Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
14	* Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
15	*
16	* Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
17	* bitmapped intelligence in resync:
18	*
19	* - bitmap marked during normal i/o
20	* - bitmap used to skip nondirty blocks during sync
21	*
22	* Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
23	* - persistent bitmap code
24	*/
25
26	#include <linux/slab.h>
27	#include <linux/delay.h>
28	#include <linux/blkdev.h>
29	#include <linux/module.h>
30	#include <linux/seq_file.h>
31	#include <linux/ratelimit.h>
32	#include <linux/interval_tree_generic.h>
33
34	#include <trace/events/block.h>
35
36	#include "md.h"
37	#include "raid1.h"
38	#include "md-bitmap.h"
39
40	#define UNSUPPORTED_MDDEV_FLAGS \
41	((1L << MD_HAS_JOURNAL) \| \
42	(1L << MD_JOURNAL_CLEAN) \| \
43	(1L << MD_HAS_PPL) \| \
44	(1L << MD_HAS_MULTIPLE_PPLS))
45
46	static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
47	static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
48
49	#define RAID_1_10_NAME "raid1"
50	#include "raid1-10.c"
51
52	#define START(node) ((node)->start)
53	#define LAST(node) ((node)->last)
54	INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
55	START, LAST, static inline, raid1_rb);
56
57	static int check_and_add_serial(struct md_rdev rdev, struct* r1bio *r1_bio,
58	struct serial_info si, int* idx)
59	{
60	unsigned long flags;
61	int ret = `0`;
62	sector_t lo = r1_bio->sector;
63	sector_t hi = lo + r1_bio->sectors;
64	struct serial_in_rdev *serial = &rdev->serial[idx];
65
66	spin_lock_irqsave(&serial->serial_lock, flags);
67	/ collision happened /
68	if (raid1_rb_iter_first(root: &serial->serial_rb, start: lo, last: hi))
69	ret = -EBUSY;
70	else {
71	si->start = lo;
72	si->last = hi;
73	raid1_rb_insert(node: si, root: &serial->serial_rb);
74	}
75	spin_unlock_irqrestore(lock: &serial->serial_lock, flags);
76
77	return ret;
78	}
79
80	static void wait_for_serialization(struct md_rdev rdev, struct* r1bio *r1_bio)
81	{
82	struct mddev *mddev = rdev->mddev;
83	struct serial_info *si;
84	int idx = sector_to_idx(sector: r1_bio->sector);
85	struct serial_in_rdev *serial = &rdev->serial[idx];
86
87	if (WARN_ON(!mddev->serial_info_pool))
88	return;
89	si = mempool_alloc(pool: mddev->serial_info_pool, GFP_NOIO);
90	wait_event(serial->serial_io_wait,
91	check_and_add_serial(rdev, r1_bio, si, idx) == `0`);
92	}
93
94	static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
95	{
96	struct serial_info *si;
97	unsigned long flags;
98	int found = `0`;
99	struct mddev *mddev = rdev->mddev;
100	int idx = sector_to_idx(sector: lo);
101	struct serial_in_rdev *serial = &rdev->serial[idx];
102
103	spin_lock_irqsave(&serial->serial_lock, flags);
104	for (si = raid1_rb_iter_first(root: &serial->serial_rb, start: lo, last: hi);
105	si; si = raid1_rb_iter_next(node: si, start: lo, last: hi)) {
106	if (si->start == lo && si->last == hi) {
107	raid1_rb_remove(node: si, root: &serial->serial_rb);
108	mempool_free(element: si, pool: mddev->serial_info_pool);
109	found = `1`;
110	break;
111	}
112	}
113	if (!found)
114	WARN(`1`, "The write IO is not recorded for serialization\n");
115	spin_unlock_irqrestore(lock: &serial->serial_lock, flags);
116	wake_up(&serial->serial_io_wait);
117	}
118
119	/*
120	* for resync bio, r1bio pointer can be retrieved from the per-bio
121	* 'struct resync_pages'.
122	*/
123	static inline struct r1bio get_resync_r1bio(struct* bio *bio)
124	{
125	return get_resync_pages(bio)->raid_bio;
126	}
127
128	static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
129	{
130	struct pool_info *pi = data;
131	int size = offsetof(struct r1bio, bios[pi->raid_disks]);
132
133	/ allocate a r1bio with room for raid_disks entries in the bios array /
134	return kzalloc(size, flags: gfp_flags);
135	}
136
/*
 * Resync sizing constants. All but RESYNC_DEPTH derive from RESYNC_BLOCK_SIZE,
 * which is defined outside this section. The *_SECTORS variants shift the
 * byte value right by 9, i.e. divide by 512.
 */
#define RESYNC_DEPTH 32	/* raise_barrier() waits while barrier[idx] >= this */
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
143
144	static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
145	{
146	struct pool_info *pi = data;
147	struct r1bio *r1_bio;
148	struct bio *bio;
149	int need_pages;
150	int j;
151	struct resync_pages *rps;
152
153	r1_bio = r1bio_pool_alloc(gfp_flags, data: pi);
154	if (!r1_bio)
155	return NULL;
156
157	rps = kmalloc_array(n: pi->raid_disks, size: sizeof(struct resync_pages),
158	flags: gfp_flags);
159	if (!rps)
160	goto out_free_r1bio;
161
162	/*
163	* Allocate bios : 1 for reading, n-1 for writing
164	*/
165	for (j = pi->raid_disks ; j-- ; ) {
166	bio = bio_kmalloc(RESYNC_PAGES, gfp_mask: gfp_flags);
167	if (!bio)
168	goto out_free_bio;
169	bio_init(bio, NULL, table: bio->bi_inline_vecs, RESYNC_PAGES, opf: `0`);
170	r1_bio->bios[j] = bio;
171	}
172	/*
173	* Allocate RESYNC_PAGES data pages and attach them to
174	* the first bio.
175	* If this is a user-requested check/repair, allocate
176	* RESYNC_PAGES for each bio.
177	*/
178	if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
179	need_pages = pi->raid_disks;
180	else
181	need_pages = `1`;
182	for (j = `0`; j < pi->raid_disks; j++) {
183	struct resync_pages *rp = &rps[j];
184
185	bio = r1_bio->bios[j];
186
187	if (j < need_pages) {
188	if (resync_alloc_pages(rp, gfp_flags))
189	goto out_free_pages;
190	} else {
191	memcpy(rp, &rps[`0`], sizeof(*rp));
192	resync_get_all_pages(rp);
193	}
194
195	rp->raid_bio = r1_bio;
196	bio->bi_private = rp;
197	}
198
199	r1_bio->master_bio = NULL;
200
201	return r1_bio;
202
203	out_free_pages:
204	while (--j >= `0`)
205	resync_free_pages(rp: &rps[j]);
206
207	out_free_bio:
208	while (++j < pi->raid_disks) {
209	bio_uninit(r1_bio->bios[j]);
210	kfree(objp: r1_bio->bios[j]);
211	}
212	kfree(objp: rps);
213
214	out_free_r1bio:
215	rbio_pool_free(rbio: r1_bio, data);
216	return NULL;
217	}
218
219	static void r1buf_pool_free(void __r1_bio, void* *data)
220	{
221	struct pool_info *pi = data;
222	int i;
223	struct r1bio *r1bio = __r1_bio;
224	struct resync_pages *rp = NULL;
225
226	for (i = pi->raid_disks; i--; ) {
227	rp = get_resync_pages(bio: r1bio->bios[i]);
228	resync_free_pages(rp);
229	bio_uninit(r1bio->bios[i]);
230	kfree(objp: r1bio->bios[i]);
231	}
232
233	/ resync pages array stored in the 1st bio's .bi_private /
234	kfree(objp: rp);
235
236	rbio_pool_free(rbio: r1bio, data);
237	}
238
239	static void put_all_bios(struct r1conf conf, struct* r1bio *r1_bio)
240	{
241	int i;
242
243	for (i = `0`; i < conf->raid_disks * `2`; i++) {
244	struct bio **bio = r1_bio->bios + i;
245	if (!BIO_SPECIAL(*bio))
246	bio_put(*bio);
247	*bio = NULL;
248	}
249	}
250
251	static void free_r1bio(struct r1bio *r1_bio)
252	{
253	struct r1conf *conf = r1_bio->mddev->private;
254
255	put_all_bios(conf, r1_bio);
256	mempool_free(element: r1_bio, pool: &conf->r1bio_pool);
257	}
258
259	static void put_buf(struct r1bio *r1_bio)
260	{
261	struct r1conf *conf = r1_bio->mddev->private;
262	sector_t sect = r1_bio->sector;
263	int i;
264
265	for (i = `0`; i < conf->raid_disks * `2`; i++) {
266	struct bio *bio = r1_bio->bios[i];
267	if (bio->bi_end_io)
268	rdev_dec_pending(rdev: conf->mirrors[i].rdev, mddev: r1_bio->mddev);
269	}
270
271	mempool_free(element: r1_bio, pool: &conf->r1buf_pool);
272
273	lower_barrier(conf, sector_nr: sect);
274	}
275
276	static void reschedule_retry(struct r1bio *r1_bio)
277	{
278	unsigned long flags;
279	struct mddev *mddev = r1_bio->mddev;
280	struct r1conf *conf = mddev->private;
281	int idx;
282
283	idx = sector_to_idx(sector: r1_bio->sector);
284	spin_lock_irqsave(&conf->device_lock, flags);
285	list_add(new: &r1_bio->retry_list, head: &conf->retry_list);
286	atomic_inc(v: &conf->nr_queued[idx]);
287	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
288
289	wake_up(&conf->wait_barrier);
290	md_wakeup_thread(thread: mddev->thread);
291	}
292
293	/*
294	* raid_end_bio_io() is called when we have finished servicing a mirrored
295	* operation and are ready to return a success/failure code to the buffer
296	* cache layer.
297	*/
298	static void call_bio_endio(struct r1bio *r1_bio)
299	{
300	struct bio *bio = r1_bio->master_bio;
301
302	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
303	bio->bi_status = BLK_STS_IOERR;
304
305	bio_endio(bio);
306	}
307
308	static void raid_end_bio_io(struct r1bio *r1_bio)
309	{
310	struct bio *bio = r1_bio->master_bio;
311	struct r1conf *conf = r1_bio->mddev->private;
312	sector_t sector = r1_bio->sector;
313
314	/ if nobody has done the final endio yet, do it now /
315	if (!test_and_set_bit(nr: R1BIO_Returned, addr: &r1_bio->state)) {
316	pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
317	(bio_data_dir(bio) == WRITE) ? "write" : "read",
318	(unsigned long long) bio->bi_iter.bi_sector,
319	(unsigned long long) bio_end_sector(bio) - `1`);
320
321	call_bio_endio(r1_bio);
322	}
323
324	free_r1bio(r1_bio);
325	/*
326	* Wake up any possible resync thread that waits for the device
327	* to go idle. All I/Os, even write-behind writes, are done.
328	*/
329	allow_barrier(conf, sector_nr: sector);
330	}
331
332	/*
333	* Update disk head position estimator based on IRQ completion info.
334	*/
335	static inline void update_head_pos(int disk, struct r1bio *r1_bio)
336	{
337	struct r1conf *conf = r1_bio->mddev->private;
338
339	conf->mirrors[disk].head_position =
340	r1_bio->sector + (r1_bio->sectors);
341	}
342
343	/*
344	* Find the disk number which triggered given bio
345	*/
346	static int find_bio_disk(struct r1bio r1_bio, struct* bio *bio)
347	{
348	int mirror;
349	struct r1conf *conf = r1_bio->mddev->private;
350	int raid_disks = conf->raid_disks;
351
352	for (mirror = `0`; mirror < raid_disks * `2`; mirror++)
353	if (r1_bio->bios[mirror] == bio)
354	break;
355
356	BUG_ON(mirror == raid_disks * `2`);
357	update_head_pos(disk: mirror, r1_bio);
358
359	return mirror;
360	}
361
362	static void raid1_end_read_request(struct bio *bio)
363	{
364	int uptodate = !bio->bi_status;
365	struct r1bio *r1_bio = bio->bi_private;
366	struct r1conf *conf = r1_bio->mddev->private;
367	struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
368
369	/*
370	* this branch is our 'one mirror IO has finished' event handler:
371	*/
372	update_head_pos(disk: r1_bio->read_disk, r1_bio);
373
374	if (uptodate)
375	set_bit(nr: R1BIO_Uptodate, addr: &r1_bio->state);
376	else if (test_bit(FailFast, &rdev->flags) &&
377	test_bit(R1BIO_FailFast, &r1_bio->state))
378	/ This was a fail-fast read so we definitely*
379	* want to retry */
380	;
381	else {
382	/ If all other devices have failed, we want to return*
383	* the error upwards rather than fail the last device.
384	* Here we redefine "uptodate" to mean "Don't want to retry"
385	*/
386	unsigned long flags;
387	spin_lock_irqsave(&conf->device_lock, flags);
388	if (r1_bio->mddev->degraded == conf->raid_disks \|\|
389	(r1_bio->mddev->degraded == conf->raid_disks-`1` &&
390	test_bit(In_sync, &rdev->flags)))
391	uptodate = `1`;
392	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
393	}
394
395	if (uptodate) {
396	raid_end_bio_io(r1_bio);
397	rdev_dec_pending(rdev, mddev: conf->mddev);
398	} else {
399	/*
400	* oops, read error:
401	*/
402	pr_err_ratelimited("md/raid1:%s: %pg: rescheduling sector %llu\n",
403	mdname(conf->mddev),
404	rdev->bdev,
405	(unsigned long long)r1_bio->sector);
406	set_bit(nr: R1BIO_ReadError, addr: &r1_bio->state);
407	reschedule_retry(r1_bio);
408	/ don't drop the reference on read_disk yet /
409	}
410	}
411
412	static void close_write(struct r1bio *r1_bio)
413	{
414	/ it really is the end of this request /
415	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
416	bio_free_pages(bio: r1_bio->behind_master_bio);
417	bio_put(r1_bio->behind_master_bio);
418	r1_bio->behind_master_bio = NULL;
419	}
420	/ clear the bitmap if all writes complete successfully /
421	md_bitmap_endwrite(bitmap: r1_bio->mddev->bitmap, offset: r1_bio->sector,
422	sectors: r1_bio->sectors,
423	success: !test_bit(R1BIO_Degraded, &r1_bio->state),
424	test_bit(R1BIO_BehindIO, &r1_bio->state));
425	md_write_end(mddev: r1_bio->mddev);
426	}
427
428	static void r1_bio_write_done(struct r1bio *r1_bio)
429	{
430	if (!atomic_dec_and_test(v: &r1_bio->remaining))
431	return;
432
433	if (test_bit(R1BIO_WriteError, &r1_bio->state))
434	reschedule_retry(r1_bio);
435	else {
436	close_write(r1_bio);
437	if (test_bit(R1BIO_MadeGood, &r1_bio->state))
438	reschedule_retry(r1_bio);
439	else
440	raid_end_bio_io(r1_bio);
441	}
442	}
443
444	static void raid1_end_write_request(struct bio *bio)
445	{
446	struct r1bio *r1_bio = bio->bi_private;
447	int behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
448	struct r1conf *conf = r1_bio->mddev->private;
449	struct bio *to_put = NULL;
450	int mirror = find_bio_disk(r1_bio, bio);
451	struct md_rdev *rdev = conf->mirrors[mirror].rdev;
452	bool discard_error;
453	sector_t lo = r1_bio->sector;
454	sector_t hi = r1_bio->sector + r1_bio->sectors;
455
456	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
457
458	/*
459	* 'one mirror IO has finished' event handler:
460	*/
461	if (bio->bi_status && !discard_error) {
462	set_bit(nr: WriteErrorSeen, addr: &rdev->flags);
463	if (!test_and_set_bit(nr: WantReplacement, addr: &rdev->flags))
464	set_bit(nr: MD_RECOVERY_NEEDED, addr: &
465	conf->mddev->recovery);
466
467	if (test_bit(FailFast, &rdev->flags) &&
468	(bio->bi_opf & MD_FAILFAST) &&
469	/ We never try FailFast to WriteMostly devices /
470	!test_bit(WriteMostly, &rdev->flags)) {
471	md_error(mddev: r1_bio->mddev, rdev);
472	}
473
474	/*
475	* When the device is faulty, it is not necessary to
476	* handle write error.
477	*/
478	if (!test_bit(Faulty, &rdev->flags))
479	set_bit(nr: R1BIO_WriteError, addr: &r1_bio->state);
480	else {
481	/ Fail the request /
482	set_bit(nr: R1BIO_Degraded, addr: &r1_bio->state);
483	/ Finished with this branch /
484	r1_bio->bios[mirror] = NULL;
485	to_put = bio;
486	}
487	} else {
488	/*
489	* Set R1BIO_Uptodate in our master bio, so that we
490	* will return a good error code for to the higher
491	* levels even if IO on some other mirrored buffer
492	* fails.
493	*
494	* The 'master' represents the composite IO operation
495	* to user-side. So if something waits for IO, then it
496	* will wait for the 'master' bio.
497	*/
498	r1_bio->bios[mirror] = NULL;
499	to_put = bio;
500	/*
501	* Do not set R1BIO_Uptodate if the current device is
502	* rebuilding or Faulty. This is because we cannot use
503	* such device for properly reading the data back (we could
504	* potentially use it, if the current write would have felt
505	* before rdev->recovery_offset, but for simplicity we don't
506	* check this here.
507	*/
508	if (test_bit(In_sync, &rdev->flags) &&
509	!test_bit(Faulty, &rdev->flags))
510	set_bit(nr: R1BIO_Uptodate, addr: &r1_bio->state);
511
512	/ Maybe we can clear some bad blocks. /
513	if (rdev_has_badblock(rdev, s: r1_bio->sector, sectors: r1_bio->sectors) &&
514	!discard_error) {
515	r1_bio->bios[mirror] = IO_MADE_GOOD;
516	set_bit(nr: R1BIO_MadeGood, addr: &r1_bio->state);
517	}
518	}
519
520	if (behind) {
521	if (test_bit(CollisionCheck, &rdev->flags))
522	remove_serial(rdev, lo, hi);
523	if (test_bit(WriteMostly, &rdev->flags))
524	atomic_dec(v: &r1_bio->behind_remaining);
525
526	/*
527	* In behind mode, we ACK the master bio once the I/O
528	* has safely reached all non-writemostly
529	* disks. Setting the Returned bit ensures that this
530	* gets done only once -- we don't ever want to return
531	* -EIO here, instead we'll wait
532	*/
533	if (atomic_read(v: &r1_bio->behind_remaining) >= (atomic_read(v: &r1_bio->remaining)-`1`) &&
534	test_bit(R1BIO_Uptodate, &r1_bio->state)) {
535	/ Maybe we can return now /
536	if (!test_and_set_bit(nr: R1BIO_Returned, addr: &r1_bio->state)) {
537	struct bio *mbio = r1_bio->master_bio;
538	pr_debug("raid1: behind end write sectors"
539	" %llu-%llu\n",
540	(unsigned long long) mbio->bi_iter.bi_sector,
541	(unsigned long long) bio_end_sector(mbio) - `1`);
542	call_bio_endio(r1_bio);
543	}
544	}
545	} else if (rdev->mddev->serialize_policy)
546	remove_serial(rdev, lo, hi);
547	if (r1_bio->bios[mirror] == NULL)
548	rdev_dec_pending(rdev, mddev: conf->mddev);
549
550	/*
551	* Let's see if all mirrored write operations have finished
552	* already.
553	*/
554	r1_bio_write_done(r1_bio);
555
556	if (to_put)
557	bio_put(to_put);
558	}
559
560	static sector_t align_to_barrier_unit_end(sector_t start_sector,
561	sector_t sectors)
562	{
563	sector_t len;
564
565	WARN_ON(sectors == `0`);
566	/*
567	* len is the number of sectors from start_sector to end of the
568	* barrier unit which start_sector belongs to.
569	*/
570	len = round_up(start_sector + `1`, BARRIER_UNIT_SECTOR_SIZE) -
571	start_sector;
572
573	if (len > sectors)
574	len = sectors;
575
576	return len;
577	}
578
579	static void update_read_sectors(struct r1conf conf, int* disk,
580	sector_t this_sector, int len)
581	{
582	struct raid1_info *info = &conf->mirrors[disk];
583
584	atomic_inc(v: &info->rdev->nr_pending);
585	if (info->next_seq_sect != this_sector)
586	info->seq_start = this_sector;
587	info->next_seq_sect = this_sector + len;
588	}
589
590	static int choose_first_rdev(struct r1conf conf, struct* r1bio *r1_bio,
591	int *max_sectors)
592	{
593	sector_t this_sector = r1_bio->sector;
594	int len = r1_bio->sectors;
595	int disk;
596
597	for (disk = `0` ; disk < conf->raid_disks * `2` ; disk++) {
598	struct md_rdev *rdev;
599	int read_len;
600
601	if (r1_bio->bios[disk] == IO_BLOCKED)
602	continue;
603
604	rdev = conf->mirrors[disk].rdev;
605	if (!rdev \|\| test_bit(Faulty, &rdev->flags))
606	continue;
607
608	/ choose the first disk even if it has some bad blocks. /
609	read_len = raid1_check_read_range(rdev, this_sector, len: &len);
610	if (read_len > `0`) {
611	update_read_sectors(conf, disk, this_sector, len: read_len);
612	*max_sectors = read_len;
613	return disk;
614	}
615	}
616
617	return -`1`;
618	}
619
620	static int choose_bb_rdev(struct r1conf conf, struct* r1bio *r1_bio,
621	int *max_sectors)
622	{
623	sector_t this_sector = r1_bio->sector;
624	int best_disk = -`1`;
625	int best_len = `0`;
626	int disk;
627
628	for (disk = `0` ; disk < conf->raid_disks * `2` ; disk++) {
629	struct md_rdev *rdev;
630	int len;
631	int read_len;
632
633	if (r1_bio->bios[disk] == IO_BLOCKED)
634	continue;
635
636	rdev = conf->mirrors[disk].rdev;
637	if (!rdev \|\| test_bit(Faulty, &rdev->flags) \|\|
638	test_bit(WriteMostly, &rdev->flags))
639	continue;
640
641	/ keep track of the disk with the most readable sectors. /
642	len = r1_bio->sectors;
643	read_len = raid1_check_read_range(rdev, this_sector, len: &len);
644	if (read_len > best_len) {
645	best_disk = disk;
646	best_len = read_len;
647	}
648	}
649
650	if (best_disk != -`1`) {
651	*max_sectors = best_len;
652	update_read_sectors(conf, disk: best_disk, this_sector, len: best_len);
653	}
654
655	return best_disk;
656	}
657
658	static int choose_slow_rdev(struct r1conf conf, struct* r1bio *r1_bio,
659	int *max_sectors)
660	{
661	sector_t this_sector = r1_bio->sector;
662	int bb_disk = -`1`;
663	int bb_read_len = `0`;
664	int disk;
665
666	for (disk = `0` ; disk < conf->raid_disks * `2` ; disk++) {
667	struct md_rdev *rdev;
668	int len;
669	int read_len;
670
671	if (r1_bio->bios[disk] == IO_BLOCKED)
672	continue;
673
674	rdev = conf->mirrors[disk].rdev;
675	if (!rdev \|\| test_bit(Faulty, &rdev->flags) \|\|
676	!test_bit(WriteMostly, &rdev->flags))
677	continue;
678
679	/ there are no bad blocks, we can use this disk /
680	len = r1_bio->sectors;
681	read_len = raid1_check_read_range(rdev, this_sector, len: &len);
682	if (read_len == r1_bio->sectors) {
683	update_read_sectors(conf, disk, this_sector, len: read_len);
684	return disk;
685	}
686
687	/*
688	* there are partial bad blocks, choose the rdev with largest
689	* read length.
690	*/
691	if (read_len > bb_read_len) {
692	bb_disk = disk;
693	bb_read_len = read_len;
694	}
695	}
696
697	if (bb_disk != -`1`) {
698	*max_sectors = bb_read_len;
699	update_read_sectors(conf, disk: bb_disk, this_sector, len: bb_read_len);
700	}
701
702	return bb_disk;
703	}
704
705	static bool is_sequential(struct r1conf conf, int* disk, struct r1bio *r1_bio)
706	{
707	/ TODO: address issues with this check and concurrency. /
708	return conf->mirrors[disk].next_seq_sect == r1_bio->sector \|\|
709	conf->mirrors[disk].head_position == r1_bio->sector;
710	}
711
712	/*
713	* If buffered sequential IO size exceeds optimal iosize, check if there is idle
714	* disk. If yes, choose the idle disk.
715	*/
716	static bool should_choose_next(struct r1conf conf, int* disk)
717	{
718	struct raid1_info *mirror = &conf->mirrors[disk];
719	int opt_iosize;
720
721	if (!test_bit(Nonrot, &mirror->rdev->flags))
722	return false;
723
724	opt_iosize = bdev_io_opt(bdev: mirror->rdev->bdev) >> `9`;
725	return opt_iosize > `0` && mirror->seq_start != MaxSector &&
726	mirror->next_seq_sect > opt_iosize &&
727	mirror->next_seq_sect - opt_iosize >= mirror->seq_start;
728	}
729
730	static bool rdev_readable(struct md_rdev rdev, struct* r1bio *r1_bio)
731	{
732	if (!rdev \|\| test_bit(Faulty, &rdev->flags))
733	return false;
734
735	/ still in recovery /
736	if (!test_bit(In_sync, &rdev->flags) &&
737	rdev->recovery_offset < r1_bio->sector + r1_bio->sectors)
738	return false;
739
740	/ don't read from slow disk unless have to /
741	if (test_bit(WriteMostly, &rdev->flags))
742	return false;
743
744	/ don't split IO for bad blocks unless have to /
745	if (rdev_has_badblock(rdev, s: r1_bio->sector, sectors: r1_bio->sectors))
746	return false;
747
748	return true;
749	}
750
/* Running state kept by choose_best_rdev() while it scans the mirrors. */
struct read_balance_ctl {
	sector_t closest_dist;		/* smallest head distance seen so far */
	int closest_dist_disk;		/* disk with that distance, -1 if none */
	int min_pending;		/* lowest pending count seen so far */
	int min_pending_disk;		/* disk with that count, -1 if none */
	int sequential_disk;		/* sequential disk kept as fallback, -1 if none */
	int readable_disks;		/* number of readable disks visited */
};
759
760	static int choose_best_rdev(struct r1conf conf, struct* r1bio *r1_bio)
761	{
762	int disk;
763	struct read_balance_ctl ctl = {
764	.closest_dist_disk = -`1`,
765	.closest_dist = MaxSector,
766	.min_pending_disk = -`1`,
767	.min_pending = UINT_MAX,
768	.sequential_disk = -`1`,
769	};
770
771	for (disk = `0` ; disk < conf->raid_disks * `2` ; disk++) {
772	struct md_rdev *rdev;
773	sector_t dist;
774	unsigned int pending;
775
776	if (r1_bio->bios[disk] == IO_BLOCKED)
777	continue;
778
779	rdev = conf->mirrors[disk].rdev;
780	if (!rdev_readable(rdev, r1_bio))
781	continue;
782
783	/ At least two disks to choose from so failfast is OK /
784	if (ctl.readable_disks++ == `1`)
785	set_bit(nr: R1BIO_FailFast, addr: &r1_bio->state);
786
787	pending = atomic_read(v: &rdev->nr_pending);
788	dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
789
790	/ Don't change to another disk for sequential reads /
791	if (is_sequential(conf, disk, r1_bio)) {
792	if (!should_choose_next(conf, disk))
793	return disk;
794
795	/*
796	* Add 'pending' to avoid choosing this disk if
797	* there is other idle disk.
798	*/
799	pending++;
800	/*
801	* If there is no other idle disk, this disk
802	* will be chosen.
803	*/
804	ctl.sequential_disk = disk;
805	}
806
807	if (ctl.min_pending > pending) {
808	ctl.min_pending = pending;
809	ctl.min_pending_disk = disk;
810	}
811
812	if (ctl.closest_dist > dist) {
813	ctl.closest_dist = dist;
814	ctl.closest_dist_disk = disk;
815	}
816	}
817
818	/*
819	* sequential IO size exceeds optimal iosize, however, there is no other
820	* idle disk, so choose the sequential disk.
821	*/
822	if (ctl.sequential_disk != -`1` && ctl.min_pending != `0`)
823	return ctl.sequential_disk;
824
825	/*
826	* If all disks are rotational, choose the closest disk. If any disk is
827	* non-rotational, choose the disk with less pending request even the
828	* disk is rotational, which might/might not be optimal for raids with
829	* mixed ratation/non-rotational disks depending on workload.
830	*/
831	if (ctl.min_pending_disk != -`1` &&
832	(READ_ONCE(conf->nonrot_disks) \|\| ctl.min_pending == `0`))
833	return ctl.min_pending_disk;
834	else
835	return ctl.closest_dist_disk;
836	}
837
838	/*
839	* This routine returns the disk from which the requested read should be done.
840	*
841	* 1) If resync is in progress, find the first usable disk and use it even if it
842	* has some bad blocks.
843	*
844	* 2) Now that there is no resync, loop through all disks and skipping slow
845	* disks and disks with bad blocks for now. Only pay attention to key disk
846	* choice.
847	*
848	* 3) If we've made it this far, now look for disks with bad blocks and choose
849	* the one with most number of sectors.
850	*
851	* 4) If we are all the way at the end, we have no choice but to use a disk even
852	* if it is write mostly.
853	*
854	* The rdev for the device selected will have nr_pending incremented.
855	*/
856	static int read_balance(struct r1conf conf, struct* r1bio *r1_bio,
857	int *max_sectors)
858	{
859	int disk;
860
861	clear_bit(nr: R1BIO_FailFast, addr: &r1_bio->state);
862
863	if (raid1_should_read_first(mddev: conf->mddev, this_sector: r1_bio->sector,
864	len: r1_bio->sectors))
865	return choose_first_rdev(conf, r1_bio, max_sectors);
866
867	disk = choose_best_rdev(conf, r1_bio);
868	if (disk >= `0`) {
869	*max_sectors = r1_bio->sectors;
870	update_read_sectors(conf, disk, this_sector: r1_bio->sector,
871	len: r1_bio->sectors);
872	return disk;
873	}
874
875	/*
876	* If we are here it means we didn't find a perfectly good disk so
877	* now spend a bit more time trying to find one with the most good
878	* sectors.
879	*/
880	disk = choose_bb_rdev(conf, r1_bio, max_sectors);
881	if (disk >= `0`)
882	return disk;
883
884	return choose_slow_rdev(conf, r1_bio, max_sectors);
885	}
886
887	static void wake_up_barrier(struct r1conf *conf)
888	{
889	if (wq_has_sleeper(wq_head: &conf->wait_barrier))
890	wake_up(&conf->wait_barrier);
891	}
892
893	static void flush_bio_list(struct r1conf conf, struct* bio *bio)
894	{
895	/ flush any pending bitmap writes to disk before proceeding w/ I/O /
896	raid1_prepare_flush_writes(bitmap: conf->mddev->bitmap);
897	wake_up_barrier(conf);
898
899	while (bio) { / submit pending writes /
900	struct bio *next = bio->bi_next;
901
902	raid1_submit_write(bio);
903	bio = next;
904	cond_resched();
905	}
906	}
907
908	static void flush_pending_writes(struct r1conf *conf)
909	{
910	/ Any writes that have been queued but are awaiting*
911	* bitmap updates get flushed here.
912	*/
913	spin_lock_irq(lock: &conf->device_lock);
914
915	if (conf->pending_bio_list.head) {
916	struct blk_plug plug;
917	struct bio *bio;
918
919	bio = bio_list_get(bl: &conf->pending_bio_list);
920	spin_unlock_irq(lock: &conf->device_lock);
921
922	/*
923	* As this is called in a wait_event() loop (see freeze_array),
924	* current->state might be TASK_UNINTERRUPTIBLE which will
925	* cause a warning when we prepare to wait again. As it is
926	* rare that this path is taken, it is perfectly safe to force
927	* us to go around the wait_event() loop again, so the warning
928	* is a false-positive. Silence the warning by resetting
929	* thread state
930	*/
931	__set_current_state(TASK_RUNNING);
932	blk_start_plug(&plug);
933	flush_bio_list(conf, bio);
934	blk_finish_plug(&plug);
935	} else
936	spin_unlock_irq(lock: &conf->device_lock);
937	}
938
939	/ Barriers....*
940	* Sometimes we need to suspend IO while we do something else,
941	* either some resync/recovery, or reconfigure the array.
942	* To do this we raise a 'barrier'.
943	* The 'barrier' is a counter that can be raised multiple times
944	* to count how many activities are happening which preclude
945	* normal IO.
946	* We can only raise the barrier if there is no pending IO.
947	* i.e. if nr_pending == 0.
948	* We choose only to raise the barrier if no-one is waiting for the
949	* barrier to go down. This means that as soon as an IO request
950	* is ready, no other operations which require a barrier will start
951	* until the IO request has had a chance.
952	*
953	* So: regular IO calls 'wait_barrier'. When that returns there
954	* is no backgroup IO happening, It must arrange to call
955	* allow_barrier when it has finished its IO.
956	* backgroup IO calls must call raise_barrier. Once that returns
957	* there is no normal IO happeing. It must arrange to call
958	* lower_barrier when the particular background IO completes.
959	*
960	* If resync/recovery is interrupted, returns -EINTR;
961	* Otherwise, returns 0.
962	*/
963	static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
964	{
965	int idx = sector_to_idx(sector: sector_nr);
966
967	spin_lock_irq(lock: &conf->resync_lock);
968
969	/ Wait until no block IO is waiting /
970	wait_event_lock_irq(conf->wait_barrier,
971	!atomic_read(&conf->nr_waiting[idx]),
972	conf->resync_lock);
973
974	/ block any new IO from starting /
975	atomic_inc(v: &conf->barrier[idx]);
976	/*
977	* In raise_barrier() we firstly increase conf->barrier[idx] then
978	* check conf->nr_pending[idx]. In _wait_barrier() we firstly
979	* increase conf->nr_pending[idx] then check conf->barrier[idx].
980	* A memory barrier here to make sure conf->nr_pending[idx] won't
981	* be fetched before conf->barrier[idx] is increased. Otherwise
982	* there will be a race between raise_barrier() and _wait_barrier().
983	*/
984	smp_mb__after_atomic();
985
986	/ For these conditions we must wait:*
987	* A: while the array is in frozen state
988	* B: while conf->nr_pending[idx] is not 0, meaning regular I/O
989	* existing in corresponding I/O barrier bucket.
990	* C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches
991	* max resync count which allowed on current I/O barrier bucket.
992	*/
993	wait_event_lock_irq(conf->wait_barrier,
994	(!conf->array_frozen &&
995	!atomic_read(&conf->nr_pending[idx]) &&
996	atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH) \|\|
997	test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery),
998	conf->resync_lock);
999
1000	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
1001	atomic_dec(v: &conf->barrier[idx]);
1002	spin_unlock_irq(lock: &conf->resync_lock);
1003	wake_up(&conf->wait_barrier);
1004	return -EINTR;
1005	}
1006
1007	atomic_inc(v: &conf->nr_sync_pending);
1008	spin_unlock_irq(lock: &conf->resync_lock);
1009
1010	return `0`;
1011	}
1012
1013	static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
1014	{
1015	int idx = sector_to_idx(sector: sector_nr);
1016
1017	BUG_ON(atomic_read(&conf->barrier[idx]) <= `0`);
1018
1019	atomic_dec(v: &conf->barrier[idx]);
1020	atomic_dec(v: &conf->nr_sync_pending);
1021	wake_up(&conf->wait_barrier);
1022	}
1023
1024	static bool _wait_barrier(struct r1conf conf, int* idx, bool nowait)
1025	{
1026	bool ret = true;
1027
1028	/*
1029	* We need to increase conf->nr_pending[idx] very early here,
1030	* then raise_barrier() can be blocked when it waits for
1031	* conf->nr_pending[idx] to be 0. Then we can avoid holding
1032	* conf->resync_lock when there is no barrier raised in same
1033	* barrier unit bucket. Also if the array is frozen, I/O
1034	* should be blocked until array is unfrozen.
1035	*/
1036	atomic_inc(v: &conf->nr_pending[idx]);
1037	/*
1038	* In _wait_barrier() we firstly increase conf->nr_pending[idx], then
1039	* check conf->barrier[idx]. In raise_barrier() we firstly increase
1040	* conf->barrier[idx], then check conf->nr_pending[idx]. A memory
1041	* barrier is necessary here to make sure conf->barrier[idx] won't be
1042	* fetched before conf->nr_pending[idx] is increased. Otherwise there
1043	* will be a race between _wait_barrier() and raise_barrier().
1044	*/
1045	smp_mb__after_atomic();
1046
1047	/*
1048	* Don't worry about checking two atomic_t variables at same time
1049	* here. If during we check conf->barrier[idx], the array is
1050	* frozen (conf->array_frozen is 1), and chonf->barrier[idx] is
1051	* 0, it is safe to return and make the I/O continue. Because the
1052	* array is frozen, all I/O returned here will eventually complete
1053	* or be queued, no race will happen. See code comment in
1054	* frozen_array().
1055	*/
1056	if (!READ_ONCE(conf->array_frozen) &&
1057	!atomic_read(v: &conf->barrier[idx]))
1058	return ret;
1059
1060	/*
1061	* After holding conf->resync_lock, conf->nr_pending[idx]
1062	* should be decreased before waiting for barrier to drop.
1063	* Otherwise, we may encounter a race condition because
1064	* raise_barrer() might be waiting for conf->nr_pending[idx]
1065	* to be 0 at same time.
1066	*/
1067	spin_lock_irq(lock: &conf->resync_lock);
1068	atomic_inc(v: &conf->nr_waiting[idx]);
1069	atomic_dec(v: &conf->nr_pending[idx]);
1070	/*
1071	* In case freeze_array() is waiting for
1072	* get_unqueued_pending() == extra
1073	*/
1074	wake_up_barrier(conf);
1075	/ Wait for the barrier in same barrier unit bucket to drop. /
1076
1077	/ Return false when nowait flag is set /
1078	if (nowait) {
1079	ret = false;
1080	} else {
1081	wait_event_lock_irq(conf->wait_barrier,
1082	!conf->array_frozen &&
1083	!atomic_read(&conf->barrier[idx]),
1084	conf->resync_lock);
1085	atomic_inc(v: &conf->nr_pending[idx]);
1086	}
1087
1088	atomic_dec(v: &conf->nr_waiting[idx]);
1089	spin_unlock_irq(lock: &conf->resync_lock);
1090	return ret;
1091	}
1092
1093	static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
1094	{
1095	int idx = sector_to_idx(sector: sector_nr);
1096	bool ret = true;
1097
1098	/*
1099	* Very similar to _wait_barrier(). The difference is, for read
1100	* I/O we don't need wait for sync I/O, but if the whole array
1101	* is frozen, the read I/O still has to wait until the array is
1102	* unfrozen. Since there is no ordering requirement with
1103	* conf->barrier[idx] here, memory barrier is unnecessary as well.
1104	*/
1105	atomic_inc(v: &conf->nr_pending[idx]);
1106
1107	if (!READ_ONCE(conf->array_frozen))
1108	return ret;
1109
1110	spin_lock_irq(lock: &conf->resync_lock);
1111	atomic_inc(v: &conf->nr_waiting[idx]);
1112	atomic_dec(v: &conf->nr_pending[idx]);
1113	/*
1114	* In case freeze_array() is waiting for
1115	* get_unqueued_pending() == extra
1116	*/
1117	wake_up_barrier(conf);
1118	/ Wait for array to be unfrozen /
1119
1120	/ Return false when nowait flag is set /
1121	if (nowait) {
1122	/ Return false when nowait flag is set /
1123	ret = false;
1124	} else {
1125	wait_event_lock_irq(conf->wait_barrier,
1126	!conf->array_frozen,
1127	conf->resync_lock);
1128	atomic_inc(v: &conf->nr_pending[idx]);
1129	}
1130
1131	atomic_dec(v: &conf->nr_waiting[idx]);
1132	spin_unlock_irq(lock: &conf->resync_lock);
1133	return ret;
1134	}
1135
1136	static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
1137	{
1138	int idx = sector_to_idx(sector: sector_nr);
1139
1140	return _wait_barrier(conf, idx, nowait);
1141	}
1142
1143	static void _allow_barrier(struct r1conf conf, int* idx)
1144	{
1145	atomic_dec(v: &conf->nr_pending[idx]);
1146	wake_up_barrier(conf);
1147	}
1148
1149	static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
1150	{
1151	int idx = sector_to_idx(sector: sector_nr);
1152
1153	_allow_barrier(conf, idx);
1154	}
1155
1156	/ conf->resync_lock should be held /
1157	static int get_unqueued_pending(struct r1conf *conf)
1158	{
1159	int idx, ret;
1160
1161	ret = atomic_read(v: &conf->nr_sync_pending);
1162	for (idx = `0`; idx < BARRIER_BUCKETS_NR; idx++)
1163	ret += atomic_read(v: &conf->nr_pending[idx]) -
1164	atomic_read(v: &conf->nr_queued[idx]);
1165
1166	return ret;
1167	}
1168
1169	static void freeze_array(struct r1conf conf, int* extra)
1170	{
1171	/ Stop sync I/O and normal I/O and wait for everything to*
1172	* go quiet.
1173	* This is called in two situations:
1174	* 1) management command handlers (reshape, remove disk, quiesce).
1175	* 2) one normal I/O request failed.
1176
1177	* After array_frozen is set to 1, new sync IO will be blocked at
1178	* raise_barrier(), and new normal I/O will blocked at _wait_barrier()
1179	* or wait_read_barrier(). The flying I/Os will either complete or be
1180	* queued. When everything goes quite, there are only queued I/Os left.
1181
1182	* Every flying I/O contributes to a conf->nr_pending[idx], idx is the
1183	* barrier bucket index which this I/O request hits. When all sync and
1184	* normal I/O are queued, sum of all conf->nr_pending[] will match sum
1185	* of all conf->nr_queued[]. But normal I/O failure is an exception,
1186	* in handle_read_error(), we may call freeze_array() before trying to
1187	* fix the read error. In this case, the error read I/O is not queued,
1188	* so get_unqueued_pending() == 1.
1189	*
1190	* Therefore before this function returns, we need to wait until
1191	* get_unqueued_pendings(conf) gets equal to extra. For
1192	* normal I/O context, extra is 1, in rested situations extra is 0.
1193	*/
1194	spin_lock_irq(lock: &conf->resync_lock);
1195	conf->array_frozen = `1`;
1196	mddev_add_trace_msg(conf->mddev, "raid1 wait freeze");
1197	wait_event_lock_irq_cmd(
1198	conf->wait_barrier,
1199	get_unqueued_pending(conf) == extra,
1200	conf->resync_lock,
1201	flush_pending_writes(conf));
1202	spin_unlock_irq(lock: &conf->resync_lock);
1203	}
1204	static void unfreeze_array(struct r1conf *conf)
1205	{
1206	/ reverse the effect of the freeze /
1207	spin_lock_irq(lock: &conf->resync_lock);
1208	conf->array_frozen = `0`;
1209	spin_unlock_irq(lock: &conf->resync_lock);
1210	wake_up(&conf->wait_barrier);
1211	}
1212
1213	static void alloc_behind_master_bio(struct r1bio *r1_bio,
1214	struct bio *bio)
1215	{
1216	int size = bio->bi_iter.bi_size;
1217	unsigned vcnt = (size + PAGE_SIZE - `1`) >> PAGE_SHIFT;
1218	int i = `0`;
1219	struct bio *behind_bio = NULL;
1220
1221	behind_bio = bio_alloc_bioset(NULL, nr_vecs: vcnt, opf: `0`, GFP_NOIO,
1222	bs: &r1_bio->mddev->bio_set);
1223
1224	/ discard op, we don't support writezero/writesame yet /
1225	if (!bio_has_data(bio)) {
1226	behind_bio->bi_iter.bi_size = size;
1227	goto skip_copy;
1228	}
1229
1230	while (i < vcnt && size) {
1231	struct page *page;
1232	int len = min_t(int, PAGE_SIZE, size);
1233
1234	page = alloc_page(GFP_NOIO);
1235	if (unlikely(!page))
1236	goto free_pages;
1237
1238	if (!bio_add_page(bio: behind_bio, page, len, off: `0`)) {
1239	put_page(page);
1240	goto free_pages;
1241	}
1242
1243	size -= len;
1244	i++;
1245	}
1246
1247	bio_copy_data(dst: behind_bio, src: bio);
1248	skip_copy:
1249	r1_bio->behind_master_bio = behind_bio;
1250	set_bit(nr: R1BIO_BehindIO, addr: &r1_bio->state);
1251
1252	return;
1253
1254	free_pages:
1255	pr_debug("%dB behind alloc failed, doing sync I/O\n",
1256	bio->bi_iter.bi_size);
1257	bio_free_pages(bio: behind_bio);
1258	bio_put(behind_bio);
1259	}
1260
1261	static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
1262	{
1263	struct raid1_plug_cb plug = container_of(cb, struct* raid1_plug_cb,
1264	cb);
1265	struct mddev *mddev = plug->cb.data;
1266	struct r1conf *conf = mddev->private;
1267	struct bio *bio;
1268
1269	if (from_schedule) {
1270	spin_lock_irq(lock: &conf->device_lock);
1271	bio_list_merge(bl: &conf->pending_bio_list, bl2: &plug->pending);
1272	spin_unlock_irq(lock: &conf->device_lock);
1273	wake_up_barrier(conf);
1274	md_wakeup_thread(thread: mddev->thread);
1275	kfree(objp: plug);
1276	return;
1277	}
1278
1279	/ we aren't scheduling, so we can do the write-out directly. /
1280	bio = bio_list_get(bl: &plug->pending);
1281	flush_bio_list(conf, bio);
1282	kfree(objp: plug);
1283	}
1284
1285	static void init_r1bio(struct r1bio r1_bio, struct* mddev mddev, struct* bio *bio)
1286	{
1287	r1_bio->master_bio = bio;
1288	r1_bio->sectors = bio_sectors(bio);
1289	r1_bio->state = `0`;
1290	r1_bio->mddev = mddev;
1291	r1_bio->sector = bio->bi_iter.bi_sector;
1292	}
1293
1294	static inline struct r1bio *
1295	alloc_r1bio(struct mddev mddev, struct* bio *bio)
1296	{
1297	struct r1conf *conf = mddev->private;
1298	struct r1bio *r1_bio;
1299
1300	r1_bio = mempool_alloc(pool: &conf->r1bio_pool, GFP_NOIO);
1301	/ Ensure no bio records IO_BLOCKED /
1302	memset(r1_bio->bios, `0`, conf->raid_disks * sizeof(r1_bio->bios[`0`]));
1303	init_r1bio(r1_bio, mddev, bio);
1304	return r1_bio;
1305	}
1306
1307	static void raid1_read_request(struct mddev mddev, struct* bio *bio,
1308	int max_read_sectors, struct r1bio *r1_bio)
1309	{
1310	struct r1conf *conf = mddev->private;
1311	struct raid1_info *mirror;
1312	struct bio *read_bio;
1313	struct bitmap *bitmap = mddev->bitmap;
1314	const enum req_op op = bio_op(bio);
1315	const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
1316	int max_sectors;
1317	int rdisk;
1318	bool r1bio_existed = !!r1_bio;
1319	char b[BDEVNAME_SIZE];
1320
1321	/*
1322	* If r1_bio is set, we are blocking the raid1d thread
1323	* so there is a tiny risk of deadlock. So ask for
1324	* emergency memory if needed.
1325	*/
1326	gfp_t gfp = r1_bio ? (GFP_NOIO \| __GFP_HIGH) : GFP_NOIO;
1327
1328	if (r1bio_existed) {
1329	/ Need to get the block device name carefully /
1330	struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
1331
1332	if (rdev)
1333	snprintf(buf: b, size: sizeof(b), fmt: "%pg", rdev->bdev);
1334	else
1335	strcpy(p: b, q: "???");
1336	}
1337
1338	/*
1339	* Still need barrier for READ in case that whole
1340	* array is frozen.
1341	*/
1342	if (!wait_read_barrier(conf, sector_nr: bio->bi_iter.bi_sector,
1343	nowait: bio->bi_opf & REQ_NOWAIT)) {
1344	bio_wouldblock_error(bio);
1345	return;
1346	}
1347
1348	if (!r1_bio)
1349	r1_bio = alloc_r1bio(mddev, bio);
1350	else
1351	init_r1bio(r1_bio, mddev, bio);
1352	r1_bio->sectors = max_read_sectors;
1353
1354	/*
1355	* make_request() can abort the operation when read-ahead is being
1356	* used and no empty request is available.
1357	*/
1358	rdisk = read_balance(conf, r1_bio, max_sectors: &max_sectors);
1359
1360	if (rdisk < `0`) {
1361	/ couldn't find anywhere to read from /
1362	if (r1bio_existed) {
1363	pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
1364	mdname(mddev),
1365	b,
1366	(unsigned long long)r1_bio->sector);
1367	}
1368	raid_end_bio_io(r1_bio);
1369	return;
1370	}
1371	mirror = conf->mirrors + rdisk;
1372
1373	if (r1bio_existed)
1374	pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %pg\n",
1375	mdname(mddev),
1376	(unsigned long long)r1_bio->sector,
1377	mirror->rdev->bdev);
1378
1379	if (test_bit(WriteMostly, &mirror->rdev->flags) &&
1380	bitmap) {
1381	/*
1382	* Reading from a write-mostly device must take care not to
1383	* over-take any writes that are 'behind'
1384	*/
1385	mddev_add_trace_msg(mddev, "raid1 wait behind writes");
1386	wait_event(bitmap->behind_wait,
1387	atomic_read(&bitmap->behind_writes) == `0`);
1388	}
1389
1390	if (max_sectors < bio_sectors(bio)) {
1391	struct bio *split = bio_split(bio, sectors: max_sectors,
1392	gfp, bs: &conf->bio_split);
1393	bio_chain(split, bio);
1394	submit_bio_noacct(bio);
1395	bio = split;
1396	r1_bio->master_bio = bio;
1397	r1_bio->sectors = max_sectors;
1398	}
1399
1400	r1_bio->read_disk = rdisk;
1401	if (!r1bio_existed) {
1402	md_account_bio(mddev, bio: &bio);
1403	r1_bio->master_bio = bio;
1404	}
1405	read_bio = bio_alloc_clone(bdev: mirror->rdev->bdev, bio_src: bio, gfp,
1406	bs: &mddev->bio_set);
1407
1408	r1_bio->bios[rdisk] = read_bio;
1409
1410	read_bio->bi_iter.bi_sector = r1_bio->sector +
1411	mirror->rdev->data_offset;
1412	read_bio->bi_end_io = raid1_end_read_request;
1413	read_bio->bi_opf = op \| do_sync;
1414	if (test_bit(FailFast, &mirror->rdev->flags) &&
1415	test_bit(R1BIO_FailFast, &r1_bio->state))
1416	read_bio->bi_opf \|= MD_FAILFAST;
1417	read_bio->bi_private = r1_bio;
1418	mddev_trace_remap(mddev, bio: read_bio, sector: r1_bio->sector);
1419	submit_bio_noacct(bio: read_bio);
1420	}
1421
1422	static void raid1_write_request(struct mddev mddev, struct* bio *bio,
1423	int max_write_sectors)
1424	{
1425	struct r1conf *conf = mddev->private;
1426	struct r1bio *r1_bio;
1427	int i, disks;
1428	struct bitmap *bitmap = mddev->bitmap;
1429	unsigned long flags;
1430	struct md_rdev *blocked_rdev;
1431	int first_clone;
1432	int max_sectors;
1433	bool write_behind = false;
1434	bool is_discard = (bio_op(bio) == REQ_OP_DISCARD);
1435
1436	if (mddev_is_clustered(mddev) &&
1437	md_cluster_ops->area_resyncing(mddev, WRITE,
1438	bio->bi_iter.bi_sector, bio_end_sector(bio))) {
1439
1440	DEFINE_WAIT(w);
1441	if (bio->bi_opf & REQ_NOWAIT) {
1442	bio_wouldblock_error(bio);
1443	return;
1444	}
1445	for (;;) {
1446	prepare_to_wait(wq_head: &conf->wait_barrier,
1447	wq_entry: &w, TASK_IDLE);
1448	if (!md_cluster_ops->area_resyncing(mddev, WRITE,
1449	bio->bi_iter.bi_sector,
1450	bio_end_sector(bio)))
1451	break;
1452	schedule();
1453	}
1454	finish_wait(wq_head: &conf->wait_barrier, wq_entry: &w);
1455	}
1456
1457	/*
1458	* Register the new request and wait if the reconstruction
1459	* thread has put up a bar for new requests.
1460	* Continue immediately if no resync is active currently.
1461	*/
1462	if (!wait_barrier(conf, sector_nr: bio->bi_iter.bi_sector,
1463	nowait: bio->bi_opf & REQ_NOWAIT)) {
1464	bio_wouldblock_error(bio);
1465	return;
1466	}
1467
1468	retry_write:
1469	r1_bio = alloc_r1bio(mddev, bio);
1470	r1_bio->sectors = max_write_sectors;
1471
1472	/ first select target devices under rcu_lock and*
1473	* inc refcount on their rdev. Record them by setting
1474	* bios[x] to bio
1475	* If there are known/acknowledged bad blocks on any device on
1476	* which we have seen a write error, we want to avoid writing those
1477	* blocks.
1478	* This potentially requires several writes to write around
1479	* the bad blocks. Each set of writes gets it's own r1bio
1480	* with a set of bios attached.
1481	*/
1482
1483	disks = conf->raid_disks * `2`;
1484	blocked_rdev = NULL;
1485	max_sectors = r1_bio->sectors;
1486	for (i = `0`; i < disks; i++) {
1487	struct md_rdev *rdev = conf->mirrors[i].rdev;
1488
1489	/*
1490	* The write-behind io is only attempted on drives marked as
1491	* write-mostly, which means we could allocate write behind
1492	* bio later.
1493	*/
1494	if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags))
1495	write_behind = true;
1496
1497	if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1498	atomic_inc(v: &rdev->nr_pending);
1499	blocked_rdev = rdev;
1500	break;
1501	}
1502	r1_bio->bios[i] = NULL;
1503	if (!rdev \|\| test_bit(Faulty, &rdev->flags)) {
1504	if (i < conf->raid_disks)
1505	set_bit(nr: R1BIO_Degraded, addr: &r1_bio->state);
1506	continue;
1507	}
1508
1509	atomic_inc(v: &rdev->nr_pending);
1510	if (test_bit(WriteErrorSeen, &rdev->flags)) {
1511	sector_t first_bad;
1512	int bad_sectors;
1513	int is_bad;
1514
1515	is_bad = is_badblock(rdev, s: r1_bio->sector, sectors: max_sectors,
1516	first_bad: &first_bad, bad_sectors: &bad_sectors);
1517	if (is_bad < `0`) {
1518	/ mustn't write here until the bad block is*
1519	* acknowledged*/
1520	set_bit(nr: BlockedBadBlocks, addr: &rdev->flags);
1521	blocked_rdev = rdev;
1522	break;
1523	}
1524	if (is_bad && first_bad <= r1_bio->sector) {
1525	/ Cannot write here at all /
1526	bad_sectors -= (r1_bio->sector - first_bad);
1527	if (bad_sectors < max_sectors)
1528	/ mustn't write more than bad_sectors*
1529	* to other devices yet
1530	*/
1531	max_sectors = bad_sectors;
1532	rdev_dec_pending(rdev, mddev);
1533	/ We don't set R1BIO_Degraded as that*
1534	* only applies if the disk is
1535	* missing, so it might be re-added,
1536	* and we want to know to recover this
1537	* chunk.
1538	* In this case the device is here,
1539	* and the fact that this chunk is not
1540	* in-sync is recorded in the bad
1541	* block log
1542	*/
1543	continue;
1544	}
1545	if (is_bad) {
1546	int good_sectors = first_bad - r1_bio->sector;
1547	if (good_sectors < max_sectors)
1548	max_sectors = good_sectors;
1549	}
1550	}
1551	r1_bio->bios[i] = bio;
1552	}
1553
1554	if (unlikely(blocked_rdev)) {
1555	/ Wait for this device to become unblocked /
1556	int j;
1557
1558	for (j = `0`; j < i; j++)
1559	if (r1_bio->bios[j])
1560	rdev_dec_pending(rdev: conf->mirrors[j].rdev, mddev);
1561	mempool_free(element: r1_bio, pool: &conf->r1bio_pool);
1562	allow_barrier(conf, sector_nr: bio->bi_iter.bi_sector);
1563
1564	if (bio->bi_opf & REQ_NOWAIT) {
1565	bio_wouldblock_error(bio);
1566	return;
1567	}
1568	mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked",
1569	blocked_rdev->raid_disk);
1570	md_wait_for_blocked_rdev(rdev: blocked_rdev, mddev);
1571	wait_barrier(conf, sector_nr: bio->bi_iter.bi_sector, nowait: false);
1572	goto retry_write;
1573	}
1574
1575	/*
1576	* When using a bitmap, we may call alloc_behind_master_bio below.
1577	* alloc_behind_master_bio allocates a copy of the data payload a page
1578	* at a time and thus needs a new bio that can fit the whole payload
1579	* this bio in page sized chunks.
1580	*/
1581	if (write_behind && bitmap)
1582	max_sectors = min_t(int, max_sectors,
1583	BIO_MAX_VECS * (PAGE_SIZE >> `9`));
1584	if (max_sectors < bio_sectors(bio)) {
1585	struct bio *split = bio_split(bio, sectors: max_sectors,
1586	GFP_NOIO, bs: &conf->bio_split);
1587	bio_chain(split, bio);
1588	submit_bio_noacct(bio);
1589	bio = split;
1590	r1_bio->master_bio = bio;
1591	r1_bio->sectors = max_sectors;
1592	}
1593
1594	md_account_bio(mddev, bio: &bio);
1595	r1_bio->master_bio = bio;
1596	atomic_set(v: &r1_bio->remaining, i: `1`);
1597	atomic_set(v: &r1_bio->behind_remaining, i: `0`);
1598
1599	first_clone = `1`;
1600
1601	for (i = `0`; i < disks; i++) {
1602	struct bio *mbio = NULL;
1603	struct md_rdev *rdev = conf->mirrors[i].rdev;
1604	if (!r1_bio->bios[i])
1605	continue;
1606
1607	if (first_clone) {
1608	/ do behind I/O ?*
1609	* Not if there are too many, or cannot
1610	* allocate memory, or a reader on WriteMostly
1611	* is waiting for behind writes to flush */
1612	if (bitmap && write_behind &&
1613	(atomic_read(v: &bitmap->behind_writes)
1614	< mddev->bitmap_info.max_write_behind) &&
1615	!waitqueue_active(wq_head: &bitmap->behind_wait)) {
1616	alloc_behind_master_bio(r1_bio, bio);
1617	}
1618
1619	md_bitmap_startwrite(bitmap, offset: r1_bio->sector, sectors: r1_bio->sectors,
1620	test_bit(R1BIO_BehindIO, &r1_bio->state));
1621	first_clone = `0`;
1622	}
1623
1624	if (r1_bio->behind_master_bio) {
1625	mbio = bio_alloc_clone(bdev: rdev->bdev,
1626	bio_src: r1_bio->behind_master_bio,
1627	GFP_NOIO, bs: &mddev->bio_set);
1628	if (test_bit(CollisionCheck, &rdev->flags))
1629	wait_for_serialization(rdev, r1_bio);
1630	if (test_bit(WriteMostly, &rdev->flags))
1631	atomic_inc(v: &r1_bio->behind_remaining);
1632	} else {
1633	mbio = bio_alloc_clone(bdev: rdev->bdev, bio_src: bio, GFP_NOIO,
1634	bs: &mddev->bio_set);
1635
1636	if (mddev->serialize_policy)
1637	wait_for_serialization(rdev, r1_bio);
1638	}
1639
1640	r1_bio->bios[i] = mbio;
1641
1642	mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset);
1643	mbio->bi_end_io = raid1_end_write_request;
1644	mbio->bi_opf = bio_op(bio) \| (bio->bi_opf & (REQ_SYNC \| REQ_FUA));
1645	if (test_bit(FailFast, &rdev->flags) &&
1646	!test_bit(WriteMostly, &rdev->flags) &&
1647	conf->raid_disks - mddev->degraded > `1`)
1648	mbio->bi_opf \|= MD_FAILFAST;
1649	mbio->bi_private = r1_bio;
1650
1651	atomic_inc(v: &r1_bio->remaining);
1652	mddev_trace_remap(mddev, bio: mbio, sector: r1_bio->sector);
1653	/ flush_pending_writes() needs access to the rdev so.../
1654	mbio->bi_bdev = (void *)rdev;
1655	if (!raid1_add_bio_to_plug(mddev, bio: mbio, unplug: raid1_unplug, copies: disks)) {
1656	spin_lock_irqsave(&conf->device_lock, flags);
1657	bio_list_add(bl: &conf->pending_bio_list, bio: mbio);
1658	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
1659	md_wakeup_thread(thread: mddev->thread);
1660	}
1661	}
1662
1663	r1_bio_write_done(r1_bio);
1664
1665	/ In case raid1d snuck in to freeze_array /
1666	wake_up_barrier(conf);
1667	}
1668
1669	static bool raid1_make_request(struct mddev mddev, struct* bio *bio)
1670	{
1671	sector_t sectors;
1672
1673	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
1674	&& md_flush_request(mddev, bio))
1675	return true;
1676
1677	/*
1678	* There is a limit to the maximum size, but
1679	* the read/write handler might find a lower limit
1680	* due to bad blocks. To avoid multiple splits,
1681	* we pass the maximum number of sectors down
1682	* and let the lower level perform the split.
1683	*/
1684	sectors = align_to_barrier_unit_end(
1685	start_sector: bio->bi_iter.bi_sector, bio_sectors(bio));
1686
1687	if (bio_data_dir(bio) == READ)
1688	raid1_read_request(mddev, bio, max_read_sectors: sectors, NULL);
1689	else {
1690	if (!md_write_start(mddev,bi: bio))
1691	return false;
1692	raid1_write_request(mddev, bio, max_write_sectors: sectors);
1693	}
1694	return true;
1695	}
1696
1697	static void raid1_status(struct seq_file seq, struct* mddev *mddev)
1698	{
1699	struct r1conf *conf = mddev->private;
1700	int i;
1701
1702	lockdep_assert_held(&mddev->lock);
1703
1704	seq_printf(m: seq, fmt: " [%d/%d] [", conf->raid_disks,
1705	conf->raid_disks - mddev->degraded);
1706	for (i = `0`; i < conf->raid_disks; i++) {
1707	struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev);
1708
1709	seq_printf(m: seq, fmt: "%s",
1710	rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1711	}
1712	seq_printf(m: seq, fmt: "]");
1713	}
1714
1715	/**
1716	* raid1_error() - RAID1 error handler.
1717	* @mddev: affected md device.
1718	* @rdev: member device to fail.
1719	*
1720	* The routine acknowledges &rdev failure and determines new @mddev state.
1721	* If it failed, then:
1722	* - &MD_BROKEN flag is set in &mddev->flags.
1723	* - recovery is disabled.
1724	* Otherwise, it must be degraded:
1725	* - recovery is interrupted.
1726	* - &mddev->degraded is bumped.
1727	*
1728	* @rdev is marked as &Faulty excluding case when array is failed and
1729	* &mddev->fail_last_dev is off.
1730	*/
1731	static void raid1_error(struct mddev mddev, struct* md_rdev *rdev)
1732	{
1733	struct r1conf *conf = mddev->private;
1734	unsigned long flags;
1735
1736	spin_lock_irqsave(&conf->device_lock, flags);
1737
1738	if (test_bit(In_sync, &rdev->flags) &&
1739	(conf->raid_disks - mddev->degraded) == `1`) {
1740	set_bit(nr: MD_BROKEN, addr: &mddev->flags);
1741
1742	if (!mddev->fail_last_dev) {
1743	conf->recovery_disabled = mddev->recovery_disabled;
1744	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
1745	return;
1746	}
1747	}
1748	set_bit(nr: Blocked, addr: &rdev->flags);
1749	if (test_and_clear_bit(nr: In_sync, addr: &rdev->flags))
1750	mddev->degraded++;
1751	set_bit(nr: Faulty, addr: &rdev->flags);
1752	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
1753	/*
1754	* if recovery is running, make sure it aborts.
1755	*/
1756	set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery);
1757	set_mask_bits(&mddev->sb_flags, `0`,
1758	BIT(MD_SB_CHANGE_DEVS) \| BIT(MD_SB_CHANGE_PENDING));
1759	pr_crit("md/raid1:%s: Disk failure on %pg, disabling device.\n"
1760	"md/raid1:%s: Operation continuing on %d devices.\n",
1761	mdname(mddev), rdev->bdev,
1762	mdname(mddev), conf->raid_disks - mddev->degraded);
1763	}
1764
1765	static void print_conf(struct r1conf *conf)
1766	{
1767	int i;
1768
1769	pr_debug("RAID1 conf printout:\n");
1770	if (!conf) {
1771	pr_debug("(!conf)\n");
1772	return;
1773	}
1774	pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1775	conf->raid_disks);
1776
1777	lockdep_assert_held(&conf->mddev->reconfig_mutex);
1778	for (i = `0`; i < conf->raid_disks; i++) {
1779	struct md_rdev *rdev = conf->mirrors[i].rdev;
1780	if (rdev)
1781	pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
1782	i, !test_bit(In_sync, &rdev->flags),
1783	!test_bit(Faulty, &rdev->flags),
1784	rdev->bdev);
1785	}
1786	}
1787
1788	static void close_sync(struct r1conf *conf)
1789	{
1790	int idx;
1791
1792	for (idx = `0`; idx < BARRIER_BUCKETS_NR; idx++) {
1793	_wait_barrier(conf, idx, nowait: false);
1794	_allow_barrier(conf, idx);
1795	}
1796
1797	mempool_exit(pool: &conf->r1buf_pool);
1798	}
1799
1800	static int raid1_spare_active(struct mddev *mddev)
1801	{
1802	int i;
1803	struct r1conf *conf = mddev->private;
1804	int count = `0`;
1805	unsigned long flags;
1806
1807	/*
1808	* Find all failed disks within the RAID1 configuration
1809	* and mark them readable.
1810	* Called under mddev lock, so rcu protection not needed.
1811	* device_lock used to avoid races with raid1_end_read_request
1812	* which expects 'In_sync' flags and ->degraded to be consistent.
1813	*/
1814	spin_lock_irqsave(&conf->device_lock, flags);
1815	for (i = `0`; i < conf->raid_disks; i++) {
1816	struct md_rdev *rdev = conf->mirrors[i].rdev;
1817	struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
1818	if (repl
1819	&& !test_bit(Candidate, &repl->flags)
1820	&& repl->recovery_offset == MaxSector
1821	&& !test_bit(Faulty, &repl->flags)
1822	&& !test_and_set_bit(nr: In_sync, addr: &repl->flags)) {
1823	/ replacement has just become active /
1824	if (!rdev \|\|
1825	!test_and_clear_bit(nr: In_sync, addr: &rdev->flags))
1826	count++;
1827	if (rdev) {
1828	/ Replaced device not technically*
1829	* faulty, but we need to be sure
1830	* it gets removed and never re-added
1831	*/
1832	set_bit(nr: Faulty, addr: &rdev->flags);
1833	sysfs_notify_dirent_safe(
1834	sd: rdev->sysfs_state);
1835	}
1836	}
1837	if (rdev
1838	&& rdev->recovery_offset == MaxSector
1839	&& !test_bit(Faulty, &rdev->flags)
1840	&& !test_and_set_bit(nr: In_sync, addr: &rdev->flags)) {
1841	count++;
1842	sysfs_notify_dirent_safe(sd: rdev->sysfs_state);
1843	}
1844	}
1845	mddev->degraded -= count;
1846	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
1847
1848	print_conf(conf);
1849	return count;
1850	}
1851
1852	static bool raid1_add_conf(struct r1conf conf, struct* md_rdev rdev, int* disk,
1853	bool replacement)
1854	{
1855	struct raid1_info *info = conf->mirrors + disk;
1856
1857	if (replacement)
1858	info += conf->raid_disks;
1859
1860	if (info->rdev)
1861	return false;
1862
1863	if (bdev_nonrot(bdev: rdev->bdev)) {
1864	set_bit(nr: Nonrot, addr: &rdev->flags);
1865	WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + `1`);
1866	}
1867
1868	rdev->raid_disk = disk;
1869	info->head_position = `0`;
1870	info->seq_start = MaxSector;
1871	WRITE_ONCE(info->rdev, rdev);
1872
1873	return true;
1874	}
1875
1876	static bool raid1_remove_conf(struct r1conf conf, int* disk)
1877	{
1878	struct raid1_info *info = conf->mirrors + disk;
1879	struct md_rdev *rdev = info->rdev;
1880
1881	if (!rdev \|\| test_bit(In_sync, &rdev->flags) \|\|
1882	atomic_read(v: &rdev->nr_pending))
1883	return false;
1884
1885	/ Only remove non-faulty devices if recovery is not possible. /
1886	if (!test_bit(Faulty, &rdev->flags) &&
1887	rdev->mddev->recovery_disabled != conf->recovery_disabled &&
1888	rdev->mddev->degraded < conf->raid_disks)
1889	return false;
1890
1891	if (test_and_clear_bit(nr: Nonrot, addr: &rdev->flags))
1892	WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - `1`);
1893
1894	WRITE_ONCE(info->rdev, NULL);
1895	return true;
1896	}
1897
1898	static int raid1_add_disk(struct mddev mddev, struct* md_rdev *rdev)
1899	{
1900	struct r1conf *conf = mddev->private;
1901	int err = -EEXIST;
1902	int mirror = `0`, repl_slot = -`1`;
1903	struct raid1_info *p;
1904	int first = `0`;
1905	int last = conf->raid_disks - `1`;
1906
1907	if (mddev->recovery_disabled == conf->recovery_disabled)
1908	return -EBUSY;
1909
1910	if (md_integrity_add_rdev(rdev, mddev))
1911	return -ENXIO;
1912
1913	if (rdev->raid_disk >= `0`)
1914	first = last = rdev->raid_disk;
1915
1916	/*
1917	* find the disk ... but prefer rdev->saved_raid_disk
1918	* if possible.
1919	*/
1920	if (rdev->saved_raid_disk >= `0` &&
1921	rdev->saved_raid_disk >= first &&
1922	rdev->saved_raid_disk < conf->raid_disks &&
1923	conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1924	first = last = rdev->saved_raid_disk;
1925
1926	for (mirror = first; mirror <= last; mirror++) {
1927	p = conf->mirrors + mirror;
1928	if (!p->rdev) {
1929	err = mddev_stack_new_rdev(mddev, rdev);
1930	if (err)
1931	return err;
1932
1933	raid1_add_conf(conf, rdev, disk: mirror, replacement: false);
1934	/ As all devices are equivalent, we don't need a full recovery*
1935	* if this was recently any drive of the array
1936	*/
1937	if (rdev->saved_raid_disk < `0`)
1938	conf->fullsync = `1`;
1939	break;
1940	}
1941	if (test_bit(WantReplacement, &p->rdev->flags) &&
1942	p[conf->raid_disks].rdev == NULL && repl_slot < `0`)
1943	repl_slot = mirror;
1944	}
1945
1946	if (err && repl_slot >= `0`) {
1947	/ Add this device as a replacement /
1948	clear_bit(nr: In_sync, addr: &rdev->flags);
1949	set_bit(nr: Replacement, addr: &rdev->flags);
1950	raid1_add_conf(conf, rdev, disk: repl_slot, replacement: true);
1951	err = `0`;
1952	conf->fullsync = `1`;
1953	}
1954
1955	print_conf(conf);
1956	return err;
1957	}
1958
1959	static int raid1_remove_disk(struct mddev mddev, struct* md_rdev *rdev)
1960	{
1961	struct r1conf *conf = mddev->private;
1962	int err = `0`;
1963	int number = rdev->raid_disk;
1964	struct raid1_info *p = conf->mirrors + number;
1965
1966	if (unlikely(number >= conf->raid_disks))
1967	goto abort;
1968
1969	if (rdev != p->rdev) {
1970	number += conf->raid_disks;
1971	p = conf->mirrors + number;
1972	}
1973
1974	print_conf(conf);
1975	if (rdev == p->rdev) {
1976	if (!raid1_remove_conf(conf, disk: number)) {
1977	err = -EBUSY;
1978	goto abort;
1979	}
1980
1981	if (number < conf->raid_disks &&
1982	conf->mirrors[conf->raid_disks + number].rdev) {
1983	/ We just removed a device that is being replaced.*
1984	* Move down the replacement. We drain all IO before
1985	* doing this to avoid confusion.
1986	*/
1987	struct md_rdev *repl =
1988	conf->mirrors[conf->raid_disks + number].rdev;
1989	freeze_array(conf, extra: `0`);
1990	if (atomic_read(v: &repl->nr_pending)) {
1991	/ It means that some queued IO of retry_list*
1992	* hold repl. Thus, we cannot set replacement
1993	* as NULL, avoiding rdev NULL pointer
1994	* dereference in sync_request_write and
1995	* handle_write_finished.
1996	*/
1997	err = -EBUSY;
1998	unfreeze_array(conf);
1999	goto abort;
2000	}
2001	clear_bit(nr: Replacement, addr: &repl->flags);
2002	WRITE_ONCE(p->rdev, repl);
2003	conf->mirrors[conf->raid_disks + number].rdev = NULL;
2004	unfreeze_array(conf);
2005	}
2006
2007	clear_bit(nr: WantReplacement, addr: &rdev->flags);
2008	err = md_integrity_register(mddev);
2009	}
2010	abort:
2011
2012	print_conf(conf);
2013	return err;
2014	}
2015
2016	static void end_sync_read(struct bio *bio)
2017	{
2018	struct r1bio *r1_bio = get_resync_r1bio(bio);
2019
2020	update_head_pos(disk: r1_bio->read_disk, r1_bio);
2021
2022	/*
2023	* we have read a block, now it needs to be re-written,
2024	* or re-read if the read failed.
2025	* We don't do much here, just schedule handling by raid1d
2026	*/
2027	if (!bio->bi_status)
2028	set_bit(nr: R1BIO_Uptodate, addr: &r1_bio->state);
2029
2030	if (atomic_dec_and_test(v: &r1_bio->remaining))
2031	reschedule_retry(r1_bio);
2032	}
2033
2034	static void abort_sync_write(struct mddev mddev, struct* r1bio *r1_bio)
2035	{
2036	sector_t sync_blocks = `0`;
2037	sector_t s = r1_bio->sector;
2038	long sectors_to_go = r1_bio->sectors;
2039
2040	/ make sure these bits don't get cleared. /
2041	do {
2042	md_bitmap_end_sync(bitmap: mddev->bitmap, offset: s, blocks: &sync_blocks, aborted: `1`);
2043	s += sync_blocks;
2044	sectors_to_go -= sync_blocks;
2045	} while (sectors_to_go > `0`);
2046	}
2047
2048	static void put_sync_write_buf(struct r1bio r1_bio, int* uptodate)
2049	{
2050	if (atomic_dec_and_test(v: &r1_bio->remaining)) {
2051	struct mddev *mddev = r1_bio->mddev;
2052	int s = r1_bio->sectors;
2053
2054	if (test_bit(R1BIO_MadeGood, &r1_bio->state) \|\|
2055	test_bit(R1BIO_WriteError, &r1_bio->state))
2056	reschedule_retry(r1_bio);
2057	else {
2058	put_buf(r1_bio);
2059	md_done_sync(mddev, blocks: s, ok: uptodate);
2060	}
2061	}
2062	}
2063
2064	static void end_sync_write(struct bio *bio)
2065	{
2066	int uptodate = !bio->bi_status;
2067	struct r1bio *r1_bio = get_resync_r1bio(bio);
2068	struct mddev *mddev = r1_bio->mddev;
2069	struct r1conf *conf = mddev->private;
2070	struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
2071
2072	if (!uptodate) {
2073	abort_sync_write(mddev, r1_bio);
2074	set_bit(nr: WriteErrorSeen, addr: &rdev->flags);
2075	if (!test_and_set_bit(nr: WantReplacement, addr: &rdev->flags))
2076	set_bit(nr: MD_RECOVERY_NEEDED, addr: &
2077	mddev->recovery);
2078	set_bit(nr: R1BIO_WriteError, addr: &r1_bio->state);
2079	} else if (rdev_has_badblock(rdev, s: r1_bio->sector, sectors: r1_bio->sectors) &&
2080	!rdev_has_badblock(rdev: conf->mirrors[r1_bio->read_disk].rdev,
2081	s: r1_bio->sector, sectors: r1_bio->sectors)) {
2082	set_bit(nr: R1BIO_MadeGood, addr: &r1_bio->state);
2083	}
2084
2085	put_sync_write_buf(r1_bio, uptodate);
2086	}
2087
2088	static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
2089	int sectors, struct page *page, blk_opf_t rw)
2090	{
2091	if (sync_page_io(rdev, sector, size: sectors << `9`, page, opf: rw, metadata_op: false))
2092	/ success /
2093	return `1`;
2094	if (rw == REQ_OP_WRITE) {
2095	set_bit(nr: WriteErrorSeen, addr: &rdev->flags);
2096	if (!test_and_set_bit(nr: WantReplacement,
2097	addr: &rdev->flags))
2098	set_bit(nr: MD_RECOVERY_NEEDED, addr: &
2099	rdev->mddev->recovery);
2100	}
2101	/ need to record an error - either for the block or the device /
2102	if (!rdev_set_badblocks(rdev, s: sector, sectors, is_new: `0`))
2103	md_error(mddev: rdev->mddev, rdev);
2104	return `0`;
2105	}
2106
2107	static int fix_sync_read_error(struct r1bio *r1_bio)
2108	{
2109	/ Try some synchronous reads of other devices to get*
2110	* good data, much like with normal read errors. Only
2111	* read into the pages we already have so we don't
2112	* need to re-issue the read request.
2113	* We don't need to freeze the array, because being in an
2114	* active sync request, there is no normal IO, and
2115	* no overlapping syncs.
2116	* We don't need to check is_badblock() again as we
2117	* made sure that anything with a bad block in range
2118	* will have bi_end_io clear.
2119	*/
2120	struct mddev *mddev = r1_bio->mddev;
2121	struct r1conf *conf = mddev->private;
2122	struct bio *bio = r1_bio->bios[r1_bio->read_disk];
2123	struct page **pages = get_resync_pages(bio)->pages;
2124	sector_t sect = r1_bio->sector;
2125	int sectors = r1_bio->sectors;
2126	int idx = `0`;
2127	struct md_rdev *rdev;
2128
2129	rdev = conf->mirrors[r1_bio->read_disk].rdev;
2130	if (test_bit(FailFast, &rdev->flags)) {
2131	/ Don't try recovering from here - just fail it*
2132	* ... unless it is the last working device of course */
2133	md_error(mddev, rdev);
2134	if (test_bit(Faulty, &rdev->flags))
2135	/ Don't try to read from here, but make sure*
2136	* put_buf does it's thing
2137	*/
2138	bio->bi_end_io = end_sync_write;
2139	}
2140
2141	while(sectors) {
2142	int s = sectors;
2143	int d = r1_bio->read_disk;
2144	int success = `0`;
2145	int start;
2146
2147	if (s > (PAGE_SIZE>>`9`))
2148	s = PAGE_SIZE >> `9`;
2149	do {
2150	if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
2151	/ No rcu protection needed here devices*
2152	* can only be removed when no resync is
2153	* active, and resync is currently active
2154	*/
2155	rdev = conf->mirrors[d].rdev;
2156	if (sync_page_io(rdev, sector: sect, size: s<<`9`,
2157	page: pages[idx],
2158	opf: REQ_OP_READ, metadata_op: false)) {
2159	success = `1`;
2160	break;
2161	}
2162	}
2163	d++;
2164	if (d == conf->raid_disks * `2`)
2165	d = `0`;
2166	} while (!success && d != r1_bio->read_disk);
2167
2168	if (!success) {
2169	int abort = `0`;
2170	/ Cannot read from anywhere, this block is lost.*
2171	* Record a bad block on each device. If that doesn't
2172	* work just disable and interrupt the recovery.
2173	* Don't fail devices as that won't really help.
2174	*/
2175	pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n",
2176	mdname(mddev), bio->bi_bdev,
2177	(unsigned long long)r1_bio->sector);
2178	for (d = `0`; d < conf->raid_disks * `2`; d++) {
2179	rdev = conf->mirrors[d].rdev;
2180	if (!rdev \|\| test_bit(Faulty, &rdev->flags))
2181	continue;
2182	if (!rdev_set_badblocks(rdev, s: sect, sectors: s, is_new: `0`))
2183	abort = `1`;
2184	}
2185	if (abort) {
2186	conf->recovery_disabled =
2187	mddev->recovery_disabled;
2188	set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery);
2189	md_done_sync(mddev, blocks: r1_bio->sectors, ok: `0`);
2190	put_buf(r1_bio);
2191	return `0`;
2192	}
2193	/ Try next page /
2194	sectors -= s;
2195	sect += s;
2196	idx++;
2197	continue;
2198	}
2199
2200	start = d;
2201	/ write it back and re-read /
2202	while (d != r1_bio->read_disk) {
2203	if (d == `0`)
2204	d = conf->raid_disks * `2`;
2205	d--;
2206	if (r1_bio->bios[d]->bi_end_io != end_sync_read)
2207	continue;
2208	rdev = conf->mirrors[d].rdev;
2209	if (r1_sync_page_io(rdev, sector: sect, sectors: s,
2210	page: pages[idx],
2211	rw: REQ_OP_WRITE) == `0`) {
2212	r1_bio->bios[d]->bi_end_io = NULL;
2213	rdev_dec_pending(rdev, mddev);
2214	}
2215	}
2216	d = start;
2217	while (d != r1_bio->read_disk) {
2218	if (d == `0`)
2219	d = conf->raid_disks * `2`;
2220	d--;
2221	if (r1_bio->bios[d]->bi_end_io != end_sync_read)
2222	continue;
2223	rdev = conf->mirrors[d].rdev;
2224	if (r1_sync_page_io(rdev, sector: sect, sectors: s,
2225	page: pages[idx],
2226	rw: REQ_OP_READ) != `0`)
2227	atomic_add(i: s, v: &rdev->corrected_errors);
2228	}
2229	sectors -= s;
2230	sect += s;
2231	idx ++;
2232	}
2233	set_bit(nr: R1BIO_Uptodate, addr: &r1_bio->state);
2234	bio->bi_status = `0`;
2235	return `1`;
2236	}
2237
2238	static void process_checks(struct r1bio *r1_bio)
2239	{
2240	/ We have read all readable devices. If we haven't*
2241	* got the block, then there is no hope left.
2242	* If we have, then we want to do a comparison
2243	* and skip the write if everything is the same.
2244	* If any blocks failed to read, then we need to
2245	* attempt an over-write
2246	*/
2247	struct mddev *mddev = r1_bio->mddev;
2248	struct r1conf *conf = mddev->private;
2249	int primary;
2250	int i;
2251	int vcnt;
2252
2253	/ Fix variable parts of all bios /
2254	vcnt = (r1_bio->sectors + PAGE_SIZE / `512` - `1`) >> (PAGE_SHIFT - `9`);
2255	for (i = `0`; i < conf->raid_disks * `2`; i++) {
2256	blk_status_t status;
2257	struct bio *b = r1_bio->bios[i];
2258	struct resync_pages *rp = get_resync_pages(bio: b);
2259	if (b->bi_end_io != end_sync_read)
2260	continue;
2261	/ fixup the bio for reuse, but preserve errno /
2262	status = b->bi_status;
2263	bio_reset(bio: b, bdev: conf->mirrors[i].rdev->bdev, opf: REQ_OP_READ);
2264	b->bi_status = status;
2265	b->bi_iter.bi_sector = r1_bio->sector +
2266	conf->mirrors[i].rdev->data_offset;
2267	b->bi_end_io = end_sync_read;
2268	rp->raid_bio = r1_bio;
2269	b->bi_private = rp;
2270
2271	/ initialize bvec table again /
2272	md_bio_reset_resync_pages(bio: b, rp, size: r1_bio->sectors << `9`);
2273	}
2274	for (primary = `0`; primary < conf->raid_disks * `2`; primary++)
2275	if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
2276	!r1_bio->bios[primary]->bi_status) {
2277	r1_bio->bios[primary]->bi_end_io = NULL;
2278	rdev_dec_pending(rdev: conf->mirrors[primary].rdev, mddev);
2279	break;
2280	}
2281	r1_bio->read_disk = primary;
2282	for (i = `0`; i < conf->raid_disks * `2`; i++) {
2283	int j = `0`;
2284	struct bio *pbio = r1_bio->bios[primary];
2285	struct bio *sbio = r1_bio->bios[i];
2286	blk_status_t status = sbio->bi_status;
2287	struct page **ppages = get_resync_pages(bio: pbio)->pages;
2288	struct page **spages = get_resync_pages(bio: sbio)->pages;
2289	struct bio_vec *bi;
2290	int page_len[RESYNC_PAGES] = { `0` };
2291	struct bvec_iter_all iter_all;
2292
2293	if (sbio->bi_end_io != end_sync_read)
2294	continue;
2295	/ Now we can 'fixup' the error value /
2296	sbio->bi_status = `0`;
2297
2298	bio_for_each_segment_all(bi, sbio, iter_all)
2299	page_len[j++] = bi->bv_len;
2300
2301	if (!status) {
2302	for (j = vcnt; j-- ; ) {
2303	if (memcmp(page_address(ppages[j]),
2304	page_address(spages[j]),
2305	size: page_len[j]))
2306	break;
2307	}
2308	} else
2309	j = `0`;
2310	if (j >= `0`)
2311	atomic64_add(i: r1_bio->sectors, v: &mddev->resync_mismatches);
2312	if (j < `0` \|\| (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
2313	&& !status)) {
2314	/ No need to write to this device. /
2315	sbio->bi_end_io = NULL;
2316	rdev_dec_pending(rdev: conf->mirrors[i].rdev, mddev);
2317	continue;
2318	}
2319
2320	bio_copy_data(dst: sbio, src: pbio);
2321	}
2322	}
2323
2324	static void sync_request_write(struct mddev mddev, struct* r1bio *r1_bio)
2325	{
2326	struct r1conf *conf = mddev->private;
2327	int i;
2328	int disks = conf->raid_disks * `2`;
2329	struct bio *wbio;
2330
2331	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
2332	/ ouch - failed to read all of that. /
2333	if (!fix_sync_read_error(r1_bio))
2334	return;
2335
2336	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2337	process_checks(r1_bio);
2338
2339	/*
2340	* schedule writes
2341	*/
2342	atomic_set(v: &r1_bio->remaining, i: `1`);
2343	for (i = `0`; i < disks ; i++) {
2344	wbio = r1_bio->bios[i];
2345	if (wbio->bi_end_io == NULL \|\|
2346	(wbio->bi_end_io == end_sync_read &&
2347	(i == r1_bio->read_disk \|\|
2348	!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
2349	continue;
2350	if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
2351	abort_sync_write(mddev, r1_bio);
2352	continue;
2353	}
2354
2355	wbio->bi_opf = REQ_OP_WRITE;
2356	if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
2357	wbio->bi_opf \|= MD_FAILFAST;
2358
2359	wbio->bi_end_io = end_sync_write;
2360	atomic_inc(v: &r1_bio->remaining);
2361	md_sync_acct(bdev: conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
2362
2363	submit_bio_noacct(bio: wbio);
2364	}
2365
2366	put_sync_write_buf(r1_bio, uptodate: `1`);
2367	}
2368
2369	/*
2370	* This is a kernel thread which:
2371	*
2372	* 1. Retries failed read operations on working mirrors.
2373	* 2. Updates the raid superblock when problems are encountered.
2374	* 3. Performs writes following reads for array synchronising.
2375	*/
2376
2377	static void fix_read_error(struct r1conf conf, struct* r1bio *r1_bio)
2378	{
2379	sector_t sect = r1_bio->sector;
2380	int sectors = r1_bio->sectors;
2381	int read_disk = r1_bio->read_disk;
2382	struct mddev *mddev = conf->mddev;
2383	struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
2384
2385	if (exceed_read_errors(mddev, rdev)) {
2386	r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
2387	return;
2388	}
2389
2390	while(sectors) {
2391	int s = sectors;
2392	int d = read_disk;
2393	int success = `0`;
2394	int start;
2395
2396	if (s > (PAGE_SIZE>>`9`))
2397	s = PAGE_SIZE >> `9`;
2398
2399	do {
2400	rdev = conf->mirrors[d].rdev;
2401	if (rdev &&
2402	(test_bit(In_sync, &rdev->flags) \|\|
2403	(!test_bit(Faulty, &rdev->flags) &&
2404	rdev->recovery_offset >= sect + s)) &&
2405	rdev_has_badblock(rdev, s: sect, sectors: s) == `0`) {
2406	atomic_inc(v: &rdev->nr_pending);
2407	if (sync_page_io(rdev, sector: sect, size: s<<`9`,
2408	page: conf->tmppage, opf: REQ_OP_READ, metadata_op: false))
2409	success = `1`;
2410	rdev_dec_pending(rdev, mddev);
2411	if (success)
2412	break;
2413	}
2414
2415	d++;
2416	if (d == conf->raid_disks * `2`)
2417	d = `0`;
2418	} while (d != read_disk);
2419
2420	if (!success) {
2421	/ Cannot read from anywhere - mark it bad /
2422	struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
2423	if (!rdev_set_badblocks(rdev, s: sect, sectors: s, is_new: `0`))
2424	md_error(mddev, rdev);
2425	break;
2426	}
2427	/ write it back and re-read /
2428	start = d;
2429	while (d != read_disk) {
2430	if (d==`0`)
2431	d = conf->raid_disks * `2`;
2432	d--;
2433	rdev = conf->mirrors[d].rdev;
2434	if (rdev &&
2435	!test_bit(Faulty, &rdev->flags)) {
2436	atomic_inc(v: &rdev->nr_pending);
2437	r1_sync_page_io(rdev, sector: sect, sectors: s,
2438	page: conf->tmppage, rw: REQ_OP_WRITE);
2439	rdev_dec_pending(rdev, mddev);
2440	}
2441	}
2442	d = start;
2443	while (d != read_disk) {
2444	if (d==`0`)
2445	d = conf->raid_disks * `2`;
2446	d--;
2447	rdev = conf->mirrors[d].rdev;
2448	if (rdev &&
2449	!test_bit(Faulty, &rdev->flags)) {
2450	atomic_inc(v: &rdev->nr_pending);
2451	if (r1_sync_page_io(rdev, sector: sect, sectors: s,
2452	page: conf->tmppage, rw: REQ_OP_READ)) {
2453	atomic_add(i: s, v: &rdev->corrected_errors);
2454	pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %pg)\n",
2455	mdname(mddev), s,
2456	(unsigned long long)(sect +
2457	rdev->data_offset),
2458	rdev->bdev);
2459	}
2460	rdev_dec_pending(rdev, mddev);
2461	}
2462	}
2463	sectors -= s;
2464	sect += s;
2465	}
2466	}
2467
2468	static int narrow_write_error(struct r1bio r1_bio, int* i)
2469	{
2470	struct mddev *mddev = r1_bio->mddev;
2471	struct r1conf *conf = mddev->private;
2472	struct md_rdev *rdev = conf->mirrors[i].rdev;
2473
2474	/ bio has the data to be written to device 'i' where*
2475	* we just recently had a write error.
2476	* We repeatedly clone the bio and trim down to one block,
2477	* then try the write. Where the write fails we record
2478	* a bad block.
2479	* It is conceivable that the bio doesn't exactly align with
2480	* blocks. We must handle this somehow.
2481	*
2482	* We currently own a reference on the rdev.
2483	*/
2484
2485	int block_sectors;
2486	sector_t sector;
2487	int sectors;
2488	int sect_to_write = r1_bio->sectors;
2489	int ok = `1`;
2490
2491	if (rdev->badblocks.shift < `0`)
2492	return `0`;
2493
2494	block_sectors = roundup(`1` << rdev->badblocks.shift,
2495	bdev_logical_block_size(rdev->bdev) >> `9`);
2496	sector = r1_bio->sector;
2497	sectors = ((sector + block_sectors)
2498	& ~(sector_t)(block_sectors - `1`))
2499	- sector;
2500
2501	while (sect_to_write) {
2502	struct bio *wbio;
2503	if (sectors > sect_to_write)
2504	sectors = sect_to_write;
2505	/ Write at 'sector' for 'sectors'/
2506
2507	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
2508	wbio = bio_alloc_clone(bdev: rdev->bdev,
2509	bio_src: r1_bio->behind_master_bio,
2510	GFP_NOIO, bs: &mddev->bio_set);
2511	} else {
2512	wbio = bio_alloc_clone(bdev: rdev->bdev, bio_src: r1_bio->master_bio,
2513	GFP_NOIO, bs: &mddev->bio_set);
2514	}
2515
2516	wbio->bi_opf = REQ_OP_WRITE;
2517	wbio->bi_iter.bi_sector = r1_bio->sector;
2518	wbio->bi_iter.bi_size = r1_bio->sectors << `9`;
2519
2520	bio_trim(bio: wbio, offset: sector - r1_bio->sector, size: sectors);
2521	wbio->bi_iter.bi_sector += rdev->data_offset;
2522
2523	if (submit_bio_wait(bio: wbio) < `0`)
2524	/ failure! /
2525	ok = rdev_set_badblocks(rdev, s: sector,
2526	sectors, is_new: `0`)
2527	&& ok;
2528
2529	bio_put(wbio);
2530	sect_to_write -= sectors;
2531	sector += sectors;
2532	sectors = block_sectors;
2533	}
2534	return ok;
2535	}
2536
2537	static void handle_sync_write_finished(struct r1conf conf, struct* r1bio *r1_bio)
2538	{
2539	int m;
2540	int s = r1_bio->sectors;
2541	for (m = `0`; m < conf->raid_disks * `2` ; m++) {
2542	struct md_rdev *rdev = conf->mirrors[m].rdev;
2543	struct bio *bio = r1_bio->bios[m];
2544	if (bio->bi_end_io == NULL)
2545	continue;
2546	if (!bio->bi_status &&
2547	test_bit(R1BIO_MadeGood, &r1_bio->state)) {
2548	rdev_clear_badblocks(rdev, s: r1_bio->sector, sectors: s, is_new: `0`);
2549	}
2550	if (bio->bi_status &&
2551	test_bit(R1BIO_WriteError, &r1_bio->state)) {
2552	if (!rdev_set_badblocks(rdev, s: r1_bio->sector, sectors: s, is_new: `0`))
2553	md_error(mddev: conf->mddev, rdev);
2554	}
2555	}
2556	put_buf(r1_bio);
2557	md_done_sync(mddev: conf->mddev, blocks: s, ok: `1`);
2558	}
2559
2560	static void handle_write_finished(struct r1conf conf, struct* r1bio *r1_bio)
2561	{
2562	int m, idx;
2563	bool fail = false;
2564
2565	for (m = `0`; m < conf->raid_disks * `2` ; m++)
2566	if (r1_bio->bios[m] == IO_MADE_GOOD) {
2567	struct md_rdev *rdev = conf->mirrors[m].rdev;
2568	rdev_clear_badblocks(rdev,
2569	s: r1_bio->sector,
2570	sectors: r1_bio->sectors, is_new: `0`);
2571	rdev_dec_pending(rdev, mddev: conf->mddev);
2572	} else if (r1_bio->bios[m] != NULL) {
2573	/ This drive got a write error. We need to*
2574	* narrow down and record precise write
2575	* errors.
2576	*/
2577	fail = true;
2578	if (!narrow_write_error(r1_bio, i: m)) {
2579	md_error(mddev: conf->mddev,
2580	rdev: conf->mirrors[m].rdev);
2581	/ an I/O failed, we can't clear the bitmap /
2582	set_bit(nr: R1BIO_Degraded, addr: &r1_bio->state);
2583	}
2584	rdev_dec_pending(rdev: conf->mirrors[m].rdev,
2585	mddev: conf->mddev);
2586	}
2587	if (fail) {
2588	spin_lock_irq(lock: &conf->device_lock);
2589	list_add(new: &r1_bio->retry_list, head: &conf->bio_end_io_list);
2590	idx = sector_to_idx(sector: r1_bio->sector);
2591	atomic_inc(v: &conf->nr_queued[idx]);
2592	spin_unlock_irq(lock: &conf->device_lock);
2593	/*
2594	* In case freeze_array() is waiting for condition
2595	* get_unqueued_pending() == extra to be true.
2596	*/
2597	wake_up(&conf->wait_barrier);
2598	md_wakeup_thread(thread: conf->mddev->thread);
2599	} else {
2600	if (test_bit(R1BIO_WriteError, &r1_bio->state))
2601	close_write(r1_bio);
2602	raid_end_bio_io(r1_bio);
2603	}
2604	}
2605
2606	static void handle_read_error(struct r1conf conf, struct* r1bio *r1_bio)
2607	{
2608	struct mddev *mddev = conf->mddev;
2609	struct bio *bio;
2610	struct md_rdev *rdev;
2611	sector_t sector;
2612
2613	clear_bit(nr: R1BIO_ReadError, addr: &r1_bio->state);
2614	/ we got a read error. Maybe the drive is bad. Maybe just*
2615	* the block and we can fix it.
2616	* We freeze all other IO, and try reading the block from
2617	* other devices. When we find one, we re-write
2618	* and check it that fixes the read error.
2619	* This is all done synchronously while the array is
2620	* frozen
2621	*/
2622
2623	bio = r1_bio->bios[r1_bio->read_disk];
2624	bio_put(bio);
2625	r1_bio->bios[r1_bio->read_disk] = NULL;
2626
2627	rdev = conf->mirrors[r1_bio->read_disk].rdev;
2628	if (mddev->ro == `0`
2629	&& !test_bit(FailFast, &rdev->flags)) {
2630	freeze_array(conf, extra: `1`);
2631	fix_read_error(conf, r1_bio);
2632	unfreeze_array(conf);
2633	} else if (mddev->ro == `0` && test_bit(FailFast, &rdev->flags)) {
2634	md_error(mddev, rdev);
2635	} else {
2636	r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
2637	}
2638
2639	rdev_dec_pending(rdev, mddev: conf->mddev);
2640	sector = r1_bio->sector;
2641	bio = r1_bio->master_bio;
2642
2643	/ Reuse the old r1_bio so that the IO_BLOCKED settings are preserved /
2644	r1_bio->state = `0`;
2645	raid1_read_request(mddev, bio, max_read_sectors: r1_bio->sectors, r1_bio);
2646	allow_barrier(conf, sector_nr: sector);
2647	}
2648
2649	static void raid1d(struct md_thread *thread)
2650	{
2651	struct mddev *mddev = thread->mddev;
2652	struct r1bio *r1_bio;
2653	unsigned long flags;
2654	struct r1conf *conf = mddev->private;
2655	struct list_head *head = &conf->retry_list;
2656	struct blk_plug plug;
2657	int idx;
2658
2659	md_check_recovery(mddev);
2660
2661	if (!list_empty_careful(head: &conf->bio_end_io_list) &&
2662	!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2663	LIST_HEAD(tmp);
2664	spin_lock_irqsave(&conf->device_lock, flags);
2665	if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
2666	list_splice_init(list: &conf->bio_end_io_list, head: &tmp);
2667	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
2668	while (!list_empty(head: &tmp)) {
2669	r1_bio = list_first_entry(&tmp, struct r1bio,
2670	retry_list);
2671	list_del(entry: &r1_bio->retry_list);
2672	idx = sector_to_idx(sector: r1_bio->sector);
2673	atomic_dec(v: &conf->nr_queued[idx]);
2674	if (mddev->degraded)
2675	set_bit(nr: R1BIO_Degraded, addr: &r1_bio->state);
2676	if (test_bit(R1BIO_WriteError, &r1_bio->state))
2677	close_write(r1_bio);
2678	raid_end_bio_io(r1_bio);
2679	}
2680	}
2681
2682	blk_start_plug(&plug);
2683	for (;;) {
2684
2685	flush_pending_writes(conf);
2686
2687	spin_lock_irqsave(&conf->device_lock, flags);
2688	if (list_empty(head)) {
2689	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
2690	break;
2691	}
2692	r1_bio = list_entry(head->prev, struct r1bio, retry_list);
2693	list_del(entry: head->prev);
2694	idx = sector_to_idx(sector: r1_bio->sector);
2695	atomic_dec(v: &conf->nr_queued[idx]);
2696	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
2697
2698	mddev = r1_bio->mddev;
2699	conf = mddev->private;
2700	if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
2701	if (test_bit(R1BIO_MadeGood, &r1_bio->state) \|\|
2702	test_bit(R1BIO_WriteError, &r1_bio->state))
2703	handle_sync_write_finished(conf, r1_bio);
2704	else
2705	sync_request_write(mddev, r1_bio);
2706	} else if (test_bit(R1BIO_MadeGood, &r1_bio->state) \|\|
2707	test_bit(R1BIO_WriteError, &r1_bio->state))
2708	handle_write_finished(conf, r1_bio);
2709	else if (test_bit(R1BIO_ReadError, &r1_bio->state))
2710	handle_read_error(conf, r1_bio);
2711	else
2712	WARN_ON_ONCE(`1`);
2713
2714	cond_resched();
2715	if (mddev->sb_flags & ~(`1`<<MD_SB_CHANGE_PENDING))
2716	md_check_recovery(mddev);
2717	}
2718	blk_finish_plug(&plug);
2719	}
2720
2721	static int init_resync(struct r1conf *conf)
2722	{
2723	int buffs;
2724
2725	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2726	BUG_ON(mempool_initialized(&conf->r1buf_pool));
2727
2728	return mempool_init(pool: &conf->r1buf_pool, min_nr: buffs, alloc_fn: r1buf_pool_alloc,
2729	free_fn: r1buf_pool_free, pool_data: conf->poolinfo);
2730	}
2731
2732	static struct r1bio raid1_alloc_init_r1buf(struct* r1conf *conf)
2733	{
2734	struct r1bio *r1bio = mempool_alloc(pool: &conf->r1buf_pool, GFP_NOIO);
2735	struct resync_pages *rps;
2736	struct bio *bio;
2737	int i;
2738
2739	for (i = conf->poolinfo->raid_disks; i--; ) {
2740	bio = r1bio->bios[i];
2741	rps = bio->bi_private;
2742	bio_reset(bio, NULL, opf: `0`);
2743	bio->bi_private = rps;
2744	}
2745	r1bio->master_bio = NULL;
2746	return r1bio;
2747	}
2748
2749	/*
2750	* perform a "sync" on one "block"
2751	*
2752	* We need to make sure that no normal I/O request - particularly write
2753	* requests - conflict with active sync requests.
2754	*
2755	* This is achieved by tracking pending requests and a 'barrier' concept
2756	* that can be installed to exclude normal IO requests.
2757	*/
2758
2759	static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
2760	int *skipped)
2761	{
2762	struct r1conf *conf = mddev->private;
2763	struct r1bio *r1_bio;
2764	struct bio *bio;
2765	sector_t max_sector, nr_sectors;
2766	int disk = -`1`;
2767	int i;
2768	int wonly = -`1`;
2769	int write_targets = `0`, read_targets = `0`;
2770	sector_t sync_blocks;
2771	int still_degraded = `0`;
2772	int good_sectors = RESYNC_SECTORS;
2773	int min_bad = `0`; / number of sectors that are bad in all devices /
2774	int idx = sector_to_idx(sector: sector_nr);
2775	int page_idx = `0`;
2776
2777	if (!mempool_initialized(pool: &conf->r1buf_pool))
2778	if (init_resync(conf))
2779	return `0`;
2780
2781	max_sector = mddev->dev_sectors;
2782	if (sector_nr >= max_sector) {
2783	/ If we aborted, we need to abort the*
2784	* sync on the 'current' bitmap chunk (there will
2785	* only be one in raid1 resync.
2786	* We can find the current addess in mddev->curr_resync
2787	*/
2788	if (mddev->curr_resync < max_sector) / aborted /
2789	md_bitmap_end_sync(bitmap: mddev->bitmap, offset: mddev->curr_resync,
2790	blocks: &sync_blocks, aborted: `1`);
2791	else / completed sync /
2792	conf->fullsync = `0`;
2793
2794	md_bitmap_close_sync(bitmap: mddev->bitmap);
2795	close_sync(conf);
2796
2797	if (mddev_is_clustered(mddev)) {
2798	conf->cluster_sync_low = `0`;
2799	conf->cluster_sync_high = `0`;
2800	}
2801	return `0`;
2802	}
2803
2804	if (mddev->bitmap == NULL &&
2805	mddev->recovery_cp == MaxSector &&
2806	!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2807	conf->fullsync == `0`) {
2808	*skipped = `1`;
2809	return max_sector - sector_nr;
2810	}
2811	/ before building a request, check if we can skip these blocks..*
2812	* This call the bitmap_start_sync doesn't actually record anything
2813	*/
2814	if (!md_bitmap_start_sync(bitmap: mddev->bitmap, offset: sector_nr, blocks: &sync_blocks, degraded: `1`) &&
2815	!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
2816	/ We can skip this block, and probably several more /
2817	*skipped = `1`;
2818	return sync_blocks;
2819	}
2820
2821	/*
2822	* If there is non-resync activity waiting for a turn, then let it
2823	* though before starting on this new sync request.
2824	*/
2825	if (atomic_read(v: &conf->nr_waiting[idx]))
2826	schedule_timeout_uninterruptible(timeout: `1`);
2827
2828	/ we are incrementing sector_nr below. To be safe, we check against*
2829	* sector_nr + two times RESYNC_SECTORS
2830	*/
2831
2832	md_bitmap_cond_end_sync(bitmap: mddev->bitmap, sector: sector_nr,
2833	force: mddev_is_clustered(mddev) && (sector_nr + `2` * RESYNC_SECTORS > conf->cluster_sync_high));
2834
2835
2836	if (raise_barrier(conf, sector_nr))
2837	return `0`;
2838
2839	r1_bio = raid1_alloc_init_r1buf(conf);
2840
2841	/*
2842	* If we get a correctably read error during resync or recovery,
2843	* we might want to read from a different device. So we
2844	* flag all drives that could conceivably be read from for READ,
2845	* and any others (which will be non-In_sync devices) for WRITE.
2846	* If a read fails, we try reading from something else for which READ
2847	* is OK.
2848	*/
2849
2850	r1_bio->mddev = mddev;
2851	r1_bio->sector = sector_nr;
2852	r1_bio->state = `0`;
2853	set_bit(nr: R1BIO_IsSync, addr: &r1_bio->state);
2854	/ make sure good_sectors won't go across barrier unit boundary /
2855	good_sectors = align_to_barrier_unit_end(start_sector: sector_nr, sectors: good_sectors);
2856
2857	for (i = `0`; i < conf->raid_disks * `2`; i++) {
2858	struct md_rdev *rdev;
2859	bio = r1_bio->bios[i];
2860
2861	rdev = conf->mirrors[i].rdev;
2862	if (rdev == NULL \|\|
2863	test_bit(Faulty, &rdev->flags)) {
2864	if (i < conf->raid_disks)
2865	still_degraded = `1`;
2866	} else if (!test_bit(In_sync, &rdev->flags)) {
2867	bio->bi_opf = REQ_OP_WRITE;
2868	bio->bi_end_io = end_sync_write;
2869	write_targets ++;
2870	} else {
2871	/ may need to read from here /
2872	sector_t first_bad = MaxSector;
2873	int bad_sectors;
2874
2875	if (is_badblock(rdev, s: sector_nr, sectors: good_sectors,
2876	first_bad: &first_bad, bad_sectors: &bad_sectors)) {
2877	if (first_bad > sector_nr)
2878	good_sectors = first_bad - sector_nr;
2879	else {
2880	bad_sectors -= (sector_nr - first_bad);
2881	if (min_bad == `0` \|\|
2882	min_bad > bad_sectors)
2883	min_bad = bad_sectors;
2884	}
2885	}
2886	if (sector_nr < first_bad) {
2887	if (test_bit(WriteMostly, &rdev->flags)) {
2888	if (wonly < `0`)
2889	wonly = i;
2890	} else {
2891	if (disk < `0`)
2892	disk = i;
2893	}
2894	bio->bi_opf = REQ_OP_READ;
2895	bio->bi_end_io = end_sync_read;
2896	read_targets++;
2897	} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
2898	test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2899	!test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
2900	/*
2901	* The device is suitable for reading (InSync),
2902	* but has bad block(s) here. Let's try to correct them,
2903	* if we are doing resync or repair. Otherwise, leave
2904	* this device alone for this sync request.
2905	*/
2906	bio->bi_opf = REQ_OP_WRITE;
2907	bio->bi_end_io = end_sync_write;
2908	write_targets++;
2909	}
2910	}
2911	if (rdev && bio->bi_end_io) {
2912	atomic_inc(v: &rdev->nr_pending);
2913	bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
2914	bio_set_dev(bio, bdev: rdev->bdev);
2915	if (test_bit(FailFast, &rdev->flags))
2916	bio->bi_opf \|= MD_FAILFAST;
2917	}
2918	}
2919	if (disk < `0`)
2920	disk = wonly;
2921	r1_bio->read_disk = disk;
2922
2923	if (read_targets == `0` && min_bad > `0`) {
2924	/ These sectors are bad on all InSync devices, so we*
2925	* need to mark them bad on all write targets
2926	*/
2927	int ok = `1`;
2928	for (i = `0` ; i < conf->raid_disks * `2` ; i++)
2929	if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
2930	struct md_rdev *rdev = conf->mirrors[i].rdev;
2931	ok = rdev_set_badblocks(rdev, s: sector_nr,
2932	sectors: min_bad, is_new: `0`
2933	) && ok;
2934	}
2935	set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags);
2936	*skipped = `1`;
2937	put_buf(r1_bio);
2938
2939	if (!ok) {
2940	/ Cannot record the badblocks, so need to*
2941	* abort the resync.
2942	* If there are multiple read targets, could just
2943	* fail the really bad ones ???
2944	*/
2945	conf->recovery_disabled = mddev->recovery_disabled;
2946	set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery);
2947	return `0`;
2948	} else
2949	return min_bad;
2950
2951	}
2952	if (min_bad > `0` && min_bad < good_sectors) {
2953	/ only resync enough to reach the next bad->good*
2954	* transition */
2955	good_sectors = min_bad;
2956	}
2957
2958	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > `0`)
2959	/ extra read targets are also write targets /
2960	write_targets += read_targets-`1`;
2961
2962	if (write_targets == `0` \|\| read_targets == `0`) {
2963	/ There is nowhere to write, so all non-sync*
2964	* drives must be failed - so we are finished
2965	*/
2966	sector_t rv;
2967	if (min_bad > `0`)
2968	max_sector = sector_nr + min_bad;
2969	rv = max_sector - sector_nr;
2970	*skipped = `1`;
2971	put_buf(r1_bio);
2972	return rv;
2973	}
2974
2975	if (max_sector > mddev->resync_max)
2976	max_sector = mddev->resync_max; / Don't do IO beyond here /
2977	if (max_sector > sector_nr + good_sectors)
2978	max_sector = sector_nr + good_sectors;
2979	nr_sectors = `0`;
2980	sync_blocks = `0`;
2981	do {
2982	struct page *page;
2983	int len = PAGE_SIZE;
2984	if (sector_nr + (len>>`9`) > max_sector)
2985	len = (max_sector - sector_nr) << `9`;
2986	if (len == `0`)
2987	break;
2988	if (sync_blocks == `0`) {
2989	if (!md_bitmap_start_sync(bitmap: mddev->bitmap, offset: sector_nr,
2990	blocks: &sync_blocks, degraded: still_degraded) &&
2991	!conf->fullsync &&
2992	!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
2993	break;
2994	if ((len >> `9`) > sync_blocks)
2995	len = sync_blocks<<`9`;
2996	}
2997
2998	for (i = `0` ; i < conf->raid_disks * `2`; i++) {
2999	struct resync_pages *rp;
3000
3001	bio = r1_bio->bios[i];
3002	rp = get_resync_pages(bio);
3003	if (bio->bi_end_io) {
3004	page = resync_fetch_page(rp, idx: page_idx);
3005
3006	/*
3007	* won't fail because the vec table is big
3008	* enough to hold all these pages
3009	*/
3010	__bio_add_page(bio, page, len, off: `0`);
3011	}
3012	}
3013	nr_sectors += len>>`9`;
3014	sector_nr += len>>`9`;
3015	sync_blocks -= (len>>`9`);
3016	} while (++page_idx < RESYNC_PAGES);
3017
3018	r1_bio->sectors = nr_sectors;
3019
3020	if (mddev_is_clustered(mddev) &&
3021	conf->cluster_sync_high < sector_nr + nr_sectors) {
3022	conf->cluster_sync_low = mddev->curr_resync_completed;
3023	conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
3024	/ Send resync message /
3025	md_cluster_ops->resync_info_update(mddev,
3026	conf->cluster_sync_low,
3027	conf->cluster_sync_high);
3028	}
3029
3030	/ For a user-requested sync, we read all readable devices and do a*
3031	* compare
3032	*/
3033	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
3034	atomic_set(v: &r1_bio->remaining, i: read_targets);
3035	for (i = `0`; i < conf->raid_disks * `2` && read_targets; i++) {
3036	bio = r1_bio->bios[i];
3037	if (bio->bi_end_io == end_sync_read) {
3038	read_targets--;
3039	md_sync_acct_bio(bio, nr_sectors);
3040	if (read_targets == `1`)
3041	bio->bi_opf &= ~MD_FAILFAST;
3042	submit_bio_noacct(bio);
3043	}
3044	}
3045	} else {
3046	atomic_set(v: &r1_bio->remaining, i: `1`);
3047	bio = r1_bio->bios[r1_bio->read_disk];
3048	md_sync_acct_bio(bio, nr_sectors);
3049	if (read_targets == `1`)
3050	bio->bi_opf &= ~MD_FAILFAST;
3051	submit_bio_noacct(bio);
3052	}
3053	return nr_sectors;
3054	}
3055
3056	static sector_t raid1_size(struct mddev mddev, sector_t sectors, int* raid_disks)
3057	{
3058	if (sectors)
3059	return sectors;
3060
3061	return mddev->dev_sectors;
3062	}
3063
3064	static struct r1conf setup_conf(struct* mddev *mddev)
3065	{
3066	struct r1conf *conf;
3067	int i;
3068	struct raid1_info *disk;
3069	struct md_rdev *rdev;
3070	int err = -ENOMEM;
3071
3072	conf = kzalloc(size: sizeof(struct r1conf), GFP_KERNEL);
3073	if (!conf)
3074	goto abort;
3075
3076	conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
3077	size: sizeof(atomic_t), GFP_KERNEL);
3078	if (!conf->nr_pending)
3079	goto abort;
3080
3081	conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
3082	size: sizeof(atomic_t), GFP_KERNEL);
3083	if (!conf->nr_waiting)
3084	goto abort;
3085
3086	conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
3087	size: sizeof(atomic_t), GFP_KERNEL);
3088	if (!conf->nr_queued)
3089	goto abort;
3090
3091	conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
3092	size: sizeof(atomic_t), GFP_KERNEL);
3093	if (!conf->barrier)
3094	goto abort;
3095
3096	conf->mirrors = kzalloc(array3_size(sizeof(struct raid1_info),
3097	mddev->raid_disks, `2`),
3098	GFP_KERNEL);
3099	if (!conf->mirrors)
3100	goto abort;
3101
3102	conf->tmppage = alloc_page(GFP_KERNEL);
3103	if (!conf->tmppage)
3104	goto abort;
3105
3106	conf->poolinfo = kzalloc(size: sizeof(*conf->poolinfo), GFP_KERNEL);
3107	if (!conf->poolinfo)
3108	goto abort;
3109	conf->poolinfo->raid_disks = mddev->raid_disks * `2`;
3110	err = mempool_init(pool: &conf->r1bio_pool, NR_RAID_BIOS, alloc_fn: r1bio_pool_alloc,
3111	free_fn: rbio_pool_free, pool_data: conf->poolinfo);
3112	if (err)
3113	goto abort;
3114
3115	err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, `0`, flags: `0`);
3116	if (err)
3117	goto abort;
3118
3119	conf->poolinfo->mddev = mddev;
3120
3121	err = -EINVAL;
3122	spin_lock_init(&conf->device_lock);
3123	conf->raid_disks = mddev->raid_disks;
3124	rdev_for_each(rdev, mddev) {
3125	int disk_idx = rdev->raid_disk;
3126
3127	if (disk_idx >= conf->raid_disks \|\| disk_idx < `0`)
3128	continue;
3129
3130	if (!raid1_add_conf(conf, rdev, disk: disk_idx,
3131	test_bit(Replacement, &rdev->flags)))
3132	goto abort;
3133	}
3134	conf->mddev = mddev;
3135	INIT_LIST_HEAD(list: &conf->retry_list);
3136	INIT_LIST_HEAD(list: &conf->bio_end_io_list);
3137
3138	spin_lock_init(&conf->resync_lock);
3139	init_waitqueue_head(&conf->wait_barrier);
3140
3141	bio_list_init(bl: &conf->pending_bio_list);
3142	conf->recovery_disabled = mddev->recovery_disabled - `1`;
3143
3144	err = -EIO;
3145	for (i = `0`; i < conf->raid_disks * `2`; i++) {
3146
3147	disk = conf->mirrors + i;
3148
3149	if (i < conf->raid_disks &&
3150	disk[conf->raid_disks].rdev) {
3151	/ This slot has a replacement. /
3152	if (!disk->rdev) {
3153	/ No original, just make the replacement*
3154	* a recovering spare
3155	*/
3156	disk->rdev =
3157	disk[conf->raid_disks].rdev;
3158	disk[conf->raid_disks].rdev = NULL;
3159	} else if (!test_bit(In_sync, &disk->rdev->flags))
3160	/ Original is not in_sync - bad /
3161	goto abort;
3162	}
3163
3164	if (!disk->rdev \|\|
3165	!test_bit(In_sync, &disk->rdev->flags)) {
3166	disk->head_position = `0`;
3167	if (disk->rdev &&
3168	(disk->rdev->saved_raid_disk < `0`))
3169	conf->fullsync = `1`;
3170	}
3171	}
3172
3173	err = -ENOMEM;
3174	rcu_assign_pointer(conf->thread,
3175	md_register_thread(raid1d, mddev, "raid1"));
3176	if (!conf->thread)
3177	goto abort;
3178
3179	return conf;
3180
3181	abort:
3182	if (conf) {
3183	mempool_exit(pool: &conf->r1bio_pool);
3184	kfree(objp: conf->mirrors);
3185	safe_put_page(p: conf->tmppage);
3186	kfree(objp: conf->poolinfo);
3187	kfree(objp: conf->nr_pending);
3188	kfree(objp: conf->nr_waiting);
3189	kfree(objp: conf->nr_queued);
3190	kfree(objp: conf->barrier);
3191	bioset_exit(&conf->bio_split);
3192	kfree(objp: conf);
3193	}
3194	return ERR_PTR(error: err);
3195	}
3196
3197	static int raid1_set_limits(struct mddev *mddev)
3198	{
3199	struct queue_limits lim;
3200
3201	blk_set_stacking_limits(lim: &lim);
3202	lim.max_write_zeroes_sectors = `0`;
3203	mddev_stack_rdev_limits(mddev, lim: &lim);
3204	return queue_limits_set(q: mddev->gendisk->queue, lim: &lim);
3205	}
3206
3207	static void raid1_free(struct mddev mddev, void* *priv);
3208	static int raid1_run(struct mddev *mddev)
3209	{
3210	struct r1conf *conf;
3211	int i;
3212	int ret;
3213
3214	if (mddev->level != `1`) {
3215	pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n",
3216	mdname(mddev), mddev->level);
3217	return -EIO;
3218	}
3219	if (mddev->reshape_position != MaxSector) {
3220	pr_warn("md/raid1:%s: reshape_position set but not supported\n",
3221	mdname(mddev));
3222	return -EIO;
3223	}
3224
3225	/*
3226	* copy the already verified devices into our private RAID1
3227	* bookkeeping area. [whatever we allocate in run(),
3228	* should be freed in raid1_free()]
3229	*/
3230	if (mddev->private == NULL)
3231	conf = setup_conf(mddev);
3232	else
3233	conf = mddev->private;
3234
3235	if (IS_ERR(ptr: conf))
3236	return PTR_ERR(ptr: conf);
3237
3238	if (!mddev_is_dm(mddev)) {
3239	ret = raid1_set_limits(mddev);
3240	if (ret)
3241	goto abort;
3242	}
3243
3244	mddev->degraded = `0`;
3245	for (i = `0`; i < conf->raid_disks; i++)
3246	if (conf->mirrors[i].rdev == NULL \|\|
3247	!test_bit(In_sync, &conf->mirrors[i].rdev->flags) \|\|
3248	test_bit(Faulty, &conf->mirrors[i].rdev->flags))
3249	mddev->degraded++;
3250	/*
3251	* RAID1 needs at least one disk in active
3252	*/
3253	if (conf->raid_disks - mddev->degraded < `1`) {
3254	md_unregister_thread(mddev, threadp: &conf->thread);
3255	ret = -EINVAL;
3256	goto abort;
3257	}
3258
3259	if (conf->raid_disks - mddev->degraded == `1`)
3260	mddev->recovery_cp = MaxSector;
3261
3262	if (mddev->recovery_cp != MaxSector)
3263	pr_info("md/raid1:%s: not clean -- starting background reconstruction\n",
3264	mdname(mddev));
3265	pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
3266	mdname(mddev), mddev->raid_disks - mddev->degraded,
3267	mddev->raid_disks);
3268
3269	/*
3270	* Ok, everything is just fine now
3271	*/
3272	rcu_assign_pointer(mddev->thread, conf->thread);
3273	rcu_assign_pointer(conf->thread, NULL);
3274	mddev->private = conf;
3275	set_bit(nr: MD_FAILFAST_SUPPORTED, addr: &mddev->flags);
3276
3277	md_set_array_sectors(mddev, array_sectors: raid1_size(mddev, sectors: `0`, raid_disks: `0`));
3278
3279	ret = md_integrity_register(mddev);
3280	if (ret) {
3281	md_unregister_thread(mddev, threadp: &mddev->thread);
3282	goto abort;
3283	}
3284	return `0`;
3285
3286	abort:
3287	raid1_free(mddev, priv: conf);
3288	return ret;
3289	}
3290
3291	static void raid1_free(struct mddev mddev, void* *priv)
3292	{
3293	struct r1conf *conf = priv;
3294
3295	mempool_exit(pool: &conf->r1bio_pool);
3296	kfree(objp: conf->mirrors);
3297	safe_put_page(p: conf->tmppage);
3298	kfree(objp: conf->poolinfo);
3299	kfree(objp: conf->nr_pending);
3300	kfree(objp: conf->nr_waiting);
3301	kfree(objp: conf->nr_queued);
3302	kfree(objp: conf->barrier);
3303	bioset_exit(&conf->bio_split);
3304	kfree(objp: conf);
3305	}
3306
3307	static int raid1_resize(struct mddev *mddev, sector_t sectors)
3308	{
3309	/ no resync is happening, and there is enough space*
3310	* on all devices, so we can resize.
3311	* We need to make sure resync covers any new space.
3312	* If the array is shrinking we should possibly wait until
3313	* any io in the removed space completes, but it hardly seems
3314	* worth it.
3315	*/
3316	sector_t newsize = raid1_size(mddev, sectors, raid_disks: `0`);
3317	if (mddev->external_size &&
3318	mddev->array_sectors > newsize)
3319	return -EINVAL;
3320	if (mddev->bitmap) {
3321	int ret = md_bitmap_resize(bitmap: mddev->bitmap, blocks: newsize, chunksize: `0`, init: `0`);
3322	if (ret)
3323	return ret;
3324	}
3325	md_set_array_sectors(mddev, array_sectors: newsize);
3326	if (sectors > mddev->dev_sectors &&
3327	mddev->recovery_cp > mddev->dev_sectors) {
3328	mddev->recovery_cp = mddev->dev_sectors;
3329	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
3330	}
3331	mddev->dev_sectors = sectors;
3332	mddev->resync_max_sectors = sectors;
3333	return `0`;
3334	}
3335
3336	static int raid1_reshape(struct mddev *mddev)
3337	{
3338	/ We need to:*
3339	* 1/ resize the r1bio_pool
3340	* 2/ resize conf->mirrors
3341	*
3342	* We allocate a new r1bio_pool if we can.
3343	* Then raise a device barrier and wait until all IO stops.
3344	* Then resize conf->mirrors and swap in the new r1bio pool.
3345	*
3346	* At the same time, we "pack" the devices so that all the missing
3347	* devices have the higher raid_disk numbers.
3348	*/
3349	mempool_t newpool, oldpool;
3350	struct pool_info *newpoolinfo;
3351	struct raid1_info *newmirrors;
3352	struct r1conf *conf = mddev->private;
3353	int cnt, raid_disks;
3354	unsigned long flags;
3355	int d, d2;
3356	int ret;
3357
3358	memset(&newpool, `0`, sizeof(newpool));
3359	memset(&oldpool, `0`, sizeof(oldpool));
3360
3361	/ Cannot change chunk_size, layout, or level /
3362	if (mddev->chunk_sectors != mddev->new_chunk_sectors \|\|
3363	mddev->layout != mddev->new_layout \|\|
3364	mddev->level != mddev->new_level) {
3365	mddev->new_chunk_sectors = mddev->chunk_sectors;
3366	mddev->new_layout = mddev->layout;
3367	mddev->new_level = mddev->level;
3368	return -EINVAL;
3369	}
3370
3371	if (!mddev_is_clustered(mddev))
3372	md_allow_write(mddev);
3373
3374	raid_disks = mddev->raid_disks + mddev->delta_disks;
3375
3376	if (raid_disks < conf->raid_disks) {
3377	cnt=`0`;
3378	for (d= `0`; d < conf->raid_disks; d++)
3379	if (conf->mirrors[d].rdev)
3380	cnt++;
3381	if (cnt > raid_disks)
3382	return -EBUSY;
3383	}
3384
3385	newpoolinfo = kmalloc(size: sizeof(*newpoolinfo), GFP_KERNEL);
3386	if (!newpoolinfo)
3387	return -ENOMEM;
3388	newpoolinfo->mddev = mddev;
3389	newpoolinfo->raid_disks = raid_disks * `2`;
3390
3391	ret = mempool_init(pool: &newpool, NR_RAID_BIOS, alloc_fn: r1bio_pool_alloc,
3392	free_fn: rbio_pool_free, pool_data: newpoolinfo);
3393	if (ret) {
3394	kfree(objp: newpoolinfo);
3395	return ret;
3396	}
3397	newmirrors = kzalloc(array3_size(sizeof(struct raid1_info),
3398	raid_disks, `2`),
3399	GFP_KERNEL);
3400	if (!newmirrors) {
3401	kfree(objp: newpoolinfo);
3402	mempool_exit(pool: &newpool);
3403	return -ENOMEM;
3404	}
3405
3406	freeze_array(conf, extra: `0`);
3407
3408	/ ok, everything is stopped /
3409	oldpool = conf->r1bio_pool;
3410	conf->r1bio_pool = newpool;
3411
3412	for (d = d2 = `0`; d < conf->raid_disks; d++) {
3413	struct md_rdev *rdev = conf->mirrors[d].rdev;
3414	if (rdev && rdev->raid_disk != d2) {
3415	sysfs_unlink_rdev(mddev, rdev);
3416	rdev->raid_disk = d2;
3417	sysfs_unlink_rdev(mddev, rdev);
3418	if (sysfs_link_rdev(mddev, rdev))
3419	pr_warn("md/raid1:%s: cannot register rd%d\n",
3420	mdname(mddev), rdev->raid_disk);
3421	}
3422	if (rdev)
3423	newmirrors[d2++].rdev = rdev;
3424	}
3425	kfree(objp: conf->mirrors);
3426	conf->mirrors = newmirrors;
3427	kfree(objp: conf->poolinfo);
3428	conf->poolinfo = newpoolinfo;
3429
3430	spin_lock_irqsave(&conf->device_lock, flags);
3431	mddev->degraded += (raid_disks - conf->raid_disks);
3432	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
3433	conf->raid_disks = mddev->raid_disks = raid_disks;
3434	mddev->delta_disks = `0`;
3435
3436	unfreeze_array(conf);
3437
3438	set_bit(nr: MD_RECOVERY_RECOVER, addr: &mddev->recovery);
3439	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
3440	md_wakeup_thread(thread: mddev->thread);
3441
3442	mempool_exit(pool: &oldpool);
3443	return `0`;
3444	}
3445
3446	static void raid1_quiesce(struct mddev mddev, int* quiesce)
3447	{
3448	struct r1conf *conf = mddev->private;
3449
3450	if (quiesce)
3451	freeze_array(conf, extra: `0`);
3452	else
3453	unfreeze_array(conf);
3454	}
3455
3456	static void raid1_takeover(struct* mddev *mddev)
3457	{
3458	/ raid1 can take over:*
3459	* raid5 with 2 devices, any layout or chunk size
3460	*/
3461	if (mddev->level == `5` && mddev->raid_disks == `2`) {
3462	struct r1conf *conf;
3463	mddev->new_level = `1`;
3464	mddev->new_layout = `0`;
3465	mddev->new_chunk_sectors = `0`;
3466	conf = setup_conf(mddev);
3467	if (!IS_ERR(ptr: conf)) {
3468	/ Array must appear to be quiesced /
3469	conf->array_frozen = `1`;
3470	mddev_clear_unsupported_flags(mddev,
3471	UNSUPPORTED_MDDEV_FLAGS);
3472	}
3473	return conf;
3474	}
3475	return ERR_PTR(error: -EINVAL);
3476	}
3477
3478	static struct md_personality raid1_personality =
3479	{
3480	.name = "raid1",
3481	.level = `1`,
3482	.owner = THIS_MODULE,
3483	.make_request = raid1_make_request,
3484	.run = raid1_run,
3485	.free = raid1_free,
3486	.status = raid1_status,
3487	.error_handler = raid1_error,
3488	.hot_add_disk = raid1_add_disk,
3489	.hot_remove_disk= raid1_remove_disk,
3490	.spare_active = raid1_spare_active,
3491	.sync_request = raid1_sync_request,
3492	.resize = raid1_resize,
3493	.size = raid1_size,
3494	.check_reshape = raid1_reshape,
3495	.quiesce = raid1_quiesce,
3496	.takeover = raid1_takeover,
3497	};
3498
3499	static int __init raid_init(void)
3500	{
3501	return register_md_personality(p: &raid1_personality);
3502	}
3503
3504	static void raid_exit(void)
3505	{
3506	unregister_md_personality(p: &raid1_personality);
3507	}
3508
3509	module_init(raid_init);
3510	module_exit(raid_exit);
3511	MODULE_LICENSE("GPL");
3512	MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
3513	MODULE_ALIAS("md-personality-3"); / RAID1 /
3514	MODULE_ALIAS("md-raid1");
3515	MODULE_ALIAS("md-level-1");
3516

source code of linux/drivers/md/raid1.c