raid5.c source code [linux/drivers/md/raid5.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* raid5.c : Multiple Devices driver for Linux
4	* Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
5	* Copyright (C) 1999, 2000 Ingo Molnar
6	* Copyright (C) 2002, 2003 H. Peter Anvin
7	*
8	* RAID-4/5/6 management functions.
9	* Thanks to Penguin Computing for making the RAID-6 development possible
10	* by donating a test server!
11	*/
12
13	/*
14	* BITMAP UNPLUGGING:
15	*
16	* The sequencing for updating the bitmap reliably is a little
17	* subtle (and I got it wrong the first time) so it deserves some
18	* explanation.
19	*
20	* We group bitmap updates into batches. Each batch has a number.
21	* We may write out several batches at once, but that isn't very important.
22	* conf->seq_write is the number of the last batch successfully written.
23	* conf->seq_flush is the number of the last batch that was closed to
24	* new additions.
25	* When we discover that we will need to write to any block in a stripe
26	* (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
27	* the number of the batch it will be in. This is seq_flush+1.
28	* When we are ready to do a write, if that batch hasn't been written yet,
29	* we plug the array and queue the stripe for later.
30	* When an unplug happens, we increment seq_flush, thus closing the current
31	* batch.
32	* When we notice that seq_flush > seq_write, we write out all pending updates
33	* to the bitmap, and advance seq_write to where seq_flush was.
34	* This may occasionally write a bit out twice, but is sure never to
35	* miss any bits.
36	*/
37
38	#include <linux/blkdev.h>
39	#include <linux/delay.h>
40	#include <linux/kthread.h>
41	#include <linux/raid/pq.h>
42	#include <linux/async_tx.h>
43	#include <linux/module.h>
44	#include <linux/async.h>
45	#include <linux/seq_file.h>
46	#include <linux/cpu.h>
47	#include <linux/slab.h>
48	#include <linux/ratelimit.h>
49	#include <linux/nodemask.h>
50
51	#include <trace/events/block.h>
52	#include <linux/list_sort.h>
53
54	#include "md.h"
55	#include "raid5.h"
56	#include "raid0.h"
57	#include "md-bitmap.h"
58	#include "raid5-log.h"
59
60	#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
61
62	#define cpu_to_group(cpu) cpu_to_node(cpu)
63	#define ANY_GROUP NUMA_NO_NODE
64
65	#define RAID5_MAX_REQ_STRIPES 256
66
67	static bool devices_handle_discard_safely = false;
68	module_param(devices_handle_discard_safely, bool, `0644`);
69	MODULE_PARM_DESC(devices_handle_discard_safely,
70	"Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
71	static struct workqueue_struct *raid5_wq;
72
73	static void raid5_quiesce(struct mddev mddev, int* quiesce);
74
75	static inline struct hlist_head stripe_hash(struct* r5conf *conf, sector_t sect)
76	{
77	int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
78	return &conf->stripe_hashtbl[hash];
79	}
80
81	static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
82	{
83	return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
84	}
85
86	static inline void lock_device_hash_lock(struct r5conf conf, int* hash)
87	__acquires(&conf->device_lock)
88	{
89	spin_lock_irq(lock: conf->hash_locks + hash);
90	spin_lock(lock: &conf->device_lock);
91	}
92
93	static inline void unlock_device_hash_lock(struct r5conf conf, int* hash)
94	__releases(&conf->device_lock)
95	{
96	spin_unlock(lock: &conf->device_lock);
97	spin_unlock_irq(lock: conf->hash_locks + hash);
98	}
99
100	static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
101	__acquires(&conf->device_lock)
102	{
103	int i;
104	spin_lock_irq(lock: conf->hash_locks);
105	for (i = `1`; i < NR_STRIPE_HASH_LOCKS; i++)
106	spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
107	spin_lock(lock: &conf->device_lock);
108	}
109
110	static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
111	__releases(&conf->device_lock)
112	{
113	int i;
114	spin_unlock(lock: &conf->device_lock);
115	for (i = NR_STRIPE_HASH_LOCKS - `1`; i; i--)
116	spin_unlock(lock: conf->hash_locks + i);
117	spin_unlock_irq(lock: conf->hash_locks);
118	}
119
120	/ Find first data disk in a raid6 stripe /
121	static inline int raid6_d0(struct stripe_head *sh)
122	{
123	if (sh->ddf_layout)
124	/ ddf always start from first device /
125	return `0`;
126	/ md starts just after Q block /
127	if (sh->qd_idx == sh->disks - `1`)
128	return `0`;
129	else
130	return sh->qd_idx + `1`;
131	}
/*
 * Return the disk index following @disk, wrapping back to 0 once
 * @raid_disks is reached.
 */
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}
137
138	/ When walking through the disks in a raid5, starting at raid6_d0,*
139	* We need to map each disk to a 'slot', where the data disks are slot
140	* 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
141	* is raid_disks-1. This help does that mapping.
142	*/
143	static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
144	int count, int* syndrome_disks)
145	{
146	int slot = *count;
147
148	if (sh->ddf_layout)
149	(*count)++;
150	if (idx == sh->pd_idx)
151	return syndrome_disks;
152	if (idx == sh->qd_idx)
153	return syndrome_disks + `1`;
154	if (!sh->ddf_layout)
155	(*count)++;
156	return slot;
157	}
158
159	static void print_raid5_conf (struct r5conf *conf);
160
161	static int stripe_operations_active(struct stripe_head *sh)
162	{
163	return sh->check_state \|\| sh->reconstruct_state \|\|
164	test_bit(STRIPE_BIOFILL_RUN, &sh->state) \|\|
165	test_bit(STRIPE_COMPUTE_RUN, &sh->state);
166	}
167
168	static bool stripe_is_lowprio(struct stripe_head *sh)
169	{
170	return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) \|\|
171	test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
172	!test_bit(STRIPE_R5C_CACHING, &sh->state);
173	}
174
175	static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
176	__must_hold(&sh->raid_conf->device_lock)
177	{
178	struct r5conf *conf = sh->raid_conf;
179	struct r5worker_group *group;
180	int thread_cnt;
181	int i, cpu = sh->cpu;
182
183	if (!cpu_online(cpu)) {
184	cpu = cpumask_any(cpu_online_mask);
185	sh->cpu = cpu;
186	}
187
188	if (list_empty(head: &sh->lru)) {
189	struct r5worker_group *group;
190	group = conf->worker_groups + cpu_to_group(cpu);
191	if (stripe_is_lowprio(sh))
192	list_add_tail(new: &sh->lru, head: &group->loprio_list);
193	else
194	list_add_tail(new: &sh->lru, head: &group->handle_list);
195	group->stripes_cnt++;
196	sh->group = group;
197	}
198
199	if (conf->worker_cnt_per_group == `0`) {
200	md_wakeup_thread(thread: conf->mddev->thread);
201	return;
202	}
203
204	group = conf->worker_groups + cpu_to_group(sh->cpu);
205
206	group->workers[`0`].working = true;
207	/ at least one worker should run to avoid race /
208	queue_work_on(cpu: sh->cpu, wq: raid5_wq, work: &group->workers[`0`].work);
209
210	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - `1`;
211	/ wakeup more workers /
212	for (i = `1`; i < conf->worker_cnt_per_group && thread_cnt > `0`; i++) {
213	if (group->workers[i].working == false) {
214	group->workers[i].working = true;
215	queue_work_on(cpu: sh->cpu, wq: raid5_wq,
216	work: &group->workers[i].work);
217	thread_cnt--;
218	}
219	}
220	}
221
222	static void do_release_stripe(struct r5conf conf, struct* stripe_head *sh,
223	struct list_head *temp_inactive_list)
224	__must_hold(&conf->device_lock)
225	{
226	int i;
227	int injournal = `0`; / number of date pages with R5_InJournal /
228
229	BUG_ON(!list_empty(&sh->lru));
230	BUG_ON(atomic_read(&conf->active_stripes)==`0`);
231
232	if (r5c_is_writeback(log: conf->log))
233	for (i = sh->disks; i--; )
234	if (test_bit(R5_InJournal, &sh->dev[i].flags))
235	injournal++;
236	/*
237	* In the following cases, the stripe cannot be released to cached
238	* lists. Therefore, we make the stripe write out and set
239	* STRIPE_HANDLE:
240	* 1. when quiesce in r5c write back;
241	* 2. when resync is requested fot the stripe.
242	*/
243	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) \|\|
244	(conf->quiesce && r5c_is_writeback(log: conf->log) &&
245	!test_bit(STRIPE_HANDLE, &sh->state) && injournal != `0`)) {
246	if (test_bit(STRIPE_R5C_CACHING, &sh->state))
247	r5c_make_stripe_write_out(sh);
248	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
249	}
250
251	if (test_bit(STRIPE_HANDLE, &sh->state)) {
252	if (test_bit(STRIPE_DELAYED, &sh->state) &&
253	!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
254	list_add_tail(new: &sh->lru, head: &conf->delayed_list);
255	else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
256	sh->bm_seq - conf->seq_write > `0`)
257	list_add_tail(new: &sh->lru, head: &conf->bitmap_list);
258	else {
259	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
260	clear_bit(nr: STRIPE_BIT_DELAY, addr: &sh->state);
261	if (conf->worker_cnt_per_group == `0`) {
262	if (stripe_is_lowprio(sh))
263	list_add_tail(new: &sh->lru,
264	head: &conf->loprio_list);
265	else
266	list_add_tail(new: &sh->lru,
267	head: &conf->handle_list);
268	} else {
269	raid5_wakeup_stripe_thread(sh);
270	return;
271	}
272	}
273	md_wakeup_thread(thread: conf->mddev->thread);
274	} else {
275	BUG_ON(stripe_operations_active(sh));
276	if (test_and_clear_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
277	if (atomic_dec_return(v: &conf->preread_active_stripes)
278	< IO_THRESHOLD)
279	md_wakeup_thread(thread: conf->mddev->thread);
280	atomic_dec(v: &conf->active_stripes);
281	if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
282	if (!r5c_is_writeback(log: conf->log))
283	list_add_tail(new: &sh->lru, head: temp_inactive_list);
284	else {
285	WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
286	if (injournal == `0`)
287	list_add_tail(new: &sh->lru, head: temp_inactive_list);
288	else if (injournal == conf->raid_disks - conf->max_degraded) {
289	/ full stripe /
290	if (!test_and_set_bit(nr: STRIPE_R5C_FULL_STRIPE, addr: &sh->state))
291	atomic_inc(v: &conf->r5c_cached_full_stripes);
292	if (test_and_clear_bit(nr: STRIPE_R5C_PARTIAL_STRIPE, addr: &sh->state))
293	atomic_dec(v: &conf->r5c_cached_partial_stripes);
294	list_add_tail(new: &sh->lru, head: &conf->r5c_full_stripe_list);
295	r5c_check_cached_full_stripe(conf);
296	} else
297	/*
298	* STRIPE_R5C_PARTIAL_STRIPE is set in
299	* r5c_try_caching_write(). No need to
300	* set it again.
301	*/
302	list_add_tail(new: &sh->lru, head: &conf->r5c_partial_stripe_list);
303	}
304	}
305	}
306	}
307
308	static void __release_stripe(struct r5conf conf, struct* stripe_head *sh,
309	struct list_head *temp_inactive_list)
310	__must_hold(&conf->device_lock)
311	{
312	if (atomic_dec_and_test(v: &sh->count))
313	do_release_stripe(conf, sh, temp_inactive_list);
314	}
315
316	/*
317	* @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
318	*
319	* Be careful: Only one task can add/delete stripes from temp_inactive_list at
320	* given time. Adding stripes only takes device lock, while deleting stripes
321	* only takes hash lock.
322	*/
323	static void release_inactive_stripe_list(struct r5conf *conf,
324	struct list_head *temp_inactive_list,
325	int hash)
326	{
327	int size;
328	bool do_wakeup = false;
329	unsigned long flags;
330
331	if (hash == NR_STRIPE_HASH_LOCKS) {
332	size = NR_STRIPE_HASH_LOCKS;
333	hash = NR_STRIPE_HASH_LOCKS - `1`;
334	} else
335	size = `1`;
336	while (size) {
337	struct list_head *list = &temp_inactive_list[size - `1`];
338
339	/*
340	* We don't hold any lock here yet, raid5_get_active_stripe() might
341	* remove stripes from the list
342	*/
343	if (!list_empty_careful(head: list)) {
344	spin_lock_irqsave(conf->hash_locks + hash, flags);
345	if (list_empty(head: conf->inactive_list + hash) &&
346	!list_empty(head: list))
347	atomic_dec(v: &conf->empty_inactive_list_nr);
348	list_splice_tail_init(list, head: conf->inactive_list + hash);
349	do_wakeup = true;
350	spin_unlock_irqrestore(lock: conf->hash_locks + hash, flags);
351	}
352	size--;
353	hash--;
354	}
355
356	if (do_wakeup) {
357	wake_up(&conf->wait_for_stripe);
358	if (atomic_read(v: &conf->active_stripes) == `0`)
359	wake_up(&conf->wait_for_quiescent);
360	if (conf->retry_read_aligned)
361	md_wakeup_thread(thread: conf->mddev->thread);
362	}
363	}
364
365	static int release_stripe_list(struct r5conf *conf,
366	struct list_head *temp_inactive_list)
367	__must_hold(&conf->device_lock)
368	{
369	struct stripe_head sh, t;
370	int count = `0`;
371	struct llist_node *head;
372
373	head = llist_del_all(head: &conf->released_stripes);
374	head = llist_reverse_order(head);
375	llist_for_each_entry_safe(sh, t, head, release_list) {
376	int hash;
377
378	/ sh could be readded after STRIPE_ON_RELEASE_LIST is cleard /
379	smp_mb();
380	clear_bit(nr: STRIPE_ON_RELEASE_LIST, addr: &sh->state);
381	/*
382	* Don't worry the bit is set here, because if the bit is set
383	* again, the count is always > 1. This is true for
384	* STRIPE_ON_UNPLUG_LIST bit too.
385	*/
386	hash = sh->hash_lock_index;
387	__release_stripe(conf, sh, temp_inactive_list: &temp_inactive_list[hash]);
388	count++;
389	}
390
391	return count;
392	}
393
394	void raid5_release_stripe(struct stripe_head *sh)
395	{
396	struct r5conf *conf = sh->raid_conf;
397	unsigned long flags;
398	struct list_head list;
399	int hash;
400	bool wakeup;
401
402	/ Avoid release_list until the last reference.*
403	*/
404	if (atomic_add_unless(v: &sh->count, a: -`1`, u: `1`))
405	return;
406
407	if (unlikely(!conf->mddev->thread) \|\|
408	test_and_set_bit(nr: STRIPE_ON_RELEASE_LIST, addr: &sh->state))
409	goto slow_path;
410	wakeup = llist_add(new: &sh->release_list, head: &conf->released_stripes);
411	if (wakeup)
412	md_wakeup_thread(thread: conf->mddev->thread);
413	return;
414	slow_path:
415	/ we are ok here if STRIPE_ON_RELEASE_LIST is set or not /
416	if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
417	INIT_LIST_HEAD(list: &list);
418	hash = sh->hash_lock_index;
419	do_release_stripe(conf, sh, temp_inactive_list: &list);
420	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
421	release_inactive_stripe_list(conf, temp_inactive_list: &list, hash);
422	}
423	}
424
425	static inline void remove_hash(struct stripe_head *sh)
426	{
427	pr_debug("remove_hash(), stripe %llu\n",
428	(unsigned long long)sh->sector);
429
430	hlist_del_init(n: &sh->hash);
431	}
432
433	static inline void insert_hash(struct r5conf conf, struct* stripe_head *sh)
434	{
435	struct hlist_head *hp = stripe_hash(conf, sect: sh->sector);
436
437	pr_debug("insert_hash(), stripe %llu\n",
438	(unsigned long long)sh->sector);
439
440	hlist_add_head(n: &sh->hash, h: hp);
441	}
442
443	/ find an idle stripe, make sure it is unhashed, and return it. /
444	static struct stripe_head get_free_stripe(struct* r5conf conf, int* hash)
445	{
446	struct stripe_head *sh = NULL;
447	struct list_head *first;
448
449	if (list_empty(head: conf->inactive_list + hash))
450	goto out;
451	first = (conf->inactive_list + hash)->next;
452	sh = list_entry(first, struct stripe_head, lru);
453	list_del_init(entry: first);
454	remove_hash(sh);
455	atomic_inc(v: &conf->active_stripes);
456	BUG_ON(hash != sh->hash_lock_index);
457	if (list_empty(head: conf->inactive_list + hash))
458	atomic_inc(v: &conf->empty_inactive_list_nr);
459	out:
460	return sh;
461	}
462
#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
/* Drop every page referenced from sh->pages and clear the slots. */
static void free_stripe_pages(struct stripe_head *sh)
{
	int i;
	struct page *p;

	/* Have not allocate page pool */
	if (!sh->pages)
		return;

	for (i = 0; i < sh->nr_pages; i++) {
		p = sh->pages[i];
		if (p)
			put_page(p);
		sh->pages[i] = NULL;
	}
}

/*
 * Fill every empty slot of sh->pages with a freshly allocated page.
 * On allocation failure all pages of the stripe are freed and -ENOMEM is
 * returned; returns 0 on success.
 */
static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
{
	int i;
	struct page *p;

	for (i = 0; i < sh->nr_pages; i++) {
		/* The page have allocated. */
		if (sh->pages[i])
			continue;

		p = alloc_page(gfp);
		if (!p) {
			free_stripe_pages(sh);
			return -ENOMEM;
		}
		sh->pages[i] = p;
	}
	return 0;
}

/*
 * Allocate the sh->pages pointer array, sized so that @disks devices of
 * conf->stripe_size bytes each fit, and record nr_pages and
 * stripes_per_page.  Does nothing if the array already exists.
 * Returns 0 on success or -ENOMEM.
 */
static int
init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks)
{
	int nr_pages, cnt;

	if (sh->pages)
		return 0;

	/* Each of the sh->dev[i] need one conf->stripe_size */
	cnt = PAGE_SIZE / conf->stripe_size;
	nr_pages = (disks + cnt - 1) / cnt;	/* round up */

	sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!sh->pages)
		return -ENOMEM;
	sh->nr_pages = nr_pages;
	sh->stripes_per_page = cnt;
	return 0;
}
#endif
521
522	static void shrink_buffers(struct stripe_head *sh)
523	{
524	int i;
525	int num = sh->raid_conf->pool_size;
526
527	#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
528	for (i = `0`; i < num ; i++) {
529	struct page *p;
530
531	WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
532	p = sh->dev[i].page;
533	if (!p)
534	continue;
535	sh->dev[i].page = NULL;
536	put_page(page: p);
537	}
538	#else
539	for (i = `0`; i < num; i++)
540	sh->dev[i].page = NULL;
541	free_stripe_pages(sh); / Free pages /
542	#endif
543	}
544
545	static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
546	{
547	int i;
548	int num = sh->raid_conf->pool_size;
549
550	#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
551	for (i = `0`; i < num; i++) {
552	struct page *page;
553
554	if (!(page = alloc_page(gfp))) {
555	return `1`;
556	}
557	sh->dev[i].page = page;
558	sh->dev[i].orig_page = page;
559	sh->dev[i].offset = `0`;
560	}
561	#else
562	if (alloc_stripe_pages(sh, gfp))
563	return -ENOMEM;
564
565	for (i = `0`; i < num; i++) {
566	sh->dev[i].page = raid5_get_dev_page(sh, i);
567	sh->dev[i].orig_page = sh->dev[i].page;
568	sh->dev[i].offset = raid5_get_page_offset(sh, i);
569	}
570	#endif
571	return `0`;
572	}
573
574	static void stripe_set_idx(sector_t stripe, struct r5conf conf, int* previous,
575	struct stripe_head *sh);
576
577	static void init_stripe(struct stripe_head sh, sector_t sector, int* previous)
578	{
579	struct r5conf *conf = sh->raid_conf;
580	int i, seq;
581
582	BUG_ON(atomic_read(&sh->count) != `0`);
583	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
584	BUG_ON(stripe_operations_active(sh));
585	BUG_ON(sh->batch_head);
586
587	pr_debug("init_stripe called, stripe %llu\n",
588	(unsigned long long)sector);
589	retry:
590	seq = read_seqcount_begin(&conf->gen_lock);
591	sh->generation = conf->generation - previous;
592	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
593	sh->sector = sector;
594	stripe_set_idx(stripe: sector, conf, previous, sh);
595	sh->state = `0`;
596
597	for (i = sh->disks; i--; ) {
598	struct r5dev *dev = &sh->dev[i];
599
600	if (dev->toread \|\| dev->read \|\| dev->towrite \|\| dev->written \|\|
601	test_bit(R5_LOCKED, &dev->flags)) {
602	pr_err("sector=%llx i=%d %p %p %p %p %d\n",
603	(unsigned long long)sh->sector, i, dev->toread,
604	dev->read, dev->towrite, dev->written,
605	test_bit(R5_LOCKED, &dev->flags));
606	WARN_ON(`1`);
607	}
608	dev->flags = `0`;
609	dev->sector = raid5_compute_blocknr(sh, i, previous);
610	}
611	if (read_seqcount_retry(&conf->gen_lock, seq))
612	goto retry;
613	sh->overwrite_disks = `0`;
614	insert_hash(conf, sh);
615	sh->cpu = smp_processor_id();
616	set_bit(nr: STRIPE_BATCH_READY, addr: &sh->state);
617	}
618
619	static struct stripe_head __find_stripe(struct* r5conf *conf, sector_t sector,
620	short generation)
621	{
622	struct stripe_head *sh;
623
624	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
625	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
626	if (sh->sector == sector && sh->generation == generation)
627	return sh;
628	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
629	return NULL;
630	}
631
632	static struct stripe_head find_get_stripe(struct* r5conf *conf,
633	sector_t sector, short generation, int hash)
634	{
635	int inc_empty_inactive_list_flag;
636	struct stripe_head *sh;
637
638	sh = __find_stripe(conf, sector, generation);
639	if (!sh)
640	return NULL;
641
642	if (atomic_inc_not_zero(v: &sh->count))
643	return sh;
644
645	/*
646	* Slow path. The reference count is zero which means the stripe must
647	* be on a list (sh->lru). Must remove the stripe from the list that
648	* references it with the device_lock held.
649	*/
650
651	spin_lock(lock: &conf->device_lock);
652	if (!atomic_read(v: &sh->count)) {
653	if (!test_bit(STRIPE_HANDLE, &sh->state))
654	atomic_inc(v: &conf->active_stripes);
655	BUG_ON(list_empty(&sh->lru) &&
656	!test_bit(STRIPE_EXPANDING, &sh->state));
657	inc_empty_inactive_list_flag = `0`;
658	if (!list_empty(head: conf->inactive_list + hash))
659	inc_empty_inactive_list_flag = `1`;
660	list_del_init(entry: &sh->lru);
661	if (list_empty(head: conf->inactive_list + hash) &&
662	inc_empty_inactive_list_flag)
663	atomic_inc(v: &conf->empty_inactive_list_nr);
664	if (sh->group) {
665	sh->group->stripes_cnt--;
666	sh->group = NULL;
667	}
668	}
669	atomic_inc(v: &sh->count);
670	spin_unlock(lock: &conf->device_lock);
671
672	return sh;
673	}
674
675	/*
676	* Need to check if array has failed when deciding whether to:
677	* - start an array
678	* - remove non-faulty devices
679	* - add a spare
680	* - allow a reshape
681	* This determination is simple when no reshape is happening.
682	* However if there is a reshape, we need to carefully check
683	* both the before and after sections.
684	* This is because some failed devices may only affect one
685	* of the two sections, and some non-in_sync devices may
686	* be insync in the section most affected by failed devices.
687	*
688	* Most calls to this function hold &conf->device_lock. Calls
689	* in raid5_run() do not require the lock as no other threads
690	* have been started yet.
691	*/
692	int raid5_calc_degraded(struct r5conf *conf)
693	{
694	int degraded, degraded2;
695	int i;
696
697	degraded = `0`;
698	for (i = `0`; i < conf->previous_raid_disks; i++) {
699	struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
700
701	if (rdev && test_bit(Faulty, &rdev->flags))
702	rdev = READ_ONCE(conf->disks[i].replacement);
703	if (!rdev \|\| test_bit(Faulty, &rdev->flags))
704	degraded++;
705	else if (test_bit(In_sync, &rdev->flags))
706	;
707	else
708	/ not in-sync or faulty.*
709	* If the reshape increases the number of devices,
710	* this is being recovered by the reshape, so
711	* this 'previous' section is not in_sync.
712	* If the number of devices is being reduced however,
713	* the device can only be part of the array if
714	* we are reverting a reshape, so this section will
715	* be in-sync.
716	*/
717	if (conf->raid_disks >= conf->previous_raid_disks)
718	degraded++;
719	}
720	if (conf->raid_disks == conf->previous_raid_disks)
721	return degraded;
722	degraded2 = `0`;
723	for (i = `0`; i < conf->raid_disks; i++) {
724	struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
725
726	if (rdev && test_bit(Faulty, &rdev->flags))
727	rdev = READ_ONCE(conf->disks[i].replacement);
728	if (!rdev \|\| test_bit(Faulty, &rdev->flags))
729	degraded2++;
730	else if (test_bit(In_sync, &rdev->flags))
731	;
732	else
733	/ not in-sync or faulty.*
734	* If reshape increases the number of devices, this
735	* section has already been recovered, else it
736	* almost certainly hasn't.
737	*/
738	if (conf->raid_disks <= conf->previous_raid_disks)
739	degraded2++;
740	}
741	if (degraded2 > degraded)
742	return degraded2;
743	return degraded;
744	}
745
746	static bool has_failed(struct r5conf *conf)
747	{
748	int degraded = conf->mddev->degraded;
749
750	if (test_bit(MD_BROKEN, &conf->mddev->flags))
751	return true;
752
753	if (conf->mddev->reshape_position != MaxSector)
754	degraded = raid5_calc_degraded(conf);
755
756	return degraded > conf->max_degraded;
757	}
758
/* Result codes for handling one stripe of a request. */
enum stripe_result {
	STRIPE_SUCCESS = 0,
	STRIPE_RETRY,
	STRIPE_SCHEDULE_AND_RETRY,
	STRIPE_FAIL,
	STRIPE_WAIT_RESHAPE,
};
766
767	struct stripe_request_ctx {
768	/ a reference to the last stripe_head for batching /
769	struct stripe_head *batch_last;
770
771	/ first sector in the request /
772	sector_t first_sector;
773
774	/ last sector in the request /
775	sector_t last_sector;
776
777	/*
778	* bitmap to track stripe sectors that have been added to stripes
779	* add one to account for unaligned requests
780	*/
781	DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + `1`);
782
783	/ the request had REQ_PREFLUSH, cleared after the first stripe_head /
784	bool do_flush;
785	};
786
787	/*
788	* Block until another thread clears R5_INACTIVE_BLOCKED or
789	* there are fewer than 3/4 the maximum number of active stripes
790	* and there is an inactive stripe available.
791	*/
792	static bool is_inactive_blocked(struct r5conf conf, int* hash)
793	{
794	if (list_empty(head: conf->inactive_list + hash))
795	return false;
796
797	if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
798	return true;
799
800	return (atomic_read(v: &conf->active_stripes) <
801	(conf->max_nr_stripes * `3` / `4`));
802	}
803
804	struct stripe_head raid5_get_active_stripe(struct* r5conf *conf,
805	struct stripe_request_ctx *ctx, sector_t sector,
806	unsigned int flags)
807	{
808	struct stripe_head *sh;
809	int hash = stripe_hash_locks_hash(conf, sect: sector);
810	int previous = !!(flags & R5_GAS_PREVIOUS);
811
812	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
813
814	spin_lock_irq(lock: conf->hash_locks + hash);
815
816	for (;;) {
817	if (!(flags & R5_GAS_NOQUIESCE) && conf->quiesce) {
818	/*
819	* Must release the reference to batch_last before
820	* waiting, on quiesce, otherwise the batch_last will
821	* hold a reference to a stripe and raid5_quiesce()
822	* will deadlock waiting for active_stripes to go to
823	* zero.
824	*/
825	if (ctx && ctx->batch_last) {
826	raid5_release_stripe(sh: ctx->batch_last);
827	ctx->batch_last = NULL;
828	}
829
830	wait_event_lock_irq(conf->wait_for_quiescent,
831	!conf->quiesce,
832	*(conf->hash_locks + hash));
833	}
834
835	sh = find_get_stripe(conf, sector, generation: conf->generation - previous,
836	hash);
837	if (sh)
838	break;
839
840	if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
841	sh = get_free_stripe(conf, hash);
842	if (sh) {
843	r5c_check_stripe_cache_usage(conf);
844	init_stripe(sh, sector, previous);
845	atomic_inc(v: &sh->count);
846	break;
847	}
848
849	if (!test_bit(R5_DID_ALLOC, &conf->cache_state))
850	set_bit(nr: R5_ALLOC_MORE, addr: &conf->cache_state);
851	}
852
853	if (flags & R5_GAS_NOBLOCK)
854	break;
855
856	set_bit(nr: R5_INACTIVE_BLOCKED, addr: &conf->cache_state);
857	r5l_wake_reclaim(log: conf->log, space: `0`);
858
859	/ release batch_last before wait to avoid risk of deadlock /
860	if (ctx && ctx->batch_last) {
861	raid5_release_stripe(sh: ctx->batch_last);
862	ctx->batch_last = NULL;
863	}
864
865	wait_event_lock_irq(conf->wait_for_stripe,
866	is_inactive_blocked(conf, hash),
867	*(conf->hash_locks + hash));
868	clear_bit(nr: R5_INACTIVE_BLOCKED, addr: &conf->cache_state);
869	}
870
871	spin_unlock_irq(lock: conf->hash_locks + hash);
872	return sh;
873	}
874
875	static bool is_full_stripe_write(struct stripe_head *sh)
876	{
877	BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
878	return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
879	}
880
881	static void lock_two_stripes(struct stripe_head sh1, struct* stripe_head *sh2)
882	__acquires(&sh1->stripe_lock)
883	__acquires(&sh2->stripe_lock)
884	{
885	if (sh1 > sh2) {
886	spin_lock_irq(lock: &sh2->stripe_lock);
887	spin_lock_nested(&sh1->stripe_lock, `1`);
888	} else {
889	spin_lock_irq(lock: &sh1->stripe_lock);
890	spin_lock_nested(&sh2->stripe_lock, `1`);
891	}
892	}
893
894	static void unlock_two_stripes(struct stripe_head sh1, struct* stripe_head *sh2)
895	__releases(&sh1->stripe_lock)
896	__releases(&sh2->stripe_lock)
897	{
898	spin_unlock(lock: &sh1->stripe_lock);
899	spin_unlock_irq(lock: &sh2->stripe_lock);
900	}
901
902	/ Only freshly new full stripe normal write stripe can be added to a batch list /
903	static bool stripe_can_batch(struct stripe_head *sh)
904	{
905	struct r5conf *conf = sh->raid_conf;
906
907	if (raid5_has_log(conf) \|\| raid5_has_ppl(conf))
908	return false;
909	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
910	!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
911	is_full_stripe_write(sh);
912	}
913
914	/ we only do back search /
915	static void stripe_add_to_batch_list(struct r5conf *conf,
916	struct stripe_head sh, struct* stripe_head *last_sh)
917	{
918	struct stripe_head *head;
919	sector_t head_sector, tmp_sec;
920	int hash;
921	int dd_idx;
922
923	/ Don't cross chunks, so stripe pd_idx/qd_idx is the same /
924	tmp_sec = sh->sector;
925	if (!sector_div(tmp_sec, conf->chunk_sectors))
926	return;
927	head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
928
929	if (last_sh && head_sector == last_sh->sector) {
930	head = last_sh;
931	atomic_inc(v: &head->count);
932	} else {
933	hash = stripe_hash_locks_hash(conf, sect: head_sector);
934	spin_lock_irq(lock: conf->hash_locks + hash);
935	head = find_get_stripe(conf, sector: head_sector, generation: conf->generation,
936	hash);
937	spin_unlock_irq(lock: conf->hash_locks + hash);
938	if (!head)
939	return;
940	if (!stripe_can_batch(sh: head))
941	goto out;
942	}
943
944	lock_two_stripes(sh1: head, sh2: sh);
945	/ clear_batch_ready clear the flag /
946	if (!stripe_can_batch(sh: head) \|\| !stripe_can_batch(sh))
947	goto unlock_out;
948
949	if (sh->batch_head)
950	goto unlock_out;
951
952	dd_idx = `0`;
953	while (dd_idx == sh->pd_idx \|\| dd_idx == sh->qd_idx)
954	dd_idx++;
955	if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf \|\|
956	bio_op(bio: head->dev[dd_idx].towrite) != bio_op(bio: sh->dev[dd_idx].towrite))
957	goto unlock_out;
958
959	if (head->batch_head) {
960	spin_lock(lock: &head->batch_head->batch_lock);
961	/ This batch list is already running /
962	if (!stripe_can_batch(sh: head)) {
963	spin_unlock(lock: &head->batch_head->batch_lock);
964	goto unlock_out;
965	}
966	/*
967	* We must assign batch_head of this stripe within the
968	* batch_lock, otherwise clear_batch_ready of batch head
969	* stripe could clear BATCH_READY bit of this stripe and
970	* this stripe->batch_head doesn't get assigned, which
971	* could confuse clear_batch_ready for this stripe
972	*/
973	sh->batch_head = head->batch_head;
974
975	/*
976	* at this point, head's BATCH_READY could be cleared, but we
977	* can still add the stripe to batch list
978	*/
979	list_add(new: &sh->batch_list, head: &head->batch_list);
980	spin_unlock(lock: &head->batch_head->batch_lock);
981	} else {
982	head->batch_head = head;
983	sh->batch_head = head->batch_head;
984	spin_lock(lock: &head->batch_lock);
985	list_add_tail(new: &sh->batch_list, head: &head->batch_list);
986	spin_unlock(lock: &head->batch_lock);
987	}
988
989	if (test_and_clear_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
990	if (atomic_dec_return(v: &conf->preread_active_stripes)
991	< IO_THRESHOLD)
992	md_wakeup_thread(thread: conf->mddev->thread);
993
994	if (test_and_clear_bit(nr: STRIPE_BIT_DELAY, addr: &sh->state)) {
995	int seq = sh->bm_seq;
996	if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
997	sh->batch_head->bm_seq > seq)
998	seq = sh->batch_head->bm_seq;
999	set_bit(nr: STRIPE_BIT_DELAY, addr: &sh->batch_head->state);
1000	sh->batch_head->bm_seq = seq;
1001	}
1002
1003	atomic_inc(v: &sh->count);
1004	unlock_out:
1005	unlock_two_stripes(sh1: head, sh2: sh);
1006	out:
1007	raid5_release_stripe(sh: head);
1008	}
1009
1010	/ Determine if 'data_offset' or 'new_data_offset' should be used*
1011	* in this stripe_head.
1012	*/
1013	static int use_new_offset(struct r5conf conf, struct* stripe_head *sh)
1014	{
1015	sector_t progress = conf->reshape_progress;
1016	/ Need a memory barrier to make sure we see the value*
1017	* of conf->generation, or ->data_offset that was set before
1018	* reshape_progress was updated.
1019	*/
1020	smp_rmb();
1021	if (progress == MaxSector)
1022	return `0`;
1023	if (sh->generation == conf->generation - `1`)
1024	return `0`;
1025	/ We are in a reshape, and this is a new-generation stripe,*
1026	* so use new_data_offset.
1027	*/
1028	return `1`;
1029	}
1030
/*
 * Submit every bio queued on @tmp, in list order, via submit_bio_noacct().
 * The list is empty when this returns.
 */
static void dispatch_bio_list(struct bio_list *tmp)
{
	struct bio *bio;

	while ((bio = bio_list_pop(tmp)))
		submit_bio_noacct(bio);
}
1038
1039	static int cmp_stripe(void priv, const* struct list_head *a,
1040	const struct list_head *b)
1041	{
1042	const struct r5pending_data *da = list_entry(a,
1043	struct r5pending_data, sibling);
1044	const struct r5pending_data *db = list_entry(b,
1045	struct r5pending_data, sibling);
1046	if (da->sector > db->sector)
1047	return `1`;
1048	if (da->sector < db->sector)
1049	return -`1`;
1050	return `0`;
1051	}
1052
1053	static void dispatch_defer_bios(struct r5conf conf, int* target,
1054	struct bio_list *list)
1055	{
1056	struct r5pending_data *data;
1057	struct list_head first, next = NULL;
1058	int cnt = `0`;
1059
1060	if (conf->pending_data_cnt == `0`)
1061	return;
1062
1063	list_sort(NULL, head: &conf->pending_list, cmp: cmp_stripe);
1064
1065	first = conf->pending_list.next;
1066
1067	/ temporarily move the head /
1068	if (conf->next_pending_data)
1069	list_move_tail(list: &conf->pending_list,
1070	head: &conf->next_pending_data->sibling);
1071
1072	while (!list_empty(head: &conf->pending_list)) {
1073	data = list_first_entry(&conf->pending_list,
1074	struct r5pending_data, sibling);
1075	if (&data->sibling == first)
1076	first = data->sibling.next;
1077	next = data->sibling.next;
1078
1079	bio_list_merge(bl: list, bl2: &data->bios);
1080	list_move(list: &data->sibling, head: &conf->free_list);
1081	cnt++;
1082	if (cnt >= target)
1083	break;
1084	}
1085	conf->pending_data_cnt -= cnt;
1086	BUG_ON(conf->pending_data_cnt < `0` \|\| cnt < target);
1087
1088	if (next != &conf->pending_list)
1089	conf->next_pending_data = list_entry(next,
1090	struct r5pending_data, sibling);
1091	else
1092	conf->next_pending_data = NULL;
1093	/ list isn't empty /
1094	if (first != &conf->pending_list)
1095	list_move_tail(list: &conf->pending_list, head: first);
1096	}
1097
1098	static void flush_deferred_bios(struct r5conf *conf)
1099	{
1100	struct bio_list tmp = BIO_EMPTY_LIST;
1101
1102	if (conf->pending_data_cnt == `0`)
1103	return;
1104
1105	spin_lock(lock: &conf->pending_bios_lock);
1106	dispatch_defer_bios(conf, target: conf->pending_data_cnt, list: &tmp);
1107	BUG_ON(conf->pending_data_cnt != `0`);
1108	spin_unlock(lock: &conf->pending_bios_lock);
1109
1110	dispatch_bio_list(tmp: &tmp);
1111	}
1112
1113	static void defer_issue_bios(struct r5conf *conf, sector_t sector,
1114	struct bio_list *bios)
1115	{
1116	struct bio_list tmp = BIO_EMPTY_LIST;
1117	struct r5pending_data *ent;
1118
1119	spin_lock(lock: &conf->pending_bios_lock);
1120	ent = list_first_entry(&conf->free_list, struct r5pending_data,
1121	sibling);
1122	list_move_tail(list: &ent->sibling, head: &conf->pending_list);
1123	ent->sector = sector;
1124	bio_list_init(bl: &ent->bios);
1125	bio_list_merge(bl: &ent->bios, bl2: bios);
1126	conf->pending_data_cnt++;
1127	if (conf->pending_data_cnt >= PENDING_IO_MAX)
1128	dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, list: &tmp);
1129
1130	spin_unlock(lock: &conf->pending_bios_lock);
1131
1132	dispatch_bio_list(tmp: &tmp);
1133	}
1134
/* Bio completion handlers, installed as bi_end_io by ops_run_io(). */
static void raid5_end_read_request(struct bio *bi);
static void raid5_end_write_request(struct bio *bi);
1139
1140	static void ops_run_io(struct stripe_head sh, struct* stripe_head_state *s)
1141	{
1142	struct r5conf *conf = sh->raid_conf;
1143	int i, disks = sh->disks;
1144	struct stripe_head *head_sh = sh;
1145	struct bio_list pending_bios = BIO_EMPTY_LIST;
1146	struct r5dev *dev;
1147	bool should_defer;
1148
1149	might_sleep();
1150
1151	if (log_stripe(sh, s) == `0`)
1152	return;
1153
1154	should_defer = conf->batch_bio_dispatch && conf->group_cnt;
1155
1156	for (i = disks; i--; ) {
1157	enum req_op op;
1158	blk_opf_t op_flags = `0`;
1159	int replace_only = `0`;
1160	struct bio bi, rbi;
1161	struct md_rdev rdev, rrdev = NULL;
1162
1163	sh = head_sh;
1164	if (test_and_clear_bit(nr: R5_Wantwrite, addr: &sh->dev[i].flags)) {
1165	op = REQ_OP_WRITE;
1166	if (test_and_clear_bit(nr: R5_WantFUA, addr: &sh->dev[i].flags))
1167	op_flags = REQ_FUA;
1168	if (test_bit(R5_Discard, &sh->dev[i].flags))
1169	op = REQ_OP_DISCARD;
1170	} else if (test_and_clear_bit(nr: R5_Wantread, addr: &sh->dev[i].flags))
1171	op = REQ_OP_READ;
1172	else if (test_and_clear_bit(nr: R5_WantReplace,
1173	addr: &sh->dev[i].flags)) {
1174	op = REQ_OP_WRITE;
1175	replace_only = `1`;
1176	} else
1177	continue;
1178	if (test_and_clear_bit(nr: R5_SyncIO, addr: &sh->dev[i].flags))
1179	op_flags \|= REQ_SYNC;
1180
1181	again:
1182	dev = &sh->dev[i];
1183	bi = &dev->req;
1184	rbi = &dev->rreq; / For writing to replacement /
1185
1186	rdev = conf->disks[i].rdev;
1187	rrdev = conf->disks[i].replacement;
1188	if (op_is_write(op)) {
1189	if (replace_only)
1190	rdev = NULL;
1191	if (rdev == rrdev)
1192	/ We raced and saw duplicates /
1193	rrdev = NULL;
1194	} else {
1195	if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
1196	rdev = rrdev;
1197	rrdev = NULL;
1198	}
1199
1200	if (rdev && test_bit(Faulty, &rdev->flags))
1201	rdev = NULL;
1202	if (rdev)
1203	atomic_inc(v: &rdev->nr_pending);
1204	if (rrdev && test_bit(Faulty, &rrdev->flags))
1205	rrdev = NULL;
1206	if (rrdev)
1207	atomic_inc(v: &rrdev->nr_pending);
1208
1209	/ We have already checked bad blocks for reads. Now*
1210	* need to check for writes. We never accept write errors
1211	* on the replacement, so we don't to check rrdev.
1212	*/
1213	while (op_is_write(op) && rdev &&
1214	test_bit(WriteErrorSeen, &rdev->flags)) {
1215	int bad = rdev_has_badblock(rdev, s: sh->sector,
1216	RAID5_STRIPE_SECTORS(conf));
1217	if (!bad)
1218	break;
1219
1220	if (bad < `0`) {
1221	set_bit(nr: BlockedBadBlocks, addr: &rdev->flags);
1222	if (!conf->mddev->external &&
1223	conf->mddev->sb_flags) {
1224	/ It is very unlikely, but we might*
1225	* still need to write out the
1226	* bad block log - better give it
1227	* a chance*/
1228	md_check_recovery(mddev: conf->mddev);
1229	}
1230	/*
1231	* Because md_wait_for_blocked_rdev
1232	* will dec nr_pending, we must
1233	* increment it first.
1234	*/
1235	atomic_inc(v: &rdev->nr_pending);
1236	md_wait_for_blocked_rdev(rdev, mddev: conf->mddev);
1237	} else {
1238	/ Acknowledged bad block - skip the write /
1239	rdev_dec_pending(rdev, mddev: conf->mddev);
1240	rdev = NULL;
1241	}
1242	}
1243
1244	if (rdev) {
1245	if (s->syncing \|\| s->expanding \|\| s->expanded
1246	\|\| s->replacing)
1247	md_sync_acct(bdev: rdev->bdev, RAID5_STRIPE_SECTORS(conf));
1248
1249	set_bit(nr: STRIPE_IO_STARTED, addr: &sh->state);
1250
1251	bio_init(bio: bi, bdev: rdev->bdev, table: &dev->vec, max_vecs: `1`, opf: op \| op_flags);
1252	bi->bi_end_io = op_is_write(op)
1253	? raid5_end_write_request
1254	: raid5_end_read_request;
1255	bi->bi_private = sh;
1256
1257	pr_debug("%s: for %llu schedule op %d on disc %d\n",
1258	__func__, (unsigned long long)sh->sector,
1259	bi->bi_opf, i);
1260	atomic_inc(v: &sh->count);
1261	if (sh != head_sh)
1262	atomic_inc(v: &head_sh->count);
1263	if (use_new_offset(conf, sh))
1264	bi->bi_iter.bi_sector = (sh->sector
1265	+ rdev->new_data_offset);
1266	else
1267	bi->bi_iter.bi_sector = (sh->sector
1268	+ rdev->data_offset);
1269	if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1270	bi->bi_opf \|= REQ_NOMERGE;
1271
1272	if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1273	WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1274
1275	if (!op_is_write(op) &&
1276	test_bit(R5_InJournal, &sh->dev[i].flags))
1277	/*
1278	* issuing read for a page in journal, this
1279	* must be preparing for prexor in rmw; read
1280	* the data into orig_page
1281	*/
1282	sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
1283	else
1284	sh->dev[i].vec.bv_page = sh->dev[i].page;
1285	bi->bi_vcnt = `1`;
1286	bi->bi_io_vec[`0`].bv_len = RAID5_STRIPE_SIZE(conf);
1287	bi->bi_io_vec[`0`].bv_offset = sh->dev[i].offset;
1288	bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1289	/*
1290	* If this is discard request, set bi_vcnt 0. We don't
1291	* want to confuse SCSI because SCSI will replace payload
1292	*/
1293	if (op == REQ_OP_DISCARD)
1294	bi->bi_vcnt = `0`;
1295	if (rrdev)
1296	set_bit(nr: R5_DOUBLE_LOCKED, addr: &sh->dev[i].flags);
1297
1298	mddev_trace_remap(mddev: conf->mddev, bio: bi, sector: sh->dev[i].sector);
1299	if (should_defer && op_is_write(op))
1300	bio_list_add(bl: &pending_bios, bio: bi);
1301	else
1302	submit_bio_noacct(bio: bi);
1303	}
1304	if (rrdev) {
1305	if (s->syncing \|\| s->expanding \|\| s->expanded
1306	\|\| s->replacing)
1307	md_sync_acct(bdev: rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
1308
1309	set_bit(nr: STRIPE_IO_STARTED, addr: &sh->state);
1310
1311	bio_init(bio: rbi, bdev: rrdev->bdev, table: &dev->rvec, max_vecs: `1`, opf: op \| op_flags);
1312	BUG_ON(!op_is_write(op));
1313	rbi->bi_end_io = raid5_end_write_request;
1314	rbi->bi_private = sh;
1315
1316	pr_debug("%s: for %llu schedule op %d on "
1317	"replacement disc %d\n",
1318	__func__, (unsigned long long)sh->sector,
1319	rbi->bi_opf, i);
1320	atomic_inc(v: &sh->count);
1321	if (sh != head_sh)
1322	atomic_inc(v: &head_sh->count);
1323	if (use_new_offset(conf, sh))
1324	rbi->bi_iter.bi_sector = (sh->sector
1325	+ rrdev->new_data_offset);
1326	else
1327	rbi->bi_iter.bi_sector = (sh->sector
1328	+ rrdev->data_offset);
1329	if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1330	WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1331	sh->dev[i].rvec.bv_page = sh->dev[i].page;
1332	rbi->bi_vcnt = `1`;
1333	rbi->bi_io_vec[`0`].bv_len = RAID5_STRIPE_SIZE(conf);
1334	rbi->bi_io_vec[`0`].bv_offset = sh->dev[i].offset;
1335	rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1336	/*
1337	* If this is discard request, set bi_vcnt 0. We don't
1338	* want to confuse SCSI because SCSI will replace payload
1339	*/
1340	if (op == REQ_OP_DISCARD)
1341	rbi->bi_vcnt = `0`;
1342	mddev_trace_remap(mddev: conf->mddev, bio: rbi, sector: sh->dev[i].sector);
1343	if (should_defer && op_is_write(op))
1344	bio_list_add(bl: &pending_bios, bio: rbi);
1345	else
1346	submit_bio_noacct(bio: rbi);
1347	}
1348	if (!rdev && !rrdev) {
1349	if (op_is_write(op))
1350	set_bit(nr: STRIPE_DEGRADED, addr: &sh->state);
1351	pr_debug("skip op %d on disc %d for sector %llu\n",
1352	bi->bi_opf, i, (unsigned long long)sh->sector);
1353	clear_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
1354	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
1355	}
1356
1357	if (!head_sh->batch_head)
1358	continue;
1359	sh = list_first_entry(&sh->batch_list, struct stripe_head,
1360	batch_list);
1361	if (sh != head_sh)
1362	goto again;
1363	}
1364
1365	if (should_defer && !bio_list_empty(bl: &pending_bios))
1366	defer_issue_bios(conf, sector: head_sh->sector, bios: &pending_bios);
1367	}
1368
1369	static struct dma_async_tx_descriptor *
1370	async_copy_data(int frombio, struct bio bio, struct* page **page,
1371	unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
1372	struct stripe_head sh, int* no_skipcopy)
1373	{
1374	struct bio_vec bvl;
1375	struct bvec_iter iter;
1376	struct page *bio_page;
1377	int page_offset;
1378	struct async_submit_ctl submit;
1379	enum async_tx_flags flags = `0`;
1380	struct r5conf *conf = sh->raid_conf;
1381
1382	if (bio->bi_iter.bi_sector >= sector)
1383	page_offset = (signed)(bio->bi_iter.bi_sector - sector) * `512`;
1384	else
1385	page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -`512`;
1386
1387	if (frombio)
1388	flags \|= ASYNC_TX_FENCE;
1389	init_async_submit(args: &submit, flags, tx, NULL, NULL, NULL);
1390
1391	bio_for_each_segment(bvl, bio, iter) {
1392	int len = bvl.bv_len;
1393	int clen;
1394	int b_offset = `0`;
1395
1396	if (page_offset < `0`) {
1397	b_offset = -page_offset;
1398	page_offset += b_offset;
1399	len -= b_offset;
1400	}
1401
1402	if (len > `0` && page_offset + len > RAID5_STRIPE_SIZE(conf))
1403	clen = RAID5_STRIPE_SIZE(conf) - page_offset;
1404	else
1405	clen = len;
1406
1407	if (clen > `0`) {
1408	b_offset += bvl.bv_offset;
1409	bio_page = bvl.bv_page;
1410	if (frombio) {
1411	if (conf->skip_copy &&
1412	b_offset == `0` && page_offset == `0` &&
1413	clen == RAID5_STRIPE_SIZE(conf) &&
1414	!no_skipcopy)
1415	*page = bio_page;
1416	else
1417	tx = async_memcpy(dest: *page, src: bio_page, dest_offset: page_offset + poff,
1418	src_offset: b_offset, len: clen, submit: &submit);
1419	} else
1420	tx = async_memcpy(dest: bio_page, src: *page, dest_offset: b_offset,
1421	src_offset: page_offset + poff, len: clen, submit: &submit);
1422	}
1423	/ chain the operations /
1424	submit.depend_tx = tx;
1425
1426	if (clen < len) / hit end of page /
1427	break;
1428	page_offset += len;
1429	}
1430
1431	return tx;
1432	}
1433
1434	static void ops_complete_biofill(void *stripe_head_ref)
1435	{
1436	struct stripe_head *sh = stripe_head_ref;
1437	int i;
1438	struct r5conf *conf = sh->raid_conf;
1439
1440	pr_debug("%s: stripe %llu\n", __func__,
1441	(unsigned long long)sh->sector);
1442
1443	/ clear completed biofills /
1444	for (i = sh->disks; i--; ) {
1445	struct r5dev *dev = &sh->dev[i];
1446
1447	/ acknowledge completion of a biofill operation /
1448	/ and check if we need to reply to a read request,*
1449	* new R5_Wantfill requests are held off until
1450	* !STRIPE_BIOFILL_RUN
1451	*/
1452	if (test_and_clear_bit(nr: R5_Wantfill, addr: &dev->flags)) {
1453	struct bio rbi, rbi2;
1454
1455	BUG_ON(!dev->read);
1456	rbi = dev->read;
1457	dev->read = NULL;
1458	while (rbi && rbi->bi_iter.bi_sector <
1459	dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1460	rbi2 = r5_next_bio(conf, bio: rbi, sector: dev->sector);
1461	bio_endio(rbi);
1462	rbi = rbi2;
1463	}
1464	}
1465	}
1466	clear_bit(nr: STRIPE_BIOFILL_RUN, addr: &sh->state);
1467
1468	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
1469	raid5_release_stripe(sh);
1470	}
1471
1472	static void ops_run_biofill(struct stripe_head *sh)
1473	{
1474	struct dma_async_tx_descriptor *tx = NULL;
1475	struct async_submit_ctl submit;
1476	int i;
1477	struct r5conf *conf = sh->raid_conf;
1478
1479	BUG_ON(sh->batch_head);
1480	pr_debug("%s: stripe %llu\n", __func__,
1481	(unsigned long long)sh->sector);
1482
1483	for (i = sh->disks; i--; ) {
1484	struct r5dev *dev = &sh->dev[i];
1485	if (test_bit(R5_Wantfill, &dev->flags)) {
1486	struct bio *rbi;
1487	spin_lock_irq(lock: &sh->stripe_lock);
1488	dev->read = rbi = dev->toread;
1489	dev->toread = NULL;
1490	spin_unlock_irq(lock: &sh->stripe_lock);
1491	while (rbi && rbi->bi_iter.bi_sector <
1492	dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1493	tx = async_copy_data(frombio: `0`, bio: rbi, page: &dev->page,
1494	poff: dev->offset,
1495	sector: dev->sector, tx, sh, no_skipcopy: `0`);
1496	rbi = r5_next_bio(conf, bio: rbi, sector: dev->sector);
1497	}
1498	}
1499	}
1500
1501	atomic_inc(v: &sh->count);
1502	init_async_submit(args: &submit, flags: ASYNC_TX_ACK, tx, cb_fn: ops_complete_biofill, cb_param: sh, NULL);
1503	async_trigger_callback(submit: &submit);
1504	}
1505
1506	static void mark_target_uptodate(struct stripe_head sh, int* target)
1507	{
1508	struct r5dev *tgt;
1509
1510	if (target < `0`)
1511	return;
1512
1513	tgt = &sh->dev[target];
1514	set_bit(nr: R5_UPTODATE, addr: &tgt->flags);
1515	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1516	clear_bit(nr: R5_Wantcompute, addr: &tgt->flags);
1517	}
1518
1519	static void ops_complete_compute(void *stripe_head_ref)
1520	{
1521	struct stripe_head *sh = stripe_head_ref;
1522
1523	pr_debug("%s: stripe %llu\n", __func__,
1524	(unsigned long long)sh->sector);
1525
1526	/ mark the computed target(s) as uptodate /
1527	mark_target_uptodate(sh, target: sh->ops.target);
1528	mark_target_uptodate(sh, target: sh->ops.target2);
1529
1530	clear_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
1531	if (sh->check_state == check_state_compute_run)
1532	sh->check_state = check_state_compute_result;
1533	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
1534	raid5_release_stripe(sh);
1535	}
1536
1537	/ return a pointer to the address conversion region of the scribble buffer /
1538	static struct page to_addr_page(struct** raid5_percpu percpu, int* i)
1539	{
1540	return percpu->scribble + i * percpu->scribble_obj_size;
1541	}
1542
1543	/ return a pointer to the address conversion region of the scribble buffer /
1544	static addr_conv_t to_addr_conv(struct* stripe_head *sh,
1545	struct raid5_percpu percpu, int* i)
1546	{
1547	return (void *) (to_addr_page(percpu, i) + sh->disks + `2`);
1548	}
1549
1550	/*
1551	* Return a pointer to record offset address.
1552	*/
1553	static unsigned int *
1554	to_addr_offs(struct stripe_head sh, struct* raid5_percpu *percpu)
1555	{
1556	return (unsigned int *) (to_addr_conv(sh, percpu, i: `0`) + sh->disks + `2`);
1557	}
1558
1559	static struct dma_async_tx_descriptor *
1560	ops_run_compute5(struct stripe_head sh, struct* raid5_percpu *percpu)
1561	{
1562	int disks = sh->disks;
1563	struct page **xor_srcs = to_addr_page(percpu, i: `0`);
1564	unsigned int *off_srcs = to_addr_offs(sh, percpu);
1565	int target = sh->ops.target;
1566	struct r5dev *tgt = &sh->dev[target];
1567	struct page *xor_dest = tgt->page;
1568	unsigned int off_dest = tgt->offset;
1569	int count = `0`;
1570	struct dma_async_tx_descriptor *tx;
1571	struct async_submit_ctl submit;
1572	int i;
1573
1574	BUG_ON(sh->batch_head);
1575
1576	pr_debug("%s: stripe %llu block: %d\n",
1577	__func__, (unsigned long long)sh->sector, target);
1578	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1579
1580	for (i = disks; i--; ) {
1581	if (i != target) {
1582	off_srcs[count] = sh->dev[i].offset;
1583	xor_srcs[count++] = sh->dev[i].page;
1584	}
1585	}
1586
1587	atomic_inc(v: &sh->count);
1588
1589	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE\|ASYNC_TX_XOR_ZERO_DST, NULL,
1590	cb_fn: ops_complete_compute, cb_param: sh, scribble: to_addr_conv(sh, percpu, i: `0`));
1591	if (unlikely(count == `1`))
1592	tx = async_memcpy(dest: xor_dest, src: xor_srcs[`0`], dest_offset: off_dest, src_offset: off_srcs[`0`],
1593	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1594	else
1595	tx = async_xor_offs(dest: xor_dest, offset: off_dest, src_list: xor_srcs, src_offset: off_srcs, src_cnt: count,
1596	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1597
1598	return tx;
1599	}
1600
1601	/ set_syndrome_sources - populate source buffers for gen_syndrome*
1602	* @srcs - (struct page *) array of size sh->disks
1603	* @offs - (unsigned int) array of offset for each page
1604	* @sh - stripe_head to parse
1605	*
1606	* Populates srcs in proper layout order for the stripe and returns the
1607	* 'count' of sources to be used in a call to async_gen_syndrome. The P
1608	* destination buffer is recorded in srcs[count] and the Q destination
1609	* is recorded in srcs[count+1]].
1610	*/
1611	static int set_syndrome_sources(struct page **srcs,
1612	unsigned int *offs,
1613	struct stripe_head *sh,
1614	int srctype)
1615	{
1616	int disks = sh->disks;
1617	int syndrome_disks = sh->ddf_layout ? disks : (disks - `2`);
1618	int d0_idx = raid6_d0(sh);
1619	int count;
1620	int i;
1621
1622	for (i = `0`; i < disks; i++)
1623	srcs[i] = NULL;
1624
1625	count = `0`;
1626	i = d0_idx;
1627	do {
1628	int slot = raid6_idx_to_slot(idx: i, sh, count: &count, syndrome_disks);
1629	struct r5dev *dev = &sh->dev[i];
1630
1631	if (i == sh->qd_idx \|\| i == sh->pd_idx \|\|
1632	(srctype == SYNDROME_SRC_ALL) \|\|
1633	(srctype == SYNDROME_SRC_WANT_DRAIN &&
1634	(test_bit(R5_Wantdrain, &dev->flags) \|\|
1635	test_bit(R5_InJournal, &dev->flags))) \|\|
1636	(srctype == SYNDROME_SRC_WRITTEN &&
1637	(dev->written \|\|
1638	test_bit(R5_InJournal, &dev->flags)))) {
1639	if (test_bit(R5_InJournal, &dev->flags))
1640	srcs[slot] = sh->dev[i].orig_page;
1641	else
1642	srcs[slot] = sh->dev[i].page;
1643	/*
1644	* For R5_InJournal, PAGE_SIZE must be 4KB and will
1645	* not shared page. In that case, dev[i].offset
1646	* is 0.
1647	*/
1648	offs[slot] = sh->dev[i].offset;
1649	}
1650	i = raid6_next_disk(disk: i, raid_disks: disks);
1651	} while (i != d0_idx);
1652
1653	return syndrome_disks;
1654	}
1655
1656	static struct dma_async_tx_descriptor *
1657	ops_run_compute6_1(struct stripe_head sh, struct* raid5_percpu *percpu)
1658	{
1659	int disks = sh->disks;
1660	struct page **blocks = to_addr_page(percpu, i: `0`);
1661	unsigned int *offs = to_addr_offs(sh, percpu);
1662	int target;
1663	int qd_idx = sh->qd_idx;
1664	struct dma_async_tx_descriptor *tx;
1665	struct async_submit_ctl submit;
1666	struct r5dev *tgt;
1667	struct page *dest;
1668	unsigned int dest_off;
1669	int i;
1670	int count;
1671
1672	BUG_ON(sh->batch_head);
1673	if (sh->ops.target < `0`)
1674	target = sh->ops.target2;
1675	else if (sh->ops.target2 < `0`)
1676	target = sh->ops.target;
1677	else
1678	/ we should only have one valid target /
1679	BUG();
1680	BUG_ON(target < `0`);
1681	pr_debug("%s: stripe %llu block: %d\n",
1682	__func__, (unsigned long long)sh->sector, target);
1683
1684	tgt = &sh->dev[target];
1685	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1686	dest = tgt->page;
1687	dest_off = tgt->offset;
1688
1689	atomic_inc(v: &sh->count);
1690
1691	if (target == qd_idx) {
1692	count = set_syndrome_sources(srcs: blocks, offs, sh, srctype: SYNDROME_SRC_ALL);
1693	blocks[count] = NULL; / regenerating p is not necessary /
1694	BUG_ON(blocks[count+`1`] != dest); / q should already be set /
1695	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE, NULL,
1696	cb_fn: ops_complete_compute, cb_param: sh,
1697	scribble: to_addr_conv(sh, percpu, i: `0`));
1698	tx = async_gen_syndrome(blocks, offsets: offs, src_cnt: count+`2`,
1699	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1700	} else {
1701	/ Compute any data- or p-drive using XOR /
1702	count = `0`;
1703	for (i = disks; i-- ; ) {
1704	if (i == target \|\| i == qd_idx)
1705	continue;
1706	offs[count] = sh->dev[i].offset;
1707	blocks[count++] = sh->dev[i].page;
1708	}
1709
1710	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE\|ASYNC_TX_XOR_ZERO_DST,
1711	NULL, cb_fn: ops_complete_compute, cb_param: sh,
1712	scribble: to_addr_conv(sh, percpu, i: `0`));
1713	tx = async_xor_offs(dest, offset: dest_off, src_list: blocks, src_offset: offs, src_cnt: count,
1714	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1715	}
1716
1717	return tx;
1718	}
1719
1720	static struct dma_async_tx_descriptor *
1721	ops_run_compute6_2(struct stripe_head sh, struct* raid5_percpu *percpu)
1722	{
1723	int i, count, disks = sh->disks;
1724	int syndrome_disks = sh->ddf_layout ? disks : disks-`2`;
1725	int d0_idx = raid6_d0(sh);
1726	int faila = -`1`, failb = -`1`;
1727	int target = sh->ops.target;
1728	int target2 = sh->ops.target2;
1729	struct r5dev *tgt = &sh->dev[target];
1730	struct r5dev *tgt2 = &sh->dev[target2];
1731	struct dma_async_tx_descriptor *tx;
1732	struct page **blocks = to_addr_page(percpu, i: `0`);
1733	unsigned int *offs = to_addr_offs(sh, percpu);
1734	struct async_submit_ctl submit;
1735
1736	BUG_ON(sh->batch_head);
1737	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1738	__func__, (unsigned long long)sh->sector, target, target2);
1739	BUG_ON(target < `0` \|\| target2 < `0`);
1740	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1741	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1742
1743	/ we need to open-code set_syndrome_sources to handle the*
1744	* slot number conversion for 'faila' and 'failb'
1745	*/
1746	for (i = `0`; i < disks ; i++) {
1747	offs[i] = `0`;
1748	blocks[i] = NULL;
1749	}
1750	count = `0`;
1751	i = d0_idx;
1752	do {
1753	int slot = raid6_idx_to_slot(idx: i, sh, count: &count, syndrome_disks);
1754
1755	offs[slot] = sh->dev[i].offset;
1756	blocks[slot] = sh->dev[i].page;
1757
1758	if (i == target)
1759	faila = slot;
1760	if (i == target2)
1761	failb = slot;
1762	i = raid6_next_disk(disk: i, raid_disks: disks);
1763	} while (i != d0_idx);
1764
1765	BUG_ON(faila == failb);
1766	if (failb < faila)
1767	swap(faila, failb);
1768	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1769	__func__, (unsigned long long)sh->sector, faila, failb);
1770
1771	atomic_inc(v: &sh->count);
1772
1773	if (failb == syndrome_disks+`1`) {
1774	/ Q disk is one of the missing disks /
1775	if (faila == syndrome_disks) {
1776	/ Missing P+Q, just recompute /
1777	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE, NULL,
1778	cb_fn: ops_complete_compute, cb_param: sh,
1779	scribble: to_addr_conv(sh, percpu, i: `0`));
1780	return async_gen_syndrome(blocks, offsets: offs, src_cnt: syndrome_disks+`2`,
1781	RAID5_STRIPE_SIZE(sh->raid_conf),
1782	submit: &submit);
1783	} else {
1784	struct page *dest;
1785	unsigned int dest_off;
1786	int data_target;
1787	int qd_idx = sh->qd_idx;
1788
1789	/ Missing D+Q: recompute D from P, then recompute Q /
1790	if (target == qd_idx)
1791	data_target = target2;
1792	else
1793	data_target = target;
1794
1795	count = `0`;
1796	for (i = disks; i-- ; ) {
1797	if (i == data_target \|\| i == qd_idx)
1798	continue;
1799	offs[count] = sh->dev[i].offset;
1800	blocks[count++] = sh->dev[i].page;
1801	}
1802	dest = sh->dev[data_target].page;
1803	dest_off = sh->dev[data_target].offset;
1804	init_async_submit(args: &submit,
1805	flags: ASYNC_TX_FENCE\|ASYNC_TX_XOR_ZERO_DST,
1806	NULL, NULL, NULL,
1807	scribble: to_addr_conv(sh, percpu, i: `0`));
1808	tx = async_xor_offs(dest, offset: dest_off, src_list: blocks, src_offset: offs, src_cnt: count,
1809	RAID5_STRIPE_SIZE(sh->raid_conf),
1810	submit: &submit);
1811
1812	count = set_syndrome_sources(srcs: blocks, offs, sh, srctype: SYNDROME_SRC_ALL);
1813	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE, tx,
1814	cb_fn: ops_complete_compute, cb_param: sh,
1815	scribble: to_addr_conv(sh, percpu, i: `0`));
1816	return async_gen_syndrome(blocks, offsets: offs, src_cnt: count+`2`,
1817	RAID5_STRIPE_SIZE(sh->raid_conf),
1818	submit: &submit);
1819	}
1820	} else {
1821	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE, NULL,
1822	cb_fn: ops_complete_compute, cb_param: sh,
1823	scribble: to_addr_conv(sh, percpu, i: `0`));
1824	if (failb == syndrome_disks) {
1825	/ We're missing D+P. /
1826	return async_raid6_datap_recov(src_num: syndrome_disks+`2`,
1827	RAID5_STRIPE_SIZE(sh->raid_conf),
1828	faila,
1829	ptrs: blocks, offs, submit: &submit);
1830	} else {
1831	/ We're missing D+D. /
1832	return async_raid6_2data_recov(src_num: syndrome_disks+`2`,
1833	RAID5_STRIPE_SIZE(sh->raid_conf),
1834	faila, failb,
1835	ptrs: blocks, offs, submit: &submit);
1836	}
1837	}
1838	}
1839
1840	static void ops_complete_prexor(void *stripe_head_ref)
1841	{
1842	struct stripe_head *sh = stripe_head_ref;
1843
1844	pr_debug("%s: stripe %llu\n", __func__,
1845	(unsigned long long)sh->sector);
1846
1847	if (r5c_is_writeback(log: sh->raid_conf->log))
1848	/*
1849	* raid5-cache write back uses orig_page during prexor.
1850	* After prexor, it is time to free orig_page
1851	*/
1852	r5c_release_extra_page(sh);
1853	}
1854
1855	static struct dma_async_tx_descriptor *
1856	ops_run_prexor5(struct stripe_head sh, struct* raid5_percpu *percpu,
1857	struct dma_async_tx_descriptor *tx)
1858	{
1859	int disks = sh->disks;
1860	struct page **xor_srcs = to_addr_page(percpu, i: `0`);
1861	unsigned int *off_srcs = to_addr_offs(sh, percpu);
1862	int count = `0`, pd_idx = sh->pd_idx, i;
1863	struct async_submit_ctl submit;
1864
1865	/ existing parity data subtracted /
1866	unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
1867	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1868
1869	BUG_ON(sh->batch_head);
1870	pr_debug("%s: stripe %llu\n", __func__,
1871	(unsigned long long)sh->sector);
1872
1873	for (i = disks; i--; ) {
1874	struct r5dev *dev = &sh->dev[i];
1875	/ Only process blocks that are known to be uptodate /
1876	if (test_bit(R5_InJournal, &dev->flags)) {
1877	/*
1878	* For this case, PAGE_SIZE must be equal to 4KB and
1879	* page offset is zero.
1880	*/
1881	off_srcs[count] = dev->offset;
1882	xor_srcs[count++] = dev->orig_page;
1883	} else if (test_bit(R5_Wantdrain, &dev->flags)) {
1884	off_srcs[count] = dev->offset;
1885	xor_srcs[count++] = dev->page;
1886	}
1887	}
1888
1889	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE\|ASYNC_TX_XOR_DROP_DST, tx,
1890	cb_fn: ops_complete_prexor, cb_param: sh, scribble: to_addr_conv(sh, percpu, i: `0`));
1891	tx = async_xor_offs(dest: xor_dest, offset: off_dest, src_list: xor_srcs, src_offset: off_srcs, src_cnt: count,
1892	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1893
1894	return tx;
1895	}
1896
1897	static struct dma_async_tx_descriptor *
1898	ops_run_prexor6(struct stripe_head sh, struct* raid5_percpu *percpu,
1899	struct dma_async_tx_descriptor *tx)
1900	{
1901	struct page **blocks = to_addr_page(percpu, i: `0`);
1902	unsigned int *offs = to_addr_offs(sh, percpu);
1903	int count;
1904	struct async_submit_ctl submit;
1905
1906	pr_debug("%s: stripe %llu\n", __func__,
1907	(unsigned long long)sh->sector);
1908
1909	count = set_syndrome_sources(srcs: blocks, offs, sh, srctype: SYNDROME_SRC_WANT_DRAIN);
1910
1911	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE\|ASYNC_TX_PQ_XOR_DST, tx,
1912	cb_fn: ops_complete_prexor, cb_param: sh, scribble: to_addr_conv(sh, percpu, i: `0`));
1913	tx = async_gen_syndrome(blocks, offsets: offs, src_cnt: count+`2`,
1914	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1915
1916	return tx;
1917	}
1918
1919	static struct dma_async_tx_descriptor *
1920	ops_run_biodrain(struct stripe_head sh, struct* dma_async_tx_descriptor *tx)
1921	{
1922	struct r5conf *conf = sh->raid_conf;
1923	int disks = sh->disks;
1924	int i;
1925	struct stripe_head *head_sh = sh;
1926
1927	pr_debug("%s: stripe %llu\n", __func__,
1928	(unsigned long long)sh->sector);
1929
1930	for (i = disks; i--; ) {
1931	struct r5dev *dev;
1932	struct bio *chosen;
1933
1934	sh = head_sh;
1935	if (test_and_clear_bit(nr: R5_Wantdrain, addr: &head_sh->dev[i].flags)) {
1936	struct bio *wbi;
1937
1938	again:
1939	dev = &sh->dev[i];
1940	/*
1941	* clear R5_InJournal, so when rewriting a page in
1942	* journal, it is not skipped by r5l_log_stripe()
1943	*/
1944	clear_bit(nr: R5_InJournal, addr: &dev->flags);
1945	spin_lock_irq(lock: &sh->stripe_lock);
1946	chosen = dev->towrite;
1947	dev->towrite = NULL;
1948	sh->overwrite_disks = `0`;
1949	BUG_ON(dev->written);
1950	wbi = dev->written = chosen;
1951	spin_unlock_irq(lock: &sh->stripe_lock);
1952	WARN_ON(dev->page != dev->orig_page);
1953
1954	while (wbi && wbi->bi_iter.bi_sector <
1955	dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1956	if (wbi->bi_opf & REQ_FUA)
1957	set_bit(nr: R5_WantFUA, addr: &dev->flags);
1958	if (wbi->bi_opf & REQ_SYNC)
1959	set_bit(nr: R5_SyncIO, addr: &dev->flags);
1960	if (bio_op(bio: wbi) == REQ_OP_DISCARD)
1961	set_bit(nr: R5_Discard, addr: &dev->flags);
1962	else {
1963	tx = async_copy_data(frombio: `1`, bio: wbi, page: &dev->page,
1964	poff: dev->offset,
1965	sector: dev->sector, tx, sh,
1966	no_skipcopy: r5c_is_writeback(log: conf->log));
1967	if (dev->page != dev->orig_page &&
1968	!r5c_is_writeback(log: conf->log)) {
1969	set_bit(nr: R5_SkipCopy, addr: &dev->flags);
1970	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
1971	clear_bit(nr: R5_OVERWRITE, addr: &dev->flags);
1972	}
1973	}
1974	wbi = r5_next_bio(conf, bio: wbi, sector: dev->sector);
1975	}
1976
1977	if (head_sh->batch_head) {
1978	sh = list_first_entry(&sh->batch_list,
1979	struct stripe_head,
1980	batch_list);
1981	if (sh == head_sh)
1982	continue;
1983	goto again;
1984	}
1985	}
1986	}
1987
1988	return tx;
1989	}
1990
1991	static void ops_complete_reconstruct(void *stripe_head_ref)
1992	{
1993	struct stripe_head *sh = stripe_head_ref;
1994	int disks = sh->disks;
1995	int pd_idx = sh->pd_idx;
1996	int qd_idx = sh->qd_idx;
1997	int i;
1998	bool fua = false, sync = false, discard = false;
1999
2000	pr_debug("%s: stripe %llu\n", __func__,
2001	(unsigned long long)sh->sector);
2002
2003	for (i = disks; i--; ) {
2004	fua \|= test_bit(R5_WantFUA, &sh->dev[i].flags);
2005	sync \|= test_bit(R5_SyncIO, &sh->dev[i].flags);
2006	discard \|= test_bit(R5_Discard, &sh->dev[i].flags);
2007	}
2008
2009	for (i = disks; i--; ) {
2010	struct r5dev *dev = &sh->dev[i];
2011
2012	if (dev->written \|\| i == pd_idx \|\| i == qd_idx) {
2013	if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
2014	set_bit(nr: R5_UPTODATE, addr: &dev->flags);
2015	if (test_bit(STRIPE_EXPAND_READY, &sh->state))
2016	set_bit(nr: R5_Expanded, addr: &dev->flags);
2017	}
2018	if (fua)
2019	set_bit(nr: R5_WantFUA, addr: &dev->flags);
2020	if (sync)
2021	set_bit(nr: R5_SyncIO, addr: &dev->flags);
2022	}
2023	}
2024
2025	if (sh->reconstruct_state == reconstruct_state_drain_run)
2026	sh->reconstruct_state = reconstruct_state_drain_result;
2027	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
2028	sh->reconstruct_state = reconstruct_state_prexor_drain_result;
2029	else {
2030	BUG_ON(sh->reconstruct_state != reconstruct_state_run);
2031	sh->reconstruct_state = reconstruct_state_result;
2032	}
2033
2034	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
2035	raid5_release_stripe(sh);
2036	}
2037
2038	static void
2039	ops_run_reconstruct5(struct stripe_head sh, struct* raid5_percpu *percpu,
2040	struct dma_async_tx_descriptor *tx)
2041	{
2042	int disks = sh->disks;
2043	struct page **xor_srcs;
2044	unsigned int *off_srcs;
2045	struct async_submit_ctl submit;
2046	int count, pd_idx = sh->pd_idx, i;
2047	struct page *xor_dest;
2048	unsigned int off_dest;
2049	int prexor = `0`;
2050	unsigned long flags;
2051	int j = `0`;
2052	struct stripe_head *head_sh = sh;
2053	int last_stripe;
2054
2055	pr_debug("%s: stripe %llu\n", __func__,
2056	(unsigned long long)sh->sector);
2057
2058	for (i = `0`; i < sh->disks; i++) {
2059	if (pd_idx == i)
2060	continue;
2061	if (!test_bit(R5_Discard, &sh->dev[i].flags))
2062	break;
2063	}
2064	if (i >= sh->disks) {
2065	atomic_inc(v: &sh->count);
2066	set_bit(nr: R5_Discard, addr: &sh->dev[pd_idx].flags);
2067	ops_complete_reconstruct(stripe_head_ref: sh);
2068	return;
2069	}
2070	again:
2071	count = `0`;
2072	xor_srcs = to_addr_page(percpu, i: j);
2073	off_srcs = to_addr_offs(sh, percpu);
2074	/ check if prexor is active which means only process blocks*
2075	* that are part of a read-modify-write (written)
2076	*/
2077	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2078	prexor = `1`;
2079	off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
2080	xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
2081	for (i = disks; i--; ) {
2082	struct r5dev *dev = &sh->dev[i];
2083	if (head_sh->dev[i].written \|\|
2084	test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
2085	off_srcs[count] = dev->offset;
2086	xor_srcs[count++] = dev->page;
2087	}
2088	}
2089	} else {
2090	xor_dest = sh->dev[pd_idx].page;
2091	off_dest = sh->dev[pd_idx].offset;
2092	for (i = disks; i--; ) {
2093	struct r5dev *dev = &sh->dev[i];
2094	if (i != pd_idx) {
2095	off_srcs[count] = dev->offset;
2096	xor_srcs[count++] = dev->page;
2097	}
2098	}
2099	}
2100
2101	/ 1/ if we prexor'd then the dest is reused as a source*
2102	* 2/ if we did not prexor then we are redoing the parity
2103	* set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
2104	* for the synchronous xor case
2105	*/
2106	last_stripe = !head_sh->batch_head \|\|
2107	list_first_entry(&sh->batch_list,
2108	struct stripe_head, batch_list) == head_sh;
2109	if (last_stripe) {
2110	flags = ASYNC_TX_ACK \|
2111	(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
2112
2113	atomic_inc(v: &head_sh->count);
2114	init_async_submit(args: &submit, flags, tx, cb_fn: ops_complete_reconstruct, cb_param: head_sh,
2115	scribble: to_addr_conv(sh, percpu, i: j));
2116	} else {
2117	flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
2118	init_async_submit(args: &submit, flags, tx, NULL, NULL,
2119	scribble: to_addr_conv(sh, percpu, i: j));
2120	}
2121
2122	if (unlikely(count == `1`))
2123	tx = async_memcpy(dest: xor_dest, src: xor_srcs[`0`], dest_offset: off_dest, src_offset: off_srcs[`0`],
2124	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
2125	else
2126	tx = async_xor_offs(dest: xor_dest, offset: off_dest, src_list: xor_srcs, src_offset: off_srcs, src_cnt: count,
2127	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
2128	if (!last_stripe) {
2129	j++;
2130	sh = list_first_entry(&sh->batch_list, struct stripe_head,
2131	batch_list);
2132	goto again;
2133	}
2134	}
2135
2136	static void
2137	ops_run_reconstruct6(struct stripe_head sh, struct* raid5_percpu *percpu,
2138	struct dma_async_tx_descriptor *tx)
2139	{
2140	struct async_submit_ctl submit;
2141	struct page **blocks;
2142	unsigned int *offs;
2143	int count, i, j = `0`;
2144	struct stripe_head *head_sh = sh;
2145	int last_stripe;
2146	int synflags;
2147	unsigned long txflags;
2148
2149	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
2150
2151	for (i = `0`; i < sh->disks; i++) {
2152	if (sh->pd_idx == i \|\| sh->qd_idx == i)
2153	continue;
2154	if (!test_bit(R5_Discard, &sh->dev[i].flags))
2155	break;
2156	}
2157	if (i >= sh->disks) {
2158	atomic_inc(v: &sh->count);
2159	set_bit(nr: R5_Discard, addr: &sh->dev[sh->pd_idx].flags);
2160	set_bit(nr: R5_Discard, addr: &sh->dev[sh->qd_idx].flags);
2161	ops_complete_reconstruct(stripe_head_ref: sh);
2162	return;
2163	}
2164
2165	again:
2166	blocks = to_addr_page(percpu, i: j);
2167	offs = to_addr_offs(sh, percpu);
2168
2169	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2170	synflags = SYNDROME_SRC_WRITTEN;
2171	txflags = ASYNC_TX_ACK \| ASYNC_TX_PQ_XOR_DST;
2172	} else {
2173	synflags = SYNDROME_SRC_ALL;
2174	txflags = ASYNC_TX_ACK;
2175	}
2176
2177	count = set_syndrome_sources(srcs: blocks, offs, sh, srctype: synflags);
2178	last_stripe = !head_sh->batch_head \|\|
2179	list_first_entry(&sh->batch_list,
2180	struct stripe_head, batch_list) == head_sh;
2181
2182	if (last_stripe) {
2183	atomic_inc(v: &head_sh->count);
2184	init_async_submit(args: &submit, flags: txflags, tx, cb_fn: ops_complete_reconstruct,
2185	cb_param: head_sh, scribble: to_addr_conv(sh, percpu, i: j));
2186	} else
2187	init_async_submit(args: &submit, flags: `0`, tx, NULL, NULL,
2188	scribble: to_addr_conv(sh, percpu, i: j));
2189	tx = async_gen_syndrome(blocks, offsets: offs, src_cnt: count+`2`,
2190	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
2191	if (!last_stripe) {
2192	j++;
2193	sh = list_first_entry(&sh->batch_list, struct stripe_head,
2194	batch_list);
2195	goto again;
2196	}
2197	}
2198
2199	static void ops_complete_check(void *stripe_head_ref)
2200	{
2201	struct stripe_head *sh = stripe_head_ref;
2202
2203	pr_debug("%s: stripe %llu\n", __func__,
2204	(unsigned long long)sh->sector);
2205
2206	sh->check_state = check_state_check_result;
2207	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
2208	raid5_release_stripe(sh);
2209	}
2210
2211	static void ops_run_check_p(struct stripe_head sh, struct* raid5_percpu *percpu)
2212	{
2213	int disks = sh->disks;
2214	int pd_idx = sh->pd_idx;
2215	int qd_idx = sh->qd_idx;
2216	struct page *xor_dest;
2217	unsigned int off_dest;
2218	struct page **xor_srcs = to_addr_page(percpu, i: `0`);
2219	unsigned int *off_srcs = to_addr_offs(sh, percpu);
2220	struct dma_async_tx_descriptor *tx;
2221	struct async_submit_ctl submit;
2222	int count;
2223	int i;
2224
2225	pr_debug("%s: stripe %llu\n", __func__,
2226	(unsigned long long)sh->sector);
2227
2228	BUG_ON(sh->batch_head);
2229	count = `0`;
2230	xor_dest = sh->dev[pd_idx].page;
2231	off_dest = sh->dev[pd_idx].offset;
2232	off_srcs[count] = off_dest;
2233	xor_srcs[count++] = xor_dest;
2234	for (i = disks; i--; ) {
2235	if (i == pd_idx \|\| i == qd_idx)
2236	continue;
2237	off_srcs[count] = sh->dev[i].offset;
2238	xor_srcs[count++] = sh->dev[i].page;
2239	}
2240
2241	init_async_submit(args: &submit, flags: `0`, NULL, NULL, NULL,
2242	scribble: to_addr_conv(sh, percpu, i: `0`));
2243	tx = async_xor_val_offs(dest: xor_dest, offset: off_dest, src_list: xor_srcs, src_offset: off_srcs, src_cnt: count,
2244	RAID5_STRIPE_SIZE(sh->raid_conf),
2245	result: &sh->ops.zero_sum_result, submit: &submit);
2246
2247	atomic_inc(v: &sh->count);
2248	init_async_submit(args: &submit, flags: ASYNC_TX_ACK, tx, cb_fn: ops_complete_check, cb_param: sh, NULL);
2249	tx = async_trigger_callback(submit: &submit);
2250	}
2251
2252	static void ops_run_check_pq(struct stripe_head sh, struct* raid5_percpu percpu, int* checkp)
2253	{
2254	struct page **srcs = to_addr_page(percpu, i: `0`);
2255	unsigned int *offs = to_addr_offs(sh, percpu);
2256	struct async_submit_ctl submit;
2257	int count;
2258
2259	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2260	(unsigned long long)sh->sector, checkp);
2261
2262	BUG_ON(sh->batch_head);
2263	count = set_syndrome_sources(srcs, offs, sh, srctype: SYNDROME_SRC_ALL);
2264	if (!checkp)
2265	srcs[count] = NULL;
2266
2267	atomic_inc(v: &sh->count);
2268	init_async_submit(args: &submit, flags: ASYNC_TX_ACK, NULL, cb_fn: ops_complete_check,
2269	cb_param: sh, scribble: to_addr_conv(sh, percpu, i: `0`));
2270	async_syndrome_val(blocks: srcs, offsets: offs, src_cnt: count+`2`,
2271	RAID5_STRIPE_SIZE(sh->raid_conf),
2272	pqres: &sh->ops.zero_sum_result, spare: percpu->spare_page, s_off: `0`, submit: &submit);
2273	}
2274
2275	static void raid_run_ops(struct stripe_head sh, unsigned* long ops_request)
2276	{
2277	int overlap_clear = `0`, i, disks = sh->disks;
2278	struct dma_async_tx_descriptor *tx = NULL;
2279	struct r5conf *conf = sh->raid_conf;
2280	int level = conf->level;
2281	struct raid5_percpu *percpu;
2282
2283	local_lock(&conf->percpu->lock);
2284	percpu = this_cpu_ptr(conf->percpu);
2285	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2286	ops_run_biofill(sh);
2287	overlap_clear++;
2288	}
2289
2290	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2291	if (level < `6`)
2292	tx = ops_run_compute5(sh, percpu);
2293	else {
2294	if (sh->ops.target2 < `0` \|\| sh->ops.target < `0`)
2295	tx = ops_run_compute6_1(sh, percpu);
2296	else
2297	tx = ops_run_compute6_2(sh, percpu);
2298	}
2299	/ terminate the chain if reconstruct is not set to be run /
2300	if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2301	async_tx_ack(tx);
2302	}
2303
2304	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2305	if (level < `6`)
2306	tx = ops_run_prexor5(sh, percpu, tx);
2307	else
2308	tx = ops_run_prexor6(sh, percpu, tx);
2309	}
2310
2311	if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2312	tx = ops_run_partial_parity(sh, percpu, tx);
2313
2314	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2315	tx = ops_run_biodrain(sh, tx);
2316	overlap_clear++;
2317	}
2318
2319	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2320	if (level < `6`)
2321	ops_run_reconstruct5(sh, percpu, tx);
2322	else
2323	ops_run_reconstruct6(sh, percpu, tx);
2324	}
2325
2326	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2327	if (sh->check_state == check_state_run)
2328	ops_run_check_p(sh, percpu);
2329	else if (sh->check_state == check_state_run_q)
2330	ops_run_check_pq(sh, percpu, checkp: `0`);
2331	else if (sh->check_state == check_state_run_pq)
2332	ops_run_check_pq(sh, percpu, checkp: `1`);
2333	else
2334	BUG();
2335	}
2336
2337	if (overlap_clear && !sh->batch_head) {
2338	for (i = disks; i--; ) {
2339	struct r5dev *dev = &sh->dev[i];
2340	if (test_and_clear_bit(nr: R5_Overlap, addr: &dev->flags))
2341	wake_up(&sh->raid_conf->wait_for_overlap);
2342	}
2343	}
2344	local_unlock(&conf->percpu->lock);
2345	}
2346
2347	static void free_stripe(struct kmem_cache sc, struct* stripe_head *sh)
2348	{
2349	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2350	kfree(sh->pages);
2351	#endif
2352	if (sh->ppl_page)
2353	__free_page(sh->ppl_page);
2354	kmem_cache_free(s: sc, objp: sh);
2355	}
2356
2357	static struct stripe_head alloc_stripe(struct* kmem_cache *sc, gfp_t gfp,
2358	int disks, struct r5conf *conf)
2359	{
2360	struct stripe_head *sh;
2361
2362	sh = kmem_cache_zalloc(k: sc, flags: gfp);
2363	if (sh) {
2364	spin_lock_init(&sh->stripe_lock);
2365	spin_lock_init(&sh->batch_lock);
2366	INIT_LIST_HEAD(list: &sh->batch_list);
2367	INIT_LIST_HEAD(list: &sh->lru);
2368	INIT_LIST_HEAD(list: &sh->r5c);
2369	INIT_LIST_HEAD(list: &sh->log_list);
2370	atomic_set(v: &sh->count, i: `1`);
2371	sh->raid_conf = conf;
2372	sh->log_start = MaxSector;
2373
2374	if (raid5_has_ppl(conf)) {
2375	sh->ppl_page = alloc_page(gfp);
2376	if (!sh->ppl_page) {
2377	free_stripe(sc, sh);
2378	return NULL;
2379	}
2380	}
2381	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2382	if (init_stripe_shared_pages(sh, conf, disks)) {
2383	free_stripe(sc, sh);
2384	return NULL;
2385	}
2386	#endif
2387	}
2388	return sh;
2389	}
2390	static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2391	{
2392	struct stripe_head *sh;
2393
2394	sh = alloc_stripe(sc: conf->slab_cache, gfp, disks: conf->pool_size, conf);
2395	if (!sh)
2396	return `0`;
2397
2398	if (grow_buffers(sh, gfp)) {
2399	shrink_buffers(sh);
2400	free_stripe(sc: conf->slab_cache, sh);
2401	return `0`;
2402	}
2403	sh->hash_lock_index =
2404	conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2405	/ we just created an active stripe so... /
2406	atomic_inc(v: &conf->active_stripes);
2407
2408	raid5_release_stripe(sh);
2409	WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes + `1`);
2410	return `1`;
2411	}
2412
2413	static int grow_stripes(struct r5conf conf, int* num)
2414	{
2415	struct kmem_cache *sc;
2416	size_t namelen = sizeof(conf->cache_name[`0`]);
2417	int devs = max(conf->raid_disks, conf->previous_raid_disks);
2418
2419	if (mddev_is_dm(mddev: conf->mddev))
2420	snprintf(buf: conf->cache_name[`0`], size: namelen,
2421	fmt: "raid%d-%p", conf->level, conf->mddev);
2422	else
2423	snprintf(buf: conf->cache_name[`0`], size: namelen,
2424	fmt: "raid%d-%s", conf->level, mdname(mddev: conf->mddev));
2425	snprintf(buf: conf->cache_name[`1`], size: namelen, fmt: "%.27s-alt", conf->cache_name[`0`]);
2426
2427	conf->active_name = `0`;
2428	sc = kmem_cache_create(name: conf->cache_name[conf->active_name],
2429	struct_size_t(struct stripe_head, dev, devs),
2430	align: `0`, flags: `0`, NULL);
2431	if (!sc)
2432	return `1`;
2433	conf->slab_cache = sc;
2434	conf->pool_size = devs;
2435	while (num--)
2436	if (!grow_one_stripe(conf, GFP_KERNEL))
2437	return `1`;
2438
2439	return `0`;
2440	}
2441
2442	/**
2443	* scribble_alloc - allocate percpu scribble buffer for required size
2444	* of the scribble region
2445	* @percpu: from for_each_present_cpu() of the caller
2446	* @num: total number of disks in the array
2447	* @cnt: scribble objs count for required size of the scribble region
2448	*
2449	* The scribble buffer size must be enough to contain:
2450	* 1/ a struct page pointer for each device in the array +2
2451	* 2/ room to convert each entry in (1) to its corresponding dma
2452	* (dma_map_page()) or page (page_address()) address.
2453	*
2454	* Note: the +2 is for the destination buffers of the ddf/raid6 case where we
2455	* calculate over all devices (not just the data blocks), using zeros in place
2456	* of the P and Q blocks.
2457	*/
2458	static int scribble_alloc(struct raid5_percpu *percpu,
2459	int num, int cnt)
2460	{
2461	size_t obj_size =
2462	sizeof(struct page ) (num + `2`) +
2463	sizeof(addr_conv_t) * (num + `2`) +
2464	sizeof(unsigned int) * (num + `2`);
2465	void *scribble;
2466
2467	/*
2468	* If here is in raid array suspend context, it is in memalloc noio
2469	* context as well, there is no potential recursive memory reclaim
2470	* I/Os with the GFP_KERNEL flag.
2471	*/
2472	scribble = kvmalloc_array(n: cnt, size: obj_size, GFP_KERNEL);
2473	if (!scribble)
2474	return -ENOMEM;
2475
2476	kvfree(addr: percpu->scribble);
2477
2478	percpu->scribble = scribble;
2479	percpu->scribble_obj_size = obj_size;
2480	return `0`;
2481	}
2482
2483	static int resize_chunks(struct r5conf conf, int* new_disks, int new_sectors)
2484	{
2485	unsigned long cpu;
2486	int err = `0`;
2487
2488	/ Never shrink. /
2489	if (conf->scribble_disks >= new_disks &&
2490	conf->scribble_sectors >= new_sectors)
2491	return `0`;
2492
2493	raid5_quiesce(mddev: conf->mddev, quiesce: true);
2494	cpus_read_lock();
2495
2496	for_each_present_cpu(cpu) {
2497	struct raid5_percpu *percpu;
2498
2499	percpu = per_cpu_ptr(conf->percpu, cpu);
2500	err = scribble_alloc(percpu, num: new_disks,
2501	cnt: new_sectors / RAID5_STRIPE_SECTORS(conf));
2502	if (err)
2503	break;
2504	}
2505
2506	cpus_read_unlock();
2507	raid5_quiesce(mddev: conf->mddev, quiesce: false);
2508
2509	if (!err) {
2510	conf->scribble_disks = new_disks;
2511	conf->scribble_sectors = new_sectors;
2512	}
2513	return err;
2514	}
2515
2516	static int resize_stripes(struct r5conf conf, int* newsize)
2517	{
2518	/ Make all the stripes able to hold 'newsize' devices.*
2519	* New slots in each stripe get 'page' set to a new page.
2520	*
2521	* This happens in stages:
2522	* 1/ create a new kmem_cache and allocate the required number of
2523	* stripe_heads.
2524	* 2/ gather all the old stripe_heads and transfer the pages across
2525	* to the new stripe_heads. This will have the side effect of
2526	* freezing the array as once all stripe_heads have been collected,
2527	* no IO will be possible. Old stripe heads are freed once their
2528	* pages have been transferred over, and the old kmem_cache is
2529	* freed when all stripes are done.
2530	* 3/ reallocate conf->disks to be suitable bigger. If this fails,
2531	* we simple return a failure status - no need to clean anything up.
2532	* 4/ allocate new pages for the new slots in the new stripe_heads.
2533	* If this fails, we don't bother trying the shrink the
2534	* stripe_heads down again, we just leave them as they are.
2535	* As each stripe_head is processed the new one is released into
2536	* active service.
2537	*
2538	* Once step2 is started, we cannot afford to wait for a write,
2539	* so we use GFP_NOIO allocations.
2540	*/
2541	struct stripe_head osh, nsh;
2542	LIST_HEAD(newstripes);
2543	struct disk_info *ndisks;
2544	int err = `0`;
2545	struct kmem_cache *sc;
2546	int i;
2547	int hash, cnt;
2548
2549	md_allow_write(mddev: conf->mddev);
2550
2551	/ Step 1 /
2552	sc = kmem_cache_create(name: conf->cache_name[`1`-conf->active_name],
2553	struct_size_t(struct stripe_head, dev, newsize),
2554	align: `0`, flags: `0`, NULL);
2555	if (!sc)
2556	return -ENOMEM;
2557
2558	/ Need to ensure auto-resizing doesn't interfere /
2559	mutex_lock(&conf->cache_size_mutex);
2560
2561	for (i = conf->max_nr_stripes; i; i--) {
2562	nsh = alloc_stripe(sc, GFP_KERNEL, disks: newsize, conf);
2563	if (!nsh)
2564	break;
2565
2566	list_add(new: &nsh->lru, head: &newstripes);
2567	}
2568	if (i) {
2569	/ didn't get enough, give up /
2570	while (!list_empty(head: &newstripes)) {
2571	nsh = list_entry(newstripes.next, struct stripe_head, lru);
2572	list_del(entry: &nsh->lru);
2573	free_stripe(sc, sh: nsh);
2574	}
2575	kmem_cache_destroy(s: sc);
2576	mutex_unlock(lock: &conf->cache_size_mutex);
2577	return -ENOMEM;
2578	}
2579	/ Step 2 - Must use GFP_NOIO now.*
2580	* OK, we have enough stripes, start collecting inactive
2581	* stripes and copying them over
2582	*/
2583	hash = `0`;
2584	cnt = `0`;
2585	list_for_each_entry(nsh, &newstripes, lru) {
2586	lock_device_hash_lock(conf, hash);
2587	wait_event_cmd(conf->wait_for_stripe,
2588	!list_empty(conf->inactive_list + hash),
2589	unlock_device_hash_lock(conf, hash),
2590	lock_device_hash_lock(conf, hash));
2591	osh = get_free_stripe(conf, hash);
2592	unlock_device_hash_lock(conf, hash);
2593
2594	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2595	for (i = `0`; i < osh->nr_pages; i++) {
2596	nsh->pages[i] = osh->pages[i];
2597	osh->pages[i] = NULL;
2598	}
2599	#endif
2600	for(i=`0`; i<conf->pool_size; i++) {
2601	nsh->dev[i].page = osh->dev[i].page;
2602	nsh->dev[i].orig_page = osh->dev[i].page;
2603	nsh->dev[i].offset = osh->dev[i].offset;
2604	}
2605	nsh->hash_lock_index = hash;
2606	free_stripe(sc: conf->slab_cache, sh: osh);
2607	cnt++;
2608	if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2609	!!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2610	hash++;
2611	cnt = `0`;
2612	}
2613	}
2614	kmem_cache_destroy(s: conf->slab_cache);
2615
2616	/ Step 3.*
2617	* At this point, we are holding all the stripes so the array
2618	* is completely stalled, so now is a good time to resize
2619	* conf->disks and the scribble region
2620	*/
2621	ndisks = kcalloc(n: newsize, size: sizeof(struct disk_info), GFP_NOIO);
2622	if (ndisks) {
2623	for (i = `0`; i < conf->pool_size; i++)
2624	ndisks[i] = conf->disks[i];
2625
2626	for (i = conf->pool_size; i < newsize; i++) {
2627	ndisks[i].extra_page = alloc_page(GFP_NOIO);
2628	if (!ndisks[i].extra_page)
2629	err = -ENOMEM;
2630	}
2631
2632	if (err) {
2633	for (i = conf->pool_size; i < newsize; i++)
2634	if (ndisks[i].extra_page)
2635	put_page(page: ndisks[i].extra_page);
2636	kfree(objp: ndisks);
2637	} else {
2638	kfree(objp: conf->disks);
2639	conf->disks = ndisks;
2640	}
2641	} else
2642	err = -ENOMEM;
2643
2644	conf->slab_cache = sc;
2645	conf->active_name = `1`-conf->active_name;
2646
2647	/ Step 4, return new stripes to service /
2648	while(!list_empty(head: &newstripes)) {
2649	nsh = list_entry(newstripes.next, struct stripe_head, lru);
2650	list_del_init(entry: &nsh->lru);
2651
2652	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2653	for (i = `0`; i < nsh->nr_pages; i++) {
2654	if (nsh->pages[i])
2655	continue;
2656	nsh->pages[i] = alloc_page(GFP_NOIO);
2657	if (!nsh->pages[i])
2658	err = -ENOMEM;
2659	}
2660
2661	for (i = conf->raid_disks; i < newsize; i++) {
2662	if (nsh->dev[i].page)
2663	continue;
2664	nsh->dev[i].page = raid5_get_dev_page(nsh, i);
2665	nsh->dev[i].orig_page = nsh->dev[i].page;
2666	nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
2667	}
2668	#else
2669	for (i=conf->raid_disks; i < newsize; i++)
2670	if (nsh->dev[i].page == NULL) {
2671	struct page *p = alloc_page(GFP_NOIO);
2672	nsh->dev[i].page = p;
2673	nsh->dev[i].orig_page = p;
2674	nsh->dev[i].offset = `0`;
2675	if (!p)
2676	err = -ENOMEM;
2677	}
2678	#endif
2679	raid5_release_stripe(sh: nsh);
2680	}
2681	/ critical section pass, GFP_NOIO no longer needed /
2682
2683	if (!err)
2684	conf->pool_size = newsize;
2685	mutex_unlock(lock: &conf->cache_size_mutex);
2686
2687	return err;
2688	}
2689
2690	static int drop_one_stripe(struct r5conf *conf)
2691	{
2692	struct stripe_head *sh;
2693	int hash = (conf->max_nr_stripes - `1`) & STRIPE_HASH_LOCKS_MASK;
2694
2695	spin_lock_irq(lock: conf->hash_locks + hash);
2696	sh = get_free_stripe(conf, hash);
2697	spin_unlock_irq(lock: conf->hash_locks + hash);
2698	if (!sh)
2699	return `0`;
2700	BUG_ON(atomic_read(&sh->count));
2701	shrink_buffers(sh);
2702	free_stripe(sc: conf->slab_cache, sh);
2703	atomic_dec(v: &conf->active_stripes);
2704	WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes - `1`);
2705	return `1`;
2706	}
2707
2708	static void shrink_stripes(struct r5conf *conf)
2709	{
2710	while (conf->max_nr_stripes &&
2711	drop_one_stripe(conf))
2712	;
2713
2714	kmem_cache_destroy(s: conf->slab_cache);
2715	conf->slab_cache = NULL;
2716	}
2717
2718	static void raid5_end_read_request(struct bio * bi)
2719	{
2720	struct stripe_head *sh = bi->bi_private;
2721	struct r5conf *conf = sh->raid_conf;
2722	int disks = sh->disks, i;
2723	struct md_rdev *rdev = NULL;
2724	sector_t s;
2725
2726	for (i=`0` ; i<disks; i++)
2727	if (bi == &sh->dev[i].req)
2728	break;
2729
2730	pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2731	(unsigned long long)sh->sector, i, atomic_read(&sh->count),
2732	bi->bi_status);
2733	if (i == disks) {
2734	BUG();
2735	return;
2736	}
2737	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2738	/ If replacement finished while this request was outstanding,*
2739	* 'replacement' might be NULL already.
2740	* In that case it moved down to 'rdev'.
2741	* rdev is not removed until all requests are finished.
2742	*/
2743	rdev = conf->disks[i].replacement;
2744	if (!rdev)
2745	rdev = conf->disks[i].rdev;
2746
2747	if (use_new_offset(conf, sh))
2748	s = sh->sector + rdev->new_data_offset;
2749	else
2750	s = sh->sector + rdev->data_offset;
2751	if (!bi->bi_status) {
2752	set_bit(nr: R5_UPTODATE, addr: &sh->dev[i].flags);
2753	if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2754	/ Note that this cannot happen on a*
2755	* replacement device. We just fail those on
2756	* any error
2757	*/
2758	pr_info_ratelimited(
2759	"md/raid:%s: read error corrected (%lu sectors at %llu on %pg)\n",
2760	mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
2761	(unsigned long long)s,
2762	rdev->bdev);
2763	atomic_add(RAID5_STRIPE_SECTORS(conf), v: &rdev->corrected_errors);
2764	clear_bit(nr: R5_ReadError, addr: &sh->dev[i].flags);
2765	clear_bit(nr: R5_ReWrite, addr: &sh->dev[i].flags);
2766	} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2767	clear_bit(nr: R5_ReadNoMerge, addr: &sh->dev[i].flags);
2768
2769	if (test_bit(R5_InJournal, &sh->dev[i].flags))
2770	/*
2771	* end read for a page in journal, this
2772	* must be preparing for prexor in rmw
2773	*/
2774	set_bit(nr: R5_OrigPageUPTDODATE, addr: &sh->dev[i].flags);
2775
2776	if (atomic_read(v: &rdev->read_errors))
2777	atomic_set(v: &rdev->read_errors, i: `0`);
2778	} else {
2779	int retry = `0`;
2780	int set_bad = `0`;
2781
2782	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[i].flags);
2783	if (!(bi->bi_status == BLK_STS_PROTECTION))
2784	atomic_inc(v: &rdev->read_errors);
2785	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2786	pr_warn_ratelimited(
2787	"md/raid:%s: read error on replacement device (sector %llu on %pg).\n",
2788	mdname(conf->mddev),
2789	(unsigned long long)s,
2790	rdev->bdev);
2791	else if (conf->mddev->degraded >= conf->max_degraded) {
2792	set_bad = `1`;
2793	pr_warn_ratelimited(
2794	"md/raid:%s: read error not correctable (sector %llu on %pg).\n",
2795	mdname(conf->mddev),
2796	(unsigned long long)s,
2797	rdev->bdev);
2798	} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2799	/ Oh, no!!! /
2800	set_bad = `1`;
2801	pr_warn_ratelimited(
2802	"md/raid:%s: read error NOT corrected!! (sector %llu on %pg).\n",
2803	mdname(conf->mddev),
2804	(unsigned long long)s,
2805	rdev->bdev);
2806	} else if (atomic_read(v: &rdev->read_errors)
2807	> conf->max_nr_stripes) {
2808	if (!test_bit(Faulty, &rdev->flags)) {
2809	pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2810	mdname(conf->mddev),
2811	atomic_read(&rdev->read_errors),
2812	conf->max_nr_stripes);
2813	pr_warn("md/raid:%s: Too many read errors, failing device %pg.\n",
2814	mdname(conf->mddev), rdev->bdev);
2815	}
2816	} else
2817	retry = `1`;
2818	if (set_bad && test_bit(In_sync, &rdev->flags)
2819	&& !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2820	retry = `1`;
2821	if (retry)
2822	if (sh->qd_idx >= `0` && sh->pd_idx == i)
2823	set_bit(nr: R5_ReadError, addr: &sh->dev[i].flags);
2824	else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2825	set_bit(nr: R5_ReadError, addr: &sh->dev[i].flags);
2826	clear_bit(nr: R5_ReadNoMerge, addr: &sh->dev[i].flags);
2827	} else
2828	set_bit(nr: R5_ReadNoMerge, addr: &sh->dev[i].flags);
2829	else {
2830	clear_bit(nr: R5_ReadError, addr: &sh->dev[i].flags);
2831	clear_bit(nr: R5_ReWrite, addr: &sh->dev[i].flags);
2832	if (!(set_bad
2833	&& test_bit(In_sync, &rdev->flags)
2834	&& rdev_set_badblocks(
2835	rdev, s: sh->sector, RAID5_STRIPE_SECTORS(conf), is_new: `0`)))
2836	md_error(mddev: conf->mddev, rdev);
2837	}
2838	}
2839	rdev_dec_pending(rdev, mddev: conf->mddev);
2840	bio_uninit(bi);
2841	clear_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
2842	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
2843	raid5_release_stripe(sh);
2844	}
2845
2846	static void raid5_end_write_request(struct bio *bi)
2847	{
2848	struct stripe_head *sh = bi->bi_private;
2849	struct r5conf *conf = sh->raid_conf;
2850	int disks = sh->disks, i;
2851	struct md_rdev *rdev;
2852	int replacement = `0`;
2853
2854	for (i = `0` ; i < disks; i++) {
2855	if (bi == &sh->dev[i].req) {
2856	rdev = conf->disks[i].rdev;
2857	break;
2858	}
2859	if (bi == &sh->dev[i].rreq) {
2860	rdev = conf->disks[i].replacement;
2861	if (rdev)
2862	replacement = `1`;
2863	else
2864	/ rdev was removed and 'replacement'*
2865	* replaced it. rdev is not removed
2866	* until all requests are finished.
2867	*/
2868	rdev = conf->disks[i].rdev;
2869	break;
2870	}
2871	}
2872	pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2873	(unsigned long long)sh->sector, i, atomic_read(&sh->count),
2874	bi->bi_status);
2875	if (i == disks) {
2876	BUG();
2877	return;
2878	}
2879
2880	if (replacement) {
2881	if (bi->bi_status)
2882	md_error(mddev: conf->mddev, rdev);
2883	else if (rdev_has_badblock(rdev, s: sh->sector,
2884	RAID5_STRIPE_SECTORS(conf)))
2885	set_bit(nr: R5_MadeGoodRepl, addr: &sh->dev[i].flags);
2886	} else {
2887	if (bi->bi_status) {
2888	set_bit(nr: STRIPE_DEGRADED, addr: &sh->state);
2889	set_bit(nr: WriteErrorSeen, addr: &rdev->flags);
2890	set_bit(nr: R5_WriteError, addr: &sh->dev[i].flags);
2891	if (!test_and_set_bit(nr: WantReplacement, addr: &rdev->flags))
2892	set_bit(nr: MD_RECOVERY_NEEDED,
2893	addr: &rdev->mddev->recovery);
2894	} else if (rdev_has_badblock(rdev, s: sh->sector,
2895	RAID5_STRIPE_SECTORS(conf))) {
2896	set_bit(nr: R5_MadeGood, addr: &sh->dev[i].flags);
2897	if (test_bit(R5_ReadError, &sh->dev[i].flags))
2898	/ That was a successful write so make*
2899	* sure it looks like we already did
2900	* a re-write.
2901	*/
2902	set_bit(nr: R5_ReWrite, addr: &sh->dev[i].flags);
2903	}
2904	}
2905	rdev_dec_pending(rdev, mddev: conf->mddev);
2906
2907	if (sh->batch_head && bi->bi_status && !replacement)
2908	set_bit(nr: STRIPE_BATCH_ERR, addr: &sh->batch_head->state);
2909
2910	bio_uninit(bi);
2911	if (!test_and_clear_bit(nr: R5_DOUBLE_LOCKED, addr: &sh->dev[i].flags))
2912	clear_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
2913	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
2914
2915	if (sh->batch_head && sh != sh->batch_head)
2916	raid5_release_stripe(sh: sh->batch_head);
2917	raid5_release_stripe(sh);
2918	}
2919
2920	static void raid5_error(struct mddev mddev, struct* md_rdev *rdev)
2921	{
2922	struct r5conf *conf = mddev->private;
2923	unsigned long flags;
2924	pr_debug("raid456: error called\n");
2925
2926	pr_crit("md/raid:%s: Disk failure on %pg, disabling device.\n",
2927	mdname(mddev), rdev->bdev);
2928
2929	spin_lock_irqsave(&conf->device_lock, flags);
2930	set_bit(nr: Faulty, addr: &rdev->flags);
2931	clear_bit(nr: In_sync, addr: &rdev->flags);
2932	mddev->degraded = raid5_calc_degraded(conf);
2933
2934	if (has_failed(conf)) {
2935	set_bit(nr: MD_BROKEN, addr: &conf->mddev->flags);
2936	conf->recovery_disabled = mddev->recovery_disabled;
2937
2938	pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
2939	mdname(mddev), mddev->degraded, conf->raid_disks);
2940	} else {
2941	pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
2942	mdname(mddev), conf->raid_disks - mddev->degraded);
2943	}
2944
2945	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
2946	set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery);
2947
2948	set_bit(nr: Blocked, addr: &rdev->flags);
2949	set_mask_bits(&mddev->sb_flags, `0`,
2950	BIT(MD_SB_CHANGE_DEVS) \| BIT(MD_SB_CHANGE_PENDING));
2951	r5c_update_on_rdev_error(mddev, rdev);
2952	}
2953
2954	/*
2955	* Input: a 'big' sector number,
2956	* Output: index of the data and parity disk, and the sector # in them.
2957	*/
2958	sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2959	int previous, int *dd_idx,
2960	struct stripe_head *sh)
2961	{
2962	sector_t stripe, stripe2;
2963	sector_t chunk_number;
2964	unsigned int chunk_offset;
2965	int pd_idx, qd_idx;
2966	int ddf_layout = `0`;
2967	sector_t new_sector;
2968	int algorithm = previous ? conf->prev_algo
2969	: conf->algorithm;
2970	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2971	: conf->chunk_sectors;
2972	int raid_disks = previous ? conf->previous_raid_disks
2973	: conf->raid_disks;
2974	int data_disks = raid_disks - conf->max_degraded;
2975
2976	/ First compute the information on this sector /
2977
2978	/*
2979	* Compute the chunk number and the sector offset inside the chunk
2980	*/
2981	chunk_offset = sector_div(r_sector, sectors_per_chunk);
2982	chunk_number = r_sector;
2983
2984	/*
2985	* Compute the stripe number
2986	*/
2987	stripe = chunk_number;
2988	*dd_idx = sector_div(stripe, data_disks);
2989	stripe2 = stripe;
2990	/*
2991	* Select the parity disk based on the user selected algorithm.
2992	*/
2993	pd_idx = qd_idx = -`1`;
2994	switch(conf->level) {
2995	case `4`:
2996	pd_idx = data_disks;
2997	break;
2998	case `5`:
2999	switch (algorithm) {
3000	case ALGORITHM_LEFT_ASYMMETRIC:
3001	pd_idx = data_disks - sector_div(stripe2, raid_disks);
3002	if (*dd_idx >= pd_idx)
3003	(*dd_idx)++;
3004	break;
3005	case ALGORITHM_RIGHT_ASYMMETRIC:
3006	pd_idx = sector_div(stripe2, raid_disks);
3007	if (*dd_idx >= pd_idx)
3008	(*dd_idx)++;
3009	break;
3010	case ALGORITHM_LEFT_SYMMETRIC:
3011	pd_idx = data_disks - sector_div(stripe2, raid_disks);
3012	dd_idx = (pd_idx + `1` + dd_idx) % raid_disks;
3013	break;
3014	case ALGORITHM_RIGHT_SYMMETRIC:
3015	pd_idx = sector_div(stripe2, raid_disks);
3016	dd_idx = (pd_idx + `1` + dd_idx) % raid_disks;
3017	break;
3018	case ALGORITHM_PARITY_0:
3019	pd_idx = `0`;
3020	(*dd_idx)++;
3021	break;
3022	case ALGORITHM_PARITY_N:
3023	pd_idx = data_disks;
3024	break;
3025	default:
3026	BUG();
3027	}
3028	break;
3029	case `6`:
3030
3031	switch (algorithm) {
3032	case ALGORITHM_LEFT_ASYMMETRIC:
3033	pd_idx = raid_disks - `1` - sector_div(stripe2, raid_disks);
3034	qd_idx = pd_idx + `1`;
3035	if (pd_idx == raid_disks-`1`) {
3036	(dd_idx)++; /* Q D D D P /
3037	qd_idx = `0`;
3038	} else if (*dd_idx >= pd_idx)
3039	(dd_idx) += `2`; /* D D P Q D /
3040	break;
3041	case ALGORITHM_RIGHT_ASYMMETRIC:
3042	pd_idx = sector_div(stripe2, raid_disks);
3043	qd_idx = pd_idx + `1`;
3044	if (pd_idx == raid_disks-`1`) {
3045	(dd_idx)++; /* Q D D D P /
3046	qd_idx = `0`;
3047	} else if (*dd_idx >= pd_idx)
3048	(dd_idx) += `2`; /* D D P Q D /
3049	break;
3050	case ALGORITHM_LEFT_SYMMETRIC:
3051	pd_idx = raid_disks - `1` - sector_div(stripe2, raid_disks);
3052	qd_idx = (pd_idx + `1`) % raid_disks;
3053	dd_idx = (pd_idx + `2` + dd_idx) % raid_disks;
3054	break;
3055	case ALGORITHM_RIGHT_SYMMETRIC:
3056	pd_idx = sector_div(stripe2, raid_disks);
3057	qd_idx = (pd_idx + `1`) % raid_disks;
3058	dd_idx = (pd_idx + `2` + dd_idx) % raid_disks;
3059	break;
3060
3061	case ALGORITHM_PARITY_0:
3062	pd_idx = `0`;
3063	qd_idx = `1`;
3064	(*dd_idx) += `2`;
3065	break;
3066	case ALGORITHM_PARITY_N:
3067	pd_idx = data_disks;
3068	qd_idx = data_disks + `1`;
3069	break;
3070
3071	case ALGORITHM_ROTATING_ZERO_RESTART:
3072	/ Exactly the same as RIGHT_ASYMMETRIC, but or*
3073	* of blocks for computing Q is different.
3074	*/
3075	pd_idx = sector_div(stripe2, raid_disks);
3076	qd_idx = pd_idx + `1`;
3077	if (pd_idx == raid_disks-`1`) {
3078	(dd_idx)++; /* Q D D D P /
3079	qd_idx = `0`;
3080	} else if (*dd_idx >= pd_idx)
3081	(dd_idx) += `2`; /* D D P Q D /
3082	ddf_layout = `1`;
3083	break;
3084
3085	case ALGORITHM_ROTATING_N_RESTART:
3086	/ Same a left_asymmetric, by first stripe is*
3087	* D D D P Q rather than
3088	* Q D D D P
3089	*/
3090	stripe2 += `1`;
3091	pd_idx = raid_disks - `1` - sector_div(stripe2, raid_disks);
3092	qd_idx = pd_idx + `1`;
3093	if (pd_idx == raid_disks-`1`) {
3094	(dd_idx)++; /* Q D D D P /
3095	qd_idx = `0`;
3096	} else if (*dd_idx >= pd_idx)
3097	(dd_idx) += `2`; /* D D P Q D /
3098	ddf_layout = `1`;
3099	break;
3100
3101	case ALGORITHM_ROTATING_N_CONTINUE:
3102	/ Same as left_symmetric but Q is before P /
3103	pd_idx = raid_disks - `1` - sector_div(stripe2, raid_disks);
3104	qd_idx = (pd_idx + raid_disks - `1`) % raid_disks;
3105	dd_idx = (pd_idx + `1` + dd_idx) % raid_disks;
3106	ddf_layout = `1`;
3107	break;
3108
3109	case ALGORITHM_LEFT_ASYMMETRIC_6:
3110	/ RAID5 left_asymmetric, with Q on last device /
3111	pd_idx = data_disks - sector_div(stripe2, raid_disks-`1`);
3112	if (*dd_idx >= pd_idx)
3113	(*dd_idx)++;
3114	qd_idx = raid_disks - `1`;
3115	break;
3116
3117	case ALGORITHM_RIGHT_ASYMMETRIC_6:
3118	pd_idx = sector_div(stripe2, raid_disks-`1`);
3119	if (*dd_idx >= pd_idx)
3120	(*dd_idx)++;
3121	qd_idx = raid_disks - `1`;
3122	break;
3123
3124	case ALGORITHM_LEFT_SYMMETRIC_6:
3125	pd_idx = data_disks - sector_div(stripe2, raid_disks-`1`);
3126	dd_idx = (pd_idx + `1` + dd_idx) % (raid_disks-`1`);
3127	qd_idx = raid_disks - `1`;
3128	break;
3129
3130	case ALGORITHM_RIGHT_SYMMETRIC_6:
3131	pd_idx = sector_div(stripe2, raid_disks-`1`);
3132	dd_idx = (pd_idx + `1` + dd_idx) % (raid_disks-`1`);
3133	qd_idx = raid_disks - `1`;
3134	break;
3135
3136	case ALGORITHM_PARITY_0_6:
3137	pd_idx = `0`;
3138	(*dd_idx)++;
3139	qd_idx = raid_disks - `1`;
3140	break;
3141
3142	default:
3143	BUG();
3144	}
3145	break;
3146	}
3147
3148	if (sh) {
3149	sh->pd_idx = pd_idx;
3150	sh->qd_idx = qd_idx;
3151	sh->ddf_layout = ddf_layout;
3152	}
3153	/*
3154	* Finally, compute the new sector number
3155	*/
3156	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
3157	return new_sector;
3158	}
3159
3160	sector_t raid5_compute_blocknr(struct stripe_head sh, int* i, int previous)
3161	{
3162	struct r5conf *conf = sh->raid_conf;
3163	int raid_disks = sh->disks;
3164	int data_disks = raid_disks - conf->max_degraded;
3165	sector_t new_sector = sh->sector, check;
3166	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
3167	: conf->chunk_sectors;
3168	int algorithm = previous ? conf->prev_algo
3169	: conf->algorithm;
3170	sector_t stripe;
3171	int chunk_offset;
3172	sector_t chunk_number;
3173	int dummy1, dd_idx = i;
3174	sector_t r_sector;
3175	struct stripe_head sh2;
3176
3177	chunk_offset = sector_div(new_sector, sectors_per_chunk);
3178	stripe = new_sector;
3179
3180	if (i == sh->pd_idx)
3181	return `0`;
3182	switch(conf->level) {
3183	case `4`: break;
3184	case `5`:
3185	switch (algorithm) {
3186	case ALGORITHM_LEFT_ASYMMETRIC:
3187	case ALGORITHM_RIGHT_ASYMMETRIC:
3188	if (i > sh->pd_idx)
3189	i--;
3190	break;
3191	case ALGORITHM_LEFT_SYMMETRIC:
3192	case ALGORITHM_RIGHT_SYMMETRIC:
3193	if (i < sh->pd_idx)
3194	i += raid_disks;
3195	i -= (sh->pd_idx + `1`);
3196	break;
3197	case ALGORITHM_PARITY_0:
3198	i -= `1`;
3199	break;
3200	case ALGORITHM_PARITY_N:
3201	break;
3202	default:
3203	BUG();
3204	}
3205	break;
3206	case `6`:
3207	if (i == sh->qd_idx)
3208	return `0`; / It is the Q disk /
3209	switch (algorithm) {
3210	case ALGORITHM_LEFT_ASYMMETRIC:
3211	case ALGORITHM_RIGHT_ASYMMETRIC:
3212	case ALGORITHM_ROTATING_ZERO_RESTART:
3213	case ALGORITHM_ROTATING_N_RESTART:
3214	if (sh->pd_idx == raid_disks-`1`)
3215	i--; / Q D D D P /
3216	else if (i > sh->pd_idx)
3217	i -= `2`; / D D P Q D /
3218	break;
3219	case ALGORITHM_LEFT_SYMMETRIC:
3220	case ALGORITHM_RIGHT_SYMMETRIC:
3221	if (sh->pd_idx == raid_disks-`1`)
3222	i--; / Q D D D P /
3223	else {
3224	/ D D P Q D /
3225	if (i < sh->pd_idx)
3226	i += raid_disks;
3227	i -= (sh->pd_idx + `2`);
3228	}
3229	break;
3230	case ALGORITHM_PARITY_0:
3231	i -= `2`;
3232	break;
3233	case ALGORITHM_PARITY_N:
3234	break;
3235	case ALGORITHM_ROTATING_N_CONTINUE:
3236	/ Like left_symmetric, but P is before Q /
3237	if (sh->pd_idx == `0`)
3238	i--; / P D D D Q /
3239	else {
3240	/ D D Q P D /
3241	if (i < sh->pd_idx)
3242	i += raid_disks;
3243	i -= (sh->pd_idx + `1`);
3244	}
3245	break;
3246	case ALGORITHM_LEFT_ASYMMETRIC_6:
3247	case ALGORITHM_RIGHT_ASYMMETRIC_6:
3248	if (i > sh->pd_idx)
3249	i--;
3250	break;
3251	case ALGORITHM_LEFT_SYMMETRIC_6:
3252	case ALGORITHM_RIGHT_SYMMETRIC_6:
3253	if (i < sh->pd_idx)
3254	i += data_disks + `1`;
3255	i -= (sh->pd_idx + `1`);
3256	break;
3257	case ALGORITHM_PARITY_0_6:
3258	i -= `1`;
3259	break;
3260	default:
3261	BUG();
3262	}
3263	break;
3264	}
3265
3266	chunk_number = stripe * data_disks + i;
3267	r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3268
3269	check = raid5_compute_sector(conf, r_sector,
3270	previous, dd_idx: &dummy1, sh: &sh2);
3271	if (check != sh->sector \|\| dummy1 != dd_idx \|\| sh2.pd_idx != sh->pd_idx
3272	\|\| sh2.qd_idx != sh->qd_idx) {
3273	pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3274	mdname(conf->mddev));
3275	return `0`;
3276	}
3277	return r_sector;
3278	}
3279
3280	/*
3281	* There are cases where we want handle_stripe_dirtying() and
3282	* schedule_reconstruction() to delay towrite to some dev of a stripe.
3283	*
3284	* This function checks whether we want to delay the towrite. Specifically,
3285	* we delay the towrite when:
3286	*
3287	* 1. degraded stripe has a non-overwrite to the missing dev, AND this
3288	* stripe has data in journal (for other devices).
3289	*
3290	* In this case, when reading data for the non-overwrite dev, it is
3291	* necessary to handle complex rmw of write back cache (prexor with
3292	* orig_page, and xor with page). To keep read path simple, we would
3293	* like to flush data in journal to RAID disks first, so complex rmw
3294	* is handled in the write patch (handle_stripe_dirtying).
3295	*
3296	* 2. when journal space is critical (R5C_LOG_CRITICAL=1)
3297	*
3298	* It is important to be able to flush all stripes in raid5-cache.
3299	* Therefore, we need reserve some space on the journal device for
3300	* these flushes. If flush operation includes pending writes to the
3301	* stripe, we need to reserve (conf->raid_disk + 1) pages per stripe
3302	* for the flush out. If we exclude these pending writes from flush
3303	* operation, we only need (conf->max_degraded + 1) pages per stripe.
3304	* Therefore, excluding pending writes in these cases enables more
3305	* efficient use of the journal device.
3306	*
3307	* Note: To make sure the stripe makes progress, we only delay
3308	* towrite for stripes with data already in journal (injournal > 0).
3309	* When LOG_CRITICAL, stripes with injournal == 0 will be sent to
3310	* no_space_stripes list.
3311	*
3312	* 3. during journal failure
3313	* In journal failure, we try to flush all cached data to raid disks
3314	* based on data in stripe cache. The array is read-only to upper
3315	* layers, so we would skip all pending writes.
3316	*
3317	*/
3318	static inline bool delay_towrite(struct r5conf *conf,
3319	struct r5dev *dev,
3320	struct stripe_head_state *s)
3321	{
3322	/ case 1 above /
3323	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3324	!test_bit(R5_Insync, &dev->flags) && s->injournal)
3325	return true;
3326	/ case 2 above /
3327	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3328	s->injournal > `0`)
3329	return true;
3330	/ case 3 above /
3331	if (s->log_failed && s->injournal)
3332	return true;
3333	return false;
3334	}
3335
3336	static void
3337	schedule_reconstruction(struct stripe_head sh, struct* stripe_head_state *s,
3338	int rcw, int expand)
3339	{
3340	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3341	struct r5conf *conf = sh->raid_conf;
3342	int level = conf->level;
3343
3344	if (rcw) {
3345	/*
3346	* In some cases, handle_stripe_dirtying initially decided to
3347	* run rmw and allocates extra page for prexor. However, rcw is
3348	* cheaper later on. We need to free the extra page now,
3349	* because we won't be able to do that in ops_complete_prexor().
3350	*/
3351	r5c_release_extra_page(sh);
3352
3353	for (i = disks; i--; ) {
3354	struct r5dev *dev = &sh->dev[i];
3355
3356	if (dev->towrite && !delay_towrite(conf, dev, s)) {
3357	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3358	set_bit(nr: R5_Wantdrain, addr: &dev->flags);
3359	if (!expand)
3360	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
3361	s->locked++;
3362	} else if (test_bit(R5_InJournal, &dev->flags)) {
3363	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3364	s->locked++;
3365	}
3366	}
3367	/ if we are not expanding this is a proper write request, and*
3368	* there will be bios with new data to be drained into the
3369	* stripe cache
3370	*/
3371	if (!expand) {
3372	if (!s->locked)
3373	/ False alarm, nothing to do /
3374	return;
3375	sh->reconstruct_state = reconstruct_state_drain_run;
3376	set_bit(nr: STRIPE_OP_BIODRAIN, addr: &s->ops_request);
3377	} else
3378	sh->reconstruct_state = reconstruct_state_run;
3379
3380	set_bit(nr: STRIPE_OP_RECONSTRUCT, addr: &s->ops_request);
3381
3382	if (s->locked + conf->max_degraded == disks)
3383	if (!test_and_set_bit(nr: STRIPE_FULL_WRITE, addr: &sh->state))
3384	atomic_inc(v: &conf->pending_full_writes);
3385	} else {
3386	BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) \|\|
3387	test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3388	BUG_ON(level == `6` &&
3389	(!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) \|\|
3390	test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3391
3392	for (i = disks; i--; ) {
3393	struct r5dev *dev = &sh->dev[i];
3394	if (i == pd_idx \|\| i == qd_idx)
3395	continue;
3396
3397	if (dev->towrite &&
3398	(test_bit(R5_UPTODATE, &dev->flags) \|\|
3399	test_bit(R5_Wantcompute, &dev->flags))) {
3400	set_bit(nr: R5_Wantdrain, addr: &dev->flags);
3401	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3402	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
3403	s->locked++;
3404	} else if (test_bit(R5_InJournal, &dev->flags)) {
3405	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3406	s->locked++;
3407	}
3408	}
3409	if (!s->locked)
3410	/ False alarm - nothing to do /
3411	return;
3412	sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3413	set_bit(nr: STRIPE_OP_PREXOR, addr: &s->ops_request);
3414	set_bit(nr: STRIPE_OP_BIODRAIN, addr: &s->ops_request);
3415	set_bit(nr: STRIPE_OP_RECONSTRUCT, addr: &s->ops_request);
3416	}
3417
3418	/ keep the parity disk(s) locked while asynchronous operations*
3419	* are in flight
3420	*/
3421	set_bit(nr: R5_LOCKED, addr: &sh->dev[pd_idx].flags);
3422	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[pd_idx].flags);
3423	s->locked++;
3424
3425	if (level == `6`) {
3426	int qd_idx = sh->qd_idx;
3427	struct r5dev *dev = &sh->dev[qd_idx];
3428
3429	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3430	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
3431	s->locked++;
3432	}
3433
3434	if (raid5_has_ppl(conf: sh->raid_conf) && sh->ppl_page &&
3435	test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3436	!test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3437	test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3438	set_bit(nr: STRIPE_OP_PARTIAL_PARITY, addr: &s->ops_request);
3439
3440	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3441	__func__, (unsigned long long)sh->sector,
3442	s->locked, s->ops_request);
3443	}
3444
3445	static bool stripe_bio_overlaps(struct stripe_head sh, struct* bio *bi,
3446	int dd_idx, int forwrite)
3447	{
3448	struct r5conf *conf = sh->raid_conf;
3449	struct bio **bip;
3450
3451	pr_debug("checking bi b#%llu to stripe s#%llu\n",
3452	bi->bi_iter.bi_sector, sh->sector);
3453
3454	/ Don't allow new IO added to stripes in batch list /
3455	if (sh->batch_head)
3456	return true;
3457
3458	if (forwrite)
3459	bip = &sh->dev[dd_idx].towrite;
3460	else
3461	bip = &sh->dev[dd_idx].toread;
3462
3463	while (bip && (bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3464	if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3465	return true;
3466	bip = &(*bip)->bi_next;
3467	}
3468
3469	if (bip && (bip)->bi_iter.bi_sector < bio_end_sector(bi))
3470	return true;
3471
3472	if (forwrite && raid5_has_ppl(conf)) {
3473	/*
3474	* With PPL only writes to consecutive data chunks within a
3475	* stripe are allowed because for a single stripe_head we can
3476	* only have one PPL entry at a time, which describes one data
3477	* range. Not really an overlap, but wait_for_overlap can be
3478	* used to handle this.
3479	*/
3480	sector_t sector;
3481	sector_t first = `0`;
3482	sector_t last = `0`;
3483	int count = `0`;
3484	int i;
3485
3486	for (i = `0`; i < sh->disks; i++) {
3487	if (i != sh->pd_idx &&
3488	(i == dd_idx \|\| sh->dev[i].towrite)) {
3489	sector = sh->dev[i].sector;
3490	if (count == `0` \|\| sector < first)
3491	first = sector;
3492	if (sector > last)
3493	last = sector;
3494	count++;
3495	}
3496	}
3497
3498	if (first + conf->chunk_sectors * (count - `1`) != last)
3499	return true;
3500	}
3501
3502	return false;
3503	}
3504
3505	static void __add_stripe_bio(struct stripe_head sh, struct* bio *bi,
3506	int dd_idx, int forwrite, int previous)
3507	{
3508	struct r5conf *conf = sh->raid_conf;
3509	struct bio **bip;
3510	int firstwrite = `0`;
3511
3512	if (forwrite) {
3513	bip = &sh->dev[dd_idx].towrite;
3514	if (!*bip)
3515	firstwrite = `1`;
3516	} else {
3517	bip = &sh->dev[dd_idx].toread;
3518	}
3519
3520	while (bip && (bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector)
3521	bip = &(*bip)->bi_next;
3522
3523	if (!forwrite \|\| previous)
3524	clear_bit(nr: STRIPE_BATCH_READY, addr: &sh->state);
3525
3526	BUG_ON(bip && bi->bi_next && (bip) != bi->bi_next);
3527	if (*bip)
3528	bi->bi_next = *bip;
3529	*bip = bi;
3530	bio_inc_remaining(bio: bi);
3531	md_write_inc(mddev: conf->mddev, bi);
3532
3533	if (forwrite) {
3534	/ check if page is covered /
3535	sector_t sector = sh->dev[dd_idx].sector;
3536	for (bi=sh->dev[dd_idx].towrite;
3537	sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
3538	bi && bi->bi_iter.bi_sector <= sector;
3539	bi = r5_next_bio(conf, bio: bi, sector: sh->dev[dd_idx].sector)) {
3540	if (bio_end_sector(bi) >= sector)
3541	sector = bio_end_sector(bi);
3542	}
3543	if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
3544	if (!test_and_set_bit(nr: R5_OVERWRITE, addr: &sh->dev[dd_idx].flags))
3545	sh->overwrite_disks++;
3546	}
3547
3548	pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n",
3549	(*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
3550	sh->dev[dd_idx].sector);
3551
3552	if (conf->mddev->bitmap && firstwrite) {
3553	/ Cannot hold spinlock over bitmap_startwrite,*
3554	* but must ensure this isn't added to a batch until
3555	* we have added to the bitmap and set bm_seq.
3556	* So set STRIPE_BITMAP_PENDING to prevent
3557	* batching.
3558	* If multiple __add_stripe_bio() calls race here they
3559	* much all set STRIPE_BITMAP_PENDING. So only the first one
3560	* to complete "bitmap_startwrite" gets to set
3561	* STRIPE_BIT_DELAY. This is important as once a stripe
3562	* is added to a batch, STRIPE_BIT_DELAY cannot be changed
3563	* any more.
3564	*/
3565	set_bit(nr: STRIPE_BITMAP_PENDING, addr: &sh->state);
3566	spin_unlock_irq(lock: &sh->stripe_lock);
3567	md_bitmap_startwrite(bitmap: conf->mddev->bitmap, offset: sh->sector,
3568	RAID5_STRIPE_SECTORS(conf), behind: `0`);
3569	spin_lock_irq(lock: &sh->stripe_lock);
3570	clear_bit(nr: STRIPE_BITMAP_PENDING, addr: &sh->state);
3571	if (!sh->batch_head) {
3572	sh->bm_seq = conf->seq_flush+`1`;
3573	set_bit(nr: STRIPE_BIT_DELAY, addr: &sh->state);
3574	}
3575	}
3576	}
3577
3578	/*
3579	* Each stripe/dev can have one or more bios attached.
3580	* toread/towrite point to the first in a chain.
3581	* The bi_next chain must be in order.
3582	*/
3583	static bool add_stripe_bio(struct stripe_head sh, struct* bio *bi,
3584	int dd_idx, int forwrite, int previous)
3585	{
3586	spin_lock_irq(lock: &sh->stripe_lock);
3587
3588	if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
3589	set_bit(nr: R5_Overlap, addr: &sh->dev[dd_idx].flags);
3590	spin_unlock_irq(lock: &sh->stripe_lock);
3591	return false;
3592	}
3593
3594	__add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
3595	spin_unlock_irq(lock: &sh->stripe_lock);
3596	return true;
3597	}
3598
/* Forward declaration; the definition is not in this part of the file. */
static void end_reshape(struct r5conf *conf);
3600
3601	static void stripe_set_idx(sector_t stripe, struct r5conf conf, int* previous,
3602	struct stripe_head *sh)
3603	{
3604	int sectors_per_chunk =
3605	previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3606	int dd_idx;
3607	int chunk_offset = sector_div(stripe, sectors_per_chunk);
3608	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3609
3610	raid5_compute_sector(conf,
3611	r_sector: stripe * (disks - conf->max_degraded)
3612	*sectors_per_chunk + chunk_offset,
3613	previous,
3614	dd_idx: &dd_idx, sh);
3615	}
3616
3617	static void
3618	handle_failed_stripe(struct r5conf conf, struct* stripe_head *sh,
3619	struct stripe_head_state s, int* disks)
3620	{
3621	int i;
3622	BUG_ON(sh->batch_head);
3623	for (i = disks; i--; ) {
3624	struct bio *bi;
3625	int bitmap_end = `0`;
3626
3627	if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3628	struct md_rdev *rdev = conf->disks[i].rdev;
3629
3630	if (rdev && test_bit(In_sync, &rdev->flags) &&
3631	!test_bit(Faulty, &rdev->flags))
3632	atomic_inc(v: &rdev->nr_pending);
3633	else
3634	rdev = NULL;
3635	if (rdev) {
3636	if (!rdev_set_badblocks(
3637	rdev,
3638	s: sh->sector,
3639	RAID5_STRIPE_SECTORS(conf), is_new: `0`))
3640	md_error(mddev: conf->mddev, rdev);
3641	rdev_dec_pending(rdev, mddev: conf->mddev);
3642	}
3643	}
3644	spin_lock_irq(lock: &sh->stripe_lock);
3645	/ fail all writes first /
3646	bi = sh->dev[i].towrite;
3647	sh->dev[i].towrite = NULL;
3648	sh->overwrite_disks = `0`;
3649	spin_unlock_irq(lock: &sh->stripe_lock);
3650	if (bi)
3651	bitmap_end = `1`;
3652
3653	log_stripe_write_finished(sh);
3654
3655	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[i].flags))
3656	wake_up(&conf->wait_for_overlap);
3657
3658	while (bi && bi->bi_iter.bi_sector <
3659	sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3660	struct bio *nextbi = r5_next_bio(conf, bio: bi, sector: sh->dev[i].sector);
3661
3662	md_write_end(mddev: conf->mddev);
3663	bio_io_error(bio: bi);
3664	bi = nextbi;
3665	}
3666	if (bitmap_end)
3667	md_bitmap_endwrite(bitmap: conf->mddev->bitmap, offset: sh->sector,
3668	RAID5_STRIPE_SECTORS(conf), success: `0`, behind: `0`);
3669	bitmap_end = `0`;
3670	/ and fail all 'written' /
3671	bi = sh->dev[i].written;
3672	sh->dev[i].written = NULL;
3673	if (test_and_clear_bit(nr: R5_SkipCopy, addr: &sh->dev[i].flags)) {
3674	WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3675	sh->dev[i].page = sh->dev[i].orig_page;
3676	}
3677
3678	if (bi) bitmap_end = `1`;
3679	while (bi && bi->bi_iter.bi_sector <
3680	sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3681	struct bio *bi2 = r5_next_bio(conf, bio: bi, sector: sh->dev[i].sector);
3682
3683	md_write_end(mddev: conf->mddev);
3684	bio_io_error(bio: bi);
3685	bi = bi2;
3686	}
3687
3688	/ fail any reads if this device is non-operational and*
3689	* the data has not reached the cache yet.
3690	*/
3691	if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3692	s->failed > conf->max_degraded &&
3693	(!test_bit(R5_Insync, &sh->dev[i].flags) \|\|
3694	test_bit(R5_ReadError, &sh->dev[i].flags))) {
3695	spin_lock_irq(lock: &sh->stripe_lock);
3696	bi = sh->dev[i].toread;
3697	sh->dev[i].toread = NULL;
3698	spin_unlock_irq(lock: &sh->stripe_lock);
3699	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[i].flags))
3700	wake_up(&conf->wait_for_overlap);
3701	if (bi)
3702	s->to_read--;
3703	while (bi && bi->bi_iter.bi_sector <
3704	sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3705	struct bio *nextbi =
3706	r5_next_bio(conf, bio: bi, sector: sh->dev[i].sector);
3707
3708	bio_io_error(bio: bi);
3709	bi = nextbi;
3710	}
3711	}
3712	if (bitmap_end)
3713	md_bitmap_endwrite(bitmap: conf->mddev->bitmap, offset: sh->sector,
3714	RAID5_STRIPE_SECTORS(conf), success: `0`, behind: `0`);
3715	/ If we were in the middle of a write the parity block might*
3716	* still be locked - so just clear all R5_LOCKED flags
3717	*/
3718	clear_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
3719	}
3720	s->to_write = `0`;
3721	s->written = `0`;
3722
3723	if (test_and_clear_bit(nr: STRIPE_FULL_WRITE, addr: &sh->state))
3724	if (atomic_dec_and_test(v: &conf->pending_full_writes))
3725	md_wakeup_thread(thread: conf->mddev->thread);
3726	}
3727
3728	static void
3729	handle_failed_sync(struct r5conf conf, struct* stripe_head *sh,
3730	struct stripe_head_state *s)
3731	{
3732	int abort = `0`;
3733	int i;
3734
3735	BUG_ON(sh->batch_head);
3736	clear_bit(nr: STRIPE_SYNCING, addr: &sh->state);
3737	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[sh->pd_idx].flags))
3738	wake_up(&conf->wait_for_overlap);
3739	s->syncing = `0`;
3740	s->replacing = `0`;
3741	/ There is nothing more to do for sync/check/repair.*
3742	* Don't even need to abort as that is handled elsewhere
3743	* if needed, and not always wanted e.g. if there is a known
3744	* bad block here.
3745	* For recover/replace we need to record a bad block on all
3746	* non-sync devices, or abort the recovery
3747	*/
3748	if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
3749	/ During recovery devices cannot be removed, so*
3750	* locking and refcounting of rdevs is not needed
3751	*/
3752	for (i = `0`; i < conf->raid_disks; i++) {
3753	struct md_rdev *rdev = conf->disks[i].rdev;
3754
3755	if (rdev
3756	&& !test_bit(Faulty, &rdev->flags)
3757	&& !test_bit(In_sync, &rdev->flags)
3758	&& !rdev_set_badblocks(rdev, s: sh->sector,
3759	RAID5_STRIPE_SECTORS(conf), is_new: `0`))
3760	abort = `1`;
3761	rdev = conf->disks[i].replacement;
3762
3763	if (rdev
3764	&& !test_bit(Faulty, &rdev->flags)
3765	&& !test_bit(In_sync, &rdev->flags)
3766	&& !rdev_set_badblocks(rdev, s: sh->sector,
3767	RAID5_STRIPE_SECTORS(conf), is_new: `0`))
3768	abort = `1`;
3769	}
3770	if (abort)
3771	conf->recovery_disabled =
3772	conf->mddev->recovery_disabled;
3773	}
3774	md_done_sync(mddev: conf->mddev, RAID5_STRIPE_SECTORS(conf), ok: !abort);
3775	}
3776
3777	static int want_replace(struct stripe_head sh, int* disk_idx)
3778	{
3779	struct md_rdev *rdev;
3780	int rv = `0`;
3781
3782	rdev = sh->raid_conf->disks[disk_idx].replacement;
3783	if (rdev
3784	&& !test_bit(Faulty, &rdev->flags)
3785	&& !test_bit(In_sync, &rdev->flags)
3786	&& (rdev->recovery_offset <= sh->sector
3787	\|\| rdev->mddev->recovery_cp <= sh->sector))
3788	rv = `1`;
3789	return rv;
3790	}
3791
3792	static int need_this_block(struct stripe_head sh, struct* stripe_head_state *s,
3793	int disk_idx, int disks)
3794	{
3795	struct r5dev *dev = &sh->dev[disk_idx];
3796	struct r5dev *fdev[`2`] = { &sh->dev[s->failed_num[`0`]],
3797	&sh->dev[s->failed_num[`1`]] };
3798	int i;
3799	bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
3800
3801
3802	if (test_bit(R5_LOCKED, &dev->flags) \|\|
3803	test_bit(R5_UPTODATE, &dev->flags))
3804	/ No point reading this as we already have it or have*
3805	* decided to get it.
3806	*/
3807	return `0`;
3808
3809	if (dev->toread \|\|
3810	(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3811	/ We need this block to directly satisfy a request /
3812	return `1`;
3813
3814	if (s->syncing \|\| s->expanding \|\|
3815	(s->replacing && want_replace(sh, disk_idx)))
3816	/ When syncing, or expanding we read everything.*
3817	* When replacing, we need the replaced block.
3818	*/
3819	return `1`;
3820
3821	if ((s->failed >= `1` && fdev[`0`]->toread) \|\|
3822	(s->failed >= `2` && fdev[`1`]->toread))
3823	/ If we want to read from a failed device, then*
3824	* we need to actually read every other device.
3825	*/
3826	return `1`;
3827
3828	/ Sometimes neither read-modify-write nor reconstruct-write*
3829	* cycles can work. In those cases we read every block we
3830	* can. Then the parity-update is certain to have enough to
3831	* work with.
3832	* This can only be a problem when we need to write something,
3833	* and some device has failed. If either of those tests
3834	* fail we need look no further.
3835	*/
3836	if (!s->failed \|\| !s->to_write)
3837	return `0`;
3838
3839	if (test_bit(R5_Insync, &dev->flags) &&
3840	!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3841	/ Pre-reads at not permitted until after short delay*
3842	* to gather multiple requests. However if this
3843	* device is no Insync, the block could only be computed
3844	* and there is no need to delay that.
3845	*/
3846	return `0`;
3847
3848	for (i = `0`; i < s->failed && i < `2`; i++) {
3849	if (fdev[i]->towrite &&
3850	!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3851	!test_bit(R5_OVERWRITE, &fdev[i]->flags))
3852	/ If we have a partial write to a failed*
3853	* device, then we will need to reconstruct
3854	* the content of that device, so all other
3855	* devices must be read.
3856	*/
3857	return `1`;
3858
3859	if (s->failed >= `2` &&
3860	(fdev[i]->towrite \|\|
3861	s->failed_num[i] == sh->pd_idx \|\|
3862	s->failed_num[i] == sh->qd_idx) &&
3863	!test_bit(R5_UPTODATE, &fdev[i]->flags))
3864	/ In max degraded raid6, If the failed disk is P, Q,*
3865	* or we want to read the failed disk, we need to do
3866	* reconstruct-write.
3867	*/
3868	force_rcw = true;
3869	}
3870
3871	/ If we are forced to do a reconstruct-write, because parity*
3872	* cannot be trusted and we are currently recovering it, there
3873	* is extra need to be careful.
3874	* If one of the devices that we would need to read, because
3875	* it is not being overwritten (and maybe not written at all)
3876	* is missing/faulty, then we need to read everything we can.
3877	*/
3878	if (!force_rcw &&
3879	sh->sector < sh->raid_conf->mddev->recovery_cp)
3880	/ reconstruct-write isn't being forced /
3881	return `0`;
3882	for (i = `0`; i < s->failed && i < `2`; i++) {
3883	if (s->failed_num[i] != sh->pd_idx &&
3884	s->failed_num[i] != sh->qd_idx &&
3885	!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3886	!test_bit(R5_OVERWRITE, &fdev[i]->flags))
3887	return `1`;
3888	}
3889
3890	return `0`;
3891	}
3892
3893	/ fetch_block - checks the given member device to see if its data needs*
3894	* to be read or computed to satisfy a request.
3895	*
3896	* Returns 1 when no more member devices need to be checked, otherwise returns
3897	* 0 to tell the loop in handle_stripe_fill to continue
3898	*/
3899	static int fetch_block(struct stripe_head sh, struct* stripe_head_state *s,
3900	int disk_idx, int disks)
3901	{
3902	struct r5dev *dev = &sh->dev[disk_idx];
3903
3904	/ is the data in this block needed, and can we get it? /
3905	if (need_this_block(sh, s, disk_idx, disks)) {
3906	/ we would like to get this block, possibly by computing it,*
3907	* otherwise read it if the backing disk is insync
3908	*/
3909	BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3910	BUG_ON(test_bit(R5_Wantread, &dev->flags));
3911	BUG_ON(sh->batch_head);
3912
3913	/*
3914	* In the raid6 case if the only non-uptodate disk is P
3915	* then we already trusted P to compute the other failed
3916	* drives. It is safe to compute rather than re-read P.
3917	* In other cases we only compute blocks from failed
3918	* devices, otherwise check/repair might fail to detect
3919	* a real inconsistency.
3920	*/
3921
3922	if ((s->uptodate == disks - `1`) &&
3923	((sh->qd_idx >= `0` && sh->pd_idx == disk_idx) \|\|
3924	(s->failed && (disk_idx == s->failed_num[`0`] \|\|
3925	disk_idx == s->failed_num[`1`])))) {
3926	/ have disk failed, and we're requested to fetch it;*
3927	* do compute it
3928	*/
3929	pr_debug("Computing stripe %llu block %d\n",
3930	(unsigned long long)sh->sector, disk_idx);
3931	set_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
3932	set_bit(nr: STRIPE_OP_COMPUTE_BLK, addr: &s->ops_request);
3933	set_bit(nr: R5_Wantcompute, addr: &dev->flags);
3934	sh->ops.target = disk_idx;
3935	sh->ops.target2 = -`1`; / no 2nd target /
3936	s->req_compute = `1`;
3937	/ Careful: from this point on 'uptodate' is in the eye*
3938	* of raid_run_ops which services 'compute' operations
3939	* before writes. R5_Wantcompute flags a block that will
3940	* be R5_UPTODATE by the time it is needed for a
3941	* subsequent operation.
3942	*/
3943	s->uptodate++;
3944	return `1`;
3945	} else if (s->uptodate == disks-`2` && s->failed >= `2`) {
3946	/ Computing 2-failure is very expensive; only*
3947	* do it if failed >= 2
3948	*/
3949	int other;
3950	for (other = disks; other--; ) {
3951	if (other == disk_idx)
3952	continue;
3953	if (!test_bit(R5_UPTODATE,
3954	&sh->dev[other].flags))
3955	break;
3956	}
3957	BUG_ON(other < `0`);
3958	pr_debug("Computing stripe %llu blocks %d,%d\n",
3959	(unsigned long long)sh->sector,
3960	disk_idx, other);
3961	set_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
3962	set_bit(nr: STRIPE_OP_COMPUTE_BLK, addr: &s->ops_request);
3963	set_bit(nr: R5_Wantcompute, addr: &sh->dev[disk_idx].flags);
3964	set_bit(nr: R5_Wantcompute, addr: &sh->dev[other].flags);
3965	sh->ops.target = disk_idx;
3966	sh->ops.target2 = other;
3967	s->uptodate += `2`;
3968	s->req_compute = `1`;
3969	return `1`;
3970	} else if (test_bit(R5_Insync, &dev->flags)) {
3971	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3972	set_bit(nr: R5_Wantread, addr: &dev->flags);
3973	s->locked++;
3974	pr_debug("Reading block %d (sync=%d)\n",
3975	disk_idx, s->syncing);
3976	}
3977	}
3978
3979	return `0`;
3980	}
3981
3982	/*
3983	* handle_stripe_fill - read or compute data to satisfy pending requests.
3984	*/
3985	static void handle_stripe_fill(struct stripe_head *sh,
3986	struct stripe_head_state *s,
3987	int disks)
3988	{
3989	int i;
3990
3991	/ look for blocks to read/compute, skip this if a compute*
3992	* is already in flight, or if the stripe contents are in the
3993	* midst of changing due to a write
3994	*/
3995	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3996	!sh->reconstruct_state) {
3997
3998	/*
3999	* For degraded stripe with data in journal, do not handle
4000	* read requests yet, instead, flush the stripe to raid
4001	* disks first, this avoids handling complex rmw of write
4002	* back cache (prexor with orig_page, and then xor with
4003	* page) in the read path
4004	*/
4005	if (s->to_read && s->injournal && s->failed) {
4006	if (test_bit(STRIPE_R5C_CACHING, &sh->state))
4007	r5c_make_stripe_write_out(sh);
4008	goto out;
4009	}
4010
4011	for (i = disks; i--; )
4012	if (fetch_block(sh, s, disk_idx: i, disks))
4013	break;
4014	}
4015	out:
4016	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4017	}
4018
4019	static void break_stripe_batch_list(struct stripe_head *head_sh,
4020	unsigned long handle_flags);
4021	/ handle_stripe_clean_event*
4022	* any written block on an uptodate or failed drive can be returned.
4023	* Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
4024	* never LOCKED, so we don't need to test 'failed' directly.
4025	*/
4026	static void handle_stripe_clean_event(struct r5conf *conf,
4027	struct stripe_head sh, int* disks)
4028	{
4029	int i;
4030	struct r5dev *dev;
4031	int discard_pending = `0`;
4032	struct stripe_head *head_sh = sh;
4033	bool do_endio = false;
4034
4035	for (i = disks; i--; )
4036	if (sh->dev[i].written) {
4037	dev = &sh->dev[i];
4038	if (!test_bit(R5_LOCKED, &dev->flags) &&
4039	(test_bit(R5_UPTODATE, &dev->flags) \|\|
4040	test_bit(R5_Discard, &dev->flags) \|\|
4041	test_bit(R5_SkipCopy, &dev->flags))) {
4042	/ We can return any write requests /
4043	struct bio wbi, wbi2;
4044	pr_debug("Return write for disc %d\n", i);
4045	if (test_and_clear_bit(nr: R5_Discard, addr: &dev->flags))
4046	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
4047	if (test_and_clear_bit(nr: R5_SkipCopy, addr: &dev->flags)) {
4048	WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
4049	}
4050	do_endio = true;
4051
4052	returnbi:
4053	dev->page = dev->orig_page;
4054	wbi = dev->written;
4055	dev->written = NULL;
4056	while (wbi && wbi->bi_iter.bi_sector <
4057	dev->sector + RAID5_STRIPE_SECTORS(conf)) {
4058	wbi2 = r5_next_bio(conf, bio: wbi, sector: dev->sector);
4059	md_write_end(mddev: conf->mddev);
4060	bio_endio(wbi);
4061	wbi = wbi2;
4062	}
4063	md_bitmap_endwrite(bitmap: conf->mddev->bitmap, offset: sh->sector,
4064	RAID5_STRIPE_SECTORS(conf),
4065	success: !test_bit(STRIPE_DEGRADED, &sh->state),
4066	behind: `0`);
4067	if (head_sh->batch_head) {
4068	sh = list_first_entry(&sh->batch_list,
4069	struct stripe_head,
4070	batch_list);
4071	if (sh != head_sh) {
4072	dev = &sh->dev[i];
4073	goto returnbi;
4074	}
4075	}
4076	sh = head_sh;
4077	dev = &sh->dev[i];
4078	} else if (test_bit(R5_Discard, &dev->flags))
4079	discard_pending = `1`;
4080	}
4081
4082	log_stripe_write_finished(sh);
4083
4084	if (!discard_pending &&
4085	test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
4086	int hash;
4087	clear_bit(nr: R5_Discard, addr: &sh->dev[sh->pd_idx].flags);
4088	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[sh->pd_idx].flags);
4089	if (sh->qd_idx >= `0`) {
4090	clear_bit(nr: R5_Discard, addr: &sh->dev[sh->qd_idx].flags);
4091	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[sh->qd_idx].flags);
4092	}
4093	/ now that discard is done we can proceed with any sync /
4094	clear_bit(nr: STRIPE_DISCARD, addr: &sh->state);
4095	/*
4096	* SCSI discard will change some bio fields and the stripe has
4097	* no updated data, so remove it from hash list and the stripe
4098	* will be reinitialized
4099	*/
4100	unhash:
4101	hash = sh->hash_lock_index;
4102	spin_lock_irq(lock: conf->hash_locks + hash);
4103	remove_hash(sh);
4104	spin_unlock_irq(lock: conf->hash_locks + hash);
4105	if (head_sh->batch_head) {
4106	sh = list_first_entry(&sh->batch_list,
4107	struct stripe_head, batch_list);
4108	if (sh != head_sh)
4109	goto unhash;
4110	}
4111	sh = head_sh;
4112
4113	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
4114	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4115
4116	}
4117
4118	if (test_and_clear_bit(nr: STRIPE_FULL_WRITE, addr: &sh->state))
4119	if (atomic_dec_and_test(v: &conf->pending_full_writes))
4120	md_wakeup_thread(thread: conf->mddev->thread);
4121
4122	if (head_sh->batch_head && do_endio)
4123	break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
4124	}
4125
4126	/*
4127	* For RMW in write back cache, we need extra page in prexor to store the
4128	* old data. This page is stored in dev->orig_page.
4129	*
4130	* This function checks whether we have data for prexor. The exact logic
4131	* is:
4132	* R5_UPTODATE && (!R5_InJournal \|\| R5_OrigPageUPTDODATE)
4133	*/
4134	static inline bool uptodate_for_rmw(struct r5dev *dev)
4135	{
4136	return (test_bit(R5_UPTODATE, &dev->flags)) &&
4137	(!test_bit(R5_InJournal, &dev->flags) \|\|
4138	test_bit(R5_OrigPageUPTDODATE, &dev->flags));
4139	}
4140
4141	static int handle_stripe_dirtying(struct r5conf *conf,
4142	struct stripe_head *sh,
4143	struct stripe_head_state *s,
4144	int disks)
4145	{
4146	int rmw = `0`, rcw = `0`, i;
4147	sector_t recovery_cp = conf->mddev->recovery_cp;
4148
4149	/ Check whether resync is now happening or should start.*
4150	* If yes, then the array is dirty (after unclean shutdown or
4151	* initial creation), so parity in some stripes might be inconsistent.
4152	* In this case, we need to always do reconstruct-write, to ensure
4153	* that in case of drive failure or read-error correction, we
4154	* generate correct data from the parity.
4155	*/
4156	if (conf->rmw_level == PARITY_DISABLE_RMW \|\|
4157	(recovery_cp < MaxSector && sh->sector >= recovery_cp &&
4158	s->failed == `0`)) {
4159	/ Calculate the real rcw later - for now make it*
4160	* look like rcw is cheaper
4161	*/
4162	rcw = `1`; rmw = `2`;
4163	pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
4164	conf->rmw_level, (unsigned long long)recovery_cp,
4165	(unsigned long long)sh->sector);
4166	} else for (i = disks; i--; ) {
4167	/ would I have to read this buffer for read_modify_write /
4168	struct r5dev *dev = &sh->dev[i];
4169	if (((dev->towrite && !delay_towrite(conf, dev, s)) \|\|
4170	i == sh->pd_idx \|\| i == sh->qd_idx \|\|
4171	test_bit(R5_InJournal, &dev->flags)) &&
4172	!test_bit(R5_LOCKED, &dev->flags) &&
4173	!(uptodate_for_rmw(dev) \|\|
4174	test_bit(R5_Wantcompute, &dev->flags))) {
4175	if (test_bit(R5_Insync, &dev->flags))
4176	rmw++;
4177	else
4178	rmw += `2`disks; /* cannot read it /
4179	}
4180	/ Would I have to read this buffer for reconstruct_write /
4181	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4182	i != sh->pd_idx && i != sh->qd_idx &&
4183	!test_bit(R5_LOCKED, &dev->flags) &&
4184	!(test_bit(R5_UPTODATE, &dev->flags) \|\|
4185	test_bit(R5_Wantcompute, &dev->flags))) {
4186	if (test_bit(R5_Insync, &dev->flags))
4187	rcw++;
4188	else
4189	rcw += `2`*disks;
4190	}
4191	}
4192
4193	pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
4194	(unsigned long long)sh->sector, sh->state, rmw, rcw);
4195	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4196	if ((rmw < rcw \|\| (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > `0`) {
4197	/ prefer read-modify-write, but need to get some data /
4198	mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d",
4199	sh->sector, rmw);
4200
4201	for (i = disks; i--; ) {
4202	struct r5dev *dev = &sh->dev[i];
4203	if (test_bit(R5_InJournal, &dev->flags) &&
4204	dev->page == dev->orig_page &&
4205	!test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
4206	/ alloc page for prexor /
4207	struct page *p = alloc_page(GFP_NOIO);
4208
4209	if (p) {
4210	dev->orig_page = p;
4211	continue;
4212	}
4213
4214	/*
4215	* alloc_page() failed, try use
4216	* disk_info->extra_page
4217	*/
4218	if (!test_and_set_bit(nr: R5C_EXTRA_PAGE_IN_USE,
4219	addr: &conf->cache_state)) {
4220	r5c_use_extra_page(sh);
4221	break;
4222	}
4223
4224	/ extra_page in use, add to delayed_list /
4225	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
4226	s->waiting_extra_page = `1`;
4227	return -EAGAIN;
4228	}
4229	}
4230
4231	for (i = disks; i--; ) {
4232	struct r5dev *dev = &sh->dev[i];
4233	if (((dev->towrite && !delay_towrite(conf, dev, s)) \|\|
4234	i == sh->pd_idx \|\| i == sh->qd_idx \|\|
4235	test_bit(R5_InJournal, &dev->flags)) &&
4236	!test_bit(R5_LOCKED, &dev->flags) &&
4237	!(uptodate_for_rmw(dev) \|\|
4238	test_bit(R5_Wantcompute, &dev->flags)) &&
4239	test_bit(R5_Insync, &dev->flags)) {
4240	if (test_bit(STRIPE_PREREAD_ACTIVE,
4241	&sh->state)) {
4242	pr_debug("Read_old block %d for r-m-w\n",
4243	i);
4244	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4245	set_bit(nr: R5_Wantread, addr: &dev->flags);
4246	s->locked++;
4247	} else
4248	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
4249	}
4250	}
4251	}
4252	if ((rcw < rmw \|\| (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > `0`) {
4253	/ want reconstruct write, but need to get some data /
4254	int qread =`0`;
4255	rcw = `0`;
4256	for (i = disks; i--; ) {
4257	struct r5dev *dev = &sh->dev[i];
4258	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4259	i != sh->pd_idx && i != sh->qd_idx &&
4260	!test_bit(R5_LOCKED, &dev->flags) &&
4261	!(test_bit(R5_UPTODATE, &dev->flags) \|\|
4262	test_bit(R5_Wantcompute, &dev->flags))) {
4263	rcw++;
4264	if (test_bit(R5_Insync, &dev->flags) &&
4265	test_bit(STRIPE_PREREAD_ACTIVE,
4266	&sh->state)) {
4267	pr_debug("Read_old block "
4268	"%d for Reconstruct\n", i);
4269	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4270	set_bit(nr: R5_Wantread, addr: &dev->flags);
4271	s->locked++;
4272	qread++;
4273	} else
4274	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
4275	}
4276	}
4277	if (rcw && !mddev_is_dm(mddev: conf->mddev))
4278	blk_add_trace_msg(conf->mddev->gendisk->queue,
4279	"raid5 rcw %llu %d %d %d",
4280	(unsigned long long)sh->sector, rcw, qread,
4281	test_bit(STRIPE_DELAYED, &sh->state));
4282	}
4283
4284	if (rcw > disks && rmw > disks &&
4285	!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4286	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
4287
4288	/ now if nothing is locked, and if we have enough data,*
4289	* we can start a write request
4290	*/
4291	/ since handle_stripe can be called at any time we need to handle the*
4292	* case where a compute block operation has been submitted and then a
4293	* subsequent call wants to start a write request. raid_run_ops only
4294	* handles the case where compute block and reconstruct are requested
4295	* simultaneously. If this is not the case then new writes need to be
4296	* held off until the compute completes.
4297	*/
4298	if ((s->req_compute \|\| !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4299	(s->locked == `0` && (rcw == `0` \|\| rmw == `0`) &&
4300	!test_bit(STRIPE_BIT_DELAY, &sh->state)))
4301	schedule_reconstruction(sh, s, rcw: rcw == `0`, expand: `0`);
4302	return `0`;
4303	}
4304
4305	static void handle_parity_checks5(struct r5conf conf, struct* stripe_head *sh,
4306	struct stripe_head_state s, int* disks)
4307	{
4308	struct r5dev *dev = NULL;
4309
4310	BUG_ON(sh->batch_head);
4311	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4312
4313	switch (sh->check_state) {
4314	case check_state_idle:
4315	/ start a new check operation if there are no failures /
4316	if (s->failed == `0`) {
4317	BUG_ON(s->uptodate != disks);
4318	sh->check_state = check_state_run;
4319	set_bit(nr: STRIPE_OP_CHECK, addr: &s->ops_request);
4320	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[sh->pd_idx].flags);
4321	s->uptodate--;
4322	break;
4323	}
4324	dev = &sh->dev[s->failed_num[`0`]];
4325	fallthrough;
4326	case check_state_compute_result:
4327	sh->check_state = check_state_idle;
4328	if (!dev)
4329	dev = &sh->dev[sh->pd_idx];
4330
4331	/ check that a write has not made the stripe insync /
4332	if (test_bit(STRIPE_INSYNC, &sh->state))
4333	break;
4334
4335	/ either failed parity check, or recovery is happening /
4336	BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4337	BUG_ON(s->uptodate != disks);
4338
4339	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4340	s->locked++;
4341	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4342
4343	clear_bit(nr: STRIPE_DEGRADED, addr: &sh->state);
4344	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4345	break;
4346	case check_state_run:
4347	break; / we will be called again upon completion /
4348	case check_state_check_result:
4349	sh->check_state = check_state_idle;
4350
4351	/ if a failure occurred during the check operation, leave*
4352	* STRIPE_INSYNC not set and let the stripe be handled again
4353	*/
4354	if (s->failed)
4355	break;
4356
4357	/ handle a successful check operation, if parity is correct*
4358	* we are done. Otherwise update the mismatch count and repair
4359	* parity if !MD_RECOVERY_CHECK
4360	*/
4361	if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == `0`)
4362	/ parity is correct (on disc,*
4363	* not in buffer any more)
4364	*/
4365	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4366	else {
4367	atomic64_add(RAID5_STRIPE_SECTORS(conf), v: &conf->mddev->resync_mismatches);
4368	if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4369	/ don't try to repair!! /
4370	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4371	pr_warn_ratelimited("%s: mismatch sector in range "
4372	"%llu-%llu\n", mdname(conf->mddev),
4373	(unsigned long long) sh->sector,
4374	(unsigned long long) sh->sector +
4375	RAID5_STRIPE_SECTORS(conf));
4376	} else {
4377	sh->check_state = check_state_compute_run;
4378	set_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
4379	set_bit(nr: STRIPE_OP_COMPUTE_BLK, addr: &s->ops_request);
4380	set_bit(nr: R5_Wantcompute,
4381	addr: &sh->dev[sh->pd_idx].flags);
4382	sh->ops.target = sh->pd_idx;
4383	sh->ops.target2 = -`1`;
4384	s->uptodate++;
4385	}
4386	}
4387	break;
4388	case check_state_compute_run:
4389	break;
4390	default:
4391	pr_err("%s: unknown check_state: %d sector: %llu\n",
4392	__func__, sh->check_state,
4393	(unsigned long long) sh->sector);
4394	BUG();
4395	}
4396	}
4397
4398	static void handle_parity_checks6(struct r5conf conf, struct* stripe_head *sh,
4399	struct stripe_head_state *s,
4400	int disks)
4401	{
4402	int pd_idx = sh->pd_idx;
4403	int qd_idx = sh->qd_idx;
4404	struct r5dev *dev;
4405
4406	BUG_ON(sh->batch_head);
4407	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4408
4409	BUG_ON(s->failed > `2`);
4410
4411	/ Want to check and possibly repair P and Q.*
4412	* However there could be one 'failed' device, in which
4413	* case we can only check one of them, possibly using the
4414	* other to generate missing data
4415	*/
4416
4417	switch (sh->check_state) {
4418	case check_state_idle:
4419	/ start a new check operation if there are < 2 failures /
4420	if (s->failed == s->q_failed) {
4421	/ The only possible failed device holds Q, so it*
4422	* makes sense to check P (If anything else were failed,
4423	* we would have used P to recreate it).
4424	*/
4425	sh->check_state = check_state_run;
4426	}
4427	if (!s->q_failed && s->failed < `2`) {
4428	/ Q is not failed, and we didn't use it to generate*
4429	* anything, so it makes sense to check it
4430	*/
4431	if (sh->check_state == check_state_run)
4432	sh->check_state = check_state_run_pq;
4433	else
4434	sh->check_state = check_state_run_q;
4435	}
4436
4437	/ discard potentially stale zero_sum_result /
4438	sh->ops.zero_sum_result = `0`;
4439
4440	if (sh->check_state == check_state_run) {
4441	/ async_xor_zero_sum destroys the contents of P /
4442	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[pd_idx].flags);
4443	s->uptodate--;
4444	}
4445	if (sh->check_state >= check_state_run &&
4446	sh->check_state <= check_state_run_pq) {
4447	/ async_syndrome_zero_sum preserves P and Q, so*
4448	* no need to mark them !uptodate here
4449	*/
4450	set_bit(nr: STRIPE_OP_CHECK, addr: &s->ops_request);
4451	break;
4452	}
4453
4454	/ we have 2-disk failure /
4455	BUG_ON(s->failed != `2`);
4456	fallthrough;
4457	case check_state_compute_result:
4458	sh->check_state = check_state_idle;
4459
4460	/ check that a write has not made the stripe insync /
4461	if (test_bit(STRIPE_INSYNC, &sh->state))
4462	break;
4463
4464	/ now write out any block on a failed drive,*
4465	* or P or Q if they were recomputed
4466	*/
4467	dev = NULL;
4468	if (s->failed == `2`) {
4469	dev = &sh->dev[s->failed_num[`1`]];
4470	s->locked++;
4471	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4472	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4473	}
4474	if (s->failed >= `1`) {
4475	dev = &sh->dev[s->failed_num[`0`]];
4476	s->locked++;
4477	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4478	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4479	}
4480	if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4481	dev = &sh->dev[pd_idx];
4482	s->locked++;
4483	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4484	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4485	}
4486	if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4487	dev = &sh->dev[qd_idx];
4488	s->locked++;
4489	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4490	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4491	}
4492	if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
4493	"%s: disk%td not up to date\n",
4494	mdname(conf->mddev),
4495	dev - (struct r5dev *) &sh->dev)) {
4496	clear_bit(nr: R5_LOCKED, addr: &dev->flags);
4497	clear_bit(nr: R5_Wantwrite, addr: &dev->flags);
4498	s->locked--;
4499	}
4500	clear_bit(nr: STRIPE_DEGRADED, addr: &sh->state);
4501
4502	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4503	break;
4504	case check_state_run:
4505	case check_state_run_q:
4506	case check_state_run_pq:
4507	break; / we will be called again upon completion /
4508	case check_state_check_result:
4509	sh->check_state = check_state_idle;
4510
4511	/ handle a successful check operation, if parity is correct*
4512	* we are done. Otherwise update the mismatch count and repair
4513	* parity if !MD_RECOVERY_CHECK
4514	*/
4515	if (sh->ops.zero_sum_result == `0`) {
4516	/ both parities are correct /
4517	if (!s->failed)
4518	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4519	else {
4520	/ in contrast to the raid5 case we can validate*
4521	* parity, but still have a failure to write
4522	* back
4523	*/
4524	sh->check_state = check_state_compute_result;
4525	/ Returning at this point means that we may go*
4526	* off and bring p and/or q uptodate again so
4527	* we make sure to check zero_sum_result again
4528	* to verify if p or q need writeback
4529	*/
4530	}
4531	} else {
4532	atomic64_add(RAID5_STRIPE_SECTORS(conf), v: &conf->mddev->resync_mismatches);
4533	if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4534	/ don't try to repair!! /
4535	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4536	pr_warn_ratelimited("%s: mismatch sector in range "
4537	"%llu-%llu\n", mdname(conf->mddev),
4538	(unsigned long long) sh->sector,
4539	(unsigned long long) sh->sector +
4540	RAID5_STRIPE_SECTORS(conf));
4541	} else {
4542	int *target = &sh->ops.target;
4543
4544	sh->ops.target = -`1`;
4545	sh->ops.target2 = -`1`;
4546	sh->check_state = check_state_compute_run;
4547	set_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
4548	set_bit(nr: STRIPE_OP_COMPUTE_BLK, addr: &s->ops_request);
4549	if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4550	set_bit(nr: R5_Wantcompute,
4551	addr: &sh->dev[pd_idx].flags);
4552	*target = pd_idx;
4553	target = &sh->ops.target2;
4554	s->uptodate++;
4555	}
4556	if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4557	set_bit(nr: R5_Wantcompute,
4558	addr: &sh->dev[qd_idx].flags);
4559	*target = qd_idx;
4560	s->uptodate++;
4561	}
4562	}
4563	}
4564	break;
4565	case check_state_compute_run:
4566	break;
4567	default:
4568	pr_warn("%s: unknown check_state: %d sector: %llu\n",
4569	__func__, sh->check_state,
4570	(unsigned long long) sh->sector);
4571	BUG();
4572	}
4573	}
4574
4575	static void handle_stripe_expansion(struct r5conf conf, struct* stripe_head *sh)
4576	{
4577	int i;
4578
4579	/ We have read all the blocks in this stripe and now we need to*
4580	* copy some of them into a target stripe for expand.
4581	*/
4582	struct dma_async_tx_descriptor *tx = NULL;
4583	BUG_ON(sh->batch_head);
4584	clear_bit(nr: STRIPE_EXPAND_SOURCE, addr: &sh->state);
4585	for (i = `0`; i < sh->disks; i++)
4586	if (i != sh->pd_idx && i != sh->qd_idx) {
4587	int dd_idx, j;
4588	struct stripe_head *sh2;
4589	struct async_submit_ctl submit;
4590
4591	sector_t bn = raid5_compute_blocknr(sh, i, previous: `1`);
4592	sector_t s = raid5_compute_sector(conf, r_sector: bn, previous: `0`,
4593	dd_idx: &dd_idx, NULL);
4594	sh2 = raid5_get_active_stripe(conf, NULL, sector: s,
4595	R5_GAS_NOBLOCK \| R5_GAS_NOQUIESCE);
4596	if (sh2 == NULL)
4597	/ so far only the early blocks of this stripe*
4598	* have been requested. When later blocks
4599	* get requested, we will try again
4600	*/
4601	continue;
4602	if (!test_bit(STRIPE_EXPANDING, &sh2->state) \|\|
4603	test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4604	/ must have already done this block /
4605	raid5_release_stripe(sh: sh2);
4606	continue;
4607	}
4608
4609	/ place all the copies on one channel /
4610	init_async_submit(args: &submit, flags: `0`, tx, NULL, NULL, NULL);
4611	tx = async_memcpy(dest: sh2->dev[dd_idx].page,
4612	src: sh->dev[i].page, dest_offset: sh2->dev[dd_idx].offset,
4613	src_offset: sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
4614	submit: &submit);
4615
4616	set_bit(nr: R5_Expanded, addr: &sh2->dev[dd_idx].flags);
4617	set_bit(nr: R5_UPTODATE, addr: &sh2->dev[dd_idx].flags);
4618	for (j = `0`; j < conf->raid_disks; j++)
4619	if (j != sh2->pd_idx &&
4620	j != sh2->qd_idx &&
4621	!test_bit(R5_Expanded, &sh2->dev[j].flags))
4622	break;
4623	if (j == conf->raid_disks) {
4624	set_bit(nr: STRIPE_EXPAND_READY, addr: &sh2->state);
4625	set_bit(nr: STRIPE_HANDLE, addr: &sh2->state);
4626	}
4627	raid5_release_stripe(sh: sh2);
4628
4629	}
4630	/ done submitting copies, wait for them to complete /
4631	async_tx_quiesce(tx: &tx);
4632	}
4633
/*
 * handle_stripe - do things to a stripe.
 *
 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
 * state of various bits to see what needs to be done.
 * Possible results:
 *    return some read requests which now have data
 *    return some write requests which are safely on storage
 *    schedule a read on some buffers
 *    schedule a write of some buffers
 *    return confirmation of parity correctness
 *
 */
4647
4648	static void analyse_stripe(struct stripe_head sh, struct* stripe_head_state *s)
4649	{
4650	struct r5conf *conf = sh->raid_conf;
4651	int disks = sh->disks;
4652	struct r5dev *dev;
4653	int i;
4654	int do_recovery = `0`;
4655
4656	memset(s, `0`, sizeof(*s));
4657
4658	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4659	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4660	s->failed_num[`0`] = -`1`;
4661	s->failed_num[`1`] = -`1`;
4662	s->log_failed = r5l_log_disk_error(conf);
4663
4664	/ Now to look around and see what can be done /
4665	for (i=disks; i--; ) {
4666	struct md_rdev *rdev;
4667	int is_bad = `0`;
4668
4669	dev = &sh->dev[i];
4670
4671	pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4672	i, dev->flags,
4673	dev->toread, dev->towrite, dev->written);
4674	/ maybe we can reply to a read*
4675	*
4676	* new wantfill requests are only permitted while
4677	* ops_complete_biofill is guaranteed to be inactive
4678	*/
4679	if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4680	!test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4681	set_bit(nr: R5_Wantfill, addr: &dev->flags);
4682
4683	/ now count some things /
4684	if (test_bit(R5_LOCKED, &dev->flags))
4685	s->locked++;
4686	if (test_bit(R5_UPTODATE, &dev->flags))
4687	s->uptodate++;
4688	if (test_bit(R5_Wantcompute, &dev->flags)) {
4689	s->compute++;
4690	BUG_ON(s->compute > `2`);
4691	}
4692
4693	if (test_bit(R5_Wantfill, &dev->flags))
4694	s->to_fill++;
4695	else if (dev->toread)
4696	s->to_read++;
4697	if (dev->towrite) {
4698	s->to_write++;
4699	if (!test_bit(R5_OVERWRITE, &dev->flags))
4700	s->non_overwrite++;
4701	}
4702	if (dev->written)
4703	s->written++;
4704	/ Prefer to use the replacement for reads, but only*
4705	* if it is recovered enough and has no bad blocks.
4706	*/
4707	rdev = conf->disks[i].replacement;
4708	if (rdev && !test_bit(Faulty, &rdev->flags) &&
4709	rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4710	!rdev_has_badblock(rdev, s: sh->sector,
4711	RAID5_STRIPE_SECTORS(conf)))
4712	set_bit(nr: R5_ReadRepl, addr: &dev->flags);
4713	else {
4714	if (rdev && !test_bit(Faulty, &rdev->flags))
4715	set_bit(nr: R5_NeedReplace, addr: &dev->flags);
4716	else
4717	clear_bit(nr: R5_NeedReplace, addr: &dev->flags);
4718	rdev = conf->disks[i].rdev;
4719	clear_bit(nr: R5_ReadRepl, addr: &dev->flags);
4720	}
4721	if (rdev && test_bit(Faulty, &rdev->flags))
4722	rdev = NULL;
4723	if (rdev) {
4724	is_bad = rdev_has_badblock(rdev, s: sh->sector,
4725	RAID5_STRIPE_SECTORS(conf));
4726	if (s->blocked_rdev == NULL
4727	&& (test_bit(Blocked, &rdev->flags)
4728	\|\| is_bad < `0`)) {
4729	if (is_bad < `0`)
4730	set_bit(nr: BlockedBadBlocks,
4731	addr: &rdev->flags);
4732	s->blocked_rdev = rdev;
4733	atomic_inc(v: &rdev->nr_pending);
4734	}
4735	}
4736	clear_bit(nr: R5_Insync, addr: &dev->flags);
4737	if (!rdev)
4738	/ Not in-sync /;
4739	else if (is_bad) {
4740	/ also not in-sync /
4741	if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4742	test_bit(R5_UPTODATE, &dev->flags)) {
4743	/ treat as in-sync, but with a read error*
4744	* which we can now try to correct
4745	*/
4746	set_bit(nr: R5_Insync, addr: &dev->flags);
4747	set_bit(nr: R5_ReadError, addr: &dev->flags);
4748	}
4749	} else if (test_bit(In_sync, &rdev->flags))
4750	set_bit(nr: R5_Insync, addr: &dev->flags);
4751	else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
4752	/ in sync if before recovery_offset /
4753	set_bit(nr: R5_Insync, addr: &dev->flags);
4754	else if (test_bit(R5_UPTODATE, &dev->flags) &&
4755	test_bit(R5_Expanded, &dev->flags))
4756	/ If we've reshaped into here, we assume it is Insync.*
4757	* We will shortly update recovery_offset to make
4758	* it official.
4759	*/
4760	set_bit(nr: R5_Insync, addr: &dev->flags);
4761
4762	if (test_bit(R5_WriteError, &dev->flags)) {
4763	/ This flag does not apply to '.replacement'*
4764	* only to .rdev, so make sure to check that*/
4765	struct md_rdev *rdev2 = conf->disks[i].rdev;
4766
4767	if (rdev2 == rdev)
4768	clear_bit(nr: R5_Insync, addr: &dev->flags);
4769	if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4770	s->handle_bad_blocks = `1`;
4771	atomic_inc(v: &rdev2->nr_pending);
4772	} else
4773	clear_bit(nr: R5_WriteError, addr: &dev->flags);
4774	}
4775	if (test_bit(R5_MadeGood, &dev->flags)) {
4776	/ This flag does not apply to '.replacement'*
4777	* only to .rdev, so make sure to check that*/
4778	struct md_rdev *rdev2 = conf->disks[i].rdev;
4779
4780	if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4781	s->handle_bad_blocks = `1`;
4782	atomic_inc(v: &rdev2->nr_pending);
4783	} else
4784	clear_bit(nr: R5_MadeGood, addr: &dev->flags);
4785	}
4786	if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4787	struct md_rdev *rdev2 = conf->disks[i].replacement;
4788
4789	if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4790	s->handle_bad_blocks = `1`;
4791	atomic_inc(v: &rdev2->nr_pending);
4792	} else
4793	clear_bit(nr: R5_MadeGoodRepl, addr: &dev->flags);
4794	}
4795	if (!test_bit(R5_Insync, &dev->flags)) {
4796	/ The ReadError flag will just be confusing now /
4797	clear_bit(nr: R5_ReadError, addr: &dev->flags);
4798	clear_bit(nr: R5_ReWrite, addr: &dev->flags);
4799	}
4800	if (test_bit(R5_ReadError, &dev->flags))
4801	clear_bit(nr: R5_Insync, addr: &dev->flags);
4802	if (!test_bit(R5_Insync, &dev->flags)) {
4803	if (s->failed < `2`)
4804	s->failed_num[s->failed] = i;
4805	s->failed++;
4806	if (rdev && !test_bit(Faulty, &rdev->flags))
4807	do_recovery = `1`;
4808	else if (!rdev) {
4809	rdev = conf->disks[i].replacement;
4810	if (rdev && !test_bit(Faulty, &rdev->flags))
4811	do_recovery = `1`;
4812	}
4813	}
4814
4815	if (test_bit(R5_InJournal, &dev->flags))
4816	s->injournal++;
4817	if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4818	s->just_cached++;
4819	}
4820	if (test_bit(STRIPE_SYNCING, &sh->state)) {
4821	/ If there is a failed device being replaced,*
4822	* we must be recovering.
4823	* else if we are after recovery_cp, we must be syncing
4824	* else if MD_RECOVERY_REQUESTED is set, we also are syncing.
4825	* else we can only be replacing
4826	* sync and recovery both need to read all devices, and so
4827	* use the same flag.
4828	*/
4829	if (do_recovery \|\|
4830	sh->sector >= conf->mddev->recovery_cp \|\|
4831	test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4832	s->syncing = `1`;
4833	else
4834	s->replacing = `1`;
4835	}
4836	}
4837
4838	/*
4839	* Return '1' if this is a member of batch, or '0' if it is a lone stripe or
4840	* a head which can now be handled.
4841	*/
4842	static int clear_batch_ready(struct stripe_head *sh)
4843	{
4844	struct stripe_head *tmp;
4845	if (!test_and_clear_bit(nr: STRIPE_BATCH_READY, addr: &sh->state))
4846	return (sh->batch_head && sh->batch_head != sh);
4847	spin_lock(lock: &sh->stripe_lock);
4848	if (!sh->batch_head) {
4849	spin_unlock(lock: &sh->stripe_lock);
4850	return `0`;
4851	}
4852
4853	/*
4854	* this stripe could be added to a batch list before we check
4855	* BATCH_READY, skips it
4856	*/
4857	if (sh->batch_head != sh) {
4858	spin_unlock(lock: &sh->stripe_lock);
4859	return `1`;
4860	}
4861	spin_lock(lock: &sh->batch_lock);
4862	list_for_each_entry(tmp, &sh->batch_list, batch_list)
4863	clear_bit(nr: STRIPE_BATCH_READY, addr: &tmp->state);
4864	spin_unlock(lock: &sh->batch_lock);
4865	spin_unlock(lock: &sh->stripe_lock);
4866
4867	/*
4868	* BATCH_READY is cleared, no new stripes can be added.
4869	* batch_list can be accessed without lock
4870	*/
4871	return `0`;
4872	}
4873
4874	static void break_stripe_batch_list(struct stripe_head *head_sh,
4875	unsigned long handle_flags)
4876	{
4877	struct stripe_head sh, next;
4878	int i;
4879	int do_wakeup = `0`;
4880
4881	list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4882
4883	list_del_init(entry: &sh->batch_list);
4884
4885	WARN_ONCE(sh->state & ((`1` << STRIPE_ACTIVE) \|
4886	(`1` << STRIPE_SYNCING) \|
4887	(`1` << STRIPE_REPLACED) \|
4888	(`1` << STRIPE_DELAYED) \|
4889	(`1` << STRIPE_BIT_DELAY) \|
4890	(`1` << STRIPE_FULL_WRITE) \|
4891	(`1` << STRIPE_BIOFILL_RUN) \|
4892	(`1` << STRIPE_COMPUTE_RUN) \|
4893	(`1` << STRIPE_DISCARD) \|
4894	(`1` << STRIPE_BATCH_READY) \|
4895	(`1` << STRIPE_BATCH_ERR) \|
4896	(`1` << STRIPE_BITMAP_PENDING)),
4897	"stripe state: %lx\n", sh->state);
4898	WARN_ONCE(head_sh->state & ((`1` << STRIPE_DISCARD) \|
4899	(`1` << STRIPE_REPLACED)),
4900	"head stripe state: %lx\n", head_sh->state);
4901
4902	set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS \|
4903	(`1` << STRIPE_PREREAD_ACTIVE) \|
4904	(`1` << STRIPE_DEGRADED) \|
4905	(`1` << STRIPE_ON_UNPLUG_LIST)),
4906	head_sh->state & (`1` << STRIPE_INSYNC));
4907
4908	sh->check_state = head_sh->check_state;
4909	sh->reconstruct_state = head_sh->reconstruct_state;
4910	spin_lock_irq(lock: &sh->stripe_lock);
4911	sh->batch_head = NULL;
4912	spin_unlock_irq(lock: &sh->stripe_lock);
4913	for (i = `0`; i < sh->disks; i++) {
4914	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[i].flags))
4915	do_wakeup = `1`;
4916	sh->dev[i].flags = head_sh->dev[i].flags &
4917	(~((`1` << R5_WriteError) \| (`1` << R5_Overlap)));
4918	}
4919	if (handle_flags == `0` \|\|
4920	sh->state & handle_flags)
4921	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4922	raid5_release_stripe(sh);
4923	}
4924	spin_lock_irq(lock: &head_sh->stripe_lock);
4925	head_sh->batch_head = NULL;
4926	spin_unlock_irq(lock: &head_sh->stripe_lock);
4927	for (i = `0`; i < head_sh->disks; i++)
4928	if (test_and_clear_bit(nr: R5_Overlap, addr: &head_sh->dev[i].flags))
4929	do_wakeup = `1`;
4930	if (head_sh->state & handle_flags)
4931	set_bit(nr: STRIPE_HANDLE, addr: &head_sh->state);
4932
4933	if (do_wakeup)
4934	wake_up(&head_sh->raid_conf->wait_for_overlap);
4935	}
4936
4937	static void handle_stripe(struct stripe_head *sh)
4938	{
4939	struct stripe_head_state s;
4940	struct r5conf *conf = sh->raid_conf;
4941	int i;
4942	int prexor;
4943	int disks = sh->disks;
4944	struct r5dev pdev, qdev;
4945
4946	clear_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4947
4948	/*
4949	* handle_stripe should not continue handle the batched stripe, only
4950	* the head of batch list or lone stripe can continue. Otherwise we
4951	* could see break_stripe_batch_list warns about the STRIPE_ACTIVE
4952	* is set for the batched stripe.
4953	*/
4954	if (clear_batch_ready(sh))
4955	return;
4956
4957	if (test_and_set_bit_lock(nr: STRIPE_ACTIVE, addr: &sh->state)) {
4958	/ already being handled, ensure it gets handled*
4959	* again when current action finishes */
4960	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4961	return;
4962	}
4963
4964	if (test_and_clear_bit(nr: STRIPE_BATCH_ERR, addr: &sh->state))
4965	break_stripe_batch_list(head_sh: sh, handle_flags: `0`);
4966
4967	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
4968	spin_lock(lock: &sh->stripe_lock);
4969	/*
4970	* Cannot process 'sync' concurrently with 'discard'.
4971	* Flush data in r5cache before 'sync'.
4972	*/
4973	if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
4974	!test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
4975	!test_bit(STRIPE_DISCARD, &sh->state) &&
4976	test_and_clear_bit(nr: STRIPE_SYNC_REQUESTED, addr: &sh->state)) {
4977	set_bit(nr: STRIPE_SYNCING, addr: &sh->state);
4978	clear_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4979	clear_bit(nr: STRIPE_REPLACED, addr: &sh->state);
4980	}
4981	spin_unlock(lock: &sh->stripe_lock);
4982	}
4983	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
4984
4985	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
4986	"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
4987	(unsigned long long)sh->sector, sh->state,
4988	atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
4989	sh->check_state, sh->reconstruct_state);
4990
4991	analyse_stripe(sh, s: &s);
4992
4993	if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
4994	goto finish;
4995
4996	if (s.handle_bad_blocks \|\|
4997	test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
4998	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4999	goto finish;
5000	}
5001
5002	if (unlikely(s.blocked_rdev)) {
5003	if (s.syncing \|\| s.expanding \|\| s.expanded \|\|
5004	s.replacing \|\| s.to_write \|\| s.written) {
5005	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
5006	goto finish;
5007	}
5008	/ There is nothing for the blocked_rdev to block /
5009	rdev_dec_pending(rdev: s.blocked_rdev, mddev: conf->mddev);
5010	s.blocked_rdev = NULL;
5011	}
5012
5013	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
5014	set_bit(nr: STRIPE_OP_BIOFILL, addr: &s.ops_request);
5015	set_bit(nr: STRIPE_BIOFILL_RUN, addr: &sh->state);
5016	}
5017
5018	pr_debug("locked=%d uptodate=%d to_read=%d"
5019	" to_write=%d failed=%d failed_num=%d,%d\n",
5020	s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
5021	s.failed_num[`0`], s.failed_num[`1`]);
5022	/*
5023	* check if the array has lost more than max_degraded devices and,
5024	* if so, some requests might need to be failed.
5025	*
5026	* When journal device failed (log_failed), we will only process
5027	* the stripe if there is data need write to raid disks
5028	*/
5029	if (s.failed > conf->max_degraded \|\|
5030	(s.log_failed && s.injournal == `0`)) {
5031	sh->check_state = `0`;
5032	sh->reconstruct_state = `0`;
5033	break_stripe_batch_list(head_sh: sh, handle_flags: `0`);
5034	if (s.to_read+s.to_write+s.written)
5035	handle_failed_stripe(conf, sh, s: &s, disks);
5036	if (s.syncing + s.replacing)
5037	handle_failed_sync(conf, sh, s: &s);
5038	}
5039
5040	/ Now we check to see if any write operations have recently*
5041	* completed
5042	*/
5043	prexor = `0`;
5044	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
5045	prexor = `1`;
5046	if (sh->reconstruct_state == reconstruct_state_drain_result \|\|
5047	sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
5048	sh->reconstruct_state = reconstruct_state_idle;
5049
5050	/ All the 'written' buffers and the parity block are ready to*
5051	* be written back to disk
5052	*/
5053	BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
5054	!test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
5055	BUG_ON(sh->qd_idx >= `0` &&
5056	!test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
5057	!test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
5058	for (i = disks; i--; ) {
5059	struct r5dev *dev = &sh->dev[i];
5060	if (test_bit(R5_LOCKED, &dev->flags) &&
5061	(i == sh->pd_idx \|\| i == sh->qd_idx \|\|
5062	dev->written \|\| test_bit(R5_InJournal,
5063	&dev->flags))) {
5064	pr_debug("Writing block %d\n", i);
5065	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
5066	if (prexor)
5067	continue;
5068	if (s.failed > `1`)
5069	continue;
5070	if (!test_bit(R5_Insync, &dev->flags) \|\|
5071	((i == sh->pd_idx \|\| i == sh->qd_idx) &&
5072	s.failed == `0`))
5073	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
5074	}
5075	}
5076	if (test_and_clear_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
5077	s.dec_preread_active = `1`;
5078	}
5079
5080	/*
5081	* might be able to return some write requests if the parity blocks
5082	* are safe, or on a failed drive
5083	*/
5084	pdev = &sh->dev[sh->pd_idx];
5085	s.p_failed = (s.failed >= `1` && s.failed_num[`0`] == sh->pd_idx)
5086	\|\| (s.failed >= `2` && s.failed_num[`1`] == sh->pd_idx);
5087	qdev = &sh->dev[sh->qd_idx];
5088	s.q_failed = (s.failed >= `1` && s.failed_num[`0`] == sh->qd_idx)
5089	\|\| (s.failed >= `2` && s.failed_num[`1`] == sh->qd_idx)
5090	\|\| conf->level < `6`;
5091
5092	if (s.written &&
5093	(s.p_failed \|\| ((test_bit(R5_Insync, &pdev->flags)
5094	&& !test_bit(R5_LOCKED, &pdev->flags)
5095	&& (test_bit(R5_UPTODATE, &pdev->flags) \|\|
5096	test_bit(R5_Discard, &pdev->flags))))) &&
5097	(s.q_failed \|\| ((test_bit(R5_Insync, &qdev->flags)
5098	&& !test_bit(R5_LOCKED, &qdev->flags)
5099	&& (test_bit(R5_UPTODATE, &qdev->flags) \|\|
5100	test_bit(R5_Discard, &qdev->flags))))))
5101	handle_stripe_clean_event(conf, sh, disks);
5102
5103	if (s.just_cached)
5104	r5c_handle_cached_data_endio(conf, sh, disks);
5105	log_stripe_write_finished(sh);
5106
5107	/ Now we might consider reading some blocks, either to check/generate*
5108	* parity, or to satisfy requests
5109	* or to load a block that is being partially written.
5110	*/
5111	if (s.to_read \|\| s.non_overwrite
5112	\|\| (s.to_write && s.failed)
5113	\|\| (s.syncing && (s.uptodate + s.compute < disks))
5114	\|\| s.replacing
5115	\|\| s.expanding)
5116	handle_stripe_fill(sh, s: &s, disks);
5117
5118	/*
5119	* When the stripe finishes full journal write cycle (write to journal
5120	* and raid disk), this is the clean up procedure so it is ready for
5121	* next operation.
5122	*/
5123	r5c_finish_stripe_write_out(conf, sh, s: &s);
5124
5125	/*
5126	* Now to consider new write requests, cache write back and what else,
5127	* if anything should be read. We do not handle new writes when:
5128	* 1/ A 'write' operation (copy+xor) is already in flight.
5129	* 2/ A 'check' operation is in flight, as it may clobber the parity
5130	* block.
5131	* 3/ A r5c cache log write is in flight.
5132	*/
5133
5134	if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
5135	if (!r5c_is_writeback(log: conf->log)) {
5136	if (s.to_write)
5137	handle_stripe_dirtying(conf, sh, s: &s, disks);
5138	} else { / write back cache /
5139	int ret = `0`;
5140
5141	/ First, try handle writes in caching phase /
5142	if (s.to_write)
5143	ret = r5c_try_caching_write(conf, sh, s: &s,
5144	disks);
5145	/*
5146	* If caching phase failed: ret == -EAGAIN
5147	* OR
5148	* stripe under reclaim: !caching && injournal
5149	*
5150	* fall back to handle_stripe_dirtying()
5151	*/
5152	if (ret == -EAGAIN \|\|
5153	/ stripe under reclaim: !caching && injournal /
5154	(!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
5155	s.injournal > `0`)) {
5156	ret = handle_stripe_dirtying(conf, sh, s: &s,
5157	disks);
5158	if (ret == -EAGAIN)
5159	goto finish;
5160	}
5161	}
5162	}
5163
5164	/ maybe we need to check and possibly fix the parity for this stripe*
5165	* Any reads will already have been scheduled, so we just see if enough
5166	* data is available. The parity check is held off while parity
5167	* dependent operations are in flight.
5168	*/
5169	if (sh->check_state \|\|
5170	(s.syncing && s.locked == `0` &&
5171	!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5172	!test_bit(STRIPE_INSYNC, &sh->state))) {
5173	if (conf->level == `6`)
5174	handle_parity_checks6(conf, sh, s: &s, disks);
5175	else
5176	handle_parity_checks5(conf, sh, s: &s, disks);
5177	}
5178
5179	if ((s.replacing \|\| s.syncing) && s.locked == `0`
5180	&& !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
5181	&& !test_bit(STRIPE_REPLACED, &sh->state)) {
5182	/ Write out to replacement devices where possible /
5183	for (i = `0`; i < conf->raid_disks; i++)
5184	if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
5185	WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
5186	set_bit(nr: R5_WantReplace, addr: &sh->dev[i].flags);
5187	set_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
5188	s.locked++;
5189	}
5190	if (s.replacing)
5191	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
5192	set_bit(nr: STRIPE_REPLACED, addr: &sh->state);
5193	}
5194	if ((s.syncing \|\| s.replacing) && s.locked == `0` &&
5195	!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5196	test_bit(STRIPE_INSYNC, &sh->state)) {
5197	md_done_sync(mddev: conf->mddev, RAID5_STRIPE_SECTORS(conf), ok: `1`);
5198	clear_bit(nr: STRIPE_SYNCING, addr: &sh->state);
5199	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[sh->pd_idx].flags))
5200	wake_up(&conf->wait_for_overlap);
5201	}
5202
5203	/ If the failed drives are just a ReadError, then we might need*
5204	* to progress the repair/check process
5205	*/
5206	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
5207	for (i = `0`; i < s.failed; i++) {
5208	struct r5dev *dev = &sh->dev[s.failed_num[i]];
5209	if (test_bit(R5_ReadError, &dev->flags)
5210	&& !test_bit(R5_LOCKED, &dev->flags)
5211	&& test_bit(R5_UPTODATE, &dev->flags)
5212	) {
5213	if (!test_bit(R5_ReWrite, &dev->flags)) {
5214	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
5215	set_bit(nr: R5_ReWrite, addr: &dev->flags);
5216	} else
5217	/ let's read it back /
5218	set_bit(nr: R5_Wantread, addr: &dev->flags);
5219	set_bit(nr: R5_LOCKED, addr: &dev->flags);
5220	s.locked++;
5221	}
5222	}
5223
5224	/ Finish reconstruct operations initiated by the expansion process /
5225	if (sh->reconstruct_state == reconstruct_state_result) {
5226	struct stripe_head *sh_src
5227	= raid5_get_active_stripe(conf, NULL, sector: sh->sector,
5228	R5_GAS_PREVIOUS \| R5_GAS_NOBLOCK \|
5229	R5_GAS_NOQUIESCE);
5230	if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
5231	/ sh cannot be written until sh_src has been read.*
5232	* so arrange for sh to be delayed a little
5233	*/
5234	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
5235	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
5236	if (!test_and_set_bit(nr: STRIPE_PREREAD_ACTIVE,
5237	addr: &sh_src->state))
5238	atomic_inc(v: &conf->preread_active_stripes);
5239	raid5_release_stripe(sh: sh_src);
5240	goto finish;
5241	}
5242	if (sh_src)
5243	raid5_release_stripe(sh: sh_src);
5244
5245	sh->reconstruct_state = reconstruct_state_idle;
5246	clear_bit(nr: STRIPE_EXPANDING, addr: &sh->state);
5247	for (i = conf->raid_disks; i--; ) {
5248	set_bit(nr: R5_Wantwrite, addr: &sh->dev[i].flags);
5249	set_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
5250	s.locked++;
5251	}
5252	}
5253
5254	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
5255	!sh->reconstruct_state) {
5256	/ Need to write out all blocks after computing parity /
5257	sh->disks = conf->raid_disks;
5258	stripe_set_idx(stripe: sh->sector, conf, previous: `0`, sh);
5259	schedule_reconstruction(sh, s: &s, rcw: `1`, expand: `1`);
5260	} else if (s.expanded && !sh->reconstruct_state && s.locked == `0`) {
5261	clear_bit(nr: STRIPE_EXPAND_READY, addr: &sh->state);
5262	atomic_dec(v: &conf->reshape_stripes);
5263	wake_up(&conf->wait_for_overlap);
5264	md_done_sync(mddev: conf->mddev, RAID5_STRIPE_SECTORS(conf), ok: `1`);
5265	}
5266
5267	if (s.expanding && s.locked == `0` &&
5268	!test_bit(STRIPE_COMPUTE_RUN, &sh->state))
5269	handle_stripe_expansion(conf, sh);
5270
5271	finish:
5272	/ wait for this device to become unblocked /
5273	if (unlikely(s.blocked_rdev)) {
5274	if (conf->mddev->external)
5275	md_wait_for_blocked_rdev(rdev: s.blocked_rdev,
5276	mddev: conf->mddev);
5277	else
5278	/ Internal metadata will immediately*
5279	* be written by raid5d, so we don't
5280	* need to wait here.
5281	*/
5282	rdev_dec_pending(rdev: s.blocked_rdev,
5283	mddev: conf->mddev);
5284	}
5285
5286	if (s.handle_bad_blocks)
5287	for (i = disks; i--; ) {
5288	struct md_rdev *rdev;
5289	struct r5dev *dev = &sh->dev[i];
5290	if (test_and_clear_bit(nr: R5_WriteError, addr: &dev->flags)) {
5291	/ We own a safe reference to the rdev /
5292	rdev = conf->disks[i].rdev;
5293	if (!rdev_set_badblocks(rdev, s: sh->sector,
5294	RAID5_STRIPE_SECTORS(conf), is_new: `0`))
5295	md_error(mddev: conf->mddev, rdev);
5296	rdev_dec_pending(rdev, mddev: conf->mddev);
5297	}
5298	if (test_and_clear_bit(nr: R5_MadeGood, addr: &dev->flags)) {
5299	rdev = conf->disks[i].rdev;
5300	rdev_clear_badblocks(rdev, s: sh->sector,
5301	RAID5_STRIPE_SECTORS(conf), is_new: `0`);
5302	rdev_dec_pending(rdev, mddev: conf->mddev);
5303	}
5304	if (test_and_clear_bit(nr: R5_MadeGoodRepl, addr: &dev->flags)) {
5305	rdev = conf->disks[i].replacement;
5306	if (!rdev)
5307	/ rdev have been moved down /
5308	rdev = conf->disks[i].rdev;
5309	rdev_clear_badblocks(rdev, s: sh->sector,
5310	RAID5_STRIPE_SECTORS(conf), is_new: `0`);
5311	rdev_dec_pending(rdev, mddev: conf->mddev);
5312	}
5313	}
5314
5315	if (s.ops_request)
5316	raid_run_ops(sh, ops_request: s.ops_request);
5317
5318	ops_run_io(sh, s: &s);
5319
5320	if (s.dec_preread_active) {
5321	/ We delay this until after ops_run_io so that if make_request*
5322	* is waiting on a flush, it won't continue until the writes
5323	* have actually been submitted.
5324	*/
5325	atomic_dec(v: &conf->preread_active_stripes);
5326	if (atomic_read(v: &conf->preread_active_stripes) <
5327	IO_THRESHOLD)
5328	md_wakeup_thread(thread: conf->mddev->thread);
5329	}
5330
5331	clear_bit_unlock(nr: STRIPE_ACTIVE, addr: &sh->state);
5332	}
5333
5334	static void raid5_activate_delayed(struct r5conf *conf)
5335	__must_hold(&conf->device_lock)
5336	{
5337	if (atomic_read(v: &conf->preread_active_stripes) < IO_THRESHOLD) {
5338	while (!list_empty(head: &conf->delayed_list)) {
5339	struct list_head *l = conf->delayed_list.next;
5340	struct stripe_head *sh;
5341	sh = list_entry(l, struct stripe_head, lru);
5342	list_del_init(entry: l);
5343	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
5344	if (!test_and_set_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
5345	atomic_inc(v: &conf->preread_active_stripes);
5346	list_add_tail(new: &sh->lru, head: &conf->hold_list);
5347	raid5_wakeup_stripe_thread(sh);
5348	}
5349	}
5350	}
5351
5352	static void activate_bit_delay(struct r5conf *conf,
5353	struct list_head *temp_inactive_list)
5354	__must_hold(&conf->device_lock)
5355	{
5356	struct list_head head;
5357	list_add(new: &head, head: &conf->bitmap_list);
5358	list_del_init(entry: &conf->bitmap_list);
5359	while (!list_empty(head: &head)) {
5360	struct stripe_head sh = list_entry(head.next, struct* stripe_head, lru);
5361	int hash;
5362	list_del_init(entry: &sh->lru);
5363	atomic_inc(v: &sh->count);
5364	hash = sh->hash_lock_index;
5365	__release_stripe(conf, sh, temp_inactive_list: &temp_inactive_list[hash]);
5366	}
5367	}
5368
5369	static int in_chunk_boundary(struct mddev mddev, struct* bio *bio)
5370	{
5371	struct r5conf *conf = mddev->private;
5372	sector_t sector = bio->bi_iter.bi_sector;
5373	unsigned int chunk_sectors;
5374	unsigned int bio_sectors = bio_sectors(bio);
5375
5376	chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5377	return chunk_sectors >=
5378	((sector & (chunk_sectors - `1`)) + bio_sectors);
5379	}
5380
5381	/*
5382	* add bio to the retry LIFO ( in O(1) ... we are in interrupt )
5383	* later sampled by raid5d.
5384	*/
5385	static void add_bio_to_retry(struct bio bi,struct* r5conf *conf)
5386	{
5387	unsigned long flags;
5388
5389	spin_lock_irqsave(&conf->device_lock, flags);
5390
5391	bi->bi_next = conf->retry_read_aligned_list;
5392	conf->retry_read_aligned_list = bi;
5393
5394	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
5395	md_wakeup_thread(thread: conf->mddev->thread);
5396	}
5397
5398	static struct bio remove_bio_from_retry(struct* r5conf *conf,
5399	unsigned int *offset)
5400	{
5401	struct bio *bi;
5402
5403	bi = conf->retry_read_aligned;
5404	if (bi) {
5405	*offset = conf->retry_read_offset;
5406	conf->retry_read_aligned = NULL;
5407	return bi;
5408	}
5409	bi = conf->retry_read_aligned_list;
5410	if(bi) {
5411	conf->retry_read_aligned_list = bi->bi_next;
5412	bi->bi_next = NULL;
5413	*offset = `0`;
5414	}
5415
5416	return bi;
5417	}
5418
5419	/*
5420	* The "raid5_align_endio" should check if the read succeeded and if it
5421	* did, call bio_endio on the original bio (having bio_put the new bio
5422	* first).
5423	* If the read failed..
5424	*/
5425	static void raid5_align_endio(struct bio *bi)
5426	{
5427	struct bio *raid_bi = bi->bi_private;
5428	struct md_rdev rdev = (void* *)raid_bi->bi_next;
5429	struct mddev *mddev = rdev->mddev;
5430	struct r5conf *conf = mddev->private;
5431	blk_status_t error = bi->bi_status;
5432
5433	bio_put(bi);
5434	raid_bi->bi_next = NULL;
5435	rdev_dec_pending(rdev, mddev: conf->mddev);
5436
5437	if (!error) {
5438	bio_endio(raid_bi);
5439	if (atomic_dec_and_test(v: &conf->active_aligned_reads))
5440	wake_up(&conf->wait_for_quiescent);
5441	return;
5442	}
5443
5444	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5445
5446	add_bio_to_retry(bi: raid_bi, conf);
5447	}
5448
5449	static int raid5_read_one_chunk(struct mddev mddev, struct* bio *raid_bio)
5450	{
5451	struct r5conf *conf = mddev->private;
5452	struct bio *align_bio;
5453	struct md_rdev *rdev;
5454	sector_t sector, end_sector;
5455	int dd_idx;
5456	bool did_inc;
5457
5458	if (!in_chunk_boundary(mddev, bio: raid_bio)) {
5459	pr_debug("%s: non aligned\n", __func__);
5460	return `0`;
5461	}
5462
5463	sector = raid5_compute_sector(conf, r_sector: raid_bio->bi_iter.bi_sector, previous: `0`,
5464	dd_idx: &dd_idx, NULL);
5465	end_sector = sector + bio_sectors(raid_bio);
5466
5467	if (r5c_big_stripe_cached(conf, sect: sector))
5468	return `0`;
5469
5470	rdev = conf->disks[dd_idx].replacement;
5471	if (!rdev \|\| test_bit(Faulty, &rdev->flags) \|\|
5472	rdev->recovery_offset < end_sector) {
5473	rdev = conf->disks[dd_idx].rdev;
5474	if (!rdev)
5475	return `0`;
5476	if (test_bit(Faulty, &rdev->flags) \|\|
5477	!(test_bit(In_sync, &rdev->flags) \|\|
5478	rdev->recovery_offset >= end_sector))
5479	return `0`;
5480	}
5481
5482	atomic_inc(v: &rdev->nr_pending);
5483
5484	if (rdev_has_badblock(rdev, s: sector, bio_sectors(raid_bio))) {
5485	rdev_dec_pending(rdev, mddev);
5486	return `0`;
5487	}
5488
5489	md_account_bio(mddev, bio: &raid_bio);
5490	raid_bio->bi_next = (void *)rdev;
5491
5492	align_bio = bio_alloc_clone(bdev: rdev->bdev, bio_src: raid_bio, GFP_NOIO,
5493	bs: &mddev->bio_set);
5494	align_bio->bi_end_io = raid5_align_endio;
5495	align_bio->bi_private = raid_bio;
5496	align_bio->bi_iter.bi_sector = sector;
5497
5498	/ No reshape active, so we can trust rdev->data_offset /
5499	align_bio->bi_iter.bi_sector += rdev->data_offset;
5500
5501	did_inc = false;
5502	if (conf->quiesce == `0`) {
5503	atomic_inc(v: &conf->active_aligned_reads);
5504	did_inc = true;
5505	}
5506	/ need a memory barrier to detect the race with raid5_quiesce() /
5507	if (!did_inc \|\| smp_load_acquire(&conf->quiesce) != `0`) {
5508	/ quiesce is in progress, so we need to undo io activation and wait*
5509	* for it to finish
5510	*/
5511	if (did_inc && atomic_dec_and_test(v: &conf->active_aligned_reads))
5512	wake_up(&conf->wait_for_quiescent);
5513	spin_lock_irq(lock: &conf->device_lock);
5514	wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == `0`,
5515	conf->device_lock);
5516	atomic_inc(v: &conf->active_aligned_reads);
5517	spin_unlock_irq(lock: &conf->device_lock);
5518	}
5519
5520	mddev_trace_remap(mddev, bio: align_bio, sector: raid_bio->bi_iter.bi_sector);
5521	submit_bio_noacct(bio: align_bio);
5522	return `1`;
5523	}
5524
5525	static struct bio chunk_aligned_read(struct* mddev mddev, struct* bio *raid_bio)
5526	{
5527	struct bio *split;
5528	sector_t sector = raid_bio->bi_iter.bi_sector;
5529	unsigned chunk_sects = mddev->chunk_sectors;
5530	unsigned sectors = chunk_sects - (sector & (chunk_sects-`1`));
5531
5532	if (sectors < bio_sectors(raid_bio)) {
5533	struct r5conf *conf = mddev->private;
5534	split = bio_split(bio: raid_bio, sectors, GFP_NOIO, bs: &conf->bio_split);
5535	bio_chain(split, raid_bio);
5536	submit_bio_noacct(bio: raid_bio);
5537	raid_bio = split;
5538	}
5539
5540	if (!raid5_read_one_chunk(mddev, raid_bio))
5541	return raid_bio;
5542
5543	return NULL;
5544	}
5545
5546	/ __get_priority_stripe - get the next stripe to process*
5547	*
5548	* Full stripe writes are allowed to pass preread active stripes up until
5549	* the bypass_threshold is exceeded. In general the bypass_count
5550	* increments when the handle_list is handled before the hold_list; however, it
5551	* will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
5552	* stripe with in flight i/o. The bypass_count will be reset when the
5553	* head of the hold_list has changed, i.e. the head was promoted to the
5554	* handle_list.
5555	*/
5556	static struct stripe_head __get_priority_stripe(struct* r5conf conf, int* group)
5557	__must_hold(&conf->device_lock)
5558	{
5559	struct stripe_head sh, tmp;
5560	struct list_head *handle_list = NULL;
5561	struct r5worker_group *wg;
5562	bool second_try = !r5c_is_writeback(log: conf->log) &&
5563	!r5l_log_disk_error(conf);
5564	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) \|\|
5565	r5l_log_disk_error(conf);
5566
5567	again:
5568	wg = NULL;
5569	sh = NULL;
5570	if (conf->worker_cnt_per_group == `0`) {
5571	handle_list = try_loprio ? &conf->loprio_list :
5572	&conf->handle_list;
5573	} else if (group != ANY_GROUP) {
5574	handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5575	&conf->worker_groups[group].handle_list;
5576	wg = &conf->worker_groups[group];
5577	} else {
5578	int i;
5579	for (i = `0`; i < conf->group_cnt; i++) {
5580	handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5581	&conf->worker_groups[i].handle_list;
5582	wg = &conf->worker_groups[i];
5583	if (!list_empty(head: handle_list))
5584	break;
5585	}
5586	}
5587
5588	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5589	__func__,
5590	list_empty(handle_list) ? "empty" : "busy",
5591	list_empty(&conf->hold_list) ? "empty" : "busy",
5592	atomic_read(&conf->pending_full_writes), conf->bypass_count);
5593
5594	if (!list_empty(head: handle_list)) {
5595	sh = list_entry(handle_list->next, typeof(*sh), lru);
5596
5597	if (list_empty(head: &conf->hold_list))
5598	conf->bypass_count = `0`;
5599	else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5600	if (conf->hold_list.next == conf->last_hold)
5601	conf->bypass_count++;
5602	else {
5603	conf->last_hold = conf->hold_list.next;
5604	conf->bypass_count -= conf->bypass_threshold;
5605	if (conf->bypass_count < `0`)
5606	conf->bypass_count = `0`;
5607	}
5608	}
5609	} else if (!list_empty(head: &conf->hold_list) &&
5610	((conf->bypass_threshold &&
5611	conf->bypass_count > conf->bypass_threshold) \|\|
5612	atomic_read(v: &conf->pending_full_writes) == `0`)) {
5613
5614	list_for_each_entry(tmp, &conf->hold_list, lru) {
5615	if (conf->worker_cnt_per_group == `0` \|\|
5616	group == ANY_GROUP \|\|
5617	!cpu_online(cpu: tmp->cpu) \|\|
5618	cpu_to_group(tmp->cpu) == group) {
5619	sh = tmp;
5620	break;
5621	}
5622	}
5623
5624	if (sh) {
5625	conf->bypass_count -= conf->bypass_threshold;
5626	if (conf->bypass_count < `0`)
5627	conf->bypass_count = `0`;
5628	}
5629	wg = NULL;
5630	}
5631
5632	if (!sh) {
5633	if (second_try)
5634	return NULL;
5635	second_try = true;
5636	try_loprio = !try_loprio;
5637	goto again;
5638	}
5639
5640	if (wg) {
5641	wg->stripes_cnt--;
5642	sh->group = NULL;
5643	}
5644	list_del_init(entry: &sh->lru);
5645	BUG_ON(atomic_inc_return(&sh->count) != `1`);
5646	return sh;
5647	}
5648
/*
 * Per-plug state used by release_stripe_plug() and raid5_unplug().
 * It is allocated through blk_check_plugged() with
 * sizeof(struct raid5_plug_cb) and recovered from the embedded cb with
 * container_of().
 */
struct raid5_plug_cb {
	/* generic plug callback; cb.data carries the mddev */
	struct blk_plug_cb cb;
	/*
	 * stripes queued while plugged, linked through sh->lru;
	 * list.next == NULL marks a callback that is not initialised yet
	 */
	struct list_head list;
	/* one list per stripe hash lock, indexed by sh->hash_lock_index */
	struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
};
5654
5655	static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5656	{
5657	struct raid5_plug_cb *cb = container_of(
5658	blk_cb, struct raid5_plug_cb, cb);
5659	struct stripe_head *sh;
5660	struct mddev *mddev = cb->cb.data;
5661	struct r5conf *conf = mddev->private;
5662	int cnt = `0`;
5663	int hash;
5664
5665	if (cb->list.next && !list_empty(head: &cb->list)) {
5666	spin_lock_irq(lock: &conf->device_lock);
5667	while (!list_empty(head: &cb->list)) {
5668	sh = list_first_entry(&cb->list, struct stripe_head, lru);
5669	list_del_init(entry: &sh->lru);
5670	/*
5671	* avoid race release_stripe_plug() sees
5672	* STRIPE_ON_UNPLUG_LIST clear but the stripe
5673	* is still in our list
5674	*/
5675	smp_mb__before_atomic();
5676	clear_bit(nr: STRIPE_ON_UNPLUG_LIST, addr: &sh->state);
5677	/*
5678	* STRIPE_ON_RELEASE_LIST could be set here. In that
5679	* case, the count is always > 1 here
5680	*/
5681	hash = sh->hash_lock_index;
5682	__release_stripe(conf, sh, temp_inactive_list: &cb->temp_inactive_list[hash]);
5683	cnt++;
5684	}
5685	spin_unlock_irq(lock: &conf->device_lock);
5686	}
5687	release_inactive_stripe_list(conf, temp_inactive_list: cb->temp_inactive_list,
5688	NR_STRIPE_HASH_LOCKS);
5689	if (!mddev_is_dm(mddev))
5690	trace_block_unplug(q: mddev->gendisk->queue, depth: cnt, explicit: !from_schedule);
5691	kfree(objp: cb);
5692	}
5693
5694	static void release_stripe_plug(struct mddev *mddev,
5695	struct stripe_head *sh)
5696	{
5697	struct blk_plug_cb *blk_cb = blk_check_plugged(
5698	unplug: raid5_unplug, data: mddev,
5699	size: sizeof(struct raid5_plug_cb));
5700	struct raid5_plug_cb *cb;
5701
5702	if (!blk_cb) {
5703	raid5_release_stripe(sh);
5704	return;
5705	}
5706
5707	cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5708
5709	if (cb->list.next == NULL) {
5710	int i;
5711	INIT_LIST_HEAD(list: &cb->list);
5712	for (i = `0`; i < NR_STRIPE_HASH_LOCKS; i++)
5713	INIT_LIST_HEAD(list: cb->temp_inactive_list + i);
5714	}
5715
5716	if (!test_and_set_bit(nr: STRIPE_ON_UNPLUG_LIST, addr: &sh->state))
5717	list_add_tail(new: &sh->lru, head: &cb->list);
5718	else
5719	raid5_release_stripe(sh);
5720	}
5721
5722	static void make_discard_request(struct mddev mddev, struct* bio *bi)
5723	{
5724	struct r5conf *conf = mddev->private;
5725	sector_t logical_sector, last_sector;
5726	struct stripe_head *sh;
5727	int stripe_sectors;
5728
5729	/ We need to handle this when io_uring supports discard/trim /
5730	if (WARN_ON_ONCE(bi->bi_opf & REQ_NOWAIT))
5731	return;
5732
5733	if (mddev->reshape_position != MaxSector)
5734	/ Skip discard while reshape is happening /
5735	return;
5736
5737	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-`1`);
5738	last_sector = bio_end_sector(bi);
5739
5740	bi->bi_next = NULL;
5741
5742	stripe_sectors = conf->chunk_sectors *
5743	(conf->raid_disks - conf->max_degraded);
5744	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5745	stripe_sectors);
5746	sector_div(last_sector, stripe_sectors);
5747
5748	logical_sector *= conf->chunk_sectors;
5749	last_sector *= conf->chunk_sectors;
5750
5751	for (; logical_sector < last_sector;
5752	logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5753	DEFINE_WAIT(w);
5754	int d;
5755	again:
5756	sh = raid5_get_active_stripe(conf, NULL, sector: logical_sector, flags: `0`);
5757	prepare_to_wait(wq_head: &conf->wait_for_overlap, wq_entry: &w,
5758	TASK_UNINTERRUPTIBLE);
5759	set_bit(nr: R5_Overlap, addr: &sh->dev[sh->pd_idx].flags);
5760	if (test_bit(STRIPE_SYNCING, &sh->state)) {
5761	raid5_release_stripe(sh);
5762	schedule();
5763	goto again;
5764	}
5765	clear_bit(nr: R5_Overlap, addr: &sh->dev[sh->pd_idx].flags);
5766	spin_lock_irq(lock: &sh->stripe_lock);
5767	for (d = `0`; d < conf->raid_disks; d++) {
5768	if (d == sh->pd_idx \|\| d == sh->qd_idx)
5769	continue;
5770	if (sh->dev[d].towrite \|\| sh->dev[d].toread) {
5771	set_bit(nr: R5_Overlap, addr: &sh->dev[d].flags);
5772	spin_unlock_irq(lock: &sh->stripe_lock);
5773	raid5_release_stripe(sh);
5774	schedule();
5775	goto again;
5776	}
5777	}
5778	set_bit(nr: STRIPE_DISCARD, addr: &sh->state);
5779	finish_wait(wq_head: &conf->wait_for_overlap, wq_entry: &w);
5780	sh->overwrite_disks = `0`;
5781	for (d = `0`; d < conf->raid_disks; d++) {
5782	if (d == sh->pd_idx \|\| d == sh->qd_idx)
5783	continue;
5784	sh->dev[d].towrite = bi;
5785	set_bit(nr: R5_OVERWRITE, addr: &sh->dev[d].flags);
5786	bio_inc_remaining(bio: bi);
5787	md_write_inc(mddev, bi);
5788	sh->overwrite_disks++;
5789	}
5790	spin_unlock_irq(lock: &sh->stripe_lock);
5791	if (conf->mddev->bitmap) {
5792	for (d = `0`;
5793	d < conf->raid_disks - conf->max_degraded;
5794	d++)
5795	md_bitmap_startwrite(bitmap: mddev->bitmap,
5796	offset: sh->sector,
5797	RAID5_STRIPE_SECTORS(conf),
5798	behind: `0`);
5799	sh->bm_seq = conf->seq_flush + `1`;
5800	set_bit(nr: STRIPE_BIT_DELAY, addr: &sh->state);
5801	}
5802
5803	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
5804	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
5805	if (!test_and_set_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
5806	atomic_inc(v: &conf->preread_active_stripes);
5807	release_stripe_plug(mddev, sh);
5808	}
5809
5810	bio_endio(bi);
5811	}
5812
5813	static bool ahead_of_reshape(struct mddev *mddev, sector_t sector,
5814	sector_t reshape_sector)
5815	{
5816	return mddev->reshape_backwards ? sector < reshape_sector :
5817	sector >= reshape_sector;
5818	}
5819
5820	static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min,
5821	sector_t max, sector_t reshape_sector)
5822	{
5823	return mddev->reshape_backwards ? max < reshape_sector :
5824	min >= reshape_sector;
5825	}
5826
5827	static bool stripe_ahead_of_reshape(struct mddev mddev, struct* r5conf *conf,
5828	struct stripe_head *sh)
5829	{
5830	sector_t max_sector = `0`, min_sector = MaxSector;
5831	bool ret = false;
5832	int dd_idx;
5833
5834	for (dd_idx = `0`; dd_idx < sh->disks; dd_idx++) {
5835	if (dd_idx == sh->pd_idx \|\| dd_idx == sh->qd_idx)
5836	continue;
5837
5838	min_sector = min(min_sector, sh->dev[dd_idx].sector);
5839	max_sector = max(max_sector, sh->dev[dd_idx].sector);
5840	}
5841
5842	spin_lock_irq(lock: &conf->device_lock);
5843
5844	if (!range_ahead_of_reshape(mddev, min: min_sector, max: max_sector,
5845	reshape_sector: conf->reshape_progress))
5846	/ mismatch, need to try again /
5847	ret = true;
5848
5849	spin_unlock_irq(lock: &conf->device_lock);
5850
5851	return ret;
5852	}
5853
5854	static int add_all_stripe_bios(struct r5conf *conf,
5855	struct stripe_request_ctx ctx, struct* stripe_head *sh,
5856	struct bio bi, int* forwrite, int previous)
5857	{
5858	int dd_idx;
5859	int ret = `1`;
5860
5861	spin_lock_irq(lock: &sh->stripe_lock);
5862
5863	for (dd_idx = `0`; dd_idx < sh->disks; dd_idx++) {
5864	struct r5dev *dev = &sh->dev[dd_idx];
5865
5866	if (dd_idx == sh->pd_idx \|\| dd_idx == sh->qd_idx)
5867	continue;
5868
5869	if (dev->sector < ctx->first_sector \|\|
5870	dev->sector >= ctx->last_sector)
5871	continue;
5872
5873	if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
5874	set_bit(nr: R5_Overlap, addr: &dev->flags);
5875	ret = `0`;
5876	continue;
5877	}
5878	}
5879
5880	if (!ret)
5881	goto out;
5882
5883	for (dd_idx = `0`; dd_idx < sh->disks; dd_idx++) {
5884	struct r5dev *dev = &sh->dev[dd_idx];
5885
5886	if (dd_idx == sh->pd_idx \|\| dd_idx == sh->qd_idx)
5887	continue;
5888
5889	if (dev->sector < ctx->first_sector \|\|
5890	dev->sector >= ctx->last_sector)
5891	continue;
5892
5893	__add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
5894	clear_bit(nr: (dev->sector - ctx->first_sector) >>
5895	RAID5_STRIPE_SHIFT(conf), addr: ctx->sectors_to_do);
5896	}
5897
5898	out:
5899	spin_unlock_irq(lock: &sh->stripe_lock);
5900	return ret;
5901	}
5902
5903	static enum stripe_result make_stripe_request(struct mddev *mddev,
5904	struct r5conf conf, struct* stripe_request_ctx *ctx,
5905	sector_t logical_sector, struct bio *bi)
5906	{
5907	const int rw = bio_data_dir(bi);
5908	enum stripe_result ret;
5909	struct stripe_head *sh;
5910	sector_t new_sector;
5911	int previous = `0`, flags = `0`;
5912	int seq, dd_idx;
5913
5914	seq = read_seqcount_begin(&conf->gen_lock);
5915
5916	if (unlikely(conf->reshape_progress != MaxSector)) {
5917	/*
5918	* Spinlock is needed as reshape_progress may be
5919	* 64bit on a 32bit platform, and so it might be
5920	* possible to see a half-updated value
5921	* Of course reshape_progress could change after
5922	* the lock is dropped, so once we get a reference
5923	* to the stripe that we think it is, we will have
5924	* to check again.
5925	*/
5926	spin_lock_irq(lock: &conf->device_lock);
5927	if (ahead_of_reshape(mddev, sector: logical_sector,
5928	reshape_sector: conf->reshape_progress)) {
5929	previous = `1`;
5930	} else {
5931	if (ahead_of_reshape(mddev, sector: logical_sector,
5932	reshape_sector: conf->reshape_safe)) {
5933	spin_unlock_irq(lock: &conf->device_lock);
5934	ret = STRIPE_SCHEDULE_AND_RETRY;
5935	goto out;
5936	}
5937	}
5938	spin_unlock_irq(lock: &conf->device_lock);
5939	}
5940
5941	new_sector = raid5_compute_sector(conf, r_sector: logical_sector, previous,
5942	dd_idx: &dd_idx, NULL);
5943	pr_debug("raid456: %s, sector %llu logical %llu\n", __func__,
5944	new_sector, logical_sector);
5945
5946	if (previous)
5947	flags \|= R5_GAS_PREVIOUS;
5948	if (bi->bi_opf & REQ_RAHEAD)
5949	flags \|= R5_GAS_NOBLOCK;
5950	sh = raid5_get_active_stripe(conf, ctx, sector: new_sector, flags);
5951	if (unlikely(!sh)) {
5952	/ cannot get stripe, just give-up /
5953	bi->bi_status = BLK_STS_IOERR;
5954	return STRIPE_FAIL;
5955	}
5956
5957	if (unlikely(previous) &&
5958	stripe_ahead_of_reshape(mddev, conf, sh)) {
5959	/*
5960	* Expansion moved on while waiting for a stripe.
5961	* Expansion could still move past after this
5962	* test, but as we are holding a reference to
5963	* 'sh', we know that if that happens,
5964	* STRIPE_EXPANDING will get set and the expansion
5965	* won't proceed until we finish with the stripe.
5966	*/
5967	ret = STRIPE_SCHEDULE_AND_RETRY;
5968	goto out_release;
5969	}
5970
5971	if (read_seqcount_retry(&conf->gen_lock, seq)) {
5972	/ Might have got the wrong stripe_head by accident /
5973	ret = STRIPE_RETRY;
5974	goto out_release;
5975	}
5976
5977	if (test_bit(STRIPE_EXPANDING, &sh->state) \|\|
5978	!add_all_stripe_bios(conf, ctx, sh, bi, forwrite: rw, previous)) {
5979	/*
5980	* Stripe is busy expanding or add failed due to
5981	* overlap. Flush everything and wait a while.
5982	*/
5983	md_wakeup_thread(thread: mddev->thread);
5984	ret = STRIPE_SCHEDULE_AND_RETRY;
5985	goto out_release;
5986	}
5987
5988	if (stripe_can_batch(sh)) {
5989	stripe_add_to_batch_list(conf, sh, last_sh: ctx->batch_last);
5990	if (ctx->batch_last)
5991	raid5_release_stripe(sh: ctx->batch_last);
5992	atomic_inc(v: &sh->count);
5993	ctx->batch_last = sh;
5994	}
5995
5996	if (ctx->do_flush) {
5997	set_bit(nr: STRIPE_R5C_PREFLUSH, addr: &sh->state);
5998	/ we only need flush for one stripe /
5999	ctx->do_flush = false;
6000	}
6001
6002	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
6003	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
6004	if ((!sh->batch_head \|\| sh == sh->batch_head) &&
6005	(bi->bi_opf & REQ_SYNC) &&
6006	!test_and_set_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
6007	atomic_inc(v: &conf->preread_active_stripes);
6008
6009	release_stripe_plug(mddev, sh);
6010	return STRIPE_SUCCESS;
6011
6012	out_release:
6013	raid5_release_stripe(sh);
6014	out:
6015	if (ret == STRIPE_SCHEDULE_AND_RETRY && reshape_interrupted(mddev)) {
6016	bi->bi_status = BLK_STS_RESOURCE;
6017	ret = STRIPE_WAIT_RESHAPE;
6018	pr_err_ratelimited("dm-raid456: io across reshape position while reshape can't make progress");
6019	}
6020	return ret;
6021	}
6022
6023	/*
6024	* If the bio covers multiple data disks, find sector within the bio that has
6025	* the lowest chunk offset in the first chunk.
6026	*/
6027	static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
6028	struct bio *bi)
6029	{
6030	int sectors_per_chunk = conf->chunk_sectors;
6031	int raid_disks = conf->raid_disks;
6032	int dd_idx;
6033	struct stripe_head sh;
6034	unsigned int chunk_offset;
6035	sector_t r_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-`1`);
6036	sector_t sector;
6037
6038	/ We pass in fake stripe_head to get back parity disk numbers /
6039	sector = raid5_compute_sector(conf, r_sector, previous: `0`, dd_idx: &dd_idx, sh: &sh);
6040	chunk_offset = sector_div(sector, sectors_per_chunk);
6041	if (sectors_per_chunk - chunk_offset >= bio_sectors(bi))
6042	return r_sector;
6043	/*
6044	* Bio crosses to the next data disk. Check whether it's in the same
6045	* chunk.
6046	*/
6047	dd_idx++;
6048	while (dd_idx == sh.pd_idx \|\| dd_idx == sh.qd_idx)
6049	dd_idx++;
6050	if (dd_idx >= raid_disks)
6051	return r_sector;
6052	return r_sector + sectors_per_chunk - chunk_offset;
6053	}
6054
6055	static bool raid5_make_request(struct mddev mddev, struct* bio * bi)
6056	{
6057	DEFINE_WAIT_FUNC(wait, woken_wake_function);
6058	struct r5conf *conf = mddev->private;
6059	sector_t logical_sector;
6060	struct stripe_request_ctx ctx = {};
6061	const int rw = bio_data_dir(bi);
6062	enum stripe_result res;
6063	int s, stripe_cnt;
6064
6065	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
6066	int ret = log_handle_flush_request(conf, bio: bi);
6067
6068	if (ret == `0`)
6069	return true;
6070	if (ret == -ENODEV) {
6071	if (md_flush_request(mddev, bio: bi))
6072	return true;
6073	}
6074	/ ret == -EAGAIN, fallback /
6075	/*
6076	* if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
6077	* we need to flush journal device
6078	*/
6079	ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
6080	}
6081
6082	if (!md_write_start(mddev, bi))
6083	return false;
6084	/*
6085	* If array is degraded, better not do chunk aligned read because
6086	* later we might have to read it again in order to reconstruct
6087	* data on failed drives.
6088	*/
6089	if (rw == READ && mddev->degraded == `0` &&
6090	mddev->reshape_position == MaxSector) {
6091	bi = chunk_aligned_read(mddev, raid_bio: bi);
6092	if (!bi)
6093	return true;
6094	}
6095
6096	if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
6097	make_discard_request(mddev, bi);
6098	md_write_end(mddev);
6099	return true;
6100	}
6101
6102	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-`1`);
6103	ctx.first_sector = logical_sector;
6104	ctx.last_sector = bio_end_sector(bi);
6105	bi->bi_next = NULL;
6106
6107	stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector,
6108	RAID5_STRIPE_SECTORS(conf));
6109	bitmap_set(map: ctx.sectors_to_do, start: `0`, nbits: stripe_cnt);
6110
6111	pr_debug("raid456: %s, logical %llu to %llu\n", __func__,
6112	bi->bi_iter.bi_sector, ctx.last_sector);
6113
6114	/ Bail out if conflicts with reshape and REQ_NOWAIT is set /
6115	if ((bi->bi_opf & REQ_NOWAIT) &&
6116	(conf->reshape_progress != MaxSector) &&
6117	!ahead_of_reshape(mddev, sector: logical_sector, reshape_sector: conf->reshape_progress) &&
6118	ahead_of_reshape(mddev, sector: logical_sector, reshape_sector: conf->reshape_safe)) {
6119	bio_wouldblock_error(bio: bi);
6120	if (rw == WRITE)
6121	md_write_end(mddev);
6122	return true;
6123	}
6124	md_account_bio(mddev, bio: &bi);
6125
6126	/*
6127	* Lets start with the stripe with the lowest chunk offset in the first
6128	* chunk. That has the best chances of creating IOs adjacent to
6129	* previous IOs in case of sequential IO and thus creates the most
6130	* sequential IO pattern. We don't bother with the optimization when
6131	* reshaping as the performance benefit is not worth the complexity.
6132	*/
6133	if (likely(conf->reshape_progress == MaxSector))
6134	logical_sector = raid5_bio_lowest_chunk_sector(conf, bi);
6135	s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
6136
6137	add_wait_queue(wq_head: &conf->wait_for_overlap, wq_entry: &wait);
6138	while (`1`) {
6139	res = make_stripe_request(mddev, conf, ctx: &ctx, logical_sector,
6140	bi);
6141	if (res == STRIPE_FAIL \|\| res == STRIPE_WAIT_RESHAPE)
6142	break;
6143
6144	if (res == STRIPE_RETRY)
6145	continue;
6146
6147	if (res == STRIPE_SCHEDULE_AND_RETRY) {
6148	/*
6149	* Must release the reference to batch_last before
6150	* scheduling and waiting for work to be done,
6151	* otherwise the batch_last stripe head could prevent
6152	* raid5_activate_delayed() from making progress
6153	* and thus deadlocking.
6154	*/
6155	if (ctx.batch_last) {
6156	raid5_release_stripe(sh: ctx.batch_last);
6157	ctx.batch_last = NULL;
6158	}
6159
6160	wait_woken(wq_entry: &wait, TASK_UNINTERRUPTIBLE,
6161	MAX_SCHEDULE_TIMEOUT);
6162	continue;
6163	}
6164
6165	s = find_next_bit_wrap(addr: ctx.sectors_to_do, size: stripe_cnt, offset: s);
6166	if (s == stripe_cnt)
6167	break;
6168
6169	logical_sector = ctx.first_sector +
6170	(s << RAID5_STRIPE_SHIFT(conf));
6171	}
6172	remove_wait_queue(wq_head: &conf->wait_for_overlap, wq_entry: &wait);
6173
6174	if (ctx.batch_last)
6175	raid5_release_stripe(sh: ctx.batch_last);
6176
6177	if (rw == WRITE)
6178	md_write_end(mddev);
6179	if (res == STRIPE_WAIT_RESHAPE) {
6180	md_free_cloned_bio(bio: bi);
6181	return false;
6182	}
6183
6184	bio_endio(bi);
6185	return true;
6186	}
6187
6188	static sector_t raid5_size(struct mddev mddev, sector_t sectors, int* raid_disks);
6189
6190	static sector_t reshape_request(struct mddev mddev, sector_t sector_nr, int* *skipped)
6191	{
6192	/ reshaping is quite different to recovery/resync so it is*
6193	* handled quite separately ... here.
6194	*
6195	* On each call to sync_request, we gather one chunk worth of
6196	* destination stripes and flag them as expanding.
6197	* Then we find all the source stripes and request reads.
6198	* As the reads complete, handle_stripe will copy the data
6199	* into the destination stripe and release that stripe.
6200	*/
6201	struct r5conf *conf = mddev->private;
6202	struct stripe_head *sh;
6203	struct md_rdev *rdev;
6204	sector_t first_sector, last_sector;
6205	int raid_disks = conf->previous_raid_disks;
6206	int data_disks = raid_disks - conf->max_degraded;
6207	int new_data_disks = conf->raid_disks - conf->max_degraded;
6208	int i;
6209	int dd_idx;
6210	sector_t writepos, readpos, safepos;
6211	sector_t stripe_addr;
6212	int reshape_sectors;
6213	struct list_head stripes;
6214	sector_t retn;
6215
6216	if (sector_nr == `0`) {
6217	/ If restarting in the middle, skip the initial sectors /
6218	if (mddev->reshape_backwards &&
6219	conf->reshape_progress < raid5_size(mddev, sectors: `0`, raid_disks: `0`)) {
6220	sector_nr = raid5_size(mddev, sectors: `0`, raid_disks: `0`)
6221	- conf->reshape_progress;
6222	} else if (mddev->reshape_backwards &&
6223	conf->reshape_progress == MaxSector) {
6224	/ shouldn't happen, but just in case, finish up./
6225	sector_nr = MaxSector;
6226	} else if (!mddev->reshape_backwards &&
6227	conf->reshape_progress > `0`)
6228	sector_nr = conf->reshape_progress;
6229	sector_div(sector_nr, new_data_disks);
6230	if (sector_nr) {
6231	mddev->curr_resync_completed = sector_nr;
6232	sysfs_notify_dirent_safe(sd: mddev->sysfs_completed);
6233	*skipped = `1`;
6234	retn = sector_nr;
6235	goto finish;
6236	}
6237	}
6238
6239	/ We need to process a full chunk at a time.*
6240	* If old and new chunk sizes differ, we need to process the
6241	* largest of these
6242	*/
6243
6244	reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
6245
6246	/ We update the metadata at least every 10 seconds, or when*
6247	* the data about to be copied would over-write the source of
6248	* the data at the front of the range. i.e. one new_stripe
6249	* along from reshape_progress new_maps to after where
6250	* reshape_safe old_maps to
6251	*/
6252	writepos = conf->reshape_progress;
6253	sector_div(writepos, new_data_disks);
6254	readpos = conf->reshape_progress;
6255	sector_div(readpos, data_disks);
6256	safepos = conf->reshape_safe;
6257	sector_div(safepos, data_disks);
6258	if (mddev->reshape_backwards) {
6259	BUG_ON(writepos < reshape_sectors);
6260	writepos -= reshape_sectors;
6261	readpos += reshape_sectors;
6262	safepos += reshape_sectors;
6263	} else {
6264	writepos += reshape_sectors;
6265	/ readpos and safepos are worst-case calculations.*
6266	* A negative number is overly pessimistic, and causes
6267	* obvious problems for unsigned storage. So clip to 0.
6268	*/
6269	readpos -= min_t(sector_t, reshape_sectors, readpos);
6270	safepos -= min_t(sector_t, reshape_sectors, safepos);
6271	}
6272
6273	/ Having calculated the 'writepos' possibly use it*
6274	* to set 'stripe_addr' which is where we will write to.
6275	*/
6276	if (mddev->reshape_backwards) {
6277	BUG_ON(conf->reshape_progress == `0`);
6278	stripe_addr = writepos;
6279	BUG_ON((mddev->dev_sectors &
6280	~((sector_t)reshape_sectors - `1`))
6281	- reshape_sectors - stripe_addr
6282	!= sector_nr);
6283	} else {
6284	BUG_ON(writepos != sector_nr + reshape_sectors);
6285	stripe_addr = sector_nr;
6286	}
6287
6288	/ 'writepos' is the most advanced device address we might write.*
6289	* 'readpos' is the least advanced device address we might read.
6290	* 'safepos' is the least address recorded in the metadata as having
6291	* been reshaped.
6292	* If there is a min_offset_diff, these are adjusted either by
6293	* increasing the safepos/readpos if diff is negative, or
6294	* increasing writepos if diff is positive.
6295	* If 'readpos' is then behind 'writepos', there is no way that we can
6296	* ensure safety in the face of a crash - that must be done by userspace
6297	* making a backup of the data. So in that case there is no particular
6298	* rush to update metadata.
6299	* Otherwise if 'safepos' is behind 'writepos', then we really need to
6300	* update the metadata to advance 'safepos' to match 'readpos' so that
6301	* we can be safe in the event of a crash.
6302	* So we insist on updating metadata if safepos is behind writepos and
6303	* readpos is beyond writepos.
6304	* In any case, update the metadata every 10 seconds.
6305	* Maybe that number should be configurable, but I'm not sure it is
6306	* worth it.... maybe it could be a multiple of safemode_delay???
6307	*/
6308	if (conf->min_offset_diff < `0`) {
6309	safepos += -conf->min_offset_diff;
6310	readpos += -conf->min_offset_diff;
6311	} else
6312	writepos += conf->min_offset_diff;
6313
6314	if ((mddev->reshape_backwards
6315	? (safepos > writepos && readpos < writepos)
6316	: (safepos < writepos && readpos > writepos)) \|\|
6317	time_after(jiffies, conf->reshape_checkpoint + `10`*HZ)) {
6318	/ Cannot proceed until we've updated the superblock... /
6319	wait_event(conf->wait_for_overlap,
6320	atomic_read(&conf->reshape_stripes)==`0`
6321	\|\| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6322	if (atomic_read(v: &conf->reshape_stripes) != `0`)
6323	return `0`;
6324	mddev->reshape_position = conf->reshape_progress;
6325	mddev->curr_resync_completed = sector_nr;
6326	if (!mddev->reshape_backwards)
6327	/ Can update recovery_offset /
6328	rdev_for_each(rdev, mddev)
6329	if (rdev->raid_disk >= `0` &&
6330	!test_bit(Journal, &rdev->flags) &&
6331	!test_bit(In_sync, &rdev->flags) &&
6332	rdev->recovery_offset < sector_nr)
6333	rdev->recovery_offset = sector_nr;
6334
6335	conf->reshape_checkpoint = jiffies;
6336	set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags);
6337	md_wakeup_thread(thread: mddev->thread);
6338	wait_event(mddev->sb_wait, mddev->sb_flags == `0` \|\|
6339	test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6340	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6341	return `0`;
6342	spin_lock_irq(lock: &conf->device_lock);
6343	conf->reshape_safe = mddev->reshape_position;
6344	spin_unlock_irq(lock: &conf->device_lock);
6345	wake_up(&conf->wait_for_overlap);
6346	sysfs_notify_dirent_safe(sd: mddev->sysfs_completed);
6347	}
6348
6349	INIT_LIST_HEAD(list: &stripes);
6350	for (i = `0`; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
6351	int j;
6352	int skipped_disk = `0`;
6353	sh = raid5_get_active_stripe(conf, NULL, sector: stripe_addr+i,
6354	R5_GAS_NOQUIESCE);
6355	set_bit(nr: STRIPE_EXPANDING, addr: &sh->state);
6356	atomic_inc(v: &conf->reshape_stripes);
6357	/ If any of this stripe is beyond the end of the old*
6358	* array, then we need to zero those blocks
6359	*/
6360	for (j=sh->disks; j--;) {
6361	sector_t s;
6362	if (j == sh->pd_idx)
6363	continue;
6364	if (conf->level == `6` &&
6365	j == sh->qd_idx)
6366	continue;
6367	s = raid5_compute_blocknr(sh, i: j, previous: `0`);
6368	if (s < raid5_size(mddev, sectors: `0`, raid_disks: `0`)) {
6369	skipped_disk = `1`;
6370	continue;
6371	}
6372	memset(page_address(sh->dev[j].page), `0`, RAID5_STRIPE_SIZE(conf));
6373	set_bit(nr: R5_Expanded, addr: &sh->dev[j].flags);
6374	set_bit(nr: R5_UPTODATE, addr: &sh->dev[j].flags);
6375	}
6376	if (!skipped_disk) {
6377	set_bit(nr: STRIPE_EXPAND_READY, addr: &sh->state);
6378	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
6379	}
6380	list_add(new: &sh->lru, head: &stripes);
6381	}
6382	spin_lock_irq(lock: &conf->device_lock);
6383	if (mddev->reshape_backwards)
6384	conf->reshape_progress -= reshape_sectors * new_data_disks;
6385	else
6386	conf->reshape_progress += reshape_sectors * new_data_disks;
6387	spin_unlock_irq(lock: &conf->device_lock);
6388	/ Ok, those stripe are ready. We can start scheduling*
6389	* reads on the source stripes.
6390	* The source stripes are determined by mapping the first and last
6391	* block on the destination stripes.
6392	*/
6393	first_sector =
6394	raid5_compute_sector(conf, r_sector: stripe_addr*(new_data_disks),
6395	previous: `1`, dd_idx: &dd_idx, NULL);
6396	last_sector =
6397	raid5_compute_sector(conf, r_sector: ((stripe_addr+reshape_sectors)
6398	* new_data_disks - `1`),
6399	previous: `1`, dd_idx: &dd_idx, NULL);
6400	if (last_sector >= mddev->dev_sectors)
6401	last_sector = mddev->dev_sectors - `1`;
6402	while (first_sector <= last_sector) {
6403	sh = raid5_get_active_stripe(conf, NULL, sector: first_sector,
6404	R5_GAS_PREVIOUS \| R5_GAS_NOQUIESCE);
6405	set_bit(nr: STRIPE_EXPAND_SOURCE, addr: &sh->state);
6406	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
6407	raid5_release_stripe(sh);
6408	first_sector += RAID5_STRIPE_SECTORS(conf);
6409	}
6410	/ Now that the sources are clearly marked, we can release*
6411	* the destination stripes
6412	*/
6413	while (!list_empty(head: &stripes)) {
6414	sh = list_entry(stripes.next, struct stripe_head, lru);
6415	list_del_init(entry: &sh->lru);
6416	raid5_release_stripe(sh);
6417	}
6418	/ If this takes us to the resync_max point where we have to pause,*
6419	* then we need to write out the superblock.
6420	*/
6421	sector_nr += reshape_sectors;
6422	retn = reshape_sectors;
6423	finish:
6424	if (mddev->curr_resync_completed > mddev->resync_max \|\|
6425	(sector_nr - mddev->curr_resync_completed) * `2`
6426	>= mddev->resync_max - mddev->curr_resync_completed) {
6427	/ Cannot proceed until we've updated the superblock... /
6428	wait_event(conf->wait_for_overlap,
6429	atomic_read(&conf->reshape_stripes) == `0`
6430	\|\| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6431	if (atomic_read(v: &conf->reshape_stripes) != `0`)
6432	goto ret;
6433	mddev->reshape_position = conf->reshape_progress;
6434	mddev->curr_resync_completed = sector_nr;
6435	if (!mddev->reshape_backwards)
6436	/ Can update recovery_offset /
6437	rdev_for_each(rdev, mddev)
6438	if (rdev->raid_disk >= `0` &&
6439	!test_bit(Journal, &rdev->flags) &&
6440	!test_bit(In_sync, &rdev->flags) &&
6441	rdev->recovery_offset < sector_nr)
6442	rdev->recovery_offset = sector_nr;
6443	conf->reshape_checkpoint = jiffies;
6444	set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags);
6445	md_wakeup_thread(thread: mddev->thread);
6446	wait_event(mddev->sb_wait,
6447	!test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
6448	\|\| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6449	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6450	goto ret;
6451	spin_lock_irq(lock: &conf->device_lock);
6452	conf->reshape_safe = mddev->reshape_position;
6453	spin_unlock_irq(lock: &conf->device_lock);
6454	wake_up(&conf->wait_for_overlap);
6455	sysfs_notify_dirent_safe(sd: mddev->sysfs_completed);
6456	}
6457	ret:
6458	return retn;
6459	}
6460
6461	static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6462	int *skipped)
6463	{
6464	struct r5conf *conf = mddev->private;
6465	struct stripe_head *sh;
6466	sector_t max_sector = mddev->dev_sectors;
6467	sector_t sync_blocks;
6468	int still_degraded = `0`;
6469	int i;
6470
6471	if (sector_nr >= max_sector) {
6472	/ just being told to finish up .. nothing much to do /
6473
6474	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6475	end_reshape(conf);
6476	return `0`;
6477	}
6478
6479	if (mddev->curr_resync < max_sector) / aborted /
6480	md_bitmap_end_sync(bitmap: mddev->bitmap, offset: mddev->curr_resync,
6481	blocks: &sync_blocks, aborted: `1`);
6482	else / completed sync /
6483	conf->fullsync = `0`;
6484	md_bitmap_close_sync(bitmap: mddev->bitmap);
6485
6486	return `0`;
6487	}
6488
6489	/ Allow raid5_quiesce to complete /
6490	wait_event(conf->wait_for_overlap, conf->quiesce != `2`);
6491
6492	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6493	return reshape_request(mddev, sector_nr, skipped);
6494
6495	/ No need to check resync_max as we never do more than one*
6496	* stripe, and as resync_max will always be on a chunk boundary,
6497	* if the check in md_do_sync didn't fire, there is no chance
6498	* of overstepping resync_max here
6499	*/
6500
6501	/ if there is too many failed drives and we are trying*
6502	* to resync, then assert that we are finished, because there is
6503	* nothing we can do.
6504	*/
6505	if (mddev->degraded >= conf->max_degraded &&
6506	test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6507	sector_t rv = mddev->dev_sectors - sector_nr;
6508	*skipped = `1`;
6509	return rv;
6510	}
6511	if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6512	!conf->fullsync &&
6513	!md_bitmap_start_sync(bitmap: mddev->bitmap, offset: sector_nr, blocks: &sync_blocks, degraded: `1`) &&
6514	sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
6515	/ we can skip this block, and probably more /
6516	do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
6517	*skipped = `1`;
6518	/ keep things rounded to whole stripes /
6519	return sync_blocks * RAID5_STRIPE_SECTORS(conf);
6520	}
6521
6522	md_bitmap_cond_end_sync(bitmap: mddev->bitmap, sector: sector_nr, force: false);
6523
6524	sh = raid5_get_active_stripe(conf, NULL, sector: sector_nr,
6525	R5_GAS_NOBLOCK);
6526	if (sh == NULL) {
6527	sh = raid5_get_active_stripe(conf, NULL, sector: sector_nr, flags: `0`);
6528	/ make sure we don't swamp the stripe cache if someone else*
6529	* is trying to get access
6530	*/
6531	schedule_timeout_uninterruptible(timeout: `1`);
6532	}
6533	/ Need to check if array will still be degraded after recovery/resync*
6534	* Note in case of > 1 drive failures it's possible we're rebuilding
6535	* one drive while leaving another faulty drive in array.
6536	*/
6537	for (i = `0`; i < conf->raid_disks; i++) {
6538	struct md_rdev *rdev = conf->disks[i].rdev;
6539
6540	if (rdev == NULL \|\| test_bit(Faulty, &rdev->flags))
6541	still_degraded = `1`;
6542	}
6543
6544	md_bitmap_start_sync(bitmap: mddev->bitmap, offset: sector_nr, blocks: &sync_blocks, degraded: still_degraded);
6545
6546	set_bit(nr: STRIPE_SYNC_REQUESTED, addr: &sh->state);
6547	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
6548
6549	raid5_release_stripe(sh);
6550
6551	return RAID5_STRIPE_SECTORS(conf);
6552	}
6553
6554	static int retry_aligned_read(struct r5conf conf, struct* bio *raid_bio,
6555	unsigned int offset)
6556	{
6557	/ We may not be able to submit a whole bio at once as there*
6558	* may not be enough stripe_heads available.
6559	* We cannot pre-allocate enough stripe_heads as we may need
6560	* more than exist in the cache (if we allow ever large chunks).
6561	* So we do one stripe head at a time and record in
6562	* ->bi_hw_segments how many have been done.
6563	*
6564	* We know that this entire raid_bio is in one chunk, so
6565	* it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
6566	*/
6567	struct stripe_head *sh;
6568	int dd_idx;
6569	sector_t sector, logical_sector, last_sector;
6570	int scnt = `0`;
6571	int handled = `0`;
6572
6573	logical_sector = raid_bio->bi_iter.bi_sector &
6574	~((sector_t)RAID5_STRIPE_SECTORS(conf)-`1`);
6575	sector = raid5_compute_sector(conf, r_sector: logical_sector,
6576	previous: `0`, dd_idx: &dd_idx, NULL);
6577	last_sector = bio_end_sector(raid_bio);
6578
6579	for (; logical_sector < last_sector;
6580	logical_sector += RAID5_STRIPE_SECTORS(conf),
6581	sector += RAID5_STRIPE_SECTORS(conf),
6582	scnt++) {
6583
6584	if (scnt < offset)
6585	/ already done this stripe /
6586	continue;
6587
6588	sh = raid5_get_active_stripe(conf, NULL, sector,
6589	R5_GAS_NOBLOCK \| R5_GAS_NOQUIESCE);
6590	if (!sh) {
6591	/ failed to get a stripe - must wait /
6592	conf->retry_read_aligned = raid_bio;
6593	conf->retry_read_offset = scnt;
6594	return handled;
6595	}
6596
6597	if (!add_stripe_bio(sh, bi: raid_bio, dd_idx, forwrite: `0`, previous: `0`)) {
6598	raid5_release_stripe(sh);
6599	conf->retry_read_aligned = raid_bio;
6600	conf->retry_read_offset = scnt;
6601	return handled;
6602	}
6603
6604	set_bit(nr: R5_ReadNoMerge, addr: &sh->dev[dd_idx].flags);
6605	handle_stripe(sh);
6606	raid5_release_stripe(sh);
6607	handled++;
6608	}
6609
6610	bio_endio(raid_bio);
6611
6612	if (atomic_dec_and_test(v: &conf->active_aligned_reads))
6613	wake_up(&conf->wait_for_quiescent);
6614	return handled;
6615	}
6616
6617	static int handle_active_stripes(struct r5conf conf, int* group,
6618	struct r5worker *worker,
6619	struct list_head *temp_inactive_list)
6620	__must_hold(&conf->device_lock)
6621	{
6622	struct stripe_head batch[MAX_STRIPE_BATCH], sh;
6623	int i, batch_size = `0`, hash;
6624	bool release_inactive = false;
6625
6626	while (batch_size < MAX_STRIPE_BATCH &&
6627	(sh = __get_priority_stripe(conf, group)) != NULL)
6628	batch[batch_size++] = sh;
6629
6630	if (batch_size == `0`) {
6631	for (i = `0`; i < NR_STRIPE_HASH_LOCKS; i++)
6632	if (!list_empty(head: temp_inactive_list + i))
6633	break;
6634	if (i == NR_STRIPE_HASH_LOCKS) {
6635	spin_unlock_irq(lock: &conf->device_lock);
6636	log_flush_stripe_to_raid(conf);
6637	spin_lock_irq(lock: &conf->device_lock);
6638	return batch_size;
6639	}
6640	release_inactive = true;
6641	}
6642	spin_unlock_irq(lock: &conf->device_lock);
6643
6644	release_inactive_stripe_list(conf, temp_inactive_list,
6645	NR_STRIPE_HASH_LOCKS);
6646
6647	r5l_flush_stripe_to_raid(log: conf->log);
6648	if (release_inactive) {
6649	spin_lock_irq(lock: &conf->device_lock);
6650	return `0`;
6651	}
6652
6653	for (i = `0`; i < batch_size; i++)
6654	handle_stripe(sh: batch[i]);
6655	log_write_stripe_run(conf);
6656
6657	cond_resched();
6658
6659	spin_lock_irq(lock: &conf->device_lock);
6660	for (i = `0`; i < batch_size; i++) {
6661	hash = batch[i]->hash_lock_index;
6662	__release_stripe(conf, sh: batch[i], temp_inactive_list: &temp_inactive_list[hash]);
6663	}
6664	return batch_size;
6665	}
6666
6667	static void raid5_do_work(struct work_struct *work)
6668	{
6669	struct r5worker worker = container_of(work, struct* r5worker, work);
6670	struct r5worker_group *group = worker->group;
6671	struct r5conf *conf = group->conf;
6672	struct mddev *mddev = conf->mddev;
6673	int group_id = group - conf->worker_groups;
6674	int handled;
6675	struct blk_plug plug;
6676
6677	pr_debug("+++ raid5worker active\n");
6678
6679	blk_start_plug(&plug);
6680	handled = `0`;
6681	spin_lock_irq(lock: &conf->device_lock);
6682	while (`1`) {
6683	int batch_size, released;
6684
6685	released = release_stripe_list(conf, temp_inactive_list: worker->temp_inactive_list);
6686
6687	batch_size = handle_active_stripes(conf, group: group_id, worker,
6688	temp_inactive_list: worker->temp_inactive_list);
6689	worker->working = false;
6690	if (!batch_size && !released)
6691	break;
6692	handled += batch_size;
6693	wait_event_lock_irq(mddev->sb_wait,
6694	!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6695	conf->device_lock);
6696	}
6697	pr_debug("%d stripes handled\n", handled);
6698
6699	spin_unlock_irq(lock: &conf->device_lock);
6700
6701	flush_deferred_bios(conf);
6702
6703	r5l_flush_stripe_to_raid(log: conf->log);
6704
6705	async_tx_issue_pending_all();
6706	blk_finish_plug(&plug);
6707
6708	pr_debug("--- raid5worker inactive\n");
6709	}
6710
6711	/*
6712	* This is our raid5 kernel thread.
6713	*
6714	* We scan the hash table for stripes which can be handled now.
6715	* During the scan, completed stripes are saved for us by the interrupt
6716	* handler, so that they will not have to wait for our next wakeup.
6717	*/
6718	static void raid5d(struct md_thread *thread)
6719	{
6720	struct mddev *mddev = thread->mddev;
6721	struct r5conf *conf = mddev->private;
6722	int handled;
6723	struct blk_plug plug;
6724
6725	pr_debug("+++ raid5d active\n");
6726
6727	md_check_recovery(mddev);
6728
6729	blk_start_plug(&plug);
6730	handled = `0`;
6731	spin_lock_irq(lock: &conf->device_lock);
6732	while (`1`) {
6733	struct bio *bio;
6734	int batch_size, released;
6735	unsigned int offset;
6736
6737	released = release_stripe_list(conf, temp_inactive_list: conf->temp_inactive_list);
6738	if (released)
6739	clear_bit(nr: R5_DID_ALLOC, addr: &conf->cache_state);
6740
6741	if (
6742	!list_empty(head: &conf->bitmap_list)) {
6743	/ Now is a good time to flush some bitmap updates /
6744	conf->seq_flush++;
6745	spin_unlock_irq(lock: &conf->device_lock);
6746	md_bitmap_unplug(bitmap: mddev->bitmap);
6747	spin_lock_irq(lock: &conf->device_lock);
6748	conf->seq_write = conf->seq_flush;
6749	activate_bit_delay(conf, temp_inactive_list: conf->temp_inactive_list);
6750	}
6751	raid5_activate_delayed(conf);
6752
6753	while ((bio = remove_bio_from_retry(conf, offset: &offset))) {
6754	int ok;
6755	spin_unlock_irq(lock: &conf->device_lock);
6756	ok = retry_aligned_read(conf, raid_bio: bio, offset);
6757	spin_lock_irq(lock: &conf->device_lock);
6758	if (!ok)
6759	break;
6760	handled++;
6761	}
6762
6763	batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6764	temp_inactive_list: conf->temp_inactive_list);
6765	if (!batch_size && !released)
6766	break;
6767	handled += batch_size;
6768
6769	if (mddev->sb_flags & ~(`1` << MD_SB_CHANGE_PENDING)) {
6770	spin_unlock_irq(lock: &conf->device_lock);
6771	md_check_recovery(mddev);
6772	spin_lock_irq(lock: &conf->device_lock);
6773
6774	/*
6775	* Waiting on MD_SB_CHANGE_PENDING below may deadlock
6776	* seeing md_check_recovery() is needed to clear
6777	* the flag when using mdmon.
6778	*/
6779	continue;
6780	}
6781
6782	wait_event_lock_irq(mddev->sb_wait,
6783	!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6784	conf->device_lock);
6785	}
6786	pr_debug("%d stripes handled\n", handled);
6787
6788	spin_unlock_irq(lock: &conf->device_lock);
6789	if (test_and_clear_bit(nr: R5_ALLOC_MORE, addr: &conf->cache_state) &&
6790	mutex_trylock(lock: &conf->cache_size_mutex)) {
6791	grow_one_stripe(conf, __GFP_NOWARN);
6792	/ Set flag even if allocation failed. This helps*
6793	* slow down allocation requests when mem is short
6794	*/
6795	set_bit(nr: R5_DID_ALLOC, addr: &conf->cache_state);
6796	mutex_unlock(lock: &conf->cache_size_mutex);
6797	}
6798
6799	flush_deferred_bios(conf);
6800
6801	r5l_flush_stripe_to_raid(log: conf->log);
6802
6803	async_tx_issue_pending_all();
6804	blk_finish_plug(&plug);
6805
6806	pr_debug("--- raid5d inactive\n");
6807	}
6808
6809	static ssize_t
6810	raid5_show_stripe_cache_size(struct mddev mddev, char* *page)
6811	{
6812	struct r5conf *conf;
6813	int ret = `0`;
6814	spin_lock(lock: &mddev->lock);
6815	conf = mddev->private;
6816	if (conf)
6817	ret = sprintf(buf: page, fmt: "%d\n", conf->min_nr_stripes);
6818	spin_unlock(lock: &mddev->lock);
6819	return ret;
6820	}
6821
6822	int
6823	raid5_set_cache_size(struct mddev mddev, int* size)
6824	{
6825	int result = `0`;
6826	struct r5conf *conf = mddev->private;
6827
6828	if (size <= `16` \|\| size > `32768`)
6829	return -EINVAL;
6830
6831	WRITE_ONCE(conf->min_nr_stripes, size);
6832	mutex_lock(&conf->cache_size_mutex);
6833	while (size < conf->max_nr_stripes &&
6834	drop_one_stripe(conf))
6835	;
6836	mutex_unlock(lock: &conf->cache_size_mutex);
6837
6838	md_allow_write(mddev);
6839
6840	mutex_lock(&conf->cache_size_mutex);
6841	while (size > conf->max_nr_stripes)
6842	if (!grow_one_stripe(conf, GFP_KERNEL)) {
6843	WRITE_ONCE(conf->min_nr_stripes, conf->max_nr_stripes);
6844	result = -ENOMEM;
6845	break;
6846	}
6847	mutex_unlock(lock: &conf->cache_size_mutex);
6848
6849	return result;
6850	}
6851	EXPORT_SYMBOL(raid5_set_cache_size);
6852
6853	static ssize_t
6854	raid5_store_stripe_cache_size(struct mddev mddev, const* char *page, size_t len)
6855	{
6856	struct r5conf *conf;
6857	unsigned long new;
6858	int err;
6859
6860	if (len >= PAGE_SIZE)
6861	return -EINVAL;
6862	if (kstrtoul(s: page, base: `10`, res: &new))
6863	return -EINVAL;
6864	err = mddev_lock(mddev);
6865	if (err)
6866	return err;
6867	conf = mddev->private;
6868	if (!conf)
6869	err = -ENODEV;
6870	else
6871	err = raid5_set_cache_size(mddev, new);
6872	mddev_unlock(mddev);
6873
6874	return err ?: len;
6875	}
6876
6877	static struct md_sysfs_entry
6878	raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO \| S_IWUSR,
6879	raid5_show_stripe_cache_size,
6880	raid5_store_stripe_cache_size);
6881
6882	static ssize_t
6883	raid5_show_rmw_level(struct mddev mddev, char* *page)
6884	{
6885	struct r5conf *conf = mddev->private;
6886	if (conf)
6887	return sprintf(buf: page, fmt: "%d\n", conf->rmw_level);
6888	else
6889	return `0`;
6890	}
6891
6892	static ssize_t
6893	raid5_store_rmw_level(struct mddev mddev, const* char *page, size_t len)
6894	{
6895	struct r5conf *conf = mddev->private;
6896	unsigned long new;
6897
6898	if (!conf)
6899	return -ENODEV;
6900
6901	if (len >= PAGE_SIZE)
6902	return -EINVAL;
6903
6904	if (kstrtoul(s: page, base: `10`, res: &new))
6905	return -EINVAL;
6906
6907	if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6908	return -EINVAL;
6909
6910	if (new != PARITY_DISABLE_RMW &&
6911	new != PARITY_ENABLE_RMW &&
6912	new != PARITY_PREFER_RMW)
6913	return -EINVAL;
6914
6915	conf->rmw_level = new;
6916	return len;
6917	}
6918
6919	static struct md_sysfs_entry
6920	raid5_rmw_level = __ATTR(rmw_level, S_IRUGO \| S_IWUSR,
6921	raid5_show_rmw_level,
6922	raid5_store_rmw_level);
6923
6924	static ssize_t
6925	raid5_show_stripe_size(struct mddev mddev, char* *page)
6926	{
6927	struct r5conf *conf;
6928	int ret = `0`;
6929
6930	spin_lock(lock: &mddev->lock);
6931	conf = mddev->private;
6932	if (conf)
6933	ret = sprintf(buf: page, fmt: "%lu\n", RAID5_STRIPE_SIZE(conf));
6934	spin_unlock(lock: &mddev->lock);
6935	return ret;
6936	}
6937
6938	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6939	static ssize_t
6940	raid5_store_stripe_size(struct mddev mddev, const* char *page, size_t len)
6941	{
6942	struct r5conf *conf;
6943	unsigned long new;
6944	int err;
6945	int size;
6946
6947	if (len >= PAGE_SIZE)
6948	return -EINVAL;
6949	if (kstrtoul(page, `10`, &new))
6950	return -EINVAL;
6951
6952	/*
6953	* The value should not be bigger than PAGE_SIZE. It requires to
6954	* be multiple of DEFAULT_STRIPE_SIZE and the value should be power
6955	* of two.
6956	*/
6957	if (new % DEFAULT_STRIPE_SIZE != `0` \|\|
6958	new > PAGE_SIZE \|\| new == `0` \|\|
6959	new != roundup_pow_of_two(new))
6960	return -EINVAL;
6961
6962	err = mddev_suspend_and_lock(mddev);
6963	if (err)
6964	return err;
6965
6966	conf = mddev->private;
6967	if (!conf) {
6968	err = -ENODEV;
6969	goto out_unlock;
6970	}
6971
6972	if (new == conf->stripe_size)
6973	goto out_unlock;
6974
6975	pr_debug("md/raid: change stripe_size from %lu to %lu\n",
6976	conf->stripe_size, new);
6977
6978	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) \|\|
6979	mddev->reshape_position != MaxSector \|\| mddev->sysfs_active) {
6980	err = -EBUSY;
6981	goto out_unlock;
6982	}
6983
6984	mutex_lock(&conf->cache_size_mutex);
6985	size = conf->max_nr_stripes;
6986
6987	shrink_stripes(conf);
6988
6989	conf->stripe_size = new;
6990	conf->stripe_shift = ilog2(new) - `9`;
6991	conf->stripe_sectors = new >> `9`;
6992	if (grow_stripes(conf, size)) {
6993	pr_warn("md/raid:%s: couldn't allocate buffers\n",
6994	mdname(mddev));
6995	err = -ENOMEM;
6996	}
6997	mutex_unlock(&conf->cache_size_mutex);
6998
6999	out_unlock:
7000	mddev_unlock_and_resume(mddev);
7001	return err ?: len;
7002	}
7003
7004	static struct md_sysfs_entry
7005	raid5_stripe_size = __ATTR(stripe_size, `0644`,
7006	raid5_show_stripe_size,
7007	raid5_store_stripe_size);
7008	#else
7009	static struct md_sysfs_entry
7010	raid5_stripe_size = __ATTR(stripe_size, `0444`,
7011	raid5_show_stripe_size,
7012	NULL);
7013	#endif
7014
7015	static ssize_t
7016	raid5_show_preread_threshold(struct mddev mddev, char* *page)
7017	{
7018	struct r5conf *conf;
7019	int ret = `0`;
7020	spin_lock(lock: &mddev->lock);
7021	conf = mddev->private;
7022	if (conf)
7023	ret = sprintf(buf: page, fmt: "%d\n", conf->bypass_threshold);
7024	spin_unlock(lock: &mddev->lock);
7025	return ret;
7026	}
7027
7028	static ssize_t
7029	raid5_store_preread_threshold(struct mddev mddev, const* char *page, size_t len)
7030	{
7031	struct r5conf *conf;
7032	unsigned long new;
7033	int err;
7034
7035	if (len >= PAGE_SIZE)
7036	return -EINVAL;
7037	if (kstrtoul(s: page, base: `10`, res: &new))
7038	return -EINVAL;
7039
7040	err = mddev_lock(mddev);
7041	if (err)
7042	return err;
7043	conf = mddev->private;
7044	if (!conf)
7045	err = -ENODEV;
7046	else if (new > conf->min_nr_stripes)
7047	err = -EINVAL;
7048	else
7049	conf->bypass_threshold = new;
7050	mddev_unlock(mddev);
7051	return err ?: len;
7052	}
7053
7054	static struct md_sysfs_entry
7055	raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
7056	S_IRUGO \| S_IWUSR,
7057	raid5_show_preread_threshold,
7058	raid5_store_preread_threshold);
7059
7060	static ssize_t
7061	raid5_show_skip_copy(struct mddev mddev, char* *page)
7062	{
7063	struct r5conf *conf;
7064	int ret = `0`;
7065	spin_lock(lock: &mddev->lock);
7066	conf = mddev->private;
7067	if (conf)
7068	ret = sprintf(buf: page, fmt: "%d\n", conf->skip_copy);
7069	spin_unlock(lock: &mddev->lock);
7070	return ret;
7071	}
7072
7073	static ssize_t
7074	raid5_store_skip_copy(struct mddev mddev, const* char *page, size_t len)
7075	{
7076	struct r5conf *conf;
7077	unsigned long new;
7078	int err;
7079
7080	if (len >= PAGE_SIZE)
7081	return -EINVAL;
7082	if (kstrtoul(s: page, base: `10`, res: &new))
7083	return -EINVAL;
7084	new = !!new;
7085
7086	err = mddev_suspend_and_lock(mddev);
7087	if (err)
7088	return err;
7089	conf = mddev->private;
7090	if (!conf)
7091	err = -ENODEV;
7092	else if (new != conf->skip_copy) {
7093	struct request_queue *q = mddev->gendisk->queue;
7094
7095	conf->skip_copy = new;
7096	if (new)
7097	blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
7098	else
7099	blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
7100	}
7101	mddev_unlock_and_resume(mddev);
7102	return err ?: len;
7103	}
7104
7105	static struct md_sysfs_entry
7106	raid5_skip_copy = __ATTR(skip_copy, S_IRUGO \| S_IWUSR,
7107	raid5_show_skip_copy,
7108	raid5_store_skip_copy);
7109
7110	static ssize_t
7111	stripe_cache_active_show(struct mddev mddev, char* *page)
7112	{
7113	struct r5conf *conf = mddev->private;
7114	if (conf)
7115	return sprintf(buf: page, fmt: "%d\n", atomic_read(v: &conf->active_stripes));
7116	else
7117	return `0`;
7118	}
7119
7120	static struct md_sysfs_entry
7121	raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
7122
7123	static ssize_t
7124	raid5_show_group_thread_cnt(struct mddev mddev, char* *page)
7125	{
7126	struct r5conf *conf;
7127	int ret = `0`;
7128	spin_lock(lock: &mddev->lock);
7129	conf = mddev->private;
7130	if (conf)
7131	ret = sprintf(buf: page, fmt: "%d\n", conf->worker_cnt_per_group);
7132	spin_unlock(lock: &mddev->lock);
7133	return ret;
7134	}
7135
7136	static int alloc_thread_groups(struct r5conf conf, int* cnt,
7137	int *group_cnt,
7138	struct r5worker_group **worker_groups);
7139	static ssize_t
7140	raid5_store_group_thread_cnt(struct mddev mddev, const* char *page, size_t len)
7141	{
7142	struct r5conf *conf;
7143	unsigned int new;
7144	int err;
7145	struct r5worker_group new_groups, old_groups;
7146	int group_cnt;
7147
7148	if (len >= PAGE_SIZE)
7149	return -EINVAL;
7150	if (kstrtouint(s: page, base: `10`, res: &new))
7151	return -EINVAL;
7152	/ 8192 should be big enough /
7153	if (new > `8192`)
7154	return -EINVAL;
7155
7156	err = mddev_suspend_and_lock(mddev);
7157	if (err)
7158	return err;
7159	conf = mddev->private;
7160	if (!conf)
7161	err = -ENODEV;
7162	else if (new != conf->worker_cnt_per_group) {
7163	old_groups = conf->worker_groups;
7164	if (old_groups)
7165	flush_workqueue(raid5_wq);
7166
7167	err = alloc_thread_groups(conf, cnt: new, group_cnt: &group_cnt, worker_groups: &new_groups);
7168	if (!err) {
7169	spin_lock_irq(lock: &conf->device_lock);
7170	conf->group_cnt = group_cnt;
7171	conf->worker_cnt_per_group = new;
7172	conf->worker_groups = new_groups;
7173	spin_unlock_irq(lock: &conf->device_lock);
7174
7175	if (old_groups)
7176	kfree(objp: old_groups[`0`].workers);
7177	kfree(objp: old_groups);
7178	}
7179	}
7180	mddev_unlock_and_resume(mddev);
7181
7182	return err ?: len;
7183	}
7184
7185	static struct md_sysfs_entry
7186	raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO \| S_IWUSR,
7187	raid5_show_group_thread_cnt,
7188	raid5_store_group_thread_cnt);
7189
7190	static struct attribute *raid5_attrs[] = {
7191	&raid5_stripecache_size.attr,
7192	&raid5_stripecache_active.attr,
7193	&raid5_preread_bypass_threshold.attr,
7194	&raid5_group_thread_cnt.attr,
7195	&raid5_skip_copy.attr,
7196	&raid5_rmw_level.attr,
7197	&raid5_stripe_size.attr,
7198	&r5c_journal_mode.attr,
7199	&ppl_write_hint.attr,
7200	NULL,
7201	};
7202	static const struct attribute_group raid5_attrs_group = {
7203	.name = NULL,
7204	.attrs = raid5_attrs,
7205	};
7206
7207	static int alloc_thread_groups(struct r5conf conf, int* cnt, int *group_cnt,
7208	struct r5worker_group **worker_groups)
7209	{
7210	int i, j, k;
7211	ssize_t size;
7212	struct r5worker *workers;
7213
7214	if (cnt == `0`) {
7215	*group_cnt = `0`;
7216	*worker_groups = NULL;
7217	return `0`;
7218	}
7219	*group_cnt = num_possible_nodes();
7220	size = sizeof(struct r5worker) * cnt;
7221	workers = kcalloc(n: size, size: *group_cnt, GFP_NOIO);
7222	worker_groups = kcalloc(n: group_cnt, size: sizeof(struct r5worker_group),
7223	GFP_NOIO);
7224	if (!*worker_groups \|\| !workers) {
7225	kfree(objp: workers);
7226	kfree(objp: *worker_groups);
7227	return -ENOMEM;
7228	}
7229
7230	for (i = `0`; i < *group_cnt; i++) {
7231	struct r5worker_group *group;
7232
7233	group = &(*worker_groups)[i];
7234	INIT_LIST_HEAD(list: &group->handle_list);
7235	INIT_LIST_HEAD(list: &group->loprio_list);
7236	group->conf = conf;
7237	group->workers = workers + i * cnt;
7238
7239	for (j = `0`; j < cnt; j++) {
7240	struct r5worker *worker = group->workers + j;
7241	worker->group = group;
7242	INIT_WORK(&worker->work, raid5_do_work);
7243
7244	for (k = `0`; k < NR_STRIPE_HASH_LOCKS; k++)
7245	INIT_LIST_HEAD(list: worker->temp_inactive_list + k);
7246	}
7247	}
7248
7249	return `0`;
7250	}
7251
7252	static void free_thread_groups(struct r5conf *conf)
7253	{
7254	if (conf->worker_groups)
7255	kfree(objp: conf->worker_groups[`0`].workers);
7256	kfree(objp: conf->worker_groups);
7257	conf->worker_groups = NULL;
7258	}
7259
7260	static sector_t
7261	raid5_size(struct mddev mddev, sector_t sectors, int* raid_disks)
7262	{
7263	struct r5conf *conf = mddev->private;
7264
7265	if (!sectors)
7266	sectors = mddev->dev_sectors;
7267	if (!raid_disks)
7268	/ size is defined by the smallest of previous and new size /
7269	raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
7270
7271	sectors &= ~((sector_t)conf->chunk_sectors - `1`);
7272	sectors &= ~((sector_t)conf->prev_chunk_sectors - `1`);
7273	return sectors * (raid_disks - conf->max_degraded);
7274	}
7275
7276	static void free_scratch_buffer(struct r5conf conf, struct* raid5_percpu *percpu)
7277	{
7278	safe_put_page(p: percpu->spare_page);
7279	percpu->spare_page = NULL;
7280	kvfree(addr: percpu->scribble);
7281	percpu->scribble = NULL;
7282	}
7283
7284	static int alloc_scratch_buffer(struct r5conf conf, struct* raid5_percpu *percpu)
7285	{
7286	if (conf->level == `6` && !percpu->spare_page) {
7287	percpu->spare_page = alloc_page(GFP_KERNEL);
7288	if (!percpu->spare_page)
7289	return -ENOMEM;
7290	}
7291
7292	if (scribble_alloc(percpu,
7293	max(conf->raid_disks,
7294	conf->previous_raid_disks),
7295	max(conf->chunk_sectors,
7296	conf->prev_chunk_sectors)
7297	/ RAID5_STRIPE_SECTORS(conf))) {
7298	free_scratch_buffer(conf, percpu);
7299	return -ENOMEM;
7300	}
7301
7302	local_lock_init(&percpu->lock);
7303	return `0`;
7304	}
7305
7306	static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
7307	{
7308	struct r5conf conf = hlist_entry_safe(node, struct* r5conf, node);
7309
7310	free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
7311	return `0`;
7312	}
7313
7314	static void raid5_free_percpu(struct r5conf *conf)
7315	{
7316	if (!conf->percpu)
7317	return;
7318
7319	cpuhp_state_remove_instance(state: CPUHP_MD_RAID5_PREPARE, node: &conf->node);
7320	free_percpu(pdata: conf->percpu);
7321	}
7322
7323	static void free_conf(struct r5conf *conf)
7324	{
7325	int i;
7326
7327	log_exit(conf);
7328
7329	shrinker_free(shrinker: conf->shrinker);
7330	free_thread_groups(conf);
7331	shrink_stripes(conf);
7332	raid5_free_percpu(conf);
7333	for (i = `0`; i < conf->pool_size; i++)
7334	if (conf->disks[i].extra_page)
7335	put_page(page: conf->disks[i].extra_page);
7336	kfree(objp: conf->disks);
7337	bioset_exit(&conf->bio_split);
7338	kfree(objp: conf->stripe_hashtbl);
7339	kfree(objp: conf->pending_data);
7340	kfree(objp: conf);
7341	}
7342
7343	static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
7344	{
7345	struct r5conf conf = hlist_entry_safe(node, struct* r5conf, node);
7346	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
7347
7348	if (alloc_scratch_buffer(conf, percpu)) {
7349	pr_warn("%s: failed memory allocation for cpu%u\n",
7350	__func__, cpu);
7351	return -ENOMEM;
7352	}
7353	return `0`;
7354	}
7355
7356	static int raid5_alloc_percpu(struct r5conf *conf)
7357	{
7358	int err = `0`;
7359
7360	conf->percpu = alloc_percpu(struct raid5_percpu);
7361	if (!conf->percpu)
7362	return -ENOMEM;
7363
7364	err = cpuhp_state_add_instance(state: CPUHP_MD_RAID5_PREPARE, node: &conf->node);
7365	if (!err) {
7366	conf->scribble_disks = max(conf->raid_disks,
7367	conf->previous_raid_disks);
7368	conf->scribble_sectors = max(conf->chunk_sectors,
7369	conf->prev_chunk_sectors);
7370	}
7371	return err;
7372	}
7373
7374	static unsigned long raid5_cache_scan(struct shrinker *shrink,
7375	struct shrink_control *sc)
7376	{
7377	struct r5conf *conf = shrink->private_data;
7378	unsigned long ret = SHRINK_STOP;
7379
7380	if (mutex_trylock(lock: &conf->cache_size_mutex)) {
7381	ret= `0`;
7382	while (ret < sc->nr_to_scan &&
7383	conf->max_nr_stripes > conf->min_nr_stripes) {
7384	if (drop_one_stripe(conf) == `0`) {
7385	ret = SHRINK_STOP;
7386	break;
7387	}
7388	ret++;
7389	}
7390	mutex_unlock(lock: &conf->cache_size_mutex);
7391	}
7392	return ret;
7393	}
7394
7395	static unsigned long raid5_cache_count(struct shrinker *shrink,
7396	struct shrink_control *sc)
7397	{
7398	struct r5conf *conf = shrink->private_data;
7399	int max_stripes = READ_ONCE(conf->max_nr_stripes);
7400	int min_stripes = READ_ONCE(conf->min_nr_stripes);
7401
7402	if (max_stripes < min_stripes)
7403	/ unlikely, but not impossible /
7404	return `0`;
7405	return max_stripes - min_stripes;
7406	}
7407
7408	static struct r5conf setup_conf(struct* mddev *mddev)
7409	{
7410	struct r5conf *conf;
7411	int raid_disk, memory, max_disks;
7412	struct md_rdev *rdev;
7413	struct disk_info *disk;
7414	char pers_name[`6`];
7415	int i;
7416	int group_cnt;
7417	struct r5worker_group *new_group;
7418	int ret = -ENOMEM;
7419
7420	if (mddev->new_level != `5`
7421	&& mddev->new_level != `4`
7422	&& mddev->new_level != `6`) {
7423	pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
7424	mdname(mddev), mddev->new_level);
7425	return ERR_PTR(error: -EIO);
7426	}
7427	if ((mddev->new_level == `5`
7428	&& !algorithm_valid_raid5(layout: mddev->new_layout)) \|\|
7429	(mddev->new_level == `6`
7430	&& !algorithm_valid_raid6(layout: mddev->new_layout))) {
7431	pr_warn("md/raid:%s: layout %d not supported\n",
7432	mdname(mddev), mddev->new_layout);
7433	return ERR_PTR(error: -EIO);
7434	}
7435	if (mddev->new_level == `6` && mddev->raid_disks < `4`) {
7436	pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
7437	mdname(mddev), mddev->raid_disks);
7438	return ERR_PTR(error: -EINVAL);
7439	}
7440
7441	if (!mddev->new_chunk_sectors \|\|
7442	(mddev->new_chunk_sectors << `9`) % PAGE_SIZE \|\|
7443	!is_power_of_2(n: mddev->new_chunk_sectors)) {
7444	pr_warn("md/raid:%s: invalid chunk size %d\n",
7445	mdname(mddev), mddev->new_chunk_sectors << `9`);
7446	return ERR_PTR(error: -EINVAL);
7447	}
7448
7449	conf = kzalloc(size: sizeof(struct r5conf), GFP_KERNEL);
7450	if (conf == NULL)
7451	goto abort;
7452
7453	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7454	conf->stripe_size = DEFAULT_STRIPE_SIZE;
7455	conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - `9`;
7456	conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> `9`;
7457	#endif
7458	INIT_LIST_HEAD(list: &conf->free_list);
7459	INIT_LIST_HEAD(list: &conf->pending_list);
7460	conf->pending_data = kcalloc(PENDING_IO_MAX,
7461	size: sizeof(struct r5pending_data),
7462	GFP_KERNEL);
7463	if (!conf->pending_data)
7464	goto abort;
7465	for (i = `0`; i < PENDING_IO_MAX; i++)
7466	list_add(new: &conf->pending_data[i].sibling, head: &conf->free_list);
7467	/ Don't enable multi-threading by default/
7468	if (!alloc_thread_groups(conf, cnt: `0`, group_cnt: &group_cnt, worker_groups: &new_group)) {
7469	conf->group_cnt = group_cnt;
7470	conf->worker_cnt_per_group = `0`;
7471	conf->worker_groups = new_group;
7472	} else
7473	goto abort;
7474	spin_lock_init(&conf->device_lock);
7475	seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
7476	mutex_init(&conf->cache_size_mutex);
7477
7478	init_waitqueue_head(&conf->wait_for_quiescent);
7479	init_waitqueue_head(&conf->wait_for_stripe);
7480	init_waitqueue_head(&conf->wait_for_overlap);
7481	INIT_LIST_HEAD(list: &conf->handle_list);
7482	INIT_LIST_HEAD(list: &conf->loprio_list);
7483	INIT_LIST_HEAD(list: &conf->hold_list);
7484	INIT_LIST_HEAD(list: &conf->delayed_list);
7485	INIT_LIST_HEAD(list: &conf->bitmap_list);
7486	init_llist_head(list: &conf->released_stripes);
7487	atomic_set(v: &conf->active_stripes, i: `0`);
7488	atomic_set(v: &conf->preread_active_stripes, i: `0`);
7489	atomic_set(v: &conf->active_aligned_reads, i: `0`);
7490	spin_lock_init(&conf->pending_bios_lock);
7491	conf->batch_bio_dispatch = true;
7492	rdev_for_each(rdev, mddev) {
7493	if (test_bit(Journal, &rdev->flags))
7494	continue;
7495	if (bdev_nonrot(bdev: rdev->bdev)) {
7496	conf->batch_bio_dispatch = false;
7497	break;
7498	}
7499	}
7500
7501	conf->bypass_threshold = BYPASS_THRESHOLD;
7502	conf->recovery_disabled = mddev->recovery_disabled - `1`;
7503
7504	conf->raid_disks = mddev->raid_disks;
7505	if (mddev->reshape_position == MaxSector)
7506	conf->previous_raid_disks = mddev->raid_disks;
7507	else
7508	conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
7509	max_disks = max(conf->raid_disks, conf->previous_raid_disks);
7510
7511	conf->disks = kcalloc(n: max_disks, size: sizeof(struct disk_info),
7512	GFP_KERNEL);
7513
7514	if (!conf->disks)
7515	goto abort;
7516
7517	for (i = `0`; i < max_disks; i++) {
7518	conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
7519	if (!conf->disks[i].extra_page)
7520	goto abort;
7521	}
7522
7523	ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, `0`, flags: `0`);
7524	if (ret)
7525	goto abort;
7526	conf->mddev = mddev;
7527
7528	ret = -ENOMEM;
7529	conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL);
7530	if (!conf->stripe_hashtbl)
7531	goto abort;
7532
7533	/ We init hash_locks[0] separately to that it can be used*
7534	* as the reference lock in the spin_lock_nest_lock() call
7535	* in lock_all_device_hash_locks_irq in order to convince
7536	* lockdep that we know what we are doing.
7537	*/
7538	spin_lock_init(conf->hash_locks);
7539	for (i = `1`; i < NR_STRIPE_HASH_LOCKS; i++)
7540	spin_lock_init(conf->hash_locks + i);
7541
7542	for (i = `0`; i < NR_STRIPE_HASH_LOCKS; i++)
7543	INIT_LIST_HEAD(list: conf->inactive_list + i);
7544
7545	for (i = `0`; i < NR_STRIPE_HASH_LOCKS; i++)
7546	INIT_LIST_HEAD(list: conf->temp_inactive_list + i);
7547
7548	atomic_set(v: &conf->r5c_cached_full_stripes, i: `0`);
7549	INIT_LIST_HEAD(list: &conf->r5c_full_stripe_list);
7550	atomic_set(v: &conf->r5c_cached_partial_stripes, i: `0`);
7551	INIT_LIST_HEAD(list: &conf->r5c_partial_stripe_list);
7552	atomic_set(v: &conf->r5c_flushing_full_stripes, i: `0`);
7553	atomic_set(v: &conf->r5c_flushing_partial_stripes, i: `0`);
7554
7555	conf->level = mddev->new_level;
7556	conf->chunk_sectors = mddev->new_chunk_sectors;
7557	ret = raid5_alloc_percpu(conf);
7558	if (ret)
7559	goto abort;
7560
7561	pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7562
7563	ret = -EIO;
7564	rdev_for_each(rdev, mddev) {
7565	raid_disk = rdev->raid_disk;
7566	if (raid_disk >= max_disks
7567	\|\| raid_disk < `0` \|\| test_bit(Journal, &rdev->flags))
7568	continue;
7569	disk = conf->disks + raid_disk;
7570
7571	if (test_bit(Replacement, &rdev->flags)) {
7572	if (disk->replacement)
7573	goto abort;
7574	RCU_INIT_POINTER(disk->replacement, rdev);
7575	} else {
7576	if (disk->rdev)
7577	goto abort;
7578	RCU_INIT_POINTER(disk->rdev, rdev);
7579	}
7580
7581	if (test_bit(In_sync, &rdev->flags)) {
7582	pr_info("md/raid:%s: device %pg operational as raid disk %d\n",
7583	mdname(mddev), rdev->bdev, raid_disk);
7584	} else if (rdev->saved_raid_disk != raid_disk)
7585	/ Cannot rely on bitmap to complete recovery /
7586	conf->fullsync = `1`;
7587	}
7588
7589	conf->level = mddev->new_level;
7590	if (conf->level == `6`) {
7591	conf->max_degraded = `2`;
7592	if (raid6_call.xor_syndrome)
7593	conf->rmw_level = PARITY_ENABLE_RMW;
7594	else
7595	conf->rmw_level = PARITY_DISABLE_RMW;
7596	} else {
7597	conf->max_degraded = `1`;
7598	conf->rmw_level = PARITY_ENABLE_RMW;
7599	}
7600	conf->algorithm = mddev->new_layout;
7601	conf->reshape_progress = mddev->reshape_position;
7602	if (conf->reshape_progress != MaxSector) {
7603	conf->prev_chunk_sectors = mddev->chunk_sectors;
7604	conf->prev_algo = mddev->layout;
7605	} else {
7606	conf->prev_chunk_sectors = conf->chunk_sectors;
7607	conf->prev_algo = conf->algorithm;
7608	}
7609
7610	conf->min_nr_stripes = NR_STRIPES;
7611	if (mddev->reshape_position != MaxSector) {
7612	int stripes = max_t(int,
7613	((mddev->chunk_sectors << `9`) / RAID5_STRIPE_SIZE(conf)) * `4`,
7614	((mddev->new_chunk_sectors << `9`) / RAID5_STRIPE_SIZE(conf)) * `4`);
7615	conf->min_nr_stripes = max(NR_STRIPES, stripes);
7616	if (conf->min_nr_stripes != NR_STRIPES)
7617	pr_info("md/raid:%s: force stripe size %d for reshape\n",
7618	mdname(mddev), conf->min_nr_stripes);
7619	}
7620	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7621	max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / `1024`;
7622	atomic_set(v: &conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7623	if (grow_stripes(conf, num: conf->min_nr_stripes)) {
7624	pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7625	mdname(mddev), memory);
7626	ret = -ENOMEM;
7627	goto abort;
7628	} else
7629	pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7630	/*
7631	* Losing a stripe head costs more than the time to refill it,
7632	* it reduces the queue depth and so can hurt throughput.
7633	* So set it rather large, scaled by number of devices.
7634	*/
7635	conf->shrinker = shrinker_alloc(flags: `0`, fmt: "md-raid5:%s", mdname(mddev));
7636	if (!conf->shrinker) {
7637	ret = -ENOMEM;
7638	pr_warn("md/raid:%s: couldn't allocate shrinker.\n",
7639	mdname(mddev));
7640	goto abort;
7641	}
7642
7643	conf->shrinker->seeks = DEFAULT_SEEKS * conf->raid_disks * `4`;
7644	conf->shrinker->scan_objects = raid5_cache_scan;
7645	conf->shrinker->count_objects = raid5_cache_count;
7646	conf->shrinker->batch = `128`;
7647	conf->shrinker->private_data = conf;
7648
7649	shrinker_register(shrinker: conf->shrinker);
7650
7651	sprintf(buf: pers_name, fmt: "raid%d", mddev->new_level);
7652	rcu_assign_pointer(conf->thread,
7653	md_register_thread(raid5d, mddev, pers_name));
7654	if (!conf->thread) {
7655	pr_warn("md/raid:%s: couldn't allocate thread.\n",
7656	mdname(mddev));
7657	ret = -ENOMEM;
7658	goto abort;
7659	}
7660
7661	return conf;
7662
7663	abort:
7664	if (conf)
7665	free_conf(conf);
7666	return ERR_PTR(error: ret);
7667	}
7668
7669	static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7670	{
7671	switch (algo) {
7672	case ALGORITHM_PARITY_0:
7673	if (raid_disk < max_degraded)
7674	return `1`;
7675	break;
7676	case ALGORITHM_PARITY_N:
7677	if (raid_disk >= raid_disks - max_degraded)
7678	return `1`;
7679	break;
7680	case ALGORITHM_PARITY_0_6:
7681	if (raid_disk == `0` \|\|
7682	raid_disk == raid_disks - `1`)
7683	return `1`;
7684	break;
7685	case ALGORITHM_LEFT_ASYMMETRIC_6:
7686	case ALGORITHM_RIGHT_ASYMMETRIC_6:
7687	case ALGORITHM_LEFT_SYMMETRIC_6:
7688	case ALGORITHM_RIGHT_SYMMETRIC_6:
7689	if (raid_disk == raid_disks - `1`)
7690	return `1`;
7691	}
7692	return `0`;
7693	}
7694
7695	static int raid5_set_limits(struct mddev *mddev)
7696	{
7697	struct r5conf *conf = mddev->private;
7698	struct queue_limits lim;
7699	int data_disks, stripe;
7700	struct md_rdev *rdev;
7701
7702	/*
7703	* The read-ahead size must cover two whole stripes, which is
7704	* 2 * (datadisks) * chunksize where 'n' is the number of raid devices.
7705	*/
7706	data_disks = conf->previous_raid_disks - conf->max_degraded;
7707
7708	/*
7709	* We can only discard a whole stripe. It doesn't make sense to
7710	* discard data disk but write parity disk
7711	*/
7712	stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << `9`));
7713
7714	blk_set_stacking_limits(lim: &lim);
7715	lim.io_min = mddev->chunk_sectors << `9`;
7716	lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
7717	lim.raid_partial_stripes_expensive = `1`;
7718	lim.discard_granularity = stripe;
7719	lim.max_write_zeroes_sectors = `0`;
7720	mddev_stack_rdev_limits(mddev, lim: &lim);
7721	rdev_for_each(rdev, mddev)
7722	queue_limits_stack_bdev(t: &lim, bdev: rdev->bdev, offset: rdev->new_data_offset,
7723	pfx: mddev->gendisk->disk_name);
7724
7725	/*
7726	* Zeroing is required for discard, otherwise data could be lost.
7727	*
7728	* Consider a scenario: discard a stripe (the stripe could be
7729	* inconsistent if discard_zeroes_data is 0); write one disk of the
7730	* stripe (the stripe could be inconsistent again depending on which
7731	* disks are used to calculate parity); the disk is broken; The stripe
7732	* data of this disk is lost.
7733	*
7734	* We only allow DISCARD if the sysadmin has confirmed that only safe
7735	* devices are in use by setting a module parameter. A better idea
7736	* might be to turn DISCARD into WRITE_ZEROES requests, as that is
7737	* required to be safe.
7738	*/
7739	if (!devices_handle_discard_safely \|\|
7740	lim.max_discard_sectors < (stripe >> `9`) \|\|
7741	lim.discard_granularity < stripe)
7742	lim.max_hw_discard_sectors = `0`;
7743
7744	/*
7745	* Requests require having a bitmap for each stripe.
7746	* Limit the max sectors based on this.
7747	*/
7748	lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf);
7749
7750	/ No restrictions on the number of segments in the request /
7751	lim.max_segments = USHRT_MAX;
7752
7753	return queue_limits_set(q: mddev->gendisk->queue, lim: &lim);
7754	}
7755
7756	static int raid5_run(struct mddev *mddev)
7757	{
7758	struct r5conf *conf;
7759	int dirty_parity_disks = `0`;
7760	struct md_rdev *rdev;
7761	struct md_rdev *journal_dev = NULL;
7762	sector_t reshape_offset = `0`;
7763	int i;
7764	long long min_offset_diff = `0`;
7765	int first = `1`;
7766	int ret = -EIO;
7767
7768	if (mddev->recovery_cp != MaxSector)
7769	pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7770	mdname(mddev));
7771
7772	rdev_for_each(rdev, mddev) {
7773	long long diff;
7774
7775	if (test_bit(Journal, &rdev->flags)) {
7776	journal_dev = rdev;
7777	continue;
7778	}
7779	if (rdev->raid_disk < `0`)
7780	continue;
7781	diff = (rdev->new_data_offset - rdev->data_offset);
7782	if (first) {
7783	min_offset_diff = diff;
7784	first = `0`;
7785	} else if (mddev->reshape_backwards &&
7786	diff < min_offset_diff)
7787	min_offset_diff = diff;
7788	else if (!mddev->reshape_backwards &&
7789	diff > min_offset_diff)
7790	min_offset_diff = diff;
7791	}
7792
7793	if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) \|\| journal_dev) &&
7794	(mddev->bitmap_info.offset \|\| mddev->bitmap_info.file)) {
7795	pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7796	mdname(mddev));
7797	return -EINVAL;
7798	}
7799
7800	if (mddev->reshape_position != MaxSector) {
7801	/ Check that we can continue the reshape.*
7802	* Difficulties arise if the stripe we would write to
7803	* next is at or after the stripe we would read from next.
7804	* For a reshape that changes the number of devices, this
7805	* is only possible for a very short time, and mdadm makes
7806	* sure that time appears to have past before assembling
7807	* the array. So we fail if that time hasn't passed.
7808	* For a reshape that keeps the number of devices the same
7809	* mdadm must be monitoring the reshape can keeping the
7810	* critical areas read-only and backed up. It will start
7811	* the array in read-only mode, so we check for that.
7812	*/
7813	sector_t here_new, here_old;
7814	int old_disks;
7815	int max_degraded = (mddev->level == `6` ? `2` : `1`);
7816	int chunk_sectors;
7817	int new_data_disks;
7818
7819	if (journal_dev) {
7820	pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7821	mdname(mddev));
7822	return -EINVAL;
7823	}
7824
7825	if (mddev->new_level != mddev->level) {
7826	pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7827	mdname(mddev));
7828	return -EINVAL;
7829	}
7830	old_disks = mddev->raid_disks - mddev->delta_disks;
7831	/ reshape_position must be on a new-stripe boundary, and one*
7832	* further up in new geometry must map after here in old
7833	* geometry.
7834	* If the chunk sizes are different, then as we perform reshape
7835	* in units of the largest of the two, reshape_position needs
7836	* be a multiple of the largest chunk size times new data disks.
7837	*/
7838	here_new = mddev->reshape_position;
7839	chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7840	new_data_disks = mddev->raid_disks - max_degraded;
7841	if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7842	pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7843	mdname(mddev));
7844	return -EINVAL;
7845	}
7846	reshape_offset = here_new * chunk_sectors;
7847	/ here_new is the stripe we will write to /
7848	here_old = mddev->reshape_position;
7849	sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
7850	/ here_old is the first stripe that we might need to read*
7851	* from */
7852	if (mddev->delta_disks == `0`) {
7853	/ We cannot be sure it is safe to start an in-place*
7854	* reshape. It is only safe if user-space is monitoring
7855	* and taking constant backups.
7856	* mdadm always starts a situation like this in
7857	* readonly mode so it can take control before
7858	* allowing any writes. So just check for that.
7859	*/
7860	if (abs(min_offset_diff) >= mddev->chunk_sectors &&
7861	abs(min_offset_diff) >= mddev->new_chunk_sectors)
7862	/ not really in-place - so OK /;
7863	else if (mddev->ro == `0`) {
7864	pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7865	mdname(mddev));
7866	return -EINVAL;
7867	}
7868	} else if (mddev->reshape_backwards
7869	? (here_new * chunk_sectors + min_offset_diff <=
7870	here_old * chunk_sectors)
7871	: (here_new * chunk_sectors >=
7872	here_old * chunk_sectors + (-min_offset_diff))) {
7873	/ Reading from the same stripe as writing to - bad /
7874	pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7875	mdname(mddev));
7876	return -EINVAL;
7877	}
7878	pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7879	/ OK, we should be able to continue; /
7880	} else {
7881	BUG_ON(mddev->level != mddev->new_level);
7882	BUG_ON(mddev->layout != mddev->new_layout);
7883	BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7884	BUG_ON(mddev->delta_disks != `0`);
7885	}
7886
7887	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7888	test_bit(MD_HAS_PPL, &mddev->flags)) {
7889	pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7890	mdname(mddev));
7891	clear_bit(nr: MD_HAS_PPL, addr: &mddev->flags);
7892	clear_bit(nr: MD_HAS_MULTIPLE_PPLS, addr: &mddev->flags);
7893	}
7894
7895	if (mddev->private == NULL)
7896	conf = setup_conf(mddev);
7897	else
7898	conf = mddev->private;
7899
7900	if (IS_ERR(ptr: conf))
7901	return PTR_ERR(ptr: conf);
7902
7903	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7904	if (!journal_dev) {
7905	pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7906	mdname(mddev));
7907	mddev->ro = `1`;
7908	set_disk_ro(disk: mddev->gendisk, read_only: `1`);
7909	} else if (mddev->recovery_cp == MaxSector)
7910	set_bit(nr: MD_JOURNAL_CLEAN, addr: &mddev->flags);
7911	}
7912
7913	conf->min_offset_diff = min_offset_diff;
7914	rcu_assign_pointer(mddev->thread, conf->thread);
7915	rcu_assign_pointer(conf->thread, NULL);
7916	mddev->private = conf;
7917
7918	for (i = `0`; i < conf->raid_disks && conf->previous_raid_disks;
7919	i++) {
7920	rdev = conf->disks[i].rdev;
7921	if (!rdev)
7922	continue;
7923	if (conf->disks[i].replacement &&
7924	conf->reshape_progress != MaxSector) {
7925	/ replacements and reshape simply do not mix. /
7926	pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7927	goto abort;
7928	}
7929	if (test_bit(In_sync, &rdev->flags))
7930	continue;
7931	/ This disc is not fully in-sync. However if it*
7932	* just stored parity (beyond the recovery_offset),
7933	* when we don't need to be concerned about the
7934	* array being dirty.
7935	* When reshape goes 'backwards', we never have
7936	* partially completed devices, so we only need
7937	* to worry about reshape going forwards.
7938	*/
7939	/ Hack because v0.91 doesn't store recovery_offset properly. /
7940	if (mddev->major_version == `0` &&
7941	mddev->minor_version > `90`)
7942	rdev->recovery_offset = reshape_offset;
7943
7944	if (rdev->recovery_offset < reshape_offset) {
7945	/ We need to check old and new layout /
7946	if (!only_parity(raid_disk: rdev->raid_disk,
7947	algo: conf->algorithm,
7948	raid_disks: conf->raid_disks,
7949	max_degraded: conf->max_degraded))
7950	continue;
7951	}
7952	if (!only_parity(raid_disk: rdev->raid_disk,
7953	algo: conf->prev_algo,
7954	raid_disks: conf->previous_raid_disks,
7955	max_degraded: conf->max_degraded))
7956	continue;
7957	dirty_parity_disks++;
7958	}
7959
7960	/*
7961	* 0 for a fully functional array, 1 or 2 for a degraded array.
7962	*/
7963	mddev->degraded = raid5_calc_degraded(conf);
7964
7965	if (has_failed(conf)) {
7966	pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7967	mdname(mddev), mddev->degraded, conf->raid_disks);
7968	goto abort;
7969	}
7970
7971	/ device size must be a multiple of chunk size /
7972	mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - `1`);
7973	mddev->resync_max_sectors = mddev->dev_sectors;
7974
7975	if (mddev->degraded > dirty_parity_disks &&
7976	mddev->recovery_cp != MaxSector) {
7977	if (test_bit(MD_HAS_PPL, &mddev->flags))
7978	pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
7979	mdname(mddev));
7980	else if (mddev->ok_start_degraded)
7981	pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7982	mdname(mddev));
7983	else {
7984	pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7985	mdname(mddev));
7986	goto abort;
7987	}
7988	}
7989
7990	pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7991	mdname(mddev), conf->level,
7992	mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7993	mddev->new_layout);
7994
7995	print_raid5_conf(conf);
7996
7997	if (conf->reshape_progress != MaxSector) {
7998	conf->reshape_safe = conf->reshape_progress;
7999	atomic_set(v: &conf->reshape_stripes, i: `0`);
8000	clear_bit(nr: MD_RECOVERY_SYNC, addr: &mddev->recovery);
8001	clear_bit(nr: MD_RECOVERY_CHECK, addr: &mddev->recovery);
8002	set_bit(nr: MD_RECOVERY_RESHAPE, addr: &mddev->recovery);
8003	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
8004	}
8005
8006	/ Ok, everything is just fine now /
8007	if (mddev->to_remove == &raid5_attrs_group)
8008	mddev->to_remove = NULL;
8009	else if (mddev->kobj.sd &&
8010	sysfs_create_group(kobj: &mddev->kobj, grp: &raid5_attrs_group))
8011	pr_warn("raid5: failed to create sysfs attributes for %s\n",
8012	mdname(mddev));
8013	md_set_array_sectors(mddev, array_sectors: raid5_size(mddev, sectors: `0`, raid_disks: `0`));
8014
8015	if (!mddev_is_dm(mddev)) {
8016	ret = raid5_set_limits(mddev);
8017	if (ret)
8018	goto abort;
8019	}
8020
8021	if (log_init(conf, journal_dev, ppl: raid5_has_ppl(conf)))
8022	goto abort;
8023
8024	return `0`;
8025	abort:
8026	md_unregister_thread(mddev, threadp: &mddev->thread);
8027	print_raid5_conf(conf);
8028	free_conf(conf);
8029	mddev->private = NULL;
8030	pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
8031	return ret;
8032	}
8033
8034	static void raid5_free(struct mddev mddev, void* *priv)
8035	{
8036	struct r5conf *conf = priv;
8037
8038	free_conf(conf);
8039	mddev->to_remove = &raid5_attrs_group;
8040	}
8041
8042	static void raid5_status(struct seq_file seq, struct* mddev *mddev)
8043	{
8044	struct r5conf *conf = mddev->private;
8045	int i;
8046
8047	lockdep_assert_held(&mddev->lock);
8048
8049	seq_printf(m: seq, fmt: " level %d, %dk chunk, algorithm %d", mddev->level,
8050	conf->chunk_sectors / `2`, mddev->layout);
8051	seq_printf (m: seq, fmt: " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
8052	for (i = `0`; i < conf->raid_disks; i++) {
8053	struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
8054
8055	seq_printf (m: seq, fmt: "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
8056	}
8057	seq_printf (m: seq, fmt: "]");
8058	}
8059
8060	static void print_raid5_conf (struct r5conf *conf)
8061	{
8062	struct md_rdev *rdev;
8063	int i;
8064
8065	pr_debug("RAID conf printout:\n");
8066	if (!conf) {
8067	pr_debug("(conf==NULL)\n");
8068	return;
8069	}
8070	pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
8071	conf->raid_disks,
8072	conf->raid_disks - conf->mddev->degraded);
8073
8074	rcu_read_lock();
8075	for (i = `0`; i < conf->raid_disks; i++) {
8076	rdev = rcu_dereference(conf->disks[i].rdev);
8077	if (rdev)
8078	pr_debug(" disk %d, o:%d, dev:%pg\n",
8079	i, !test_bit(Faulty, &rdev->flags),
8080	rdev->bdev);
8081	}
8082	rcu_read_unlock();
8083	}
8084
8085	static int raid5_spare_active(struct mddev *mddev)
8086	{
8087	int i;
8088	struct r5conf *conf = mddev->private;
8089	struct md_rdev rdev, replacement;
8090	int count = `0`;
8091	unsigned long flags;
8092
8093	for (i = `0`; i < conf->raid_disks; i++) {
8094	rdev = conf->disks[i].rdev;
8095	replacement = conf->disks[i].replacement;
8096	if (replacement
8097	&& replacement->recovery_offset == MaxSector
8098	&& !test_bit(Faulty, &replacement->flags)
8099	&& !test_and_set_bit(nr: In_sync, addr: &replacement->flags)) {
8100	/ Replacement has just become active. /
8101	if (!rdev
8102	\|\| !test_and_clear_bit(nr: In_sync, addr: &rdev->flags))
8103	count++;
8104	if (rdev) {
8105	/ Replaced device not technically faulty,*
8106	* but we need to be sure it gets removed
8107	* and never re-added.
8108	*/
8109	set_bit(nr: Faulty, addr: &rdev->flags);
8110	sysfs_notify_dirent_safe(
8111	sd: rdev->sysfs_state);
8112	}
8113	sysfs_notify_dirent_safe(sd: replacement->sysfs_state);
8114	} else if (rdev
8115	&& rdev->recovery_offset == MaxSector
8116	&& !test_bit(Faulty, &rdev->flags)
8117	&& !test_and_set_bit(nr: In_sync, addr: &rdev->flags)) {
8118	count++;
8119	sysfs_notify_dirent_safe(sd: rdev->sysfs_state);
8120	}
8121	}
8122	spin_lock_irqsave(&conf->device_lock, flags);
8123	mddev->degraded = raid5_calc_degraded(conf);
8124	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
8125	print_raid5_conf(conf);
8126	return count;
8127	}
8128
8129	static int raid5_remove_disk(struct mddev mddev, struct* md_rdev *rdev)
8130	{
8131	struct r5conf *conf = mddev->private;
8132	int err = `0`;
8133	int number = rdev->raid_disk;
8134	struct md_rdev **rdevp;
8135	struct disk_info *p;
8136	struct md_rdev *tmp;
8137
8138	print_raid5_conf(conf);
8139	if (test_bit(Journal, &rdev->flags) && conf->log) {
8140	/*
8141	* we can't wait pending write here, as this is called in
8142	* raid5d, wait will deadlock.
8143	* neilb: there is no locking about new writes here,
8144	* so this cannot be safe.
8145	*/
8146	if (atomic_read(v: &conf->active_stripes) \|\|
8147	atomic_read(v: &conf->r5c_cached_full_stripes) \|\|
8148	atomic_read(v: &conf->r5c_cached_partial_stripes)) {
8149	return -EBUSY;
8150	}
8151	log_exit(conf);
8152	return `0`;
8153	}
8154	if (unlikely(number >= conf->pool_size))
8155	return `0`;
8156	p = conf->disks + number;
8157	if (rdev == p->rdev)
8158	rdevp = &p->rdev;
8159	else if (rdev == p->replacement)
8160	rdevp = &p->replacement;
8161	else
8162	return `0`;
8163
8164	if (number >= conf->raid_disks &&
8165	conf->reshape_progress == MaxSector)
8166	clear_bit(nr: In_sync, addr: &rdev->flags);
8167
8168	if (test_bit(In_sync, &rdev->flags) \|\|
8169	atomic_read(v: &rdev->nr_pending)) {
8170	err = -EBUSY;
8171	goto abort;
8172	}
8173	/ Only remove non-faulty devices if recovery*
8174	* isn't possible.
8175	*/
8176	if (!test_bit(Faulty, &rdev->flags) &&
8177	mddev->recovery_disabled != conf->recovery_disabled &&
8178	!has_failed(conf) &&
8179	(!p->replacement \|\| p->replacement == rdev) &&
8180	number < conf->raid_disks) {
8181	err = -EBUSY;
8182	goto abort;
8183	}
8184	WRITE_ONCE(*rdevp, NULL);
8185	if (!err) {
8186	err = log_modify(conf, rdev, add: false);
8187	if (err)
8188	goto abort;
8189	}
8190
8191	tmp = p->replacement;
8192	if (tmp) {
8193	/ We must have just cleared 'rdev' /
8194	WRITE_ONCE(p->rdev, tmp);
8195	clear_bit(nr: Replacement, addr: &tmp->flags);
8196	WRITE_ONCE(p->replacement, NULL);
8197
8198	if (!err)
8199	err = log_modify(conf, rdev: tmp, add: true);
8200	}
8201
8202	clear_bit(nr: WantReplacement, addr: &rdev->flags);
8203	abort:
8204
8205	print_raid5_conf(conf);
8206	return err;
8207	}
8208
8209	static int raid5_add_disk(struct mddev mddev, struct* md_rdev *rdev)
8210	{
8211	struct r5conf *conf = mddev->private;
8212	int ret, err = -EEXIST;
8213	int disk;
8214	struct disk_info *p;
8215	struct md_rdev *tmp;
8216	int first = `0`;
8217	int last = conf->raid_disks - `1`;
8218
8219	if (test_bit(Journal, &rdev->flags)) {
8220	if (conf->log)
8221	return -EBUSY;
8222
8223	rdev->raid_disk = `0`;
8224	/*
8225	* The array is in readonly mode if journal is missing, so no
8226	* write requests running. We should be safe
8227	*/
8228	ret = log_init(conf, journal_dev: rdev, ppl: false);
8229	if (ret)
8230	return ret;
8231
8232	ret = r5l_start(log: conf->log);
8233	if (ret)
8234	return ret;
8235
8236	return `0`;
8237	}
8238	if (mddev->recovery_disabled == conf->recovery_disabled)
8239	return -EBUSY;
8240
8241	if (rdev->saved_raid_disk < `0` && has_failed(conf))
8242	/ no point adding a device /
8243	return -EINVAL;
8244
8245	if (rdev->raid_disk >= `0`)
8246	first = last = rdev->raid_disk;
8247
8248	/*
8249	* find the disk ... but prefer rdev->saved_raid_disk
8250	* if possible.
8251	*/
8252	if (rdev->saved_raid_disk >= first &&
8253	rdev->saved_raid_disk <= last &&
8254	conf->disks[rdev->saved_raid_disk].rdev == NULL)
8255	first = rdev->saved_raid_disk;
8256
8257	for (disk = first; disk <= last; disk++) {
8258	p = conf->disks + disk;
8259	if (p->rdev == NULL) {
8260	clear_bit(nr: In_sync, addr: &rdev->flags);
8261	rdev->raid_disk = disk;
8262	if (rdev->saved_raid_disk != disk)
8263	conf->fullsync = `1`;
8264	WRITE_ONCE(p->rdev, rdev);
8265
8266	err = log_modify(conf, rdev, add: true);
8267
8268	goto out;
8269	}
8270	}
8271	for (disk = first; disk <= last; disk++) {
8272	p = conf->disks + disk;
8273	tmp = p->rdev;
8274	if (test_bit(WantReplacement, &tmp->flags) &&
8275	mddev->reshape_position == MaxSector &&
8276	p->replacement == NULL) {
8277	clear_bit(nr: In_sync, addr: &rdev->flags);
8278	set_bit(nr: Replacement, addr: &rdev->flags);
8279	rdev->raid_disk = disk;
8280	err = `0`;
8281	conf->fullsync = `1`;
8282	WRITE_ONCE(p->replacement, rdev);
8283	break;
8284	}
8285	}
8286	out:
8287	print_raid5_conf(conf);
8288	return err;
8289	}
8290
8291	static int raid5_resize(struct mddev *mddev, sector_t sectors)
8292	{
8293	/ no resync is happening, and there is enough space*
8294	* on all devices, so we can resize.
8295	* We need to make sure resync covers any new space.
8296	* If the array is shrinking we should possibly wait until
8297	* any io in the removed space completes, but it hardly seems
8298	* worth it.
8299	*/
8300	sector_t newsize;
8301	struct r5conf *conf = mddev->private;
8302
8303	if (raid5_has_log(conf) \|\| raid5_has_ppl(conf))
8304	return -EINVAL;
8305	sectors &= ~((sector_t)conf->chunk_sectors - `1`);
8306	newsize = raid5_size(mddev, sectors, raid_disks: mddev->raid_disks);
8307	if (mddev->external_size &&
8308	mddev->array_sectors > newsize)
8309	return -EINVAL;
8310	if (mddev->bitmap) {
8311	int ret = md_bitmap_resize(bitmap: mddev->bitmap, blocks: sectors, chunksize: `0`, init: `0`);
8312	if (ret)
8313	return ret;
8314	}
8315	md_set_array_sectors(mddev, array_sectors: newsize);
8316	if (sectors > mddev->dev_sectors &&
8317	mddev->recovery_cp > mddev->dev_sectors) {
8318	mddev->recovery_cp = mddev->dev_sectors;
8319	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
8320	}
8321	mddev->dev_sectors = sectors;
8322	mddev->resync_max_sectors = sectors;
8323	return `0`;
8324	}
8325
8326	static int check_stripe_cache(struct mddev *mddev)
8327	{
8328	/ Can only proceed if there are plenty of stripe_heads.*
8329	* We need a minimum of one full stripe,, and for sensible progress
8330	* it is best to have about 4 times that.
8331	* If we require 4 times, then the default 256 4K stripe_heads will
8332	* allow for chunk sizes up to 256K, which is probably OK.
8333	* If the chunk size is greater, user-space should request more
8334	* stripe_heads first.
8335	*/
8336	struct r5conf *conf = mddev->private;
8337	if (((mddev->chunk_sectors << `9`) / RAID5_STRIPE_SIZE(conf)) * `4`
8338	> conf->min_nr_stripes \|\|
8339	((mddev->new_chunk_sectors << `9`) / RAID5_STRIPE_SIZE(conf)) * `4`
8340	> conf->min_nr_stripes) {
8341	pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
8342	mdname(mddev),
8343	((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << `9`)
8344	/ RAID5_STRIPE_SIZE(conf))*`4`);
8345	return `0`;
8346	}
8347	return `1`;
8348	}
8349
8350	static int check_reshape(struct mddev *mddev)
8351	{
8352	struct r5conf *conf = mddev->private;
8353
8354	if (raid5_has_log(conf) \|\| raid5_has_ppl(conf))
8355	return -EINVAL;
8356	if (mddev->delta_disks == `0` &&
8357	mddev->new_layout == mddev->layout &&
8358	mddev->new_chunk_sectors == mddev->chunk_sectors)
8359	return `0`; / nothing to do /
8360	if (has_failed(conf))
8361	return -EINVAL;
8362	if (mddev->delta_disks < `0` && mddev->reshape_position == MaxSector) {
8363	/ We might be able to shrink, but the devices must*
8364	* be made bigger first.
8365	* For raid6, 4 is the minimum size.
8366	* Otherwise 2 is the minimum
8367	*/
8368	int min = `2`;
8369	if (mddev->level == `6`)
8370	min = `4`;
8371	if (mddev->raid_disks + mddev->delta_disks < min)
8372	return -EINVAL;
8373	}
8374
8375	if (!check_stripe_cache(mddev))
8376	return -ENOSPC;
8377
8378	if (mddev->new_chunk_sectors > mddev->chunk_sectors \|\|
8379	mddev->delta_disks > `0`)
8380	if (resize_chunks(conf,
8381	new_disks: conf->previous_raid_disks
8382	+ max(`0`, mddev->delta_disks),
8383	max(mddev->new_chunk_sectors,
8384	mddev->chunk_sectors)
8385	) < `0`)
8386	return -ENOMEM;
8387
8388	if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
8389	return `0`; / never bother to shrink /
8390	return resize_stripes(conf, newsize: (conf->previous_raid_disks
8391	+ mddev->delta_disks));
8392	}
8393
8394	static int raid5_start_reshape(struct mddev *mddev)
8395	{
8396	struct r5conf *conf = mddev->private;
8397	struct md_rdev *rdev;
8398	int spares = `0`;
8399	int i;
8400	unsigned long flags;
8401
8402	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
8403	return -EBUSY;
8404
8405	if (!check_stripe_cache(mddev))
8406	return -ENOSPC;
8407
8408	if (has_failed(conf))
8409	return -EINVAL;
8410
8411	/ raid5 can't handle concurrent reshape and recovery /
8412	if (mddev->recovery_cp < MaxSector)
8413	return -EBUSY;
8414	for (i = `0`; i < conf->raid_disks; i++)
8415	if (conf->disks[i].replacement)
8416	return -EBUSY;
8417
8418	rdev_for_each(rdev, mddev) {
8419	if (!test_bit(In_sync, &rdev->flags)
8420	&& !test_bit(Faulty, &rdev->flags))
8421	spares++;
8422	}
8423
8424	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
8425	/ Not enough devices even to make a degraded array*
8426	* of that size
8427	*/
8428	return -EINVAL;
8429
8430	/ Refuse to reduce size of the array. Any reductions in*
8431	* array size must be through explicit setting of array_size
8432	* attribute.
8433	*/
8434	if (raid5_size(mddev, sectors: `0`, raid_disks: conf->raid_disks + mddev->delta_disks)
8435	< mddev->array_sectors) {
8436	pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
8437	mdname(mddev));
8438	return -EINVAL;
8439	}
8440
8441	atomic_set(v: &conf->reshape_stripes, i: `0`);
8442	spin_lock_irq(lock: &conf->device_lock);
8443	write_seqcount_begin(&conf->gen_lock);
8444	conf->previous_raid_disks = conf->raid_disks;
8445	conf->raid_disks += mddev->delta_disks;
8446	conf->prev_chunk_sectors = conf->chunk_sectors;
8447	conf->chunk_sectors = mddev->new_chunk_sectors;
8448	conf->prev_algo = conf->algorithm;
8449	conf->algorithm = mddev->new_layout;
8450	conf->generation++;
8451	/ Code that selects data_offset needs to see the generation update*
8452	* if reshape_progress has been set - so a memory barrier needed.
8453	*/
8454	smp_mb();
8455	if (mddev->reshape_backwards)
8456	conf->reshape_progress = raid5_size(mddev, sectors: `0`, raid_disks: `0`);
8457	else
8458	conf->reshape_progress = `0`;
8459	conf->reshape_safe = conf->reshape_progress;
8460	write_seqcount_end(&conf->gen_lock);
8461	spin_unlock_irq(lock: &conf->device_lock);
8462
8463	/ Now make sure any requests that proceeded on the assumption*
8464	* the reshape wasn't running - like Discard or Read - have
8465	* completed.
8466	*/
8467	raid5_quiesce(mddev, quiesce: true);
8468	raid5_quiesce(mddev, quiesce: false);
8469
8470	/ Add some new drives, as many as will fit.*
8471	* We know there are enough to make the newly sized array work.
8472	* Don't add devices if we are reducing the number of
8473	* devices in the array. This is because it is not possible
8474	* to correctly record the "partially reconstructed" state of
8475	* such devices during the reshape and confusion could result.
8476	*/
8477	if (mddev->delta_disks >= `0`) {
8478	rdev_for_each(rdev, mddev)
8479	if (rdev->raid_disk < `0` &&
8480	!test_bit(Faulty, &rdev->flags)) {
8481	if (raid5_add_disk(mddev, rdev) == `0`) {
8482	if (rdev->raid_disk
8483	>= conf->previous_raid_disks)
8484	set_bit(nr: In_sync, addr: &rdev->flags);
8485	else
8486	rdev->recovery_offset = `0`;
8487
8488	/ Failure here is OK /
8489	sysfs_link_rdev(mddev, rdev);
8490	}
8491	} else if (rdev->raid_disk >= conf->previous_raid_disks
8492	&& !test_bit(Faulty, &rdev->flags)) {
8493	/ This is a spare that was manually added /
8494	set_bit(nr: In_sync, addr: &rdev->flags);
8495	}
8496
8497	/ When a reshape changes the number of devices,*
8498	* ->degraded is measured against the larger of the
8499	* pre and post number of devices.
8500	*/
8501	spin_lock_irqsave(&conf->device_lock, flags);
8502	mddev->degraded = raid5_calc_degraded(conf);
8503	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
8504	}
8505	mddev->raid_disks = conf->raid_disks;
8506	mddev->reshape_position = conf->reshape_progress;
8507	set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags);
8508
8509	clear_bit(nr: MD_RECOVERY_SYNC, addr: &mddev->recovery);
8510	clear_bit(nr: MD_RECOVERY_CHECK, addr: &mddev->recovery);
8511	clear_bit(nr: MD_RECOVERY_DONE, addr: &mddev->recovery);
8512	set_bit(nr: MD_RECOVERY_RESHAPE, addr: &mddev->recovery);
8513	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
8514	conf->reshape_checkpoint = jiffies;
8515	md_new_event();
8516	return `0`;
8517	}
8518
8519	/ This is called from the reshape thread and should make any*
8520	* changes needed in 'conf'
8521	*/
8522	static void end_reshape(struct r5conf *conf)
8523	{
8524
8525	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
8526	struct md_rdev *rdev;
8527
8528	spin_lock_irq(lock: &conf->device_lock);
8529	conf->previous_raid_disks = conf->raid_disks;
8530	md_finish_reshape(mddev: conf->mddev);
8531	smp_wmb();
8532	conf->reshape_progress = MaxSector;
8533	conf->mddev->reshape_position = MaxSector;
8534	rdev_for_each(rdev, conf->mddev)
8535	if (rdev->raid_disk >= `0` &&
8536	!test_bit(Journal, &rdev->flags) &&
8537	!test_bit(In_sync, &rdev->flags))
8538	rdev->recovery_offset = MaxSector;
8539	spin_unlock_irq(lock: &conf->device_lock);
8540	wake_up(&conf->wait_for_overlap);
8541
8542	mddev_update_io_opt(mddev: conf->mddev,
8543	nr_stripes: conf->raid_disks - conf->max_degraded);
8544	}
8545	}
8546
8547	/ This is called from the raid5d thread with mddev_lock held.*
8548	* It makes config changes to the device.
8549	*/
8550	static void raid5_finish_reshape(struct mddev *mddev)
8551	{
8552	struct r5conf *conf = mddev->private;
8553	struct md_rdev *rdev;
8554
8555	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8556
8557	if (mddev->delta_disks <= `0`) {
8558	int d;
8559	spin_lock_irq(lock: &conf->device_lock);
8560	mddev->degraded = raid5_calc_degraded(conf);
8561	spin_unlock_irq(lock: &conf->device_lock);
8562	for (d = conf->raid_disks ;
8563	d < conf->raid_disks - mddev->delta_disks;
8564	d++) {
8565	rdev = conf->disks[d].rdev;
8566	if (rdev)
8567	clear_bit(nr: In_sync, addr: &rdev->flags);
8568	rdev = conf->disks[d].replacement;
8569	if (rdev)
8570	clear_bit(nr: In_sync, addr: &rdev->flags);
8571	}
8572	}
8573	mddev->layout = conf->algorithm;
8574	mddev->chunk_sectors = conf->chunk_sectors;
8575	mddev->reshape_position = MaxSector;
8576	mddev->delta_disks = `0`;
8577	mddev->reshape_backwards = `0`;
8578	}
8579	}
8580
8581	static void raid5_quiesce(struct mddev mddev, int* quiesce)
8582	{
8583	struct r5conf *conf = mddev->private;
8584
8585	if (quiesce) {
8586	/ stop all writes /
8587	lock_all_device_hash_locks_irq(conf);
8588	/ '2' tells resync/reshape to pause so that all*
8589	* active stripes can drain
8590	*/
8591	r5c_flush_cache(conf, INT_MAX);
8592	/ need a memory barrier to make sure read_one_chunk() sees*
8593	* quiesce started and reverts to slow (locked) path.
8594	*/
8595	smp_store_release(&conf->quiesce, `2`);
8596	wait_event_cmd(conf->wait_for_quiescent,
8597	atomic_read(&conf->active_stripes) == `0` &&
8598	atomic_read(&conf->active_aligned_reads) == `0`,
8599	unlock_all_device_hash_locks_irq(conf),
8600	lock_all_device_hash_locks_irq(conf));
8601	conf->quiesce = `1`;
8602	unlock_all_device_hash_locks_irq(conf);
8603	/ allow reshape to continue /
8604	wake_up(&conf->wait_for_overlap);
8605	} else {
8606	/ re-enable writes /
8607	lock_all_device_hash_locks_irq(conf);
8608	conf->quiesce = `0`;
8609	wake_up(&conf->wait_for_quiescent);
8610	wake_up(&conf->wait_for_overlap);
8611	unlock_all_device_hash_locks_irq(conf);
8612	}
8613	log_quiesce(conf, quiesce);
8614	}
8615
8616	static void raid45_takeover_raid0(struct* mddev mddev, int* level)
8617	{
8618	struct r0conf *raid0_conf = mddev->private;
8619	sector_t sectors;
8620
8621	/ for raid0 takeover only one zone is supported /
8622	if (raid0_conf->nr_strip_zones > `1`) {
8623	pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8624	mdname(mddev));
8625	return ERR_PTR(error: -EINVAL);
8626	}
8627
8628	sectors = raid0_conf->strip_zone[`0`].zone_end;
8629	sector_div(sectors, raid0_conf->strip_zone[`0`].nb_dev);
8630	mddev->dev_sectors = sectors;
8631	mddev->new_level = level;
8632	mddev->new_layout = ALGORITHM_PARITY_N;
8633	mddev->new_chunk_sectors = mddev->chunk_sectors;
8634	mddev->raid_disks += `1`;
8635	mddev->delta_disks = `1`;
8636	/ make sure it will be not marked as dirty /
8637	mddev->recovery_cp = MaxSector;
8638
8639	return setup_conf(mddev);
8640	}
8641
8642	static void raid5_takeover_raid1(struct* mddev *mddev)
8643	{
8644	int chunksect;
8645	void *ret;
8646
8647	if (mddev->raid_disks != `2` \|\|
8648	mddev->degraded > `1`)
8649	return ERR_PTR(error: -EINVAL);
8650
8651	/ Should check if there are write-behind devices? /
8652
8653	chunksect = `64``2`; /* 64K by default /
8654
8655	/ The array must be an exact multiple of chunksize /
8656	while (chunksect && (mddev->array_sectors & (chunksect-`1`)))
8657	chunksect >>= `1`;
8658
8659	if ((chunksect<<`9`) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
8660	/ array size does not allow a suitable chunk size /
8661	return ERR_PTR(error: -EINVAL);
8662
8663	mddev->new_level = `5`;
8664	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8665	mddev->new_chunk_sectors = chunksect;
8666
8667	ret = setup_conf(mddev);
8668	if (!IS_ERR(ptr: ret))
8669	mddev_clear_unsupported_flags(mddev,
8670	UNSUPPORTED_MDDEV_FLAGS);
8671	return ret;
8672	}
8673
8674	static void raid5_takeover_raid6(struct* mddev *mddev)
8675	{
8676	int new_layout;
8677
8678	switch (mddev->layout) {
8679	case ALGORITHM_LEFT_ASYMMETRIC_6:
8680	new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8681	break;
8682	case ALGORITHM_RIGHT_ASYMMETRIC_6:
8683	new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8684	break;
8685	case ALGORITHM_LEFT_SYMMETRIC_6:
8686	new_layout = ALGORITHM_LEFT_SYMMETRIC;
8687	break;
8688	case ALGORITHM_RIGHT_SYMMETRIC_6:
8689	new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8690	break;
8691	case ALGORITHM_PARITY_0_6:
8692	new_layout = ALGORITHM_PARITY_0;
8693	break;
8694	case ALGORITHM_PARITY_N:
8695	new_layout = ALGORITHM_PARITY_N;
8696	break;
8697	default:
8698	return ERR_PTR(error: -EINVAL);
8699	}
8700	mddev->new_level = `5`;
8701	mddev->new_layout = new_layout;
8702	mddev->delta_disks = -`1`;
8703	mddev->raid_disks -= `1`;
8704	return setup_conf(mddev);
8705	}
8706
8707	static int raid5_check_reshape(struct mddev *mddev)
8708	{
8709	/ For a 2-drive array, the layout and chunk size can be changed*
8710	* immediately as not restriping is needed.
8711	* For larger arrays we record the new value - after validation
8712	* to be used by a reshape pass.
8713	*/
8714	struct r5conf *conf = mddev->private;
8715	int new_chunk = mddev->new_chunk_sectors;
8716
8717	if (mddev->new_layout >= `0` && !algorithm_valid_raid5(layout: mddev->new_layout))
8718	return -EINVAL;
8719	if (new_chunk > `0`) {
8720	if (!is_power_of_2(n: new_chunk))
8721	return -EINVAL;
8722	if (new_chunk < (PAGE_SIZE>>`9`))
8723	return -EINVAL;
8724	if (mddev->array_sectors & (new_chunk-`1`))
8725	/ not factor of array size /
8726	return -EINVAL;
8727	}
8728
8729	/ They look valid /
8730
8731	if (mddev->raid_disks == `2`) {
8732	/ can make the change immediately /
8733	if (mddev->new_layout >= `0`) {
8734	conf->algorithm = mddev->new_layout;
8735	mddev->layout = mddev->new_layout;
8736	}
8737	if (new_chunk > `0`) {
8738	conf->chunk_sectors = new_chunk ;
8739	mddev->chunk_sectors = new_chunk;
8740	}
8741	set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags);
8742	md_wakeup_thread(thread: mddev->thread);
8743	}
8744	return check_reshape(mddev);
8745	}
8746
8747	static int raid6_check_reshape(struct mddev *mddev)
8748	{
8749	int new_chunk = mddev->new_chunk_sectors;
8750
8751	if (mddev->new_layout >= `0` && !algorithm_valid_raid6(layout: mddev->new_layout))
8752	return -EINVAL;
8753	if (new_chunk > `0`) {
8754	if (!is_power_of_2(n: new_chunk))
8755	return -EINVAL;
8756	if (new_chunk < (PAGE_SIZE >> `9`))
8757	return -EINVAL;
8758	if (mddev->array_sectors & (new_chunk-`1`))
8759	/ not factor of array size /
8760	return -EINVAL;
8761	}
8762
8763	/ They look valid /
8764	return check_reshape(mddev);
8765	}
8766
8767	static void raid5_takeover(struct* mddev *mddev)
8768	{
8769	/ raid5 can take over:*
8770	* raid0 - if there is only one strip zone - make it a raid4 layout
8771	* raid1 - if there are two drives. We need to know the chunk size
8772	* raid4 - trivial - just use a raid4 layout.
8773	* raid6 - Providing it is a *_6 layout
8774	*/
8775	if (mddev->level == `0`)
8776	return raid45_takeover_raid0(mddev, level: `5`);
8777	if (mddev->level == `1`)
8778	return raid5_takeover_raid1(mddev);
8779	if (mddev->level == `4`) {
8780	mddev->new_layout = ALGORITHM_PARITY_N;
8781	mddev->new_level = `5`;
8782	return setup_conf(mddev);
8783	}
8784	if (mddev->level == `6`)
8785	return raid5_takeover_raid6(mddev);
8786
8787	return ERR_PTR(error: -EINVAL);
8788	}
8789
8790	static void raid4_takeover(struct* mddev *mddev)
8791	{
8792	/ raid4 can take over:*
8793	* raid0 - if there is only one strip zone
8794	* raid5 - if layout is right
8795	*/
8796	if (mddev->level == `0`)
8797	return raid45_takeover_raid0(mddev, level: `4`);
8798	if (mddev->level == `5` &&
8799	mddev->layout == ALGORITHM_PARITY_N) {
8800	mddev->new_layout = `0`;
8801	mddev->new_level = `4`;
8802	return setup_conf(mddev);
8803	}
8804	return ERR_PTR(error: -EINVAL);
8805	}
8806
8807	static struct md_personality raid5_personality;
8808
8809	static void raid6_takeover(struct* mddev *mddev)
8810	{
8811	/ Currently can only take over a raid5. We map the*
8812	* personality to an equivalent raid6 personality
8813	* with the Q block at the end.
8814	*/
8815	int new_layout;
8816
8817	if (mddev->pers != &raid5_personality)
8818	return ERR_PTR(error: -EINVAL);
8819	if (mddev->degraded > `1`)
8820	return ERR_PTR(error: -EINVAL);
8821	if (mddev->raid_disks > `253`)
8822	return ERR_PTR(error: -EINVAL);
8823	if (mddev->raid_disks < `3`)
8824	return ERR_PTR(error: -EINVAL);
8825
8826	switch (mddev->layout) {
8827	case ALGORITHM_LEFT_ASYMMETRIC:
8828	new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8829	break;
8830	case ALGORITHM_RIGHT_ASYMMETRIC:
8831	new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8832	break;
8833	case ALGORITHM_LEFT_SYMMETRIC:
8834	new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8835	break;
8836	case ALGORITHM_RIGHT_SYMMETRIC:
8837	new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8838	break;
8839	case ALGORITHM_PARITY_0:
8840	new_layout = ALGORITHM_PARITY_0_6;
8841	break;
8842	case ALGORITHM_PARITY_N:
8843	new_layout = ALGORITHM_PARITY_N;
8844	break;
8845	default:
8846	return ERR_PTR(error: -EINVAL);
8847	}
8848	mddev->new_level = `6`;
8849	mddev->new_layout = new_layout;
8850	mddev->delta_disks = `1`;
8851	mddev->raid_disks += `1`;
8852	return setup_conf(mddev);
8853	}
8854
8855	static int raid5_change_consistency_policy(struct mddev mddev, const* char *buf)
8856	{
8857	struct r5conf *conf;
8858	int err;
8859
8860	err = mddev_suspend_and_lock(mddev);
8861	if (err)
8862	return err;
8863	conf = mddev->private;
8864	if (!conf) {
8865	mddev_unlock_and_resume(mddev);
8866	return -ENODEV;
8867	}
8868
8869	if (strncmp(buf, "ppl", `3`) == `0`) {
8870	/ ppl only works with RAID 5 /
8871	if (!raid5_has_ppl(conf) && conf->level == `5`) {
8872	err = log_init(conf, NULL, ppl: true);
8873	if (!err) {
8874	err = resize_stripes(conf, newsize: conf->pool_size);
8875	if (err)
8876	log_exit(conf);
8877	}
8878	} else
8879	err = -EINVAL;
8880	} else if (strncmp(buf, "resync", `6`) == `0`) {
8881	if (raid5_has_ppl(conf)) {
8882	log_exit(conf);
8883	err = resize_stripes(conf, newsize: conf->pool_size);
8884	} else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
8885	r5l_log_disk_error(conf)) {
8886	bool journal_dev_exists = false;
8887	struct md_rdev *rdev;
8888
8889	rdev_for_each(rdev, mddev)
8890	if (test_bit(Journal, &rdev->flags)) {
8891	journal_dev_exists = true;
8892	break;
8893	}
8894
8895	if (!journal_dev_exists)
8896	clear_bit(nr: MD_HAS_JOURNAL, addr: &mddev->flags);
8897	else / need remove journal device first /
8898	err = -EBUSY;
8899	} else
8900	err = -EINVAL;
8901	} else {
8902	err = -EINVAL;
8903	}
8904
8905	if (!err)
8906	md_update_sb(mddev, force: `1`);
8907
8908	mddev_unlock_and_resume(mddev);
8909
8910	return err;
8911	}
8912
8913	static int raid5_start(struct mddev *mddev)
8914	{
8915	struct r5conf *conf = mddev->private;
8916
8917	return r5l_start(log: conf->log);
8918	}
8919
8920	/*
8921	* This is only used for dm-raid456, caller already frozen sync_thread, hence
8922	* if rehsape is still in progress, io that is waiting for reshape can never be
8923	* done now, hence wake up and handle those IO.
8924	*/
8925	static void raid5_prepare_suspend(struct mddev *mddev)
8926	{
8927	struct r5conf *conf = mddev->private;
8928
8929	wake_up(&conf->wait_for_overlap);
8930	}
8931
8932	static struct md_personality raid6_personality =
8933	{
8934	.name = "raid6",
8935	.level = `6`,
8936	.owner = THIS_MODULE,
8937	.make_request = raid5_make_request,
8938	.run = raid5_run,
8939	.start = raid5_start,
8940	.free = raid5_free,
8941	.status = raid5_status,
8942	.error_handler = raid5_error,
8943	.hot_add_disk = raid5_add_disk,
8944	.hot_remove_disk= raid5_remove_disk,
8945	.spare_active = raid5_spare_active,
8946	.sync_request = raid5_sync_request,
8947	.resize = raid5_resize,
8948	.size = raid5_size,
8949	.check_reshape = raid6_check_reshape,
8950	.start_reshape = raid5_start_reshape,
8951	.finish_reshape = raid5_finish_reshape,
8952	.quiesce = raid5_quiesce,
8953	.takeover = raid6_takeover,
8954	.change_consistency_policy = raid5_change_consistency_policy,
8955	.prepare_suspend = raid5_prepare_suspend,
8956	};
8957	static struct md_personality raid5_personality =
8958	{
8959	.name = "raid5",
8960	.level = `5`,
8961	.owner = THIS_MODULE,
8962	.make_request = raid5_make_request,
8963	.run = raid5_run,
8964	.start = raid5_start,
8965	.free = raid5_free,
8966	.status = raid5_status,
8967	.error_handler = raid5_error,
8968	.hot_add_disk = raid5_add_disk,
8969	.hot_remove_disk= raid5_remove_disk,
8970	.spare_active = raid5_spare_active,
8971	.sync_request = raid5_sync_request,
8972	.resize = raid5_resize,
8973	.size = raid5_size,
8974	.check_reshape = raid5_check_reshape,
8975	.start_reshape = raid5_start_reshape,
8976	.finish_reshape = raid5_finish_reshape,
8977	.quiesce = raid5_quiesce,
8978	.takeover = raid5_takeover,
8979	.change_consistency_policy = raid5_change_consistency_policy,
8980	.prepare_suspend = raid5_prepare_suspend,
8981	};
8982
8983	static struct md_personality raid4_personality =
8984	{
8985	.name = "raid4",
8986	.level = `4`,
8987	.owner = THIS_MODULE,
8988	.make_request = raid5_make_request,
8989	.run = raid5_run,
8990	.start = raid5_start,
8991	.free = raid5_free,
8992	.status = raid5_status,
8993	.error_handler = raid5_error,
8994	.hot_add_disk = raid5_add_disk,
8995	.hot_remove_disk= raid5_remove_disk,
8996	.spare_active = raid5_spare_active,
8997	.sync_request = raid5_sync_request,
8998	.resize = raid5_resize,
8999	.size = raid5_size,
9000	.check_reshape = raid5_check_reshape,
9001	.start_reshape = raid5_start_reshape,
9002	.finish_reshape = raid5_finish_reshape,
9003	.quiesce = raid5_quiesce,
9004	.takeover = raid4_takeover,
9005	.change_consistency_policy = raid5_change_consistency_policy,
9006	.prepare_suspend = raid5_prepare_suspend,
9007	};
9008
9009	static int __init raid5_init(void)
9010	{
9011	int ret;
9012
9013	raid5_wq = alloc_workqueue(fmt: "raid5wq",
9014	flags: WQ_UNBOUND\|WQ_MEM_RECLAIM\|WQ_CPU_INTENSIVE\|WQ_SYSFS, max_active: `0`);
9015	if (!raid5_wq)
9016	return -ENOMEM;
9017
9018	ret = cpuhp_setup_state_multi(state: CPUHP_MD_RAID5_PREPARE,
9019	name: "md/raid5:prepare",
9020	startup: raid456_cpu_up_prepare,
9021	teardown: raid456_cpu_dead);
9022	if (ret) {
9023	destroy_workqueue(wq: raid5_wq);
9024	return ret;
9025	}
9026	register_md_personality(p: &raid6_personality);
9027	register_md_personality(p: &raid5_personality);
9028	register_md_personality(p: &raid4_personality);
9029	return `0`;
9030	}
9031
9032	static void raid5_exit(void)
9033	{
9034	unregister_md_personality(p: &raid6_personality);
9035	unregister_md_personality(p: &raid5_personality);
9036	unregister_md_personality(p: &raid4_personality);
9037	cpuhp_remove_multi_state(state: CPUHP_MD_RAID5_PREPARE);
9038	destroy_workqueue(wq: raid5_wq);
9039	}
9040
9041	module_init(raid5_init);
9042	module_exit(raid5_exit);
9043	MODULE_LICENSE("GPL");
9044	MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
9045	MODULE_ALIAS("md-personality-4"); / RAID5 /
9046	MODULE_ALIAS("md-raid5");
9047	MODULE_ALIAS("md-raid4");
9048	MODULE_ALIAS("md-level-5");
9049	MODULE_ALIAS("md-level-4");
9050	MODULE_ALIAS("md-personality-8"); / RAID6 /
9051	MODULE_ALIAS("md-raid6");
9052	MODULE_ALIAS("md-level-6");
9053
9054	/ This used to be two separate modules, they were: /
9055	MODULE_ALIAS("raid5");
9056	MODULE_ALIAS("raid6");
9057

source code of linux/drivers/md/raid5.c