raid5.c source code [linux/drivers/md/raid5.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* raid5.c : Multiple Devices driver for Linux
4	* Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
5	* Copyright (C) 1999, 2000 Ingo Molnar
6	* Copyright (C) 2002, 2003 H. Peter Anvin
7	*
8	* RAID-4/5/6 management functions.
9	* Thanks to Penguin Computing for making the RAID-6 development possible
10	* by donating a test server!
11	*/
12
13	/*
14	* BITMAP UNPLUGGING:
15	*
16	* The sequencing for updating the bitmap reliably is a little
17	* subtle (and I got it wrong the first time) so it deserves some
18	* explanation.
19	*
20	* We group bitmap updates into batches. Each batch has a number.
21	* We may write out several batches at once, but that isn't very important.
22	* conf->seq_write is the number of the last batch successfully written.
23	* conf->seq_flush is the number of the last batch that was closed to
24	* new additions.
25	* When we discover that we will need to write to any block in a stripe
26	* (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
27	* the number of the batch it will be in. This is seq_flush+1.
28	* When we are ready to do a write, if that batch hasn't been written yet,
29	* we plug the array and queue the stripe for later.
30	* When an unplug happens, we increment seq_flush, thus closing the current
31	* batch.
32	* When we notice that seq_flush > seq_write, we write out all pending updates
33	* to the bitmap, and advance seq_write to where seq_flush was.
34	* This may occasionally write a bit out twice, but is sure never to
35	* miss any bits.
36	*/
37
38	#include <linux/blkdev.h>
39	#include <linux/delay.h>
40	#include <linux/kthread.h>
41	#include <linux/raid/pq.h>
42	#include <linux/async_tx.h>
43	#include <linux/module.h>
44	#include <linux/async.h>
45	#include <linux/seq_file.h>
46	#include <linux/cpu.h>
47	#include <linux/slab.h>
48	#include <linux/ratelimit.h>
49	#include <linux/nodemask.h>
50
51	#include <trace/events/block.h>
52	#include <linux/list_sort.h>
53
54	#include "md.h"
55	#include "raid5.h"
56	#include "raid0.h"
57	#include "md-bitmap.h"
58	#include "raid5-log.h"
59
60	#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
61
62	#define cpu_to_group(cpu) cpu_to_node(cpu)
63	#define ANY_GROUP NUMA_NO_NODE
64
65	#define RAID5_MAX_REQ_STRIPES 256
66
67	static bool devices_handle_discard_safely = false;
68	module_param(devices_handle_discard_safely, bool, `0644`);
69	MODULE_PARM_DESC(devices_handle_discard_safely,
70	"Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
71	static struct workqueue_struct *raid5_wq;
72
73	static void raid5_quiesce(struct mddev mddev, int* quiesce);
74
75	static inline struct hlist_head stripe_hash(struct* r5conf *conf, sector_t sect)
76	{
77	int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
78	return &conf->stripe_hashtbl[hash];
79	}
80
81	static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
82	{
83	return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
84	}
85
86	static inline void lock_device_hash_lock(struct r5conf conf, int* hash)
87	__acquires(&conf->device_lock)
88	{
89	spin_lock_irq(lock: conf->hash_locks + hash);
90	spin_lock(lock: &conf->device_lock);
91	}
92
93	static inline void unlock_device_hash_lock(struct r5conf conf, int* hash)
94	__releases(&conf->device_lock)
95	{
96	spin_unlock(lock: &conf->device_lock);
97	spin_unlock_irq(lock: conf->hash_locks + hash);
98	}
99
100	static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
101	__acquires(&conf->device_lock)
102	{
103	int i;
104	spin_lock_irq(lock: conf->hash_locks);
105	for (i = `1`; i < NR_STRIPE_HASH_LOCKS; i++)
106	spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
107	spin_lock(lock: &conf->device_lock);
108	}
109
110	static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
111	__releases(&conf->device_lock)
112	{
113	int i;
114	spin_unlock(lock: &conf->device_lock);
115	for (i = NR_STRIPE_HASH_LOCKS - `1`; i; i--)
116	spin_unlock(lock: conf->hash_locks + i);
117	spin_unlock_irq(lock: conf->hash_locks);
118	}
119
120	/ Find first data disk in a raid6 stripe /
121	static inline int raid6_d0(struct stripe_head *sh)
122	{
123	if (sh->ddf_layout)
124	/ ddf always start from first device /
125	return `0`;
126	/ md starts just after Q block /
127	if (sh->qd_idx == sh->disks - `1`)
128	return `0`;
129	else
130	return sh->qd_idx + `1`;
131	}
/*
 * Return the disk index following @disk, wrapping around to 0 once
 * @raid_disks is reached.
 */
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}
137
138	/ When walking through the disks in a raid5, starting at raid6_d0,*
139	* We need to map each disk to a 'slot', where the data disks are slot
140	* 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
141	* is raid_disks-1. This help does that mapping.
142	*/
143	static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
144	int count, int* syndrome_disks)
145	{
146	int slot = *count;
147
148	if (sh->ddf_layout)
149	(*count)++;
150	if (idx == sh->pd_idx)
151	return syndrome_disks;
152	if (idx == sh->qd_idx)
153	return syndrome_disks + `1`;
154	if (!sh->ddf_layout)
155	(*count)++;
156	return slot;
157	}
158
159	static void print_raid5_conf (struct r5conf *conf);
160
161	static int stripe_operations_active(struct stripe_head *sh)
162	{
163	return sh->check_state \|\| sh->reconstruct_state \|\|
164	test_bit(STRIPE_BIOFILL_RUN, &sh->state) \|\|
165	test_bit(STRIPE_COMPUTE_RUN, &sh->state);
166	}
167
168	static bool stripe_is_lowprio(struct stripe_head *sh)
169	{
170	return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) \|\|
171	test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
172	!test_bit(STRIPE_R5C_CACHING, &sh->state);
173	}
174
175	static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
176	__must_hold(&sh->raid_conf->device_lock)
177	{
178	struct r5conf *conf = sh->raid_conf;
179	struct r5worker_group *group;
180	int thread_cnt;
181	int i, cpu = sh->cpu;
182
183	if (!cpu_online(cpu)) {
184	cpu = cpumask_any(cpu_online_mask);
185	sh->cpu = cpu;
186	}
187
188	if (list_empty(head: &sh->lru)) {
189	struct r5worker_group *group;
190	group = conf->worker_groups + cpu_to_group(cpu);
191	if (stripe_is_lowprio(sh))
192	list_add_tail(new: &sh->lru, head: &group->loprio_list);
193	else
194	list_add_tail(new: &sh->lru, head: &group->handle_list);
195	group->stripes_cnt++;
196	sh->group = group;
197	}
198
199	if (conf->worker_cnt_per_group == `0`) {
200	md_wakeup_thread(thread: conf->mddev->thread);
201	return;
202	}
203
204	group = conf->worker_groups + cpu_to_group(sh->cpu);
205
206	group->workers[`0`].working = true;
207	/ at least one worker should run to avoid race /
208	queue_work_on(cpu: sh->cpu, wq: raid5_wq, work: &group->workers[`0`].work);
209
210	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - `1`;
211	/ wakeup more workers /
212	for (i = `1`; i < conf->worker_cnt_per_group && thread_cnt > `0`; i++) {
213	if (group->workers[i].working == false) {
214	group->workers[i].working = true;
215	queue_work_on(cpu: sh->cpu, wq: raid5_wq,
216	work: &group->workers[i].work);
217	thread_cnt--;
218	}
219	}
220	}
221
222	static void do_release_stripe(struct r5conf conf, struct* stripe_head *sh,
223	struct list_head *temp_inactive_list)
224	__must_hold(&conf->device_lock)
225	{
226	int i;
227	int injournal = `0`; / number of date pages with R5_InJournal /
228
229	BUG_ON(!list_empty(&sh->lru));
230	BUG_ON(atomic_read(&conf->active_stripes)==`0`);
231
232	if (r5c_is_writeback(log: conf->log))
233	for (i = sh->disks; i--; )
234	if (test_bit(R5_InJournal, &sh->dev[i].flags))
235	injournal++;
236	/*
237	* In the following cases, the stripe cannot be released to cached
238	* lists. Therefore, we make the stripe write out and set
239	* STRIPE_HANDLE:
240	* 1. when quiesce in r5c write back;
241	* 2. when resync is requested fot the stripe.
242	*/
243	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) \|\|
244	(conf->quiesce && r5c_is_writeback(log: conf->log) &&
245	!test_bit(STRIPE_HANDLE, &sh->state) && injournal != `0`)) {
246	if (test_bit(STRIPE_R5C_CACHING, &sh->state))
247	r5c_make_stripe_write_out(sh);
248	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
249	}
250
251	if (test_bit(STRIPE_HANDLE, &sh->state)) {
252	if (test_bit(STRIPE_DELAYED, &sh->state) &&
253	!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
254	list_add_tail(new: &sh->lru, head: &conf->delayed_list);
255	else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
256	sh->bm_seq - conf->seq_write > `0`)
257	list_add_tail(new: &sh->lru, head: &conf->bitmap_list);
258	else {
259	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
260	clear_bit(nr: STRIPE_BIT_DELAY, addr: &sh->state);
261	if (conf->worker_cnt_per_group == `0`) {
262	if (stripe_is_lowprio(sh))
263	list_add_tail(new: &sh->lru,
264	head: &conf->loprio_list);
265	else
266	list_add_tail(new: &sh->lru,
267	head: &conf->handle_list);
268	} else {
269	raid5_wakeup_stripe_thread(sh);
270	return;
271	}
272	}
273	md_wakeup_thread(thread: conf->mddev->thread);
274	} else {
275	BUG_ON(stripe_operations_active(sh));
276	if (test_and_clear_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
277	if (atomic_dec_return(v: &conf->preread_active_stripes)
278	< IO_THRESHOLD)
279	md_wakeup_thread(thread: conf->mddev->thread);
280	atomic_dec(v: &conf->active_stripes);
281	if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
282	if (!r5c_is_writeback(log: conf->log))
283	list_add_tail(new: &sh->lru, head: temp_inactive_list);
284	else {
285	WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
286	if (injournal == `0`)
287	list_add_tail(new: &sh->lru, head: temp_inactive_list);
288	else if (injournal == conf->raid_disks - conf->max_degraded) {
289	/ full stripe /
290	if (!test_and_set_bit(nr: STRIPE_R5C_FULL_STRIPE, addr: &sh->state))
291	atomic_inc(v: &conf->r5c_cached_full_stripes);
292	if (test_and_clear_bit(nr: STRIPE_R5C_PARTIAL_STRIPE, addr: &sh->state))
293	atomic_dec(v: &conf->r5c_cached_partial_stripes);
294	list_add_tail(new: &sh->lru, head: &conf->r5c_full_stripe_list);
295	r5c_check_cached_full_stripe(conf);
296	} else
297	/*
298	* STRIPE_R5C_PARTIAL_STRIPE is set in
299	* r5c_try_caching_write(). No need to
300	* set it again.
301	*/
302	list_add_tail(new: &sh->lru, head: &conf->r5c_partial_stripe_list);
303	}
304	}
305	}
306	}
307
308	static void __release_stripe(struct r5conf conf, struct* stripe_head *sh,
309	struct list_head *temp_inactive_list)
310	__must_hold(&conf->device_lock)
311	{
312	if (atomic_dec_and_test(v: &sh->count))
313	do_release_stripe(conf, sh, temp_inactive_list);
314	}
315
316	/*
317	* @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
318	*
319	* Be careful: Only one task can add/delete stripes from temp_inactive_list at
320	* given time. Adding stripes only takes device lock, while deleting stripes
321	* only takes hash lock.
322	*/
323	static void release_inactive_stripe_list(struct r5conf *conf,
324	struct list_head *temp_inactive_list,
325	int hash)
326	{
327	int size;
328	bool do_wakeup = false;
329	unsigned long flags;
330
331	if (hash == NR_STRIPE_HASH_LOCKS) {
332	size = NR_STRIPE_HASH_LOCKS;
333	hash = NR_STRIPE_HASH_LOCKS - `1`;
334	} else
335	size = `1`;
336	while (size) {
337	struct list_head *list = &temp_inactive_list[size - `1`];
338
339	/*
340	* We don't hold any lock here yet, raid5_get_active_stripe() might
341	* remove stripes from the list
342	*/
343	if (!list_empty_careful(head: list)) {
344	spin_lock_irqsave(conf->hash_locks + hash, flags);
345	if (list_empty(head: conf->inactive_list + hash) &&
346	!list_empty(head: list))
347	atomic_dec(v: &conf->empty_inactive_list_nr);
348	list_splice_tail_init(list, head: conf->inactive_list + hash);
349	do_wakeup = true;
350	spin_unlock_irqrestore(lock: conf->hash_locks + hash, flags);
351	}
352	size--;
353	hash--;
354	}
355
356	if (do_wakeup) {
357	wake_up(&conf->wait_for_stripe);
358	if (atomic_read(v: &conf->active_stripes) == `0`)
359	wake_up(&conf->wait_for_quiescent);
360	if (conf->retry_read_aligned)
361	md_wakeup_thread(thread: conf->mddev->thread);
362	}
363	}
364
365	static int release_stripe_list(struct r5conf *conf,
366	struct list_head *temp_inactive_list)
367	__must_hold(&conf->device_lock)
368	{
369	struct stripe_head sh, t;
370	int count = `0`;
371	struct llist_node *head;
372
373	head = llist_del_all(head: &conf->released_stripes);
374	head = llist_reverse_order(head);
375	llist_for_each_entry_safe(sh, t, head, release_list) {
376	int hash;
377
378	/ sh could be readded after STRIPE_ON_RELEASE_LIST is cleard /
379	smp_mb();
380	clear_bit(nr: STRIPE_ON_RELEASE_LIST, addr: &sh->state);
381	/*
382	* Don't worry the bit is set here, because if the bit is set
383	* again, the count is always > 1. This is true for
384	* STRIPE_ON_UNPLUG_LIST bit too.
385	*/
386	hash = sh->hash_lock_index;
387	__release_stripe(conf, sh, temp_inactive_list: &temp_inactive_list[hash]);
388	count++;
389	}
390
391	return count;
392	}
393
394	void raid5_release_stripe(struct stripe_head *sh)
395	{
396	struct r5conf *conf = sh->raid_conf;
397	unsigned long flags;
398	struct list_head list;
399	int hash;
400	bool wakeup;
401
402	/ Avoid release_list until the last reference.*
403	*/
404	if (atomic_add_unless(v: &sh->count, a: -`1`, u: `1`))
405	return;
406
407	if (unlikely(!conf->mddev->thread) \|\|
408	test_and_set_bit(nr: STRIPE_ON_RELEASE_LIST, addr: &sh->state))
409	goto slow_path;
410	wakeup = llist_add(new: &sh->release_list, head: &conf->released_stripes);
411	if (wakeup)
412	md_wakeup_thread(thread: conf->mddev->thread);
413	return;
414	slow_path:
415	/ we are ok here if STRIPE_ON_RELEASE_LIST is set or not /
416	if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) {
417	INIT_LIST_HEAD(list: &list);
418	hash = sh->hash_lock_index;
419	do_release_stripe(conf, sh, temp_inactive_list: &list);
420	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
421	release_inactive_stripe_list(conf, temp_inactive_list: &list, hash);
422	}
423	}
424
425	static inline void remove_hash(struct stripe_head *sh)
426	{
427	pr_debug("remove_hash(), stripe %llu\n",
428	(unsigned long long)sh->sector);
429
430	hlist_del_init(n: &sh->hash);
431	}
432
433	static inline void insert_hash(struct r5conf conf, struct* stripe_head *sh)
434	{
435	struct hlist_head *hp = stripe_hash(conf, sect: sh->sector);
436
437	pr_debug("insert_hash(), stripe %llu\n",
438	(unsigned long long)sh->sector);
439
440	hlist_add_head(n: &sh->hash, h: hp);
441	}
442
443	/ find an idle stripe, make sure it is unhashed, and return it. /
444	static struct stripe_head get_free_stripe(struct* r5conf conf, int* hash)
445	{
446	struct stripe_head *sh = NULL;
447	struct list_head *first;
448
449	if (list_empty(head: conf->inactive_list + hash))
450	goto out;
451	first = (conf->inactive_list + hash)->next;
452	sh = list_entry(first, struct stripe_head, lru);
453	list_del_init(entry: first);
454	remove_hash(sh);
455	atomic_inc(v: &conf->active_stripes);
456	BUG_ON(hash != sh->hash_lock_index);
457	if (list_empty(head: conf->inactive_list + hash))
458	atomic_inc(v: &conf->empty_inactive_list_nr);
459	out:
460	return sh;
461	}
462
463	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
464	static void free_stripe_pages(struct stripe_head *sh)
465	{
466	int i;
467	struct page *p;
468
469	/ Have not allocate page pool /
470	if (!sh->pages)
471	return;
472
473	for (i = `0`; i < sh->nr_pages; i++) {
474	p = sh->pages[i];
475	if (p)
476	put_page(p);
477	sh->pages[i] = NULL;
478	}
479	}
480
481	static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp)
482	{
483	int i;
484	struct page *p;
485
486	for (i = `0`; i < sh->nr_pages; i++) {
487	/ The page have allocated. /
488	if (sh->pages[i])
489	continue;
490
491	p = alloc_page(gfp);
492	if (!p) {
493	free_stripe_pages(sh);
494	return -ENOMEM;
495	}
496	sh->pages[i] = p;
497	}
498	return `0`;
499	}
500
501	static int
502	init_stripe_shared_pages(struct stripe_head sh, struct* r5conf conf, int* disks)
503	{
504	int nr_pages, cnt;
505
506	if (sh->pages)
507	return `0`;
508
509	/ Each of the sh->dev[i] need one conf->stripe_size /
510	cnt = PAGE_SIZE / conf->stripe_size;
511	nr_pages = (disks + cnt - `1`) / cnt;
512
513	sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
514	if (!sh->pages)
515	return -ENOMEM;
516	sh->nr_pages = nr_pages;
517	sh->stripes_per_page = cnt;
518	return `0`;
519	}
520	#endif
521
522	static void shrink_buffers(struct stripe_head *sh)
523	{
524	int i;
525	int num = sh->raid_conf->pool_size;
526
527	#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
528	for (i = `0`; i < num ; i++) {
529	struct page *p;
530
531	WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
532	p = sh->dev[i].page;
533	if (!p)
534	continue;
535	sh->dev[i].page = NULL;
536	put_page(page: p);
537	}
538	#else
539	for (i = `0`; i < num; i++)
540	sh->dev[i].page = NULL;
541	free_stripe_pages(sh); / Free pages /
542	#endif
543	}
544
545	static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
546	{
547	int i;
548	int num = sh->raid_conf->pool_size;
549
550	#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
551	for (i = `0`; i < num; i++) {
552	struct page *page;
553
554	if (!(page = alloc_page(gfp))) {
555	return `1`;
556	}
557	sh->dev[i].page = page;
558	sh->dev[i].orig_page = page;
559	sh->dev[i].offset = `0`;
560	}
561	#else
562	if (alloc_stripe_pages(sh, gfp))
563	return -ENOMEM;
564
565	for (i = `0`; i < num; i++) {
566	sh->dev[i].page = raid5_get_dev_page(sh, i);
567	sh->dev[i].orig_page = sh->dev[i].page;
568	sh->dev[i].offset = raid5_get_page_offset(sh, i);
569	}
570	#endif
571	return `0`;
572	}
573
574	static void stripe_set_idx(sector_t stripe, struct r5conf conf, int* previous,
575	struct stripe_head *sh);
576
577	static void init_stripe(struct stripe_head sh, sector_t sector, int* previous)
578	{
579	struct r5conf *conf = sh->raid_conf;
580	int i, seq;
581
582	BUG_ON(atomic_read(&sh->count) != `0`);
583	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
584	BUG_ON(stripe_operations_active(sh));
585	BUG_ON(sh->batch_head);
586
587	pr_debug("init_stripe called, stripe %llu\n",
588	(unsigned long long)sector);
589	retry:
590	seq = read_seqcount_begin(&conf->gen_lock);
591	sh->generation = conf->generation - previous;
592	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
593	sh->sector = sector;
594	stripe_set_idx(stripe: sector, conf, previous, sh);
595	sh->state = `0`;
596
597	for (i = sh->disks; i--; ) {
598	struct r5dev *dev = &sh->dev[i];
599
600	if (dev->toread \|\| dev->read \|\| dev->towrite \|\| dev->written \|\|
601	test_bit(R5_LOCKED, &dev->flags)) {
602	pr_err("sector=%llx i=%d %p %p %p %p %d\n",
603	(unsigned long long)sh->sector, i, dev->toread,
604	dev->read, dev->towrite, dev->written,
605	test_bit(R5_LOCKED, &dev->flags));
606	WARN_ON(`1`);
607	}
608	dev->flags = `0`;
609	dev->sector = raid5_compute_blocknr(sh, i, previous);
610	}
611	if (read_seqcount_retry(&conf->gen_lock, seq))
612	goto retry;
613	sh->overwrite_disks = `0`;
614	insert_hash(conf, sh);
615	sh->cpu = smp_processor_id();
616	set_bit(nr: STRIPE_BATCH_READY, addr: &sh->state);
617	}
618
619	static struct stripe_head __find_stripe(struct* r5conf *conf, sector_t sector,
620	short generation)
621	{
622	struct stripe_head *sh;
623
624	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
625	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
626	if (sh->sector == sector && sh->generation == generation)
627	return sh;
628	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
629	return NULL;
630	}
631
632	static struct stripe_head find_get_stripe(struct* r5conf *conf,
633	sector_t sector, short generation, int hash)
634	{
635	int inc_empty_inactive_list_flag;
636	struct stripe_head *sh;
637
638	sh = __find_stripe(conf, sector, generation);
639	if (!sh)
640	return NULL;
641
642	if (atomic_inc_not_zero(v: &sh->count))
643	return sh;
644
645	/*
646	* Slow path. The reference count is zero which means the stripe must
647	* be on a list (sh->lru). Must remove the stripe from the list that
648	* references it with the device_lock held.
649	*/
650
651	spin_lock(lock: &conf->device_lock);
652	if (!atomic_read(v: &sh->count)) {
653	if (!test_bit(STRIPE_HANDLE, &sh->state))
654	atomic_inc(v: &conf->active_stripes);
655	BUG_ON(list_empty(&sh->lru) &&
656	!test_bit(STRIPE_EXPANDING, &sh->state));
657	inc_empty_inactive_list_flag = `0`;
658	if (!list_empty(head: conf->inactive_list + hash))
659	inc_empty_inactive_list_flag = `1`;
660	list_del_init(entry: &sh->lru);
661	if (list_empty(head: conf->inactive_list + hash) &&
662	inc_empty_inactive_list_flag)
663	atomic_inc(v: &conf->empty_inactive_list_nr);
664	if (sh->group) {
665	sh->group->stripes_cnt--;
666	sh->group = NULL;
667	}
668	}
669	atomic_inc(v: &sh->count);
670	spin_unlock(lock: &conf->device_lock);
671
672	return sh;
673	}
674
675	/*
676	* Need to check if array has failed when deciding whether to:
677	* - start an array
678	* - remove non-faulty devices
679	* - add a spare
680	* - allow a reshape
681	* This determination is simple when no reshape is happening.
682	* However if there is a reshape, we need to carefully check
683	* both the before and after sections.
684	* This is because some failed devices may only affect one
685	* of the two sections, and some non-in_sync devices may
686	* be insync in the section most affected by failed devices.
687	*
688	* Most calls to this function hold &conf->device_lock. Calls
689	* in raid5_run() do not require the lock as no other threads
690	* have been started yet.
691	*/
692	int raid5_calc_degraded(struct r5conf *conf)
693	{
694	int degraded, degraded2;
695	int i;
696
697	rcu_read_lock();
698	degraded = `0`;
699	for (i = `0`; i < conf->previous_raid_disks; i++) {
700	struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
701	if (rdev && test_bit(Faulty, &rdev->flags))
702	rdev = rcu_dereference(conf->disks[i].replacement);
703	if (!rdev \|\| test_bit(Faulty, &rdev->flags))
704	degraded++;
705	else if (test_bit(In_sync, &rdev->flags))
706	;
707	else
708	/ not in-sync or faulty.*
709	* If the reshape increases the number of devices,
710	* this is being recovered by the reshape, so
711	* this 'previous' section is not in_sync.
712	* If the number of devices is being reduced however,
713	* the device can only be part of the array if
714	* we are reverting a reshape, so this section will
715	* be in-sync.
716	*/
717	if (conf->raid_disks >= conf->previous_raid_disks)
718	degraded++;
719	}
720	rcu_read_unlock();
721	if (conf->raid_disks == conf->previous_raid_disks)
722	return degraded;
723	rcu_read_lock();
724	degraded2 = `0`;
725	for (i = `0`; i < conf->raid_disks; i++) {
726	struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
727	if (rdev && test_bit(Faulty, &rdev->flags))
728	rdev = rcu_dereference(conf->disks[i].replacement);
729	if (!rdev \|\| test_bit(Faulty, &rdev->flags))
730	degraded2++;
731	else if (test_bit(In_sync, &rdev->flags))
732	;
733	else
734	/ not in-sync or faulty.*
735	* If reshape increases the number of devices, this
736	* section has already been recovered, else it
737	* almost certainly hasn't.
738	*/
739	if (conf->raid_disks <= conf->previous_raid_disks)
740	degraded2++;
741	}
742	rcu_read_unlock();
743	if (degraded2 > degraded)
744	return degraded2;
745	return degraded;
746	}
747
748	static bool has_failed(struct r5conf *conf)
749	{
750	int degraded = conf->mddev->degraded;
751
752	if (test_bit(MD_BROKEN, &conf->mddev->flags))
753	return true;
754
755	if (conf->mddev->reshape_position != MaxSector)
756	degraded = raid5_calc_degraded(conf);
757
758	return degraded > conf->max_degraded;
759	}
760
/* Outcome codes; STRIPE_SUCCESS is 0, the others follow consecutively. */
enum stripe_result {
	STRIPE_SUCCESS = 0,
	STRIPE_RETRY,
	STRIPE_SCHEDULE_AND_RETRY,
	STRIPE_FAIL,
};
767
768	struct stripe_request_ctx {
769	/ a reference to the last stripe_head for batching /
770	struct stripe_head *batch_last;
771
772	/ first sector in the request /
773	sector_t first_sector;
774
775	/ last sector in the request /
776	sector_t last_sector;
777
778	/*
779	* bitmap to track stripe sectors that have been added to stripes
780	* add one to account for unaligned requests
781	*/
782	DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + `1`);
783
784	/ the request had REQ_PREFLUSH, cleared after the first stripe_head /
785	bool do_flush;
786	};
787
788	/*
789	* Block until another thread clears R5_INACTIVE_BLOCKED or
790	* there are fewer than 3/4 the maximum number of active stripes
791	* and there is an inactive stripe available.
792	*/
793	static bool is_inactive_blocked(struct r5conf conf, int* hash)
794	{
795	if (list_empty(head: conf->inactive_list + hash))
796	return false;
797
798	if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
799	return true;
800
801	return (atomic_read(v: &conf->active_stripes) <
802	(conf->max_nr_stripes * `3` / `4`));
803	}
804
805	struct stripe_head raid5_get_active_stripe(struct* r5conf *conf,
806	struct stripe_request_ctx *ctx, sector_t sector,
807	unsigned int flags)
808	{
809	struct stripe_head *sh;
810	int hash = stripe_hash_locks_hash(conf, sect: sector);
811	int previous = !!(flags & R5_GAS_PREVIOUS);
812
813	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
814
815	spin_lock_irq(lock: conf->hash_locks + hash);
816
817	for (;;) {
818	if (!(flags & R5_GAS_NOQUIESCE) && conf->quiesce) {
819	/*
820	* Must release the reference to batch_last before
821	* waiting, on quiesce, otherwise the batch_last will
822	* hold a reference to a stripe and raid5_quiesce()
823	* will deadlock waiting for active_stripes to go to
824	* zero.
825	*/
826	if (ctx && ctx->batch_last) {
827	raid5_release_stripe(sh: ctx->batch_last);
828	ctx->batch_last = NULL;
829	}
830
831	wait_event_lock_irq(conf->wait_for_quiescent,
832	!conf->quiesce,
833	*(conf->hash_locks + hash));
834	}
835
836	sh = find_get_stripe(conf, sector, generation: conf->generation - previous,
837	hash);
838	if (sh)
839	break;
840
841	if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
842	sh = get_free_stripe(conf, hash);
843	if (sh) {
844	r5c_check_stripe_cache_usage(conf);
845	init_stripe(sh, sector, previous);
846	atomic_inc(v: &sh->count);
847	break;
848	}
849
850	if (!test_bit(R5_DID_ALLOC, &conf->cache_state))
851	set_bit(nr: R5_ALLOC_MORE, addr: &conf->cache_state);
852	}
853
854	if (flags & R5_GAS_NOBLOCK)
855	break;
856
857	set_bit(nr: R5_INACTIVE_BLOCKED, addr: &conf->cache_state);
858	r5l_wake_reclaim(log: conf->log, space: `0`);
859
860	/ release batch_last before wait to avoid risk of deadlock /
861	if (ctx && ctx->batch_last) {
862	raid5_release_stripe(sh: ctx->batch_last);
863	ctx->batch_last = NULL;
864	}
865
866	wait_event_lock_irq(conf->wait_for_stripe,
867	is_inactive_blocked(conf, hash),
868	*(conf->hash_locks + hash));
869	clear_bit(nr: R5_INACTIVE_BLOCKED, addr: &conf->cache_state);
870	}
871
872	spin_unlock_irq(lock: conf->hash_locks + hash);
873	return sh;
874	}
875
876	static bool is_full_stripe_write(struct stripe_head *sh)
877	{
878	BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
879	return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
880	}
881
882	static void lock_two_stripes(struct stripe_head sh1, struct* stripe_head *sh2)
883	__acquires(&sh1->stripe_lock)
884	__acquires(&sh2->stripe_lock)
885	{
886	if (sh1 > sh2) {
887	spin_lock_irq(lock: &sh2->stripe_lock);
888	spin_lock_nested(&sh1->stripe_lock, `1`);
889	} else {
890	spin_lock_irq(lock: &sh1->stripe_lock);
891	spin_lock_nested(&sh2->stripe_lock, `1`);
892	}
893	}
894
895	static void unlock_two_stripes(struct stripe_head sh1, struct* stripe_head *sh2)
896	__releases(&sh1->stripe_lock)
897	__releases(&sh2->stripe_lock)
898	{
899	spin_unlock(lock: &sh1->stripe_lock);
900	spin_unlock_irq(lock: &sh2->stripe_lock);
901	}
902
903	/ Only freshly new full stripe normal write stripe can be added to a batch list /
904	static bool stripe_can_batch(struct stripe_head *sh)
905	{
906	struct r5conf *conf = sh->raid_conf;
907
908	if (raid5_has_log(conf) \|\| raid5_has_ppl(conf))
909	return false;
910	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
911	!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
912	is_full_stripe_write(sh);
913	}
914
915	/ we only do back search /
916	static void stripe_add_to_batch_list(struct r5conf *conf,
917	struct stripe_head sh, struct* stripe_head *last_sh)
918	{
919	struct stripe_head *head;
920	sector_t head_sector, tmp_sec;
921	int hash;
922	int dd_idx;
923
924	/ Don't cross chunks, so stripe pd_idx/qd_idx is the same /
925	tmp_sec = sh->sector;
926	if (!sector_div(tmp_sec, conf->chunk_sectors))
927	return;
928	head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
929
930	if (last_sh && head_sector == last_sh->sector) {
931	head = last_sh;
932	atomic_inc(v: &head->count);
933	} else {
934	hash = stripe_hash_locks_hash(conf, sect: head_sector);
935	spin_lock_irq(lock: conf->hash_locks + hash);
936	head = find_get_stripe(conf, sector: head_sector, generation: conf->generation,
937	hash);
938	spin_unlock_irq(lock: conf->hash_locks + hash);
939	if (!head)
940	return;
941	if (!stripe_can_batch(sh: head))
942	goto out;
943	}
944
945	lock_two_stripes(sh1: head, sh2: sh);
946	/ clear_batch_ready clear the flag /
947	if (!stripe_can_batch(sh: head) \|\| !stripe_can_batch(sh))
948	goto unlock_out;
949
950	if (sh->batch_head)
951	goto unlock_out;
952
953	dd_idx = `0`;
954	while (dd_idx == sh->pd_idx \|\| dd_idx == sh->qd_idx)
955	dd_idx++;
956	if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf \|\|
957	bio_op(bio: head->dev[dd_idx].towrite) != bio_op(bio: sh->dev[dd_idx].towrite))
958	goto unlock_out;
959
960	if (head->batch_head) {
961	spin_lock(lock: &head->batch_head->batch_lock);
962	/ This batch list is already running /
963	if (!stripe_can_batch(sh: head)) {
964	spin_unlock(lock: &head->batch_head->batch_lock);
965	goto unlock_out;
966	}
967	/*
968	* We must assign batch_head of this stripe within the
969	* batch_lock, otherwise clear_batch_ready of batch head
970	* stripe could clear BATCH_READY bit of this stripe and
971	* this stripe->batch_head doesn't get assigned, which
972	* could confuse clear_batch_ready for this stripe
973	*/
974	sh->batch_head = head->batch_head;
975
976	/*
977	* at this point, head's BATCH_READY could be cleared, but we
978	* can still add the stripe to batch list
979	*/
980	list_add(new: &sh->batch_list, head: &head->batch_list);
981	spin_unlock(lock: &head->batch_head->batch_lock);
982	} else {
983	head->batch_head = head;
984	sh->batch_head = head->batch_head;
985	spin_lock(lock: &head->batch_lock);
986	list_add_tail(new: &sh->batch_list, head: &head->batch_list);
987	spin_unlock(lock: &head->batch_lock);
988	}
989
990	if (test_and_clear_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
991	if (atomic_dec_return(v: &conf->preread_active_stripes)
992	< IO_THRESHOLD)
993	md_wakeup_thread(thread: conf->mddev->thread);
994
995	if (test_and_clear_bit(nr: STRIPE_BIT_DELAY, addr: &sh->state)) {
996	int seq = sh->bm_seq;
997	if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
998	sh->batch_head->bm_seq > seq)
999	seq = sh->batch_head->bm_seq;
1000	set_bit(nr: STRIPE_BIT_DELAY, addr: &sh->batch_head->state);
1001	sh->batch_head->bm_seq = seq;
1002	}
1003
1004	atomic_inc(v: &sh->count);
1005	unlock_out:
1006	unlock_two_stripes(sh1: head, sh2: sh);
1007	out:
1008	raid5_release_stripe(sh: head);
1009	}
1010
1011	/ Determine if 'data_offset' or 'new_data_offset' should be used*
1012	* in this stripe_head.
1013	*/
1014	static int use_new_offset(struct r5conf conf, struct* stripe_head *sh)
1015	{
1016	sector_t progress = conf->reshape_progress;
1017	/ Need a memory barrier to make sure we see the value*
1018	* of conf->generation, or ->data_offset that was set before
1019	* reshape_progress was updated.
1020	*/
1021	smp_rmb();
1022	if (progress == MaxSector)
1023	return `0`;
1024	if (sh->generation == conf->generation - `1`)
1025	return `0`;
1026	/ We are in a reshape, and this is a new-generation stripe,*
1027	* so use new_data_offset.
1028	*/
1029	return `1`;
1030	}
1031
/*
 * Submit every bio queued on @tmp, in list order, leaving @tmp empty.
 */
static void dispatch_bio_list(struct bio_list *tmp)
{
	struct bio *bio;

	while ((bio = bio_list_pop(tmp)))
		submit_bio_noacct(bio);
}
1039
1040	static int cmp_stripe(void priv, const* struct list_head *a,
1041	const struct list_head *b)
1042	{
1043	const struct r5pending_data *da = list_entry(a,
1044	struct r5pending_data, sibling);
1045	const struct r5pending_data *db = list_entry(b,
1046	struct r5pending_data, sibling);
1047	if (da->sector > db->sector)
1048	return `1`;
1049	if (da->sector < db->sector)
1050	return -`1`;
1051	return `0`;
1052	}
1053
1054	static void dispatch_defer_bios(struct r5conf conf, int* target,
1055	struct bio_list *list)
1056	{
1057	struct r5pending_data *data;
1058	struct list_head first, next = NULL;
1059	int cnt = `0`;
1060
1061	if (conf->pending_data_cnt == `0`)
1062	return;
1063
1064	list_sort(NULL, head: &conf->pending_list, cmp: cmp_stripe);
1065
1066	first = conf->pending_list.next;
1067
1068	/ temporarily move the head /
1069	if (conf->next_pending_data)
1070	list_move_tail(list: &conf->pending_list,
1071	head: &conf->next_pending_data->sibling);
1072
1073	while (!list_empty(head: &conf->pending_list)) {
1074	data = list_first_entry(&conf->pending_list,
1075	struct r5pending_data, sibling);
1076	if (&data->sibling == first)
1077	first = data->sibling.next;
1078	next = data->sibling.next;
1079
1080	bio_list_merge(bl: list, bl2: &data->bios);
1081	list_move(list: &data->sibling, head: &conf->free_list);
1082	cnt++;
1083	if (cnt >= target)
1084	break;
1085	}
1086	conf->pending_data_cnt -= cnt;
1087	BUG_ON(conf->pending_data_cnt < `0` \|\| cnt < target);
1088
1089	if (next != &conf->pending_list)
1090	conf->next_pending_data = list_entry(next,
1091	struct r5pending_data, sibling);
1092	else
1093	conf->next_pending_data = NULL;
1094	/ list isn't empty /
1095	if (first != &conf->pending_list)
1096	list_move_tail(list: &conf->pending_list, head: first);
1097	}
1098
1099	static void flush_deferred_bios(struct r5conf *conf)
1100	{
1101	struct bio_list tmp = BIO_EMPTY_LIST;
1102
1103	if (conf->pending_data_cnt == `0`)
1104	return;
1105
1106	spin_lock(lock: &conf->pending_bios_lock);
1107	dispatch_defer_bios(conf, target: conf->pending_data_cnt, list: &tmp);
1108	BUG_ON(conf->pending_data_cnt != `0`);
1109	spin_unlock(lock: &conf->pending_bios_lock);
1110
1111	dispatch_bio_list(tmp: &tmp);
1112	}
1113
1114	static void defer_issue_bios(struct r5conf *conf, sector_t sector,
1115	struct bio_list *bios)
1116	{
1117	struct bio_list tmp = BIO_EMPTY_LIST;
1118	struct r5pending_data *ent;
1119
1120	spin_lock(lock: &conf->pending_bios_lock);
1121	ent = list_first_entry(&conf->free_list, struct r5pending_data,
1122	sibling);
1123	list_move_tail(list: &ent->sibling, head: &conf->pending_list);
1124	ent->sector = sector;
1125	bio_list_init(bl: &ent->bios);
1126	bio_list_merge(bl: &ent->bios, bl2: bios);
1127	conf->pending_data_cnt++;
1128	if (conf->pending_data_cnt >= PENDING_IO_MAX)
1129	dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, list: &tmp);
1130
1131	spin_unlock(lock: &conf->pending_bios_lock);
1132
1133	dispatch_bio_list(tmp: &tmp);
1134	}
1135
/* bio completion handlers, defined later in this file; used by ops_run_io() */
static void
raid5_end_read_request(struct bio *bi);
static void
raid5_end_write_request(struct bio *bi);
1140
1141	static void ops_run_io(struct stripe_head sh, struct* stripe_head_state *s)
1142	{
1143	struct r5conf *conf = sh->raid_conf;
1144	int i, disks = sh->disks;
1145	struct stripe_head *head_sh = sh;
1146	struct bio_list pending_bios = BIO_EMPTY_LIST;
1147	struct r5dev *dev;
1148	bool should_defer;
1149
1150	might_sleep();
1151
1152	if (log_stripe(sh, s) == `0`)
1153	return;
1154
1155	should_defer = conf->batch_bio_dispatch && conf->group_cnt;
1156
1157	for (i = disks; i--; ) {
1158	enum req_op op;
1159	blk_opf_t op_flags = `0`;
1160	int replace_only = `0`;
1161	struct bio bi, rbi;
1162	struct md_rdev rdev, rrdev = NULL;
1163
1164	sh = head_sh;
1165	if (test_and_clear_bit(nr: R5_Wantwrite, addr: &sh->dev[i].flags)) {
1166	op = REQ_OP_WRITE;
1167	if (test_and_clear_bit(nr: R5_WantFUA, addr: &sh->dev[i].flags))
1168	op_flags = REQ_FUA;
1169	if (test_bit(R5_Discard, &sh->dev[i].flags))
1170	op = REQ_OP_DISCARD;
1171	} else if (test_and_clear_bit(nr: R5_Wantread, addr: &sh->dev[i].flags))
1172	op = REQ_OP_READ;
1173	else if (test_and_clear_bit(nr: R5_WantReplace,
1174	addr: &sh->dev[i].flags)) {
1175	op = REQ_OP_WRITE;
1176	replace_only = `1`;
1177	} else
1178	continue;
1179	if (test_and_clear_bit(nr: R5_SyncIO, addr: &sh->dev[i].flags))
1180	op_flags \|= REQ_SYNC;
1181
1182	again:
1183	dev = &sh->dev[i];
1184	bi = &dev->req;
1185	rbi = &dev->rreq; / For writing to replacement /
1186
1187	rcu_read_lock();
1188	rrdev = rcu_dereference(conf->disks[i].replacement);
1189	smp_mb(); / Ensure that if rrdev is NULL, rdev won't be /
1190	rdev = rcu_dereference(conf->disks[i].rdev);
1191	if (!rdev) {
1192	rdev = rrdev;
1193	rrdev = NULL;
1194	}
1195	if (op_is_write(op)) {
1196	if (replace_only)
1197	rdev = NULL;
1198	if (rdev == rrdev)
1199	/ We raced and saw duplicates /
1200	rrdev = NULL;
1201	} else {
1202	if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
1203	rdev = rrdev;
1204	rrdev = NULL;
1205	}
1206
1207	if (rdev && test_bit(Faulty, &rdev->flags))
1208	rdev = NULL;
1209	if (rdev)
1210	atomic_inc(v: &rdev->nr_pending);
1211	if (rrdev && test_bit(Faulty, &rrdev->flags))
1212	rrdev = NULL;
1213	if (rrdev)
1214	atomic_inc(v: &rrdev->nr_pending);
1215	rcu_read_unlock();
1216
1217	/ We have already checked bad blocks for reads. Now*
1218	* need to check for writes. We never accept write errors
1219	* on the replacement, so we don't to check rrdev.
1220	*/
1221	while (op_is_write(op) && rdev &&
1222	test_bit(WriteErrorSeen, &rdev->flags)) {
1223	sector_t first_bad;
1224	int bad_sectors;
1225	int bad = is_badblock(rdev, s: sh->sector, RAID5_STRIPE_SECTORS(conf),
1226	first_bad: &first_bad, bad_sectors: &bad_sectors);
1227	if (!bad)
1228	break;
1229
1230	if (bad < `0`) {
1231	set_bit(nr: BlockedBadBlocks, addr: &rdev->flags);
1232	if (!conf->mddev->external &&
1233	conf->mddev->sb_flags) {
1234	/ It is very unlikely, but we might*
1235	* still need to write out the
1236	* bad block log - better give it
1237	* a chance*/
1238	md_check_recovery(mddev: conf->mddev);
1239	}
1240	/*
1241	* Because md_wait_for_blocked_rdev
1242	* will dec nr_pending, we must
1243	* increment it first.
1244	*/
1245	atomic_inc(v: &rdev->nr_pending);
1246	md_wait_for_blocked_rdev(rdev, mddev: conf->mddev);
1247	} else {
1248	/ Acknowledged bad block - skip the write /
1249	rdev_dec_pending(rdev, mddev: conf->mddev);
1250	rdev = NULL;
1251	}
1252	}
1253
1254	if (rdev) {
1255	if (s->syncing \|\| s->expanding \|\| s->expanded
1256	\|\| s->replacing)
1257	md_sync_acct(bdev: rdev->bdev, RAID5_STRIPE_SECTORS(conf));
1258
1259	set_bit(nr: STRIPE_IO_STARTED, addr: &sh->state);
1260
1261	bio_init(bio: bi, bdev: rdev->bdev, table: &dev->vec, max_vecs: `1`, opf: op \| op_flags);
1262	bi->bi_end_io = op_is_write(op)
1263	? raid5_end_write_request
1264	: raid5_end_read_request;
1265	bi->bi_private = sh;
1266
1267	pr_debug("%s: for %llu schedule op %d on disc %d\n",
1268	__func__, (unsigned long long)sh->sector,
1269	bi->bi_opf, i);
1270	atomic_inc(v: &sh->count);
1271	if (sh != head_sh)
1272	atomic_inc(v: &head_sh->count);
1273	if (use_new_offset(conf, sh))
1274	bi->bi_iter.bi_sector = (sh->sector
1275	+ rdev->new_data_offset);
1276	else
1277	bi->bi_iter.bi_sector = (sh->sector
1278	+ rdev->data_offset);
1279	if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1280	bi->bi_opf \|= REQ_NOMERGE;
1281
1282	if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1283	WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1284
1285	if (!op_is_write(op) &&
1286	test_bit(R5_InJournal, &sh->dev[i].flags))
1287	/*
1288	* issuing read for a page in journal, this
1289	* must be preparing for prexor in rmw; read
1290	* the data into orig_page
1291	*/
1292	sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
1293	else
1294	sh->dev[i].vec.bv_page = sh->dev[i].page;
1295	bi->bi_vcnt = `1`;
1296	bi->bi_io_vec[`0`].bv_len = RAID5_STRIPE_SIZE(conf);
1297	bi->bi_io_vec[`0`].bv_offset = sh->dev[i].offset;
1298	bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1299	/*
1300	* If this is discard request, set bi_vcnt 0. We don't
1301	* want to confuse SCSI because SCSI will replace payload
1302	*/
1303	if (op == REQ_OP_DISCARD)
1304	bi->bi_vcnt = `0`;
1305	if (rrdev)
1306	set_bit(nr: R5_DOUBLE_LOCKED, addr: &sh->dev[i].flags);
1307
1308	if (conf->mddev->gendisk)
1309	trace_block_bio_remap(bio: bi,
1310	dev: disk_devt(disk: conf->mddev->gendisk),
1311	from: sh->dev[i].sector);
1312	if (should_defer && op_is_write(op))
1313	bio_list_add(bl: &pending_bios, bio: bi);
1314	else
1315	submit_bio_noacct(bio: bi);
1316	}
1317	if (rrdev) {
1318	if (s->syncing \|\| s->expanding \|\| s->expanded
1319	\|\| s->replacing)
1320	md_sync_acct(bdev: rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
1321
1322	set_bit(nr: STRIPE_IO_STARTED, addr: &sh->state);
1323
1324	bio_init(bio: rbi, bdev: rrdev->bdev, table: &dev->rvec, max_vecs: `1`, opf: op \| op_flags);
1325	BUG_ON(!op_is_write(op));
1326	rbi->bi_end_io = raid5_end_write_request;
1327	rbi->bi_private = sh;
1328
1329	pr_debug("%s: for %llu schedule op %d on "
1330	"replacement disc %d\n",
1331	__func__, (unsigned long long)sh->sector,
1332	rbi->bi_opf, i);
1333	atomic_inc(v: &sh->count);
1334	if (sh != head_sh)
1335	atomic_inc(v: &head_sh->count);
1336	if (use_new_offset(conf, sh))
1337	rbi->bi_iter.bi_sector = (sh->sector
1338	+ rrdev->new_data_offset);
1339	else
1340	rbi->bi_iter.bi_sector = (sh->sector
1341	+ rrdev->data_offset);
1342	if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1343	WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1344	sh->dev[i].rvec.bv_page = sh->dev[i].page;
1345	rbi->bi_vcnt = `1`;
1346	rbi->bi_io_vec[`0`].bv_len = RAID5_STRIPE_SIZE(conf);
1347	rbi->bi_io_vec[`0`].bv_offset = sh->dev[i].offset;
1348	rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
1349	/*
1350	* If this is discard request, set bi_vcnt 0. We don't
1351	* want to confuse SCSI because SCSI will replace payload
1352	*/
1353	if (op == REQ_OP_DISCARD)
1354	rbi->bi_vcnt = `0`;
1355	if (conf->mddev->gendisk)
1356	trace_block_bio_remap(bio: rbi,
1357	dev: disk_devt(disk: conf->mddev->gendisk),
1358	from: sh->dev[i].sector);
1359	if (should_defer && op_is_write(op))
1360	bio_list_add(bl: &pending_bios, bio: rbi);
1361	else
1362	submit_bio_noacct(bio: rbi);
1363	}
1364	if (!rdev && !rrdev) {
1365	if (op_is_write(op))
1366	set_bit(nr: STRIPE_DEGRADED, addr: &sh->state);
1367	pr_debug("skip op %d on disc %d for sector %llu\n",
1368	bi->bi_opf, i, (unsigned long long)sh->sector);
1369	clear_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
1370	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
1371	}
1372
1373	if (!head_sh->batch_head)
1374	continue;
1375	sh = list_first_entry(&sh->batch_list, struct stripe_head,
1376	batch_list);
1377	if (sh != head_sh)
1378	goto again;
1379	}
1380
1381	if (should_defer && !bio_list_empty(bl: &pending_bios))
1382	defer_issue_bios(conf, sector: head_sh->sector, bios: &pending_bios);
1383	}
1384
1385	static struct dma_async_tx_descriptor *
1386	async_copy_data(int frombio, struct bio bio, struct* page **page,
1387	unsigned int poff, sector_t sector, struct dma_async_tx_descriptor *tx,
1388	struct stripe_head sh, int* no_skipcopy)
1389	{
1390	struct bio_vec bvl;
1391	struct bvec_iter iter;
1392	struct page *bio_page;
1393	int page_offset;
1394	struct async_submit_ctl submit;
1395	enum async_tx_flags flags = `0`;
1396	struct r5conf *conf = sh->raid_conf;
1397
1398	if (bio->bi_iter.bi_sector >= sector)
1399	page_offset = (signed)(bio->bi_iter.bi_sector - sector) * `512`;
1400	else
1401	page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -`512`;
1402
1403	if (frombio)
1404	flags \|= ASYNC_TX_FENCE;
1405	init_async_submit(args: &submit, flags, tx, NULL, NULL, NULL);
1406
1407	bio_for_each_segment(bvl, bio, iter) {
1408	int len = bvl.bv_len;
1409	int clen;
1410	int b_offset = `0`;
1411
1412	if (page_offset < `0`) {
1413	b_offset = -page_offset;
1414	page_offset += b_offset;
1415	len -= b_offset;
1416	}
1417
1418	if (len > `0` && page_offset + len > RAID5_STRIPE_SIZE(conf))
1419	clen = RAID5_STRIPE_SIZE(conf) - page_offset;
1420	else
1421	clen = len;
1422
1423	if (clen > `0`) {
1424	b_offset += bvl.bv_offset;
1425	bio_page = bvl.bv_page;
1426	if (frombio) {
1427	if (conf->skip_copy &&
1428	b_offset == `0` && page_offset == `0` &&
1429	clen == RAID5_STRIPE_SIZE(conf) &&
1430	!no_skipcopy)
1431	*page = bio_page;
1432	else
1433	tx = async_memcpy(dest: *page, src: bio_page, dest_offset: page_offset + poff,
1434	src_offset: b_offset, len: clen, submit: &submit);
1435	} else
1436	tx = async_memcpy(dest: bio_page, src: *page, dest_offset: b_offset,
1437	src_offset: page_offset + poff, len: clen, submit: &submit);
1438	}
1439	/ chain the operations /
1440	submit.depend_tx = tx;
1441
1442	if (clen < len) / hit end of page /
1443	break;
1444	page_offset += len;
1445	}
1446
1447	return tx;
1448	}
1449
1450	static void ops_complete_biofill(void *stripe_head_ref)
1451	{
1452	struct stripe_head *sh = stripe_head_ref;
1453	int i;
1454	struct r5conf *conf = sh->raid_conf;
1455
1456	pr_debug("%s: stripe %llu\n", __func__,
1457	(unsigned long long)sh->sector);
1458
1459	/ clear completed biofills /
1460	for (i = sh->disks; i--; ) {
1461	struct r5dev *dev = &sh->dev[i];
1462
1463	/ acknowledge completion of a biofill operation /
1464	/ and check if we need to reply to a read request,*
1465	* new R5_Wantfill requests are held off until
1466	* !STRIPE_BIOFILL_RUN
1467	*/
1468	if (test_and_clear_bit(nr: R5_Wantfill, addr: &dev->flags)) {
1469	struct bio rbi, rbi2;
1470
1471	BUG_ON(!dev->read);
1472	rbi = dev->read;
1473	dev->read = NULL;
1474	while (rbi && rbi->bi_iter.bi_sector <
1475	dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1476	rbi2 = r5_next_bio(conf, bio: rbi, sector: dev->sector);
1477	bio_endio(rbi);
1478	rbi = rbi2;
1479	}
1480	}
1481	}
1482	clear_bit(nr: STRIPE_BIOFILL_RUN, addr: &sh->state);
1483
1484	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
1485	raid5_release_stripe(sh);
1486	}
1487
1488	static void ops_run_biofill(struct stripe_head *sh)
1489	{
1490	struct dma_async_tx_descriptor *tx = NULL;
1491	struct async_submit_ctl submit;
1492	int i;
1493	struct r5conf *conf = sh->raid_conf;
1494
1495	BUG_ON(sh->batch_head);
1496	pr_debug("%s: stripe %llu\n", __func__,
1497	(unsigned long long)sh->sector);
1498
1499	for (i = sh->disks; i--; ) {
1500	struct r5dev *dev = &sh->dev[i];
1501	if (test_bit(R5_Wantfill, &dev->flags)) {
1502	struct bio *rbi;
1503	spin_lock_irq(lock: &sh->stripe_lock);
1504	dev->read = rbi = dev->toread;
1505	dev->toread = NULL;
1506	spin_unlock_irq(lock: &sh->stripe_lock);
1507	while (rbi && rbi->bi_iter.bi_sector <
1508	dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1509	tx = async_copy_data(frombio: `0`, bio: rbi, page: &dev->page,
1510	poff: dev->offset,
1511	sector: dev->sector, tx, sh, no_skipcopy: `0`);
1512	rbi = r5_next_bio(conf, bio: rbi, sector: dev->sector);
1513	}
1514	}
1515	}
1516
1517	atomic_inc(v: &sh->count);
1518	init_async_submit(args: &submit, flags: ASYNC_TX_ACK, tx, cb_fn: ops_complete_biofill, cb_param: sh, NULL);
1519	async_trigger_callback(submit: &submit);
1520	}
1521
1522	static void mark_target_uptodate(struct stripe_head sh, int* target)
1523	{
1524	struct r5dev *tgt;
1525
1526	if (target < `0`)
1527	return;
1528
1529	tgt = &sh->dev[target];
1530	set_bit(nr: R5_UPTODATE, addr: &tgt->flags);
1531	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1532	clear_bit(nr: R5_Wantcompute, addr: &tgt->flags);
1533	}
1534
1535	static void ops_complete_compute(void *stripe_head_ref)
1536	{
1537	struct stripe_head *sh = stripe_head_ref;
1538
1539	pr_debug("%s: stripe %llu\n", __func__,
1540	(unsigned long long)sh->sector);
1541
1542	/ mark the computed target(s) as uptodate /
1543	mark_target_uptodate(sh, target: sh->ops.target);
1544	mark_target_uptodate(sh, target: sh->ops.target2);
1545
1546	clear_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
1547	if (sh->check_state == check_state_compute_run)
1548	sh->check_state = check_state_compute_result;
1549	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
1550	raid5_release_stripe(sh);
1551	}
1552
1553	/ return a pointer to the address conversion region of the scribble buffer /
1554	static struct page to_addr_page(struct** raid5_percpu percpu, int* i)
1555	{
1556	return percpu->scribble + i * percpu->scribble_obj_size;
1557	}
1558
1559	/ return a pointer to the address conversion region of the scribble buffer /
1560	static addr_conv_t to_addr_conv(struct* stripe_head *sh,
1561	struct raid5_percpu percpu, int* i)
1562	{
1563	return (void *) (to_addr_page(percpu, i) + sh->disks + `2`);
1564	}
1565
1566	/*
1567	* Return a pointer to record offset address.
1568	*/
1569	static unsigned int *
1570	to_addr_offs(struct stripe_head sh, struct* raid5_percpu *percpu)
1571	{
1572	return (unsigned int *) (to_addr_conv(sh, percpu, i: `0`) + sh->disks + `2`);
1573	}
1574
1575	static struct dma_async_tx_descriptor *
1576	ops_run_compute5(struct stripe_head sh, struct* raid5_percpu *percpu)
1577	{
1578	int disks = sh->disks;
1579	struct page **xor_srcs = to_addr_page(percpu, i: `0`);
1580	unsigned int *off_srcs = to_addr_offs(sh, percpu);
1581	int target = sh->ops.target;
1582	struct r5dev *tgt = &sh->dev[target];
1583	struct page *xor_dest = tgt->page;
1584	unsigned int off_dest = tgt->offset;
1585	int count = `0`;
1586	struct dma_async_tx_descriptor *tx;
1587	struct async_submit_ctl submit;
1588	int i;
1589
1590	BUG_ON(sh->batch_head);
1591
1592	pr_debug("%s: stripe %llu block: %d\n",
1593	__func__, (unsigned long long)sh->sector, target);
1594	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1595
1596	for (i = disks; i--; ) {
1597	if (i != target) {
1598	off_srcs[count] = sh->dev[i].offset;
1599	xor_srcs[count++] = sh->dev[i].page;
1600	}
1601	}
1602
1603	atomic_inc(v: &sh->count);
1604
1605	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE\|ASYNC_TX_XOR_ZERO_DST, NULL,
1606	cb_fn: ops_complete_compute, cb_param: sh, scribble: to_addr_conv(sh, percpu, i: `0`));
1607	if (unlikely(count == `1`))
1608	tx = async_memcpy(dest: xor_dest, src: xor_srcs[`0`], dest_offset: off_dest, src_offset: off_srcs[`0`],
1609	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1610	else
1611	tx = async_xor_offs(dest: xor_dest, offset: off_dest, src_list: xor_srcs, src_offset: off_srcs, src_cnt: count,
1612	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1613
1614	return tx;
1615	}
1616
1617	/ set_syndrome_sources - populate source buffers for gen_syndrome*
1618	* @srcs - (struct page *) array of size sh->disks
1619	* @offs - (unsigned int) array of offset for each page
1620	* @sh - stripe_head to parse
1621	*
1622	* Populates srcs in proper layout order for the stripe and returns the
1623	* 'count' of sources to be used in a call to async_gen_syndrome. The P
1624	* destination buffer is recorded in srcs[count] and the Q destination
1625	* is recorded in srcs[count+1]].
1626	*/
1627	static int set_syndrome_sources(struct page **srcs,
1628	unsigned int *offs,
1629	struct stripe_head *sh,
1630	int srctype)
1631	{
1632	int disks = sh->disks;
1633	int syndrome_disks = sh->ddf_layout ? disks : (disks - `2`);
1634	int d0_idx = raid6_d0(sh);
1635	int count;
1636	int i;
1637
1638	for (i = `0`; i < disks; i++)
1639	srcs[i] = NULL;
1640
1641	count = `0`;
1642	i = d0_idx;
1643	do {
1644	int slot = raid6_idx_to_slot(idx: i, sh, count: &count, syndrome_disks);
1645	struct r5dev *dev = &sh->dev[i];
1646
1647	if (i == sh->qd_idx \|\| i == sh->pd_idx \|\|
1648	(srctype == SYNDROME_SRC_ALL) \|\|
1649	(srctype == SYNDROME_SRC_WANT_DRAIN &&
1650	(test_bit(R5_Wantdrain, &dev->flags) \|\|
1651	test_bit(R5_InJournal, &dev->flags))) \|\|
1652	(srctype == SYNDROME_SRC_WRITTEN &&
1653	(dev->written \|\|
1654	test_bit(R5_InJournal, &dev->flags)))) {
1655	if (test_bit(R5_InJournal, &dev->flags))
1656	srcs[slot] = sh->dev[i].orig_page;
1657	else
1658	srcs[slot] = sh->dev[i].page;
1659	/*
1660	* For R5_InJournal, PAGE_SIZE must be 4KB and will
1661	* not shared page. In that case, dev[i].offset
1662	* is 0.
1663	*/
1664	offs[slot] = sh->dev[i].offset;
1665	}
1666	i = raid6_next_disk(disk: i, raid_disks: disks);
1667	} while (i != d0_idx);
1668
1669	return syndrome_disks;
1670	}
1671
1672	static struct dma_async_tx_descriptor *
1673	ops_run_compute6_1(struct stripe_head sh, struct* raid5_percpu *percpu)
1674	{
1675	int disks = sh->disks;
1676	struct page **blocks = to_addr_page(percpu, i: `0`);
1677	unsigned int *offs = to_addr_offs(sh, percpu);
1678	int target;
1679	int qd_idx = sh->qd_idx;
1680	struct dma_async_tx_descriptor *tx;
1681	struct async_submit_ctl submit;
1682	struct r5dev *tgt;
1683	struct page *dest;
1684	unsigned int dest_off;
1685	int i;
1686	int count;
1687
1688	BUG_ON(sh->batch_head);
1689	if (sh->ops.target < `0`)
1690	target = sh->ops.target2;
1691	else if (sh->ops.target2 < `0`)
1692	target = sh->ops.target;
1693	else
1694	/ we should only have one valid target /
1695	BUG();
1696	BUG_ON(target < `0`);
1697	pr_debug("%s: stripe %llu block: %d\n",
1698	__func__, (unsigned long long)sh->sector, target);
1699
1700	tgt = &sh->dev[target];
1701	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1702	dest = tgt->page;
1703	dest_off = tgt->offset;
1704
1705	atomic_inc(v: &sh->count);
1706
1707	if (target == qd_idx) {
1708	count = set_syndrome_sources(srcs: blocks, offs, sh, srctype: SYNDROME_SRC_ALL);
1709	blocks[count] = NULL; / regenerating p is not necessary /
1710	BUG_ON(blocks[count+`1`] != dest); / q should already be set /
1711	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE, NULL,
1712	cb_fn: ops_complete_compute, cb_param: sh,
1713	scribble: to_addr_conv(sh, percpu, i: `0`));
1714	tx = async_gen_syndrome(blocks, offsets: offs, src_cnt: count+`2`,
1715	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1716	} else {
1717	/ Compute any data- or p-drive using XOR /
1718	count = `0`;
1719	for (i = disks; i-- ; ) {
1720	if (i == target \|\| i == qd_idx)
1721	continue;
1722	offs[count] = sh->dev[i].offset;
1723	blocks[count++] = sh->dev[i].page;
1724	}
1725
1726	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE\|ASYNC_TX_XOR_ZERO_DST,
1727	NULL, cb_fn: ops_complete_compute, cb_param: sh,
1728	scribble: to_addr_conv(sh, percpu, i: `0`));
1729	tx = async_xor_offs(dest, offset: dest_off, src_list: blocks, src_offset: offs, src_cnt: count,
1730	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1731	}
1732
1733	return tx;
1734	}
1735
1736	static struct dma_async_tx_descriptor *
1737	ops_run_compute6_2(struct stripe_head sh, struct* raid5_percpu *percpu)
1738	{
1739	int i, count, disks = sh->disks;
1740	int syndrome_disks = sh->ddf_layout ? disks : disks-`2`;
1741	int d0_idx = raid6_d0(sh);
1742	int faila = -`1`, failb = -`1`;
1743	int target = sh->ops.target;
1744	int target2 = sh->ops.target2;
1745	struct r5dev *tgt = &sh->dev[target];
1746	struct r5dev *tgt2 = &sh->dev[target2];
1747	struct dma_async_tx_descriptor *tx;
1748	struct page **blocks = to_addr_page(percpu, i: `0`);
1749	unsigned int *offs = to_addr_offs(sh, percpu);
1750	struct async_submit_ctl submit;
1751
1752	BUG_ON(sh->batch_head);
1753	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1754	__func__, (unsigned long long)sh->sector, target, target2);
1755	BUG_ON(target < `0` \|\| target2 < `0`);
1756	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1757	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1758
1759	/ we need to open-code set_syndrome_sources to handle the*
1760	* slot number conversion for 'faila' and 'failb'
1761	*/
1762	for (i = `0`; i < disks ; i++) {
1763	offs[i] = `0`;
1764	blocks[i] = NULL;
1765	}
1766	count = `0`;
1767	i = d0_idx;
1768	do {
1769	int slot = raid6_idx_to_slot(idx: i, sh, count: &count, syndrome_disks);
1770
1771	offs[slot] = sh->dev[i].offset;
1772	blocks[slot] = sh->dev[i].page;
1773
1774	if (i == target)
1775	faila = slot;
1776	if (i == target2)
1777	failb = slot;
1778	i = raid6_next_disk(disk: i, raid_disks: disks);
1779	} while (i != d0_idx);
1780
1781	BUG_ON(faila == failb);
1782	if (failb < faila)
1783	swap(faila, failb);
1784	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1785	__func__, (unsigned long long)sh->sector, faila, failb);
1786
1787	atomic_inc(v: &sh->count);
1788
1789	if (failb == syndrome_disks+`1`) {
1790	/ Q disk is one of the missing disks /
1791	if (faila == syndrome_disks) {
1792	/ Missing P+Q, just recompute /
1793	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE, NULL,
1794	cb_fn: ops_complete_compute, cb_param: sh,
1795	scribble: to_addr_conv(sh, percpu, i: `0`));
1796	return async_gen_syndrome(blocks, offsets: offs, src_cnt: syndrome_disks+`2`,
1797	RAID5_STRIPE_SIZE(sh->raid_conf),
1798	submit: &submit);
1799	} else {
1800	struct page *dest;
1801	unsigned int dest_off;
1802	int data_target;
1803	int qd_idx = sh->qd_idx;
1804
1805	/ Missing D+Q: recompute D from P, then recompute Q /
1806	if (target == qd_idx)
1807	data_target = target2;
1808	else
1809	data_target = target;
1810
1811	count = `0`;
1812	for (i = disks; i-- ; ) {
1813	if (i == data_target \|\| i == qd_idx)
1814	continue;
1815	offs[count] = sh->dev[i].offset;
1816	blocks[count++] = sh->dev[i].page;
1817	}
1818	dest = sh->dev[data_target].page;
1819	dest_off = sh->dev[data_target].offset;
1820	init_async_submit(args: &submit,
1821	flags: ASYNC_TX_FENCE\|ASYNC_TX_XOR_ZERO_DST,
1822	NULL, NULL, NULL,
1823	scribble: to_addr_conv(sh, percpu, i: `0`));
1824	tx = async_xor_offs(dest, offset: dest_off, src_list: blocks, src_offset: offs, src_cnt: count,
1825	RAID5_STRIPE_SIZE(sh->raid_conf),
1826	submit: &submit);
1827
1828	count = set_syndrome_sources(srcs: blocks, offs, sh, srctype: SYNDROME_SRC_ALL);
1829	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE, tx,
1830	cb_fn: ops_complete_compute, cb_param: sh,
1831	scribble: to_addr_conv(sh, percpu, i: `0`));
1832	return async_gen_syndrome(blocks, offsets: offs, src_cnt: count+`2`,
1833	RAID5_STRIPE_SIZE(sh->raid_conf),
1834	submit: &submit);
1835	}
1836	} else {
1837	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE, NULL,
1838	cb_fn: ops_complete_compute, cb_param: sh,
1839	scribble: to_addr_conv(sh, percpu, i: `0`));
1840	if (failb == syndrome_disks) {
1841	/ We're missing D+P. /
1842	return async_raid6_datap_recov(src_num: syndrome_disks+`2`,
1843	RAID5_STRIPE_SIZE(sh->raid_conf),
1844	faila,
1845	ptrs: blocks, offs, submit: &submit);
1846	} else {
1847	/ We're missing D+D. /
1848	return async_raid6_2data_recov(src_num: syndrome_disks+`2`,
1849	RAID5_STRIPE_SIZE(sh->raid_conf),
1850	faila, failb,
1851	ptrs: blocks, offs, submit: &submit);
1852	}
1853	}
1854	}
1855
1856	static void ops_complete_prexor(void *stripe_head_ref)
1857	{
1858	struct stripe_head *sh = stripe_head_ref;
1859
1860	pr_debug("%s: stripe %llu\n", __func__,
1861	(unsigned long long)sh->sector);
1862
1863	if (r5c_is_writeback(log: sh->raid_conf->log))
1864	/*
1865	* raid5-cache write back uses orig_page during prexor.
1866	* After prexor, it is time to free orig_page
1867	*/
1868	r5c_release_extra_page(sh);
1869	}
1870
1871	static struct dma_async_tx_descriptor *
1872	ops_run_prexor5(struct stripe_head sh, struct* raid5_percpu *percpu,
1873	struct dma_async_tx_descriptor *tx)
1874	{
1875	int disks = sh->disks;
1876	struct page **xor_srcs = to_addr_page(percpu, i: `0`);
1877	unsigned int *off_srcs = to_addr_offs(sh, percpu);
1878	int count = `0`, pd_idx = sh->pd_idx, i;
1879	struct async_submit_ctl submit;
1880
1881	/ existing parity data subtracted /
1882	unsigned int off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
1883	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1884
1885	BUG_ON(sh->batch_head);
1886	pr_debug("%s: stripe %llu\n", __func__,
1887	(unsigned long long)sh->sector);
1888
1889	for (i = disks; i--; ) {
1890	struct r5dev *dev = &sh->dev[i];
1891	/ Only process blocks that are known to be uptodate /
1892	if (test_bit(R5_InJournal, &dev->flags)) {
1893	/*
1894	* For this case, PAGE_SIZE must be equal to 4KB and
1895	* page offset is zero.
1896	*/
1897	off_srcs[count] = dev->offset;
1898	xor_srcs[count++] = dev->orig_page;
1899	} else if (test_bit(R5_Wantdrain, &dev->flags)) {
1900	off_srcs[count] = dev->offset;
1901	xor_srcs[count++] = dev->page;
1902	}
1903	}
1904
1905	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE\|ASYNC_TX_XOR_DROP_DST, tx,
1906	cb_fn: ops_complete_prexor, cb_param: sh, scribble: to_addr_conv(sh, percpu, i: `0`));
1907	tx = async_xor_offs(dest: xor_dest, offset: off_dest, src_list: xor_srcs, src_offset: off_srcs, src_cnt: count,
1908	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1909
1910	return tx;
1911	}
1912
1913	static struct dma_async_tx_descriptor *
1914	ops_run_prexor6(struct stripe_head sh, struct* raid5_percpu *percpu,
1915	struct dma_async_tx_descriptor *tx)
1916	{
1917	struct page **blocks = to_addr_page(percpu, i: `0`);
1918	unsigned int *offs = to_addr_offs(sh, percpu);
1919	int count;
1920	struct async_submit_ctl submit;
1921
1922	pr_debug("%s: stripe %llu\n", __func__,
1923	(unsigned long long)sh->sector);
1924
1925	count = set_syndrome_sources(srcs: blocks, offs, sh, srctype: SYNDROME_SRC_WANT_DRAIN);
1926
1927	init_async_submit(args: &submit, flags: ASYNC_TX_FENCE\|ASYNC_TX_PQ_XOR_DST, tx,
1928	cb_fn: ops_complete_prexor, cb_param: sh, scribble: to_addr_conv(sh, percpu, i: `0`));
1929	tx = async_gen_syndrome(blocks, offsets: offs, src_cnt: count+`2`,
1930	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
1931
1932	return tx;
1933	}
1934
1935	static struct dma_async_tx_descriptor *
1936	ops_run_biodrain(struct stripe_head sh, struct* dma_async_tx_descriptor *tx)
1937	{
1938	struct r5conf *conf = sh->raid_conf;
1939	int disks = sh->disks;
1940	int i;
1941	struct stripe_head *head_sh = sh;
1942
1943	pr_debug("%s: stripe %llu\n", __func__,
1944	(unsigned long long)sh->sector);
1945
1946	for (i = disks; i--; ) {
1947	struct r5dev *dev;
1948	struct bio *chosen;
1949
1950	sh = head_sh;
1951	if (test_and_clear_bit(nr: R5_Wantdrain, addr: &head_sh->dev[i].flags)) {
1952	struct bio *wbi;
1953
1954	again:
1955	dev = &sh->dev[i];
1956	/*
1957	* clear R5_InJournal, so when rewriting a page in
1958	* journal, it is not skipped by r5l_log_stripe()
1959	*/
1960	clear_bit(nr: R5_InJournal, addr: &dev->flags);
1961	spin_lock_irq(lock: &sh->stripe_lock);
1962	chosen = dev->towrite;
1963	dev->towrite = NULL;
1964	sh->overwrite_disks = `0`;
1965	BUG_ON(dev->written);
1966	wbi = dev->written = chosen;
1967	spin_unlock_irq(lock: &sh->stripe_lock);
1968	WARN_ON(dev->page != dev->orig_page);
1969
1970	while (wbi && wbi->bi_iter.bi_sector <
1971	dev->sector + RAID5_STRIPE_SECTORS(conf)) {
1972	if (wbi->bi_opf & REQ_FUA)
1973	set_bit(nr: R5_WantFUA, addr: &dev->flags);
1974	if (wbi->bi_opf & REQ_SYNC)
1975	set_bit(nr: R5_SyncIO, addr: &dev->flags);
1976	if (bio_op(bio: wbi) == REQ_OP_DISCARD)
1977	set_bit(nr: R5_Discard, addr: &dev->flags);
1978	else {
1979	tx = async_copy_data(frombio: `1`, bio: wbi, page: &dev->page,
1980	poff: dev->offset,
1981	sector: dev->sector, tx, sh,
1982	no_skipcopy: r5c_is_writeback(log: conf->log));
1983	if (dev->page != dev->orig_page &&
1984	!r5c_is_writeback(log: conf->log)) {
1985	set_bit(nr: R5_SkipCopy, addr: &dev->flags);
1986	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
1987	clear_bit(nr: R5_OVERWRITE, addr: &dev->flags);
1988	}
1989	}
1990	wbi = r5_next_bio(conf, bio: wbi, sector: dev->sector);
1991	}
1992
1993	if (head_sh->batch_head) {
1994	sh = list_first_entry(&sh->batch_list,
1995	struct stripe_head,
1996	batch_list);
1997	if (sh == head_sh)
1998	continue;
1999	goto again;
2000	}
2001	}
2002	}
2003
2004	return tx;
2005	}
2006
2007	static void ops_complete_reconstruct(void *stripe_head_ref)
2008	{
2009	struct stripe_head *sh = stripe_head_ref;
2010	int disks = sh->disks;
2011	int pd_idx = sh->pd_idx;
2012	int qd_idx = sh->qd_idx;
2013	int i;
2014	bool fua = false, sync = false, discard = false;
2015
2016	pr_debug("%s: stripe %llu\n", __func__,
2017	(unsigned long long)sh->sector);
2018
2019	for (i = disks; i--; ) {
2020	fua \|= test_bit(R5_WantFUA, &sh->dev[i].flags);
2021	sync \|= test_bit(R5_SyncIO, &sh->dev[i].flags);
2022	discard \|= test_bit(R5_Discard, &sh->dev[i].flags);
2023	}
2024
2025	for (i = disks; i--; ) {
2026	struct r5dev *dev = &sh->dev[i];
2027
2028	if (dev->written \|\| i == pd_idx \|\| i == qd_idx) {
2029	if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
2030	set_bit(nr: R5_UPTODATE, addr: &dev->flags);
2031	if (test_bit(STRIPE_EXPAND_READY, &sh->state))
2032	set_bit(nr: R5_Expanded, addr: &dev->flags);
2033	}
2034	if (fua)
2035	set_bit(nr: R5_WantFUA, addr: &dev->flags);
2036	if (sync)
2037	set_bit(nr: R5_SyncIO, addr: &dev->flags);
2038	}
2039	}
2040
2041	if (sh->reconstruct_state == reconstruct_state_drain_run)
2042	sh->reconstruct_state = reconstruct_state_drain_result;
2043	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
2044	sh->reconstruct_state = reconstruct_state_prexor_drain_result;
2045	else {
2046	BUG_ON(sh->reconstruct_state != reconstruct_state_run);
2047	sh->reconstruct_state = reconstruct_state_result;
2048	}
2049
2050	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
2051	raid5_release_stripe(sh);
2052	}
2053
2054	static void
2055	ops_run_reconstruct5(struct stripe_head sh, struct* raid5_percpu *percpu,
2056	struct dma_async_tx_descriptor *tx)
2057	{
2058	int disks = sh->disks;
2059	struct page **xor_srcs;
2060	unsigned int *off_srcs;
2061	struct async_submit_ctl submit;
2062	int count, pd_idx = sh->pd_idx, i;
2063	struct page *xor_dest;
2064	unsigned int off_dest;
2065	int prexor = `0`;
2066	unsigned long flags;
2067	int j = `0`;
2068	struct stripe_head *head_sh = sh;
2069	int last_stripe;
2070
2071	pr_debug("%s: stripe %llu\n", __func__,
2072	(unsigned long long)sh->sector);
2073
2074	for (i = `0`; i < sh->disks; i++) {
2075	if (pd_idx == i)
2076	continue;
2077	if (!test_bit(R5_Discard, &sh->dev[i].flags))
2078	break;
2079	}
2080	if (i >= sh->disks) {
2081	atomic_inc(v: &sh->count);
2082	set_bit(nr: R5_Discard, addr: &sh->dev[pd_idx].flags);
2083	ops_complete_reconstruct(stripe_head_ref: sh);
2084	return;
2085	}
2086	again:
2087	count = `0`;
2088	xor_srcs = to_addr_page(percpu, i: j);
2089	off_srcs = to_addr_offs(sh, percpu);
2090	/ check if prexor is active which means only process blocks*
2091	* that are part of a read-modify-write (written)
2092	*/
2093	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2094	prexor = `1`;
2095	off_dest = off_srcs[count] = sh->dev[pd_idx].offset;
2096	xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
2097	for (i = disks; i--; ) {
2098	struct r5dev *dev = &sh->dev[i];
2099	if (head_sh->dev[i].written \|\|
2100	test_bit(R5_InJournal, &head_sh->dev[i].flags)) {
2101	off_srcs[count] = dev->offset;
2102	xor_srcs[count++] = dev->page;
2103	}
2104	}
2105	} else {
2106	xor_dest = sh->dev[pd_idx].page;
2107	off_dest = sh->dev[pd_idx].offset;
2108	for (i = disks; i--; ) {
2109	struct r5dev *dev = &sh->dev[i];
2110	if (i != pd_idx) {
2111	off_srcs[count] = dev->offset;
2112	xor_srcs[count++] = dev->page;
2113	}
2114	}
2115	}
2116
2117	/ 1/ if we prexor'd then the dest is reused as a source*
2118	* 2/ if we did not prexor then we are redoing the parity
2119	* set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
2120	* for the synchronous xor case
2121	*/
2122	last_stripe = !head_sh->batch_head \|\|
2123	list_first_entry(&sh->batch_list,
2124	struct stripe_head, batch_list) == head_sh;
2125	if (last_stripe) {
2126	flags = ASYNC_TX_ACK \|
2127	(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
2128
2129	atomic_inc(v: &head_sh->count);
2130	init_async_submit(args: &submit, flags, tx, cb_fn: ops_complete_reconstruct, cb_param: head_sh,
2131	scribble: to_addr_conv(sh, percpu, i: j));
2132	} else {
2133	flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
2134	init_async_submit(args: &submit, flags, tx, NULL, NULL,
2135	scribble: to_addr_conv(sh, percpu, i: j));
2136	}
2137
2138	if (unlikely(count == `1`))
2139	tx = async_memcpy(dest: xor_dest, src: xor_srcs[`0`], dest_offset: off_dest, src_offset: off_srcs[`0`],
2140	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
2141	else
2142	tx = async_xor_offs(dest: xor_dest, offset: off_dest, src_list: xor_srcs, src_offset: off_srcs, src_cnt: count,
2143	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
2144	if (!last_stripe) {
2145	j++;
2146	sh = list_first_entry(&sh->batch_list, struct stripe_head,
2147	batch_list);
2148	goto again;
2149	}
2150	}
2151
2152	static void
2153	ops_run_reconstruct6(struct stripe_head sh, struct* raid5_percpu *percpu,
2154	struct dma_async_tx_descriptor *tx)
2155	{
2156	struct async_submit_ctl submit;
2157	struct page **blocks;
2158	unsigned int *offs;
2159	int count, i, j = `0`;
2160	struct stripe_head *head_sh = sh;
2161	int last_stripe;
2162	int synflags;
2163	unsigned long txflags;
2164
2165	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
2166
2167	for (i = `0`; i < sh->disks; i++) {
2168	if (sh->pd_idx == i \|\| sh->qd_idx == i)
2169	continue;
2170	if (!test_bit(R5_Discard, &sh->dev[i].flags))
2171	break;
2172	}
2173	if (i >= sh->disks) {
2174	atomic_inc(v: &sh->count);
2175	set_bit(nr: R5_Discard, addr: &sh->dev[sh->pd_idx].flags);
2176	set_bit(nr: R5_Discard, addr: &sh->dev[sh->qd_idx].flags);
2177	ops_complete_reconstruct(stripe_head_ref: sh);
2178	return;
2179	}
2180
2181	again:
2182	blocks = to_addr_page(percpu, i: j);
2183	offs = to_addr_offs(sh, percpu);
2184
2185	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
2186	synflags = SYNDROME_SRC_WRITTEN;
2187	txflags = ASYNC_TX_ACK \| ASYNC_TX_PQ_XOR_DST;
2188	} else {
2189	synflags = SYNDROME_SRC_ALL;
2190	txflags = ASYNC_TX_ACK;
2191	}
2192
2193	count = set_syndrome_sources(srcs: blocks, offs, sh, srctype: synflags);
2194	last_stripe = !head_sh->batch_head \|\|
2195	list_first_entry(&sh->batch_list,
2196	struct stripe_head, batch_list) == head_sh;
2197
2198	if (last_stripe) {
2199	atomic_inc(v: &head_sh->count);
2200	init_async_submit(args: &submit, flags: txflags, tx, cb_fn: ops_complete_reconstruct,
2201	cb_param: head_sh, scribble: to_addr_conv(sh, percpu, i: j));
2202	} else
2203	init_async_submit(args: &submit, flags: `0`, tx, NULL, NULL,
2204	scribble: to_addr_conv(sh, percpu, i: j));
2205	tx = async_gen_syndrome(blocks, offsets: offs, src_cnt: count+`2`,
2206	RAID5_STRIPE_SIZE(sh->raid_conf), submit: &submit);
2207	if (!last_stripe) {
2208	j++;
2209	sh = list_first_entry(&sh->batch_list, struct stripe_head,
2210	batch_list);
2211	goto again;
2212	}
2213	}
2214
2215	static void ops_complete_check(void *stripe_head_ref)
2216	{
2217	struct stripe_head *sh = stripe_head_ref;
2218
2219	pr_debug("%s: stripe %llu\n", __func__,
2220	(unsigned long long)sh->sector);
2221
2222	sh->check_state = check_state_check_result;
2223	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
2224	raid5_release_stripe(sh);
2225	}
2226
2227	static void ops_run_check_p(struct stripe_head sh, struct* raid5_percpu *percpu)
2228	{
2229	int disks = sh->disks;
2230	int pd_idx = sh->pd_idx;
2231	int qd_idx = sh->qd_idx;
2232	struct page *xor_dest;
2233	unsigned int off_dest;
2234	struct page **xor_srcs = to_addr_page(percpu, i: `0`);
2235	unsigned int *off_srcs = to_addr_offs(sh, percpu);
2236	struct dma_async_tx_descriptor *tx;
2237	struct async_submit_ctl submit;
2238	int count;
2239	int i;
2240
2241	pr_debug("%s: stripe %llu\n", __func__,
2242	(unsigned long long)sh->sector);
2243
2244	BUG_ON(sh->batch_head);
2245	count = `0`;
2246	xor_dest = sh->dev[pd_idx].page;
2247	off_dest = sh->dev[pd_idx].offset;
2248	off_srcs[count] = off_dest;
2249	xor_srcs[count++] = xor_dest;
2250	for (i = disks; i--; ) {
2251	if (i == pd_idx \|\| i == qd_idx)
2252	continue;
2253	off_srcs[count] = sh->dev[i].offset;
2254	xor_srcs[count++] = sh->dev[i].page;
2255	}
2256
2257	init_async_submit(args: &submit, flags: `0`, NULL, NULL, NULL,
2258	scribble: to_addr_conv(sh, percpu, i: `0`));
2259	tx = async_xor_val_offs(dest: xor_dest, offset: off_dest, src_list: xor_srcs, src_offset: off_srcs, src_cnt: count,
2260	RAID5_STRIPE_SIZE(sh->raid_conf),
2261	result: &sh->ops.zero_sum_result, submit: &submit);
2262
2263	atomic_inc(v: &sh->count);
2264	init_async_submit(args: &submit, flags: ASYNC_TX_ACK, tx, cb_fn: ops_complete_check, cb_param: sh, NULL);
2265	tx = async_trigger_callback(submit: &submit);
2266	}
2267
2268	static void ops_run_check_pq(struct stripe_head sh, struct* raid5_percpu percpu, int* checkp)
2269	{
2270	struct page **srcs = to_addr_page(percpu, i: `0`);
2271	unsigned int *offs = to_addr_offs(sh, percpu);
2272	struct async_submit_ctl submit;
2273	int count;
2274
2275	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
2276	(unsigned long long)sh->sector, checkp);
2277
2278	BUG_ON(sh->batch_head);
2279	count = set_syndrome_sources(srcs, offs, sh, srctype: SYNDROME_SRC_ALL);
2280	if (!checkp)
2281	srcs[count] = NULL;
2282
2283	atomic_inc(v: &sh->count);
2284	init_async_submit(args: &submit, flags: ASYNC_TX_ACK, NULL, cb_fn: ops_complete_check,
2285	cb_param: sh, scribble: to_addr_conv(sh, percpu, i: `0`));
2286	async_syndrome_val(blocks: srcs, offsets: offs, src_cnt: count+`2`,
2287	RAID5_STRIPE_SIZE(sh->raid_conf),
2288	pqres: &sh->ops.zero_sum_result, spare: percpu->spare_page, s_off: `0`, submit: &submit);
2289	}
2290
2291	static void raid_run_ops(struct stripe_head sh, unsigned* long ops_request)
2292	{
2293	int overlap_clear = `0`, i, disks = sh->disks;
2294	struct dma_async_tx_descriptor *tx = NULL;
2295	struct r5conf *conf = sh->raid_conf;
2296	int level = conf->level;
2297	struct raid5_percpu *percpu;
2298
2299	local_lock(&conf->percpu->lock);
2300	percpu = this_cpu_ptr(conf->percpu);
2301	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
2302	ops_run_biofill(sh);
2303	overlap_clear++;
2304	}
2305
2306	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
2307	if (level < `6`)
2308	tx = ops_run_compute5(sh, percpu);
2309	else {
2310	if (sh->ops.target2 < `0` \|\| sh->ops.target < `0`)
2311	tx = ops_run_compute6_1(sh, percpu);
2312	else
2313	tx = ops_run_compute6_2(sh, percpu);
2314	}
2315	/ terminate the chain if reconstruct is not set to be run /
2316	if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
2317	async_tx_ack(tx);
2318	}
2319
2320	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
2321	if (level < `6`)
2322	tx = ops_run_prexor5(sh, percpu, tx);
2323	else
2324	tx = ops_run_prexor6(sh, percpu, tx);
2325	}
2326
2327	if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
2328	tx = ops_run_partial_parity(sh, percpu, tx);
2329
2330	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
2331	tx = ops_run_biodrain(sh, tx);
2332	overlap_clear++;
2333	}
2334
2335	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
2336	if (level < `6`)
2337	ops_run_reconstruct5(sh, percpu, tx);
2338	else
2339	ops_run_reconstruct6(sh, percpu, tx);
2340	}
2341
2342	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
2343	if (sh->check_state == check_state_run)
2344	ops_run_check_p(sh, percpu);
2345	else if (sh->check_state == check_state_run_q)
2346	ops_run_check_pq(sh, percpu, checkp: `0`);
2347	else if (sh->check_state == check_state_run_pq)
2348	ops_run_check_pq(sh, percpu, checkp: `1`);
2349	else
2350	BUG();
2351	}
2352
2353	if (overlap_clear && !sh->batch_head) {
2354	for (i = disks; i--; ) {
2355	struct r5dev *dev = &sh->dev[i];
2356	if (test_and_clear_bit(nr: R5_Overlap, addr: &dev->flags))
2357	wake_up(&sh->raid_conf->wait_for_overlap);
2358	}
2359	}
2360	local_unlock(&conf->percpu->lock);
2361	}
2362
2363	static void free_stripe(struct kmem_cache sc, struct* stripe_head *sh)
2364	{
2365	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2366	kfree(sh->pages);
2367	#endif
2368	if (sh->ppl_page)
2369	__free_page(sh->ppl_page);
2370	kmem_cache_free(s: sc, objp: sh);
2371	}
2372
2373	static struct stripe_head alloc_stripe(struct* kmem_cache *sc, gfp_t gfp,
2374	int disks, struct r5conf *conf)
2375	{
2376	struct stripe_head *sh;
2377
2378	sh = kmem_cache_zalloc(k: sc, flags: gfp);
2379	if (sh) {
2380	spin_lock_init(&sh->stripe_lock);
2381	spin_lock_init(&sh->batch_lock);
2382	INIT_LIST_HEAD(list: &sh->batch_list);
2383	INIT_LIST_HEAD(list: &sh->lru);
2384	INIT_LIST_HEAD(list: &sh->r5c);
2385	INIT_LIST_HEAD(list: &sh->log_list);
2386	atomic_set(v: &sh->count, i: `1`);
2387	sh->raid_conf = conf;
2388	sh->log_start = MaxSector;
2389
2390	if (raid5_has_ppl(conf)) {
2391	sh->ppl_page = alloc_page(gfp);
2392	if (!sh->ppl_page) {
2393	free_stripe(sc, sh);
2394	return NULL;
2395	}
2396	}
2397	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2398	if (init_stripe_shared_pages(sh, conf, disks)) {
2399	free_stripe(sc, sh);
2400	return NULL;
2401	}
2402	#endif
2403	}
2404	return sh;
2405	}
2406	static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2407	{
2408	struct stripe_head *sh;
2409
2410	sh = alloc_stripe(sc: conf->slab_cache, gfp, disks: conf->pool_size, conf);
2411	if (!sh)
2412	return `0`;
2413
2414	if (grow_buffers(sh, gfp)) {
2415	shrink_buffers(sh);
2416	free_stripe(sc: conf->slab_cache, sh);
2417	return `0`;
2418	}
2419	sh->hash_lock_index =
2420	conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2421	/ we just created an active stripe so... /
2422	atomic_inc(v: &conf->active_stripes);
2423
2424	raid5_release_stripe(sh);
2425	conf->max_nr_stripes++;
2426	return `1`;
2427	}
2428
2429	static int grow_stripes(struct r5conf conf, int* num)
2430	{
2431	struct kmem_cache *sc;
2432	size_t namelen = sizeof(conf->cache_name[`0`]);
2433	int devs = max(conf->raid_disks, conf->previous_raid_disks);
2434
2435	if (conf->mddev->gendisk)
2436	snprintf(buf: conf->cache_name[`0`], size: namelen,
2437	fmt: "raid%d-%s", conf->level, mdname(mddev: conf->mddev));
2438	else
2439	snprintf(buf: conf->cache_name[`0`], size: namelen,
2440	fmt: "raid%d-%p", conf->level, conf->mddev);
2441	snprintf(buf: conf->cache_name[`1`], size: namelen, fmt: "%.27s-alt", conf->cache_name[`0`]);
2442
2443	conf->active_name = `0`;
2444	sc = kmem_cache_create(name: conf->cache_name[conf->active_name],
2445	struct_size_t(struct stripe_head, dev, devs),
2446	align: `0`, flags: `0`, NULL);
2447	if (!sc)
2448	return `1`;
2449	conf->slab_cache = sc;
2450	conf->pool_size = devs;
2451	while (num--)
2452	if (!grow_one_stripe(conf, GFP_KERNEL))
2453	return `1`;
2454
2455	return `0`;
2456	}
2457
2458	/**
2459	* scribble_alloc - allocate percpu scribble buffer for required size
2460	* of the scribble region
2461	* @percpu: from for_each_present_cpu() of the caller
2462	* @num: total number of disks in the array
2463	* @cnt: scribble objs count for required size of the scribble region
2464	*
2465	* The scribble buffer size must be enough to contain:
2466	* 1/ a struct page pointer for each device in the array +2
2467	* 2/ room to convert each entry in (1) to its corresponding dma
2468	* (dma_map_page()) or page (page_address()) address.
2469	*
2470	* Note: the +2 is for the destination buffers of the ddf/raid6 case where we
2471	* calculate over all devices (not just the data blocks), using zeros in place
2472	* of the P and Q blocks.
2473	*/
2474	static int scribble_alloc(struct raid5_percpu *percpu,
2475	int num, int cnt)
2476	{
2477	size_t obj_size =
2478	sizeof(struct page ) (num + `2`) +
2479	sizeof(addr_conv_t) * (num + `2`) +
2480	sizeof(unsigned int) * (num + `2`);
2481	void *scribble;
2482
2483	/*
2484	* If here is in raid array suspend context, it is in memalloc noio
2485	* context as well, there is no potential recursive memory reclaim
2486	* I/Os with the GFP_KERNEL flag.
2487	*/
2488	scribble = kvmalloc_array(n: cnt, size: obj_size, GFP_KERNEL);
2489	if (!scribble)
2490	return -ENOMEM;
2491
2492	kvfree(addr: percpu->scribble);
2493
2494	percpu->scribble = scribble;
2495	percpu->scribble_obj_size = obj_size;
2496	return `0`;
2497	}
2498
2499	static int resize_chunks(struct r5conf conf, int* new_disks, int new_sectors)
2500	{
2501	unsigned long cpu;
2502	int err = `0`;
2503
2504	/ Never shrink. /
2505	if (conf->scribble_disks >= new_disks &&
2506	conf->scribble_sectors >= new_sectors)
2507	return `0`;
2508
2509	raid5_quiesce(mddev: conf->mddev, quiesce: true);
2510	cpus_read_lock();
2511
2512	for_each_present_cpu(cpu) {
2513	struct raid5_percpu *percpu;
2514
2515	percpu = per_cpu_ptr(conf->percpu, cpu);
2516	err = scribble_alloc(percpu, num: new_disks,
2517	cnt: new_sectors / RAID5_STRIPE_SECTORS(conf));
2518	if (err)
2519	break;
2520	}
2521
2522	cpus_read_unlock();
2523	raid5_quiesce(mddev: conf->mddev, quiesce: false);
2524
2525	if (!err) {
2526	conf->scribble_disks = new_disks;
2527	conf->scribble_sectors = new_sectors;
2528	}
2529	return err;
2530	}
2531
2532	static int resize_stripes(struct r5conf conf, int* newsize)
2533	{
2534	/ Make all the stripes able to hold 'newsize' devices.*
2535	* New slots in each stripe get 'page' set to a new page.
2536	*
2537	* This happens in stages:
2538	* 1/ create a new kmem_cache and allocate the required number of
2539	* stripe_heads.
2540	* 2/ gather all the old stripe_heads and transfer the pages across
2541	* to the new stripe_heads. This will have the side effect of
2542	* freezing the array as once all stripe_heads have been collected,
2543	* no IO will be possible. Old stripe heads are freed once their
2544	* pages have been transferred over, and the old kmem_cache is
2545	* freed when all stripes are done.
2546	* 3/ reallocate conf->disks to be suitable bigger. If this fails,
2547	* we simple return a failure status - no need to clean anything up.
2548	* 4/ allocate new pages for the new slots in the new stripe_heads.
2549	* If this fails, we don't bother trying the shrink the
2550	* stripe_heads down again, we just leave them as they are.
2551	* As each stripe_head is processed the new one is released into
2552	* active service.
2553	*
2554	* Once step2 is started, we cannot afford to wait for a write,
2555	* so we use GFP_NOIO allocations.
2556	*/
2557	struct stripe_head osh, nsh;
2558	LIST_HEAD(newstripes);
2559	struct disk_info *ndisks;
2560	int err = `0`;
2561	struct kmem_cache *sc;
2562	int i;
2563	int hash, cnt;
2564
2565	md_allow_write(mddev: conf->mddev);
2566
2567	/ Step 1 /
2568	sc = kmem_cache_create(name: conf->cache_name[`1`-conf->active_name],
2569	struct_size_t(struct stripe_head, dev, newsize),
2570	align: `0`, flags: `0`, NULL);
2571	if (!sc)
2572	return -ENOMEM;
2573
2574	/ Need to ensure auto-resizing doesn't interfere /
2575	mutex_lock(&conf->cache_size_mutex);
2576
2577	for (i = conf->max_nr_stripes; i; i--) {
2578	nsh = alloc_stripe(sc, GFP_KERNEL, disks: newsize, conf);
2579	if (!nsh)
2580	break;
2581
2582	list_add(new: &nsh->lru, head: &newstripes);
2583	}
2584	if (i) {
2585	/ didn't get enough, give up /
2586	while (!list_empty(head: &newstripes)) {
2587	nsh = list_entry(newstripes.next, struct stripe_head, lru);
2588	list_del(entry: &nsh->lru);
2589	free_stripe(sc, sh: nsh);
2590	}
2591	kmem_cache_destroy(s: sc);
2592	mutex_unlock(lock: &conf->cache_size_mutex);
2593	return -ENOMEM;
2594	}
2595	/ Step 2 - Must use GFP_NOIO now.*
2596	* OK, we have enough stripes, start collecting inactive
2597	* stripes and copying them over
2598	*/
2599	hash = `0`;
2600	cnt = `0`;
2601	list_for_each_entry(nsh, &newstripes, lru) {
2602	lock_device_hash_lock(conf, hash);
2603	wait_event_cmd(conf->wait_for_stripe,
2604	!list_empty(conf->inactive_list + hash),
2605	unlock_device_hash_lock(conf, hash),
2606	lock_device_hash_lock(conf, hash));
2607	osh = get_free_stripe(conf, hash);
2608	unlock_device_hash_lock(conf, hash);
2609
2610	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2611	for (i = `0`; i < osh->nr_pages; i++) {
2612	nsh->pages[i] = osh->pages[i];
2613	osh->pages[i] = NULL;
2614	}
2615	#endif
2616	for(i=`0`; i<conf->pool_size; i++) {
2617	nsh->dev[i].page = osh->dev[i].page;
2618	nsh->dev[i].orig_page = osh->dev[i].page;
2619	nsh->dev[i].offset = osh->dev[i].offset;
2620	}
2621	nsh->hash_lock_index = hash;
2622	free_stripe(sc: conf->slab_cache, sh: osh);
2623	cnt++;
2624	if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2625	!!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2626	hash++;
2627	cnt = `0`;
2628	}
2629	}
2630	kmem_cache_destroy(s: conf->slab_cache);
2631
2632	/ Step 3.*
2633	* At this point, we are holding all the stripes so the array
2634	* is completely stalled, so now is a good time to resize
2635	* conf->disks and the scribble region
2636	*/
2637	ndisks = kcalloc(n: newsize, size: sizeof(struct disk_info), GFP_NOIO);
2638	if (ndisks) {
2639	for (i = `0`; i < conf->pool_size; i++)
2640	ndisks[i] = conf->disks[i];
2641
2642	for (i = conf->pool_size; i < newsize; i++) {
2643	ndisks[i].extra_page = alloc_page(GFP_NOIO);
2644	if (!ndisks[i].extra_page)
2645	err = -ENOMEM;
2646	}
2647
2648	if (err) {
2649	for (i = conf->pool_size; i < newsize; i++)
2650	if (ndisks[i].extra_page)
2651	put_page(page: ndisks[i].extra_page);
2652	kfree(objp: ndisks);
2653	} else {
2654	kfree(objp: conf->disks);
2655	conf->disks = ndisks;
2656	}
2657	} else
2658	err = -ENOMEM;
2659
2660	conf->slab_cache = sc;
2661	conf->active_name = `1`-conf->active_name;
2662
2663	/ Step 4, return new stripes to service /
2664	while(!list_empty(head: &newstripes)) {
2665	nsh = list_entry(newstripes.next, struct stripe_head, lru);
2666	list_del_init(entry: &nsh->lru);
2667
2668	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
2669	for (i = `0`; i < nsh->nr_pages; i++) {
2670	if (nsh->pages[i])
2671	continue;
2672	nsh->pages[i] = alloc_page(GFP_NOIO);
2673	if (!nsh->pages[i])
2674	err = -ENOMEM;
2675	}
2676
2677	for (i = conf->raid_disks; i < newsize; i++) {
2678	if (nsh->dev[i].page)
2679	continue;
2680	nsh->dev[i].page = raid5_get_dev_page(nsh, i);
2681	nsh->dev[i].orig_page = nsh->dev[i].page;
2682	nsh->dev[i].offset = raid5_get_page_offset(nsh, i);
2683	}
2684	#else
2685	for (i=conf->raid_disks; i < newsize; i++)
2686	if (nsh->dev[i].page == NULL) {
2687	struct page *p = alloc_page(GFP_NOIO);
2688	nsh->dev[i].page = p;
2689	nsh->dev[i].orig_page = p;
2690	nsh->dev[i].offset = `0`;
2691	if (!p)
2692	err = -ENOMEM;
2693	}
2694	#endif
2695	raid5_release_stripe(sh: nsh);
2696	}
2697	/ critical section pass, GFP_NOIO no longer needed /
2698
2699	if (!err)
2700	conf->pool_size = newsize;
2701	mutex_unlock(lock: &conf->cache_size_mutex);
2702
2703	return err;
2704	}
2705
2706	static int drop_one_stripe(struct r5conf *conf)
2707	{
2708	struct stripe_head *sh;
2709	int hash = (conf->max_nr_stripes - `1`) & STRIPE_HASH_LOCKS_MASK;
2710
2711	spin_lock_irq(lock: conf->hash_locks + hash);
2712	sh = get_free_stripe(conf, hash);
2713	spin_unlock_irq(lock: conf->hash_locks + hash);
2714	if (!sh)
2715	return `0`;
2716	BUG_ON(atomic_read(&sh->count));
2717	shrink_buffers(sh);
2718	free_stripe(sc: conf->slab_cache, sh);
2719	atomic_dec(v: &conf->active_stripes);
2720	conf->max_nr_stripes--;
2721	return `1`;
2722	}
2723
2724	static void shrink_stripes(struct r5conf *conf)
2725	{
2726	while (conf->max_nr_stripes &&
2727	drop_one_stripe(conf))
2728	;
2729
2730	kmem_cache_destroy(s: conf->slab_cache);
2731	conf->slab_cache = NULL;
2732	}
2733
2734	/*
2735	* This helper wraps rcu_dereference_protected() and can be used when
2736	* it is known that the nr_pending of the rdev is elevated.
2737	*/
2738	static struct md_rdev rdev_pend_deref(struct* md_rdev __rcu *rdev)
2739	{
2740	return rcu_dereference_protected(rdev,
2741	atomic_read(&rcu_access_pointer(rdev)->nr_pending));
2742	}
2743
2744	/*
2745	* This helper wraps rcu_dereference_protected() and should be used
2746	* when it is known that the mddev_lock() is held. This is safe
2747	* seeing raid5_remove_disk() has the same lock held.
2748	*/
2749	static struct md_rdev rdev_mdlock_deref(struct* mddev *mddev,
2750	struct md_rdev __rcu *rdev)
2751	{
2752	return rcu_dereference_protected(rdev,
2753	lockdep_is_held(&mddev->reconfig_mutex));
2754	}
2755
2756	static void raid5_end_read_request(struct bio * bi)
2757	{
2758	struct stripe_head *sh = bi->bi_private;
2759	struct r5conf *conf = sh->raid_conf;
2760	int disks = sh->disks, i;
2761	struct md_rdev *rdev = NULL;
2762	sector_t s;
2763
2764	for (i=`0` ; i<disks; i++)
2765	if (bi == &sh->dev[i].req)
2766	break;
2767
2768	pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2769	(unsigned long long)sh->sector, i, atomic_read(&sh->count),
2770	bi->bi_status);
2771	if (i == disks) {
2772	BUG();
2773	return;
2774	}
2775	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2776	/ If replacement finished while this request was outstanding,*
2777	* 'replacement' might be NULL already.
2778	* In that case it moved down to 'rdev'.
2779	* rdev is not removed until all requests are finished.
2780	*/
2781	rdev = rdev_pend_deref(rdev: conf->disks[i].replacement);
2782	if (!rdev)
2783	rdev = rdev_pend_deref(rdev: conf->disks[i].rdev);
2784
2785	if (use_new_offset(conf, sh))
2786	s = sh->sector + rdev->new_data_offset;
2787	else
2788	s = sh->sector + rdev->data_offset;
2789	if (!bi->bi_status) {
2790	set_bit(nr: R5_UPTODATE, addr: &sh->dev[i].flags);
2791	if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2792	/ Note that this cannot happen on a*
2793	* replacement device. We just fail those on
2794	* any error
2795	*/
2796	pr_info_ratelimited(
2797	"md/raid:%s: read error corrected (%lu sectors at %llu on %pg)\n",
2798	mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
2799	(unsigned long long)s,
2800	rdev->bdev);
2801	atomic_add(RAID5_STRIPE_SECTORS(conf), v: &rdev->corrected_errors);
2802	clear_bit(nr: R5_ReadError, addr: &sh->dev[i].flags);
2803	clear_bit(nr: R5_ReWrite, addr: &sh->dev[i].flags);
2804	} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2805	clear_bit(nr: R5_ReadNoMerge, addr: &sh->dev[i].flags);
2806
2807	if (test_bit(R5_InJournal, &sh->dev[i].flags))
2808	/*
2809	* end read for a page in journal, this
2810	* must be preparing for prexor in rmw
2811	*/
2812	set_bit(nr: R5_OrigPageUPTDODATE, addr: &sh->dev[i].flags);
2813
2814	if (atomic_read(v: &rdev->read_errors))
2815	atomic_set(v: &rdev->read_errors, i: `0`);
2816	} else {
2817	int retry = `0`;
2818	int set_bad = `0`;
2819
2820	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[i].flags);
2821	if (!(bi->bi_status == BLK_STS_PROTECTION))
2822	atomic_inc(v: &rdev->read_errors);
2823	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2824	pr_warn_ratelimited(
2825	"md/raid:%s: read error on replacement device (sector %llu on %pg).\n",
2826	mdname(conf->mddev),
2827	(unsigned long long)s,
2828	rdev->bdev);
2829	else if (conf->mddev->degraded >= conf->max_degraded) {
2830	set_bad = `1`;
2831	pr_warn_ratelimited(
2832	"md/raid:%s: read error not correctable (sector %llu on %pg).\n",
2833	mdname(conf->mddev),
2834	(unsigned long long)s,
2835	rdev->bdev);
2836	} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2837	/ Oh, no!!! /
2838	set_bad = `1`;
2839	pr_warn_ratelimited(
2840	"md/raid:%s: read error NOT corrected!! (sector %llu on %pg).\n",
2841	mdname(conf->mddev),
2842	(unsigned long long)s,
2843	rdev->bdev);
2844	} else if (atomic_read(v: &rdev->read_errors)
2845	> conf->max_nr_stripes) {
2846	if (!test_bit(Faulty, &rdev->flags)) {
2847	pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
2848	mdname(conf->mddev),
2849	atomic_read(&rdev->read_errors),
2850	conf->max_nr_stripes);
2851	pr_warn("md/raid:%s: Too many read errors, failing device %pg.\n",
2852	mdname(conf->mddev), rdev->bdev);
2853	}
2854	} else
2855	retry = `1`;
2856	if (set_bad && test_bit(In_sync, &rdev->flags)
2857	&& !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2858	retry = `1`;
2859	if (retry)
2860	if (sh->qd_idx >= `0` && sh->pd_idx == i)
2861	set_bit(nr: R5_ReadError, addr: &sh->dev[i].flags);
2862	else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2863	set_bit(nr: R5_ReadError, addr: &sh->dev[i].flags);
2864	clear_bit(nr: R5_ReadNoMerge, addr: &sh->dev[i].flags);
2865	} else
2866	set_bit(nr: R5_ReadNoMerge, addr: &sh->dev[i].flags);
2867	else {
2868	clear_bit(nr: R5_ReadError, addr: &sh->dev[i].flags);
2869	clear_bit(nr: R5_ReWrite, addr: &sh->dev[i].flags);
2870	if (!(set_bad
2871	&& test_bit(In_sync, &rdev->flags)
2872	&& rdev_set_badblocks(
2873	rdev, s: sh->sector, RAID5_STRIPE_SECTORS(conf), is_new: `0`)))
2874	md_error(mddev: conf->mddev, rdev);
2875	}
2876	}
2877	rdev_dec_pending(rdev, mddev: conf->mddev);
2878	bio_uninit(bi);
2879	clear_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
2880	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
2881	raid5_release_stripe(sh);
2882	}
2883
2884	static void raid5_end_write_request(struct bio *bi)
2885	{
2886	struct stripe_head *sh = bi->bi_private;
2887	struct r5conf *conf = sh->raid_conf;
2888	int disks = sh->disks, i;
2889	struct md_rdev *rdev;
2890	sector_t first_bad;
2891	int bad_sectors;
2892	int replacement = `0`;
2893
2894	for (i = `0` ; i < disks; i++) {
2895	if (bi == &sh->dev[i].req) {
2896	rdev = rdev_pend_deref(rdev: conf->disks[i].rdev);
2897	break;
2898	}
2899	if (bi == &sh->dev[i].rreq) {
2900	rdev = rdev_pend_deref(rdev: conf->disks[i].replacement);
2901	if (rdev)
2902	replacement = `1`;
2903	else
2904	/ rdev was removed and 'replacement'*
2905	* replaced it. rdev is not removed
2906	* until all requests are finished.
2907	*/
2908	rdev = rdev_pend_deref(rdev: conf->disks[i].rdev);
2909	break;
2910	}
2911	}
2912	pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2913	(unsigned long long)sh->sector, i, atomic_read(&sh->count),
2914	bi->bi_status);
2915	if (i == disks) {
2916	BUG();
2917	return;
2918	}
2919
2920	if (replacement) {
2921	if (bi->bi_status)
2922	md_error(mddev: conf->mddev, rdev);
2923	else if (is_badblock(rdev, s: sh->sector,
2924	RAID5_STRIPE_SECTORS(conf),
2925	first_bad: &first_bad, bad_sectors: &bad_sectors))
2926	set_bit(nr: R5_MadeGoodRepl, addr: &sh->dev[i].flags);
2927	} else {
2928	if (bi->bi_status) {
2929	set_bit(nr: STRIPE_DEGRADED, addr: &sh->state);
2930	set_bit(nr: WriteErrorSeen, addr: &rdev->flags);
2931	set_bit(nr: R5_WriteError, addr: &sh->dev[i].flags);
2932	if (!test_and_set_bit(nr: WantReplacement, addr: &rdev->flags))
2933	set_bit(nr: MD_RECOVERY_NEEDED,
2934	addr: &rdev->mddev->recovery);
2935	} else if (is_badblock(rdev, s: sh->sector,
2936	RAID5_STRIPE_SECTORS(conf),
2937	first_bad: &first_bad, bad_sectors: &bad_sectors)) {
2938	set_bit(nr: R5_MadeGood, addr: &sh->dev[i].flags);
2939	if (test_bit(R5_ReadError, &sh->dev[i].flags))
2940	/ That was a successful write so make*
2941	* sure it looks like we already did
2942	* a re-write.
2943	*/
2944	set_bit(nr: R5_ReWrite, addr: &sh->dev[i].flags);
2945	}
2946	}
2947	rdev_dec_pending(rdev, mddev: conf->mddev);
2948
2949	if (sh->batch_head && bi->bi_status && !replacement)
2950	set_bit(nr: STRIPE_BATCH_ERR, addr: &sh->batch_head->state);
2951
2952	bio_uninit(bi);
2953	if (!test_and_clear_bit(nr: R5_DOUBLE_LOCKED, addr: &sh->dev[i].flags))
2954	clear_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
2955	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
2956
2957	if (sh->batch_head && sh != sh->batch_head)
2958	raid5_release_stripe(sh: sh->batch_head);
2959	raid5_release_stripe(sh);
2960	}
2961
2962	static void raid5_error(struct mddev mddev, struct* md_rdev *rdev)
2963	{
2964	struct r5conf *conf = mddev->private;
2965	unsigned long flags;
2966	pr_debug("raid456: error called\n");
2967
2968	pr_crit("md/raid:%s: Disk failure on %pg, disabling device.\n",
2969	mdname(mddev), rdev->bdev);
2970
2971	spin_lock_irqsave(&conf->device_lock, flags);
2972	set_bit(nr: Faulty, addr: &rdev->flags);
2973	clear_bit(nr: In_sync, addr: &rdev->flags);
2974	mddev->degraded = raid5_calc_degraded(conf);
2975
2976	if (has_failed(conf)) {
2977	set_bit(nr: MD_BROKEN, addr: &conf->mddev->flags);
2978	conf->recovery_disabled = mddev->recovery_disabled;
2979
2980	pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
2981	mdname(mddev), mddev->degraded, conf->raid_disks);
2982	} else {
2983	pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
2984	mdname(mddev), conf->raid_disks - mddev->degraded);
2985	}
2986
2987	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
2988	set_bit(nr: MD_RECOVERY_INTR, addr: &mddev->recovery);
2989
2990	set_bit(nr: Blocked, addr: &rdev->flags);
2991	set_mask_bits(&mddev->sb_flags, `0`,
2992	BIT(MD_SB_CHANGE_DEVS) \| BIT(MD_SB_CHANGE_PENDING));
2993	r5c_update_on_rdev_error(mddev, rdev);
2994	}
2995
2996	/*
2997	* Input: a 'big' sector number,
2998	* Output: index of the data and parity disk, and the sector # in them.
2999	*/
3000	sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
3001	int previous, int *dd_idx,
3002	struct stripe_head *sh)
3003	{
3004	sector_t stripe, stripe2;
3005	sector_t chunk_number;
3006	unsigned int chunk_offset;
3007	int pd_idx, qd_idx;
3008	int ddf_layout = `0`;
3009	sector_t new_sector;
3010	int algorithm = previous ? conf->prev_algo
3011	: conf->algorithm;
3012	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
3013	: conf->chunk_sectors;
3014	int raid_disks = previous ? conf->previous_raid_disks
3015	: conf->raid_disks;
3016	int data_disks = raid_disks - conf->max_degraded;
3017
3018	/ First compute the information on this sector /
3019
3020	/*
3021	* Compute the chunk number and the sector offset inside the chunk
3022	*/
3023	chunk_offset = sector_div(r_sector, sectors_per_chunk);
3024	chunk_number = r_sector;
3025
3026	/*
3027	* Compute the stripe number
3028	*/
3029	stripe = chunk_number;
3030	*dd_idx = sector_div(stripe, data_disks);
3031	stripe2 = stripe;
3032	/*
3033	* Select the parity disk based on the user selected algorithm.
3034	*/
3035	pd_idx = qd_idx = -`1`;
3036	switch(conf->level) {
3037	case `4`:
3038	pd_idx = data_disks;
3039	break;
3040	case `5`:
3041	switch (algorithm) {
3042	case ALGORITHM_LEFT_ASYMMETRIC:
3043	pd_idx = data_disks - sector_div(stripe2, raid_disks);
3044	if (*dd_idx >= pd_idx)
3045	(*dd_idx)++;
3046	break;
3047	case ALGORITHM_RIGHT_ASYMMETRIC:
3048	pd_idx = sector_div(stripe2, raid_disks);
3049	if (*dd_idx >= pd_idx)
3050	(*dd_idx)++;
3051	break;
3052	case ALGORITHM_LEFT_SYMMETRIC:
3053	pd_idx = data_disks - sector_div(stripe2, raid_disks);
3054	dd_idx = (pd_idx + `1` + dd_idx) % raid_disks;
3055	break;
3056	case ALGORITHM_RIGHT_SYMMETRIC:
3057	pd_idx = sector_div(stripe2, raid_disks);
3058	dd_idx = (pd_idx + `1` + dd_idx) % raid_disks;
3059	break;
3060	case ALGORITHM_PARITY_0:
3061	pd_idx = `0`;
3062	(*dd_idx)++;
3063	break;
3064	case ALGORITHM_PARITY_N:
3065	pd_idx = data_disks;
3066	break;
3067	default:
3068	BUG();
3069	}
3070	break;
3071	case `6`:
3072
3073	switch (algorithm) {
3074	case ALGORITHM_LEFT_ASYMMETRIC:
3075	pd_idx = raid_disks - `1` - sector_div(stripe2, raid_disks);
3076	qd_idx = pd_idx + `1`;
3077	if (pd_idx == raid_disks-`1`) {
3078	(dd_idx)++; /* Q D D D P /
3079	qd_idx = `0`;
3080	} else if (*dd_idx >= pd_idx)
3081	(dd_idx) += `2`; /* D D P Q D /
3082	break;
3083	case ALGORITHM_RIGHT_ASYMMETRIC:
3084	pd_idx = sector_div(stripe2, raid_disks);
3085	qd_idx = pd_idx + `1`;
3086	if (pd_idx == raid_disks-`1`) {
3087	(dd_idx)++; /* Q D D D P /
3088	qd_idx = `0`;
3089	} else if (*dd_idx >= pd_idx)
3090	(dd_idx) += `2`; /* D D P Q D /
3091	break;
3092	case ALGORITHM_LEFT_SYMMETRIC:
3093	pd_idx = raid_disks - `1` - sector_div(stripe2, raid_disks);
3094	qd_idx = (pd_idx + `1`) % raid_disks;
3095	dd_idx = (pd_idx + `2` + dd_idx) % raid_disks;
3096	break;
3097	case ALGORITHM_RIGHT_SYMMETRIC:
3098	pd_idx = sector_div(stripe2, raid_disks);
3099	qd_idx = (pd_idx + `1`) % raid_disks;
3100	dd_idx = (pd_idx + `2` + dd_idx) % raid_disks;
3101	break;
3102
3103	case ALGORITHM_PARITY_0:
3104	pd_idx = `0`;
3105	qd_idx = `1`;
3106	(*dd_idx) += `2`;
3107	break;
3108	case ALGORITHM_PARITY_N:
3109	pd_idx = data_disks;
3110	qd_idx = data_disks + `1`;
3111	break;
3112
3113	case ALGORITHM_ROTATING_ZERO_RESTART:
3114	/ Exactly the same as RIGHT_ASYMMETRIC, but or*
3115	* of blocks for computing Q is different.
3116	*/
3117	pd_idx = sector_div(stripe2, raid_disks);
3118	qd_idx = pd_idx + `1`;
3119	if (pd_idx == raid_disks-`1`) {
3120	(dd_idx)++; /* Q D D D P /
3121	qd_idx = `0`;
3122	} else if (*dd_idx >= pd_idx)
3123	(dd_idx) += `2`; /* D D P Q D /
3124	ddf_layout = `1`;
3125	break;
3126
3127	case ALGORITHM_ROTATING_N_RESTART:
3128	/ Same a left_asymmetric, by first stripe is*
3129	* D D D P Q rather than
3130	* Q D D D P
3131	*/
3132	stripe2 += `1`;
3133	pd_idx = raid_disks - `1` - sector_div(stripe2, raid_disks);
3134	qd_idx = pd_idx + `1`;
3135	if (pd_idx == raid_disks-`1`) {
3136	(dd_idx)++; /* Q D D D P /
3137	qd_idx = `0`;
3138	} else if (*dd_idx >= pd_idx)
3139	(dd_idx) += `2`; /* D D P Q D /
3140	ddf_layout = `1`;
3141	break;
3142
3143	case ALGORITHM_ROTATING_N_CONTINUE:
3144	/ Same as left_symmetric but Q is before P /
3145	pd_idx = raid_disks - `1` - sector_div(stripe2, raid_disks);
3146	qd_idx = (pd_idx + raid_disks - `1`) % raid_disks;
3147	dd_idx = (pd_idx + `1` + dd_idx) % raid_disks;
3148	ddf_layout = `1`;
3149	break;
3150
3151	case ALGORITHM_LEFT_ASYMMETRIC_6:
3152	/ RAID5 left_asymmetric, with Q on last device /
3153	pd_idx = data_disks - sector_div(stripe2, raid_disks-`1`);
3154	if (*dd_idx >= pd_idx)
3155	(*dd_idx)++;
3156	qd_idx = raid_disks - `1`;
3157	break;
3158
3159	case ALGORITHM_RIGHT_ASYMMETRIC_6:
3160	pd_idx = sector_div(stripe2, raid_disks-`1`);
3161	if (*dd_idx >= pd_idx)
3162	(*dd_idx)++;
3163	qd_idx = raid_disks - `1`;
3164	break;
3165
3166	case ALGORITHM_LEFT_SYMMETRIC_6:
3167	pd_idx = data_disks - sector_div(stripe2, raid_disks-`1`);
3168	dd_idx = (pd_idx + `1` + dd_idx) % (raid_disks-`1`);
3169	qd_idx = raid_disks - `1`;
3170	break;
3171
3172	case ALGORITHM_RIGHT_SYMMETRIC_6:
3173	pd_idx = sector_div(stripe2, raid_disks-`1`);
3174	dd_idx = (pd_idx + `1` + dd_idx) % (raid_disks-`1`);
3175	qd_idx = raid_disks - `1`;
3176	break;
3177
3178	case ALGORITHM_PARITY_0_6:
3179	pd_idx = `0`;
3180	(*dd_idx)++;
3181	qd_idx = raid_disks - `1`;
3182	break;
3183
3184	default:
3185	BUG();
3186	}
3187	break;
3188	}
3189
3190	if (sh) {
3191	sh->pd_idx = pd_idx;
3192	sh->qd_idx = qd_idx;
3193	sh->ddf_layout = ddf_layout;
3194	}
3195	/*
3196	* Finally, compute the new sector number
3197	*/
3198	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
3199	return new_sector;
3200	}
3201
3202	sector_t raid5_compute_blocknr(struct stripe_head sh, int* i, int previous)
3203	{
3204	struct r5conf *conf = sh->raid_conf;
3205	int raid_disks = sh->disks;
3206	int data_disks = raid_disks - conf->max_degraded;
3207	sector_t new_sector = sh->sector, check;
3208	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
3209	: conf->chunk_sectors;
3210	int algorithm = previous ? conf->prev_algo
3211	: conf->algorithm;
3212	sector_t stripe;
3213	int chunk_offset;
3214	sector_t chunk_number;
3215	int dummy1, dd_idx = i;
3216	sector_t r_sector;
3217	struct stripe_head sh2;
3218
3219	chunk_offset = sector_div(new_sector, sectors_per_chunk);
3220	stripe = new_sector;
3221
3222	if (i == sh->pd_idx)
3223	return `0`;
3224	switch(conf->level) {
3225	case `4`: break;
3226	case `5`:
3227	switch (algorithm) {
3228	case ALGORITHM_LEFT_ASYMMETRIC:
3229	case ALGORITHM_RIGHT_ASYMMETRIC:
3230	if (i > sh->pd_idx)
3231	i--;
3232	break;
3233	case ALGORITHM_LEFT_SYMMETRIC:
3234	case ALGORITHM_RIGHT_SYMMETRIC:
3235	if (i < sh->pd_idx)
3236	i += raid_disks;
3237	i -= (sh->pd_idx + `1`);
3238	break;
3239	case ALGORITHM_PARITY_0:
3240	i -= `1`;
3241	break;
3242	case ALGORITHM_PARITY_N:
3243	break;
3244	default:
3245	BUG();
3246	}
3247	break;
3248	case `6`:
3249	if (i == sh->qd_idx)
3250	return `0`; / It is the Q disk /
3251	switch (algorithm) {
3252	case ALGORITHM_LEFT_ASYMMETRIC:
3253	case ALGORITHM_RIGHT_ASYMMETRIC:
3254	case ALGORITHM_ROTATING_ZERO_RESTART:
3255	case ALGORITHM_ROTATING_N_RESTART:
3256	if (sh->pd_idx == raid_disks-`1`)
3257	i--; / Q D D D P /
3258	else if (i > sh->pd_idx)
3259	i -= `2`; / D D P Q D /
3260	break;
3261	case ALGORITHM_LEFT_SYMMETRIC:
3262	case ALGORITHM_RIGHT_SYMMETRIC:
3263	if (sh->pd_idx == raid_disks-`1`)
3264	i--; / Q D D D P /
3265	else {
3266	/ D D P Q D /
3267	if (i < sh->pd_idx)
3268	i += raid_disks;
3269	i -= (sh->pd_idx + `2`);
3270	}
3271	break;
3272	case ALGORITHM_PARITY_0:
3273	i -= `2`;
3274	break;
3275	case ALGORITHM_PARITY_N:
3276	break;
3277	case ALGORITHM_ROTATING_N_CONTINUE:
3278	/ Like left_symmetric, but P is before Q /
3279	if (sh->pd_idx == `0`)
3280	i--; / P D D D Q /
3281	else {
3282	/ D D Q P D /
3283	if (i < sh->pd_idx)
3284	i += raid_disks;
3285	i -= (sh->pd_idx + `1`);
3286	}
3287	break;
3288	case ALGORITHM_LEFT_ASYMMETRIC_6:
3289	case ALGORITHM_RIGHT_ASYMMETRIC_6:
3290	if (i > sh->pd_idx)
3291	i--;
3292	break;
3293	case ALGORITHM_LEFT_SYMMETRIC_6:
3294	case ALGORITHM_RIGHT_SYMMETRIC_6:
3295	if (i < sh->pd_idx)
3296	i += data_disks + `1`;
3297	i -= (sh->pd_idx + `1`);
3298	break;
3299	case ALGORITHM_PARITY_0_6:
3300	i -= `1`;
3301	break;
3302	default:
3303	BUG();
3304	}
3305	break;
3306	}
3307
3308	chunk_number = stripe * data_disks + i;
3309	r_sector = chunk_number * sectors_per_chunk + chunk_offset;
3310
3311	check = raid5_compute_sector(conf, r_sector,
3312	previous, dd_idx: &dummy1, sh: &sh2);
3313	if (check != sh->sector \|\| dummy1 != dd_idx \|\| sh2.pd_idx != sh->pd_idx
3314	\|\| sh2.qd_idx != sh->qd_idx) {
3315	pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
3316	mdname(conf->mddev));
3317	return `0`;
3318	}
3319	return r_sector;
3320	}
3321
3322	/*
3323	* There are cases where we want handle_stripe_dirtying() and
3324	* schedule_reconstruction() to delay towrite to some dev of a stripe.
3325	*
3326	* This function checks whether we want to delay the towrite. Specifically,
3327	* we delay the towrite when:
3328	*
3329	* 1. degraded stripe has a non-overwrite to the missing dev, AND this
3330	* stripe has data in journal (for other devices).
3331	*
3332	* In this case, when reading data for the non-overwrite dev, it is
3333	* necessary to handle complex rmw of write back cache (prexor with
3334	* orig_page, and xor with page). To keep read path simple, we would
3335	* like to flush data in journal to RAID disks first, so complex rmw
3336	* is handled in the write patch (handle_stripe_dirtying).
3337	*
3338	* 2. when journal space is critical (R5C_LOG_CRITICAL=1)
3339	*
3340	* It is important to be able to flush all stripes in raid5-cache.
3341	* Therefore, we need reserve some space on the journal device for
3342	* these flushes. If flush operation includes pending writes to the
3343	* stripe, we need to reserve (conf->raid_disk + 1) pages per stripe
3344	* for the flush out. If we exclude these pending writes from flush
3345	* operation, we only need (conf->max_degraded + 1) pages per stripe.
3346	* Therefore, excluding pending writes in these cases enables more
3347	* efficient use of the journal device.
3348	*
3349	* Note: To make sure the stripe makes progress, we only delay
3350	* towrite for stripes with data already in journal (injournal > 0).
3351	* When LOG_CRITICAL, stripes with injournal == 0 will be sent to
3352	* no_space_stripes list.
3353	*
3354	* 3. during journal failure
3355	* In journal failure, we try to flush all cached data to raid disks
3356	* based on data in stripe cache. The array is read-only to upper
3357	* layers, so we would skip all pending writes.
3358	*
3359	*/
3360	static inline bool delay_towrite(struct r5conf *conf,
3361	struct r5dev *dev,
3362	struct stripe_head_state *s)
3363	{
3364	/ case 1 above /
3365	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3366	!test_bit(R5_Insync, &dev->flags) && s->injournal)
3367	return true;
3368	/ case 2 above /
3369	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
3370	s->injournal > `0`)
3371	return true;
3372	/ case 3 above /
3373	if (s->log_failed && s->injournal)
3374	return true;
3375	return false;
3376	}
3377
3378	static void
3379	schedule_reconstruction(struct stripe_head sh, struct* stripe_head_state *s,
3380	int rcw, int expand)
3381	{
3382	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
3383	struct r5conf *conf = sh->raid_conf;
3384	int level = conf->level;
3385
3386	if (rcw) {
3387	/*
3388	* In some cases, handle_stripe_dirtying initially decided to
3389	* run rmw and allocates extra page for prexor. However, rcw is
3390	* cheaper later on. We need to free the extra page now,
3391	* because we won't be able to do that in ops_complete_prexor().
3392	*/
3393	r5c_release_extra_page(sh);
3394
3395	for (i = disks; i--; ) {
3396	struct r5dev *dev = &sh->dev[i];
3397
3398	if (dev->towrite && !delay_towrite(conf, dev, s)) {
3399	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3400	set_bit(nr: R5_Wantdrain, addr: &dev->flags);
3401	if (!expand)
3402	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
3403	s->locked++;
3404	} else if (test_bit(R5_InJournal, &dev->flags)) {
3405	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3406	s->locked++;
3407	}
3408	}
3409	/ if we are not expanding this is a proper write request, and*
3410	* there will be bios with new data to be drained into the
3411	* stripe cache
3412	*/
3413	if (!expand) {
3414	if (!s->locked)
3415	/ False alarm, nothing to do /
3416	return;
3417	sh->reconstruct_state = reconstruct_state_drain_run;
3418	set_bit(nr: STRIPE_OP_BIODRAIN, addr: &s->ops_request);
3419	} else
3420	sh->reconstruct_state = reconstruct_state_run;
3421
3422	set_bit(nr: STRIPE_OP_RECONSTRUCT, addr: &s->ops_request);
3423
3424	if (s->locked + conf->max_degraded == disks)
3425	if (!test_and_set_bit(nr: STRIPE_FULL_WRITE, addr: &sh->state))
3426	atomic_inc(v: &conf->pending_full_writes);
3427	} else {
3428	BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) \|\|
3429	test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
3430	BUG_ON(level == `6` &&
3431	(!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) \|\|
3432	test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
3433
3434	for (i = disks; i--; ) {
3435	struct r5dev *dev = &sh->dev[i];
3436	if (i == pd_idx \|\| i == qd_idx)
3437	continue;
3438
3439	if (dev->towrite &&
3440	(test_bit(R5_UPTODATE, &dev->flags) \|\|
3441	test_bit(R5_Wantcompute, &dev->flags))) {
3442	set_bit(nr: R5_Wantdrain, addr: &dev->flags);
3443	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3444	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
3445	s->locked++;
3446	} else if (test_bit(R5_InJournal, &dev->flags)) {
3447	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3448	s->locked++;
3449	}
3450	}
3451	if (!s->locked)
3452	/ False alarm - nothing to do /
3453	return;
3454	sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3455	set_bit(nr: STRIPE_OP_PREXOR, addr: &s->ops_request);
3456	set_bit(nr: STRIPE_OP_BIODRAIN, addr: &s->ops_request);
3457	set_bit(nr: STRIPE_OP_RECONSTRUCT, addr: &s->ops_request);
3458	}
3459
3460	/ keep the parity disk(s) locked while asynchronous operations*
3461	* are in flight
3462	*/
3463	set_bit(nr: R5_LOCKED, addr: &sh->dev[pd_idx].flags);
3464	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[pd_idx].flags);
3465	s->locked++;
3466
3467	if (level == `6`) {
3468	int qd_idx = sh->qd_idx;
3469	struct r5dev *dev = &sh->dev[qd_idx];
3470
3471	set_bit(nr: R5_LOCKED, addr: &dev->flags);
3472	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
3473	s->locked++;
3474	}
3475
3476	if (raid5_has_ppl(conf: sh->raid_conf) && sh->ppl_page &&
3477	test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
3478	!test_bit(STRIPE_FULL_WRITE, &sh->state) &&
3479	test_bit(R5_Insync, &sh->dev[pd_idx].flags))
3480	set_bit(nr: STRIPE_OP_PARTIAL_PARITY, addr: &s->ops_request);
3481
3482	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3483	__func__, (unsigned long long)sh->sector,
3484	s->locked, s->ops_request);
3485	}
3486
3487	static bool stripe_bio_overlaps(struct stripe_head sh, struct* bio *bi,
3488	int dd_idx, int forwrite)
3489	{
3490	struct r5conf *conf = sh->raid_conf;
3491	struct bio **bip;
3492
3493	pr_debug("checking bi b#%llu to stripe s#%llu\n",
3494	bi->bi_iter.bi_sector, sh->sector);
3495
3496	/ Don't allow new IO added to stripes in batch list /
3497	if (sh->batch_head)
3498	return true;
3499
3500	if (forwrite)
3501	bip = &sh->dev[dd_idx].towrite;
3502	else
3503	bip = &sh->dev[dd_idx].toread;
3504
3505	while (bip && (bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3506	if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3507	return true;
3508	bip = &(*bip)->bi_next;
3509	}
3510
3511	if (bip && (bip)->bi_iter.bi_sector < bio_end_sector(bi))
3512	return true;
3513
3514	if (forwrite && raid5_has_ppl(conf)) {
3515	/*
3516	* With PPL only writes to consecutive data chunks within a
3517	* stripe are allowed because for a single stripe_head we can
3518	* only have one PPL entry at a time, which describes one data
3519	* range. Not really an overlap, but wait_for_overlap can be
3520	* used to handle this.
3521	*/
3522	sector_t sector;
3523	sector_t first = `0`;
3524	sector_t last = `0`;
3525	int count = `0`;
3526	int i;
3527
3528	for (i = `0`; i < sh->disks; i++) {
3529	if (i != sh->pd_idx &&
3530	(i == dd_idx \|\| sh->dev[i].towrite)) {
3531	sector = sh->dev[i].sector;
3532	if (count == `0` \|\| sector < first)
3533	first = sector;
3534	if (sector > last)
3535	last = sector;
3536	count++;
3537	}
3538	}
3539
3540	if (first + conf->chunk_sectors * (count - `1`) != last)
3541	return true;
3542	}
3543
3544	return false;
3545	}
3546
3547	static void __add_stripe_bio(struct stripe_head sh, struct* bio *bi,
3548	int dd_idx, int forwrite, int previous)
3549	{
3550	struct r5conf *conf = sh->raid_conf;
3551	struct bio **bip;
3552	int firstwrite = `0`;
3553
3554	if (forwrite) {
3555	bip = &sh->dev[dd_idx].towrite;
3556	if (!*bip)
3557	firstwrite = `1`;
3558	} else {
3559	bip = &sh->dev[dd_idx].toread;
3560	}
3561
3562	while (bip && (bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector)
3563	bip = &(*bip)->bi_next;
3564
3565	if (!forwrite \|\| previous)
3566	clear_bit(nr: STRIPE_BATCH_READY, addr: &sh->state);
3567
3568	BUG_ON(bip && bi->bi_next && (bip) != bi->bi_next);
3569	if (*bip)
3570	bi->bi_next = *bip;
3571	*bip = bi;
3572	bio_inc_remaining(bio: bi);
3573	md_write_inc(mddev: conf->mddev, bi);
3574
3575	if (forwrite) {
3576	/ check if page is covered /
3577	sector_t sector = sh->dev[dd_idx].sector;
3578	for (bi=sh->dev[dd_idx].towrite;
3579	sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
3580	bi && bi->bi_iter.bi_sector <= sector;
3581	bi = r5_next_bio(conf, bio: bi, sector: sh->dev[dd_idx].sector)) {
3582	if (bio_end_sector(bi) >= sector)
3583	sector = bio_end_sector(bi);
3584	}
3585	if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
3586	if (!test_and_set_bit(nr: R5_OVERWRITE, addr: &sh->dev[dd_idx].flags))
3587	sh->overwrite_disks++;
3588	}
3589
3590	pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n",
3591	(*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
3592	sh->dev[dd_idx].sector);
3593
3594	if (conf->mddev->bitmap && firstwrite) {
3595	/ Cannot hold spinlock over bitmap_startwrite,*
3596	* but must ensure this isn't added to a batch until
3597	* we have added to the bitmap and set bm_seq.
3598	* So set STRIPE_BITMAP_PENDING to prevent
3599	* batching.
3600	* If multiple __add_stripe_bio() calls race here they
3601	* much all set STRIPE_BITMAP_PENDING. So only the first one
3602	* to complete "bitmap_startwrite" gets to set
3603	* STRIPE_BIT_DELAY. This is important as once a stripe
3604	* is added to a batch, STRIPE_BIT_DELAY cannot be changed
3605	* any more.
3606	*/
3607	set_bit(nr: STRIPE_BITMAP_PENDING, addr: &sh->state);
3608	spin_unlock_irq(lock: &sh->stripe_lock);
3609	md_bitmap_startwrite(bitmap: conf->mddev->bitmap, offset: sh->sector,
3610	RAID5_STRIPE_SECTORS(conf), behind: `0`);
3611	spin_lock_irq(lock: &sh->stripe_lock);
3612	clear_bit(nr: STRIPE_BITMAP_PENDING, addr: &sh->state);
3613	if (!sh->batch_head) {
3614	sh->bm_seq = conf->seq_flush+`1`;
3615	set_bit(nr: STRIPE_BIT_DELAY, addr: &sh->state);
3616	}
3617	}
3618	}
3619
3620	/*
3621	* Each stripe/dev can have one or more bios attached.
3622	* toread/towrite point to the first in a chain.
3623	* The bi_next chain must be in order.
3624	*/
3625	static bool add_stripe_bio(struct stripe_head sh, struct* bio *bi,
3626	int dd_idx, int forwrite, int previous)
3627	{
3628	spin_lock_irq(lock: &sh->stripe_lock);
3629
3630	if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
3631	set_bit(nr: R5_Overlap, addr: &sh->dev[dd_idx].flags);
3632	spin_unlock_irq(lock: &sh->stripe_lock);
3633	return false;
3634	}
3635
3636	__add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
3637	spin_unlock_irq(lock: &sh->stripe_lock);
3638	return true;
3639	}
3640
3641	static void end_reshape(struct r5conf *conf);
3642
3643	static void stripe_set_idx(sector_t stripe, struct r5conf conf, int* previous,
3644	struct stripe_head *sh)
3645	{
3646	int sectors_per_chunk =
3647	previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3648	int dd_idx;
3649	int chunk_offset = sector_div(stripe, sectors_per_chunk);
3650	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3651
3652	raid5_compute_sector(conf,
3653	r_sector: stripe * (disks - conf->max_degraded)
3654	*sectors_per_chunk + chunk_offset,
3655	previous,
3656	dd_idx: &dd_idx, sh);
3657	}
3658
3659	static void
3660	handle_failed_stripe(struct r5conf conf, struct* stripe_head *sh,
3661	struct stripe_head_state s, int* disks)
3662	{
3663	int i;
3664	BUG_ON(sh->batch_head);
3665	for (i = disks; i--; ) {
3666	struct bio *bi;
3667	int bitmap_end = `0`;
3668
3669	if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3670	struct md_rdev *rdev;
3671	rcu_read_lock();
3672	rdev = rcu_dereference(conf->disks[i].rdev);
3673	if (rdev && test_bit(In_sync, &rdev->flags) &&
3674	!test_bit(Faulty, &rdev->flags))
3675	atomic_inc(v: &rdev->nr_pending);
3676	else
3677	rdev = NULL;
3678	rcu_read_unlock();
3679	if (rdev) {
3680	if (!rdev_set_badblocks(
3681	rdev,
3682	s: sh->sector,
3683	RAID5_STRIPE_SECTORS(conf), is_new: `0`))
3684	md_error(mddev: conf->mddev, rdev);
3685	rdev_dec_pending(rdev, mddev: conf->mddev);
3686	}
3687	}
3688	spin_lock_irq(lock: &sh->stripe_lock);
3689	/ fail all writes first /
3690	bi = sh->dev[i].towrite;
3691	sh->dev[i].towrite = NULL;
3692	sh->overwrite_disks = `0`;
3693	spin_unlock_irq(lock: &sh->stripe_lock);
3694	if (bi)
3695	bitmap_end = `1`;
3696
3697	log_stripe_write_finished(sh);
3698
3699	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[i].flags))
3700	wake_up(&conf->wait_for_overlap);
3701
3702	while (bi && bi->bi_iter.bi_sector <
3703	sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3704	struct bio *nextbi = r5_next_bio(conf, bio: bi, sector: sh->dev[i].sector);
3705
3706	md_write_end(mddev: conf->mddev);
3707	bio_io_error(bio: bi);
3708	bi = nextbi;
3709	}
3710	if (bitmap_end)
3711	md_bitmap_endwrite(bitmap: conf->mddev->bitmap, offset: sh->sector,
3712	RAID5_STRIPE_SECTORS(conf), success: `0`, behind: `0`);
3713	bitmap_end = `0`;
3714	/ and fail all 'written' /
3715	bi = sh->dev[i].written;
3716	sh->dev[i].written = NULL;
3717	if (test_and_clear_bit(nr: R5_SkipCopy, addr: &sh->dev[i].flags)) {
3718	WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3719	sh->dev[i].page = sh->dev[i].orig_page;
3720	}
3721
3722	if (bi) bitmap_end = `1`;
3723	while (bi && bi->bi_iter.bi_sector <
3724	sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3725	struct bio *bi2 = r5_next_bio(conf, bio: bi, sector: sh->dev[i].sector);
3726
3727	md_write_end(mddev: conf->mddev);
3728	bio_io_error(bio: bi);
3729	bi = bi2;
3730	}
3731
3732	/ fail any reads if this device is non-operational and*
3733	* the data has not reached the cache yet.
3734	*/
3735	if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3736	s->failed > conf->max_degraded &&
3737	(!test_bit(R5_Insync, &sh->dev[i].flags) \|\|
3738	test_bit(R5_ReadError, &sh->dev[i].flags))) {
3739	spin_lock_irq(lock: &sh->stripe_lock);
3740	bi = sh->dev[i].toread;
3741	sh->dev[i].toread = NULL;
3742	spin_unlock_irq(lock: &sh->stripe_lock);
3743	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[i].flags))
3744	wake_up(&conf->wait_for_overlap);
3745	if (bi)
3746	s->to_read--;
3747	while (bi && bi->bi_iter.bi_sector <
3748	sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
3749	struct bio *nextbi =
3750	r5_next_bio(conf, bio: bi, sector: sh->dev[i].sector);
3751
3752	bio_io_error(bio: bi);
3753	bi = nextbi;
3754	}
3755	}
3756	if (bitmap_end)
3757	md_bitmap_endwrite(bitmap: conf->mddev->bitmap, offset: sh->sector,
3758	RAID5_STRIPE_SECTORS(conf), success: `0`, behind: `0`);
3759	/ If we were in the middle of a write the parity block might*
3760	* still be locked - so just clear all R5_LOCKED flags
3761	*/
3762	clear_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
3763	}
3764	s->to_write = `0`;
3765	s->written = `0`;
3766
3767	if (test_and_clear_bit(nr: STRIPE_FULL_WRITE, addr: &sh->state))
3768	if (atomic_dec_and_test(v: &conf->pending_full_writes))
3769	md_wakeup_thread(thread: conf->mddev->thread);
3770	}
3771
3772	static void
3773	handle_failed_sync(struct r5conf conf, struct* stripe_head *sh,
3774	struct stripe_head_state *s)
3775	{
3776	int abort = `0`;
3777	int i;
3778
3779	BUG_ON(sh->batch_head);
3780	clear_bit(nr: STRIPE_SYNCING, addr: &sh->state);
3781	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[sh->pd_idx].flags))
3782	wake_up(&conf->wait_for_overlap);
3783	s->syncing = `0`;
3784	s->replacing = `0`;
3785	/ There is nothing more to do for sync/check/repair.*
3786	* Don't even need to abort as that is handled elsewhere
3787	* if needed, and not always wanted e.g. if there is a known
3788	* bad block here.
3789	* For recover/replace we need to record a bad block on all
3790	* non-sync devices, or abort the recovery
3791	*/
3792	if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
3793	/ During recovery devices cannot be removed, so*
3794	* locking and refcounting of rdevs is not needed
3795	*/
3796	rcu_read_lock();
3797	for (i = `0`; i < conf->raid_disks; i++) {
3798	struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3799	if (rdev
3800	&& !test_bit(Faulty, &rdev->flags)
3801	&& !test_bit(In_sync, &rdev->flags)
3802	&& !rdev_set_badblocks(rdev, s: sh->sector,
3803	RAID5_STRIPE_SECTORS(conf), is_new: `0`))
3804	abort = `1`;
3805	rdev = rcu_dereference(conf->disks[i].replacement);
3806	if (rdev
3807	&& !test_bit(Faulty, &rdev->flags)
3808	&& !test_bit(In_sync, &rdev->flags)
3809	&& !rdev_set_badblocks(rdev, s: sh->sector,
3810	RAID5_STRIPE_SECTORS(conf), is_new: `0`))
3811	abort = `1`;
3812	}
3813	rcu_read_unlock();
3814	if (abort)
3815	conf->recovery_disabled =
3816	conf->mddev->recovery_disabled;
3817	}
3818	md_done_sync(mddev: conf->mddev, RAID5_STRIPE_SECTORS(conf), ok: !abort);
3819	}
3820
3821	static int want_replace(struct stripe_head sh, int* disk_idx)
3822	{
3823	struct md_rdev *rdev;
3824	int rv = `0`;
3825
3826	rcu_read_lock();
3827	rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3828	if (rdev
3829	&& !test_bit(Faulty, &rdev->flags)
3830	&& !test_bit(In_sync, &rdev->flags)
3831	&& (rdev->recovery_offset <= sh->sector
3832	\|\| rdev->mddev->recovery_cp <= sh->sector))
3833	rv = `1`;
3834	rcu_read_unlock();
3835	return rv;
3836	}
3837
3838	static int need_this_block(struct stripe_head sh, struct* stripe_head_state *s,
3839	int disk_idx, int disks)
3840	{
3841	struct r5dev *dev = &sh->dev[disk_idx];
3842	struct r5dev *fdev[`2`] = { &sh->dev[s->failed_num[`0`]],
3843	&sh->dev[s->failed_num[`1`]] };
3844	int i;
3845	bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
3846
3847
3848	if (test_bit(R5_LOCKED, &dev->flags) \|\|
3849	test_bit(R5_UPTODATE, &dev->flags))
3850	/ No point reading this as we already have it or have*
3851	* decided to get it.
3852	*/
3853	return `0`;
3854
3855	if (dev->toread \|\|
3856	(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3857	/ We need this block to directly satisfy a request /
3858	return `1`;
3859
3860	if (s->syncing \|\| s->expanding \|\|
3861	(s->replacing && want_replace(sh, disk_idx)))
3862	/ When syncing, or expanding we read everything.*
3863	* When replacing, we need the replaced block.
3864	*/
3865	return `1`;
3866
3867	if ((s->failed >= `1` && fdev[`0`]->toread) \|\|
3868	(s->failed >= `2` && fdev[`1`]->toread))
3869	/ If we want to read from a failed device, then*
3870	* we need to actually read every other device.
3871	*/
3872	return `1`;
3873
3874	/ Sometimes neither read-modify-write nor reconstruct-write*
3875	* cycles can work. In those cases we read every block we
3876	* can. Then the parity-update is certain to have enough to
3877	* work with.
3878	* This can only be a problem when we need to write something,
3879	* and some device has failed. If either of those tests
3880	* fail we need look no further.
3881	*/
3882	if (!s->failed \|\| !s->to_write)
3883	return `0`;
3884
3885	if (test_bit(R5_Insync, &dev->flags) &&
3886	!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3887	/ Pre-reads at not permitted until after short delay*
3888	* to gather multiple requests. However if this
3889	* device is no Insync, the block could only be computed
3890	* and there is no need to delay that.
3891	*/
3892	return `0`;
3893
3894	for (i = `0`; i < s->failed && i < `2`; i++) {
3895	if (fdev[i]->towrite &&
3896	!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3897	!test_bit(R5_OVERWRITE, &fdev[i]->flags))
3898	/ If we have a partial write to a failed*
3899	* device, then we will need to reconstruct
3900	* the content of that device, so all other
3901	* devices must be read.
3902	*/
3903	return `1`;
3904
3905	if (s->failed >= `2` &&
3906	(fdev[i]->towrite \|\|
3907	s->failed_num[i] == sh->pd_idx \|\|
3908	s->failed_num[i] == sh->qd_idx) &&
3909	!test_bit(R5_UPTODATE, &fdev[i]->flags))
3910	/ In max degraded raid6, If the failed disk is P, Q,*
3911	* or we want to read the failed disk, we need to do
3912	* reconstruct-write.
3913	*/
3914	force_rcw = true;
3915	}
3916
3917	/ If we are forced to do a reconstruct-write, because parity*
3918	* cannot be trusted and we are currently recovering it, there
3919	* is extra need to be careful.
3920	* If one of the devices that we would need to read, because
3921	* it is not being overwritten (and maybe not written at all)
3922	* is missing/faulty, then we need to read everything we can.
3923	*/
3924	if (!force_rcw &&
3925	sh->sector < sh->raid_conf->mddev->recovery_cp)
3926	/ reconstruct-write isn't being forced /
3927	return `0`;
3928	for (i = `0`; i < s->failed && i < `2`; i++) {
3929	if (s->failed_num[i] != sh->pd_idx &&
3930	s->failed_num[i] != sh->qd_idx &&
3931	!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3932	!test_bit(R5_OVERWRITE, &fdev[i]->flags))
3933	return `1`;
3934	}
3935
3936	return `0`;
3937	}
3938
/* fetch_block - checks the given member device to see if its data needs
 * to be read or computed to satisfy a request.
 *
 * Returns 1 when no more member devices need to be checked, otherwise returns
 * 0 to tell the loop in handle_stripe_fill to continue
 */
3945	static int fetch_block(struct stripe_head sh, struct* stripe_head_state *s,
3946	int disk_idx, int disks)
3947	{
3948	struct r5dev *dev = &sh->dev[disk_idx];
3949
3950	/ is the data in this block needed, and can we get it? /
3951	if (need_this_block(sh, s, disk_idx, disks)) {
3952	/ we would like to get this block, possibly by computing it,*
3953	* otherwise read it if the backing disk is insync
3954	*/
3955	BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3956	BUG_ON(test_bit(R5_Wantread, &dev->flags));
3957	BUG_ON(sh->batch_head);
3958
3959	/*
3960	* In the raid6 case if the only non-uptodate disk is P
3961	* then we already trusted P to compute the other failed
3962	* drives. It is safe to compute rather than re-read P.
3963	* In other cases we only compute blocks from failed
3964	* devices, otherwise check/repair might fail to detect
3965	* a real inconsistency.
3966	*/
3967
3968	if ((s->uptodate == disks - `1`) &&
3969	((sh->qd_idx >= `0` && sh->pd_idx == disk_idx) \|\|
3970	(s->failed && (disk_idx == s->failed_num[`0`] \|\|
3971	disk_idx == s->failed_num[`1`])))) {
3972	/ have disk failed, and we're requested to fetch it;*
3973	* do compute it
3974	*/
3975	pr_debug("Computing stripe %llu block %d\n",
3976	(unsigned long long)sh->sector, disk_idx);
3977	set_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
3978	set_bit(nr: STRIPE_OP_COMPUTE_BLK, addr: &s->ops_request);
3979	set_bit(nr: R5_Wantcompute, addr: &dev->flags);
3980	sh->ops.target = disk_idx;
3981	sh->ops.target2 = -`1`; / no 2nd target /
3982	s->req_compute = `1`;
3983	/ Careful: from this point on 'uptodate' is in the eye*
3984	* of raid_run_ops which services 'compute' operations
3985	* before writes. R5_Wantcompute flags a block that will
3986	* be R5_UPTODATE by the time it is needed for a
3987	* subsequent operation.
3988	*/
3989	s->uptodate++;
3990	return `1`;
3991	} else if (s->uptodate == disks-`2` && s->failed >= `2`) {
3992	/ Computing 2-failure is very expensive; only*
3993	* do it if failed >= 2
3994	*/
3995	int other;
3996	for (other = disks; other--; ) {
3997	if (other == disk_idx)
3998	continue;
3999	if (!test_bit(R5_UPTODATE,
4000	&sh->dev[other].flags))
4001	break;
4002	}
4003	BUG_ON(other < `0`);
4004	pr_debug("Computing stripe %llu blocks %d,%d\n",
4005	(unsigned long long)sh->sector,
4006	disk_idx, other);
4007	set_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
4008	set_bit(nr: STRIPE_OP_COMPUTE_BLK, addr: &s->ops_request);
4009	set_bit(nr: R5_Wantcompute, addr: &sh->dev[disk_idx].flags);
4010	set_bit(nr: R5_Wantcompute, addr: &sh->dev[other].flags);
4011	sh->ops.target = disk_idx;
4012	sh->ops.target2 = other;
4013	s->uptodate += `2`;
4014	s->req_compute = `1`;
4015	return `1`;
4016	} else if (test_bit(R5_Insync, &dev->flags)) {
4017	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4018	set_bit(nr: R5_Wantread, addr: &dev->flags);
4019	s->locked++;
4020	pr_debug("Reading block %d (sync=%d)\n",
4021	disk_idx, s->syncing);
4022	}
4023	}
4024
4025	return `0`;
4026	}
4027
4028	/*
4029	* handle_stripe_fill - read or compute data to satisfy pending requests.
4030	*/
4031	static void handle_stripe_fill(struct stripe_head *sh,
4032	struct stripe_head_state *s,
4033	int disks)
4034	{
4035	int i;
4036
4037	/ look for blocks to read/compute, skip this if a compute*
4038	* is already in flight, or if the stripe contents are in the
4039	* midst of changing due to a write
4040	*/
4041	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
4042	!sh->reconstruct_state) {
4043
4044	/*
4045	* For degraded stripe with data in journal, do not handle
4046	* read requests yet, instead, flush the stripe to raid
4047	* disks first, this avoids handling complex rmw of write
4048	* back cache (prexor with orig_page, and then xor with
4049	* page) in the read path
4050	*/
4051	if (s->to_read && s->injournal && s->failed) {
4052	if (test_bit(STRIPE_R5C_CACHING, &sh->state))
4053	r5c_make_stripe_write_out(sh);
4054	goto out;
4055	}
4056
4057	for (i = disks; i--; )
4058	if (fetch_block(sh, s, disk_idx: i, disks))
4059	break;
4060	}
4061	out:
4062	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4063	}
4064
4065	static void break_stripe_batch_list(struct stripe_head *head_sh,
4066	unsigned long handle_flags);
4067	/ handle_stripe_clean_event*
4068	* any written block on an uptodate or failed drive can be returned.
4069	* Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
4070	* never LOCKED, so we don't need to test 'failed' directly.
4071	*/
4072	static void handle_stripe_clean_event(struct r5conf *conf,
4073	struct stripe_head sh, int* disks)
4074	{
4075	int i;
4076	struct r5dev *dev;
4077	int discard_pending = `0`;
4078	struct stripe_head *head_sh = sh;
4079	bool do_endio = false;
4080
4081	for (i = disks; i--; )
4082	if (sh->dev[i].written) {
4083	dev = &sh->dev[i];
4084	if (!test_bit(R5_LOCKED, &dev->flags) &&
4085	(test_bit(R5_UPTODATE, &dev->flags) \|\|
4086	test_bit(R5_Discard, &dev->flags) \|\|
4087	test_bit(R5_SkipCopy, &dev->flags))) {
4088	/ We can return any write requests /
4089	struct bio wbi, wbi2;
4090	pr_debug("Return write for disc %d\n", i);
4091	if (test_and_clear_bit(nr: R5_Discard, addr: &dev->flags))
4092	clear_bit(nr: R5_UPTODATE, addr: &dev->flags);
4093	if (test_and_clear_bit(nr: R5_SkipCopy, addr: &dev->flags)) {
4094	WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
4095	}
4096	do_endio = true;
4097
4098	returnbi:
4099	dev->page = dev->orig_page;
4100	wbi = dev->written;
4101	dev->written = NULL;
4102	while (wbi && wbi->bi_iter.bi_sector <
4103	dev->sector + RAID5_STRIPE_SECTORS(conf)) {
4104	wbi2 = r5_next_bio(conf, bio: wbi, sector: dev->sector);
4105	md_write_end(mddev: conf->mddev);
4106	bio_endio(wbi);
4107	wbi = wbi2;
4108	}
4109	md_bitmap_endwrite(bitmap: conf->mddev->bitmap, offset: sh->sector,
4110	RAID5_STRIPE_SECTORS(conf),
4111	success: !test_bit(STRIPE_DEGRADED, &sh->state),
4112	behind: `0`);
4113	if (head_sh->batch_head) {
4114	sh = list_first_entry(&sh->batch_list,
4115	struct stripe_head,
4116	batch_list);
4117	if (sh != head_sh) {
4118	dev = &sh->dev[i];
4119	goto returnbi;
4120	}
4121	}
4122	sh = head_sh;
4123	dev = &sh->dev[i];
4124	} else if (test_bit(R5_Discard, &dev->flags))
4125	discard_pending = `1`;
4126	}
4127
4128	log_stripe_write_finished(sh);
4129
4130	if (!discard_pending &&
4131	test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
4132	int hash;
4133	clear_bit(nr: R5_Discard, addr: &sh->dev[sh->pd_idx].flags);
4134	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[sh->pd_idx].flags);
4135	if (sh->qd_idx >= `0`) {
4136	clear_bit(nr: R5_Discard, addr: &sh->dev[sh->qd_idx].flags);
4137	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[sh->qd_idx].flags);
4138	}
4139	/ now that discard is done we can proceed with any sync /
4140	clear_bit(nr: STRIPE_DISCARD, addr: &sh->state);
4141	/*
4142	* SCSI discard will change some bio fields and the stripe has
4143	* no updated data, so remove it from hash list and the stripe
4144	* will be reinitialized
4145	*/
4146	unhash:
4147	hash = sh->hash_lock_index;
4148	spin_lock_irq(lock: conf->hash_locks + hash);
4149	remove_hash(sh);
4150	spin_unlock_irq(lock: conf->hash_locks + hash);
4151	if (head_sh->batch_head) {
4152	sh = list_first_entry(&sh->batch_list,
4153	struct stripe_head, batch_list);
4154	if (sh != head_sh)
4155	goto unhash;
4156	}
4157	sh = head_sh;
4158
4159	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
4160	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4161
4162	}
4163
4164	if (test_and_clear_bit(nr: STRIPE_FULL_WRITE, addr: &sh->state))
4165	if (atomic_dec_and_test(v: &conf->pending_full_writes))
4166	md_wakeup_thread(thread: conf->mddev->thread);
4167
4168	if (head_sh->batch_head && do_endio)
4169	break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
4170	}
4171
4172	/*
4173	* For RMW in write back cache, we need extra page in prexor to store the
4174	* old data. This page is stored in dev->orig_page.
4175	*
4176	* This function checks whether we have data for prexor. The exact logic
4177	* is:
4178	* R5_UPTODATE && (!R5_InJournal \|\| R5_OrigPageUPTDODATE)
4179	*/
4180	static inline bool uptodate_for_rmw(struct r5dev *dev)
4181	{
4182	return (test_bit(R5_UPTODATE, &dev->flags)) &&
4183	(!test_bit(R5_InJournal, &dev->flags) \|\|
4184	test_bit(R5_OrigPageUPTDODATE, &dev->flags));
4185	}
4186
4187	static int handle_stripe_dirtying(struct r5conf *conf,
4188	struct stripe_head *sh,
4189	struct stripe_head_state *s,
4190	int disks)
4191	{
4192	int rmw = `0`, rcw = `0`, i;
4193	sector_t recovery_cp = conf->mddev->recovery_cp;
4194
4195	/ Check whether resync is now happening or should start.*
4196	* If yes, then the array is dirty (after unclean shutdown or
4197	* initial creation), so parity in some stripes might be inconsistent.
4198	* In this case, we need to always do reconstruct-write, to ensure
4199	* that in case of drive failure or read-error correction, we
4200	* generate correct data from the parity.
4201	*/
4202	if (conf->rmw_level == PARITY_DISABLE_RMW \|\|
4203	(recovery_cp < MaxSector && sh->sector >= recovery_cp &&
4204	s->failed == `0`)) {
4205	/ Calculate the real rcw later - for now make it*
4206	* look like rcw is cheaper
4207	*/
4208	rcw = `1`; rmw = `2`;
4209	pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
4210	conf->rmw_level, (unsigned long long)recovery_cp,
4211	(unsigned long long)sh->sector);
4212	} else for (i = disks; i--; ) {
4213	/ would I have to read this buffer for read_modify_write /
4214	struct r5dev *dev = &sh->dev[i];
4215	if (((dev->towrite && !delay_towrite(conf, dev, s)) \|\|
4216	i == sh->pd_idx \|\| i == sh->qd_idx \|\|
4217	test_bit(R5_InJournal, &dev->flags)) &&
4218	!test_bit(R5_LOCKED, &dev->flags) &&
4219	!(uptodate_for_rmw(dev) \|\|
4220	test_bit(R5_Wantcompute, &dev->flags))) {
4221	if (test_bit(R5_Insync, &dev->flags))
4222	rmw++;
4223	else
4224	rmw += `2`disks; /* cannot read it /
4225	}
4226	/ Would I have to read this buffer for reconstruct_write /
4227	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4228	i != sh->pd_idx && i != sh->qd_idx &&
4229	!test_bit(R5_LOCKED, &dev->flags) &&
4230	!(test_bit(R5_UPTODATE, &dev->flags) \|\|
4231	test_bit(R5_Wantcompute, &dev->flags))) {
4232	if (test_bit(R5_Insync, &dev->flags))
4233	rcw++;
4234	else
4235	rcw += `2`*disks;
4236	}
4237	}
4238
4239	pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
4240	(unsigned long long)sh->sector, sh->state, rmw, rcw);
4241	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4242	if ((rmw < rcw \|\| (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > `0`) {
4243	/ prefer read-modify-write, but need to get some data /
4244	if (conf->mddev->queue)
4245	blk_add_trace_msg(conf->mddev->queue,
4246	"raid5 rmw %llu %d",
4247	(unsigned long long)sh->sector, rmw);
4248	for (i = disks; i--; ) {
4249	struct r5dev *dev = &sh->dev[i];
4250	if (test_bit(R5_InJournal, &dev->flags) &&
4251	dev->page == dev->orig_page &&
4252	!test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
4253	/ alloc page for prexor /
4254	struct page *p = alloc_page(GFP_NOIO);
4255
4256	if (p) {
4257	dev->orig_page = p;
4258	continue;
4259	}
4260
4261	/*
4262	* alloc_page() failed, try use
4263	* disk_info->extra_page
4264	*/
4265	if (!test_and_set_bit(nr: R5C_EXTRA_PAGE_IN_USE,
4266	addr: &conf->cache_state)) {
4267	r5c_use_extra_page(sh);
4268	break;
4269	}
4270
4271	/ extra_page in use, add to delayed_list /
4272	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
4273	s->waiting_extra_page = `1`;
4274	return -EAGAIN;
4275	}
4276	}
4277
4278	for (i = disks; i--; ) {
4279	struct r5dev *dev = &sh->dev[i];
4280	if (((dev->towrite && !delay_towrite(conf, dev, s)) \|\|
4281	i == sh->pd_idx \|\| i == sh->qd_idx \|\|
4282	test_bit(R5_InJournal, &dev->flags)) &&
4283	!test_bit(R5_LOCKED, &dev->flags) &&
4284	!(uptodate_for_rmw(dev) \|\|
4285	test_bit(R5_Wantcompute, &dev->flags)) &&
4286	test_bit(R5_Insync, &dev->flags)) {
4287	if (test_bit(STRIPE_PREREAD_ACTIVE,
4288	&sh->state)) {
4289	pr_debug("Read_old block %d for r-m-w\n",
4290	i);
4291	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4292	set_bit(nr: R5_Wantread, addr: &dev->flags);
4293	s->locked++;
4294	} else
4295	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
4296	}
4297	}
4298	}
4299	if ((rcw < rmw \|\| (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > `0`) {
4300	/ want reconstruct write, but need to get some data /
4301	int qread =`0`;
4302	rcw = `0`;
4303	for (i = disks; i--; ) {
4304	struct r5dev *dev = &sh->dev[i];
4305	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
4306	i != sh->pd_idx && i != sh->qd_idx &&
4307	!test_bit(R5_LOCKED, &dev->flags) &&
4308	!(test_bit(R5_UPTODATE, &dev->flags) \|\|
4309	test_bit(R5_Wantcompute, &dev->flags))) {
4310	rcw++;
4311	if (test_bit(R5_Insync, &dev->flags) &&
4312	test_bit(STRIPE_PREREAD_ACTIVE,
4313	&sh->state)) {
4314	pr_debug("Read_old block "
4315	"%d for Reconstruct\n", i);
4316	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4317	set_bit(nr: R5_Wantread, addr: &dev->flags);
4318	s->locked++;
4319	qread++;
4320	} else
4321	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
4322	}
4323	}
4324	if (rcw && conf->mddev->queue)
4325	blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
4326	(unsigned long long)sh->sector,
4327	rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
4328	}
4329
4330	if (rcw > disks && rmw > disks &&
4331	!test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4332	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
4333
4334	/ now if nothing is locked, and if we have enough data,*
4335	* we can start a write request
4336	*/
4337	/ since handle_stripe can be called at any time we need to handle the*
4338	* case where a compute block operation has been submitted and then a
4339	* subsequent call wants to start a write request. raid_run_ops only
4340	* handles the case where compute block and reconstruct are requested
4341	* simultaneously. If this is not the case then new writes need to be
4342	* held off until the compute completes.
4343	*/
4344	if ((s->req_compute \|\| !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
4345	(s->locked == `0` && (rcw == `0` \|\| rmw == `0`) &&
4346	!test_bit(STRIPE_BIT_DELAY, &sh->state)))
4347	schedule_reconstruction(sh, s, rcw: rcw == `0`, expand: `0`);
4348	return `0`;
4349	}
4350
4351	static void handle_parity_checks5(struct r5conf conf, struct* stripe_head *sh,
4352	struct stripe_head_state s, int* disks)
4353	{
4354	struct r5dev *dev = NULL;
4355
4356	BUG_ON(sh->batch_head);
4357	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4358
4359	switch (sh->check_state) {
4360	case check_state_idle:
4361	/ start a new check operation if there are no failures /
4362	if (s->failed == `0`) {
4363	BUG_ON(s->uptodate != disks);
4364	sh->check_state = check_state_run;
4365	set_bit(nr: STRIPE_OP_CHECK, addr: &s->ops_request);
4366	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[sh->pd_idx].flags);
4367	s->uptodate--;
4368	break;
4369	}
4370	dev = &sh->dev[s->failed_num[`0`]];
4371	fallthrough;
4372	case check_state_compute_result:
4373	sh->check_state = check_state_idle;
4374	if (!dev)
4375	dev = &sh->dev[sh->pd_idx];
4376
4377	/ check that a write has not made the stripe insync /
4378	if (test_bit(STRIPE_INSYNC, &sh->state))
4379	break;
4380
4381	/ either failed parity check, or recovery is happening /
4382	BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
4383	BUG_ON(s->uptodate != disks);
4384
4385	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4386	s->locked++;
4387	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4388
4389	clear_bit(nr: STRIPE_DEGRADED, addr: &sh->state);
4390	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4391	break;
4392	case check_state_run:
4393	break; / we will be called again upon completion /
4394	case check_state_check_result:
4395	sh->check_state = check_state_idle;
4396
4397	/ if a failure occurred during the check operation, leave*
4398	* STRIPE_INSYNC not set and let the stripe be handled again
4399	*/
4400	if (s->failed)
4401	break;
4402
4403	/ handle a successful check operation, if parity is correct*
4404	* we are done. Otherwise update the mismatch count and repair
4405	* parity if !MD_RECOVERY_CHECK
4406	*/
4407	if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == `0`)
4408	/ parity is correct (on disc,*
4409	* not in buffer any more)
4410	*/
4411	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4412	else {
4413	atomic64_add(RAID5_STRIPE_SECTORS(conf), v: &conf->mddev->resync_mismatches);
4414	if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4415	/ don't try to repair!! /
4416	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4417	pr_warn_ratelimited("%s: mismatch sector in range "
4418	"%llu-%llu\n", mdname(conf->mddev),
4419	(unsigned long long) sh->sector,
4420	(unsigned long long) sh->sector +
4421	RAID5_STRIPE_SECTORS(conf));
4422	} else {
4423	sh->check_state = check_state_compute_run;
4424	set_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
4425	set_bit(nr: STRIPE_OP_COMPUTE_BLK, addr: &s->ops_request);
4426	set_bit(nr: R5_Wantcompute,
4427	addr: &sh->dev[sh->pd_idx].flags);
4428	sh->ops.target = sh->pd_idx;
4429	sh->ops.target2 = -`1`;
4430	s->uptodate++;
4431	}
4432	}
4433	break;
4434	case check_state_compute_run:
4435	break;
4436	default:
4437	pr_err("%s: unknown check_state: %d sector: %llu\n",
4438	__func__, sh->check_state,
4439	(unsigned long long) sh->sector);
4440	BUG();
4441	}
4442	}
4443
4444	static void handle_parity_checks6(struct r5conf conf, struct* stripe_head *sh,
4445	struct stripe_head_state *s,
4446	int disks)
4447	{
4448	int pd_idx = sh->pd_idx;
4449	int qd_idx = sh->qd_idx;
4450	struct r5dev *dev;
4451
4452	BUG_ON(sh->batch_head);
4453	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4454
4455	BUG_ON(s->failed > `2`);
4456
4457	/ Want to check and possibly repair P and Q.*
4458	* However there could be one 'failed' device, in which
4459	* case we can only check one of them, possibly using the
4460	* other to generate missing data
4461	*/
4462
4463	switch (sh->check_state) {
4464	case check_state_idle:
4465	/ start a new check operation if there are < 2 failures /
4466	if (s->failed == s->q_failed) {
4467	/ The only possible failed device holds Q, so it*
4468	* makes sense to check P (If anything else were failed,
4469	* we would have used P to recreate it).
4470	*/
4471	sh->check_state = check_state_run;
4472	}
4473	if (!s->q_failed && s->failed < `2`) {
4474	/ Q is not failed, and we didn't use it to generate*
4475	* anything, so it makes sense to check it
4476	*/
4477	if (sh->check_state == check_state_run)
4478	sh->check_state = check_state_run_pq;
4479	else
4480	sh->check_state = check_state_run_q;
4481	}
4482
4483	/ discard potentially stale zero_sum_result /
4484	sh->ops.zero_sum_result = `0`;
4485
4486	if (sh->check_state == check_state_run) {
4487	/ async_xor_zero_sum destroys the contents of P /
4488	clear_bit(nr: R5_UPTODATE, addr: &sh->dev[pd_idx].flags);
4489	s->uptodate--;
4490	}
4491	if (sh->check_state >= check_state_run &&
4492	sh->check_state <= check_state_run_pq) {
4493	/ async_syndrome_zero_sum preserves P and Q, so*
4494	* no need to mark them !uptodate here
4495	*/
4496	set_bit(nr: STRIPE_OP_CHECK, addr: &s->ops_request);
4497	break;
4498	}
4499
4500	/ we have 2-disk failure /
4501	BUG_ON(s->failed != `2`);
4502	fallthrough;
4503	case check_state_compute_result:
4504	sh->check_state = check_state_idle;
4505
4506	/ check that a write has not made the stripe insync /
4507	if (test_bit(STRIPE_INSYNC, &sh->state))
4508	break;
4509
4510	/ now write out any block on a failed drive,*
4511	* or P or Q if they were recomputed
4512	*/
4513	dev = NULL;
4514	if (s->failed == `2`) {
4515	dev = &sh->dev[s->failed_num[`1`]];
4516	s->locked++;
4517	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4518	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4519	}
4520	if (s->failed >= `1`) {
4521	dev = &sh->dev[s->failed_num[`0`]];
4522	s->locked++;
4523	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4524	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4525	}
4526	if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4527	dev = &sh->dev[pd_idx];
4528	s->locked++;
4529	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4530	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4531	}
4532	if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4533	dev = &sh->dev[qd_idx];
4534	s->locked++;
4535	set_bit(nr: R5_LOCKED, addr: &dev->flags);
4536	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
4537	}
4538	if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
4539	"%s: disk%td not up to date\n",
4540	mdname(conf->mddev),
4541	dev - (struct r5dev *) &sh->dev)) {
4542	clear_bit(nr: R5_LOCKED, addr: &dev->flags);
4543	clear_bit(nr: R5_Wantwrite, addr: &dev->flags);
4544	s->locked--;
4545	}
4546	clear_bit(nr: STRIPE_DEGRADED, addr: &sh->state);
4547
4548	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4549	break;
4550	case check_state_run:
4551	case check_state_run_q:
4552	case check_state_run_pq:
4553	break; / we will be called again upon completion /
4554	case check_state_check_result:
4555	sh->check_state = check_state_idle;
4556
4557	/ handle a successful check operation, if parity is correct*
4558	* we are done. Otherwise update the mismatch count and repair
4559	* parity if !MD_RECOVERY_CHECK
4560	*/
4561	if (sh->ops.zero_sum_result == `0`) {
4562	/ both parities are correct /
4563	if (!s->failed)
4564	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4565	else {
4566	/ in contrast to the raid5 case we can validate*
4567	* parity, but still have a failure to write
4568	* back
4569	*/
4570	sh->check_state = check_state_compute_result;
4571	/ Returning at this point means that we may go*
4572	* off and bring p and/or q uptodate again so
4573	* we make sure to check zero_sum_result again
4574	* to verify if p or q need writeback
4575	*/
4576	}
4577	} else {
4578	atomic64_add(RAID5_STRIPE_SECTORS(conf), v: &conf->mddev->resync_mismatches);
4579	if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
4580	/ don't try to repair!! /
4581	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
4582	pr_warn_ratelimited("%s: mismatch sector in range "
4583	"%llu-%llu\n", mdname(conf->mddev),
4584	(unsigned long long) sh->sector,
4585	(unsigned long long) sh->sector +
4586	RAID5_STRIPE_SECTORS(conf));
4587	} else {
4588	int *target = &sh->ops.target;
4589
4590	sh->ops.target = -`1`;
4591	sh->ops.target2 = -`1`;
4592	sh->check_state = check_state_compute_run;
4593	set_bit(nr: STRIPE_COMPUTE_RUN, addr: &sh->state);
4594	set_bit(nr: STRIPE_OP_COMPUTE_BLK, addr: &s->ops_request);
4595	if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4596	set_bit(nr: R5_Wantcompute,
4597	addr: &sh->dev[pd_idx].flags);
4598	*target = pd_idx;
4599	target = &sh->ops.target2;
4600	s->uptodate++;
4601	}
4602	if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4603	set_bit(nr: R5_Wantcompute,
4604	addr: &sh->dev[qd_idx].flags);
4605	*target = qd_idx;
4606	s->uptodate++;
4607	}
4608	}
4609	}
4610	break;
4611	case check_state_compute_run:
4612	break;
4613	default:
4614	pr_warn("%s: unknown check_state: %d sector: %llu\n",
4615	__func__, sh->check_state,
4616	(unsigned long long) sh->sector);
4617	BUG();
4618	}
4619	}
4620
4621	static void handle_stripe_expansion(struct r5conf conf, struct* stripe_head *sh)
4622	{
4623	int i;
4624
4625	/ We have read all the blocks in this stripe and now we need to*
4626	* copy some of them into a target stripe for expand.
4627	*/
4628	struct dma_async_tx_descriptor *tx = NULL;
4629	BUG_ON(sh->batch_head);
4630	clear_bit(nr: STRIPE_EXPAND_SOURCE, addr: &sh->state);
4631	for (i = `0`; i < sh->disks; i++)
4632	if (i != sh->pd_idx && i != sh->qd_idx) {
4633	int dd_idx, j;
4634	struct stripe_head *sh2;
4635	struct async_submit_ctl submit;
4636
4637	sector_t bn = raid5_compute_blocknr(sh, i, previous: `1`);
4638	sector_t s = raid5_compute_sector(conf, r_sector: bn, previous: `0`,
4639	dd_idx: &dd_idx, NULL);
4640	sh2 = raid5_get_active_stripe(conf, NULL, sector: s,
4641	R5_GAS_NOBLOCK \| R5_GAS_NOQUIESCE);
4642	if (sh2 == NULL)
4643	/ so far only the early blocks of this stripe*
4644	* have been requested. When later blocks
4645	* get requested, we will try again
4646	*/
4647	continue;
4648	if (!test_bit(STRIPE_EXPANDING, &sh2->state) \|\|
4649	test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4650	/ must have already done this block /
4651	raid5_release_stripe(sh: sh2);
4652	continue;
4653	}
4654
4655	/ place all the copies on one channel /
4656	init_async_submit(args: &submit, flags: `0`, tx, NULL, NULL, NULL);
4657	tx = async_memcpy(dest: sh2->dev[dd_idx].page,
4658	src: sh->dev[i].page, dest_offset: sh2->dev[dd_idx].offset,
4659	src_offset: sh->dev[i].offset, RAID5_STRIPE_SIZE(conf),
4660	submit: &submit);
4661
4662	set_bit(nr: R5_Expanded, addr: &sh2->dev[dd_idx].flags);
4663	set_bit(nr: R5_UPTODATE, addr: &sh2->dev[dd_idx].flags);
4664	for (j = `0`; j < conf->raid_disks; j++)
4665	if (j != sh2->pd_idx &&
4666	j != sh2->qd_idx &&
4667	!test_bit(R5_Expanded, &sh2->dev[j].flags))
4668	break;
4669	if (j == conf->raid_disks) {
4670	set_bit(nr: STRIPE_EXPAND_READY, addr: &sh2->state);
4671	set_bit(nr: STRIPE_HANDLE, addr: &sh2->state);
4672	}
4673	raid5_release_stripe(sh: sh2);
4674
4675	}
4676	/ done submitting copies, wait for them to complete /
4677	async_tx_quiesce(tx: &tx);
4678	}
4679
4680	/*
4681	* handle_stripe - do things to a stripe.
4682	*
4683	* We lock the stripe by setting STRIPE_ACTIVE and then examine the
4684	* state of various bits to see what needs to be done.
4685	* Possible results:
4686	* return some read requests which now have data
4687	* return some write requests which are safely on storage
4688	* schedule a read on some buffers
4689	* schedule a write of some buffers
4690	* return confirmation of parity correctness
4691	*
4692	*/
4693
4694	static void analyse_stripe(struct stripe_head sh, struct* stripe_head_state *s)
4695	{
4696	struct r5conf *conf = sh->raid_conf;
4697	int disks = sh->disks;
4698	struct r5dev *dev;
4699	int i;
4700	int do_recovery = `0`;
4701
4702	memset(s, `0`, sizeof(*s));
4703
4704	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4705	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4706	s->failed_num[`0`] = -`1`;
4707	s->failed_num[`1`] = -`1`;
4708	s->log_failed = r5l_log_disk_error(conf);
4709
4710	/ Now to look around and see what can be done /
4711	rcu_read_lock();
4712	for (i=disks; i--; ) {
4713	struct md_rdev *rdev;
4714	sector_t first_bad;
4715	int bad_sectors;
4716	int is_bad = `0`;
4717
4718	dev = &sh->dev[i];
4719
4720	pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4721	i, dev->flags,
4722	dev->toread, dev->towrite, dev->written);
4723	/ maybe we can reply to a read*
4724	*
4725	* new wantfill requests are only permitted while
4726	* ops_complete_biofill is guaranteed to be inactive
4727	*/
4728	if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4729	!test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4730	set_bit(nr: R5_Wantfill, addr: &dev->flags);
4731
4732	/ now count some things /
4733	if (test_bit(R5_LOCKED, &dev->flags))
4734	s->locked++;
4735	if (test_bit(R5_UPTODATE, &dev->flags))
4736	s->uptodate++;
4737	if (test_bit(R5_Wantcompute, &dev->flags)) {
4738	s->compute++;
4739	BUG_ON(s->compute > `2`);
4740	}
4741
4742	if (test_bit(R5_Wantfill, &dev->flags))
4743	s->to_fill++;
4744	else if (dev->toread)
4745	s->to_read++;
4746	if (dev->towrite) {
4747	s->to_write++;
4748	if (!test_bit(R5_OVERWRITE, &dev->flags))
4749	s->non_overwrite++;
4750	}
4751	if (dev->written)
4752	s->written++;
4753	/ Prefer to use the replacement for reads, but only*
4754	* if it is recovered enough and has no bad blocks.
4755	*/
4756	rdev = rcu_dereference(conf->disks[i].replacement);
4757	if (rdev && !test_bit(Faulty, &rdev->flags) &&
4758	rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
4759	!is_badblock(rdev, s: sh->sector, RAID5_STRIPE_SECTORS(conf),
4760	first_bad: &first_bad, bad_sectors: &bad_sectors))
4761	set_bit(nr: R5_ReadRepl, addr: &dev->flags);
4762	else {
4763	if (rdev && !test_bit(Faulty, &rdev->flags))
4764	set_bit(nr: R5_NeedReplace, addr: &dev->flags);
4765	else
4766	clear_bit(nr: R5_NeedReplace, addr: &dev->flags);
4767	rdev = rcu_dereference(conf->disks[i].rdev);
4768	clear_bit(nr: R5_ReadRepl, addr: &dev->flags);
4769	}
4770	if (rdev && test_bit(Faulty, &rdev->flags))
4771	rdev = NULL;
4772	if (rdev) {
4773	is_bad = is_badblock(rdev, s: sh->sector, RAID5_STRIPE_SECTORS(conf),
4774	first_bad: &first_bad, bad_sectors: &bad_sectors);
4775	if (s->blocked_rdev == NULL
4776	&& (test_bit(Blocked, &rdev->flags)
4777	\|\| is_bad < `0`)) {
4778	if (is_bad < `0`)
4779	set_bit(nr: BlockedBadBlocks,
4780	addr: &rdev->flags);
4781	s->blocked_rdev = rdev;
4782	atomic_inc(v: &rdev->nr_pending);
4783	}
4784	}
4785	clear_bit(nr: R5_Insync, addr: &dev->flags);
4786	if (!rdev)
4787	/ Not in-sync /;
4788	else if (is_bad) {
4789	/ also not in-sync /
4790	if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4791	test_bit(R5_UPTODATE, &dev->flags)) {
4792	/ treat as in-sync, but with a read error*
4793	* which we can now try to correct
4794	*/
4795	set_bit(nr: R5_Insync, addr: &dev->flags);
4796	set_bit(nr: R5_ReadError, addr: &dev->flags);
4797	}
4798	} else if (test_bit(In_sync, &rdev->flags))
4799	set_bit(nr: R5_Insync, addr: &dev->flags);
4800	else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
4801	/ in sync if before recovery_offset /
4802	set_bit(nr: R5_Insync, addr: &dev->flags);
4803	else if (test_bit(R5_UPTODATE, &dev->flags) &&
4804	test_bit(R5_Expanded, &dev->flags))
4805	/ If we've reshaped into here, we assume it is Insync.*
4806	* We will shortly update recovery_offset to make
4807	* it official.
4808	*/
4809	set_bit(nr: R5_Insync, addr: &dev->flags);
4810
4811	if (test_bit(R5_WriteError, &dev->flags)) {
4812	/ This flag does not apply to '.replacement'*
4813	* only to .rdev, so make sure to check that*/
4814	struct md_rdev *rdev2 = rcu_dereference(
4815	conf->disks[i].rdev);
4816	if (rdev2 == rdev)
4817	clear_bit(nr: R5_Insync, addr: &dev->flags);
4818	if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4819	s->handle_bad_blocks = `1`;
4820	atomic_inc(v: &rdev2->nr_pending);
4821	} else
4822	clear_bit(nr: R5_WriteError, addr: &dev->flags);
4823	}
4824	if (test_bit(R5_MadeGood, &dev->flags)) {
4825	/ This flag does not apply to '.replacement'*
4826	* only to .rdev, so make sure to check that*/
4827	struct md_rdev *rdev2 = rcu_dereference(
4828	conf->disks[i].rdev);
4829	if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4830	s->handle_bad_blocks = `1`;
4831	atomic_inc(v: &rdev2->nr_pending);
4832	} else
4833	clear_bit(nr: R5_MadeGood, addr: &dev->flags);
4834	}
4835	if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4836	struct md_rdev *rdev2 = rcu_dereference(
4837	conf->disks[i].replacement);
4838	if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4839	s->handle_bad_blocks = `1`;
4840	atomic_inc(v: &rdev2->nr_pending);
4841	} else
4842	clear_bit(nr: R5_MadeGoodRepl, addr: &dev->flags);
4843	}
4844	if (!test_bit(R5_Insync, &dev->flags)) {
4845	/ The ReadError flag will just be confusing now /
4846	clear_bit(nr: R5_ReadError, addr: &dev->flags);
4847	clear_bit(nr: R5_ReWrite, addr: &dev->flags);
4848	}
4849	if (test_bit(R5_ReadError, &dev->flags))
4850	clear_bit(nr: R5_Insync, addr: &dev->flags);
4851	if (!test_bit(R5_Insync, &dev->flags)) {
4852	if (s->failed < `2`)
4853	s->failed_num[s->failed] = i;
4854	s->failed++;
4855	if (rdev && !test_bit(Faulty, &rdev->flags))
4856	do_recovery = `1`;
4857	else if (!rdev) {
4858	rdev = rcu_dereference(
4859	conf->disks[i].replacement);
4860	if (rdev && !test_bit(Faulty, &rdev->flags))
4861	do_recovery = `1`;
4862	}
4863	}
4864
4865	if (test_bit(R5_InJournal, &dev->flags))
4866	s->injournal++;
4867	if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4868	s->just_cached++;
4869	}
4870	if (test_bit(STRIPE_SYNCING, &sh->state)) {
4871	/ If there is a failed device being replaced,*
4872	* we must be recovering.
4873	* else if we are after recovery_cp, we must be syncing
4874	* else if MD_RECOVERY_REQUESTED is set, we also are syncing.
4875	* else we can only be replacing
4876	* sync and recovery both need to read all devices, and so
4877	* use the same flag.
4878	*/
4879	if (do_recovery \|\|
4880	sh->sector >= conf->mddev->recovery_cp \|\|
4881	test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4882	s->syncing = `1`;
4883	else
4884	s->replacing = `1`;
4885	}
4886	rcu_read_unlock();
4887	}
4888
4889	/*
4890	* Return '1' if this is a member of batch, or '0' if it is a lone stripe or
4891	* a head which can now be handled.
4892	*/
4893	static int clear_batch_ready(struct stripe_head *sh)
4894	{
4895	struct stripe_head *tmp;
4896	if (!test_and_clear_bit(nr: STRIPE_BATCH_READY, addr: &sh->state))
4897	return (sh->batch_head && sh->batch_head != sh);
4898	spin_lock(lock: &sh->stripe_lock);
4899	if (!sh->batch_head) {
4900	spin_unlock(lock: &sh->stripe_lock);
4901	return `0`;
4902	}
4903
4904	/*
4905	* this stripe could be added to a batch list before we check
4906	* BATCH_READY, skips it
4907	*/
4908	if (sh->batch_head != sh) {
4909	spin_unlock(lock: &sh->stripe_lock);
4910	return `1`;
4911	}
4912	spin_lock(lock: &sh->batch_lock);
4913	list_for_each_entry(tmp, &sh->batch_list, batch_list)
4914	clear_bit(nr: STRIPE_BATCH_READY, addr: &tmp->state);
4915	spin_unlock(lock: &sh->batch_lock);
4916	spin_unlock(lock: &sh->stripe_lock);
4917
4918	/*
4919	* BATCH_READY is cleared, no new stripes can be added.
4920	* batch_list can be accessed without lock
4921	*/
4922	return `0`;
4923	}
4924
4925	static void break_stripe_batch_list(struct stripe_head *head_sh,
4926	unsigned long handle_flags)
4927	{
4928	struct stripe_head sh, next;
4929	int i;
4930	int do_wakeup = `0`;
4931
4932	list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4933
4934	list_del_init(entry: &sh->batch_list);
4935
4936	WARN_ONCE(sh->state & ((`1` << STRIPE_ACTIVE) \|
4937	(`1` << STRIPE_SYNCING) \|
4938	(`1` << STRIPE_REPLACED) \|
4939	(`1` << STRIPE_DELAYED) \|
4940	(`1` << STRIPE_BIT_DELAY) \|
4941	(`1` << STRIPE_FULL_WRITE) \|
4942	(`1` << STRIPE_BIOFILL_RUN) \|
4943	(`1` << STRIPE_COMPUTE_RUN) \|
4944	(`1` << STRIPE_DISCARD) \|
4945	(`1` << STRIPE_BATCH_READY) \|
4946	(`1` << STRIPE_BATCH_ERR) \|
4947	(`1` << STRIPE_BITMAP_PENDING)),
4948	"stripe state: %lx\n", sh->state);
4949	WARN_ONCE(head_sh->state & ((`1` << STRIPE_DISCARD) \|
4950	(`1` << STRIPE_REPLACED)),
4951	"head stripe state: %lx\n", head_sh->state);
4952
4953	set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS \|
4954	(`1` << STRIPE_PREREAD_ACTIVE) \|
4955	(`1` << STRIPE_DEGRADED) \|
4956	(`1` << STRIPE_ON_UNPLUG_LIST)),
4957	head_sh->state & (`1` << STRIPE_INSYNC));
4958
4959	sh->check_state = head_sh->check_state;
4960	sh->reconstruct_state = head_sh->reconstruct_state;
4961	spin_lock_irq(lock: &sh->stripe_lock);
4962	sh->batch_head = NULL;
4963	spin_unlock_irq(lock: &sh->stripe_lock);
4964	for (i = `0`; i < sh->disks; i++) {
4965	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[i].flags))
4966	do_wakeup = `1`;
4967	sh->dev[i].flags = head_sh->dev[i].flags &
4968	(~((`1` << R5_WriteError) \| (`1` << R5_Overlap)));
4969	}
4970	if (handle_flags == `0` \|\|
4971	sh->state & handle_flags)
4972	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4973	raid5_release_stripe(sh);
4974	}
4975	spin_lock_irq(lock: &head_sh->stripe_lock);
4976	head_sh->batch_head = NULL;
4977	spin_unlock_irq(lock: &head_sh->stripe_lock);
4978	for (i = `0`; i < head_sh->disks; i++)
4979	if (test_and_clear_bit(nr: R5_Overlap, addr: &head_sh->dev[i].flags))
4980	do_wakeup = `1`;
4981	if (head_sh->state & handle_flags)
4982	set_bit(nr: STRIPE_HANDLE, addr: &head_sh->state);
4983
4984	if (do_wakeup)
4985	wake_up(&head_sh->raid_conf->wait_for_overlap);
4986	}
4987
4988	static void handle_stripe(struct stripe_head *sh)
4989	{
4990	struct stripe_head_state s;
4991	struct r5conf *conf = sh->raid_conf;
4992	int i;
4993	int prexor;
4994	int disks = sh->disks;
4995	struct r5dev pdev, qdev;
4996
4997	clear_bit(nr: STRIPE_HANDLE, addr: &sh->state);
4998
4999	/*
5000	* handle_stripe should not continue handle the batched stripe, only
5001	* the head of batch list or lone stripe can continue. Otherwise we
5002	* could see break_stripe_batch_list warns about the STRIPE_ACTIVE
5003	* is set for the batched stripe.
5004	*/
5005	if (clear_batch_ready(sh))
5006	return;
5007
5008	if (test_and_set_bit_lock(nr: STRIPE_ACTIVE, addr: &sh->state)) {
5009	/ already being handled, ensure it gets handled*
5010	* again when current action finishes */
5011	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
5012	return;
5013	}
5014
5015	if (test_and_clear_bit(nr: STRIPE_BATCH_ERR, addr: &sh->state))
5016	break_stripe_batch_list(head_sh: sh, handle_flags: `0`);
5017
5018	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
5019	spin_lock(lock: &sh->stripe_lock);
5020	/*
5021	* Cannot process 'sync' concurrently with 'discard'.
5022	* Flush data in r5cache before 'sync'.
5023	*/
5024	if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
5025	!test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
5026	!test_bit(STRIPE_DISCARD, &sh->state) &&
5027	test_and_clear_bit(nr: STRIPE_SYNC_REQUESTED, addr: &sh->state)) {
5028	set_bit(nr: STRIPE_SYNCING, addr: &sh->state);
5029	clear_bit(nr: STRIPE_INSYNC, addr: &sh->state);
5030	clear_bit(nr: STRIPE_REPLACED, addr: &sh->state);
5031	}
5032	spin_unlock(lock: &sh->stripe_lock);
5033	}
5034	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
5035
5036	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
5037	"pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
5038	(unsigned long long)sh->sector, sh->state,
5039	atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
5040	sh->check_state, sh->reconstruct_state);
5041
5042	analyse_stripe(sh, s: &s);
5043
5044	if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
5045	goto finish;
5046
5047	if (s.handle_bad_blocks \|\|
5048	test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
5049	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
5050	goto finish;
5051	}
5052
5053	if (unlikely(s.blocked_rdev)) {
5054	if (s.syncing \|\| s.expanding \|\| s.expanded \|\|
5055	s.replacing \|\| s.to_write \|\| s.written) {
5056	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
5057	goto finish;
5058	}
5059	/ There is nothing for the blocked_rdev to block /
5060	rdev_dec_pending(rdev: s.blocked_rdev, mddev: conf->mddev);
5061	s.blocked_rdev = NULL;
5062	}
5063
5064	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
5065	set_bit(nr: STRIPE_OP_BIOFILL, addr: &s.ops_request);
5066	set_bit(nr: STRIPE_BIOFILL_RUN, addr: &sh->state);
5067	}
5068
5069	pr_debug("locked=%d uptodate=%d to_read=%d"
5070	" to_write=%d failed=%d failed_num=%d,%d\n",
5071	s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
5072	s.failed_num[`0`], s.failed_num[`1`]);
5073	/*
5074	* check if the array has lost more than max_degraded devices and,
5075	* if so, some requests might need to be failed.
5076	*
5077	* When journal device failed (log_failed), we will only process
5078	* the stripe if there is data need write to raid disks
5079	*/
5080	if (s.failed > conf->max_degraded \|\|
5081	(s.log_failed && s.injournal == `0`)) {
5082	sh->check_state = `0`;
5083	sh->reconstruct_state = `0`;
5084	break_stripe_batch_list(head_sh: sh, handle_flags: `0`);
5085	if (s.to_read+s.to_write+s.written)
5086	handle_failed_stripe(conf, sh, s: &s, disks);
5087	if (s.syncing + s.replacing)
5088	handle_failed_sync(conf, sh, s: &s);
5089	}
5090
5091	/ Now we check to see if any write operations have recently*
5092	* completed
5093	*/
5094	prexor = `0`;
5095	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
5096	prexor = `1`;
5097	if (sh->reconstruct_state == reconstruct_state_drain_result \|\|
5098	sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
5099	sh->reconstruct_state = reconstruct_state_idle;
5100
5101	/ All the 'written' buffers and the parity block are ready to*
5102	* be written back to disk
5103	*/
5104	BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
5105	!test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
5106	BUG_ON(sh->qd_idx >= `0` &&
5107	!test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
5108	!test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
5109	for (i = disks; i--; ) {
5110	struct r5dev *dev = &sh->dev[i];
5111	if (test_bit(R5_LOCKED, &dev->flags) &&
5112	(i == sh->pd_idx \|\| i == sh->qd_idx \|\|
5113	dev->written \|\| test_bit(R5_InJournal,
5114	&dev->flags))) {
5115	pr_debug("Writing block %d\n", i);
5116	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
5117	if (prexor)
5118	continue;
5119	if (s.failed > `1`)
5120	continue;
5121	if (!test_bit(R5_Insync, &dev->flags) \|\|
5122	((i == sh->pd_idx \|\| i == sh->qd_idx) &&
5123	s.failed == `0`))
5124	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
5125	}
5126	}
5127	if (test_and_clear_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
5128	s.dec_preread_active = `1`;
5129	}
5130
5131	/*
5132	* might be able to return some write requests if the parity blocks
5133	* are safe, or on a failed drive
5134	*/
5135	pdev = &sh->dev[sh->pd_idx];
5136	s.p_failed = (s.failed >= `1` && s.failed_num[`0`] == sh->pd_idx)
5137	\|\| (s.failed >= `2` && s.failed_num[`1`] == sh->pd_idx);
5138	qdev = &sh->dev[sh->qd_idx];
5139	s.q_failed = (s.failed >= `1` && s.failed_num[`0`] == sh->qd_idx)
5140	\|\| (s.failed >= `2` && s.failed_num[`1`] == sh->qd_idx)
5141	\|\| conf->level < `6`;
5142
5143	if (s.written &&
5144	(s.p_failed \|\| ((test_bit(R5_Insync, &pdev->flags)
5145	&& !test_bit(R5_LOCKED, &pdev->flags)
5146	&& (test_bit(R5_UPTODATE, &pdev->flags) \|\|
5147	test_bit(R5_Discard, &pdev->flags))))) &&
5148	(s.q_failed \|\| ((test_bit(R5_Insync, &qdev->flags)
5149	&& !test_bit(R5_LOCKED, &qdev->flags)
5150	&& (test_bit(R5_UPTODATE, &qdev->flags) \|\|
5151	test_bit(R5_Discard, &qdev->flags))))))
5152	handle_stripe_clean_event(conf, sh, disks);
5153
5154	if (s.just_cached)
5155	r5c_handle_cached_data_endio(conf, sh, disks);
5156	log_stripe_write_finished(sh);
5157
5158	/ Now we might consider reading some blocks, either to check/generate*
5159	* parity, or to satisfy requests
5160	* or to load a block that is being partially written.
5161	*/
5162	if (s.to_read \|\| s.non_overwrite
5163	\|\| (s.to_write && s.failed)
5164	\|\| (s.syncing && (s.uptodate + s.compute < disks))
5165	\|\| s.replacing
5166	\|\| s.expanding)
5167	handle_stripe_fill(sh, s: &s, disks);
5168
5169	/*
5170	* When the stripe finishes full journal write cycle (write to journal
5171	* and raid disk), this is the clean up procedure so it is ready for
5172	* next operation.
5173	*/
5174	r5c_finish_stripe_write_out(conf, sh, s: &s);
5175
5176	/*
5177	* Now to consider new write requests, cache write back and what else,
5178	* if anything should be read. We do not handle new writes when:
5179	* 1/ A 'write' operation (copy+xor) is already in flight.
5180	* 2/ A 'check' operation is in flight, as it may clobber the parity
5181	* block.
5182	* 3/ A r5c cache log write is in flight.
5183	*/
5184
5185	if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
5186	if (!r5c_is_writeback(log: conf->log)) {
5187	if (s.to_write)
5188	handle_stripe_dirtying(conf, sh, s: &s, disks);
5189	} else { / write back cache /
5190	int ret = `0`;
5191
5192	/ First, try handle writes in caching phase /
5193	if (s.to_write)
5194	ret = r5c_try_caching_write(conf, sh, s: &s,
5195	disks);
5196	/*
5197	* If caching phase failed: ret == -EAGAIN
5198	* OR
5199	* stripe under reclaim: !caching && injournal
5200	*
5201	* fall back to handle_stripe_dirtying()
5202	*/
5203	if (ret == -EAGAIN \|\|
5204	/ stripe under reclaim: !caching && injournal /
5205	(!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
5206	s.injournal > `0`)) {
5207	ret = handle_stripe_dirtying(conf, sh, s: &s,
5208	disks);
5209	if (ret == -EAGAIN)
5210	goto finish;
5211	}
5212	}
5213	}
5214
5215	/ maybe we need to check and possibly fix the parity for this stripe*
5216	* Any reads will already have been scheduled, so we just see if enough
5217	* data is available. The parity check is held off while parity
5218	* dependent operations are in flight.
5219	*/
5220	if (sh->check_state \|\|
5221	(s.syncing && s.locked == `0` &&
5222	!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5223	!test_bit(STRIPE_INSYNC, &sh->state))) {
5224	if (conf->level == `6`)
5225	handle_parity_checks6(conf, sh, s: &s, disks);
5226	else
5227	handle_parity_checks5(conf, sh, s: &s, disks);
5228	}
5229
5230	if ((s.replacing \|\| s.syncing) && s.locked == `0`
5231	&& !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
5232	&& !test_bit(STRIPE_REPLACED, &sh->state)) {
5233	/ Write out to replacement devices where possible /
5234	for (i = `0`; i < conf->raid_disks; i++)
5235	if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
5236	WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
5237	set_bit(nr: R5_WantReplace, addr: &sh->dev[i].flags);
5238	set_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
5239	s.locked++;
5240	}
5241	if (s.replacing)
5242	set_bit(nr: STRIPE_INSYNC, addr: &sh->state);
5243	set_bit(nr: STRIPE_REPLACED, addr: &sh->state);
5244	}
5245	if ((s.syncing \|\| s.replacing) && s.locked == `0` &&
5246	!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
5247	test_bit(STRIPE_INSYNC, &sh->state)) {
5248	md_done_sync(mddev: conf->mddev, RAID5_STRIPE_SECTORS(conf), ok: `1`);
5249	clear_bit(nr: STRIPE_SYNCING, addr: &sh->state);
5250	if (test_and_clear_bit(nr: R5_Overlap, addr: &sh->dev[sh->pd_idx].flags))
5251	wake_up(&conf->wait_for_overlap);
5252	}
5253
5254	/ If the failed drives are just a ReadError, then we might need*
5255	* to progress the repair/check process
5256	*/
5257	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
5258	for (i = `0`; i < s.failed; i++) {
5259	struct r5dev *dev = &sh->dev[s.failed_num[i]];
5260	if (test_bit(R5_ReadError, &dev->flags)
5261	&& !test_bit(R5_LOCKED, &dev->flags)
5262	&& test_bit(R5_UPTODATE, &dev->flags)
5263	) {
5264	if (!test_bit(R5_ReWrite, &dev->flags)) {
5265	set_bit(nr: R5_Wantwrite, addr: &dev->flags);
5266	set_bit(nr: R5_ReWrite, addr: &dev->flags);
5267	} else
5268	/ let's read it back /
5269	set_bit(nr: R5_Wantread, addr: &dev->flags);
5270	set_bit(nr: R5_LOCKED, addr: &dev->flags);
5271	s.locked++;
5272	}
5273	}
5274
5275	/ Finish reconstruct operations initiated by the expansion process /
5276	if (sh->reconstruct_state == reconstruct_state_result) {
5277	struct stripe_head *sh_src
5278	= raid5_get_active_stripe(conf, NULL, sector: sh->sector,
5279	R5_GAS_PREVIOUS \| R5_GAS_NOBLOCK \|
5280	R5_GAS_NOQUIESCE);
5281	if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
5282	/ sh cannot be written until sh_src has been read.*
5283	* so arrange for sh to be delayed a little
5284	*/
5285	set_bit(nr: STRIPE_DELAYED, addr: &sh->state);
5286	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
5287	if (!test_and_set_bit(nr: STRIPE_PREREAD_ACTIVE,
5288	addr: &sh_src->state))
5289	atomic_inc(v: &conf->preread_active_stripes);
5290	raid5_release_stripe(sh: sh_src);
5291	goto finish;
5292	}
5293	if (sh_src)
5294	raid5_release_stripe(sh: sh_src);
5295
5296	sh->reconstruct_state = reconstruct_state_idle;
5297	clear_bit(nr: STRIPE_EXPANDING, addr: &sh->state);
5298	for (i = conf->raid_disks; i--; ) {
5299	set_bit(nr: R5_Wantwrite, addr: &sh->dev[i].flags);
5300	set_bit(nr: R5_LOCKED, addr: &sh->dev[i].flags);
5301	s.locked++;
5302	}
5303	}
5304
5305	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
5306	!sh->reconstruct_state) {
5307	/ Need to write out all blocks after computing parity /
5308	sh->disks = conf->raid_disks;
5309	stripe_set_idx(stripe: sh->sector, conf, previous: `0`, sh);
5310	schedule_reconstruction(sh, s: &s, rcw: `1`, expand: `1`);
5311	} else if (s.expanded && !sh->reconstruct_state && s.locked == `0`) {
5312	clear_bit(nr: STRIPE_EXPAND_READY, addr: &sh->state);
5313	atomic_dec(v: &conf->reshape_stripes);
5314	wake_up(&conf->wait_for_overlap);
5315	md_done_sync(mddev: conf->mddev, RAID5_STRIPE_SECTORS(conf), ok: `1`);
5316	}
5317
5318	if (s.expanding && s.locked == `0` &&
5319	!test_bit(STRIPE_COMPUTE_RUN, &sh->state))
5320	handle_stripe_expansion(conf, sh);
5321
5322	finish:
5323	/ wait for this device to become unblocked /
5324	if (unlikely(s.blocked_rdev)) {
5325	if (conf->mddev->external)
5326	md_wait_for_blocked_rdev(rdev: s.blocked_rdev,
5327	mddev: conf->mddev);
5328	else
5329	/ Internal metadata will immediately*
5330	* be written by raid5d, so we don't
5331	* need to wait here.
5332	*/
5333	rdev_dec_pending(rdev: s.blocked_rdev,
5334	mddev: conf->mddev);
5335	}
5336
5337	if (s.handle_bad_blocks)
5338	for (i = disks; i--; ) {
5339	struct md_rdev *rdev;
5340	struct r5dev *dev = &sh->dev[i];
5341	if (test_and_clear_bit(nr: R5_WriteError, addr: &dev->flags)) {
5342	/ We own a safe reference to the rdev /
5343	rdev = rdev_pend_deref(rdev: conf->disks[i].rdev);
5344	if (!rdev_set_badblocks(rdev, s: sh->sector,
5345	RAID5_STRIPE_SECTORS(conf), is_new: `0`))
5346	md_error(mddev: conf->mddev, rdev);
5347	rdev_dec_pending(rdev, mddev: conf->mddev);
5348	}
5349	if (test_and_clear_bit(nr: R5_MadeGood, addr: &dev->flags)) {
5350	rdev = rdev_pend_deref(rdev: conf->disks[i].rdev);
5351	rdev_clear_badblocks(rdev, s: sh->sector,
5352	RAID5_STRIPE_SECTORS(conf), is_new: `0`);
5353	rdev_dec_pending(rdev, mddev: conf->mddev);
5354	}
5355	if (test_and_clear_bit(nr: R5_MadeGoodRepl, addr: &dev->flags)) {
5356	rdev = rdev_pend_deref(rdev: conf->disks[i].replacement);
5357	if (!rdev)
5358	/ rdev have been moved down /
5359	rdev = rdev_pend_deref(rdev: conf->disks[i].rdev);
5360	rdev_clear_badblocks(rdev, s: sh->sector,
5361	RAID5_STRIPE_SECTORS(conf), is_new: `0`);
5362	rdev_dec_pending(rdev, mddev: conf->mddev);
5363	}
5364	}
5365
5366	if (s.ops_request)
5367	raid_run_ops(sh, ops_request: s.ops_request);
5368
5369	ops_run_io(sh, s: &s);
5370
5371	if (s.dec_preread_active) {
5372	/ We delay this until after ops_run_io so that if make_request*
5373	* is waiting on a flush, it won't continue until the writes
5374	* have actually been submitted.
5375	*/
5376	atomic_dec(v: &conf->preread_active_stripes);
5377	if (atomic_read(v: &conf->preread_active_stripes) <
5378	IO_THRESHOLD)
5379	md_wakeup_thread(thread: conf->mddev->thread);
5380	}
5381
5382	clear_bit_unlock(nr: STRIPE_ACTIVE, addr: &sh->state);
5383	}
5384
5385	static void raid5_activate_delayed(struct r5conf *conf)
5386	__must_hold(&conf->device_lock)
5387	{
5388	if (atomic_read(v: &conf->preread_active_stripes) < IO_THRESHOLD) {
5389	while (!list_empty(head: &conf->delayed_list)) {
5390	struct list_head *l = conf->delayed_list.next;
5391	struct stripe_head *sh;
5392	sh = list_entry(l, struct stripe_head, lru);
5393	list_del_init(entry: l);
5394	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
5395	if (!test_and_set_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
5396	atomic_inc(v: &conf->preread_active_stripes);
5397	list_add_tail(new: &sh->lru, head: &conf->hold_list);
5398	raid5_wakeup_stripe_thread(sh);
5399	}
5400	}
5401	}
5402
5403	static void activate_bit_delay(struct r5conf *conf,
5404	struct list_head *temp_inactive_list)
5405	__must_hold(&conf->device_lock)
5406	{
5407	struct list_head head;
5408	list_add(new: &head, head: &conf->bitmap_list);
5409	list_del_init(entry: &conf->bitmap_list);
5410	while (!list_empty(head: &head)) {
5411	struct stripe_head sh = list_entry(head.next, struct* stripe_head, lru);
5412	int hash;
5413	list_del_init(entry: &sh->lru);
5414	atomic_inc(v: &sh->count);
5415	hash = sh->hash_lock_index;
5416	__release_stripe(conf, sh, temp_inactive_list: &temp_inactive_list[hash]);
5417	}
5418	}
5419
5420	static int in_chunk_boundary(struct mddev mddev, struct* bio *bio)
5421	{
5422	struct r5conf *conf = mddev->private;
5423	sector_t sector = bio->bi_iter.bi_sector;
5424	unsigned int chunk_sectors;
5425	unsigned int bio_sectors = bio_sectors(bio);
5426
5427	chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
5428	return chunk_sectors >=
5429	((sector & (chunk_sectors - `1`)) + bio_sectors);
5430	}
5431
5432	/*
5433	* add bio to the retry LIFO ( in O(1) ... we are in interrupt )
5434	* later sampled by raid5d.
5435	*/
5436	static void add_bio_to_retry(struct bio bi,struct* r5conf *conf)
5437	{
5438	unsigned long flags;
5439
5440	spin_lock_irqsave(&conf->device_lock, flags);
5441
5442	bi->bi_next = conf->retry_read_aligned_list;
5443	conf->retry_read_aligned_list = bi;
5444
5445	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
5446	md_wakeup_thread(thread: conf->mddev->thread);
5447	}
5448
5449	static struct bio remove_bio_from_retry(struct* r5conf *conf,
5450	unsigned int *offset)
5451	{
5452	struct bio *bi;
5453
5454	bi = conf->retry_read_aligned;
5455	if (bi) {
5456	*offset = conf->retry_read_offset;
5457	conf->retry_read_aligned = NULL;
5458	return bi;
5459	}
5460	bi = conf->retry_read_aligned_list;
5461	if(bi) {
5462	conf->retry_read_aligned_list = bi->bi_next;
5463	bi->bi_next = NULL;
5464	*offset = `0`;
5465	}
5466
5467	return bi;
5468	}
5469
5470	/*
5471	* The "raid5_align_endio" should check if the read succeeded and if it
5472	* did, call bio_endio on the original bio (having bio_put the new bio
5473	* first).
5474	* If the read failed..
5475	*/
5476	static void raid5_align_endio(struct bio *bi)
5477	{
5478	struct bio *raid_bi = bi->bi_private;
5479	struct md_rdev rdev = (void* *)raid_bi->bi_next;
5480	struct mddev *mddev = rdev->mddev;
5481	struct r5conf *conf = mddev->private;
5482	blk_status_t error = bi->bi_status;
5483
5484	bio_put(bi);
5485	raid_bi->bi_next = NULL;
5486	rdev_dec_pending(rdev, mddev: conf->mddev);
5487
5488	if (!error) {
5489	bio_endio(raid_bi);
5490	if (atomic_dec_and_test(v: &conf->active_aligned_reads))
5491	wake_up(&conf->wait_for_quiescent);
5492	return;
5493	}
5494
5495	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
5496
5497	add_bio_to_retry(bi: raid_bi, conf);
5498	}
5499
5500	static int raid5_read_one_chunk(struct mddev mddev, struct* bio *raid_bio)
5501	{
5502	struct r5conf *conf = mddev->private;
5503	struct bio *align_bio;
5504	struct md_rdev *rdev;
5505	sector_t sector, end_sector, first_bad;
5506	int bad_sectors, dd_idx;
5507	bool did_inc;
5508
5509	if (!in_chunk_boundary(mddev, bio: raid_bio)) {
5510	pr_debug("%s: non aligned\n", __func__);
5511	return `0`;
5512	}
5513
5514	sector = raid5_compute_sector(conf, r_sector: raid_bio->bi_iter.bi_sector, previous: `0`,
5515	dd_idx: &dd_idx, NULL);
5516	end_sector = sector + bio_sectors(raid_bio);
5517
5518	rcu_read_lock();
5519	if (r5c_big_stripe_cached(conf, sect: sector))
5520	goto out_rcu_unlock;
5521
5522	rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5523	if (!rdev \|\| test_bit(Faulty, &rdev->flags) \|\|
5524	rdev->recovery_offset < end_sector) {
5525	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5526	if (!rdev)
5527	goto out_rcu_unlock;
5528	if (test_bit(Faulty, &rdev->flags) \|\|
5529	!(test_bit(In_sync, &rdev->flags) \|\|
5530	rdev->recovery_offset >= end_sector))
5531	goto out_rcu_unlock;
5532	}
5533
5534	atomic_inc(v: &rdev->nr_pending);
5535	rcu_read_unlock();
5536
5537	if (is_badblock(rdev, s: sector, bio_sectors(raid_bio), first_bad: &first_bad,
5538	bad_sectors: &bad_sectors)) {
5539	rdev_dec_pending(rdev, mddev);
5540	return `0`;
5541	}
5542
5543	md_account_bio(mddev, bio: &raid_bio);
5544	raid_bio->bi_next = (void *)rdev;
5545
5546	align_bio = bio_alloc_clone(bdev: rdev->bdev, bio_src: raid_bio, GFP_NOIO,
5547	bs: &mddev->bio_set);
5548	align_bio->bi_end_io = raid5_align_endio;
5549	align_bio->bi_private = raid_bio;
5550	align_bio->bi_iter.bi_sector = sector;
5551
5552	/ No reshape active, so we can trust rdev->data_offset /
5553	align_bio->bi_iter.bi_sector += rdev->data_offset;
5554
5555	did_inc = false;
5556	if (conf->quiesce == `0`) {
5557	atomic_inc(v: &conf->active_aligned_reads);
5558	did_inc = true;
5559	}
5560	/ need a memory barrier to detect the race with raid5_quiesce() /
5561	if (!did_inc \|\| smp_load_acquire(&conf->quiesce) != `0`) {
5562	/ quiesce is in progress, so we need to undo io activation and wait*
5563	* for it to finish
5564	*/
5565	if (did_inc && atomic_dec_and_test(v: &conf->active_aligned_reads))
5566	wake_up(&conf->wait_for_quiescent);
5567	spin_lock_irq(lock: &conf->device_lock);
5568	wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == `0`,
5569	conf->device_lock);
5570	atomic_inc(v: &conf->active_aligned_reads);
5571	spin_unlock_irq(lock: &conf->device_lock);
5572	}
5573
5574	if (mddev->gendisk)
5575	trace_block_bio_remap(bio: align_bio, dev: disk_devt(disk: mddev->gendisk),
5576	from: raid_bio->bi_iter.bi_sector);
5577	submit_bio_noacct(bio: align_bio);
5578	return `1`;
5579
5580	out_rcu_unlock:
5581	rcu_read_unlock();
5582	return `0`;
5583	}
5584
5585	static struct bio chunk_aligned_read(struct* mddev mddev, struct* bio *raid_bio)
5586	{
5587	struct bio *split;
5588	sector_t sector = raid_bio->bi_iter.bi_sector;
5589	unsigned chunk_sects = mddev->chunk_sectors;
5590	unsigned sectors = chunk_sects - (sector & (chunk_sects-`1`));
5591
5592	if (sectors < bio_sectors(raid_bio)) {
5593	struct r5conf *conf = mddev->private;
5594	split = bio_split(bio: raid_bio, sectors, GFP_NOIO, bs: &conf->bio_split);
5595	bio_chain(split, raid_bio);
5596	submit_bio_noacct(bio: raid_bio);
5597	raid_bio = split;
5598	}
5599
5600	if (!raid5_read_one_chunk(mddev, raid_bio))
5601	return raid_bio;
5602
5603	return NULL;
5604	}
5605
5606	/ __get_priority_stripe - get the next stripe to process*
5607	*
5608	* Full stripe writes are allowed to pass preread active stripes up until
5609	* the bypass_threshold is exceeded. In general the bypass_count
5610	* increments when the handle_list is handled before the hold_list; however, it
5611	* will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
5612	* stripe with in flight i/o. The bypass_count will be reset when the
5613	* head of the hold_list has changed, i.e. the head was promoted to the
5614	* handle_list.
5615	*/
5616	static struct stripe_head __get_priority_stripe(struct* r5conf conf, int* group)
5617	__must_hold(&conf->device_lock)
5618	{
5619	struct stripe_head sh, tmp;
5620	struct list_head *handle_list = NULL;
5621	struct r5worker_group *wg;
5622	bool second_try = !r5c_is_writeback(log: conf->log) &&
5623	!r5l_log_disk_error(conf);
5624	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) \|\|
5625	r5l_log_disk_error(conf);
5626
5627	again:
5628	wg = NULL;
5629	sh = NULL;
5630	if (conf->worker_cnt_per_group == `0`) {
5631	handle_list = try_loprio ? &conf->loprio_list :
5632	&conf->handle_list;
5633	} else if (group != ANY_GROUP) {
5634	handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
5635	&conf->worker_groups[group].handle_list;
5636	wg = &conf->worker_groups[group];
5637	} else {
5638	int i;
5639	for (i = `0`; i < conf->group_cnt; i++) {
5640	handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
5641	&conf->worker_groups[i].handle_list;
5642	wg = &conf->worker_groups[i];
5643	if (!list_empty(head: handle_list))
5644	break;
5645	}
5646	}
5647
5648	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5649	__func__,
5650	list_empty(handle_list) ? "empty" : "busy",
5651	list_empty(&conf->hold_list) ? "empty" : "busy",
5652	atomic_read(&conf->pending_full_writes), conf->bypass_count);
5653
5654	if (!list_empty(head: handle_list)) {
5655	sh = list_entry(handle_list->next, typeof(*sh), lru);
5656
5657	if (list_empty(head: &conf->hold_list))
5658	conf->bypass_count = `0`;
5659	else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5660	if (conf->hold_list.next == conf->last_hold)
5661	conf->bypass_count++;
5662	else {
5663	conf->last_hold = conf->hold_list.next;
5664	conf->bypass_count -= conf->bypass_threshold;
5665	if (conf->bypass_count < `0`)
5666	conf->bypass_count = `0`;
5667	}
5668	}
5669	} else if (!list_empty(head: &conf->hold_list) &&
5670	((conf->bypass_threshold &&
5671	conf->bypass_count > conf->bypass_threshold) \|\|
5672	atomic_read(v: &conf->pending_full_writes) == `0`)) {
5673
5674	list_for_each_entry(tmp, &conf->hold_list, lru) {
5675	if (conf->worker_cnt_per_group == `0` \|\|
5676	group == ANY_GROUP \|\|
5677	!cpu_online(cpu: tmp->cpu) \|\|
5678	cpu_to_group(tmp->cpu) == group) {
5679	sh = tmp;
5680	break;
5681	}
5682	}
5683
5684	if (sh) {
5685	conf->bypass_count -= conf->bypass_threshold;
5686	if (conf->bypass_count < `0`)
5687	conf->bypass_count = `0`;
5688	}
5689	wg = NULL;
5690	}
5691
5692	if (!sh) {
5693	if (second_try)
5694	return NULL;
5695	second_try = true;
5696	try_loprio = !try_loprio;
5697	goto again;
5698	}
5699
5700	if (wg) {
5701	wg->stripes_cnt--;
5702	sh->group = NULL;
5703	}
5704	list_del_init(entry: &sh->lru);
5705	BUG_ON(atomic_inc_return(&sh->count) != `1`);
5706	return sh;
5707	}
5708
5709	struct raid5_plug_cb {
5710	struct blk_plug_cb cb;
5711	struct list_head list;
5712	struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5713	};
5714
5715	static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5716	{
5717	struct raid5_plug_cb *cb = container_of(
5718	blk_cb, struct raid5_plug_cb, cb);
5719	struct stripe_head *sh;
5720	struct mddev *mddev = cb->cb.data;
5721	struct r5conf *conf = mddev->private;
5722	int cnt = `0`;
5723	int hash;
5724
5725	if (cb->list.next && !list_empty(head: &cb->list)) {
5726	spin_lock_irq(lock: &conf->device_lock);
5727	while (!list_empty(head: &cb->list)) {
5728	sh = list_first_entry(&cb->list, struct stripe_head, lru);
5729	list_del_init(entry: &sh->lru);
5730	/*
5731	* avoid race release_stripe_plug() sees
5732	* STRIPE_ON_UNPLUG_LIST clear but the stripe
5733	* is still in our list
5734	*/
5735	smp_mb__before_atomic();
5736	clear_bit(nr: STRIPE_ON_UNPLUG_LIST, addr: &sh->state);
5737	/*
5738	* STRIPE_ON_RELEASE_LIST could be set here. In that
5739	* case, the count is always > 1 here
5740	*/
5741	hash = sh->hash_lock_index;
5742	__release_stripe(conf, sh, temp_inactive_list: &cb->temp_inactive_list[hash]);
5743	cnt++;
5744	}
5745	spin_unlock_irq(lock: &conf->device_lock);
5746	}
5747	release_inactive_stripe_list(conf, temp_inactive_list: cb->temp_inactive_list,
5748	NR_STRIPE_HASH_LOCKS);
5749	if (mddev->queue)
5750	trace_block_unplug(q: mddev->queue, depth: cnt, explicit: !from_schedule);
5751	kfree(objp: cb);
5752	}
5753
5754	static void release_stripe_plug(struct mddev *mddev,
5755	struct stripe_head *sh)
5756	{
5757	struct blk_plug_cb *blk_cb = blk_check_plugged(
5758	unplug: raid5_unplug, data: mddev,
5759	size: sizeof(struct raid5_plug_cb));
5760	struct raid5_plug_cb *cb;
5761
5762	if (!blk_cb) {
5763	raid5_release_stripe(sh);
5764	return;
5765	}
5766
5767	cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5768
5769	if (cb->list.next == NULL) {
5770	int i;
5771	INIT_LIST_HEAD(list: &cb->list);
5772	for (i = `0`; i < NR_STRIPE_HASH_LOCKS; i++)
5773	INIT_LIST_HEAD(list: cb->temp_inactive_list + i);
5774	}
5775
5776	if (!test_and_set_bit(nr: STRIPE_ON_UNPLUG_LIST, addr: &sh->state))
5777	list_add_tail(new: &sh->lru, head: &cb->list);
5778	else
5779	raid5_release_stripe(sh);
5780	}
5781
5782	static void make_discard_request(struct mddev mddev, struct* bio *bi)
5783	{
5784	struct r5conf *conf = mddev->private;
5785	sector_t logical_sector, last_sector;
5786	struct stripe_head *sh;
5787	int stripe_sectors;
5788
5789	/ We need to handle this when io_uring supports discard/trim /
5790	if (WARN_ON_ONCE(bi->bi_opf & REQ_NOWAIT))
5791	return;
5792
5793	if (mddev->reshape_position != MaxSector)
5794	/ Skip discard while reshape is happening /
5795	return;
5796
5797	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-`1`);
5798	last_sector = bio_end_sector(bi);
5799
5800	bi->bi_next = NULL;
5801
5802	stripe_sectors = conf->chunk_sectors *
5803	(conf->raid_disks - conf->max_degraded);
5804	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5805	stripe_sectors);
5806	sector_div(last_sector, stripe_sectors);
5807
5808	logical_sector *= conf->chunk_sectors;
5809	last_sector *= conf->chunk_sectors;
5810
5811	for (; logical_sector < last_sector;
5812	logical_sector += RAID5_STRIPE_SECTORS(conf)) {
5813	DEFINE_WAIT(w);
5814	int d;
5815	again:
5816	sh = raid5_get_active_stripe(conf, NULL, sector: logical_sector, flags: `0`);
5817	prepare_to_wait(wq_head: &conf->wait_for_overlap, wq_entry: &w,
5818	TASK_UNINTERRUPTIBLE);
5819	set_bit(nr: R5_Overlap, addr: &sh->dev[sh->pd_idx].flags);
5820	if (test_bit(STRIPE_SYNCING, &sh->state)) {
5821	raid5_release_stripe(sh);
5822	schedule();
5823	goto again;
5824	}
5825	clear_bit(nr: R5_Overlap, addr: &sh->dev[sh->pd_idx].flags);
5826	spin_lock_irq(lock: &sh->stripe_lock);
5827	for (d = `0`; d < conf->raid_disks; d++) {
5828	if (d == sh->pd_idx \|\| d == sh->qd_idx)
5829	continue;
5830	if (sh->dev[d].towrite \|\| sh->dev[d].toread) {
5831	set_bit(nr: R5_Overlap, addr: &sh->dev[d].flags);
5832	spin_unlock_irq(lock: &sh->stripe_lock);
5833	raid5_release_stripe(sh);
5834	schedule();
5835	goto again;
5836	}
5837	}
5838	set_bit(nr: STRIPE_DISCARD, addr: &sh->state);
5839	finish_wait(wq_head: &conf->wait_for_overlap, wq_entry: &w);
5840	sh->overwrite_disks = `0`;
5841	for (d = `0`; d < conf->raid_disks; d++) {
5842	if (d == sh->pd_idx \|\| d == sh->qd_idx)
5843	continue;
5844	sh->dev[d].towrite = bi;
5845	set_bit(nr: R5_OVERWRITE, addr: &sh->dev[d].flags);
5846	bio_inc_remaining(bio: bi);
5847	md_write_inc(mddev, bi);
5848	sh->overwrite_disks++;
5849	}
5850	spin_unlock_irq(lock: &sh->stripe_lock);
5851	if (conf->mddev->bitmap) {
5852	for (d = `0`;
5853	d < conf->raid_disks - conf->max_degraded;
5854	d++)
5855	md_bitmap_startwrite(bitmap: mddev->bitmap,
5856	offset: sh->sector,
5857	RAID5_STRIPE_SECTORS(conf),
5858	behind: `0`);
5859	sh->bm_seq = conf->seq_flush + `1`;
5860	set_bit(nr: STRIPE_BIT_DELAY, addr: &sh->state);
5861	}
5862
5863	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
5864	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
5865	if (!test_and_set_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
5866	atomic_inc(v: &conf->preread_active_stripes);
5867	release_stripe_plug(mddev, sh);
5868	}
5869
5870	bio_endio(bi);
5871	}
5872
5873	static bool ahead_of_reshape(struct mddev *mddev, sector_t sector,
5874	sector_t reshape_sector)
5875	{
5876	return mddev->reshape_backwards ? sector < reshape_sector :
5877	sector >= reshape_sector;
5878	}
5879
5880	static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min,
5881	sector_t max, sector_t reshape_sector)
5882	{
5883	return mddev->reshape_backwards ? max < reshape_sector :
5884	min >= reshape_sector;
5885	}
5886
5887	static bool stripe_ahead_of_reshape(struct mddev mddev, struct* r5conf *conf,
5888	struct stripe_head *sh)
5889	{
5890	sector_t max_sector = `0`, min_sector = MaxSector;
5891	bool ret = false;
5892	int dd_idx;
5893
5894	for (dd_idx = `0`; dd_idx < sh->disks; dd_idx++) {
5895	if (dd_idx == sh->pd_idx)
5896	continue;
5897
5898	min_sector = min(min_sector, sh->dev[dd_idx].sector);
5899	max_sector = min(max_sector, sh->dev[dd_idx].sector);
5900	}
5901
5902	spin_lock_irq(lock: &conf->device_lock);
5903
5904	if (!range_ahead_of_reshape(mddev, min: min_sector, max: max_sector,
5905	reshape_sector: conf->reshape_progress))
5906	/ mismatch, need to try again /
5907	ret = true;
5908
5909	spin_unlock_irq(lock: &conf->device_lock);
5910
5911	return ret;
5912	}
5913
5914	static int add_all_stripe_bios(struct r5conf *conf,
5915	struct stripe_request_ctx ctx, struct* stripe_head *sh,
5916	struct bio bi, int* forwrite, int previous)
5917	{
5918	int dd_idx;
5919	int ret = `1`;
5920
5921	spin_lock_irq(lock: &sh->stripe_lock);
5922
5923	for (dd_idx = `0`; dd_idx < sh->disks; dd_idx++) {
5924	struct r5dev *dev = &sh->dev[dd_idx];
5925
5926	if (dd_idx == sh->pd_idx \|\| dd_idx == sh->qd_idx)
5927	continue;
5928
5929	if (dev->sector < ctx->first_sector \|\|
5930	dev->sector >= ctx->last_sector)
5931	continue;
5932
5933	if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
5934	set_bit(nr: R5_Overlap, addr: &dev->flags);
5935	ret = `0`;
5936	continue;
5937	}
5938	}
5939
5940	if (!ret)
5941	goto out;
5942
5943	for (dd_idx = `0`; dd_idx < sh->disks; dd_idx++) {
5944	struct r5dev *dev = &sh->dev[dd_idx];
5945
5946	if (dd_idx == sh->pd_idx \|\| dd_idx == sh->qd_idx)
5947	continue;
5948
5949	if (dev->sector < ctx->first_sector \|\|
5950	dev->sector >= ctx->last_sector)
5951	continue;
5952
5953	__add_stripe_bio(sh, bi, dd_idx, forwrite, previous);
5954	clear_bit(nr: (dev->sector - ctx->first_sector) >>
5955	RAID5_STRIPE_SHIFT(conf), addr: ctx->sectors_to_do);
5956	}
5957
5958	out:
5959	spin_unlock_irq(lock: &sh->stripe_lock);
5960	return ret;
5961	}
5962
5963	static enum stripe_result make_stripe_request(struct mddev *mddev,
5964	struct r5conf conf, struct* stripe_request_ctx *ctx,
5965	sector_t logical_sector, struct bio *bi)
5966	{
5967	const int rw = bio_data_dir(bi);
5968	enum stripe_result ret;
5969	struct stripe_head *sh;
5970	sector_t new_sector;
5971	int previous = `0`, flags = `0`;
5972	int seq, dd_idx;
5973
5974	seq = read_seqcount_begin(&conf->gen_lock);
5975
5976	if (unlikely(conf->reshape_progress != MaxSector)) {
5977	/*
5978	* Spinlock is needed as reshape_progress may be
5979	* 64bit on a 32bit platform, and so it might be
5980	* possible to see a half-updated value
5981	* Of course reshape_progress could change after
5982	* the lock is dropped, so once we get a reference
5983	* to the stripe that we think it is, we will have
5984	* to check again.
5985	*/
5986	spin_lock_irq(lock: &conf->device_lock);
5987	if (ahead_of_reshape(mddev, sector: logical_sector,
5988	reshape_sector: conf->reshape_progress)) {
5989	previous = `1`;
5990	} else {
5991	if (ahead_of_reshape(mddev, sector: logical_sector,
5992	reshape_sector: conf->reshape_safe)) {
5993	spin_unlock_irq(lock: &conf->device_lock);
5994	return STRIPE_SCHEDULE_AND_RETRY;
5995	}
5996	}
5997	spin_unlock_irq(lock: &conf->device_lock);
5998	}
5999
6000	new_sector = raid5_compute_sector(conf, r_sector: logical_sector, previous,
6001	dd_idx: &dd_idx, NULL);
6002	pr_debug("raid456: %s, sector %llu logical %llu\n", __func__,
6003	new_sector, logical_sector);
6004
6005	if (previous)
6006	flags \|= R5_GAS_PREVIOUS;
6007	if (bi->bi_opf & REQ_RAHEAD)
6008	flags \|= R5_GAS_NOBLOCK;
6009	sh = raid5_get_active_stripe(conf, ctx, sector: new_sector, flags);
6010	if (unlikely(!sh)) {
6011	/ cannot get stripe, just give-up /
6012	bi->bi_status = BLK_STS_IOERR;
6013	return STRIPE_FAIL;
6014	}
6015
6016	if (unlikely(previous) &&
6017	stripe_ahead_of_reshape(mddev, conf, sh)) {
6018	/*
6019	* Expansion moved on while waiting for a stripe.
6020	* Expansion could still move past after this
6021	* test, but as we are holding a reference to
6022	* 'sh', we know that if that happens,
6023	* STRIPE_EXPANDING will get set and the expansion
6024	* won't proceed until we finish with the stripe.
6025	*/
6026	ret = STRIPE_SCHEDULE_AND_RETRY;
6027	goto out_release;
6028	}
6029
6030	if (read_seqcount_retry(&conf->gen_lock, seq)) {
6031	/ Might have got the wrong stripe_head by accident /
6032	ret = STRIPE_RETRY;
6033	goto out_release;
6034	}
6035
6036	if (test_bit(STRIPE_EXPANDING, &sh->state) \|\|
6037	!add_all_stripe_bios(conf, ctx, sh, bi, forwrite: rw, previous)) {
6038	/*
6039	* Stripe is busy expanding or add failed due to
6040	* overlap. Flush everything and wait a while.
6041	*/
6042	md_wakeup_thread(thread: mddev->thread);
6043	ret = STRIPE_SCHEDULE_AND_RETRY;
6044	goto out_release;
6045	}
6046
6047	if (stripe_can_batch(sh)) {
6048	stripe_add_to_batch_list(conf, sh, last_sh: ctx->batch_last);
6049	if (ctx->batch_last)
6050	raid5_release_stripe(sh: ctx->batch_last);
6051	atomic_inc(v: &sh->count);
6052	ctx->batch_last = sh;
6053	}
6054
6055	if (ctx->do_flush) {
6056	set_bit(nr: STRIPE_R5C_PREFLUSH, addr: &sh->state);
6057	/ we only need flush for one stripe /
6058	ctx->do_flush = false;
6059	}
6060
6061	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
6062	clear_bit(nr: STRIPE_DELAYED, addr: &sh->state);
6063	if ((!sh->batch_head \|\| sh == sh->batch_head) &&
6064	(bi->bi_opf & REQ_SYNC) &&
6065	!test_and_set_bit(nr: STRIPE_PREREAD_ACTIVE, addr: &sh->state))
6066	atomic_inc(v: &conf->preread_active_stripes);
6067
6068	release_stripe_plug(mddev, sh);
6069	return STRIPE_SUCCESS;
6070
6071	out_release:
6072	raid5_release_stripe(sh);
6073	return ret;
6074	}
6075
6076	/*
6077	* If the bio covers multiple data disks, find sector within the bio that has
6078	* the lowest chunk offset in the first chunk.
6079	*/
6080	static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
6081	struct bio *bi)
6082	{
6083	int sectors_per_chunk = conf->chunk_sectors;
6084	int raid_disks = conf->raid_disks;
6085	int dd_idx;
6086	struct stripe_head sh;
6087	unsigned int chunk_offset;
6088	sector_t r_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-`1`);
6089	sector_t sector;
6090
6091	/ We pass in fake stripe_head to get back parity disk numbers /
6092	sector = raid5_compute_sector(conf, r_sector, previous: `0`, dd_idx: &dd_idx, sh: &sh);
6093	chunk_offset = sector_div(sector, sectors_per_chunk);
6094	if (sectors_per_chunk - chunk_offset >= bio_sectors(bi))
6095	return r_sector;
6096	/*
6097	* Bio crosses to the next data disk. Check whether it's in the same
6098	* chunk.
6099	*/
6100	dd_idx++;
6101	while (dd_idx == sh.pd_idx \|\| dd_idx == sh.qd_idx)
6102	dd_idx++;
6103	if (dd_idx >= raid_disks)
6104	return r_sector;
6105	return r_sector + sectors_per_chunk - chunk_offset;
6106	}
6107
6108	static bool raid5_make_request(struct mddev mddev, struct* bio * bi)
6109	{
6110	DEFINE_WAIT_FUNC(wait, woken_wake_function);
6111	struct r5conf *conf = mddev->private;
6112	sector_t logical_sector;
6113	struct stripe_request_ctx ctx = {};
6114	const int rw = bio_data_dir(bi);
6115	enum stripe_result res;
6116	int s, stripe_cnt;
6117
6118	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
6119	int ret = log_handle_flush_request(conf, bio: bi);
6120
6121	if (ret == `0`)
6122	return true;
6123	if (ret == -ENODEV) {
6124	if (md_flush_request(mddev, bio: bi))
6125	return true;
6126	}
6127	/ ret == -EAGAIN, fallback /
6128	/*
6129	* if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
6130	* we need to flush journal device
6131	*/
6132	ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
6133	}
6134
6135	if (!md_write_start(mddev, bi))
6136	return false;
6137	/*
6138	* If array is degraded, better not do chunk aligned read because
6139	* later we might have to read it again in order to reconstruct
6140	* data on failed drives.
6141	*/
6142	if (rw == READ && mddev->degraded == `0` &&
6143	mddev->reshape_position == MaxSector) {
6144	bi = chunk_aligned_read(mddev, raid_bio: bi);
6145	if (!bi)
6146	return true;
6147	}
6148
6149	if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
6150	make_discard_request(mddev, bi);
6151	md_write_end(mddev);
6152	return true;
6153	}
6154
6155	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-`1`);
6156	ctx.first_sector = logical_sector;
6157	ctx.last_sector = bio_end_sector(bi);
6158	bi->bi_next = NULL;
6159
6160	stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector,
6161	RAID5_STRIPE_SECTORS(conf));
6162	bitmap_set(map: ctx.sectors_to_do, start: `0`, nbits: stripe_cnt);
6163
6164	pr_debug("raid456: %s, logical %llu to %llu\n", __func__,
6165	bi->bi_iter.bi_sector, ctx.last_sector);
6166
6167	/ Bail out if conflicts with reshape and REQ_NOWAIT is set /
6168	if ((bi->bi_opf & REQ_NOWAIT) &&
6169	(conf->reshape_progress != MaxSector) &&
6170	!ahead_of_reshape(mddev, sector: logical_sector, reshape_sector: conf->reshape_progress) &&
6171	ahead_of_reshape(mddev, sector: logical_sector, reshape_sector: conf->reshape_safe)) {
6172	bio_wouldblock_error(bio: bi);
6173	if (rw == WRITE)
6174	md_write_end(mddev);
6175	return true;
6176	}
6177	md_account_bio(mddev, bio: &bi);
6178
6179	/*
6180	* Lets start with the stripe with the lowest chunk offset in the first
6181	* chunk. That has the best chances of creating IOs adjacent to
6182	* previous IOs in case of sequential IO and thus creates the most
6183	* sequential IO pattern. We don't bother with the optimization when
6184	* reshaping as the performance benefit is not worth the complexity.
6185	*/
6186	if (likely(conf->reshape_progress == MaxSector))
6187	logical_sector = raid5_bio_lowest_chunk_sector(conf, bi);
6188	s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
6189
6190	add_wait_queue(wq_head: &conf->wait_for_overlap, wq_entry: &wait);
6191	while (`1`) {
6192	res = make_stripe_request(mddev, conf, ctx: &ctx, logical_sector,
6193	bi);
6194	if (res == STRIPE_FAIL)
6195	break;
6196
6197	if (res == STRIPE_RETRY)
6198	continue;
6199
6200	if (res == STRIPE_SCHEDULE_AND_RETRY) {
6201	/*
6202	* Must release the reference to batch_last before
6203	* scheduling and waiting for work to be done,
6204	* otherwise the batch_last stripe head could prevent
6205	* raid5_activate_delayed() from making progress
6206	* and thus deadlocking.
6207	*/
6208	if (ctx.batch_last) {
6209	raid5_release_stripe(sh: ctx.batch_last);
6210	ctx.batch_last = NULL;
6211	}
6212
6213	wait_woken(wq_entry: &wait, TASK_UNINTERRUPTIBLE,
6214	MAX_SCHEDULE_TIMEOUT);
6215	continue;
6216	}
6217
6218	s = find_next_bit_wrap(addr: ctx.sectors_to_do, size: stripe_cnt, offset: s);
6219	if (s == stripe_cnt)
6220	break;
6221
6222	logical_sector = ctx.first_sector +
6223	(s << RAID5_STRIPE_SHIFT(conf));
6224	}
6225	remove_wait_queue(wq_head: &conf->wait_for_overlap, wq_entry: &wait);
6226
6227	if (ctx.batch_last)
6228	raid5_release_stripe(sh: ctx.batch_last);
6229
6230	if (rw == WRITE)
6231	md_write_end(mddev);
6232	bio_endio(bi);
6233	return true;
6234	}
6235
6236	static sector_t raid5_size(struct mddev mddev, sector_t sectors, int* raid_disks);
6237
6238	static sector_t reshape_request(struct mddev mddev, sector_t sector_nr, int* *skipped)
6239	{
6240	/ reshaping is quite different to recovery/resync so it is*
6241	* handled quite separately ... here.
6242	*
6243	* On each call to sync_request, we gather one chunk worth of
6244	* destination stripes and flag them as expanding.
6245	* Then we find all the source stripes and request reads.
6246	* As the reads complete, handle_stripe will copy the data
6247	* into the destination stripe and release that stripe.
6248	*/
6249	struct r5conf *conf = mddev->private;
6250	struct stripe_head *sh;
6251	struct md_rdev *rdev;
6252	sector_t first_sector, last_sector;
6253	int raid_disks = conf->previous_raid_disks;
6254	int data_disks = raid_disks - conf->max_degraded;
6255	int new_data_disks = conf->raid_disks - conf->max_degraded;
6256	int i;
6257	int dd_idx;
6258	sector_t writepos, readpos, safepos;
6259	sector_t stripe_addr;
6260	int reshape_sectors;
6261	struct list_head stripes;
6262	sector_t retn;
6263
6264	if (sector_nr == `0`) {
6265	/ If restarting in the middle, skip the initial sectors /
6266	if (mddev->reshape_backwards &&
6267	conf->reshape_progress < raid5_size(mddev, sectors: `0`, raid_disks: `0`)) {
6268	sector_nr = raid5_size(mddev, sectors: `0`, raid_disks: `0`)
6269	- conf->reshape_progress;
6270	} else if (mddev->reshape_backwards &&
6271	conf->reshape_progress == MaxSector) {
6272	/ shouldn't happen, but just in case, finish up./
6273	sector_nr = MaxSector;
6274	} else if (!mddev->reshape_backwards &&
6275	conf->reshape_progress > `0`)
6276	sector_nr = conf->reshape_progress;
6277	sector_div(sector_nr, new_data_disks);
6278	if (sector_nr) {
6279	mddev->curr_resync_completed = sector_nr;
6280	sysfs_notify_dirent_safe(sd: mddev->sysfs_completed);
6281	*skipped = `1`;
6282	retn = sector_nr;
6283	goto finish;
6284	}
6285	}
6286
6287	/ We need to process a full chunk at a time.*
6288	* If old and new chunk sizes differ, we need to process the
6289	* largest of these
6290	*/
6291
6292	reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
6293
6294	/ We update the metadata at least every 10 seconds, or when*
6295	* the data about to be copied would over-write the source of
6296	* the data at the front of the range. i.e. one new_stripe
6297	* along from reshape_progress new_maps to after where
6298	* reshape_safe old_maps to
6299	*/
6300	writepos = conf->reshape_progress;
6301	sector_div(writepos, new_data_disks);
6302	readpos = conf->reshape_progress;
6303	sector_div(readpos, data_disks);
6304	safepos = conf->reshape_safe;
6305	sector_div(safepos, data_disks);
6306	if (mddev->reshape_backwards) {
6307	BUG_ON(writepos < reshape_sectors);
6308	writepos -= reshape_sectors;
6309	readpos += reshape_sectors;
6310	safepos += reshape_sectors;
6311	} else {
6312	writepos += reshape_sectors;
6313	/ readpos and safepos are worst-case calculations.*
6314	* A negative number is overly pessimistic, and causes
6315	* obvious problems for unsigned storage. So clip to 0.
6316	*/
6317	readpos -= min_t(sector_t, reshape_sectors, readpos);
6318	safepos -= min_t(sector_t, reshape_sectors, safepos);
6319	}
6320
6321	/ Having calculated the 'writepos' possibly use it*
6322	* to set 'stripe_addr' which is where we will write to.
6323	*/
6324	if (mddev->reshape_backwards) {
6325	BUG_ON(conf->reshape_progress == `0`);
6326	stripe_addr = writepos;
6327	BUG_ON((mddev->dev_sectors &
6328	~((sector_t)reshape_sectors - `1`))
6329	- reshape_sectors - stripe_addr
6330	!= sector_nr);
6331	} else {
6332	BUG_ON(writepos != sector_nr + reshape_sectors);
6333	stripe_addr = sector_nr;
6334	}
6335
6336	/ 'writepos' is the most advanced device address we might write.*
6337	* 'readpos' is the least advanced device address we might read.
6338	* 'safepos' is the least address recorded in the metadata as having
6339	* been reshaped.
6340	* If there is a min_offset_diff, these are adjusted either by
6341	* increasing the safepos/readpos if diff is negative, or
6342	* increasing writepos if diff is positive.
6343	* If 'readpos' is then behind 'writepos', there is no way that we can
6344	* ensure safety in the face of a crash - that must be done by userspace
6345	* making a backup of the data. So in that case there is no particular
6346	* rush to update metadata.
6347	* Otherwise if 'safepos' is behind 'writepos', then we really need to
6348	* update the metadata to advance 'safepos' to match 'readpos' so that
6349	* we can be safe in the event of a crash.
6350	* So we insist on updating metadata if safepos is behind writepos and
6351	* readpos is beyond writepos.
6352	* In any case, update the metadata every 10 seconds.
6353	* Maybe that number should be configurable, but I'm not sure it is
6354	* worth it.... maybe it could be a multiple of safemode_delay???
6355	*/
6356	if (conf->min_offset_diff < `0`) {
6357	safepos += -conf->min_offset_diff;
6358	readpos += -conf->min_offset_diff;
6359	} else
6360	writepos += conf->min_offset_diff;
6361
6362	if ((mddev->reshape_backwards
6363	? (safepos > writepos && readpos < writepos)
6364	: (safepos < writepos && readpos > writepos)) \|\|
6365	time_after(jiffies, conf->reshape_checkpoint + `10`*HZ)) {
6366	/ Cannot proceed until we've updated the superblock... /
6367	wait_event(conf->wait_for_overlap,
6368	atomic_read(&conf->reshape_stripes)==`0`
6369	\|\| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6370	if (atomic_read(v: &conf->reshape_stripes) != `0`)
6371	return `0`;
6372	mddev->reshape_position = conf->reshape_progress;
6373	mddev->curr_resync_completed = sector_nr;
6374	if (!mddev->reshape_backwards)
6375	/ Can update recovery_offset /
6376	rdev_for_each(rdev, mddev)
6377	if (rdev->raid_disk >= `0` &&
6378	!test_bit(Journal, &rdev->flags) &&
6379	!test_bit(In_sync, &rdev->flags) &&
6380	rdev->recovery_offset < sector_nr)
6381	rdev->recovery_offset = sector_nr;
6382
6383	conf->reshape_checkpoint = jiffies;
6384	set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags);
6385	md_wakeup_thread(thread: mddev->thread);
6386	wait_event(mddev->sb_wait, mddev->sb_flags == `0` \|\|
6387	test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6388	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6389	return `0`;
6390	spin_lock_irq(lock: &conf->device_lock);
6391	conf->reshape_safe = mddev->reshape_position;
6392	spin_unlock_irq(lock: &conf->device_lock);
6393	wake_up(&conf->wait_for_overlap);
6394	sysfs_notify_dirent_safe(sd: mddev->sysfs_completed);
6395	}
6396
6397	INIT_LIST_HEAD(list: &stripes);
6398	for (i = `0`; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
6399	int j;
6400	int skipped_disk = `0`;
6401	sh = raid5_get_active_stripe(conf, NULL, sector: stripe_addr+i,
6402	R5_GAS_NOQUIESCE);
6403	set_bit(nr: STRIPE_EXPANDING, addr: &sh->state);
6404	atomic_inc(v: &conf->reshape_stripes);
6405	/ If any of this stripe is beyond the end of the old*
6406	* array, then we need to zero those blocks
6407	*/
6408	for (j=sh->disks; j--;) {
6409	sector_t s;
6410	if (j == sh->pd_idx)
6411	continue;
6412	if (conf->level == `6` &&
6413	j == sh->qd_idx)
6414	continue;
6415	s = raid5_compute_blocknr(sh, i: j, previous: `0`);
6416	if (s < raid5_size(mddev, sectors: `0`, raid_disks: `0`)) {
6417	skipped_disk = `1`;
6418	continue;
6419	}
6420	memset(page_address(sh->dev[j].page), `0`, RAID5_STRIPE_SIZE(conf));
6421	set_bit(nr: R5_Expanded, addr: &sh->dev[j].flags);
6422	set_bit(nr: R5_UPTODATE, addr: &sh->dev[j].flags);
6423	}
6424	if (!skipped_disk) {
6425	set_bit(nr: STRIPE_EXPAND_READY, addr: &sh->state);
6426	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
6427	}
6428	list_add(new: &sh->lru, head: &stripes);
6429	}
6430	spin_lock_irq(lock: &conf->device_lock);
6431	if (mddev->reshape_backwards)
6432	conf->reshape_progress -= reshape_sectors * new_data_disks;
6433	else
6434	conf->reshape_progress += reshape_sectors * new_data_disks;
6435	spin_unlock_irq(lock: &conf->device_lock);
6436	/ Ok, those stripe are ready. We can start scheduling*
6437	* reads on the source stripes.
6438	* The source stripes are determined by mapping the first and last
6439	* block on the destination stripes.
6440	*/
6441	first_sector =
6442	raid5_compute_sector(conf, r_sector: stripe_addr*(new_data_disks),
6443	previous: `1`, dd_idx: &dd_idx, NULL);
6444	last_sector =
6445	raid5_compute_sector(conf, r_sector: ((stripe_addr+reshape_sectors)
6446	* new_data_disks - `1`),
6447	previous: `1`, dd_idx: &dd_idx, NULL);
6448	if (last_sector >= mddev->dev_sectors)
6449	last_sector = mddev->dev_sectors - `1`;
6450	while (first_sector <= last_sector) {
6451	sh = raid5_get_active_stripe(conf, NULL, sector: first_sector,
6452	R5_GAS_PREVIOUS \| R5_GAS_NOQUIESCE);
6453	set_bit(nr: STRIPE_EXPAND_SOURCE, addr: &sh->state);
6454	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
6455	raid5_release_stripe(sh);
6456	first_sector += RAID5_STRIPE_SECTORS(conf);
6457	}
6458	/ Now that the sources are clearly marked, we can release*
6459	* the destination stripes
6460	*/
6461	while (!list_empty(head: &stripes)) {
6462	sh = list_entry(stripes.next, struct stripe_head, lru);
6463	list_del_init(entry: &sh->lru);
6464	raid5_release_stripe(sh);
6465	}
6466	/ If this takes us to the resync_max point where we have to pause,*
6467	* then we need to write out the superblock.
6468	*/
6469	sector_nr += reshape_sectors;
6470	retn = reshape_sectors;
6471	finish:
6472	if (mddev->curr_resync_completed > mddev->resync_max \|\|
6473	(sector_nr - mddev->curr_resync_completed) * `2`
6474	>= mddev->resync_max - mddev->curr_resync_completed) {
6475	/ Cannot proceed until we've updated the superblock... /
6476	wait_event(conf->wait_for_overlap,
6477	atomic_read(&conf->reshape_stripes) == `0`
6478	\|\| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6479	if (atomic_read(v: &conf->reshape_stripes) != `0`)
6480	goto ret;
6481	mddev->reshape_position = conf->reshape_progress;
6482	mddev->curr_resync_completed = sector_nr;
6483	if (!mddev->reshape_backwards)
6484	/ Can update recovery_offset /
6485	rdev_for_each(rdev, mddev)
6486	if (rdev->raid_disk >= `0` &&
6487	!test_bit(Journal, &rdev->flags) &&
6488	!test_bit(In_sync, &rdev->flags) &&
6489	rdev->recovery_offset < sector_nr)
6490	rdev->recovery_offset = sector_nr;
6491	conf->reshape_checkpoint = jiffies;
6492	set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags);
6493	md_wakeup_thread(thread: mddev->thread);
6494	wait_event(mddev->sb_wait,
6495	!test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
6496	\|\| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
6497	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6498	goto ret;
6499	spin_lock_irq(lock: &conf->device_lock);
6500	conf->reshape_safe = mddev->reshape_position;
6501	spin_unlock_irq(lock: &conf->device_lock);
6502	wake_up(&conf->wait_for_overlap);
6503	sysfs_notify_dirent_safe(sd: mddev->sysfs_completed);
6504	}
6505	ret:
6506	return retn;
6507	}
6508
6509	static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
6510	int *skipped)
6511	{
6512	struct r5conf *conf = mddev->private;
6513	struct stripe_head *sh;
6514	sector_t max_sector = mddev->dev_sectors;
6515	sector_t sync_blocks;
6516	int still_degraded = `0`;
6517	int i;
6518
6519	if (sector_nr >= max_sector) {
6520	/ just being told to finish up .. nothing much to do /
6521
6522	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
6523	end_reshape(conf);
6524	return `0`;
6525	}
6526
6527	if (mddev->curr_resync < max_sector) / aborted /
6528	md_bitmap_end_sync(bitmap: mddev->bitmap, offset: mddev->curr_resync,
6529	blocks: &sync_blocks, aborted: `1`);
6530	else / completed sync /
6531	conf->fullsync = `0`;
6532	md_bitmap_close_sync(bitmap: mddev->bitmap);
6533
6534	return `0`;
6535	}
6536
6537	/ Allow raid5_quiesce to complete /
6538	wait_event(conf->wait_for_overlap, conf->quiesce != `2`);
6539
6540	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6541	return reshape_request(mddev, sector_nr, skipped);
6542
6543	/ No need to check resync_max as we never do more than one*
6544	* stripe, and as resync_max will always be on a chunk boundary,
6545	* if the check in md_do_sync didn't fire, there is no chance
6546	* of overstepping resync_max here
6547	*/
6548
6549	/ if there is too many failed drives and we are trying*
6550	* to resync, then assert that we are finished, because there is
6551	* nothing we can do.
6552	*/
6553	if (mddev->degraded >= conf->max_degraded &&
6554	test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6555	sector_t rv = mddev->dev_sectors - sector_nr;
6556	*skipped = `1`;
6557	return rv;
6558	}
6559	if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
6560	!conf->fullsync &&
6561	!md_bitmap_start_sync(bitmap: mddev->bitmap, offset: sector_nr, blocks: &sync_blocks, degraded: `1`) &&
6562	sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
6563	/ we can skip this block, and probably more /
6564	do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
6565	*skipped = `1`;
6566	/ keep things rounded to whole stripes /
6567	return sync_blocks * RAID5_STRIPE_SECTORS(conf);
6568	}
6569
6570	md_bitmap_cond_end_sync(bitmap: mddev->bitmap, sector: sector_nr, force: false);
6571
6572	sh = raid5_get_active_stripe(conf, NULL, sector: sector_nr,
6573	R5_GAS_NOBLOCK);
6574	if (sh == NULL) {
6575	sh = raid5_get_active_stripe(conf, NULL, sector: sector_nr, flags: `0`);
6576	/ make sure we don't swamp the stripe cache if someone else*
6577	* is trying to get access
6578	*/
6579	schedule_timeout_uninterruptible(timeout: `1`);
6580	}
6581	/ Need to check if array will still be degraded after recovery/resync*
6582	* Note in case of > 1 drive failures it's possible we're rebuilding
6583	* one drive while leaving another faulty drive in array.
6584	*/
6585	rcu_read_lock();
6586	for (i = `0`; i < conf->raid_disks; i++) {
6587	struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
6588
6589	if (rdev == NULL \|\| test_bit(Faulty, &rdev->flags))
6590	still_degraded = `1`;
6591	}
6592	rcu_read_unlock();
6593
6594	md_bitmap_start_sync(bitmap: mddev->bitmap, offset: sector_nr, blocks: &sync_blocks, degraded: still_degraded);
6595
6596	set_bit(nr: STRIPE_SYNC_REQUESTED, addr: &sh->state);
6597	set_bit(nr: STRIPE_HANDLE, addr: &sh->state);
6598
6599	raid5_release_stripe(sh);
6600
6601	return RAID5_STRIPE_SECTORS(conf);
6602	}
6603
6604	static int retry_aligned_read(struct r5conf conf, struct* bio *raid_bio,
6605	unsigned int offset)
6606	{
6607	/ We may not be able to submit a whole bio at once as there*
6608	* may not be enough stripe_heads available.
6609	* We cannot pre-allocate enough stripe_heads as we may need
6610	* more than exist in the cache (if we allow ever large chunks).
6611	* So we do one stripe head at a time and record in
6612	* ->bi_hw_segments how many have been done.
6613	*
6614	* We know that this entire raid_bio is in one chunk, so
6615	* it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
6616	*/
6617	struct stripe_head *sh;
6618	int dd_idx;
6619	sector_t sector, logical_sector, last_sector;
6620	int scnt = `0`;
6621	int handled = `0`;
6622
6623	logical_sector = raid_bio->bi_iter.bi_sector &
6624	~((sector_t)RAID5_STRIPE_SECTORS(conf)-`1`);
6625	sector = raid5_compute_sector(conf, r_sector: logical_sector,
6626	previous: `0`, dd_idx: &dd_idx, NULL);
6627	last_sector = bio_end_sector(raid_bio);
6628
6629	for (; logical_sector < last_sector;
6630	logical_sector += RAID5_STRIPE_SECTORS(conf),
6631	sector += RAID5_STRIPE_SECTORS(conf),
6632	scnt++) {
6633
6634	if (scnt < offset)
6635	/ already done this stripe /
6636	continue;
6637
6638	sh = raid5_get_active_stripe(conf, NULL, sector,
6639	R5_GAS_NOBLOCK \| R5_GAS_NOQUIESCE);
6640	if (!sh) {
6641	/ failed to get a stripe - must wait /
6642	conf->retry_read_aligned = raid_bio;
6643	conf->retry_read_offset = scnt;
6644	return handled;
6645	}
6646
6647	if (!add_stripe_bio(sh, bi: raid_bio, dd_idx, forwrite: `0`, previous: `0`)) {
6648	raid5_release_stripe(sh);
6649	conf->retry_read_aligned = raid_bio;
6650	conf->retry_read_offset = scnt;
6651	return handled;
6652	}
6653
6654	set_bit(nr: R5_ReadNoMerge, addr: &sh->dev[dd_idx].flags);
6655	handle_stripe(sh);
6656	raid5_release_stripe(sh);
6657	handled++;
6658	}
6659
6660	bio_endio(raid_bio);
6661
6662	if (atomic_dec_and_test(v: &conf->active_aligned_reads))
6663	wake_up(&conf->wait_for_quiescent);
6664	return handled;
6665	}
6666
6667	static int handle_active_stripes(struct r5conf conf, int* group,
6668	struct r5worker *worker,
6669	struct list_head *temp_inactive_list)
6670	__must_hold(&conf->device_lock)
6671	{
6672	struct stripe_head batch[MAX_STRIPE_BATCH], sh;
6673	int i, batch_size = `0`, hash;
6674	bool release_inactive = false;
6675
6676	while (batch_size < MAX_STRIPE_BATCH &&
6677	(sh = __get_priority_stripe(conf, group)) != NULL)
6678	batch[batch_size++] = sh;
6679
6680	if (batch_size == `0`) {
6681	for (i = `0`; i < NR_STRIPE_HASH_LOCKS; i++)
6682	if (!list_empty(head: temp_inactive_list + i))
6683	break;
6684	if (i == NR_STRIPE_HASH_LOCKS) {
6685	spin_unlock_irq(lock: &conf->device_lock);
6686	log_flush_stripe_to_raid(conf);
6687	spin_lock_irq(lock: &conf->device_lock);
6688	return batch_size;
6689	}
6690	release_inactive = true;
6691	}
6692	spin_unlock_irq(lock: &conf->device_lock);
6693
6694	release_inactive_stripe_list(conf, temp_inactive_list,
6695	NR_STRIPE_HASH_LOCKS);
6696
6697	r5l_flush_stripe_to_raid(log: conf->log);
6698	if (release_inactive) {
6699	spin_lock_irq(lock: &conf->device_lock);
6700	return `0`;
6701	}
6702
6703	for (i = `0`; i < batch_size; i++)
6704	handle_stripe(sh: batch[i]);
6705	log_write_stripe_run(conf);
6706
6707	cond_resched();
6708
6709	spin_lock_irq(lock: &conf->device_lock);
6710	for (i = `0`; i < batch_size; i++) {
6711	hash = batch[i]->hash_lock_index;
6712	__release_stripe(conf, sh: batch[i], temp_inactive_list: &temp_inactive_list[hash]);
6713	}
6714	return batch_size;
6715	}
6716
6717	static void raid5_do_work(struct work_struct *work)
6718	{
6719	struct r5worker worker = container_of(work, struct* r5worker, work);
6720	struct r5worker_group *group = worker->group;
6721	struct r5conf *conf = group->conf;
6722	struct mddev *mddev = conf->mddev;
6723	int group_id = group - conf->worker_groups;
6724	int handled;
6725	struct blk_plug plug;
6726
6727	pr_debug("+++ raid5worker active\n");
6728
6729	blk_start_plug(&plug);
6730	handled = `0`;
6731	spin_lock_irq(lock: &conf->device_lock);
6732	while (`1`) {
6733	int batch_size, released;
6734
6735	released = release_stripe_list(conf, temp_inactive_list: worker->temp_inactive_list);
6736
6737	batch_size = handle_active_stripes(conf, group: group_id, worker,
6738	temp_inactive_list: worker->temp_inactive_list);
6739	worker->working = false;
6740	if (!batch_size && !released)
6741	break;
6742	handled += batch_size;
6743	wait_event_lock_irq(mddev->sb_wait,
6744	!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6745	conf->device_lock);
6746	}
6747	pr_debug("%d stripes handled\n", handled);
6748
6749	spin_unlock_irq(lock: &conf->device_lock);
6750
6751	flush_deferred_bios(conf);
6752
6753	r5l_flush_stripe_to_raid(log: conf->log);
6754
6755	async_tx_issue_pending_all();
6756	blk_finish_plug(&plug);
6757
6758	pr_debug("--- raid5worker inactive\n");
6759	}
6760
6761	/*
6762	* This is our raid5 kernel thread.
6763	*
6764	* We scan the hash table for stripes which can be handled now.
6765	* During the scan, completed stripes are saved for us by the interrupt
6766	* handler, so that they will not have to wait for our next wakeup.
6767	*/
6768	static void raid5d(struct md_thread *thread)
6769	{
6770	struct mddev *mddev = thread->mddev;
6771	struct r5conf *conf = mddev->private;
6772	int handled;
6773	struct blk_plug plug;
6774
6775	pr_debug("+++ raid5d active\n");
6776
6777	md_check_recovery(mddev);
6778
6779	blk_start_plug(&plug);
6780	handled = `0`;
6781	spin_lock_irq(lock: &conf->device_lock);
6782	while (`1`) {
6783	struct bio *bio;
6784	int batch_size, released;
6785	unsigned int offset;
6786
6787	released = release_stripe_list(conf, temp_inactive_list: conf->temp_inactive_list);
6788	if (released)
6789	clear_bit(nr: R5_DID_ALLOC, addr: &conf->cache_state);
6790
6791	if (
6792	!list_empty(head: &conf->bitmap_list)) {
6793	/ Now is a good time to flush some bitmap updates /
6794	conf->seq_flush++;
6795	spin_unlock_irq(lock: &conf->device_lock);
6796	md_bitmap_unplug(bitmap: mddev->bitmap);
6797	spin_lock_irq(lock: &conf->device_lock);
6798	conf->seq_write = conf->seq_flush;
6799	activate_bit_delay(conf, temp_inactive_list: conf->temp_inactive_list);
6800	}
6801	raid5_activate_delayed(conf);
6802
6803	while ((bio = remove_bio_from_retry(conf, offset: &offset))) {
6804	int ok;
6805	spin_unlock_irq(lock: &conf->device_lock);
6806	ok = retry_aligned_read(conf, raid_bio: bio, offset);
6807	spin_lock_irq(lock: &conf->device_lock);
6808	if (!ok)
6809	break;
6810	handled++;
6811	}
6812
6813	batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6814	temp_inactive_list: conf->temp_inactive_list);
6815	if (!batch_size && !released)
6816	break;
6817	handled += batch_size;
6818
6819	if (mddev->sb_flags & ~(`1` << MD_SB_CHANGE_PENDING)) {
6820	spin_unlock_irq(lock: &conf->device_lock);
6821	md_check_recovery(mddev);
6822	spin_lock_irq(lock: &conf->device_lock);
6823
6824	/*
6825	* Waiting on MD_SB_CHANGE_PENDING below may deadlock
6826	* seeing md_check_recovery() is needed to clear
6827	* the flag when using mdmon.
6828	*/
6829	continue;
6830	}
6831
6832	wait_event_lock_irq(mddev->sb_wait,
6833	!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
6834	conf->device_lock);
6835	}
6836	pr_debug("%d stripes handled\n", handled);
6837
6838	spin_unlock_irq(lock: &conf->device_lock);
6839	if (test_and_clear_bit(nr: R5_ALLOC_MORE, addr: &conf->cache_state) &&
6840	mutex_trylock(lock: &conf->cache_size_mutex)) {
6841	grow_one_stripe(conf, __GFP_NOWARN);
6842	/ Set flag even if allocation failed. This helps*
6843	* slow down allocation requests when mem is short
6844	*/
6845	set_bit(nr: R5_DID_ALLOC, addr: &conf->cache_state);
6846	mutex_unlock(lock: &conf->cache_size_mutex);
6847	}
6848
6849	flush_deferred_bios(conf);
6850
6851	r5l_flush_stripe_to_raid(log: conf->log);
6852
6853	async_tx_issue_pending_all();
6854	blk_finish_plug(&plug);
6855
6856	pr_debug("--- raid5d inactive\n");
6857	}
6858
6859	static ssize_t
6860	raid5_show_stripe_cache_size(struct mddev mddev, char* *page)
6861	{
6862	struct r5conf *conf;
6863	int ret = `0`;
6864	spin_lock(lock: &mddev->lock);
6865	conf = mddev->private;
6866	if (conf)
6867	ret = sprintf(buf: page, fmt: "%d\n", conf->min_nr_stripes);
6868	spin_unlock(lock: &mddev->lock);
6869	return ret;
6870	}
6871
6872	int
6873	raid5_set_cache_size(struct mddev mddev, int* size)
6874	{
6875	int result = `0`;
6876	struct r5conf *conf = mddev->private;
6877
6878	if (size <= `16` \|\| size > `32768`)
6879	return -EINVAL;
6880
6881	conf->min_nr_stripes = size;
6882	mutex_lock(&conf->cache_size_mutex);
6883	while (size < conf->max_nr_stripes &&
6884	drop_one_stripe(conf))
6885	;
6886	mutex_unlock(lock: &conf->cache_size_mutex);
6887
6888	md_allow_write(mddev);
6889
6890	mutex_lock(&conf->cache_size_mutex);
6891	while (size > conf->max_nr_stripes)
6892	if (!grow_one_stripe(conf, GFP_KERNEL)) {
6893	conf->min_nr_stripes = conf->max_nr_stripes;
6894	result = -ENOMEM;
6895	break;
6896	}
6897	mutex_unlock(lock: &conf->cache_size_mutex);
6898
6899	return result;
6900	}
6901	EXPORT_SYMBOL(raid5_set_cache_size);
6902
6903	static ssize_t
6904	raid5_store_stripe_cache_size(struct mddev mddev, const* char *page, size_t len)
6905	{
6906	struct r5conf *conf;
6907	unsigned long new;
6908	int err;
6909
6910	if (len >= PAGE_SIZE)
6911	return -EINVAL;
6912	if (kstrtoul(s: page, base: `10`, res: &new))
6913	return -EINVAL;
6914	err = mddev_lock(mddev);
6915	if (err)
6916	return err;
6917	conf = mddev->private;
6918	if (!conf)
6919	err = -ENODEV;
6920	else
6921	err = raid5_set_cache_size(mddev, new);
6922	mddev_unlock(mddev);
6923
6924	return err ?: len;
6925	}
6926
6927	static struct md_sysfs_entry
6928	raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO \| S_IWUSR,
6929	raid5_show_stripe_cache_size,
6930	raid5_store_stripe_cache_size);
6931
6932	static ssize_t
6933	raid5_show_rmw_level(struct mddev mddev, char* *page)
6934	{
6935	struct r5conf *conf = mddev->private;
6936	if (conf)
6937	return sprintf(buf: page, fmt: "%d\n", conf->rmw_level);
6938	else
6939	return `0`;
6940	}
6941
6942	static ssize_t
6943	raid5_store_rmw_level(struct mddev mddev, const* char *page, size_t len)
6944	{
6945	struct r5conf *conf = mddev->private;
6946	unsigned long new;
6947
6948	if (!conf)
6949	return -ENODEV;
6950
6951	if (len >= PAGE_SIZE)
6952	return -EINVAL;
6953
6954	if (kstrtoul(s: page, base: `10`, res: &new))
6955	return -EINVAL;
6956
6957	if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6958	return -EINVAL;
6959
6960	if (new != PARITY_DISABLE_RMW &&
6961	new != PARITY_ENABLE_RMW &&
6962	new != PARITY_PREFER_RMW)
6963	return -EINVAL;
6964
6965	conf->rmw_level = new;
6966	return len;
6967	}
6968
6969	static struct md_sysfs_entry
6970	raid5_rmw_level = __ATTR(rmw_level, S_IRUGO \| S_IWUSR,
6971	raid5_show_rmw_level,
6972	raid5_store_rmw_level);
6973
6974	static ssize_t
6975	raid5_show_stripe_size(struct mddev mddev, char* *page)
6976	{
6977	struct r5conf *conf;
6978	int ret = `0`;
6979
6980	spin_lock(lock: &mddev->lock);
6981	conf = mddev->private;
6982	if (conf)
6983	ret = sprintf(buf: page, fmt: "%lu\n", RAID5_STRIPE_SIZE(conf));
6984	spin_unlock(lock: &mddev->lock);
6985	return ret;
6986	}
6987
6988	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
6989	static ssize_t
6990	raid5_store_stripe_size(struct mddev mddev, const* char *page, size_t len)
6991	{
6992	struct r5conf *conf;
6993	unsigned long new;
6994	int err;
6995	int size;
6996
6997	if (len >= PAGE_SIZE)
6998	return -EINVAL;
6999	if (kstrtoul(page, `10`, &new))
7000	return -EINVAL;
7001
7002	/*
7003	* The value should not be bigger than PAGE_SIZE. It requires to
7004	* be multiple of DEFAULT_STRIPE_SIZE and the value should be power
7005	* of two.
7006	*/
7007	if (new % DEFAULT_STRIPE_SIZE != `0` \|\|
7008	new > PAGE_SIZE \|\| new == `0` \|\|
7009	new != roundup_pow_of_two(new))
7010	return -EINVAL;
7011
7012	err = mddev_suspend_and_lock(mddev);
7013	if (err)
7014	return err;
7015
7016	conf = mddev->private;
7017	if (!conf) {
7018	err = -ENODEV;
7019	goto out_unlock;
7020	}
7021
7022	if (new == conf->stripe_size)
7023	goto out_unlock;
7024
7025	pr_debug("md/raid: change stripe_size from %lu to %lu\n",
7026	conf->stripe_size, new);
7027
7028	if (mddev->sync_thread \|\|
7029	test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) \|\|
7030	mddev->reshape_position != MaxSector \|\|
7031	mddev->sysfs_active) {
7032	err = -EBUSY;
7033	goto out_unlock;
7034	}
7035
7036	mutex_lock(&conf->cache_size_mutex);
7037	size = conf->max_nr_stripes;
7038
7039	shrink_stripes(conf);
7040
7041	conf->stripe_size = new;
7042	conf->stripe_shift = ilog2(new) - `9`;
7043	conf->stripe_sectors = new >> `9`;
7044	if (grow_stripes(conf, size)) {
7045	pr_warn("md/raid:%s: couldn't allocate buffers\n",
7046	mdname(mddev));
7047	err = -ENOMEM;
7048	}
7049	mutex_unlock(&conf->cache_size_mutex);
7050
7051	out_unlock:
7052	mddev_unlock_and_resume(mddev);
7053	return err ?: len;
7054	}
7055
7056	static struct md_sysfs_entry
7057	raid5_stripe_size = __ATTR(stripe_size, `0644`,
7058	raid5_show_stripe_size,
7059	raid5_store_stripe_size);
7060	#else
7061	static struct md_sysfs_entry
7062	raid5_stripe_size = __ATTR(stripe_size, `0444`,
7063	raid5_show_stripe_size,
7064	NULL);
7065	#endif
7066
7067	static ssize_t
7068	raid5_show_preread_threshold(struct mddev mddev, char* *page)
7069	{
7070	struct r5conf *conf;
7071	int ret = `0`;
7072	spin_lock(lock: &mddev->lock);
7073	conf = mddev->private;
7074	if (conf)
7075	ret = sprintf(buf: page, fmt: "%d\n", conf->bypass_threshold);
7076	spin_unlock(lock: &mddev->lock);
7077	return ret;
7078	}
7079
7080	static ssize_t
7081	raid5_store_preread_threshold(struct mddev mddev, const* char *page, size_t len)
7082	{
7083	struct r5conf *conf;
7084	unsigned long new;
7085	int err;
7086
7087	if (len >= PAGE_SIZE)
7088	return -EINVAL;
7089	if (kstrtoul(s: page, base: `10`, res: &new))
7090	return -EINVAL;
7091
7092	err = mddev_lock(mddev);
7093	if (err)
7094	return err;
7095	conf = mddev->private;
7096	if (!conf)
7097	err = -ENODEV;
7098	else if (new > conf->min_nr_stripes)
7099	err = -EINVAL;
7100	else
7101	conf->bypass_threshold = new;
7102	mddev_unlock(mddev);
7103	return err ?: len;
7104	}
7105
7106	static struct md_sysfs_entry
7107	raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
7108	S_IRUGO \| S_IWUSR,
7109	raid5_show_preread_threshold,
7110	raid5_store_preread_threshold);
7111
7112	static ssize_t
7113	raid5_show_skip_copy(struct mddev mddev, char* *page)
7114	{
7115	struct r5conf *conf;
7116	int ret = `0`;
7117	spin_lock(lock: &mddev->lock);
7118	conf = mddev->private;
7119	if (conf)
7120	ret = sprintf(buf: page, fmt: "%d\n", conf->skip_copy);
7121	spin_unlock(lock: &mddev->lock);
7122	return ret;
7123	}
7124
7125	static ssize_t
7126	raid5_store_skip_copy(struct mddev mddev, const* char *page, size_t len)
7127	{
7128	struct r5conf *conf;
7129	unsigned long new;
7130	int err;
7131
7132	if (len >= PAGE_SIZE)
7133	return -EINVAL;
7134	if (kstrtoul(s: page, base: `10`, res: &new))
7135	return -EINVAL;
7136	new = !!new;
7137
7138	err = mddev_suspend_and_lock(mddev);
7139	if (err)
7140	return err;
7141	conf = mddev->private;
7142	if (!conf)
7143	err = -ENODEV;
7144	else if (new != conf->skip_copy) {
7145	struct request_queue *q = mddev->queue;
7146
7147	conf->skip_copy = new;
7148	if (new)
7149	blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
7150	else
7151	blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
7152	}
7153	mddev_unlock_and_resume(mddev);
7154	return err ?: len;
7155	}
7156
7157	static struct md_sysfs_entry
7158	raid5_skip_copy = __ATTR(skip_copy, S_IRUGO \| S_IWUSR,
7159	raid5_show_skip_copy,
7160	raid5_store_skip_copy);
7161
7162	static ssize_t
7163	stripe_cache_active_show(struct mddev mddev, char* *page)
7164	{
7165	struct r5conf *conf = mddev->private;
7166	if (conf)
7167	return sprintf(buf: page, fmt: "%d\n", atomic_read(v: &conf->active_stripes));
7168	else
7169	return `0`;
7170	}
7171
/* Read-only sysfs attribute "stripe_cache_active" (uses stripe_cache_active_show). */
static struct md_sysfs_entry
raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
7174
7175	static ssize_t
7176	raid5_show_group_thread_cnt(struct mddev mddev, char* *page)
7177	{
7178	struct r5conf *conf;
7179	int ret = `0`;
7180	spin_lock(lock: &mddev->lock);
7181	conf = mddev->private;
7182	if (conf)
7183	ret = sprintf(buf: page, fmt: "%d\n", conf->worker_cnt_per_group);
7184	spin_unlock(lock: &mddev->lock);
7185	return ret;
7186	}
7187
7188	static int alloc_thread_groups(struct r5conf conf, int* cnt,
7189	int *group_cnt,
7190	struct r5worker_group **worker_groups);
7191	static ssize_t
7192	raid5_store_group_thread_cnt(struct mddev mddev, const* char *page, size_t len)
7193	{
7194	struct r5conf *conf;
7195	unsigned int new;
7196	int err;
7197	struct r5worker_group new_groups, old_groups;
7198	int group_cnt;
7199
7200	if (len >= PAGE_SIZE)
7201	return -EINVAL;
7202	if (kstrtouint(s: page, base: `10`, res: &new))
7203	return -EINVAL;
7204	/ 8192 should be big enough /
7205	if (new > `8192`)
7206	return -EINVAL;
7207
7208	err = mddev_suspend_and_lock(mddev);
7209	if (err)
7210	return err;
7211	conf = mddev->private;
7212	if (!conf)
7213	err = -ENODEV;
7214	else if (new != conf->worker_cnt_per_group) {
7215	old_groups = conf->worker_groups;
7216	if (old_groups)
7217	flush_workqueue(raid5_wq);
7218
7219	err = alloc_thread_groups(conf, cnt: new, group_cnt: &group_cnt, worker_groups: &new_groups);
7220	if (!err) {
7221	spin_lock_irq(lock: &conf->device_lock);
7222	conf->group_cnt = group_cnt;
7223	conf->worker_cnt_per_group = new;
7224	conf->worker_groups = new_groups;
7225	spin_unlock_irq(lock: &conf->device_lock);
7226
7227	if (old_groups)
7228	kfree(objp: old_groups[`0`].workers);
7229	kfree(objp: old_groups);
7230	}
7231	}
7232	mddev_unlock_and_resume(mddev);
7233
7234	return err ?: len;
7235	}
7236
7237	static struct md_sysfs_entry
7238	raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO \| S_IWUSR,
7239	raid5_show_group_thread_cnt,
7240	raid5_store_group_thread_cnt);
7241
/* NULL-terminated list of the raid4/5/6 sysfs attributes. */
static struct attribute *raid5_attrs[] = {
	&raid5_stripecache_size.attr,
	&raid5_stripecache_active.attr,
	&raid5_preread_bypass_threshold.attr,
	&raid5_group_thread_cnt.attr,
	&raid5_skip_copy.attr,
	&raid5_rmw_level.attr,
	&raid5_stripe_size.attr,
	&r5c_journal_mode.attr,
	&ppl_write_hint.attr,
	NULL,
};
/* Unnamed attribute group wrapping raid5_attrs[]. */
static const struct attribute_group raid5_attrs_group = {
	.name = NULL,
	.attrs = raid5_attrs,
};
7258
7259	static int alloc_thread_groups(struct r5conf conf, int* cnt, int *group_cnt,
7260	struct r5worker_group **worker_groups)
7261	{
7262	int i, j, k;
7263	ssize_t size;
7264	struct r5worker *workers;
7265
7266	if (cnt == `0`) {
7267	*group_cnt = `0`;
7268	*worker_groups = NULL;
7269	return `0`;
7270	}
7271	*group_cnt = num_possible_nodes();
7272	size = sizeof(struct r5worker) * cnt;
7273	workers = kcalloc(n: size, size: *group_cnt, GFP_NOIO);
7274	worker_groups = kcalloc(n: group_cnt, size: sizeof(struct r5worker_group),
7275	GFP_NOIO);
7276	if (!*worker_groups \|\| !workers) {
7277	kfree(objp: workers);
7278	kfree(objp: *worker_groups);
7279	return -ENOMEM;
7280	}
7281
7282	for (i = `0`; i < *group_cnt; i++) {
7283	struct r5worker_group *group;
7284
7285	group = &(*worker_groups)[i];
7286	INIT_LIST_HEAD(list: &group->handle_list);
7287	INIT_LIST_HEAD(list: &group->loprio_list);
7288	group->conf = conf;
7289	group->workers = workers + i * cnt;
7290
7291	for (j = `0`; j < cnt; j++) {
7292	struct r5worker *worker = group->workers + j;
7293	worker->group = group;
7294	INIT_WORK(&worker->work, raid5_do_work);
7295
7296	for (k = `0`; k < NR_STRIPE_HASH_LOCKS; k++)
7297	INIT_LIST_HEAD(list: worker->temp_inactive_list + k);
7298	}
7299	}
7300
7301	return `0`;
7302	}
7303
7304	static void free_thread_groups(struct r5conf *conf)
7305	{
7306	if (conf->worker_groups)
7307	kfree(objp: conf->worker_groups[`0`].workers);
7308	kfree(objp: conf->worker_groups);
7309	conf->worker_groups = NULL;
7310	}
7311
7312	static sector_t
7313	raid5_size(struct mddev mddev, sector_t sectors, int* raid_disks)
7314	{
7315	struct r5conf *conf = mddev->private;
7316
7317	if (!sectors)
7318	sectors = mddev->dev_sectors;
7319	if (!raid_disks)
7320	/ size is defined by the smallest of previous and new size /
7321	raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
7322
7323	sectors &= ~((sector_t)conf->chunk_sectors - `1`);
7324	sectors &= ~((sector_t)conf->prev_chunk_sectors - `1`);
7325	return sectors * (raid_disks - conf->max_degraded);
7326	}
7327
7328	static void free_scratch_buffer(struct r5conf conf, struct* raid5_percpu *percpu)
7329	{
7330	safe_put_page(p: percpu->spare_page);
7331	percpu->spare_page = NULL;
7332	kvfree(addr: percpu->scribble);
7333	percpu->scribble = NULL;
7334	}
7335
7336	static int alloc_scratch_buffer(struct r5conf conf, struct* raid5_percpu *percpu)
7337	{
7338	if (conf->level == `6` && !percpu->spare_page) {
7339	percpu->spare_page = alloc_page(GFP_KERNEL);
7340	if (!percpu->spare_page)
7341	return -ENOMEM;
7342	}
7343
7344	if (scribble_alloc(percpu,
7345	max(conf->raid_disks,
7346	conf->previous_raid_disks),
7347	max(conf->chunk_sectors,
7348	conf->prev_chunk_sectors)
7349	/ RAID5_STRIPE_SECTORS(conf))) {
7350	free_scratch_buffer(conf, percpu);
7351	return -ENOMEM;
7352	}
7353
7354	local_lock_init(&percpu->lock);
7355	return `0`;
7356	}
7357
7358	static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
7359	{
7360	struct r5conf conf = hlist_entry_safe(node, struct* r5conf, node);
7361
7362	free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
7363	return `0`;
7364	}
7365
7366	static void raid5_free_percpu(struct r5conf *conf)
7367	{
7368	if (!conf->percpu)
7369	return;
7370
7371	cpuhp_state_remove_instance(state: CPUHP_MD_RAID5_PREPARE, node: &conf->node);
7372	free_percpu(pdata: conf->percpu);
7373	}
7374
7375	static void free_conf(struct r5conf *conf)
7376	{
7377	int i;
7378
7379	log_exit(conf);
7380
7381	shrinker_free(shrinker: conf->shrinker);
7382	free_thread_groups(conf);
7383	shrink_stripes(conf);
7384	raid5_free_percpu(conf);
7385	for (i = `0`; i < conf->pool_size; i++)
7386	if (conf->disks[i].extra_page)
7387	put_page(page: conf->disks[i].extra_page);
7388	kfree(objp: conf->disks);
7389	bioset_exit(&conf->bio_split);
7390	kfree(objp: conf->stripe_hashtbl);
7391	kfree(objp: conf->pending_data);
7392	kfree(objp: conf);
7393	}
7394
7395	static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
7396	{
7397	struct r5conf conf = hlist_entry_safe(node, struct* r5conf, node);
7398	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
7399
7400	if (alloc_scratch_buffer(conf, percpu)) {
7401	pr_warn("%s: failed memory allocation for cpu%u\n",
7402	__func__, cpu);
7403	return -ENOMEM;
7404	}
7405	return `0`;
7406	}
7407
7408	static int raid5_alloc_percpu(struct r5conf *conf)
7409	{
7410	int err = `0`;
7411
7412	conf->percpu = alloc_percpu(struct raid5_percpu);
7413	if (!conf->percpu)
7414	return -ENOMEM;
7415
7416	err = cpuhp_state_add_instance(state: CPUHP_MD_RAID5_PREPARE, node: &conf->node);
7417	if (!err) {
7418	conf->scribble_disks = max(conf->raid_disks,
7419	conf->previous_raid_disks);
7420	conf->scribble_sectors = max(conf->chunk_sectors,
7421	conf->prev_chunk_sectors);
7422	}
7423	return err;
7424	}
7425
7426	static unsigned long raid5_cache_scan(struct shrinker *shrink,
7427	struct shrink_control *sc)
7428	{
7429	struct r5conf *conf = shrink->private_data;
7430	unsigned long ret = SHRINK_STOP;
7431
7432	if (mutex_trylock(lock: &conf->cache_size_mutex)) {
7433	ret= `0`;
7434	while (ret < sc->nr_to_scan &&
7435	conf->max_nr_stripes > conf->min_nr_stripes) {
7436	if (drop_one_stripe(conf) == `0`) {
7437	ret = SHRINK_STOP;
7438	break;
7439	}
7440	ret++;
7441	}
7442	mutex_unlock(lock: &conf->cache_size_mutex);
7443	}
7444	return ret;
7445	}
7446
7447	static unsigned long raid5_cache_count(struct shrinker *shrink,
7448	struct shrink_control *sc)
7449	{
7450	struct r5conf *conf = shrink->private_data;
7451
7452	if (conf->max_nr_stripes < conf->min_nr_stripes)
7453	/ unlikely, but not impossible /
7454	return `0`;
7455	return conf->max_nr_stripes - conf->min_nr_stripes;
7456	}
7457
7458	static struct r5conf setup_conf(struct* mddev *mddev)
7459	{
7460	struct r5conf *conf;
7461	int raid_disk, memory, max_disks;
7462	struct md_rdev *rdev;
7463	struct disk_info *disk;
7464	char pers_name[`6`];
7465	int i;
7466	int group_cnt;
7467	struct r5worker_group *new_group;
7468	int ret = -ENOMEM;
7469
7470	if (mddev->new_level != `5`
7471	&& mddev->new_level != `4`
7472	&& mddev->new_level != `6`) {
7473	pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
7474	mdname(mddev), mddev->new_level);
7475	return ERR_PTR(error: -EIO);
7476	}
7477	if ((mddev->new_level == `5`
7478	&& !algorithm_valid_raid5(layout: mddev->new_layout)) \|\|
7479	(mddev->new_level == `6`
7480	&& !algorithm_valid_raid6(layout: mddev->new_layout))) {
7481	pr_warn("md/raid:%s: layout %d not supported\n",
7482	mdname(mddev), mddev->new_layout);
7483	return ERR_PTR(error: -EIO);
7484	}
7485	if (mddev->new_level == `6` && mddev->raid_disks < `4`) {
7486	pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
7487	mdname(mddev), mddev->raid_disks);
7488	return ERR_PTR(error: -EINVAL);
7489	}
7490
7491	if (!mddev->new_chunk_sectors \|\|
7492	(mddev->new_chunk_sectors << `9`) % PAGE_SIZE \|\|
7493	!is_power_of_2(n: mddev->new_chunk_sectors)) {
7494	pr_warn("md/raid:%s: invalid chunk size %d\n",
7495	mdname(mddev), mddev->new_chunk_sectors << `9`);
7496	return ERR_PTR(error: -EINVAL);
7497	}
7498
7499	conf = kzalloc(size: sizeof(struct r5conf), GFP_KERNEL);
7500	if (conf == NULL)
7501	goto abort;
7502
7503	#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
7504	conf->stripe_size = DEFAULT_STRIPE_SIZE;
7505	conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - `9`;
7506	conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> `9`;
7507	#endif
7508	INIT_LIST_HEAD(list: &conf->free_list);
7509	INIT_LIST_HEAD(list: &conf->pending_list);
7510	conf->pending_data = kcalloc(PENDING_IO_MAX,
7511	size: sizeof(struct r5pending_data),
7512	GFP_KERNEL);
7513	if (!conf->pending_data)
7514	goto abort;
7515	for (i = `0`; i < PENDING_IO_MAX; i++)
7516	list_add(new: &conf->pending_data[i].sibling, head: &conf->free_list);
7517	/ Don't enable multi-threading by default/
7518	if (!alloc_thread_groups(conf, cnt: `0`, group_cnt: &group_cnt, worker_groups: &new_group)) {
7519	conf->group_cnt = group_cnt;
7520	conf->worker_cnt_per_group = `0`;
7521	conf->worker_groups = new_group;
7522	} else
7523	goto abort;
7524	spin_lock_init(&conf->device_lock);
7525	seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
7526	mutex_init(&conf->cache_size_mutex);
7527
7528	init_waitqueue_head(&conf->wait_for_quiescent);
7529	init_waitqueue_head(&conf->wait_for_stripe);
7530	init_waitqueue_head(&conf->wait_for_overlap);
7531	INIT_LIST_HEAD(list: &conf->handle_list);
7532	INIT_LIST_HEAD(list: &conf->loprio_list);
7533	INIT_LIST_HEAD(list: &conf->hold_list);
7534	INIT_LIST_HEAD(list: &conf->delayed_list);
7535	INIT_LIST_HEAD(list: &conf->bitmap_list);
7536	init_llist_head(list: &conf->released_stripes);
7537	atomic_set(v: &conf->active_stripes, i: `0`);
7538	atomic_set(v: &conf->preread_active_stripes, i: `0`);
7539	atomic_set(v: &conf->active_aligned_reads, i: `0`);
7540	spin_lock_init(&conf->pending_bios_lock);
7541	conf->batch_bio_dispatch = true;
7542	rdev_for_each(rdev, mddev) {
7543	if (test_bit(Journal, &rdev->flags))
7544	continue;
7545	if (bdev_nonrot(bdev: rdev->bdev)) {
7546	conf->batch_bio_dispatch = false;
7547	break;
7548	}
7549	}
7550
7551	conf->bypass_threshold = BYPASS_THRESHOLD;
7552	conf->recovery_disabled = mddev->recovery_disabled - `1`;
7553
7554	conf->raid_disks = mddev->raid_disks;
7555	if (mddev->reshape_position == MaxSector)
7556	conf->previous_raid_disks = mddev->raid_disks;
7557	else
7558	conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
7559	max_disks = max(conf->raid_disks, conf->previous_raid_disks);
7560
7561	conf->disks = kcalloc(n: max_disks, size: sizeof(struct disk_info),
7562	GFP_KERNEL);
7563
7564	if (!conf->disks)
7565	goto abort;
7566
7567	for (i = `0`; i < max_disks; i++) {
7568	conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
7569	if (!conf->disks[i].extra_page)
7570	goto abort;
7571	}
7572
7573	ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, `0`, flags: `0`);
7574	if (ret)
7575	goto abort;
7576	conf->mddev = mddev;
7577
7578	ret = -ENOMEM;
7579	conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL);
7580	if (!conf->stripe_hashtbl)
7581	goto abort;
7582
7583	/ We init hash_locks[0] separately to that it can be used*
7584	* as the reference lock in the spin_lock_nest_lock() call
7585	* in lock_all_device_hash_locks_irq in order to convince
7586	* lockdep that we know what we are doing.
7587	*/
7588	spin_lock_init(conf->hash_locks);
7589	for (i = `1`; i < NR_STRIPE_HASH_LOCKS; i++)
7590	spin_lock_init(conf->hash_locks + i);
7591
7592	for (i = `0`; i < NR_STRIPE_HASH_LOCKS; i++)
7593	INIT_LIST_HEAD(list: conf->inactive_list + i);
7594
7595	for (i = `0`; i < NR_STRIPE_HASH_LOCKS; i++)
7596	INIT_LIST_HEAD(list: conf->temp_inactive_list + i);
7597
7598	atomic_set(v: &conf->r5c_cached_full_stripes, i: `0`);
7599	INIT_LIST_HEAD(list: &conf->r5c_full_stripe_list);
7600	atomic_set(v: &conf->r5c_cached_partial_stripes, i: `0`);
7601	INIT_LIST_HEAD(list: &conf->r5c_partial_stripe_list);
7602	atomic_set(v: &conf->r5c_flushing_full_stripes, i: `0`);
7603	atomic_set(v: &conf->r5c_flushing_partial_stripes, i: `0`);
7604
7605	conf->level = mddev->new_level;
7606	conf->chunk_sectors = mddev->new_chunk_sectors;
7607	ret = raid5_alloc_percpu(conf);
7608	if (ret)
7609	goto abort;
7610
7611	pr_debug("raid456: run(%s) called.\n", mdname(mddev));
7612
7613	ret = -EIO;
7614	rdev_for_each(rdev, mddev) {
7615	raid_disk = rdev->raid_disk;
7616	if (raid_disk >= max_disks
7617	\|\| raid_disk < `0` \|\| test_bit(Journal, &rdev->flags))
7618	continue;
7619	disk = conf->disks + raid_disk;
7620
7621	if (test_bit(Replacement, &rdev->flags)) {
7622	if (disk->replacement)
7623	goto abort;
7624	RCU_INIT_POINTER(disk->replacement, rdev);
7625	} else {
7626	if (disk->rdev)
7627	goto abort;
7628	RCU_INIT_POINTER(disk->rdev, rdev);
7629	}
7630
7631	if (test_bit(In_sync, &rdev->flags)) {
7632	pr_info("md/raid:%s: device %pg operational as raid disk %d\n",
7633	mdname(mddev), rdev->bdev, raid_disk);
7634	} else if (rdev->saved_raid_disk != raid_disk)
7635	/ Cannot rely on bitmap to complete recovery /
7636	conf->fullsync = `1`;
7637	}
7638
7639	conf->level = mddev->new_level;
7640	if (conf->level == `6`) {
7641	conf->max_degraded = `2`;
7642	if (raid6_call.xor_syndrome)
7643	conf->rmw_level = PARITY_ENABLE_RMW;
7644	else
7645	conf->rmw_level = PARITY_DISABLE_RMW;
7646	} else {
7647	conf->max_degraded = `1`;
7648	conf->rmw_level = PARITY_ENABLE_RMW;
7649	}
7650	conf->algorithm = mddev->new_layout;
7651	conf->reshape_progress = mddev->reshape_position;
7652	if (conf->reshape_progress != MaxSector) {
7653	conf->prev_chunk_sectors = mddev->chunk_sectors;
7654	conf->prev_algo = mddev->layout;
7655	} else {
7656	conf->prev_chunk_sectors = conf->chunk_sectors;
7657	conf->prev_algo = conf->algorithm;
7658	}
7659
7660	conf->min_nr_stripes = NR_STRIPES;
7661	if (mddev->reshape_position != MaxSector) {
7662	int stripes = max_t(int,
7663	((mddev->chunk_sectors << `9`) / RAID5_STRIPE_SIZE(conf)) * `4`,
7664	((mddev->new_chunk_sectors << `9`) / RAID5_STRIPE_SIZE(conf)) * `4`);
7665	conf->min_nr_stripes = max(NR_STRIPES, stripes);
7666	if (conf->min_nr_stripes != NR_STRIPES)
7667	pr_info("md/raid:%s: force stripe size %d for reshape\n",
7668	mdname(mddev), conf->min_nr_stripes);
7669	}
7670	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
7671	max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / `1024`;
7672	atomic_set(v: &conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
7673	if (grow_stripes(conf, num: conf->min_nr_stripes)) {
7674	pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
7675	mdname(mddev), memory);
7676	ret = -ENOMEM;
7677	goto abort;
7678	} else
7679	pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
7680	/*
7681	* Losing a stripe head costs more than the time to refill it,
7682	* it reduces the queue depth and so can hurt throughput.
7683	* So set it rather large, scaled by number of devices.
7684	*/
7685	conf->shrinker = shrinker_alloc(flags: `0`, fmt: "md-raid5:%s", mdname(mddev));
7686	if (!conf->shrinker) {
7687	ret = -ENOMEM;
7688	pr_warn("md/raid:%s: couldn't allocate shrinker.\n",
7689	mdname(mddev));
7690	goto abort;
7691	}
7692
7693	conf->shrinker->seeks = DEFAULT_SEEKS * conf->raid_disks * `4`;
7694	conf->shrinker->scan_objects = raid5_cache_scan;
7695	conf->shrinker->count_objects = raid5_cache_count;
7696	conf->shrinker->batch = `128`;
7697	conf->shrinker->private_data = conf;
7698
7699	shrinker_register(shrinker: conf->shrinker);
7700
7701	sprintf(buf: pers_name, fmt: "raid%d", mddev->new_level);
7702	rcu_assign_pointer(conf->thread,
7703	md_register_thread(raid5d, mddev, pers_name));
7704	if (!conf->thread) {
7705	pr_warn("md/raid:%s: couldn't allocate thread.\n",
7706	mdname(mddev));
7707	ret = -ENOMEM;
7708	goto abort;
7709	}
7710
7711	return conf;
7712
7713	abort:
7714	if (conf)
7715	free_conf(conf);
7716	return ERR_PTR(error: ret);
7717	}
7718
7719	static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
7720	{
7721	switch (algo) {
7722	case ALGORITHM_PARITY_0:
7723	if (raid_disk < max_degraded)
7724	return `1`;
7725	break;
7726	case ALGORITHM_PARITY_N:
7727	if (raid_disk >= raid_disks - max_degraded)
7728	return `1`;
7729	break;
7730	case ALGORITHM_PARITY_0_6:
7731	if (raid_disk == `0` \|\|
7732	raid_disk == raid_disks - `1`)
7733	return `1`;
7734	break;
7735	case ALGORITHM_LEFT_ASYMMETRIC_6:
7736	case ALGORITHM_RIGHT_ASYMMETRIC_6:
7737	case ALGORITHM_LEFT_SYMMETRIC_6:
7738	case ALGORITHM_RIGHT_SYMMETRIC_6:
7739	if (raid_disk == raid_disks - `1`)
7740	return `1`;
7741	}
7742	return `0`;
7743	}
7744
7745	static void raid5_set_io_opt(struct r5conf *conf)
7746	{
7747	blk_queue_io_opt(q: conf->mddev->queue, opt: (conf->chunk_sectors << `9`) *
7748	(conf->raid_disks - conf->max_degraded));
7749	}
7750
7751	static int raid5_run(struct mddev *mddev)
7752	{
7753	struct r5conf *conf;
7754	int dirty_parity_disks = `0`;
7755	struct md_rdev *rdev;
7756	struct md_rdev *journal_dev = NULL;
7757	sector_t reshape_offset = `0`;
7758	int i;
7759	long long min_offset_diff = `0`;
7760	int first = `1`;
7761
7762	if (mddev->recovery_cp != MaxSector)
7763	pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
7764	mdname(mddev));
7765
7766	rdev_for_each(rdev, mddev) {
7767	long long diff;
7768
7769	if (test_bit(Journal, &rdev->flags)) {
7770	journal_dev = rdev;
7771	continue;
7772	}
7773	if (rdev->raid_disk < `0`)
7774	continue;
7775	diff = (rdev->new_data_offset - rdev->data_offset);
7776	if (first) {
7777	min_offset_diff = diff;
7778	first = `0`;
7779	} else if (mddev->reshape_backwards &&
7780	diff < min_offset_diff)
7781	min_offset_diff = diff;
7782	else if (!mddev->reshape_backwards &&
7783	diff > min_offset_diff)
7784	min_offset_diff = diff;
7785	}
7786
7787	if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) \|\| journal_dev) &&
7788	(mddev->bitmap_info.offset \|\| mddev->bitmap_info.file)) {
7789	pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
7790	mdname(mddev));
7791	return -EINVAL;
7792	}
7793
7794	if (mddev->reshape_position != MaxSector) {
7795	/ Check that we can continue the reshape.*
7796	* Difficulties arise if the stripe we would write to
7797	* next is at or after the stripe we would read from next.
7798	* For a reshape that changes the number of devices, this
7799	* is only possible for a very short time, and mdadm makes
7800	* sure that time appears to have past before assembling
7801	* the array. So we fail if that time hasn't passed.
7802	* For a reshape that keeps the number of devices the same
7803	* mdadm must be monitoring the reshape can keeping the
7804	* critical areas read-only and backed up. It will start
7805	* the array in read-only mode, so we check for that.
7806	*/
7807	sector_t here_new, here_old;
7808	int old_disks;
7809	int max_degraded = (mddev->level == `6` ? `2` : `1`);
7810	int chunk_sectors;
7811	int new_data_disks;
7812
7813	if (journal_dev) {
7814	pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
7815	mdname(mddev));
7816	return -EINVAL;
7817	}
7818
7819	if (mddev->new_level != mddev->level) {
7820	pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
7821	mdname(mddev));
7822	return -EINVAL;
7823	}
7824	old_disks = mddev->raid_disks - mddev->delta_disks;
7825	/ reshape_position must be on a new-stripe boundary, and one*
7826	* further up in new geometry must map after here in old
7827	* geometry.
7828	* If the chunk sizes are different, then as we perform reshape
7829	* in units of the largest of the two, reshape_position needs
7830	* be a multiple of the largest chunk size times new data disks.
7831	*/
7832	here_new = mddev->reshape_position;
7833	chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
7834	new_data_disks = mddev->raid_disks - max_degraded;
7835	if (sector_div(here_new, chunk_sectors * new_data_disks)) {
7836	pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
7837	mdname(mddev));
7838	return -EINVAL;
7839	}
7840	reshape_offset = here_new * chunk_sectors;
7841	/ here_new is the stripe we will write to /
7842	here_old = mddev->reshape_position;
7843	sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
7844	/ here_old is the first stripe that we might need to read*
7845	* from */
7846	if (mddev->delta_disks == `0`) {
7847	/ We cannot be sure it is safe to start an in-place*
7848	* reshape. It is only safe if user-space is monitoring
7849	* and taking constant backups.
7850	* mdadm always starts a situation like this in
7851	* readonly mode so it can take control before
7852	* allowing any writes. So just check for that.
7853	*/
7854	if (abs(min_offset_diff) >= mddev->chunk_sectors &&
7855	abs(min_offset_diff) >= mddev->new_chunk_sectors)
7856	/ not really in-place - so OK /;
7857	else if (mddev->ro == `0`) {
7858	pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
7859	mdname(mddev));
7860	return -EINVAL;
7861	}
7862	} else if (mddev->reshape_backwards
7863	? (here_new * chunk_sectors + min_offset_diff <=
7864	here_old * chunk_sectors)
7865	: (here_new * chunk_sectors >=
7866	here_old * chunk_sectors + (-min_offset_diff))) {
7867	/ Reading from the same stripe as writing to - bad /
7868	pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7869	mdname(mddev));
7870	return -EINVAL;
7871	}
7872	pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7873	/ OK, we should be able to continue; /
7874	} else {
7875	BUG_ON(mddev->level != mddev->new_level);
7876	BUG_ON(mddev->layout != mddev->new_layout);
7877	BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7878	BUG_ON(mddev->delta_disks != `0`);
7879	}
7880
7881	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
7882	test_bit(MD_HAS_PPL, &mddev->flags)) {
7883	pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
7884	mdname(mddev));
7885	clear_bit(nr: MD_HAS_PPL, addr: &mddev->flags);
7886	clear_bit(nr: MD_HAS_MULTIPLE_PPLS, addr: &mddev->flags);
7887	}
7888
7889	if (mddev->private == NULL)
7890	conf = setup_conf(mddev);
7891	else
7892	conf = mddev->private;
7893
7894	if (IS_ERR(ptr: conf))
7895	return PTR_ERR(ptr: conf);
7896
7897	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7898	if (!journal_dev) {
7899	pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7900	mdname(mddev));
7901	mddev->ro = `1`;
7902	set_disk_ro(disk: mddev->gendisk, read_only: `1`);
7903	} else if (mddev->recovery_cp == MaxSector)
7904	set_bit(nr: MD_JOURNAL_CLEAN, addr: &mddev->flags);
7905	}
7906
7907	conf->min_offset_diff = min_offset_diff;
7908	rcu_assign_pointer(mddev->thread, conf->thread);
7909	rcu_assign_pointer(conf->thread, NULL);
7910	mddev->private = conf;
7911
7912	for (i = `0`; i < conf->raid_disks && conf->previous_raid_disks;
7913	i++) {
7914	rdev = rdev_mdlock_deref(mddev, rdev: conf->disks[i].rdev);
7915	if (!rdev && conf->disks[i].replacement) {
7916	/ The replacement is all we have yet /
7917	rdev = rdev_mdlock_deref(mddev,
7918	rdev: conf->disks[i].replacement);
7919	conf->disks[i].replacement = NULL;
7920	clear_bit(nr: Replacement, addr: &rdev->flags);
7921	rcu_assign_pointer(conf->disks[i].rdev, rdev);
7922	}
7923	if (!rdev)
7924	continue;
7925	if (rcu_access_pointer(conf->disks[i].replacement) &&
7926	conf->reshape_progress != MaxSector) {
7927	/ replacements and reshape simply do not mix. /
7928	pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7929	goto abort;
7930	}
7931	if (test_bit(In_sync, &rdev->flags))
7932	continue;
7933	/ This disc is not fully in-sync. However if it*
7934	* just stored parity (beyond the recovery_offset),
7935	* when we don't need to be concerned about the
7936	* array being dirty.
7937	* When reshape goes 'backwards', we never have
7938	* partially completed devices, so we only need
7939	* to worry about reshape going forwards.
7940	*/
7941	/ Hack because v0.91 doesn't store recovery_offset properly. /
7942	if (mddev->major_version == `0` &&
7943	mddev->minor_version > `90`)
7944	rdev->recovery_offset = reshape_offset;
7945
7946	if (rdev->recovery_offset < reshape_offset) {
7947	/ We need to check old and new layout /
7948	if (!only_parity(raid_disk: rdev->raid_disk,
7949	algo: conf->algorithm,
7950	raid_disks: conf->raid_disks,
7951	max_degraded: conf->max_degraded))
7952	continue;
7953	}
7954	if (!only_parity(raid_disk: rdev->raid_disk,
7955	algo: conf->prev_algo,
7956	raid_disks: conf->previous_raid_disks,
7957	max_degraded: conf->max_degraded))
7958	continue;
7959	dirty_parity_disks++;
7960	}
7961
7962	/*
7963	* 0 for a fully functional array, 1 or 2 for a degraded array.
7964	*/
7965	mddev->degraded = raid5_calc_degraded(conf);
7966
7967	if (has_failed(conf)) {
7968	pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7969	mdname(mddev), mddev->degraded, conf->raid_disks);
7970	goto abort;
7971	}
7972
7973	/ device size must be a multiple of chunk size /
7974	mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - `1`);
7975	mddev->resync_max_sectors = mddev->dev_sectors;
7976
7977	if (mddev->degraded > dirty_parity_disks &&
7978	mddev->recovery_cp != MaxSector) {
7979	if (test_bit(MD_HAS_PPL, &mddev->flags))
7980	pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
7981	mdname(mddev));
7982	else if (mddev->ok_start_degraded)
7983	pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7984	mdname(mddev));
7985	else {
7986	pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7987	mdname(mddev));
7988	goto abort;
7989	}
7990	}
7991
7992	pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7993	mdname(mddev), conf->level,
7994	mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7995	mddev->new_layout);
7996
7997	print_raid5_conf(conf);
7998
7999	if (conf->reshape_progress != MaxSector) {
8000	conf->reshape_safe = conf->reshape_progress;
8001	atomic_set(v: &conf->reshape_stripes, i: `0`);
8002	clear_bit(nr: MD_RECOVERY_SYNC, addr: &mddev->recovery);
8003	clear_bit(nr: MD_RECOVERY_CHECK, addr: &mddev->recovery);
8004	set_bit(nr: MD_RECOVERY_RESHAPE, addr: &mddev->recovery);
8005	set_bit(nr: MD_RECOVERY_RUNNING, addr: &mddev->recovery);
8006	rcu_assign_pointer(mddev->sync_thread,
8007	md_register_thread(md_do_sync, mddev, "reshape"));
8008	if (!mddev->sync_thread)
8009	goto abort;
8010	}
8011
8012	/ Ok, everything is just fine now /
8013	if (mddev->to_remove == &raid5_attrs_group)
8014	mddev->to_remove = NULL;
8015	else if (mddev->kobj.sd &&
8016	sysfs_create_group(kobj: &mddev->kobj, grp: &raid5_attrs_group))
8017	pr_warn("raid5: failed to create sysfs attributes for %s\n",
8018	mdname(mddev));
8019	md_set_array_sectors(mddev, array_sectors: raid5_size(mddev, sectors: `0`, raid_disks: `0`));
8020
8021	if (mddev->queue) {
8022	int chunk_size;
8023	/ read-ahead size must cover two whole stripes, which*
8024	* is 2 * (datadisks) * chunksize where 'n' is the
8025	* number of raid devices
8026	*/
8027	int data_disks = conf->previous_raid_disks - conf->max_degraded;
8028	int stripe = data_disks *
8029	((mddev->chunk_sectors << `9`) / PAGE_SIZE);
8030
8031	chunk_size = mddev->chunk_sectors << `9`;
8032	blk_queue_io_min(q: mddev->queue, min: chunk_size);
8033	raid5_set_io_opt(conf);
8034	mddev->queue->limits.raid_partial_stripes_expensive = `1`;
8035	/*
8036	* We can only discard a whole stripe. It doesn't make sense to
8037	* discard data disk but write parity disk
8038	*/
8039	stripe = stripe * PAGE_SIZE;
8040	stripe = roundup_pow_of_two(stripe);
8041	mddev->queue->limits.discard_granularity = stripe;
8042
8043	blk_queue_max_write_zeroes_sectors(q: mddev->queue, max_write_same_sectors: `0`);
8044
8045	rdev_for_each(rdev, mddev) {
8046	disk_stack_limits(disk: mddev->gendisk, bdev: rdev->bdev,
8047	offset: rdev->data_offset << `9`);
8048	disk_stack_limits(disk: mddev->gendisk, bdev: rdev->bdev,
8049	offset: rdev->new_data_offset << `9`);
8050	}
8051
8052	/*
8053	* zeroing is required, otherwise data
8054	* could be lost. Consider a scenario: discard a stripe
8055	* (the stripe could be inconsistent if
8056	* discard_zeroes_data is 0); write one disk of the
8057	* stripe (the stripe could be inconsistent again
8058	* depending on which disks are used to calculate
8059	* parity); the disk is broken; The stripe data of this
8060	* disk is lost.
8061	*
8062	* We only allow DISCARD if the sysadmin has confirmed that
8063	* only safe devices are in use by setting a module parameter.
8064	* A better idea might be to turn DISCARD into WRITE_ZEROES
8065	* requests, as that is required to be safe.
8066	*/
8067	if (!devices_handle_discard_safely \|\|
8068	mddev->queue->limits.max_discard_sectors < (stripe >> `9`) \|\|
8069	mddev->queue->limits.discard_granularity < stripe)
8070	blk_queue_max_discard_sectors(q: mddev->queue, max_discard_sectors: `0`);
8071
8072	/*
8073	* Requests require having a bitmap for each stripe.
8074	* Limit the max sectors based on this.
8075	*/
8076	blk_queue_max_hw_sectors(mddev->queue,
8077	RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
8078
8079	/ No restrictions on the number of segments in the request /
8080	blk_queue_max_segments(mddev->queue, USHRT_MAX);
8081	}
8082
8083	if (log_init(conf, journal_dev, ppl: raid5_has_ppl(conf)))
8084	goto abort;
8085
8086	return `0`;
8087	abort:
8088	md_unregister_thread(mddev, threadp: &mddev->thread);
8089	print_raid5_conf(conf);
8090	free_conf(conf);
8091	mddev->private = NULL;
8092	pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
8093	return -EIO;
8094	}
8095
8096	static void raid5_free(struct mddev mddev, void* *priv)
8097	{
8098	struct r5conf *conf = priv;
8099
8100	free_conf(conf);
8101	mddev->to_remove = &raid5_attrs_group;
8102	}
8103
8104	static void raid5_status(struct seq_file seq, struct* mddev *mddev)
8105	{
8106	struct r5conf *conf = mddev->private;
8107	int i;
8108
8109	seq_printf(m: seq, fmt: " level %d, %dk chunk, algorithm %d", mddev->level,
8110	conf->chunk_sectors / `2`, mddev->layout);
8111	seq_printf (m: seq, fmt: " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
8112	rcu_read_lock();
8113	for (i = `0`; i < conf->raid_disks; i++) {
8114	struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
8115	seq_printf (m: seq, fmt: "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
8116	}
8117	rcu_read_unlock();
8118	seq_printf (m: seq, fmt: "]");
8119	}
8120
8121	static void print_raid5_conf (struct r5conf *conf)
8122	{
8123	struct md_rdev *rdev;
8124	int i;
8125
8126	pr_debug("RAID conf printout:\n");
8127	if (!conf) {
8128	pr_debug("(conf==NULL)\n");
8129	return;
8130	}
8131	pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
8132	conf->raid_disks,
8133	conf->raid_disks - conf->mddev->degraded);
8134
8135	rcu_read_lock();
8136	for (i = `0`; i < conf->raid_disks; i++) {
8137	rdev = rcu_dereference(conf->disks[i].rdev);
8138	if (rdev)
8139	pr_debug(" disk %d, o:%d, dev:%pg\n",
8140	i, !test_bit(Faulty, &rdev->flags),
8141	rdev->bdev);
8142	}
8143	rcu_read_unlock();
8144	}
8145
8146	static int raid5_spare_active(struct mddev *mddev)
8147	{
8148	int i;
8149	struct r5conf *conf = mddev->private;
8150	struct md_rdev rdev, replacement;
8151	int count = `0`;
8152	unsigned long flags;
8153
8154	for (i = `0`; i < conf->raid_disks; i++) {
8155	rdev = rdev_mdlock_deref(mddev, rdev: conf->disks[i].rdev);
8156	replacement = rdev_mdlock_deref(mddev,
8157	rdev: conf->disks[i].replacement);
8158	if (replacement
8159	&& replacement->recovery_offset == MaxSector
8160	&& !test_bit(Faulty, &replacement->flags)
8161	&& !test_and_set_bit(nr: In_sync, addr: &replacement->flags)) {
8162	/ Replacement has just become active. /
8163	if (!rdev
8164	\|\| !test_and_clear_bit(nr: In_sync, addr: &rdev->flags))
8165	count++;
8166	if (rdev) {
8167	/ Replaced device not technically faulty,*
8168	* but we need to be sure it gets removed
8169	* and never re-added.
8170	*/
8171	set_bit(nr: Faulty, addr: &rdev->flags);
8172	sysfs_notify_dirent_safe(
8173	sd: rdev->sysfs_state);
8174	}
8175	sysfs_notify_dirent_safe(sd: replacement->sysfs_state);
8176	} else if (rdev
8177	&& rdev->recovery_offset == MaxSector
8178	&& !test_bit(Faulty, &rdev->flags)
8179	&& !test_and_set_bit(nr: In_sync, addr: &rdev->flags)) {
8180	count++;
8181	sysfs_notify_dirent_safe(sd: rdev->sysfs_state);
8182	}
8183	}
8184	spin_lock_irqsave(&conf->device_lock, flags);
8185	mddev->degraded = raid5_calc_degraded(conf);
8186	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
8187	print_raid5_conf(conf);
8188	return count;
8189	}
8190
8191	static int raid5_remove_disk(struct mddev mddev, struct* md_rdev *rdev)
8192	{
8193	struct r5conf *conf = mddev->private;
8194	int err = `0`;
8195	int number = rdev->raid_disk;
8196	struct md_rdev __rcu **rdevp;
8197	struct disk_info *p;
8198	struct md_rdev *tmp;
8199
8200	print_raid5_conf(conf);
8201	if (test_bit(Journal, &rdev->flags) && conf->log) {
8202	/*
8203	* we can't wait pending write here, as this is called in
8204	* raid5d, wait will deadlock.
8205	* neilb: there is no locking about new writes here,
8206	* so this cannot be safe.
8207	*/
8208	if (atomic_read(v: &conf->active_stripes) \|\|
8209	atomic_read(v: &conf->r5c_cached_full_stripes) \|\|
8210	atomic_read(v: &conf->r5c_cached_partial_stripes)) {
8211	return -EBUSY;
8212	}
8213	log_exit(conf);
8214	return `0`;
8215	}
8216	if (unlikely(number >= conf->pool_size))
8217	return `0`;
8218	p = conf->disks + number;
8219	if (rdev == rcu_access_pointer(p->rdev))
8220	rdevp = &p->rdev;
8221	else if (rdev == rcu_access_pointer(p->replacement))
8222	rdevp = &p->replacement;
8223	else
8224	return `0`;
8225
8226	if (number >= conf->raid_disks &&
8227	conf->reshape_progress == MaxSector)
8228	clear_bit(nr: In_sync, addr: &rdev->flags);
8229
8230	if (test_bit(In_sync, &rdev->flags) \|\|
8231	atomic_read(v: &rdev->nr_pending)) {
8232	err = -EBUSY;
8233	goto abort;
8234	}
8235	/ Only remove non-faulty devices if recovery*
8236	* isn't possible.
8237	*/
8238	if (!test_bit(Faulty, &rdev->flags) &&
8239	mddev->recovery_disabled != conf->recovery_disabled &&
8240	!has_failed(conf) &&
8241	(!rcu_access_pointer(p->replacement) \|\|
8242	rcu_access_pointer(p->replacement) == rdev) &&
8243	number < conf->raid_disks) {
8244	err = -EBUSY;
8245	goto abort;
8246	}
8247	*rdevp = NULL;
8248	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
8249	lockdep_assert_held(&mddev->reconfig_mutex);
8250	synchronize_rcu();
8251	if (atomic_read(v: &rdev->nr_pending)) {
8252	/ lost the race, try later /
8253	err = -EBUSY;
8254	rcu_assign_pointer(*rdevp, rdev);
8255	}
8256	}
8257	if (!err) {
8258	err = log_modify(conf, rdev, add: false);
8259	if (err)
8260	goto abort;
8261	}
8262
8263	tmp = rcu_access_pointer(p->replacement);
8264	if (tmp) {
8265	/ We must have just cleared 'rdev' /
8266	rcu_assign_pointer(p->rdev, tmp);
8267	clear_bit(nr: Replacement, addr: &tmp->flags);
8268	smp_mb(); / Make sure other CPUs may see both as identical*
8269	* but will never see neither - if they are careful
8270	*/
8271	rcu_assign_pointer(p->replacement, NULL);
8272
8273	if (!err)
8274	err = log_modify(conf, rdev: tmp, add: true);
8275	}
8276
8277	clear_bit(nr: WantReplacement, addr: &rdev->flags);
8278	abort:
8279
8280	print_raid5_conf(conf);
8281	return err;
8282	}
8283
8284	static int raid5_add_disk(struct mddev mddev, struct* md_rdev *rdev)
8285	{
8286	struct r5conf *conf = mddev->private;
8287	int ret, err = -EEXIST;
8288	int disk;
8289	struct disk_info *p;
8290	struct md_rdev *tmp;
8291	int first = `0`;
8292	int last = conf->raid_disks - `1`;
8293
8294	if (test_bit(Journal, &rdev->flags)) {
8295	if (conf->log)
8296	return -EBUSY;
8297
8298	rdev->raid_disk = `0`;
8299	/*
8300	* The array is in readonly mode if journal is missing, so no
8301	* write requests running. We should be safe
8302	*/
8303	ret = log_init(conf, journal_dev: rdev, ppl: false);
8304	if (ret)
8305	return ret;
8306
8307	ret = r5l_start(log: conf->log);
8308	if (ret)
8309	return ret;
8310
8311	return `0`;
8312	}
8313	if (mddev->recovery_disabled == conf->recovery_disabled)
8314	return -EBUSY;
8315
8316	if (rdev->saved_raid_disk < `0` && has_failed(conf))
8317	/ no point adding a device /
8318	return -EINVAL;
8319
8320	if (rdev->raid_disk >= `0`)
8321	first = last = rdev->raid_disk;
8322
8323	/*
8324	* find the disk ... but prefer rdev->saved_raid_disk
8325	* if possible.
8326	*/
8327	if (rdev->saved_raid_disk >= first &&
8328	rdev->saved_raid_disk <= last &&
8329	conf->disks[rdev->saved_raid_disk].rdev == NULL)
8330	first = rdev->saved_raid_disk;
8331
8332	for (disk = first; disk <= last; disk++) {
8333	p = conf->disks + disk;
8334	if (p->rdev == NULL) {
8335	clear_bit(nr: In_sync, addr: &rdev->flags);
8336	rdev->raid_disk = disk;
8337	if (rdev->saved_raid_disk != disk)
8338	conf->fullsync = `1`;
8339	rcu_assign_pointer(p->rdev, rdev);
8340
8341	err = log_modify(conf, rdev, add: true);
8342
8343	goto out;
8344	}
8345	}
8346	for (disk = first; disk <= last; disk++) {
8347	p = conf->disks + disk;
8348	tmp = rdev_mdlock_deref(mddev, rdev: p->rdev);
8349	if (test_bit(WantReplacement, &tmp->flags) &&
8350	mddev->reshape_position == MaxSector &&
8351	p->replacement == NULL) {
8352	clear_bit(nr: In_sync, addr: &rdev->flags);
8353	set_bit(nr: Replacement, addr: &rdev->flags);
8354	rdev->raid_disk = disk;
8355	err = `0`;
8356	conf->fullsync = `1`;
8357	rcu_assign_pointer(p->replacement, rdev);
8358	break;
8359	}
8360	}
8361	out:
8362	print_raid5_conf(conf);
8363	return err;
8364	}
8365
8366	static int raid5_resize(struct mddev *mddev, sector_t sectors)
8367	{
8368	/ no resync is happening, and there is enough space*
8369	* on all devices, so we can resize.
8370	* We need to make sure resync covers any new space.
8371	* If the array is shrinking we should possibly wait until
8372	* any io in the removed space completes, but it hardly seems
8373	* worth it.
8374	*/
8375	sector_t newsize;
8376	struct r5conf *conf = mddev->private;
8377
8378	if (raid5_has_log(conf) \|\| raid5_has_ppl(conf))
8379	return -EINVAL;
8380	sectors &= ~((sector_t)conf->chunk_sectors - `1`);
8381	newsize = raid5_size(mddev, sectors, raid_disks: mddev->raid_disks);
8382	if (mddev->external_size &&
8383	mddev->array_sectors > newsize)
8384	return -EINVAL;
8385	if (mddev->bitmap) {
8386	int ret = md_bitmap_resize(bitmap: mddev->bitmap, blocks: sectors, chunksize: `0`, init: `0`);
8387	if (ret)
8388	return ret;
8389	}
8390	md_set_array_sectors(mddev, array_sectors: newsize);
8391	if (sectors > mddev->dev_sectors &&
8392	mddev->recovery_cp > mddev->dev_sectors) {
8393	mddev->recovery_cp = mddev->dev_sectors;
8394	set_bit(nr: MD_RECOVERY_NEEDED, addr: &mddev->recovery);
8395	}
8396	mddev->dev_sectors = sectors;
8397	mddev->resync_max_sectors = sectors;
8398	return `0`;
8399	}
8400
8401	static int check_stripe_cache(struct mddev *mddev)
8402	{
8403	/ Can only proceed if there are plenty of stripe_heads.*
8404	* We need a minimum of one full stripe,, and for sensible progress
8405	* it is best to have about 4 times that.
8406	* If we require 4 times, then the default 256 4K stripe_heads will
8407	* allow for chunk sizes up to 256K, which is probably OK.
8408	* If the chunk size is greater, user-space should request more
8409	* stripe_heads first.
8410	*/
8411	struct r5conf *conf = mddev->private;
8412	if (((mddev->chunk_sectors << `9`) / RAID5_STRIPE_SIZE(conf)) * `4`
8413	> conf->min_nr_stripes \|\|
8414	((mddev->new_chunk_sectors << `9`) / RAID5_STRIPE_SIZE(conf)) * `4`
8415	> conf->min_nr_stripes) {
8416	pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
8417	mdname(mddev),
8418	((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << `9`)
8419	/ RAID5_STRIPE_SIZE(conf))*`4`);
8420	return `0`;
8421	}
8422	return `1`;
8423	}
8424
8425	static int check_reshape(struct mddev *mddev)
8426	{
8427	struct r5conf *conf = mddev->private;
8428
8429	if (raid5_has_log(conf) \|\| raid5_has_ppl(conf))
8430	return -EINVAL;
8431	if (mddev->delta_disks == `0` &&
8432	mddev->new_layout == mddev->layout &&
8433	mddev->new_chunk_sectors == mddev->chunk_sectors)
8434	return `0`; / nothing to do /
8435	if (has_failed(conf))
8436	return -EINVAL;
8437	if (mddev->delta_disks < `0` && mddev->reshape_position == MaxSector) {
8438	/ We might be able to shrink, but the devices must*
8439	* be made bigger first.
8440	* For raid6, 4 is the minimum size.
8441	* Otherwise 2 is the minimum
8442	*/
8443	int min = `2`;
8444	if (mddev->level == `6`)
8445	min = `4`;
8446	if (mddev->raid_disks + mddev->delta_disks < min)
8447	return -EINVAL;
8448	}
8449
8450	if (!check_stripe_cache(mddev))
8451	return -ENOSPC;
8452
8453	if (mddev->new_chunk_sectors > mddev->chunk_sectors \|\|
8454	mddev->delta_disks > `0`)
8455	if (resize_chunks(conf,
8456	new_disks: conf->previous_raid_disks
8457	+ max(`0`, mddev->delta_disks),
8458	max(mddev->new_chunk_sectors,
8459	mddev->chunk_sectors)
8460	) < `0`)
8461	return -ENOMEM;
8462
8463	if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
8464	return `0`; / never bother to shrink /
8465	return resize_stripes(conf, newsize: (conf->previous_raid_disks
8466	+ mddev->delta_disks));
8467	}
8468
8469	static int raid5_start_reshape(struct mddev *mddev)
8470	{
8471	struct r5conf *conf = mddev->private;
8472	struct md_rdev *rdev;
8473	int spares = `0`;
8474	int i;
8475	unsigned long flags;
8476
8477	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
8478	return -EBUSY;
8479
8480	if (!check_stripe_cache(mddev))
8481	return -ENOSPC;
8482
8483	if (has_failed(conf))
8484	return -EINVAL;
8485
8486	/ raid5 can't handle concurrent reshape and recovery /
8487	if (mddev->recovery_cp < MaxSector)
8488	return -EBUSY;
8489	for (i = `0`; i < conf->raid_disks; i++)
8490	if (rdev_mdlock_deref(mddev, rdev: conf->disks[i].replacement))
8491	return -EBUSY;
8492
8493	rdev_for_each(rdev, mddev) {
8494	if (!test_bit(In_sync, &rdev->flags)
8495	&& !test_bit(Faulty, &rdev->flags))
8496	spares++;
8497	}
8498
8499	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
8500	/ Not enough devices even to make a degraded array*
8501	* of that size
8502	*/
8503	return -EINVAL;
8504
8505	/ Refuse to reduce size of the array. Any reductions in*
8506	* array size must be through explicit setting of array_size
8507	* attribute.
8508	*/
8509	if (raid5_size(mddev, sectors: `0`, raid_disks: conf->raid_disks + mddev->delta_disks)
8510	< mddev->array_sectors) {
8511	pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
8512	mdname(mddev));
8513	return -EINVAL;
8514	}
8515
8516	atomic_set(v: &conf->reshape_stripes, i: `0`);
8517	spin_lock_irq(lock: &conf->device_lock);
8518	write_seqcount_begin(&conf->gen_lock);
8519	conf->previous_raid_disks = conf->raid_disks;
8520	conf->raid_disks += mddev->delta_disks;
8521	conf->prev_chunk_sectors = conf->chunk_sectors;
8522	conf->chunk_sectors = mddev->new_chunk_sectors;
8523	conf->prev_algo = conf->algorithm;
8524	conf->algorithm = mddev->new_layout;
8525	conf->generation++;
8526	/ Code that selects data_offset needs to see the generation update*
8527	* if reshape_progress has been set - so a memory barrier needed.
8528	*/
8529	smp_mb();
8530	if (mddev->reshape_backwards)
8531	conf->reshape_progress = raid5_size(mddev, sectors: `0`, raid_disks: `0`);
8532	else
8533	conf->reshape_progress = `0`;
8534	conf->reshape_safe = conf->reshape_progress;
8535	write_seqcount_end(&conf->gen_lock);
8536	spin_unlock_irq(lock: &conf->device_lock);
8537
8538	/ Now make sure any requests that proceeded on the assumption*
8539	* the reshape wasn't running - like Discard or Read - have
8540	* completed.
8541	*/
8542	raid5_quiesce(mddev, quiesce: true);
8543	raid5_quiesce(mddev, quiesce: false);
8544
8545	/ Add some new drives, as many as will fit.*
8546	* We know there are enough to make the newly sized array work.
8547	* Don't add devices if we are reducing the number of
8548	* devices in the array. This is because it is not possible
8549	* to correctly record the "partially reconstructed" state of
8550	* such devices during the reshape and confusion could result.
8551	*/
8552	if (mddev->delta_disks >= `0`) {
8553	rdev_for_each(rdev, mddev)
8554	if (rdev->raid_disk < `0` &&
8555	!test_bit(Faulty, &rdev->flags)) {
8556	if (raid5_add_disk(mddev, rdev) == `0`) {
8557	if (rdev->raid_disk
8558	>= conf->previous_raid_disks)
8559	set_bit(nr: In_sync, addr: &rdev->flags);
8560	else
8561	rdev->recovery_offset = `0`;
8562
8563	/ Failure here is OK /
8564	sysfs_link_rdev(mddev, rdev);
8565	}
8566	} else if (rdev->raid_disk >= conf->previous_raid_disks
8567	&& !test_bit(Faulty, &rdev->flags)) {
8568	/ This is a spare that was manually added /
8569	set_bit(nr: In_sync, addr: &rdev->flags);
8570	}
8571
8572	/ When a reshape changes the number of devices,*
8573	* ->degraded is measured against the larger of the
8574	* pre and post number of devices.
8575	*/
8576	spin_lock_irqsave(&conf->device_lock, flags);
8577	mddev->degraded = raid5_calc_degraded(conf);
8578	spin_unlock_irqrestore(lock: &conf->device_lock, flags);
8579	}
8580	mddev->raid_disks = conf->raid_disks;
8581	mddev->reshape_position = conf->reshape_progress;
8582	set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags);
8583
8584	clear_bit(nr: MD_RECOVERY_SYNC, addr: &mddev->recovery);
8585	clear_bit(nr: MD_RECOVERY_CHECK, addr: &mddev->recovery);
8586	clear_bit(nr: MD_RECOVERY_DONE, addr: &mddev->recovery);
8587	set_bit(nr: MD_RECOVERY_RESHAPE, addr: &mddev->recovery);
8588	set_bit(nr: MD_RECOVERY_RUNNING, addr: &mddev->recovery);
8589	rcu_assign_pointer(mddev->sync_thread,
8590	md_register_thread(md_do_sync, mddev, "reshape"));
8591	if (!mddev->sync_thread) {
8592	mddev->recovery = `0`;
8593	spin_lock_irq(lock: &conf->device_lock);
8594	write_seqcount_begin(&conf->gen_lock);
8595	mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
8596	mddev->new_chunk_sectors =
8597	conf->chunk_sectors = conf->prev_chunk_sectors;
8598	mddev->new_layout = conf->algorithm = conf->prev_algo;
8599	rdev_for_each(rdev, mddev)
8600	rdev->new_data_offset = rdev->data_offset;
8601	smp_wmb();
8602	conf->generation --;
8603	conf->reshape_progress = MaxSector;
8604	mddev->reshape_position = MaxSector;
8605	write_seqcount_end(&conf->gen_lock);
8606	spin_unlock_irq(lock: &conf->device_lock);
8607	return -EAGAIN;
8608	}
8609	conf->reshape_checkpoint = jiffies;
8610	md_wakeup_thread(thread: mddev->sync_thread);
8611	md_new_event();
8612	return `0`;
8613	}
8614
8615	/ This is called from the reshape thread and should make any*
8616	* changes needed in 'conf'
8617	*/
8618	static void end_reshape(struct r5conf *conf)
8619	{
8620
8621	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
8622	struct md_rdev *rdev;
8623
8624	spin_lock_irq(lock: &conf->device_lock);
8625	conf->previous_raid_disks = conf->raid_disks;
8626	md_finish_reshape(mddev: conf->mddev);
8627	smp_wmb();
8628	conf->reshape_progress = MaxSector;
8629	conf->mddev->reshape_position = MaxSector;
8630	rdev_for_each(rdev, conf->mddev)
8631	if (rdev->raid_disk >= `0` &&
8632	!test_bit(Journal, &rdev->flags) &&
8633	!test_bit(In_sync, &rdev->flags))
8634	rdev->recovery_offset = MaxSector;
8635	spin_unlock_irq(lock: &conf->device_lock);
8636	wake_up(&conf->wait_for_overlap);
8637
8638	if (conf->mddev->queue)
8639	raid5_set_io_opt(conf);
8640	}
8641	}
8642
8643	/ This is called from the raid5d thread with mddev_lock held.*
8644	* It makes config changes to the device.
8645	*/
8646	static void raid5_finish_reshape(struct mddev *mddev)
8647	{
8648	struct r5conf *conf = mddev->private;
8649	struct md_rdev *rdev;
8650
8651	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8652
8653	if (mddev->delta_disks <= `0`) {
8654	int d;
8655	spin_lock_irq(lock: &conf->device_lock);
8656	mddev->degraded = raid5_calc_degraded(conf);
8657	spin_unlock_irq(lock: &conf->device_lock);
8658	for (d = conf->raid_disks ;
8659	d < conf->raid_disks - mddev->delta_disks;
8660	d++) {
8661	rdev = rdev_mdlock_deref(mddev,
8662	rdev: conf->disks[d].rdev);
8663	if (rdev)
8664	clear_bit(nr: In_sync, addr: &rdev->flags);
8665	rdev = rdev_mdlock_deref(mddev,
8666	rdev: conf->disks[d].replacement);
8667	if (rdev)
8668	clear_bit(nr: In_sync, addr: &rdev->flags);
8669	}
8670	}
8671	mddev->layout = conf->algorithm;
8672	mddev->chunk_sectors = conf->chunk_sectors;
8673	mddev->reshape_position = MaxSector;
8674	mddev->delta_disks = `0`;
8675	mddev->reshape_backwards = `0`;
8676	}
8677	}
8678
8679	static void raid5_quiesce(struct mddev mddev, int* quiesce)
8680	{
8681	struct r5conf *conf = mddev->private;
8682
8683	if (quiesce) {
8684	/ stop all writes /
8685	lock_all_device_hash_locks_irq(conf);
8686	/ '2' tells resync/reshape to pause so that all*
8687	* active stripes can drain
8688	*/
8689	r5c_flush_cache(conf, INT_MAX);
8690	/ need a memory barrier to make sure read_one_chunk() sees*
8691	* quiesce started and reverts to slow (locked) path.
8692	*/
8693	smp_store_release(&conf->quiesce, `2`);
8694	wait_event_cmd(conf->wait_for_quiescent,
8695	atomic_read(&conf->active_stripes) == `0` &&
8696	atomic_read(&conf->active_aligned_reads) == `0`,
8697	unlock_all_device_hash_locks_irq(conf),
8698	lock_all_device_hash_locks_irq(conf));
8699	conf->quiesce = `1`;
8700	unlock_all_device_hash_locks_irq(conf);
8701	/ allow reshape to continue /
8702	wake_up(&conf->wait_for_overlap);
8703	} else {
8704	/ re-enable writes /
8705	lock_all_device_hash_locks_irq(conf);
8706	conf->quiesce = `0`;
8707	wake_up(&conf->wait_for_quiescent);
8708	wake_up(&conf->wait_for_overlap);
8709	unlock_all_device_hash_locks_irq(conf);
8710	}
8711	log_quiesce(conf, quiesce);
8712	}
8713
8714	static void raid45_takeover_raid0(struct* mddev mddev, int* level)
8715	{
8716	struct r0conf *raid0_conf = mddev->private;
8717	sector_t sectors;
8718
8719	/ for raid0 takeover only one zone is supported /
8720	if (raid0_conf->nr_strip_zones > `1`) {
8721	pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
8722	mdname(mddev));
8723	return ERR_PTR(error: -EINVAL);
8724	}
8725
8726	sectors = raid0_conf->strip_zone[`0`].zone_end;
8727	sector_div(sectors, raid0_conf->strip_zone[`0`].nb_dev);
8728	mddev->dev_sectors = sectors;
8729	mddev->new_level = level;
8730	mddev->new_layout = ALGORITHM_PARITY_N;
8731	mddev->new_chunk_sectors = mddev->chunk_sectors;
8732	mddev->raid_disks += `1`;
8733	mddev->delta_disks = `1`;
8734	/ make sure it will be not marked as dirty /
8735	mddev->recovery_cp = MaxSector;
8736
8737	return setup_conf(mddev);
8738	}
8739
8740	static void raid5_takeover_raid1(struct* mddev *mddev)
8741	{
8742	int chunksect;
8743	void *ret;
8744
8745	if (mddev->raid_disks != `2` \|\|
8746	mddev->degraded > `1`)
8747	return ERR_PTR(error: -EINVAL);
8748
8749	/ Should check if there are write-behind devices? /
8750
8751	chunksect = `64``2`; /* 64K by default /
8752
8753	/ The array must be an exact multiple of chunksize /
8754	while (chunksect && (mddev->array_sectors & (chunksect-`1`)))
8755	chunksect >>= `1`;
8756
8757	if ((chunksect<<`9`) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
8758	/ array size does not allow a suitable chunk size /
8759	return ERR_PTR(error: -EINVAL);
8760
8761	mddev->new_level = `5`;
8762	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
8763	mddev->new_chunk_sectors = chunksect;
8764
8765	ret = setup_conf(mddev);
8766	if (!IS_ERR(ptr: ret))
8767	mddev_clear_unsupported_flags(mddev,
8768	UNSUPPORTED_MDDEV_FLAGS);
8769	return ret;
8770	}
8771
8772	static void raid5_takeover_raid6(struct* mddev *mddev)
8773	{
8774	int new_layout;
8775
8776	switch (mddev->layout) {
8777	case ALGORITHM_LEFT_ASYMMETRIC_6:
8778	new_layout = ALGORITHM_LEFT_ASYMMETRIC;
8779	break;
8780	case ALGORITHM_RIGHT_ASYMMETRIC_6:
8781	new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
8782	break;
8783	case ALGORITHM_LEFT_SYMMETRIC_6:
8784	new_layout = ALGORITHM_LEFT_SYMMETRIC;
8785	break;
8786	case ALGORITHM_RIGHT_SYMMETRIC_6:
8787	new_layout = ALGORITHM_RIGHT_SYMMETRIC;
8788	break;
8789	case ALGORITHM_PARITY_0_6:
8790	new_layout = ALGORITHM_PARITY_0;
8791	break;
8792	case ALGORITHM_PARITY_N:
8793	new_layout = ALGORITHM_PARITY_N;
8794	break;
8795	default:
8796	return ERR_PTR(error: -EINVAL);
8797	}
8798	mddev->new_level = `5`;
8799	mddev->new_layout = new_layout;
8800	mddev->delta_disks = -`1`;
8801	mddev->raid_disks -= `1`;
8802	return setup_conf(mddev);
8803	}
8804
8805	static int raid5_check_reshape(struct mddev *mddev)
8806	{
8807	/ For a 2-drive array, the layout and chunk size can be changed*
8808	* immediately as not restriping is needed.
8809	* For larger arrays we record the new value - after validation
8810	* to be used by a reshape pass.
8811	*/
8812	struct r5conf *conf = mddev->private;
8813	int new_chunk = mddev->new_chunk_sectors;
8814
8815	if (mddev->new_layout >= `0` && !algorithm_valid_raid5(layout: mddev->new_layout))
8816	return -EINVAL;
8817	if (new_chunk > `0`) {
8818	if (!is_power_of_2(n: new_chunk))
8819	return -EINVAL;
8820	if (new_chunk < (PAGE_SIZE>>`9`))
8821	return -EINVAL;
8822	if (mddev->array_sectors & (new_chunk-`1`))
8823	/ not factor of array size /
8824	return -EINVAL;
8825	}
8826
8827	/ They look valid /
8828
8829	if (mddev->raid_disks == `2`) {
8830	/ can make the change immediately /
8831	if (mddev->new_layout >= `0`) {
8832	conf->algorithm = mddev->new_layout;
8833	mddev->layout = mddev->new_layout;
8834	}
8835	if (new_chunk > `0`) {
8836	conf->chunk_sectors = new_chunk ;
8837	mddev->chunk_sectors = new_chunk;
8838	}
8839	set_bit(nr: MD_SB_CHANGE_DEVS, addr: &mddev->sb_flags);
8840	md_wakeup_thread(thread: mddev->thread);
8841	}
8842	return check_reshape(mddev);
8843	}
8844
8845	static int raid6_check_reshape(struct mddev *mddev)
8846	{
8847	int new_chunk = mddev->new_chunk_sectors;
8848
8849	if (mddev->new_layout >= `0` && !algorithm_valid_raid6(layout: mddev->new_layout))
8850	return -EINVAL;
8851	if (new_chunk > `0`) {
8852	if (!is_power_of_2(n: new_chunk))
8853	return -EINVAL;
8854	if (new_chunk < (PAGE_SIZE >> `9`))
8855	return -EINVAL;
8856	if (mddev->array_sectors & (new_chunk-`1`))
8857	/ not factor of array size /
8858	return -EINVAL;
8859	}
8860
8861	/ They look valid /
8862	return check_reshape(mddev);
8863	}
8864
8865	static void raid5_takeover(struct* mddev *mddev)
8866	{
8867	/ raid5 can take over:*
8868	* raid0 - if there is only one strip zone - make it a raid4 layout
8869	* raid1 - if there are two drives. We need to know the chunk size
8870	* raid4 - trivial - just use a raid4 layout.
8871	* raid6 - Providing it is a *_6 layout
8872	*/
8873	if (mddev->level == `0`)
8874	return raid45_takeover_raid0(mddev, level: `5`);
8875	if (mddev->level == `1`)
8876	return raid5_takeover_raid1(mddev);
8877	if (mddev->level == `4`) {
8878	mddev->new_layout = ALGORITHM_PARITY_N;
8879	mddev->new_level = `5`;
8880	return setup_conf(mddev);
8881	}
8882	if (mddev->level == `6`)
8883	return raid5_takeover_raid6(mddev);
8884
8885	return ERR_PTR(error: -EINVAL);
8886	}
8887
8888	static void raid4_takeover(struct* mddev *mddev)
8889	{
8890	/ raid4 can take over:*
8891	* raid0 - if there is only one strip zone
8892	* raid5 - if layout is right
8893	*/
8894	if (mddev->level == `0`)
8895	return raid45_takeover_raid0(mddev, level: `4`);
8896	if (mddev->level == `5` &&
8897	mddev->layout == ALGORITHM_PARITY_N) {
8898	mddev->new_layout = `0`;
8899	mddev->new_level = `4`;
8900	return setup_conf(mddev);
8901	}
8902	return ERR_PTR(error: -EINVAL);
8903	}
8904
8905	static struct md_personality raid5_personality;
8906
8907	static void raid6_takeover(struct* mddev *mddev)
8908	{
8909	/ Currently can only take over a raid5. We map the*
8910	* personality to an equivalent raid6 personality
8911	* with the Q block at the end.
8912	*/
8913	int new_layout;
8914
8915	if (mddev->pers != &raid5_personality)
8916	return ERR_PTR(error: -EINVAL);
8917	if (mddev->degraded > `1`)
8918	return ERR_PTR(error: -EINVAL);
8919	if (mddev->raid_disks > `253`)
8920	return ERR_PTR(error: -EINVAL);
8921	if (mddev->raid_disks < `3`)
8922	return ERR_PTR(error: -EINVAL);
8923
8924	switch (mddev->layout) {
8925	case ALGORITHM_LEFT_ASYMMETRIC:
8926	new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8927	break;
8928	case ALGORITHM_RIGHT_ASYMMETRIC:
8929	new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8930	break;
8931	case ALGORITHM_LEFT_SYMMETRIC:
8932	new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8933	break;
8934	case ALGORITHM_RIGHT_SYMMETRIC:
8935	new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8936	break;
8937	case ALGORITHM_PARITY_0:
8938	new_layout = ALGORITHM_PARITY_0_6;
8939	break;
8940	case ALGORITHM_PARITY_N:
8941	new_layout = ALGORITHM_PARITY_N;
8942	break;
8943	default:
8944	return ERR_PTR(error: -EINVAL);
8945	}
8946	mddev->new_level = `6`;
8947	mddev->new_layout = new_layout;
8948	mddev->delta_disks = `1`;
8949	mddev->raid_disks += `1`;
8950	return setup_conf(mddev);
8951	}
8952
8953	static int raid5_change_consistency_policy(struct mddev mddev, const* char *buf)
8954	{
8955	struct r5conf *conf;
8956	int err;
8957
8958	err = mddev_suspend_and_lock(mddev);
8959	if (err)
8960	return err;
8961	conf = mddev->private;
8962	if (!conf) {
8963	mddev_unlock_and_resume(mddev);
8964	return -ENODEV;
8965	}
8966
8967	if (strncmp(buf, "ppl", `3`) == `0`) {
8968	/ ppl only works with RAID 5 /
8969	if (!raid5_has_ppl(conf) && conf->level == `5`) {
8970	err = log_init(conf, NULL, ppl: true);
8971	if (!err) {
8972	err = resize_stripes(conf, newsize: conf->pool_size);
8973	if (err)
8974	log_exit(conf);
8975	}
8976	} else
8977	err = -EINVAL;
8978	} else if (strncmp(buf, "resync", `6`) == `0`) {
8979	if (raid5_has_ppl(conf)) {
8980	log_exit(conf);
8981	err = resize_stripes(conf, newsize: conf->pool_size);
8982	} else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
8983	r5l_log_disk_error(conf)) {
8984	bool journal_dev_exists = false;
8985	struct md_rdev *rdev;
8986
8987	rdev_for_each(rdev, mddev)
8988	if (test_bit(Journal, &rdev->flags)) {
8989	journal_dev_exists = true;
8990	break;
8991	}
8992
8993	if (!journal_dev_exists)
8994	clear_bit(nr: MD_HAS_JOURNAL, addr: &mddev->flags);
8995	else / need remove journal device first /
8996	err = -EBUSY;
8997	} else
8998	err = -EINVAL;
8999	} else {
9000	err = -EINVAL;
9001	}
9002
9003	if (!err)
9004	md_update_sb(mddev, force: `1`);
9005
9006	mddev_unlock_and_resume(mddev);
9007
9008	return err;
9009	}
9010
9011	static int raid5_start(struct mddev *mddev)
9012	{
9013	struct r5conf *conf = mddev->private;
9014
9015	return r5l_start(log: conf->log);
9016	}
9017
9018	static struct md_personality raid6_personality =
9019	{
9020	.name = "raid6",
9021	.level = `6`,
9022	.owner = THIS_MODULE,
9023	.make_request = raid5_make_request,
9024	.run = raid5_run,
9025	.start = raid5_start,
9026	.free = raid5_free,
9027	.status = raid5_status,
9028	.error_handler = raid5_error,
9029	.hot_add_disk = raid5_add_disk,
9030	.hot_remove_disk= raid5_remove_disk,
9031	.spare_active = raid5_spare_active,
9032	.sync_request = raid5_sync_request,
9033	.resize = raid5_resize,
9034	.size = raid5_size,
9035	.check_reshape = raid6_check_reshape,
9036	.start_reshape = raid5_start_reshape,
9037	.finish_reshape = raid5_finish_reshape,
9038	.quiesce = raid5_quiesce,
9039	.takeover = raid6_takeover,
9040	.change_consistency_policy = raid5_change_consistency_policy,
9041	};
9042	static struct md_personality raid5_personality =
9043	{
9044	.name = "raid5",
9045	.level = `5`,
9046	.owner = THIS_MODULE,
9047	.make_request = raid5_make_request,
9048	.run = raid5_run,
9049	.start = raid5_start,
9050	.free = raid5_free,
9051	.status = raid5_status,
9052	.error_handler = raid5_error,
9053	.hot_add_disk = raid5_add_disk,
9054	.hot_remove_disk= raid5_remove_disk,
9055	.spare_active = raid5_spare_active,
9056	.sync_request = raid5_sync_request,
9057	.resize = raid5_resize,
9058	.size = raid5_size,
9059	.check_reshape = raid5_check_reshape,
9060	.start_reshape = raid5_start_reshape,
9061	.finish_reshape = raid5_finish_reshape,
9062	.quiesce = raid5_quiesce,
9063	.takeover = raid5_takeover,
9064	.change_consistency_policy = raid5_change_consistency_policy,
9065	};
9066
9067	static struct md_personality raid4_personality =
9068	{
9069	.name = "raid4",
9070	.level = `4`,
9071	.owner = THIS_MODULE,
9072	.make_request = raid5_make_request,
9073	.run = raid5_run,
9074	.start = raid5_start,
9075	.free = raid5_free,
9076	.status = raid5_status,
9077	.error_handler = raid5_error,
9078	.hot_add_disk = raid5_add_disk,
9079	.hot_remove_disk= raid5_remove_disk,
9080	.spare_active = raid5_spare_active,
9081	.sync_request = raid5_sync_request,
9082	.resize = raid5_resize,
9083	.size = raid5_size,
9084	.check_reshape = raid5_check_reshape,
9085	.start_reshape = raid5_start_reshape,
9086	.finish_reshape = raid5_finish_reshape,
9087	.quiesce = raid5_quiesce,
9088	.takeover = raid4_takeover,
9089	.change_consistency_policy = raid5_change_consistency_policy,
9090	};
9091
9092	static int __init raid5_init(void)
9093	{
9094	int ret;
9095
9096	raid5_wq = alloc_workqueue(fmt: "raid5wq",
9097	flags: WQ_UNBOUND\|WQ_MEM_RECLAIM\|WQ_CPU_INTENSIVE\|WQ_SYSFS, max_active: `0`);
9098	if (!raid5_wq)
9099	return -ENOMEM;
9100
9101	ret = cpuhp_setup_state_multi(state: CPUHP_MD_RAID5_PREPARE,
9102	name: "md/raid5:prepare",
9103	startup: raid456_cpu_up_prepare,
9104	teardown: raid456_cpu_dead);
9105	if (ret) {
9106	destroy_workqueue(wq: raid5_wq);
9107	return ret;
9108	}
9109	register_md_personality(p: &raid6_personality);
9110	register_md_personality(p: &raid5_personality);
9111	register_md_personality(p: &raid4_personality);
9112	return `0`;
9113	}
9114
9115	static void raid5_exit(void)
9116	{
9117	unregister_md_personality(p: &raid6_personality);
9118	unregister_md_personality(p: &raid5_personality);
9119	unregister_md_personality(p: &raid4_personality);
9120	cpuhp_remove_multi_state(state: CPUHP_MD_RAID5_PREPARE);
9121	destroy_workqueue(wq: raid5_wq);
9122	}
9123
9124	module_init(raid5_init);
9125	module_exit(raid5_exit);
9126	MODULE_LICENSE("GPL");
9127	MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
9128	MODULE_ALIAS("md-personality-4"); / RAID5 /
9129	MODULE_ALIAS("md-raid5");
9130	MODULE_ALIAS("md-raid4");
9131	MODULE_ALIAS("md-level-5");
9132	MODULE_ALIAS("md-level-4");
9133	MODULE_ALIAS("md-personality-8"); / RAID6 /
9134	MODULE_ALIAS("md-raid6");
9135	MODULE_ALIAS("md-level-6");
9136
9137	/ This used to be two separate modules, they were: /
9138	MODULE_ALIAS("raid5");
9139	MODULE_ALIAS("raid6");
9140

/* source code of linux/drivers/md/raid5.c */