// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "file-item.h"
#include "btrfs_inode.h"

/* Set when additional merges to this rbio are not allowed. */
#define RBIO_RMW_LOCKED_BIT 1

/*
 * Set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW.
 */
#define RBIO_CACHE_BIT 2

/* Set when it is safe to trust the stripe_pages for caching. */
#define RBIO_CACHE_READY_BIT 3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS 11

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};

/*
 * A bvec-like structure to present a sector inside a page.
 *
 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
 */
struct sector_ptr {
	struct page *page;
	unsigned int pgoff:24;
	unsigned int uptodate:8;
};
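
/*
 * Sizing note: 24 bits of pgoff is enough for an offset inside any page
 * size Linux supports, and packing uptodate into the remaining byte keeps
 * sector_ptr at a pointer plus one 32-bit word. Each rbio carries two
 * arrays of these (bio_sectors[] and stripe_sectors[]), one entry per
 * sector, so the packing adds up.
 */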

static void rmw_rbio_work(struct work_struct *work);
static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
static void scrub_rbio_work_locked(struct work_struct *work);

static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
{
	bitmap_free(rbio->error_bitmap);
	kfree(rbio->stripe_pages);
	kfree(rbio->bio_sectors);
	kfree(rbio->stripe_sectors);
	kfree(rbio->finish_pointers);
}

static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bioc(rbio->bioc);
	free_raid_bio_pointers(rbio);
	kfree(rbio);
}

static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	INIT_WORK(&rbio->work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}

/*
 * The stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe.
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}

/*
 * Caching an rbio means to copy anything from the
 * bio_sectors array into the stripe_pages array. We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data.
 *
 * Once the caching is done, we set the cache ready bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_sectors; i++) {
		/* Some range not covered by bio (partial write), skip it */
		if (!rbio->bio_sectors[i].page) {
			/*
			 * Even if the sector is not covered by bio, if it is
			 * a data sector it should still be uptodate as it is
			 * read from disk.
			 */
			if (i < rbio->nr_data * rbio->stripe_nsectors)
				ASSERT(rbio->stripe_sectors[i].uptodate);
			continue;
		}

		ASSERT(rbio->stripe_sectors[i].page);
		memcpy_page(rbio->stripe_sectors[i].page,
			    rbio->stripe_sectors[i].pgoff,
			    rbio->bio_sectors[i].page,
			    rbio->bio_sectors[i].pgoff,
			    rbio->bioc->fs_info->sectorsize);
		rbio->stripe_sectors[i].uptodate = 1;
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * We hash on the first logical address of the stripe.
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->full_stripe_logical;

	/*
	 * We shift down quite a bit. We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * Shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
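
/*
 * For example, with the default 11-bit table a full stripe starting at
 * logical address 0x1D2C00000 hashes as hash_64(0x1D2C0, 11), picking one
 * of 2048 buckets. Full stripe addresses are BTRFS_STRIPE_LEN (64K)
 * aligned, so the 16 bits shifted off carry no information anyway.
 */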

static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
				       unsigned int page_nr)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	ASSERT(page_nr < rbio->nr_pages);

	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page;
	     i++) {
		if (!rbio->stripe_sectors[i].uptodate)
			return false;
	}
	return true;
}

/*
 * Update the stripe_sectors[] array to use correct page and pgoff.
 *
 * Should be called every time any page pointer in stripe_pages[] is modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	u32 offset;
	int i;

	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
		int page_index = offset >> PAGE_SHIFT;

		ASSERT(page_index < rbio->nr_pages);
		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
	}
}

static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
{
	const u32 sectorsize = src->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	if (dest->stripe_pages[page_nr])
		__free_page(dest->stripe_pages[page_nr]);
	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
	src->stripe_pages[page_nr] = NULL;

	/* Also update the sector->uptodate bits. */
	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page; i++)
		dest->stripe_sectors[i].uptodate = true;
}

static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
{
	const int sector_nr = (page_nr << PAGE_SHIFT) >>
			      rbio->bioc->fs_info->sectorsize_bits;

	/*
	 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
	 * we won't have a page which is half data half parity.
	 *
	 * Thus if the first sector of the page belongs to data stripes, then
	 * the full page belongs to data stripes.
	 */
	return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
}

/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_sectors[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		struct page *p = src->stripe_pages[i];

		/*
		 * We don't need to steal P/Q pages as they will always be
		 * regenerated for RMW or full write anyway.
		 */
		if (!is_data_stripe_page(src, i))
			continue;

		/*
		 * If @src already has RBIO_CACHE_READY_BIT, it should have
		 * all data stripe pages present and uptodate.
		 */
		ASSERT(p);
		ASSERT(full_page_sectors_uptodate(src, i));
		steal_rbio_page(src, dest, i);
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}

/*
 * Merging means we take the bio_list from the victim and
 * splice it into the destination. The victim should
 * be discarded afterwards.
 *
 * Must be called with dest->rbio_list_lock held.
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
	bio_list_init(&victim->bio_list);
}

/*
 * Used to prune items that are in the cache. The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * Check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/*
	 * Hold the lock for the bucket because we may be
	 * removing it from the hash table.
	 */
	spin_lock(&h->lock);

	/*
	 * Hold the lock for the bio list because we need
	 * to make sure the bio list is empty.
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/*
		 * If the bio list isn't empty, this rbio is
		 * still involved in an IO. We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref.
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		free_raid_bio(rbio);
}

/*
 * Prune a given rbio from the cache.
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	__remove_rbio_from_cache(rbio);
	spin_unlock(&table->cache_lock);
}

/*
 * Remove everything in the cache.
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock(&table->cache_lock);
}

/*
 * Remove all cached entries and free the hash table.
 * Used by unmount.
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * Insert an rbio into the stripe cache. It
 * must have already been prepared by calling
 * cache_rbio_pages.
 *
 * If this rbio was already cached, it gets
 * moved to the front of the LRU.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	spin_lock(&rbio->bio_list_lock);

	/* Bump our ref if we were not in the list before. */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock(&table->cache_lock);
}

/*
 * Helper function to run the xor_blocks api. It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}
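
/*
 * For example, with MAX_XOR_BLOCKS == 4 (the current limit in
 * <linux/raid/xor.h>) and six sources, run_xor() calls xor_blocks()
 * twice: once over pages[0..3] and once over pages[4..5], accumulating
 * both rounds into the same dest buffer at pages[src_cnt].
 */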

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock(&rbio->bio_list_lock);
	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
		ret = 0;
	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
	spin_unlock(&rbio->bio_list_lock);

	return ret;
}

/*
 * Returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO.
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test.
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * We can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us. We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
		return 0;

	/* We can't merge with different operations. */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * A parity scrub needs to read the full stripe from the drive,
	 * check and repair the parity, and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD)
		return 0;

	return 1;
}

static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	ASSERT(stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr < rbio->stripe_nsectors);

	return stripe_nr * rbio->stripe_nsectors + sector_nr;
}

/* Return a sector from rbio->stripe_sectors, not from the bio list */
static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
							      sector_nr)];
}

/* Grab a sector inside P stripe */
static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}

/* Grab a sector inside Q stripe, return NULL if not RAID6 */
static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}
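
/*
 * The helpers above all index into one flat, stripe-major sector array:
 *
 *   [ data 0 | data 1 | ... | data nr_data-1 | P | Q (RAID6 only) ]
 *
 * with stripe_nsectors entries per segment, so stripe N's sector S lives
 * at index N * stripe_nsectors + S.
 */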

/*
 * The first stripe in the table for a logical address
 * has the lock. rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet. The rbio is given
 * the lock and 0 is returned. The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner. The rbio is freed and the IO will
 * start automatically along with the existing rbio. 1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list. When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned.
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission. If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock(&h->lock);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}

		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones. We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now.
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks.
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock(&h->lock);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		free_raid_bio(freeit);
	return ret;
}

static void recover_rbio_work_locked(struct work_struct *work);

/*
 * Called as rmw or parity rebuild is completed. If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started.
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock(&h->lock);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * If we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later.
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * We use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * Hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock(&h->lock);

			if (next->operation == BTRFS_RBIO_READ_REBUILD) {
				start_async_work(next, recover_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_rbio_work_locked);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * This frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them.
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;

	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bio
	 * for this rbio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}

/*
 * Get a sector pointer specified by its @stripe_nr and @sector_nr.
 *
 * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripes)
 * @sector_nr:          Sector number inside the stripe,
 *                      valid range [0, stripe_nsectors)
 * @bio_list_only:      Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_sectors as fallback.
 */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
{
	struct sector_ptr *sector;
	int index;

	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);

	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(index >= 0 && index < rbio->nr_sectors);

	spin_lock(&rbio->bio_list_lock);
	sector = &rbio->bio_sectors[index];
	if (sector->page || bio_list_only) {
		/* Don't return sector without a valid page pointer */
		if (!sector->page)
			sector = NULL;
		spin_unlock(&rbio->bio_list_lock);
		return sector;
	}
	spin_unlock(&rbio->bio_list_lock);

	return &rbio->stripe_sectors[index];
}

/*
 * Allocation and initial setup for the btrfs_raid_bio. Note that
 * this does not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors =
		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	struct btrfs_raid_bio *rbio;

	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
	/*
	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
	 * (at most 16) should be no larger than BITS_PER_LONG.
	 */
	ASSERT(stripe_nsectors <= BITS_PER_LONG);

	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);
	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
				     GFP_NOFS);
	rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
				    GFP_NOFS);
	rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
				       GFP_NOFS);
	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);

	if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
	    !rbio->finish_pointers || !rbio->error_bitmap) {
		free_raid_bio_pointers(rbio);
		kfree(rbio);
		return ERR_PTR(-ENOMEM);
	}

	bio_list_init(&rbio->bio_list);
	init_waitqueue_head(&rbio->io_wait);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	btrfs_get_bioc(bioc);
	rbio->bioc = bioc;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->stripes_pending, 0);

	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);

	return rbio;
}
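
/*
 * A worked example of the geometry above: with the fixed 64K
 * BTRFS_STRIPE_LEN, 4K pages and a 4K sectorsize, stripe_npages and
 * stripe_nsectors are both 16, so a RAID6 rbio over four data stripes
 * (real_stripes == 6) tracks 96 pages and 96 sectors in total.
 */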

/* Allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
	if (ret < 0)
		return ret;
	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}

/* Only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}

/*
 * Return the total number of errors found in the vertical stripe of @sector_nr.
 *
 * @faila and @failb will also be updated to the first and second stripe
 * number of the errors.
 */
static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
				     int *faila, int *failb)
{
	int stripe_nr;
	int found_errors = 0;

	if (faila || failb) {
		/*
		 * Both @faila and @failb should be valid pointers if any of
		 * them is specified.
		 */
		ASSERT(faila && failb);
		*faila = -1;
		*failb = -1;
	}

	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;

		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
			found_errors++;
			if (faila) {
				/* Update faila and failb. */
				if (*faila < 0)
					*faila = stripe_nr;
				else if (*failb < 0)
					*failb = stripe_nr;
			}
		}
	}
	return found_errors;
}
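
/*
 * Example: if the error bitmap marks stripes 1 and 4 of a given vertical
 * stripe, get_rbio_veritical_errors() returns 2 with *faila == 1 and
 * *failb == 4, which a RAID6 profile (bioc->max_errors == 2) can still
 * tolerate.
 */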

/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
 * Return <0 for error.
 */
static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list,
			      struct sector_ptr *sector,
			      unsigned int stripe_nr,
			      unsigned int sector_nr,
			      enum req_op op)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_io_stripe *stripe;
	u64 disk_start;

	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripes.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
	ASSERT(sector->page);

	stripe = &rbio->bioc->stripes[stripe_nr];
	disk_start = stripe->physical + sector_nr * sectorsize;

	/* If the device is missing, just fail this stripe. */
	if (!stripe->dev->bdev) {
		int found_errors;

		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
			rbio->error_bitmap);

		/* Check if we have reached tolerance early. */
		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 NULL, NULL);
		if (found_errors > rbio->bioc->max_errors)
			return -EIO;
		return 0;
	}

	/* See if we can add this page onto our existing bio. */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
		last_end += last->bi_iter.bi_size;

		/*
		 * We can't merge these if they are from different
		 * devices or if they are not contiguous.
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, sector->page, sectorsize,
					   sector->pgoff);
			if (ret == sectorsize)
				return 0;
		}
	}

	/* Put a new bio on the list. */
	bio = bio_alloc(stripe->dev->bdev,
			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
			op, GFP_NOFS);
	bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
	bio->bi_private = rbio;

	__bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
	bio_list_add(bio_list, bio);
	return 0;
}

static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec bvec;
	struct bvec_iter iter;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->full_stripe_logical;

	bio_for_each_segment(bvec, bio, iter) {
		u32 bvec_offset;

		for (bvec_offset = 0; bvec_offset < bvec.bv_len;
		     bvec_offset += sectorsize, offset += sectorsize) {
			int index = offset / sectorsize;
			struct sector_ptr *sector = &rbio->bio_sectors[index];

			sector->page = bvec.bv_page;
			sector->pgoff = bvec.bv_offset + bvec_offset;
			ASSERT(sector->pgoff < PAGE_SIZE);
		}
	}
}

/*
 * Helper function to walk our bio list and populate the bio_pages array with
 * the result. This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio.
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;

	spin_lock(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list)
		index_one_bio(rbio, bio);

	spin_unlock(&rbio->bio_list_lock);
}

static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
			       struct raid56_bio_trace_info *trace_info)
{
	const struct btrfs_io_context *bioc = rbio->bioc;
	int i;

	ASSERT(bioc);

	/* We rely on bio->bi_bdev to find the stripe number. */
	if (!bio->bi_bdev)
		goto not_found;

	for (i = 0; i < bioc->num_stripes; i++) {
		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
			continue;
		trace_info->stripe_nr = i;
		trace_info->devid = bioc->stripes[i].dev->devid;
		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
				     bioc->stripes[i].physical;
		return;
	}

not_found:
	trace_info->devid = -1;
	trace_info->offset = -1;
	trace_info->stripe_nr = -1;
}

static inline void bio_list_put(struct bio_list *bio_list)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bio_list)))
		bio_put(bio);
}

/* Generate PQ for one vertical stripe. */
static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
	void **pointers = rbio->finish_pointers;
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct sector_ptr *sector;
	int stripe;
	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;

	/* First collect one sector from each data stripe */
	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
		sector = sector_in_rbio(rbio, stripe, sectornr, 0);
		pointers[stripe] = kmap_local_page(sector->page) +
				   sector->pgoff;
	}

	/* Then add the parity stripe */
	sector = rbio_pstripe_sector(rbio, sectornr);
	sector->uptodate = 1;
	pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;

	if (has_qstripe) {
		/*
		 * RAID6, add the qstripe and call the library function
		 * to fill in our p/q
		 */
		sector = rbio_qstripe_sector(rbio, sectornr);
		sector->uptodate = 1;
		pointers[stripe++] = kmap_local_page(sector->page) +
				     sector->pgoff;

		raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
					pointers);
	} else {
		/* raid5 */
		memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
		run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
	}
	for (stripe = stripe - 1; stripe >= 0; stripe--)
		kunmap_local(pointers[stripe]);
}
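
/*
 * The parity math behind generate_pq_vertical() is the standard RAID5/6
 * scheme over the data sectors D0..Dn-1 of one vertical stripe:
 *
 *   P = D0 ^ D1 ^ ... ^ Dn-1
 *   Q = g^0*D0 ^ g^1*D1 ^ ... ^ g^(n-1)*Dn-1   (GF(2^8), generator g = {02})
 *
 * raid6_call.gen_syndrome() computes P and Q in one pass; the RAID5
 * branch only needs the XOR sum.
 */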
1225 | |
1226 | static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, |
1227 | struct bio_list *bio_list) |
1228 | { |
1229 | /* The total sector number inside the full stripe. */ |
1230 | int total_sector_nr; |
1231 | int sectornr; |
1232 | int stripe; |
1233 | int ret; |
1234 | |
1235 | ASSERT(bio_list_size(bio_list) == 0); |
1236 | |
1237 | /* We should have at least one data sector. */ |
1238 | ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); |
1239 | |
1240 | /* |
1241 | * Reset errors, as we may have errors inherited from from degraded |
1242 | * write. |
1243 | */ |
1244 | bitmap_clear(map: rbio->error_bitmap, start: 0, nbits: rbio->nr_sectors); |
1245 | |
1246 | /* |
1247 | * Start assembly. Make bios for everything from the higher layers (the |
1248 | * bio_list in our rbio) and our P/Q. Ignore everything else. |
1249 | */ |
1250 | for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; |
1251 | total_sector_nr++) { |
1252 | struct sector_ptr *sector; |
1253 | |
1254 | stripe = total_sector_nr / rbio->stripe_nsectors; |
1255 | sectornr = total_sector_nr % rbio->stripe_nsectors; |
1256 | |
1257 | /* This vertical stripe has no data, skip it. */ |
1258 | if (!test_bit(sectornr, &rbio->dbitmap)) |
1259 | continue; |
1260 | |
1261 | if (stripe < rbio->nr_data) { |
1262 | sector = sector_in_rbio(rbio, stripe_nr: stripe, sector_nr: sectornr, bio_list_only: 1); |
1263 | if (!sector) |
1264 | continue; |
1265 | } else { |
1266 | sector = rbio_stripe_sector(rbio, stripe_nr: stripe, sector_nr: sectornr); |
1267 | } |
1268 | |
1269 | ret = rbio_add_io_sector(rbio, bio_list, sector, stripe_nr: stripe, |
1270 | sector_nr: sectornr, op: REQ_OP_WRITE); |
1271 | if (ret) |
1272 | goto error; |
1273 | } |
1274 | |
1275 | if (likely(!rbio->bioc->replace_nr_stripes)) |
1276 | return 0; |
1277 | |
1278 | /* |
1279 | * Make a copy for the replace target device. |
1280 | * |
1281 | * Thus the source stripe number (in replace_stripe_src) should be valid. |
1282 | */ |
1283 | ASSERT(rbio->bioc->replace_stripe_src >= 0); |
1284 | |
1285 | for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; |
1286 | total_sector_nr++) { |
1287 | struct sector_ptr *sector; |
1288 | |
1289 | stripe = total_sector_nr / rbio->stripe_nsectors; |
1290 | sectornr = total_sector_nr % rbio->stripe_nsectors; |
1291 | |
1292 | /* |
1293 | * For RAID56, there is only one device that can be replaced, |
1294 | * and replace_stripe_src[0] indicates the stripe number we |
1295 | * need to copy from. |
1296 | */ |
1297 | if (stripe != rbio->bioc->replace_stripe_src) { |
1298 | /* |
1299 | * We can skip the whole stripe completely, note |
1300 | * total_sector_nr will be increased by one anyway. |
1301 | */ |
1302 | ASSERT(sectornr == 0); |
1303 | total_sector_nr += rbio->stripe_nsectors - 1; |
1304 | continue; |
1305 | } |
1306 | |
1307 | /* This vertical stripe has no data, skip it. */ |
1308 | if (!test_bit(sectornr, &rbio->dbitmap)) |
1309 | continue; |
1310 | |
1311 | if (stripe < rbio->nr_data) { |
1312 | sector = sector_in_rbio(rbio, stripe_nr: stripe, sector_nr: sectornr, bio_list_only: 1); |
1313 | if (!sector) |
1314 | continue; |
1315 | } else { |
1316 | sector = rbio_stripe_sector(rbio, stripe_nr: stripe, sector_nr: sectornr); |
1317 | } |
1318 | |
1319 | ret = rbio_add_io_sector(rbio, bio_list, sector, |
1320 | stripe_nr: rbio->real_stripes, |
1321 | sector_nr: sectornr, op: REQ_OP_WRITE); |
1322 | if (ret) |
1323 | goto error; |
1324 | } |
1325 | |
1326 | return 0; |
1327 | error: |
1328 | bio_list_put(bio_list); |
1329 | return -EIO; |
1330 | } |
1331 | |
1332 | static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) |
1333 | { |
1334 | struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; |
1335 | u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - |
1336 | rbio->bioc->full_stripe_logical; |
1337 | int total_nr_sector = offset >> fs_info->sectorsize_bits; |
1338 | |
1339 | ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); |
1340 | |
1341 | bitmap_set(map: rbio->error_bitmap, start: total_nr_sector, |
1342 | nbits: bio->bi_iter.bi_size >> fs_info->sectorsize_bits); |
1343 | |
1344 | /* |
1345 | * Special handling for raid56_alloc_missing_rbio() used by |
1346 | * scrub/replace. Unlike call path in raid56_parity_recover(), they |
1347 | * pass an empty bio here. Thus we have to find out the missing device |
1348 | * and mark the stripe error instead. |
1349 | */ |
1350 | if (bio->bi_iter.bi_size == 0) { |
1351 | bool found_missing = false; |
1352 | int stripe_nr; |
1353 | |
1354 | for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { |
1355 | if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { |
1356 | found_missing = true; |
1357 | bitmap_set(map: rbio->error_bitmap, |
1358 | start: stripe_nr * rbio->stripe_nsectors, |
1359 | nbits: rbio->stripe_nsectors); |
1360 | } |
1361 | } |
1362 | ASSERT(found_missing); |
1363 | } |
1364 | } |
1365 | |
1366 | /* |
1367 | * For subpage case, we can no longer set page Up-to-date directly for |
1368 | * stripe_pages[], thus we need to locate the sector. |
1369 | */ |
1370 | static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, |
1371 | struct page *page, |
1372 | unsigned int pgoff) |
1373 | { |
1374 | int i; |
1375 | |
1376 | for (i = 0; i < rbio->nr_sectors; i++) { |
1377 | struct sector_ptr *sector = &rbio->stripe_sectors[i]; |
1378 | |
1379 | if (sector->page == page && sector->pgoff == pgoff) |
1380 | return sector; |
1381 | } |
1382 | return NULL; |
1383 | } |
1384 | |
1385 | /* |
1386 | * this sets each page in the bio uptodate. It should only be used on private |
1387 | * rbio pages, nothing that comes in from the higher layers |
1388 | */ |
1389 | static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) |
1390 | { |
1391 | const u32 sectorsize = rbio->bioc->fs_info->sectorsize; |
1392 | struct bio_vec *bvec; |
1393 | struct bvec_iter_all iter_all; |
1394 | |
1395 | ASSERT(!bio_flagged(bio, BIO_CLONED)); |
1396 | |
1397 | bio_for_each_segment_all(bvec, bio, iter_all) { |
1398 | struct sector_ptr *sector; |
1399 | int pgoff; |
1400 | |
1401 | for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; |
1402 | pgoff += sectorsize) { |
1403 | sector = find_stripe_sector(rbio, page: bvec->bv_page, pgoff); |
1404 | ASSERT(sector); |
1405 | if (sector) |
1406 | sector->uptodate = 1; |
1407 | } |
1408 | } |
1409 | } |
1410 | |
1411 | static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) |
1412 | { |
1413 | struct bio_vec *bv = bio_first_bvec_all(bio); |
1414 | int i; |
1415 | |
1416 | for (i = 0; i < rbio->nr_sectors; i++) { |
1417 | struct sector_ptr *sector; |
1418 | |
1419 | sector = &rbio->stripe_sectors[i]; |
1420 | if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) |
1421 | break; |
1422 | sector = &rbio->bio_sectors[i]; |
1423 | if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) |
1424 | break; |
1425 | } |
1426 | ASSERT(i < rbio->nr_sectors); |
1427 | return i; |
1428 | } |
1429 | |
1430 | static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) |
1431 | { |
1432 | int total_sector_nr = get_bio_sector_nr(rbio, bio); |
1433 | u32 bio_size = 0; |
1434 | struct bio_vec *bvec; |
1435 | int i; |
1436 | |
1437 | bio_for_each_bvec_all(bvec, bio, i) |
1438 | bio_size += bvec->bv_len; |
1439 | |
1440 | /* |
1441 | * Since we can have multiple bios touching the error_bitmap, we cannot |
1442 | * call bitmap_set() without protection. |
1443 | * |
1444 | * Instead use set_bit() for each bit, as set_bit() itself is atomic. |
1445 | */ |
1446 | for (i = total_sector_nr; i < total_sector_nr + |
1447 | (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) |
1448 | set_bit(nr: i, addr: rbio->error_bitmap); |
1449 | } |
1450 | |
1451 | /* Verify the data sectors at read time. */ |
1452 | static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, |
1453 | struct bio *bio) |
1454 | { |
1455 | struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; |
1456 | int total_sector_nr = get_bio_sector_nr(rbio, bio); |
1457 | struct bio_vec *bvec; |
1458 | struct bvec_iter_all iter_all; |
1459 | |
1460 | /* No data csum for the whole stripe, no need to verify. */ |
1461 | if (!rbio->csum_bitmap || !rbio->csum_buf) |
1462 | return; |
1463 | |
1464 | /* P/Q stripes, they have no data csum to verify against. */ |
1465 | if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) |
1466 | return; |
1467 | |
1468 | bio_for_each_segment_all(bvec, bio, iter_all) { |
1469 | int bv_offset; |
1470 | |
1471 | for (bv_offset = bvec->bv_offset; |
1472 | bv_offset < bvec->bv_offset + bvec->bv_len; |
1473 | bv_offset += fs_info->sectorsize, total_sector_nr++) { |
1474 | u8 csum_buf[BTRFS_CSUM_SIZE]; |
1475 | u8 *expected_csum = rbio->csum_buf + |
1476 | total_sector_nr * fs_info->csum_size; |
1477 | int ret; |
1478 | |
1479 | /* No csum for this sector, skip to the next sector. */ |
1480 | if (!test_bit(total_sector_nr, rbio->csum_bitmap)) |
1481 | continue; |
1482 | |
1483 | ret = btrfs_check_sector_csum(fs_info, page: bvec->bv_page, |
1484 | pgoff: bv_offset, csum: csum_buf, csum_expected: expected_csum); |
1485 | if (ret < 0) |
1486 | set_bit(nr: total_sector_nr, addr: rbio->error_bitmap); |
1487 | } |
1488 | } |
1489 | } |
1490 | |
1491 | static void raid_wait_read_end_io(struct bio *bio) |
1492 | { |
1493 | struct btrfs_raid_bio *rbio = bio->bi_private; |
1494 | |
1495 | if (bio->bi_status) { |
1496 | rbio_update_error_bitmap(rbio, bio); |
1497 | } else { |
1498 | set_bio_pages_uptodate(rbio, bio); |
1499 | verify_bio_data_sectors(rbio, bio); |
1500 | } |
1501 | |
1502 | bio_put(bio); |
1503 | if (atomic_dec_and_test(v: &rbio->stripes_pending)) |
1504 | wake_up(&rbio->io_wait); |
1505 | } |
1506 | |
1507 | static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, |
1508 | struct bio_list *bio_list) |
1509 | { |
1510 | struct bio *bio; |
1511 | |
1512 | atomic_set(v: &rbio->stripes_pending, i: bio_list_size(bl: bio_list)); |
1513 | while ((bio = bio_list_pop(bl: bio_list))) { |
1514 | bio->bi_end_io = raid_wait_read_end_io; |
1515 | |
1516 | if (trace_raid56_read_enabled()) { |
1517 | struct raid56_bio_trace_info trace_info = { 0 }; |
1518 | |
1519 | bio_get_trace_info(rbio, bio, trace_info: &trace_info); |
1520 | trace_raid56_read(rbio, bio, trace_info: &trace_info); |
1521 | } |
1522 | submit_bio(bio); |
1523 | } |
1524 | |
1525 | wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); |
1526 | } |
1527 | |
1528 | static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) |
1529 | { |
1530 | const int data_pages = rbio->nr_data * rbio->stripe_npages; |
1531 | int ret; |
1532 | |
1533 | ret = btrfs_alloc_page_array(nr_pages: data_pages, page_array: rbio->stripe_pages); |
1534 | if (ret < 0) |
1535 | return ret; |
1536 | |
1537 | index_stripe_sectors(rbio); |
1538 | return 0; |
1539 | } |
1540 | |
1541 | /* |
1542 | * We use plugging call backs to collect full stripes. |
1543 | * Any time we get a partial stripe write while plugged |
1544 | * we collect it into a list. When the unplug comes down, |
1545 | * we sort the list by logical block number and merge |
1546 | * everything we can into the same rbios |
1547 | */ |
1548 | struct btrfs_plug_cb { |
1549 | struct blk_plug_cb cb; |
1550 | struct btrfs_fs_info *info; |
1551 | struct list_head rbio_list; |
1552 | struct work_struct work; |
1553 | }; |
1554 | |
1555 | /* |
1556 | * rbios on the plug list are sorted for easier merging. |
1557 | */ |
1558 | static int plug_cmp(void *priv, const struct list_head *a, |
1559 | const struct list_head *b) |
1560 | { |
1561 | const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, |
1562 | plug_list); |
1563 | const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, |
1564 | plug_list); |
1565 | u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; |
1566 | u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; |
1567 | |
1568 | if (a_sector < b_sector) |
1569 | return -1; |
1570 | if (a_sector > b_sector) |
1571 | return 1; |
1572 | return 0; |
1573 | } |
1574 | |
1575 | static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) |
1576 | { |
1577 | struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb); |
1578 | struct btrfs_raid_bio *cur; |
1579 | struct btrfs_raid_bio *last = NULL; |
1580 | |
1581 | list_sort(NULL, head: &plug->rbio_list, cmp: plug_cmp); |
1582 | |
1583 | while (!list_empty(head: &plug->rbio_list)) { |
1584 | cur = list_entry(plug->rbio_list.next, |
1585 | struct btrfs_raid_bio, plug_list); |
1586 | list_del_init(entry: &cur->plug_list); |
1587 | |
1588 | if (rbio_is_full(rbio: cur)) { |
1589 | /* We have a full stripe, queue it down. */ |
1590 | start_async_work(rbio: cur, work_func: rmw_rbio_work); |
1591 | continue; |
1592 | } |
1593 | if (last) { |
1594 | if (rbio_can_merge(last, cur)) { |
1595 | merge_rbio(dest: last, victim: cur); |
1596 | free_raid_bio(rbio: cur); |
1597 | continue; |
1598 | } |
1599 | start_async_work(rbio: last, work_func: rmw_rbio_work); |
1600 | } |
1601 | last = cur; |
1602 | } |
1603 | if (last) |
1604 | start_async_work(rbio: last, work_func: rmw_rbio_work); |
1605 | kfree(objp: plug); |
1606 | } |
1607 | |
1608 | /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ |
1609 | static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) |
1610 | { |
1611 | const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; |
1612 | const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; |
1613 | const u64 full_stripe_start = rbio->bioc->full_stripe_logical; |
1614 | const u32 orig_len = orig_bio->bi_iter.bi_size; |
1615 | const u32 sectorsize = fs_info->sectorsize; |
1616 | u64 cur_logical; |
1617 | |
1618 | ASSERT(orig_logical >= full_stripe_start && |
1619 | orig_logical + orig_len <= full_stripe_start + |
1620 | rbio->nr_data * BTRFS_STRIPE_LEN); |
1621 | |
1622 | bio_list_add(bl: &rbio->bio_list, bio: orig_bio); |
1623 | rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; |
1624 | |
1625 | /* Update the dbitmap. */ |
1626 | for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; |
1627 | cur_logical += sectorsize) { |
1628 | int bit = ((u32)(cur_logical - full_stripe_start) >> |
1629 | fs_info->sectorsize_bits) % rbio->stripe_nsectors; |
1630 | |
1631 | set_bit(nr: bit, addr: &rbio->dbitmap); |
1632 | } |
1633 | } |
1634 | |
1635 | /* |
1636 | * our main entry point for writes from the rest of the FS. |
1637 | */ |
1638 | void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) |
1639 | { |
1640 | struct btrfs_fs_info *fs_info = bioc->fs_info; |
1641 | struct btrfs_raid_bio *rbio; |
1642 | struct btrfs_plug_cb *plug = NULL; |
1643 | struct blk_plug_cb *cb; |
1644 | |
1645 | rbio = alloc_rbio(fs_info, bioc); |
	if (IS_ERR(rbio)) {
		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
		bio_endio(bio);
		return;
	}
	rbio->operation = BTRFS_RBIO_WRITE;
	rbio_add_bio(rbio, bio);
1653 | |
1654 | /* |
1655 | * Don't plug on full rbios, just get them out the door |
1656 | * as quickly as we can |
1657 | */ |
	if (!rbio_is_full(rbio)) {
		cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
		if (cb) {
			plug = container_of(cb, struct btrfs_plug_cb, cb);
			if (!plug->info) {
				plug->info = fs_info;
				INIT_LIST_HEAD(&plug->rbio_list);
			}
			list_add_tail(&rbio->plug_list, &plug->rbio_list);
			return;
		}
	}
1669 | } |
1670 | |
1671 | /* |
1672 | * Either we don't have any existing plug, or we're doing a full stripe, |
1673 | * queue the rmw work now. |
1674 | */ |
	start_async_work(rbio, rmw_rbio_work);
1676 | } |
1677 | |
1678 | static int verify_one_sector(struct btrfs_raid_bio *rbio, |
1679 | int stripe_nr, int sector_nr) |
1680 | { |
1681 | struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; |
1682 | struct sector_ptr *sector; |
1683 | u8 csum_buf[BTRFS_CSUM_SIZE]; |
1684 | u8 *csum_expected; |
1685 | int ret; |
1686 | |
1687 | if (!rbio->csum_bitmap || !rbio->csum_buf) |
1688 | return 0; |
1689 | |
1690 | /* No way to verify P/Q as they are not covered by data csum. */ |
1691 | if (stripe_nr >= rbio->nr_data) |
1692 | return 0; |
1693 | /* |
1694 | * If we're rebuilding a read, we have to use pages from the |
1695 | * bio list if possible. |
1696 | */ |
1697 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { |
		sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1699 | } else { |
1700 | sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); |
1701 | } |
1702 | |
1703 | ASSERT(sector->page); |
1704 | |
1705 | csum_expected = rbio->csum_buf + |
1706 | (stripe_nr * rbio->stripe_nsectors + sector_nr) * |
1707 | fs_info->csum_size; |
	ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
				      csum_buf, csum_expected);
1710 | return ret; |
1711 | } |
1712 | |
1713 | /* |
1714 | * Recover a vertical stripe specified by @sector_nr. |
1715 | * @*pointers are the pre-allocated pointers by the caller, so we don't |
1716 | * need to allocate/free the pointers again and again. |
1717 | */ |
1718 | static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, |
1719 | void **pointers, void **unmap_array) |
1720 | { |
1721 | struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; |
1722 | struct sector_ptr *sector; |
1723 | const u32 sectorsize = fs_info->sectorsize; |
1724 | int found_errors; |
1725 | int faila; |
1726 | int failb; |
1727 | int stripe_nr; |
1728 | int ret = 0; |
1729 | |
1730 | /* |
1731 | * Now we just use bitmap to mark the horizontal stripes in |
1732 | * which we have data when doing parity scrub. |
1733 | */ |
1734 | if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && |
1735 | !test_bit(sector_nr, &rbio->dbitmap)) |
1736 | return 0; |
1737 | |
	found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
						 &failb);
	/*
	 * No errors in the vertical stripe, skip it. This can happen for
	 * recovery in which only part of a stripe failed the csum check.
	 */
1744 | if (!found_errors) |
1745 | return 0; |
1746 | |
1747 | if (found_errors > rbio->bioc->max_errors) |
1748 | return -EIO; |
1749 | |
1750 | /* |
1751 | * Setup our array of pointers with sectors from each stripe |
1752 | * |
1753 | * NOTE: store a duplicate array of pointers to preserve the |
1754 | * pointer order. |
1755 | */ |
1756 | for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { |
1757 | /* |
1758 | * If we're rebuilding a read, we have to use pages from the |
1759 | * bio list if possible. |
1760 | */ |
1761 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { |
			sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1763 | } else { |
1764 | sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); |
1765 | } |
1766 | ASSERT(sector->page); |
		pointers[stripe_nr] = kmap_local_page(sector->page) +
				      sector->pgoff;
1769 | unmap_array[stripe_nr] = pointers[stripe_nr]; |
1770 | } |
1771 | |
1772 | /* All raid6 handling here */ |
1773 | if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { |
1774 | /* Single failure, rebuild from parity raid5 style */ |
1775 | if (failb < 0) { |
1776 | if (faila == rbio->nr_data) |
1777 | /* |
1778 | * Just the P stripe has failed, without |
1779 | * a bad data or Q stripe. |
1780 | * We have nothing to do, just skip the |
1781 | * recovery for this stripe. |
1782 | */ |
1783 | goto cleanup; |
1784 | /* |
1785 | * a single failure in raid6 is rebuilt |
1786 | * in the pstripe code below |
1787 | */ |
1788 | goto pstripe; |
1789 | } |
1790 | |
1791 | /* |
1792 | * If the q stripe is failed, do a pstripe reconstruction from |
1793 | * the xors. |
1794 | * If both the q stripe and the P stripe are failed, we're |
1795 | * here due to a crc mismatch and we can't give them the |
1796 | * data they want. |
1797 | */ |
1798 | if (failb == rbio->real_stripes - 1) { |
1799 | if (faila == rbio->real_stripes - 2) |
1800 | /* |
1801 | * Only P and Q are corrupted. |
1802 | * We only care about data stripes recovery, |
1803 | * can skip this vertical stripe. |
1804 | */ |
1805 | goto cleanup; |
1806 | /* |
1807 | * Otherwise we have one bad data stripe and |
1808 | * a good P stripe. raid5! |
1809 | */ |
1810 | goto pstripe; |
1811 | } |
1812 | |
1813 | if (failb == rbio->real_stripes - 2) { |
1814 | raid6_datap_recov(rbio->real_stripes, sectorsize, |
1815 | faila, pointers); |
1816 | } else { |
1817 | raid6_2data_recov(rbio->real_stripes, sectorsize, |
1818 | faila, failb, pointers); |
1819 | } |
1820 | } else { |
1821 | void *p; |
1822 | |
1823 | /* Rebuild from P stripe here (raid5 or raid6). */ |
1824 | ASSERT(failb == -1); |
1825 | pstripe: |
1826 | /* Copy parity block into failed block to start with */ |
1827 | memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); |
1828 | |
1829 | /* Rearrange the pointer array */ |
1830 | p = pointers[faila]; |
1831 | for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; |
1832 | stripe_nr++) |
1833 | pointers[stripe_nr] = pointers[stripe_nr + 1]; |
1834 | pointers[rbio->nr_data - 1] = p; |
1835 | |
1836 | /* Xor in the rest */ |
		run_xor(pointers, rbio->nr_data - 1, sectorsize);
	}
1840 | |
1841 | /* |
1842 | * No matter if this is a RMW or recovery, we should have all |
1843 | * failed sectors repaired in the vertical stripe, thus they are now |
1844 | * uptodate. |
1845 | * Especially if we determine to cache the rbio, we need to |
1846 | * have at least all data sectors uptodate. |
1847 | * |
1848 | * If possible, also check if the repaired sector matches its data |
1849 | * checksum. |
1850 | */ |
	if (faila >= 0) {
		ret = verify_one_sector(rbio, faila, sector_nr);
		if (ret < 0)
			goto cleanup;

		sector = rbio_stripe_sector(rbio, faila, sector_nr);
		sector->uptodate = 1;
	}
	if (failb >= 0) {
		ret = verify_one_sector(rbio, failb, sector_nr);
		if (ret < 0)
			goto cleanup;

		sector = rbio_stripe_sector(rbio, failb, sector_nr);
		sector->uptodate = 1;
	}
1867 | |
1868 | cleanup: |
1869 | for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) |
1870 | kunmap_local(unmap_array[stripe_nr]); |
1871 | return ret; |
1872 | } |
1873 | |
1874 | static int recover_sectors(struct btrfs_raid_bio *rbio) |
1875 | { |
1876 | void **pointers = NULL; |
1877 | void **unmap_array = NULL; |
1878 | int sectornr; |
1879 | int ret = 0; |
1880 | |
1881 | /* |
1882 | * @pointers array stores the pointer for each sector. |
1883 | * |
1884 | * @unmap_array stores copy of pointers that does not get reordered |
1885 | * during reconstruction so that kunmap_local works. |
1886 | */ |
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1889 | if (!pointers || !unmap_array) { |
1890 | ret = -ENOMEM; |
1891 | goto out; |
1892 | } |
1893 | |
1894 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { |
		spin_lock(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock(&rbio->bio_list_lock);
1898 | } |
1899 | |
1900 | index_rbio_pages(rbio); |
1901 | |
1902 | for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { |
		ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
1904 | if (ret < 0) |
1905 | break; |
1906 | } |
1907 | |
1908 | out: |
	kfree(pointers);
	kfree(unmap_array);
1911 | return ret; |
1912 | } |
1913 | |
1914 | static void recover_rbio(struct btrfs_raid_bio *rbio) |
1915 | { |
1916 | struct bio_list bio_list = BIO_EMPTY_LIST; |
1917 | int total_sector_nr; |
1918 | int ret = 0; |
1919 | |
1920 | /* |
1921 | * Either we're doing recover for a read failure or degraded write, |
1922 | * caller should have set error bitmap correctly. |
1923 | */ |
1924 | ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); |
1925 | |
1926 | /* For recovery, we need to read all sectors including P/Q. */ |
1927 | ret = alloc_rbio_pages(rbio); |
1928 | if (ret < 0) |
1929 | goto out; |
1930 | |
1931 | index_rbio_pages(rbio); |
1932 | |
1933 | /* |
1934 | * Read everything that hasn't failed. However this time we will |
1935 | * not trust any cached sector. |
1936 | * As we may read out some stale data but higher layer is not reading |
1937 | * that stale part. |
1938 | * |
1939 | * So here we always re-read everything in recovery path. |
1940 | */ |
1941 | for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; |
1942 | total_sector_nr++) { |
1943 | int stripe = total_sector_nr / rbio->stripe_nsectors; |
1944 | int sectornr = total_sector_nr % rbio->stripe_nsectors; |
1945 | struct sector_ptr *sector; |
1946 | |
1947 | /* |
1948 | * Skip the range which has error. It can be a range which is |
1949 | * marked error (for csum mismatch), or it can be a missing |
1950 | * device. |
1951 | */ |
1952 | if (!rbio->bioc->stripes[stripe].dev->bdev || |
1953 | test_bit(total_sector_nr, rbio->error_bitmap)) { |
1954 | /* |
1955 | * Also set the error bit for missing device, which |
1956 | * may not yet have its error bit set. |
1957 | */ |
			set_bit(total_sector_nr, rbio->error_bitmap);
1959 | continue; |
1960 | } |
1961 | |
		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
1965 | if (ret < 0) { |
			bio_list_put(&bio_list);
1967 | goto out; |
1968 | } |
1969 | } |
1970 | |
	submit_read_wait_bio_list(rbio, &bio_list);
	ret = recover_sectors(rbio);
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
1975 | } |
1976 | |
1977 | static void recover_rbio_work(struct work_struct *work) |
1978 | { |
1979 | struct btrfs_raid_bio *rbio; |
1980 | |
1981 | rbio = container_of(work, struct btrfs_raid_bio, work); |
1982 | if (!lock_stripe_add(rbio)) |
1983 | recover_rbio(rbio); |
1984 | } |
1985 | |
1986 | static void recover_rbio_work_locked(struct work_struct *work) |
1987 | { |
1988 | recover_rbio(container_of(work, struct btrfs_raid_bio, work)); |
1989 | } |
1990 | |
static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
1992 | { |
1993 | bool found = false; |
1994 | int sector_nr; |
1995 | |
1996 | /* |
1997 | * This is for RAID6 extra recovery tries, thus mirror number should |
1998 | * be large than 2. |
1999 | * Mirror 1 means read from data stripes. Mirror 2 means rebuild using |
2000 | * RAID5 methods. |
2001 | */ |
2002 | ASSERT(mirror_num > 2); |
2003 | for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { |
2004 | int found_errors; |
2005 | int faila; |
2006 | int failb; |
2007 | |
		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
2010 | /* This vertical stripe doesn't have errors. */ |
2011 | if (!found_errors) |
2012 | continue; |
2013 | |
2014 | /* |
2015 | * If we found errors, there should be only one error marked |
2016 | * by previous set_rbio_range_error(). |
2017 | */ |
2018 | ASSERT(found_errors == 1); |
2019 | found = true; |
2020 | |
2021 | /* Now select another stripe to mark as error. */ |
2022 | failb = rbio->real_stripes - (mirror_num - 1); |
2023 | if (failb <= faila) |
2024 | failb--; |
2025 | |
2026 | /* Set the extra bit in error bitmap. */ |
2027 | if (failb >= 0) |
			set_bit(failb * rbio->stripe_nsectors + sector_nr,
				rbio->error_bitmap);
2030 | } |
2031 | |
	/* We should have found at least one vertical stripe with errors. */
2033 | ASSERT(found); |
2034 | } |
2035 | |
2036 | /* |
2037 | * the main entry point for reads from the higher layers. This |
2038 | * is really only called when the normal read path had a failure, |
2039 | * so we assume the bio they send down corresponds to a failed part |
2040 | * of the drive. |
2041 | */ |
2042 | void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, |
2043 | int mirror_num) |
2044 | { |
2045 | struct btrfs_fs_info *fs_info = bioc->fs_info; |
2046 | struct btrfs_raid_bio *rbio; |
2047 | |
2048 | rbio = alloc_rbio(fs_info, bioc); |
	if (IS_ERR(rbio)) {
		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
		bio_endio(bio);
2052 | return; |
2053 | } |
2054 | |
2055 | rbio->operation = BTRFS_RBIO_READ_REBUILD; |
	rbio_add_bio(rbio, bio);
2057 | |
2058 | set_rbio_range_error(rbio, bio); |
2059 | |
2060 | /* |
2061 | * Loop retry: |
2062 | * for 'mirror == 2', reconstruct from all other stripes. |
2063 | * for 'mirror_num > 2', select a stripe to fail on every retry. |
2064 | */ |
2065 | if (mirror_num > 2) |
2066 | set_rbio_raid6_extra_error(rbio, mirror_num); |
2067 | |
	start_async_work(rbio, recover_rbio_work);
2069 | } |
2070 | |
2071 | static void fill_data_csums(struct btrfs_raid_bio *rbio) |
2072 | { |
2073 | struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; |
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
					rbio->bioc->full_stripe_logical);
2076 | const u64 start = rbio->bioc->full_stripe_logical; |
2077 | const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << |
2078 | fs_info->sectorsize_bits; |
2079 | int ret; |
2080 | |
2081 | /* The rbio should not have its csum buffer initialized. */ |
2082 | ASSERT(!rbio->csum_buf && !rbio->csum_bitmap); |
2083 | |
2084 | /* |
2085 | * Skip the csum search if: |
2086 | * |
2087 | * - The rbio doesn't belong to data block groups |
2088 | * Then we are doing IO for tree blocks, no need to search csums. |
2089 | * |
2090 | * - The rbio belongs to mixed block groups |
2091 | * This is to avoid deadlock, as we're already holding the full |
2092 | * stripe lock, if we trigger a metadata read, and it needs to do |
2093 | * raid56 recovery, we will deadlock. |
2094 | */ |
2095 | if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) || |
2096 | rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA) |
2097 | return; |
2098 | |
	rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
				 fs_info->csum_size, GFP_NOFS);
	rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
					  GFP_NOFS);
2103 | if (!rbio->csum_buf || !rbio->csum_bitmap) { |
2104 | ret = -ENOMEM; |
2105 | goto error; |
2106 | } |
2107 | |
	ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
					rbio->csum_buf, rbio->csum_bitmap);
2110 | if (ret < 0) |
2111 | goto error; |
	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
2113 | goto no_csum; |
2114 | return; |
2115 | |
2116 | error: |
2117 | /* |
2118 | * We failed to allocate memory or grab the csum, but it's not fatal, |
2119 | * we can still continue. But better to warn users that RMW is no |
2120 | * longer safe for this particular sub-stripe write. |
2121 | */ |
	btrfs_warn_rl(fs_info,
"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
		      rbio->bioc->full_stripe_logical, ret);
no_csum:
	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
2128 | rbio->csum_buf = NULL; |
2129 | rbio->csum_bitmap = NULL; |
2130 | } |
2131 | |
2132 | static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) |
2133 | { |
2134 | struct bio_list bio_list = BIO_EMPTY_LIST; |
2135 | int total_sector_nr; |
2136 | int ret = 0; |
2137 | |
2138 | /* |
2139 | * Fill the data csums we need for data verification. We need to fill |
2140 | * the csum_bitmap/csum_buf first, as our endio function will try to |
2141 | * verify the data sectors. |
2142 | */ |
2143 | fill_data_csums(rbio); |
2144 | |
2145 | /* |
2146 | * Build a list of bios to read all sectors (including data and P/Q). |
2147 | * |
2148 | * This behavior is to compensate the later csum verification and recovery. |
2149 | */ |
2150 | for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; |
2151 | total_sector_nr++) { |
2152 | struct sector_ptr *sector; |
2153 | int stripe = total_sector_nr / rbio->stripe_nsectors; |
2154 | int sectornr = total_sector_nr % rbio->stripe_nsectors; |
2155 | |
		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 stripe, sectornr, REQ_OP_READ);
2159 | if (ret) { |
			bio_list_put(&bio_list);
2161 | return ret; |
2162 | } |
2163 | } |
2164 | |
2165 | /* |
2166 | * We may or may not have any corrupted sectors (including missing dev |
2167 | * and csum mismatch), just let recover_sectors() to handle them all. |
2168 | */ |
2169 | submit_read_wait_bio_list(rbio, bio_list: &bio_list); |
2170 | return recover_sectors(rbio); |
2171 | } |
2172 | |
2173 | static void raid_wait_write_end_io(struct bio *bio) |
2174 | { |
2175 | struct btrfs_raid_bio *rbio = bio->bi_private; |
2176 | blk_status_t err = bio->bi_status; |
2177 | |
2178 | if (err) |
2179 | rbio_update_error_bitmap(rbio, bio); |
2180 | bio_put(bio); |
	if (atomic_dec_and_test(&rbio->stripes_pending))
2182 | wake_up(&rbio->io_wait); |
2183 | } |
2184 | |
2185 | static void submit_write_bios(struct btrfs_raid_bio *rbio, |
2186 | struct bio_list *bio_list) |
2187 | { |
2188 | struct bio *bio; |
2189 | |
	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
	while ((bio = bio_list_pop(bio_list))) {
2192 | bio->bi_end_io = raid_wait_write_end_io; |
2193 | |
2194 | if (trace_raid56_write_enabled()) { |
2195 | struct raid56_bio_trace_info trace_info = { 0 }; |
2196 | |
			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_write(rbio, bio, &trace_info);
2199 | } |
2200 | submit_bio(bio); |
2201 | } |
2202 | } |
2203 | |
2204 | /* |
2205 | * To determine if we need to read any sector from the disk. |
2206 | * Should only be utilized in RMW path, to skip cached rbio. |
2207 | */ |
2208 | static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) |
2209 | { |
2210 | int i; |
2211 | |
2212 | for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { |
2213 | struct sector_ptr *sector = &rbio->stripe_sectors[i]; |
2214 | |
2215 | /* |
2216 | * We have a sector which doesn't have page nor uptodate, |
2217 | * thus this rbio can not be cached one, as cached one must |
2218 | * have all its data sectors present and uptodate. |
2219 | */ |
2220 | if (!sector->page || !sector->uptodate) |
2221 | return true; |
2222 | } |
2223 | return false; |
2224 | } |
2225 | |
2226 | static void rmw_rbio(struct btrfs_raid_bio *rbio) |
2227 | { |
2228 | struct bio_list bio_list; |
2229 | int sectornr; |
2230 | int ret = 0; |
2231 | |
2232 | /* |
2233 | * Allocate the pages for parity first, as P/Q pages will always be |
2234 | * needed for both full-stripe and sub-stripe writes. |
2235 | */ |
2236 | ret = alloc_rbio_parity_pages(rbio); |
2237 | if (ret < 0) |
2238 | goto out; |
2239 | |
2240 | /* |
2241 | * Either full stripe write, or we have every data sector already |
2242 | * cached, can go to write path immediately. |
2243 | */ |
2244 | if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { |
2245 | /* |
2246 | * Now we're doing sub-stripe write, also need all data stripes |
2247 | * to do the full RMW. |
2248 | */ |
2249 | ret = alloc_rbio_data_pages(rbio); |
2250 | if (ret < 0) |
2251 | goto out; |
2252 | |
2253 | index_rbio_pages(rbio); |
2254 | |
2255 | ret = rmw_read_wait_recover(rbio); |
2256 | if (ret < 0) |
2257 | goto out; |
2258 | } |
2259 | |
2260 | /* |
2261 | * At this stage we're not allowed to add any new bios to the |
2262 | * bio list any more, anyone else that wants to change this stripe |
2263 | * needs to do their own rmw. |
2264 | */ |
	spin_lock(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock(&rbio->bio_list_lock);
2268 | |
	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2270 | |
2271 | index_rbio_pages(rbio); |
2272 | |
2273 | /* |
2274 | * We don't cache full rbios because we're assuming |
2275 | * the higher layers are unlikely to use this area of |
2276 | * the disk again soon. If they do use it again, |
2277 | * hopefully they will send another full bio. |
2278 | */ |
2279 | if (!rbio_is_full(rbio)) |
2280 | cache_rbio_pages(rbio); |
2281 | else |
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2283 | |
2284 | for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) |
2285 | generate_pq_vertical(rbio, sectornr); |
2286 | |
	bio_list_init(&bio_list);
	ret = rmw_assemble_write_bios(rbio, &bio_list);
2289 | if (ret < 0) |
2290 | goto out; |
2291 | |
2292 | /* We should have at least one bio assembled. */ |
2293 | ASSERT(bio_list_size(&bio_list)); |
	submit_write_bios(rbio, &bio_list);
2295 | wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); |
2296 | |
2297 | /* We may have more errors than our tolerance during the read. */ |
2298 | for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { |
2299 | int found_errors; |
2300 | |
		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
2302 | if (found_errors > rbio->bioc->max_errors) { |
2303 | ret = -EIO; |
2304 | break; |
2305 | } |
2306 | } |
2307 | out: |
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2309 | } |
2310 | |
2311 | static void rmw_rbio_work(struct work_struct *work) |
2312 | { |
2313 | struct btrfs_raid_bio *rbio; |
2314 | |
2315 | rbio = container_of(work, struct btrfs_raid_bio, work); |
2316 | if (lock_stripe_add(rbio) == 0) |
2317 | rmw_rbio(rbio); |
2318 | } |
2319 | |
2320 | static void rmw_rbio_work_locked(struct work_struct *work) |
2321 | { |
2322 | rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); |
2323 | } |
2324 | |
2325 | /* |
2326 | * The following code is used to scrub/replace the parity stripe |
2327 | * |
2328 | * Caller must have already increased bio_counter for getting @bioc. |
2329 | * |
 * Note: We need to make sure that all the pages added into the scrub/replace
 * raid bio are correct and are not changed during the scrub/replace. That
 * is, those pages just hold metadata or file data with checksum.
2333 | */ |
2334 | |
2335 | struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, |
2336 | struct btrfs_io_context *bioc, |
2337 | struct btrfs_device *scrub_dev, |
2338 | unsigned long *dbitmap, int stripe_nsectors) |
2339 | { |
2340 | struct btrfs_fs_info *fs_info = bioc->fs_info; |
2341 | struct btrfs_raid_bio *rbio; |
2342 | int i; |
2343 | |
2344 | rbio = alloc_rbio(fs_info, bioc); |
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
	 */
2352 | ASSERT(!bio->bi_iter.bi_size); |
2353 | rbio->operation = BTRFS_RBIO_PARITY_SCRUB; |
2354 | |
2355 | /* |
2356 | * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted |
2357 | * to the end position, so this search can start from the first parity |
2358 | * stripe. |
2359 | */ |
2360 | for (i = rbio->nr_data; i < rbio->real_stripes; i++) { |
2361 | if (bioc->stripes[i].dev == scrub_dev) { |
2362 | rbio->scrubp = i; |
2363 | break; |
2364 | } |
2365 | } |
2366 | ASSERT(i < rbio->real_stripes); |
2367 | |
	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
2369 | return rbio; |
2370 | } |
2371 | |
2372 | /* |
2373 | * We just scrub the parity that we have correct data on the same horizontal, |
2374 | * so we needn't allocate all pages for all the stripes. |
2375 | */ |
2376 | static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) |
2377 | { |
2378 | const u32 sectorsize = rbio->bioc->fs_info->sectorsize; |
2379 | int total_sector_nr; |
2380 | |
2381 | for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; |
2382 | total_sector_nr++) { |
2383 | struct page *page; |
2384 | int sectornr = total_sector_nr % rbio->stripe_nsectors; |
2385 | int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT; |
2386 | |
2387 | if (!test_bit(sectornr, &rbio->dbitmap)) |
2388 | continue; |
2389 | if (rbio->stripe_pages[index]) |
2390 | continue; |
2391 | page = alloc_page(GFP_NOFS); |
2392 | if (!page) |
2393 | return -ENOMEM; |
2394 | rbio->stripe_pages[index] = page; |
2395 | } |
2396 | index_stripe_sectors(rbio); |
2397 | return 0; |
2398 | } |
2399 | |
2400 | static int finish_parity_scrub(struct btrfs_raid_bio *rbio) |
2401 | { |
2402 | struct btrfs_io_context *bioc = rbio->bioc; |
2403 | const u32 sectorsize = bioc->fs_info->sectorsize; |
2404 | void **pointers = rbio->finish_pointers; |
2405 | unsigned long *pbitmap = &rbio->finish_pbitmap; |
2406 | int nr_data = rbio->nr_data; |
2407 | int stripe; |
2408 | int sectornr; |
2409 | bool has_qstripe; |
2410 | struct sector_ptr p_sector = { 0 }; |
2411 | struct sector_ptr q_sector = { 0 }; |
2412 | struct bio_list bio_list; |
2413 | int is_replace = 0; |
2414 | int ret; |
2415 | |
	bio_list_init(&bio_list);
2417 | |
2418 | if (rbio->real_stripes - rbio->nr_data == 1) |
2419 | has_qstripe = false; |
2420 | else if (rbio->real_stripes - rbio->nr_data == 2) |
2421 | has_qstripe = true; |
2422 | else |
2423 | BUG(); |
2424 | |
2425 | /* |
2426 | * Replace is running and our P/Q stripe is being replaced, then we |
2427 | * need to duplicate the final write to replace target. |
2428 | */ |
2429 | if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) { |
2430 | is_replace = 1; |
2431 | bitmap_copy(dst: pbitmap, src: &rbio->dbitmap, nbits: rbio->stripe_nsectors); |
2432 | } |
2433 | |
2434 | /* |
2435 | * Because the higher layers(scrubber) are unlikely to |
2436 | * use this area of the disk again soon, so don't cache |
2437 | * it. |
2438 | */ |
2439 | clear_bit(RBIO_CACHE_READY_BIT, addr: &rbio->flags); |
2440 | |
2441 | p_sector.page = alloc_page(GFP_NOFS); |
2442 | if (!p_sector.page) |
2443 | return -ENOMEM; |
2444 | p_sector.pgoff = 0; |
2445 | p_sector.uptodate = 1; |
2446 | |
2447 | if (has_qstripe) { |
2448 | /* RAID6, allocate and map temp space for the Q stripe */ |
2449 | q_sector.page = alloc_page(GFP_NOFS); |
2450 | if (!q_sector.page) { |
2451 | __free_page(p_sector.page); |
2452 | p_sector.page = NULL; |
2453 | return -ENOMEM; |
2454 | } |
2455 | q_sector.pgoff = 0; |
2456 | q_sector.uptodate = 1; |
		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
2458 | } |
2459 | |
	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	/* Map the parity stripe just once */
	pointers[nr_data] = kmap_local_page(p_sector.page);
2464 | |
2465 | for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { |
2466 | struct sector_ptr *sector; |
2467 | void *parity; |
2468 | |
2469 | /* first collect one page from each data stripe */ |
2470 | for (stripe = 0; stripe < nr_data; stripe++) { |
			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
			pointers[stripe] = kmap_local_page(sector->page) +
					   sector->pgoff;
2474 | } |
2475 | |
2476 | if (has_qstripe) { |
2477 | /* RAID6, call the library function to fill in our P/Q */ |
2478 | raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, |
2479 | pointers); |
2480 | } else { |
2481 | /* raid5 */ |
2482 | memcpy(pointers[nr_data], pointers[0], sectorsize); |
			run_xor(pointers + 1, nr_data - 1, sectorsize);
2484 | } |
2485 | |
2486 | /* Check scrubbing parity and repair it */ |
		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		parity = kmap_local_page(sector->page) + sector->pgoff;
		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
			memcpy(parity, pointers[rbio->scrubp], sectorsize);
		else
			/* Parity is right, needn't writeback */
			bitmap_clear(&rbio->dbitmap, sectornr, 1);
2494 | kunmap_local(parity); |
2495 | |
2496 | for (stripe = nr_data - 1; stripe >= 0; stripe--) |
2497 | kunmap_local(pointers[stripe]); |
2498 | } |
2499 | |
2500 | kunmap_local(pointers[nr_data]); |
2501 | __free_page(p_sector.page); |
2502 | p_sector.page = NULL; |
2503 | if (q_sector.page) { |
2504 | kunmap_local(pointers[rbio->real_stripes - 1]); |
2505 | __free_page(q_sector.page); |
2506 | q_sector.page = NULL; |
2507 | } |
2508 | |
2509 | /* |
2510 | * time to start writing. Make bios for everything from the |
2511 | * higher layers (the bio_list in our rbio) and our p/q. Ignore |
2512 | * everything else. |
2513 | */ |
2514 | for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { |
2515 | struct sector_ptr *sector; |
2516 | |
2517 | sector = rbio_stripe_sector(rbio, stripe_nr: rbio->scrubp, sector_nr: sectornr); |
2518 | ret = rbio_add_io_sector(rbio, bio_list: &bio_list, sector, stripe_nr: rbio->scrubp, |
2519 | sector_nr: sectornr, op: REQ_OP_WRITE); |
2520 | if (ret) |
2521 | goto cleanup; |
2522 | } |
2523 | |
2524 | if (!is_replace) |
2525 | goto submit_write; |
2526 | |
2527 | /* |
2528 | * Replace is running and our parity stripe needs to be duplicated to |
2529 | * the target device. Check we have a valid source stripe number. |
2530 | */ |
2531 | ASSERT(rbio->bioc->replace_stripe_src >= 0); |
2532 | for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { |
2533 | struct sector_ptr *sector; |
2534 | |
		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 rbio->real_stripes,
					 sectornr, REQ_OP_WRITE);
2539 | if (ret) |
2540 | goto cleanup; |
2541 | } |
2542 | |
2543 | submit_write: |
	submit_write_bios(rbio, &bio_list);
2545 | return 0; |
2546 | |
2547 | cleanup: |
	bio_list_put(&bio_list);
2549 | return ret; |
2550 | } |
2551 | |
2552 | static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) |
2553 | { |
2554 | if (stripe >= 0 && stripe < rbio->nr_data) |
2555 | return 1; |
2556 | return 0; |
2557 | } |
2558 | |
2559 | static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) |
2560 | { |
2561 | void **pointers = NULL; |
2562 | void **unmap_array = NULL; |
2563 | int sector_nr; |
2564 | int ret = 0; |
2565 | |
2566 | /* |
2567 | * @pointers array stores the pointer for each sector. |
2568 | * |
2569 | * @unmap_array stores copy of pointers that does not get reordered |
2570 | * during reconstruction so that kunmap_local works. |
2571 | */ |
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2574 | if (!pointers || !unmap_array) { |
2575 | ret = -ENOMEM; |
2576 | goto out; |
2577 | } |
2578 | |
2579 | for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { |
2580 | int dfail = 0, failp = -1; |
2581 | int faila; |
2582 | int failb; |
2583 | int found_errors; |
2584 | |
		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
2587 | if (found_errors > rbio->bioc->max_errors) { |
2588 | ret = -EIO; |
2589 | goto out; |
2590 | } |
2591 | if (found_errors == 0) |
2592 | continue; |
2593 | |
2594 | /* We should have at least one error here. */ |
2595 | ASSERT(faila >= 0 || failb >= 0); |
2596 | |
		if (is_data_stripe(rbio, faila))
			dfail++;
		else if (is_parity_stripe(faila))
			failp = faila;

		if (is_data_stripe(rbio, failb))
			dfail++;
		else if (is_parity_stripe(failb))
			failp = failb;
2606 | /* |
2607 | * Because we can not use a scrubbing parity to repair the |
2608 | * data, so the capability of the repair is declined. (In the |
2609 | * case of RAID5, we can not repair anything.) |
2610 | */ |
2611 | if (dfail > rbio->bioc->max_errors - 1) { |
2612 | ret = -EIO; |
2613 | goto out; |
2614 | } |
2615 | /* |
2616 | * If all data is good, only parity is correctly, just repair |
2617 | * the parity, no need to recover data stripes. |
2618 | */ |
2619 | if (dfail == 0) |
2620 | continue; |
2621 | |
2622 | /* |
2623 | * Here means we got one corrupted data stripe and one |
2624 | * corrupted parity on RAID6, if the corrupted parity is |
2625 | * scrubbing parity, luckily, use the other one to repair the |
2626 | * data, or we can not repair the data stripe. |
2627 | */ |
2628 | if (failp != rbio->scrubp) { |
2629 | ret = -EIO; |
2630 | goto out; |
2631 | } |
2632 | |
2633 | ret = recover_vertical(rbio, sector_nr, pointers, unmap_array); |
2634 | if (ret < 0) |
2635 | goto out; |
2636 | } |
2637 | out: |
	kfree(pointers);
	kfree(unmap_array);
2640 | return ret; |
2641 | } |
2642 | |
2643 | static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) |
2644 | { |
2645 | struct bio_list bio_list = BIO_EMPTY_LIST; |
2646 | int total_sector_nr; |
2647 | int ret = 0; |
2648 | |
2649 | /* Build a list of bios to read all the missing parts. */ |
2650 | for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; |
2651 | total_sector_nr++) { |
2652 | int sectornr = total_sector_nr % rbio->stripe_nsectors; |
2653 | int stripe = total_sector_nr / rbio->stripe_nsectors; |
2654 | struct sector_ptr *sector; |
2655 | |
2656 | /* No data in the vertical stripe, no need to read. */ |
2657 | if (!test_bit(sectornr, &rbio->dbitmap)) |
2658 | continue; |
2659 | |
2660 | /* |
2661 | * We want to find all the sectors missing from the rbio and |
2662 | * read them from the disk. If sector_in_rbio() finds a sector |
2663 | * in the bio list we don't need to read it off the stripe. |
2664 | */ |
		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
		if (sector)
			continue;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		/*
		 * The bio cache may have handed us an uptodate sector. If so,
		 * use it.
		 */
		if (sector->uptodate)
			continue;

		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret) {
			bio_list_put(&bio_list);
			return ret;
		}
	}

	submit_read_wait_bio_list(rbio, &bio_list);
2686 | return 0; |
2687 | } |
2688 | |
2689 | static void scrub_rbio(struct btrfs_raid_bio *rbio) |
2690 | { |
2691 | int sector_nr; |
2692 | int ret; |
2693 | |
2694 | ret = alloc_rbio_essential_pages(rbio); |
2695 | if (ret) |
2696 | goto out; |
2697 | |
	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2699 | |
2700 | ret = scrub_assemble_read_bios(rbio); |
2701 | if (ret < 0) |
2702 | goto out; |
2703 | |
2704 | /* We may have some failures, recover the failed sectors first. */ |
2705 | ret = recover_scrub_rbio(rbio); |
2706 | if (ret < 0) |
2707 | goto out; |
2708 | |
2709 | /* |
2710 | * We have every sector properly prepared. Can finish the scrub |
2711 | * and writeback the good content. |
2712 | */ |
2713 | ret = finish_parity_scrub(rbio); |
2714 | wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); |
2715 | for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { |
2716 | int found_errors; |
2717 | |
2718 | found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); |
2719 | if (found_errors > rbio->bioc->max_errors) { |
2720 | ret = -EIO; |
2721 | break; |
2722 | } |
2723 | } |
2724 | out: |
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2726 | } |
2727 | |
2728 | static void scrub_rbio_work_locked(struct work_struct *work) |
2729 | { |
2730 | scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); |
2731 | } |
2732 | |
2733 | void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) |
2734 | { |
2735 | if (!lock_stripe_add(rbio)) |
		start_async_work(rbio, scrub_rbio_work_locked);
2737 | } |
2738 | |
2739 | /* |
2740 | * This is for scrub call sites where we already have correct data contents. |
2741 | * This allows us to avoid reading data stripes again. |
2742 | * |
2743 | * Unfortunately here we have to do page copy, other than reusing the pages. |
2744 | * This is due to the fact rbio has its own page management for its cache. |
2745 | */ |
2746 | void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, |
2747 | struct page **data_pages, u64 data_logical) |
2748 | { |
2749 | const u64 offset_in_full_stripe = data_logical - |
2750 | rbio->bioc->full_stripe_logical; |
2751 | const int page_index = offset_in_full_stripe >> PAGE_SHIFT; |
2752 | const u32 sectorsize = rbio->bioc->fs_info->sectorsize; |
2753 | const u32 sectors_per_page = PAGE_SIZE / sectorsize; |
2754 | int ret; |
2755 | |
2756 | /* |
2757 | * If we hit ENOMEM temporarily, but later at |
2758 | * raid56_parity_submit_scrub_rbio() time it succeeded, we just do |
2759 | * the extra read, not a big deal. |
2760 | * |
2761 | * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time, |
2762 | * the bio would got proper error number set. |
2763 | */ |
2764 | ret = alloc_rbio_data_pages(rbio); |
2765 | if (ret < 0) |
2766 | return; |
2767 | |
2768 | /* data_logical must be at stripe boundary and inside the full stripe. */ |
2769 | ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); |
2770 | ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); |
2771 | |
	for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
		struct page *dst = rbio->stripe_pages[page_nr + page_index];
		struct page *src = data_pages[page_nr];

		memcpy_page(dst, 0, src, 0, PAGE_SIZE);
		/* Mark the sectors covered by the page we just copied uptodate. */
		for (int sector_nr = sectors_per_page * (page_index + page_nr);
		     sector_nr < sectors_per_page * (page_index + page_nr + 1);
		     sector_nr++)
			rbio->stripe_sectors[sector_nr].uptodate = true;
	}
2782 | } |
2783 | |