// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem high-level write support.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/pagevec.h>
#include "internal.h"

/*
 * Determined write method.  Adjust netfs_folio_traces if this is changed.
 */
enum netfs_how_to_modify {
	NETFS_FOLIO_IS_UPTODATE,	/* Folio is uptodate already */
	NETFS_JUST_PREFETCH,		/* We have to read the folio anyway */
	NETFS_WHOLE_FOLIO_MODIFY,	/* We're going to overwrite the whole folio */
	NETFS_MODIFY_AND_CLEAR,		/* We can assume there is no data to be downloaded. */
	NETFS_STREAMING_WRITE,		/* Store incomplete data in non-uptodate page. */
	NETFS_STREAMING_WRITE_CONT,	/* Continue streaming write. */
	NETFS_FLUSH_CONTENT,		/* Flush incompatible content. */
};
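
/*
 * netfs_perform_write() converts one of these values straight into an
 * enum netfs_folio_trace with a cast, which is why netfs_folio_traces must be
 * kept in step with this enum (see the comment above).
 */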

static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);

static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
	if (netfs_group && !folio_get_private(folio))
		folio_attach_private(folio, netfs_get_group(netfs_group));
}

#if IS_ENABLED(CONFIG_FSCACHE)
static void netfs_folio_start_fscache(bool caching, struct folio *folio)
{
	if (caching)
		folio_start_fscache(folio);
}
#else
static void netfs_folio_start_fscache(bool caching, struct folio *folio)
{
}
#endif
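
/*
 * With CONFIG_FSCACHE=n the stub above compiles away, letting the writeback
 * paths call netfs_folio_start_fscache() unconditionally.
 */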

/*
 * Decide how we should modify a folio.  We might be attempting to do
 * write-streaming, in which case we don't want to do a local RMW cycle if we
 * can avoid it.  If we're doing local caching or content crypto, we award
 * that priority over avoiding RMW.  If the file is open readably, then we
 * also assume that we may want to read what we wrote.
 */
static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
						    struct file *file,
						    struct folio *folio,
						    void *netfs_group,
						    size_t flen,
						    size_t offset,
						    size_t len,
						    bool maybe_trouble)
{
	struct netfs_folio *finfo = netfs_folio_info(folio);
	loff_t pos = folio_file_pos(folio);

	_enter("");

	if (netfs_folio_group(folio) != netfs_group)
		return NETFS_FLUSH_CONTENT;

	if (folio_test_uptodate(folio))
		return NETFS_FOLIO_IS_UPTODATE;

	if (pos >= ctx->zero_point)
		return NETFS_MODIFY_AND_CLEAR;

	if (!maybe_trouble && offset == 0 && len >= flen)
		return NETFS_WHOLE_FOLIO_MODIFY;

	if (file->f_mode & FMODE_READ)
		goto no_write_streaming;
	if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
		goto no_write_streaming;

	if (netfs_is_cache_enabled(ctx)) {
		/* We don't want to get a streaming write on a file that loses
		 * caching service temporarily because the backing store got
		 * culled.
		 */
		if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
			set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags);
		goto no_write_streaming;
	}

	if (!finfo)
		return NETFS_STREAMING_WRITE;

	/* We can continue a streaming write only if it continues on from the
	 * previous.  If it overlaps, we must flush lest we suffer a partial
	 * copy and disjoint dirty regions.
	 */
	if (offset == finfo->dirty_offset + finfo->dirty_len)
		return NETFS_STREAMING_WRITE_CONT;
	return NETFS_FLUSH_CONTENT;

no_write_streaming:
	if (finfo) {
		netfs_stat(&netfs_n_wh_wstream_conflict);
		return NETFS_FLUSH_CONTENT;
	}
	return NETFS_JUST_PREFETCH;
}
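
/*
 * For example: a write wholly at or beyond ctx->zero_point can never need
 * data fetched from the server, so it gets NETFS_MODIFY_AND_CLEAR and the
 * unwritten parts of the folio are simply zeroed; the same partial write
 * below zero_point, into a non-uptodate folio on a file open for reading,
 * gets NETFS_JUST_PREFETCH and incurs an RMW cycle instead.
 */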

/*
 * Grab a folio for writing and lock it.  Attempt to allocate as large a folio
 * as possible to hold as much of the remaining length as possible in one go.
 */
static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
						loff_t pos, size_t part)
{
	pgoff_t index = pos / PAGE_SIZE;
	fgf_t fgp_flags = FGP_WRITEBEGIN;

	if (mapping_large_folio_support(mapping))
		fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part);

	return __filemap_get_folio(mapping, index, fgp_flags,
				   mapping_gfp_mask(mapping));
}
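
/*
 * The size hint given to fgf_set_order() is the offset of pos within a page
 * plus the amount of data left to write, so, where the mapping supports
 * large folios, a single folio may be allocated that covers the whole of the
 * remaining copy.
 */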

/**
 * netfs_perform_write - Copy data into the pagecache.
 * @iocb: The operation parameters
 * @iter: The source buffer
 * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
 *
 * Copy data into pagecache pages attached to the inode specified by @iocb.
 * The caller must hold appropriate inode locks.
 *
 * Dirty pages are tagged with a netfs_folio struct if they're not up to date
 * to indicate the range modified.  Dirty pages may also be tagged with a
 * netfs-specific grouping such that data from an old group gets flushed before
 * a new one is started.
 */
ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
			    struct netfs_group *netfs_group)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct netfs_inode *ctx = netfs_inode(inode);
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_NONE,
		.for_sync	= true,
		.nr_to_write	= LONG_MAX,
		.range_start	= iocb->ki_pos,
		.range_end	= iocb->ki_pos + iter->count,
	};
	struct netfs_io_request *wreq = NULL;
	struct netfs_folio *finfo;
	struct folio *folio;
	enum netfs_how_to_modify howto;
	enum netfs_folio_trace trace;
	unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0 : BDP_ASYNC;
	ssize_t written = 0, ret;
	loff_t i_size, pos = iocb->ki_pos, from, to;
	size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
	bool maybe_trouble = false;

	if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
		     iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
	    ) {
		if (pos < i_size_read(inode)) {
			ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
			if (ret < 0)
				goto out;
		}

		wbc_attach_fdatawrite_inode(&wbc, mapping->host);

		wreq = netfs_begin_writethrough(iocb, iter->count);
		if (IS_ERR(wreq)) {
			wbc_detach_inode(&wbc);
			ret = PTR_ERR(wreq);
			wreq = NULL;
			goto out;
		}
		if (!is_sync_kiocb(iocb))
			wreq->iocb = iocb;
		wreq->cleanup = netfs_cleanup_buffered_write;
	}
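
	/* In writethrough mode, each folio modified below is immediately
	 * marked for writeback and fed to the write request; the result is
	 * then collected by netfs_end_writethrough() at "out".
	 */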

	do {
		size_t flen;
		size_t offset;	/* Offset into pagecache folio */
		size_t part;	/* Bytes to write to folio */
		size_t copied;	/* Bytes copied from user */

		ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
		if (unlikely(ret < 0))
			break;

		offset = pos & (max_chunk - 1);
		part = min(max_chunk - offset, iov_iter_count(iter));

		/* Bring in the user pages that we will copy from _first_ lest
		 * we hit a nasty deadlock on copying from the same page as
		 * we're writing to, without it being marked uptodate.
		 *
		 * Not only is this an optimisation, but it is also required to
		 * check that the address is actually valid, when atomic
		 * usercopies are used below.
		 *
		 * We rely on the page being held onto long enough by the LRU
		 * that we can grab it below if this causes it to be read.
		 */
		ret = -EFAULT;
		if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
			break;

		folio = netfs_grab_folio_for_write(mapping, pos, part);
		if (IS_ERR(folio)) {
			ret = PTR_ERR(folio);
			break;
		}

		flen = folio_size(folio);
		offset = pos & (flen - 1);
		part = min_t(size_t, flen - offset, part);

		if (signal_pending(current)) {
			ret = written ? -EINTR : -ERESTARTSYS;
			goto error_folio_unlock;
		}

		/* See if we need to prefetch the area we're going to modify.
		 * We need to do this before we get a lock on the folio in case
		 * there's more than one writer competing for the same cache
		 * block.
		 */
		howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
					    flen, offset, part, maybe_trouble);
		_debug("howto %u", howto);
		switch (howto) {
		case NETFS_JUST_PREFETCH:
			ret = netfs_prefetch_for_write(file, folio, offset, part);
			if (ret < 0) {
				_debug("prefetch = %zd", ret);
				goto error_folio_unlock;
			}
			break;
		case NETFS_FOLIO_IS_UPTODATE:
		case NETFS_WHOLE_FOLIO_MODIFY:
		case NETFS_STREAMING_WRITE_CONT:
			break;
		case NETFS_MODIFY_AND_CLEAR:
			zero_user_segment(&folio->page, 0, offset);
			break;
		case NETFS_STREAMING_WRITE:
			ret = -EIO;
			if (WARN_ON(folio_get_private(folio)))
				goto error_folio_unlock;
			break;
		case NETFS_FLUSH_CONTENT:
			trace_netfs_folio(folio, netfs_flush_content);
			from = folio_pos(folio);
			to = from + folio_size(folio) - 1;
			folio_unlock(folio);
			folio_put(folio);
			ret = filemap_write_and_wait_range(mapping, from, to);
			if (ret < 0)
				goto error_folio_unlock;
			continue;
		}

		if (mapping_writably_mapped(mapping))
			flush_dcache_folio(folio);

		copied = copy_folio_from_iter_atomic(folio, offset, part, iter);

		flush_dcache_folio(folio);

		/* Deal with a (partially) failed copy */
		if (copied == 0) {
			ret = -EFAULT;
			goto error_folio_unlock;
		}

		trace = (enum netfs_folio_trace)howto;
		switch (howto) {
		case NETFS_FOLIO_IS_UPTODATE:
		case NETFS_JUST_PREFETCH:
			netfs_set_group(folio, netfs_group);
			break;
		case NETFS_MODIFY_AND_CLEAR:
			zero_user_segment(&folio->page, offset + copied, flen);
			netfs_set_group(folio, netfs_group);
			folio_mark_uptodate(folio);
			break;
		case NETFS_WHOLE_FOLIO_MODIFY:
			if (unlikely(copied < part)) {
				maybe_trouble = true;
				iov_iter_revert(iter, copied);
				copied = 0;
				goto retry;
			}
			netfs_set_group(folio, netfs_group);
			folio_mark_uptodate(folio);
			break;
		case NETFS_STREAMING_WRITE:
			if (offset == 0 && copied == flen) {
				netfs_set_group(folio, netfs_group);
				folio_mark_uptodate(folio);
				trace = netfs_streaming_filled_page;
				break;
			}
			finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
			if (!finfo) {
				iov_iter_revert(iter, copied);
				ret = -ENOMEM;
				goto error_folio_unlock;
			}
			finfo->netfs_group = netfs_get_group(netfs_group);
			finfo->dirty_offset = offset;
			finfo->dirty_len = copied;
			folio_attach_private(folio, (void *)((unsigned long)finfo |
							     NETFS_FOLIO_INFO));
			break;
		case NETFS_STREAMING_WRITE_CONT:
			finfo = netfs_folio_info(folio);
			finfo->dirty_len += copied;
			if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
				if (finfo->netfs_group)
					folio_change_private(folio, finfo->netfs_group);
				else
					folio_detach_private(folio);
				folio_mark_uptodate(folio);
				kfree(finfo);
				trace = netfs_streaming_cont_filled_page;
			}
			break;
		default:
			WARN(true, "Unexpected modify type %u ix=%lx\n",
			     howto, folio->index);
			ret = -EIO;
			goto error_folio_unlock;
		}

		trace_netfs_folio(folio, trace);

		/* Update the inode size if we moved the EOF marker */
		i_size = i_size_read(inode);
		pos += copied;
		if (pos > i_size) {
			if (ctx->ops->update_i_size) {
				ctx->ops->update_i_size(inode, pos);
			} else {
				i_size_write(inode, pos);
#if IS_ENABLED(CONFIG_FSCACHE)
				fscache_update_cookie(ctx->cache, NULL, &pos);
#endif
			}
		}
		written += copied;

		if (likely(!wreq)) {
			folio_mark_dirty(folio);
		} else {
			if (folio_test_dirty(folio))
				/* Sigh.  mmap. */
				folio_clear_dirty_for_io(folio);
			/* We make multiple writes to the folio... */
			if (!folio_test_writeback(folio)) {
				folio_wait_fscache(folio);
				folio_start_writeback(folio);
				folio_start_fscache(folio);
				if (wreq->iter.count == 0)
					trace_netfs_folio(folio, netfs_folio_trace_wthru);
				else
					trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
			}
			netfs_advance_writethrough(wreq, copied,
						   offset + copied == flen);
		}
	retry:
		folio_unlock(folio);
		folio_put(folio);
		folio = NULL;

		cond_resched();
	} while (iov_iter_count(iter));

out:
	if (unlikely(wreq)) {
		ret = netfs_end_writethrough(wreq, iocb);
		wbc_detach_inode(&wbc);
		if (ret == -EIOCBQUEUED)
			return ret;
	}

	iocb->ki_pos += written;
	_leave(" = %zd [%zd]", written, ret);
	return written ? written : ret;

error_folio_unlock:
	folio_unlock(folio);
	folio_put(folio);
	goto out;
}
EXPORT_SYMBOL(netfs_perform_write);
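
/*
 * Callers are expected to reach netfs_perform_write() through a wrapper such
 * as netfs_buffered_write_iter_locked() below, with the inode appropriately
 * locked and generic_write_checks() already performed.
 */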

/**
 * netfs_buffered_write_iter_locked - write data to a file
 * @iocb: IO state structure (file, offset, etc.)
 * @from: iov_iter with data to write
 * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
 *
 * This function does all the work needed for actually writing data to a
 * file.  It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * The caller must hold appropriate locks around this function and have called
 * generic_write_checks() already.  The caller is also responsible for doing
 * any necessary syncing afterwards.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it.  This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
					 struct netfs_group *netfs_group)
{
	struct file *file = iocb->ki_filp;
	ssize_t ret;

	trace_netfs_write_iter(iocb, from);

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	ret = file_update_time(file);
	if (ret)
		return ret;

	return netfs_perform_write(iocb, from, netfs_group);
}
EXPORT_SYMBOL(netfs_buffered_write_iter_locked);

/**
 * netfs_file_write_iter - write data to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * Perform a write to a file, writing into the pagecache if possible and doing
 * an unbuffered write instead if not.
 *
 * Return:
 * * Negative error code if no data has been written at all or if
 *   vfs_fsync_range() failed for a synchronous write
 * * Number of bytes written, even for truncated writes
 */
ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct netfs_inode *ictx = netfs_inode(inode);
	ssize_t ret;

	_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));

	if (!iov_iter_count(from))
		return 0;

	if ((iocb->ki_flags & IOCB_DIRECT) ||
	    test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
		return netfs_unbuffered_write_iter(iocb, from);

	ret = netfs_start_io_write(inode);
	if (ret < 0)
		return ret;

	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
	netfs_end_io_write(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(netfs_file_write_iter);
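
/*
 * A filesystem built on netfslib would typically point its write path
 * straight at the helper above, for instance (a sketch; example_fops is
 * illustrative and not part of this file):
 *
 *	const struct file_operations example_fops = {
 *		.write_iter	= netfs_file_write_iter,
 *		...
 *	};
 */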

/*
 * Notification that a previously read-only page is about to become writable.
 * Note that the caller indicates a single page of a multipage folio.
 */
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
	struct folio *folio = page_folio(vmf->page);
	struct file *file = vmf->vma->vm_file;
	struct inode *inode = file_inode(file);
	vm_fault_t ret = VM_FAULT_RETRY;
	int err;

	_enter("%lx", folio->index);

	sb_start_pagefault(inode->i_sb);

	if (folio_wait_writeback_killable(folio))
		goto out;

	if (folio_lock_killable(folio) < 0)
		goto out;

	/* Can we see a streaming write here? */
	if (WARN_ON(!folio_test_uptodate(folio))) {
		ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
		goto out;
	}

	if (netfs_folio_group(folio) != netfs_group) {
		folio_unlock(folio);
		err = filemap_fdatawait_range(inode->i_mapping,
					      folio_pos(folio),
					      folio_pos(folio) + folio_size(folio));
		switch (err) {
		case 0:
			ret = VM_FAULT_RETRY;
			goto out;
		case -ENOMEM:
			ret = VM_FAULT_OOM;
			goto out;
		default:
			ret = VM_FAULT_SIGBUS;
			goto out;
		}
	}

	if (folio_test_dirty(folio))
		trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
	else
		trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
	netfs_set_group(folio, netfs_group);
	file_update_time(file);
	ret = VM_FAULT_LOCKED;
out:
	sb_end_pagefault(inode->i_sb);
	return ret;
}
EXPORT_SYMBOL(netfs_page_mkwrite);
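
/*
 * As a sketch, a filesystem that doesn't use dirty groups could expose this
 * through its vm_operations_struct like so (example_page_mkwrite is
 * illustrative, not part of this file):
 *
 *	static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
 *	{
 *		return netfs_page_mkwrite(vmf, NULL);
 *	}
 */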

/*
 * Kill all the pages in the given range
 */
static void netfs_kill_pages(struct address_space *mapping,
			     loff_t start, loff_t len)
{
	struct folio *folio;
	pgoff_t index = start / PAGE_SIZE;
	pgoff_t last = (start + len - 1) / PAGE_SIZE, next;

	_enter("%llx-%llx", start, start + len - 1);

	do {
		_debug("kill %lx (to %lx)", index, last);

		folio = filemap_get_folio(mapping, index);
		if (IS_ERR(folio)) {
			next = index + 1;
			continue;
		}

		next = folio_next_index(folio);

		trace_netfs_folio(folio, netfs_folio_trace_kill);
		folio_clear_uptodate(folio);
		if (folio_test_fscache(folio))
			folio_end_fscache(folio);
		folio_end_writeback(folio);
		folio_lock(folio);
		generic_error_remove_folio(mapping, folio);
		folio_unlock(folio);
		folio_put(folio);

	} while (index = next, index <= last);

	_leave("");
}

/*
 * Redirty all the pages in a given range.
 */
static void netfs_redirty_pages(struct address_space *mapping,
				loff_t start, loff_t len)
{
	struct folio *folio;
	pgoff_t index = start / PAGE_SIZE;
	pgoff_t last = (start + len - 1) / PAGE_SIZE, next;

	_enter("%llx-%llx", start, start + len - 1);

	do {
		_debug("redirty %llx @%llx", len, start);

		folio = filemap_get_folio(mapping, index);
		if (IS_ERR(folio)) {
			next = index + 1;
			continue;
		}

		next = folio_next_index(folio);
		trace_netfs_folio(folio, netfs_folio_trace_redirty);
		filemap_dirty_folio(mapping, folio);
		if (folio_test_fscache(folio))
			folio_end_fscache(folio);
		folio_end_writeback(folio);
		folio_put(folio);
	} while (index = next, index <= last);

	balance_dirty_pages_ratelimited(mapping);

	_leave("");
}

/*
 * Completion of write to server
 */
static void netfs_pages_written_back(struct netfs_io_request *wreq)
{
	struct address_space *mapping = wreq->mapping;
	struct netfs_folio *finfo;
	struct netfs_group *group = NULL;
	struct folio *folio;
	pgoff_t last;
	int gcount = 0;

	XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);

	_enter("%llx-%llx", wreq->start, wreq->start + wreq->len);

	rcu_read_lock();

	last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
	xas_for_each(&xas, folio, last) {
		WARN(!folio_test_writeback(folio),
		     "bad %zx @%llx page %lx %lx\n",
		     wreq->len, wreq->start, folio->index, last);

		if ((finfo = netfs_folio_info(folio))) {
			/* Streaming writes cannot be redirtied whilst under
			 * writeback, so discard the streaming record.
			 */
			folio_detach_private(folio);
			group = finfo->netfs_group;
			gcount++;
			trace_netfs_folio(folio, netfs_folio_trace_clear_s);
			kfree(finfo);
		} else if ((group = netfs_folio_group(folio))) {
			/* Need to detach the group pointer if the page didn't
			 * get redirtied.  If it has been redirtied, then it
			 * must be within the same group.
			 */
			if (folio_test_dirty(folio)) {
				trace_netfs_folio(folio, netfs_folio_trace_redirtied);
				goto end_wb;
			}
			if (folio_trylock(folio)) {
				if (!folio_test_dirty(folio)) {
					folio_detach_private(folio);
					gcount++;
					trace_netfs_folio(folio, netfs_folio_trace_clear_g);
				} else {
					trace_netfs_folio(folio, netfs_folio_trace_redirtied);
				}
				folio_unlock(folio);
				goto end_wb;
			}

			xas_pause(&xas);
			rcu_read_unlock();
			folio_lock(folio);
			if (!folio_test_dirty(folio)) {
				folio_detach_private(folio);
				gcount++;
				trace_netfs_folio(folio, netfs_folio_trace_clear_g);
			} else {
				trace_netfs_folio(folio, netfs_folio_trace_redirtied);
			}
			folio_unlock(folio);
			rcu_read_lock();
		} else {
			trace_netfs_folio(folio, netfs_folio_trace_clear);
		}
	end_wb:
		if (folio_test_fscache(folio))
			folio_end_fscache(folio);
		xas_advance(&xas, folio_next_index(folio) - 1);
		folio_end_writeback(folio);
	}

	rcu_read_unlock();
	netfs_put_group_many(group, gcount);
	_leave("");
}
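
/*
 * All the folios under one write request belong to the same group, so each
 * group pointer detached above just bumps gcount and the references are then
 * dropped in a single batch by netfs_put_group_many().
 */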

/*
 * Deal with the disposition of the folios that are under writeback to close
 * out the operation.
 */
static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
{
	struct address_space *mapping = wreq->mapping;

	_enter("");

	switch (wreq->error) {
	case 0:
		netfs_pages_written_back(wreq);
		break;

	default:
		pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
		fallthrough;
	case -EACCES:
	case -EPERM:
	case -ENOKEY:
	case -EKEYEXPIRED:
	case -EKEYREJECTED:
	case -EKEYREVOKED:
	case -ENETRESET:
	case -EDQUOT:
	case -ENOSPC:
		netfs_redirty_pages(mapping, wreq->start, wreq->len);
		break;

	case -EROFS:
	case -EIO:
	case -EREMOTEIO:
	case -EFBIG:
	case -ENOENT:
	case -ENOMEDIUM:
	case -ENXIO:
		netfs_kill_pages(mapping, wreq->start, wreq->len);
		break;
	}

	if (wreq->error)
		mapping_set_error(mapping, wreq->error);
	if (wreq->netfs_ops->done)
		wreq->netfs_ops->done(wreq);
}

/*
 * Extend the region to be written back to include subsequent contiguously
 * dirty pages if possible, but don't sleep while doing so.
 *
 * If this page holds new content, then we can include filler zeros in the
 * writeback.
 */
static void netfs_extend_writeback(struct address_space *mapping,
				   struct netfs_group *group,
				   struct xa_state *xas,
				   long *_count,
				   loff_t start,
				   loff_t max_len,
				   bool caching,
				   size_t *_len,
				   size_t *_top)
{
	struct netfs_folio *finfo;
	struct folio_batch fbatch;
	struct folio *folio;
	unsigned int i;
	pgoff_t index = (start + *_len) / PAGE_SIZE;
	size_t len;
	void *priv;
	bool stop = true;

	folio_batch_init(&fbatch);

	do {
		/* Firstly, we gather up a batch of contiguous dirty pages
		 * under the RCU read lock - but we can't clear the dirty flags
		 * there if any of those pages are mapped.
		 */
		rcu_read_lock();

		xas_for_each(xas, folio, ULONG_MAX) {
			stop = true;
			if (xas_retry(xas, folio))
				continue;
			if (xa_is_value(folio))
				break;
			if (folio->index != index) {
				xas_reset(xas);
				break;
			}

			if (!folio_try_get_rcu(folio)) {
				xas_reset(xas);
				continue;
			}

			/* Has the folio moved or been split? */
			if (unlikely(folio != xas_reload(xas))) {
				folio_put(folio);
				xas_reset(xas);
				break;
			}

			if (!folio_trylock(folio)) {
				folio_put(folio);
				xas_reset(xas);
				break;
			}
			if (!folio_test_dirty(folio) ||
			    folio_test_writeback(folio) ||
			    folio_test_fscache(folio)) {
				folio_unlock(folio);
				folio_put(folio);
				xas_reset(xas);
				break;
			}

			stop = false;
			len = folio_size(folio);
			priv = folio_get_private(folio);
			if ((const struct netfs_group *)priv != group) {
				stop = true;
				finfo = netfs_folio_info(folio);
				if (finfo->netfs_group != group ||
				    finfo->dirty_offset > 0) {
					folio_unlock(folio);
					folio_put(folio);
					xas_reset(xas);
					break;
				}
				len = finfo->dirty_len;
			}

			*_top += folio_size(folio);
			index += folio_nr_pages(folio);
			*_count -= folio_nr_pages(folio);
			*_len += len;
			if (*_len >= max_len || *_count <= 0)
				stop = true;

			if (!folio_batch_add(&fbatch, folio))
				break;
			if (stop)
				break;
		}

		xas_pause(xas);
		rcu_read_unlock();

		/* Now, if we obtained any folios, we can shift them to being
		 * writable and mark them for caching.
		 */
		if (!folio_batch_count(&fbatch))
			break;

		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			folio = fbatch.folios[i];
			trace_netfs_folio(folio, netfs_folio_trace_store_plus);

			if (!folio_clear_dirty_for_io(folio))
				BUG();
			folio_start_writeback(folio);
			netfs_folio_start_fscache(caching, folio);
			folio_unlock(folio);
		}

		folio_batch_release(&fbatch);
		cond_resched();
	} while (!stop);
}

/*
 * Synchronously write back the locked page and any subsequent non-locked dirty
 * pages.
 */
static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
						  struct writeback_control *wbc,
						  struct netfs_group *group,
						  struct xa_state *xas,
						  struct folio *folio,
						  unsigned long long start,
						  unsigned long long end)
{
	struct netfs_io_request *wreq;
	struct netfs_folio *finfo;
	struct netfs_inode *ctx = netfs_inode(mapping->host);
	unsigned long long i_size = i_size_read(&ctx->inode);
	size_t len, max_len;
	bool caching = netfs_is_cache_enabled(ctx);
	long count = wbc->nr_to_write;
	int ret;

	_enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching);

	wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
				   NETFS_WRITEBACK);
	if (IS_ERR(wreq)) {
		folio_unlock(folio);
		return PTR_ERR(wreq);
	}

	if (!folio_clear_dirty_for_io(folio))
		BUG();
	folio_start_writeback(folio);
	netfs_folio_start_fscache(caching, folio);

	count -= folio_nr_pages(folio);

	/* Find all consecutive lockable dirty pages that have contiguous
	 * written regions, stopping when we find a page that is not
	 * immediately lockable, is not dirty or is missing, or we reach the
	 * end of the range.
	 */
	trace_netfs_folio(folio, netfs_folio_trace_store);

	len = wreq->len;
	finfo = netfs_folio_info(folio);
	if (finfo) {
		start += finfo->dirty_offset;
		if (finfo->dirty_offset + finfo->dirty_len != len) {
			len = finfo->dirty_len;
			goto cant_expand;
		}
		len = finfo->dirty_len;
	}

	if (start < i_size) {
		/* Trim the write to the EOF; the extra data is ignored.  Also
		 * put an upper limit on the size of a single storedata op.
		 */
		max_len = 65536 * 4096;
		max_len = min_t(unsigned long long, max_len, end - start + 1);
		max_len = min_t(unsigned long long, max_len, i_size - start);

		if (len < max_len)
			netfs_extend_writeback(mapping, group, xas, &count, start,
					       max_len, caching, &len, &wreq->upper_len);
	}

cant_expand:
	len = min_t(unsigned long long, len, i_size - start);

	/* We now have a contiguous set of dirty pages, each with writeback
	 * set; the first page is still locked at this point, but all the rest
	 * have been unlocked.
	 */
	folio_unlock(folio);
	wreq->start = start;
	wreq->len = len;

	if (start < i_size) {
		_debug("write back %zx @%llx [%llx]", len, start, i_size);

		/* Speculatively write to the cache.  We have to fix this up
		 * later if the store fails.
		 */
		wreq->cleanup = netfs_cleanup_buffered_write;

		iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
				wreq->upper_len);
		__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
		ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
		if (ret == 0 || ret == -EIOCBQUEUED)
			wbc->nr_to_write -= len / PAGE_SIZE;
	} else {
		_debug("write discard %zx @%llx [%llx]", len, start, i_size);

		/* The dirty region was entirely beyond the EOF. */
		fscache_clear_page_bits(mapping, start, len, caching);
		netfs_pages_written_back(wreq);
		ret = 0;
	}

	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
	_leave(" = 1");
	return 1;
}
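
/*
 * The positive return above tells netfs_writepages_begin() that progress was
 * made; that caller then advances its search position and scans on for the
 * next dirty region.
 */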

/*
 * Write a region of pages back to the server
 */
static ssize_t netfs_writepages_begin(struct address_space *mapping,
				      struct writeback_control *wbc,
				      struct netfs_group *group,
				      struct xa_state *xas,
				      unsigned long long *_start,
				      unsigned long long end)
{
	const struct netfs_folio *finfo;
	struct folio *folio;
	unsigned long long start = *_start;
	ssize_t ret;
	void *priv;
	int skips = 0;

	_enter("%llx,%llx,", start, end);

search_again:
	/* Find the first dirty page in the group. */
	rcu_read_lock();

	for (;;) {
		folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
		if (xas_retry(xas, folio) || xa_is_value(folio))
			continue;
		if (!folio)
			break;

		if (!folio_try_get_rcu(folio)) {
			xas_reset(xas);
			continue;
		}

		if (unlikely(folio != xas_reload(xas))) {
			folio_put(folio);
			xas_reset(xas);
			continue;
		}

		/* Skip any dirty folio that's not in the group of interest. */
		priv = folio_get_private(folio);
		if ((const struct netfs_group *)priv != group) {
			finfo = netfs_folio_info(folio);
			if (finfo->netfs_group != group) {
				folio_put(folio);
				continue;
			}
		}

		xas_pause(xas);
		break;
	}
	rcu_read_unlock();
	if (!folio)
		return 0;

	start = folio_pos(folio); /* May regress with THPs */

	_debug("wback %lx", folio->index);

	/* At this point we hold neither the i_pages lock nor the page lock:
	 * the page may be truncated or invalidated (changing page->mapping to
	 * NULL), or even swizzled back from swapper_space to tmpfs file
	 * mapping
	 */
lock_again:
	if (wbc->sync_mode != WB_SYNC_NONE) {
		ret = folio_lock_killable(folio);
		if (ret < 0)
			return ret;
	} else {
		if (!folio_trylock(folio))
			goto search_again;
	}

	if (folio->mapping != mapping ||
	    !folio_test_dirty(folio)) {
		start += folio_size(folio);
		folio_unlock(folio);
		goto search_again;
	}

	if (folio_test_writeback(folio) ||
	    folio_test_fscache(folio)) {
		folio_unlock(folio);
		if (wbc->sync_mode != WB_SYNC_NONE) {
			folio_wait_writeback(folio);
#ifdef CONFIG_FSCACHE
			folio_wait_fscache(folio);
#endif
			goto lock_again;
		}

		start += folio_size(folio);
		if (wbc->sync_mode == WB_SYNC_NONE) {
			if (skips >= 5 || need_resched()) {
				ret = 0;
				goto out;
			}
			skips++;
		}
		goto search_again;
	}

	ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
						 folio, start, end);
out:
	if (ret > 0)
		*_start = start + ret;
	_leave(" = %zd [%llx]", ret, *_start);
	return ret;
}

/*
 * Write a region of pages back to the server
 */
static int netfs_writepages_region(struct address_space *mapping,
				   struct writeback_control *wbc,
				   struct netfs_group *group,
				   unsigned long long *_start,
				   unsigned long long end)
{
	ssize_t ret;

	XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);

	do {
		ret = netfs_writepages_begin(mapping, wbc, group, &xas,
					     _start, end);
		if (ret > 0 && wbc->nr_to_write > 0)
			cond_resched();
	} while (ret > 0 && wbc->nr_to_write > 0);

	return ret > 0 ? 0 : ret;
}

/*
 * write some of the pending data back to the server
 */
int netfs_writepages(struct address_space *mapping,
		     struct writeback_control *wbc)
{
	struct netfs_group *group = NULL;
	loff_t start, end;
	int ret;

	_enter("");

	/* We have to be careful as we can end up racing with setattr()
	 * truncating the pagecache since the caller doesn't take a lock here
	 * to prevent it.
	 */

	if (wbc->range_cyclic && mapping->writeback_index) {
		start = mapping->writeback_index * PAGE_SIZE;
		ret = netfs_writepages_region(mapping, wbc, group,
					      &start, LLONG_MAX);
		if (ret < 0)
			goto out;

		if (wbc->nr_to_write <= 0) {
			mapping->writeback_index = start / PAGE_SIZE;
			goto out;
		}

		start = 0;
		end = mapping->writeback_index * PAGE_SIZE;
		mapping->writeback_index = 0;
		ret = netfs_writepages_region(mapping, wbc, group, &start, end);
		if (ret == 0)
			mapping->writeback_index = start / PAGE_SIZE;
	} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
		start = 0;
		ret = netfs_writepages_region(mapping, wbc, group,
					      &start, LLONG_MAX);
		if (wbc->nr_to_write > 0 && ret == 0)
			mapping->writeback_index = start / PAGE_SIZE;
	} else {
		start = wbc->range_start;
		ret = netfs_writepages_region(mapping, wbc, group,
					      &start, wbc->range_end);
	}

out:
	_leave(" = %d", ret);
	return ret;
}
EXPORT_SYMBOL(netfs_writepages);
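
/*
 * In the range_cyclic case above, writeback resumes from
 * mapping->writeback_index, runs to the end of the file and then wraps round
 * to finish the region ahead of the resume point before the index is
 * updated.
 */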

/*
 * Deal with the disposition of a laundered folio.
 */
static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
{
	if (wreq->error) {
		pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
		mapping_set_error(wreq->mapping, wreq->error);
	}
}

/**
 * netfs_launder_folio - Clean up a dirty folio that's being invalidated
 * @folio: The folio to clean
 *
 * This is called to write back a folio that's being invalidated when an inode
 * is getting torn down.  Ideally, writepages would be used instead.
 */
int netfs_launder_folio(struct folio *folio)
{
	struct netfs_io_request *wreq;
	struct address_space *mapping = folio->mapping;
	struct netfs_folio *finfo = netfs_folio_info(folio);
	struct netfs_group *group = netfs_folio_group(folio);
	struct bio_vec bvec;
	unsigned long long i_size = i_size_read(mapping->host);
	unsigned long long start = folio_pos(folio);
	size_t offset = 0, len;
	int ret = 0;

	if (finfo) {
		offset = finfo->dirty_offset;
		start += offset;
		len = finfo->dirty_len;
	} else {
		len = folio_size(folio);
	}
	len = min_t(unsigned long long, len, i_size - start);

	wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
	if (IS_ERR(wreq)) {
		ret = PTR_ERR(wreq);
		goto out;
	}

	if (!folio_clear_dirty_for_io(folio))
		goto out_put;

	trace_netfs_folio(folio, netfs_folio_trace_launder);

	_debug("launder %llx-%llx", start, start + len - 1);

	/* Speculatively write to the cache.  We have to fix this up later if
	 * the store fails.
	 */
	wreq->cleanup = netfs_cleanup_launder_folio;

	bvec_set_folio(&bvec, folio, len, offset);
	iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
	ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);

out_put:
	folio_detach_private(folio);
	netfs_put_group(group);
	kfree(finfo);
	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
out:
	folio_wait_fscache(folio);
	_leave(" = %d", ret);
	return ret;
}
EXPORT_SYMBOL(netfs_launder_folio);