// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
 *
 * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
 */

#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched/signal.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/writeback.h>

#include <asm/page.h>
#include <linux/uaccess.h>

#include "attrib.h"
#include "bitmap.h"
#include "inode.h"
#include "debug.h"
#include "lcnalloc.h"
#include "malloc.h"
#include "mft.h"
#include "ntfs.h"

/**
 * ntfs_file_open - called when an inode is about to be opened
 * @vi: inode to be opened
 * @filp: file structure describing the inode
 *
 * Limit file size to the page cache limit on architectures where unsigned long
 * is 32 bits. This is the most we can do for now without overflowing the page
 * cache page index. Doing it this way means we do not run into problems with
 * existing files that are too large. It would be better to allow the user to
 * read the beginning of the file but I doubt very much anyone is going to hit
 * this check on a 32-bit architecture, so there is no point in adding the
 * extra complexity required to support this.
 *
 * On 64-bit architectures, the check is hopefully optimized away by the
 * compiler.
 *
 * After the check passes, just call generic_file_open() to do its work.
 */
static int ntfs_file_open(struct inode *vi, struct file *filp)
{
	if (sizeof(unsigned long) < 8) {
		if (i_size_read(vi) > MAX_LFS_FILESIZE)
			return -EOVERFLOW;
	}
	return generic_file_open(vi, filp);
}

#ifdef NTFS_RW

/**
 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
 * @ni: ntfs inode of the attribute to extend
 * @new_init_size: requested new initialized size in bytes
 *
 * Extend the initialized size of an attribute described by the ntfs inode @ni
 * to @new_init_size bytes. This involves zeroing any non-sparse space between
 * the old initialized size and @new_init_size both in the page cache and on
 * disk (if relevant complete pages are already uptodate in the page cache then
 * these are simply marked dirty).
 *
 * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
 * in the resident attribute case, it is tied to the initialized size and, in
 * the non-resident attribute case, it may not fall below the initialized size.
 *
 * Note that if the attribute is resident, we do not need to touch the page
 * cache at all. This is because if the page cache page is not uptodate we
 * bring it uptodate later, when doing the write to the mft record since we
 * then already have the page mapped. And if the page is uptodate, the
 * non-initialized region will already have been zeroed when the page was
 * brought uptodate and the region may in fact already have been overwritten
 * with new data via mmap() based writes, so we cannot just zero it. And since
 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
 * is unspecified, we choose not to do zeroing and thus we do not need to touch
 * the page at all. For a more detailed explanation see ntfs_truncate() in
 * fs/ntfs/inode.c.
 *
 * Return 0 on success and -errno on error. In the case that an error is
 * encountered it is possible that the initialized size will already have been
 * incremented some way towards @new_init_size but it is guaranteed that if
 * this is the case, the necessary zeroing will also have happened and that all
 * metadata is self-consistent.
 *
 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must
 * be held by the caller.
 */
static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
{
	s64 old_init_size;
	loff_t old_i_size;
	pgoff_t index, end_index;
	unsigned long flags;
	struct inode *vi = VFS_I(ni);
	ntfs_inode *base_ni;
	MFT_RECORD *m = NULL;
	ATTR_RECORD *a;
	ntfs_attr_search_ctx *ctx = NULL;
	struct address_space *mapping;
	struct page *page = NULL;
	u8 *kattr;
	int err;
	u32 attr_len;

	read_lock_irqsave(&ni->size_lock, flags);
	old_init_size = ni->initialized_size;
	old_i_size = i_size_read(vi);
	BUG_ON(new_init_size > ni->allocated_size);
	read_unlock_irqrestore(&ni->size_lock, flags);
	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
			"old_initialized_size 0x%llx, "
			"new_initialized_size 0x%llx, i_size 0x%llx.",
			vi->i_ino, (unsigned)le32_to_cpu(ni->type),
			(unsigned long long)old_init_size,
			(unsigned long long)new_init_size, old_i_size);
	if (!NInoAttr(ni))
		base_ni = ni;
	else
		base_ni = ni->ext.base_ntfs_ino;
	/* Use goto to reduce indentation and we need the label below anyway. */
	if (NInoNonResident(ni))
		goto do_non_resident_extend;
	BUG_ON(old_init_size != old_i_size);
	m = map_mft_record(base_ni);
	if (IS_ERR(m)) {
		err = PTR_ERR(m);
		m = NULL;
		goto err_out;
	}
	ctx = ntfs_attr_get_search_ctx(base_ni, m);
	if (unlikely(!ctx)) {
		err = -ENOMEM;
		goto err_out;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
	if (unlikely(err)) {
		if (err == -ENOENT)
			err = -EIO;
		goto err_out;
	}
	m = ctx->mrec;
	a = ctx->attr;
	BUG_ON(a->non_resident);
	/* The total length of the attribute value. */
	attr_len = le32_to_cpu(a->data.resident.value_length);
	BUG_ON(old_i_size != (loff_t)attr_len);
	/*
	 * Do the zeroing in the mft record and update the attribute size in
	 * the mft record.
	 */
	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
	memset(kattr + attr_len, 0, new_init_size - attr_len);
	a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
	/* Finally, update the sizes in the vfs and ntfs inodes. */
	write_lock_irqsave(&ni->size_lock, flags);
	i_size_write(vi, new_init_size);
	ni->initialized_size = new_init_size;
	write_unlock_irqrestore(&ni->size_lock, flags);
	goto done;
do_non_resident_extend:
	/*
	 * If the new initialized size @new_init_size exceeds the current file
	 * size (vfs inode->i_size), we need to extend the file size to the
	 * new initialized size.
	 */
	if (new_init_size > old_i_size) {
		m = map_mft_record(base_ni);
		if (IS_ERR(m)) {
			err = PTR_ERR(m);
			m = NULL;
			goto err_out;
		}
		ctx = ntfs_attr_get_search_ctx(base_ni, m);
		if (unlikely(!ctx)) {
			err = -ENOMEM;
			goto err_out;
		}
		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
				CASE_SENSITIVE, 0, NULL, 0, ctx);
		if (unlikely(err)) {
			if (err == -ENOENT)
				err = -EIO;
			goto err_out;
		}
		m = ctx->mrec;
		a = ctx->attr;
		BUG_ON(!a->non_resident);
		BUG_ON(old_i_size != (loff_t)
				sle64_to_cpu(a->data.non_resident.data_size));
		a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
		flush_dcache_mft_record_page(ctx->ntfs_ino);
		mark_mft_record_dirty(ctx->ntfs_ino);
		/* Update the file size in the vfs inode. */
		i_size_write(vi, new_init_size);
		ntfs_attr_put_search_ctx(ctx);
		ctx = NULL;
		unmap_mft_record(base_ni);
		m = NULL;
	}
	mapping = vi->i_mapping;
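	/*
	 * The first page containing data to initialize and the first page
	 * beyond @new_init_size.
	 */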
	index = old_init_size >> PAGE_SHIFT;
	end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	do {
		/*
		 * Read the page. If the page is not present, this will zero
		 * the uninitialized regions for us.
		 */
		page = read_mapping_page(mapping, index, NULL);
		if (IS_ERR(page)) {
			err = PTR_ERR(page);
			goto init_err_out;
		}
		/*
		 * Update the initialized size in the ntfs inode. This is
		 * enough to make ntfs_writepage() work.
		 */
		write_lock_irqsave(&ni->size_lock, flags);
		ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT;
		if (ni->initialized_size > new_init_size)
			ni->initialized_size = new_init_size;
		write_unlock_irqrestore(&ni->size_lock, flags);
		/* Set the page dirty so it gets written out. */
		set_page_dirty(page);
		put_page(page);
		/*
		 * Play nice with the vm and the rest of the system. This is
		 * very much needed as we can potentially be modifying the
		 * initialised size from a very small value to a really huge
		 * value, e.g.
		 *	f = open(somefile, O_TRUNC);
		 *	truncate(f, 10GiB);
		 *	seek(f, 10GiB);
		 *	write(f, 1);
		 * And this would mean we would be marking dirty hundreds of
		 * thousands of pages or as in the above example more than
		 * two and a half million pages!
		 *
		 * TODO: For sparse pages we could optimize this workload by
		 * using
		 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This
		 * would be set in read_folio for sparse pages and here we would
		 * not need to mark dirty any pages which have this bit set.
		 * The only caveat is that we have to clear the bit everywhere
		 * where we allocate any clusters that lie in the page or that
		 * contain the page.
		 *
		 * TODO: An even greater optimization would be for us to only
		 * call read_folio() on pages which are not in sparse regions as
		 * determined from the runlist. This would greatly reduce the
		 * number of pages we read and make dirty in the case of sparse
		 * files.
		 */
		balance_dirty_pages_ratelimited(mapping);
		cond_resched();
	} while (++index < end_index);
	read_lock_irqsave(&ni->size_lock, flags);
	BUG_ON(ni->initialized_size != new_init_size);
	read_unlock_irqrestore(&ni->size_lock, flags);
	/* Now bring in sync the initialized_size in the mft record. */
	m = map_mft_record(base_ni);
	if (IS_ERR(m)) {
		err = PTR_ERR(m);
		m = NULL;
		goto init_err_out;
	}
	ctx = ntfs_attr_get_search_ctx(base_ni, m);
	if (unlikely(!ctx)) {
		err = -ENOMEM;
		goto init_err_out;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
	if (unlikely(err)) {
		if (err == -ENOENT)
			err = -EIO;
		goto init_err_out;
	}
	m = ctx->mrec;
	a = ctx->attr;
	BUG_ON(!a->non_resident);
	a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
done:
	flush_dcache_mft_record_page(ctx->ntfs_ino);
	mark_mft_record_dirty(ctx->ntfs_ino);
	if (ctx)
		ntfs_attr_put_search_ctx(ctx);
	if (m)
		unmap_mft_record(base_ni);
	ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
			(unsigned long long)new_init_size, i_size_read(vi));
	return 0;
init_err_out:
	write_lock_irqsave(&ni->size_lock, flags);
	ni->initialized_size = old_init_size;
	write_unlock_irqrestore(&ni->size_lock, flags);
err_out:
	if (ctx)
		ntfs_attr_put_search_ctx(ctx);
	if (m)
		unmap_mft_record(base_ni);
	ntfs_debug("Failed. Returning error code %i.", err);
	return err;
}

static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
		struct iov_iter *from)
{
	loff_t pos;
	s64 end, ll;
	ssize_t err;
	unsigned long flags;
	struct file *file = iocb->ki_filp;
	struct inode *vi = file_inode(file);
	ntfs_inode *ni = NTFS_I(vi);
	ntfs_volume *vol = ni->vol;

	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
			"0x%llx, count 0x%zx.", vi->i_ino,
			(unsigned)le32_to_cpu(ni->type),
			(unsigned long long)iocb->ki_pos,
			iov_iter_count(from));
	err = generic_write_checks(iocb, from);
	if (unlikely(err <= 0))
		goto out;
	/*
	 * All checks have passed. Before we start doing any writing we want
	 * to abort any totally illegal writes.
	 */
	BUG_ON(NInoMstProtected(ni));
	BUG_ON(ni->type != AT_DATA);
	/* If file is encrypted, deny access, just like NT4. */
	if (NInoEncrypted(ni)) {
		/* Only $DATA attributes can be encrypted. */
		/*
		 * Reminder for later: Encrypted files are _always_
		 * non-resident so that the content can always be encrypted.
		 */
		ntfs_debug("Denying write access to encrypted file.");
		err = -EACCES;
		goto out;
	}
	if (NInoCompressed(ni)) {
		/* Only unnamed $DATA attribute can be compressed. */
		BUG_ON(ni->name_len);
		/*
		 * Reminder for later: If resident, the data is not actually
		 * compressed. Only on the switch to non-resident does
		 * compression kick in. This is in contrast to encrypted files
		 * (see above).
		 */
		ntfs_error(vi->i_sb, "Writing to compressed files is not "
				"implemented yet. Sorry.");
		err = -EOPNOTSUPP;
		goto out;
	}
	err = file_remove_privs(file);
	if (unlikely(err))
		goto out;
	/*
	 * Our ->update_time method always succeeds thus file_update_time()
	 * cannot fail either so there is no need to check the return code.
	 */
	file_update_time(file);
	pos = iocb->ki_pos;
	/* The first byte after the last cluster being written to. */
	end = (pos + iov_iter_count(from) + vol->cluster_size_mask) &
			~(u64)vol->cluster_size_mask;
	/*
	 * If the write goes beyond the allocated size, extend the allocation
	 * to cover the whole of the write, rounded up to the nearest cluster.
	 */
	read_lock_irqsave(&ni->size_lock, flags);
	ll = ni->allocated_size;
	read_unlock_irqrestore(&ni->size_lock, flags);
	if (end > ll) {
		/*
		 * Extend the allocation without changing the data size.
		 *
		 * Note we ensure the allocation is big enough to at least
		 * write some data but we do not require the allocation to be
		 * complete, i.e. it may be partial.
		 */
		ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
		if (likely(ll >= 0)) {
			BUG_ON(pos >= ll);
			/* If the extension was partial, truncate the write. */
			if (end > ll) {
				ntfs_debug("Truncating write to inode 0x%lx, "
						"attribute type 0x%x, because "
						"the allocation was only "
						"partially extended.",
						vi->i_ino, (unsigned)
						le32_to_cpu(ni->type));
				iov_iter_truncate(from, ll - pos);
			}
		} else {
			err = ll;
			read_lock_irqsave(&ni->size_lock, flags);
			ll = ni->allocated_size;
			read_unlock_irqrestore(&ni->size_lock, flags);
			/* Perform a partial write if possible or fail. */
			if (pos < ll) {
				ntfs_debug("Truncating write to inode 0x%lx "
						"attribute type 0x%x, because "
						"extending the allocation "
						"failed (error %d).",
						vi->i_ino, (unsigned)
						le32_to_cpu(ni->type),
						(int)-err);
				iov_iter_truncate(from, ll - pos);
			} else {
				if (err != -ENOSPC)
					ntfs_error(vi->i_sb, "Cannot perform "
							"write to inode "
							"0x%lx, attribute "
							"type 0x%x, because "
							"extending the "
							"allocation failed "
							"(error %ld).",
							vi->i_ino, (unsigned)
							le32_to_cpu(ni->type),
							(long)-err);
				else
					ntfs_debug("Cannot perform write to "
							"inode 0x%lx, "
							"attribute type 0x%x, "
							"because there is no "
							"space left.",
							vi->i_ino, (unsigned)
							le32_to_cpu(ni->type));
				goto out;
			}
		}
	}
	/*
	 * If the write starts beyond the initialized size, extend it up to the
	 * beginning of the write and initialize all non-sparse space between
	 * the old initialized size and the new one. This automatically also
	 * increments the vfs inode->i_size to keep it above or equal to the
	 * initialized_size.
	 */
	read_lock_irqsave(&ni->size_lock, flags);
	ll = ni->initialized_size;
	read_unlock_irqrestore(&ni->size_lock, flags);
	if (pos > ll) {
		/*
		 * Wait for ongoing direct i/o to complete before proceeding.
		 * New direct i/o cannot start as we hold i_mutex.
		 */
		inode_dio_wait(vi);
		err = ntfs_attr_extend_initialized(ni, pos);
		if (unlikely(err < 0))
			ntfs_error(vi->i_sb, "Cannot perform write to inode "
					"0x%lx, attribute type 0x%x, because "
					"extending the initialized size "
					"failed (error %d).", vi->i_ino,
					(unsigned)le32_to_cpu(ni->type),
					(int)-err);
	}
out:
	return err;
}

/**
 * __ntfs_grab_cache_pages - obtain a number of locked pages
 * @mapping: address space mapping from which to obtain page cache pages
 * @index: starting index in @mapping at which to begin obtaining pages
 * @nr_pages: number of page cache pages to obtain
 * @pages: array of pages in which to return the obtained page cache pages
 * @cached_page: allocated but as yet unused page
 *
 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
 * starting at index @index.
 *
 * If a page is newly created, add it to the LRU list.
 *
 * Note, the page locks are obtained in ascending page index order.
 */
static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
		pgoff_t index, const unsigned nr_pages, struct page **pages,
		struct page **cached_page)
{
	int err, nr;

	BUG_ON(!nr_pages);
	err = nr = 0;
	do {
		pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
				FGP_ACCESSED);
		if (!pages[nr]) {
			if (!*cached_page) {
				*cached_page = page_cache_alloc(mapping);
				if (unlikely(!*cached_page)) {
					err = -ENOMEM;
					goto err_out;
				}
			}
			err = add_to_page_cache_lru(*cached_page, mapping,
					index,
					mapping_gfp_constraint(mapping, GFP_KERNEL));
			if (unlikely(err)) {
				if (err == -EEXIST)
					continue;
				goto err_out;
			}
			pages[nr] = *cached_page;
			*cached_page = NULL;
		}
		index++;
		nr++;
	} while (nr < nr_pages);
out:
	return err;
err_out:
	while (nr > 0) {
		unlock_page(pages[--nr]);
		put_page(pages[nr]);
	}
	goto out;
}

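/*
 * Lock @bh and submit it for read i/o, taking an extra reference that the
 * read completion handler, end_buffer_read_sync(), drops when the i/o
 * finishes. Callers wait for completion with wait_on_buffer().
 */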
static inline void ntfs_submit_bh_for_read(struct buffer_head *bh)
{
	lock_buffer(bh);
	get_bh(bh);
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(REQ_OP_READ, bh);
}

/**
 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
 * @pages: array of destination pages
 * @nr_pages: number of pages in @pages
 * @pos: byte position in file at which the write begins
 * @bytes: number of bytes to be written
 *
 * This is called for non-resident attributes from ntfs_file_buffered_write()
 * with i_mutex held on the inode (@pages[0]->mapping->host). There are
 * @nr_pages pages in @pages which are locked but not kmap()ped. The source
 * data has not yet been copied into the @pages.
 *
 * Need to fill any holes with actual clusters, allocate buffers if necessary,
 * ensure all the buffers are mapped, and bring uptodate any buffers that are
 * only partially being written to.
 *
 * If @nr_pages is greater than one, we are guaranteed that the cluster size is
 * greater than PAGE_SIZE, that all pages in @pages are entirely inside
 * the same cluster and that they are the entirety of that cluster, and that
 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
 *
 * i_size is not to be modified yet.
 *
 * Return 0 on success or -errno on error.
 */
static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
		unsigned nr_pages, s64 pos, size_t bytes)
{
	VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
	LCN lcn;
	s64 bh_pos, vcn_len, end, initialized_size;
	sector_t lcn_block;
	struct folio *folio;
	struct inode *vi;
	ntfs_inode *ni, *base_ni = NULL;
	ntfs_volume *vol;
	runlist_element *rl, *rl2;
	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
	ntfs_attr_search_ctx *ctx = NULL;
	MFT_RECORD *m = NULL;
	ATTR_RECORD *a = NULL;
	unsigned long flags;
	u32 attr_rec_len = 0;
	unsigned blocksize, u;
	int err, mp_size;
	bool rl_write_locked, was_hole, is_retry;
	unsigned char blocksize_bits;
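	/* Flags recording which steps the error code paths below must undo. */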
	struct {
		u8 runlist_merged:1;
		u8 mft_attr_mapped:1;
		u8 mp_rebuilt:1;
		u8 attr_switched:1;
	} status = { 0, 0, 0, 0 };

	BUG_ON(!nr_pages);
	BUG_ON(!pages);
	BUG_ON(!*pages);
	vi = pages[0]->mapping->host;
	ni = NTFS_I(vi);
	vol = ni->vol;
	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
			"index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
			vi->i_ino, ni->type, pages[0]->index, nr_pages,
			(long long)pos, bytes);
	blocksize = vol->sb->s_blocksize;
	blocksize_bits = vol->sb->s_blocksize_bits;
	rl_write_locked = false;
	rl = NULL;
	err = 0;
	vcn = lcn = -1;
	vcn_len = 0;
	lcn_block = -1;
	was_hole = false;
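	/* The first cluster touched by the write and the first cluster
	   beyond it. */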
	cpos = pos >> vol->cluster_size_bits;
	end = pos + bytes;
	cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
	/*
	 * Loop over each buffer in each folio. Use goto to
	 * reduce indentation.
	 */
	u = 0;
do_next_folio:
	folio = page_folio(pages[u]);
	bh_pos = folio_pos(folio);
	head = folio_buffers(folio);
	if (!head)
		/*
		 * create_empty_buffers() will create uptodate/dirty
		 * buffers if the folio is uptodate/dirty.
		 */
		head = create_empty_buffers(folio, blocksize, 0);
	bh = head;
	do {
		VCN cdelta;
		s64 bh_end;
		unsigned bh_cofs;

		/* Clear buffer_new on all buffers to reinitialise state. */
		if (buffer_new(bh))
			clear_buffer_new(bh);
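		/*
		 * The file offset of the end of this buffer, the cluster the
		 * buffer falls in, and the buffer's byte offset within that
		 * cluster.
		 */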
		bh_end = bh_pos + blocksize;
		bh_cpos = bh_pos >> vol->cluster_size_bits;
		bh_cofs = bh_pos & vol->cluster_size_mask;
		if (buffer_mapped(bh)) {
			/*
			 * The buffer is already mapped. If it is uptodate,
			 * ignore it.
			 */
			if (buffer_uptodate(bh))
				continue;
			/*
			 * The buffer is not uptodate. If the folio is uptodate
			 * set the buffer uptodate and otherwise ignore it.
			 */
			if (folio_test_uptodate(folio)) {
				set_buffer_uptodate(bh);
				continue;
			}
			/*
			 * Neither the folio nor the buffer are uptodate. If
			 * the buffer is only partially being written to, we
			 * need to read it in before the write, i.e. now.
			 */
			if ((bh_pos < pos && bh_end > pos) ||
					(bh_pos < end && bh_end > end)) {
				/*
				 * If the buffer is fully or partially within
				 * the initialized size, do an actual read.
				 * Otherwise, simply zero the buffer.
				 */
				read_lock_irqsave(&ni->size_lock, flags);
				initialized_size = ni->initialized_size;
				read_unlock_irqrestore(&ni->size_lock, flags);
				if (bh_pos < initialized_size) {
					ntfs_submit_bh_for_read(bh);
					*wait_bh++ = bh;
				} else {
					folio_zero_range(folio, bh_offset(bh),
							blocksize);
					set_buffer_uptodate(bh);
				}
			}
			continue;
		}
		/* Unmapped buffer. Need to map it. */
		bh->b_bdev = vol->sb->s_bdev;
		/*
		 * If the current buffer is in the same clusters as the map
		 * cache, there is no need to check the runlist again. The
		 * map cache is made up of @vcn, which is the first cached file
		 * cluster, @vcn_len which is the number of cached file
		 * clusters, @lcn is the device cluster corresponding to @vcn,
		 * and @lcn_block is the block number corresponding to @lcn.
		 */
		cdelta = bh_cpos - vcn;
		if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
map_buffer_cached:
			BUG_ON(lcn < 0);
			bh->b_blocknr = lcn_block +
					(cdelta << (vol->cluster_size_bits -
					blocksize_bits)) +
					(bh_cofs >> blocksize_bits);
			set_buffer_mapped(bh);
			/*
			 * If the folio is uptodate so is the buffer. If the
			 * buffer is fully outside the write, we ignore it if
			 * it was already allocated and we mark it dirty so it
			 * gets written out if we allocated it. On the other
			 * hand, if we allocated the buffer but we are not
			 * marking it dirty we set buffer_new so we can do
			 * error recovery.
			 */
			if (folio_test_uptodate(folio)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
				if (unlikely(was_hole)) {
					/* We allocated the buffer. */
					clean_bdev_bh_alias(bh);
					if (bh_end <= pos || bh_pos >= end)
						mark_buffer_dirty(bh);
					else
						set_buffer_new(bh);
				}
				continue;
			}
			/* Page is _not_ uptodate. */
			if (likely(!was_hole)) {
				/*
				 * Buffer was already allocated. If it is not
				 * uptodate and is only partially being written
				 * to, we need to read it in before the write,
				 * i.e. now.
				 */
				if (!buffer_uptodate(bh) && bh_pos < end &&
						bh_end > pos &&
						(bh_pos < pos ||
						bh_end > end)) {
					/*
					 * If the buffer is fully or partially
					 * within the initialized size, do an
					 * actual read. Otherwise, simply zero
					 * the buffer.
					 */
					read_lock_irqsave(&ni->size_lock,
							flags);
					initialized_size = ni->initialized_size;
					read_unlock_irqrestore(&ni->size_lock,
							flags);
					if (bh_pos < initialized_size) {
						ntfs_submit_bh_for_read(bh);
						*wait_bh++ = bh;
					} else {
						folio_zero_range(folio,
								bh_offset(bh),
								blocksize);
						set_buffer_uptodate(bh);
					}
				}
				continue;
			}
			/* We allocated the buffer. */
			clean_bdev_bh_alias(bh);
			/*
			 * If the buffer is fully outside the write, zero it,
			 * set it uptodate, and mark it dirty so it gets
			 * written out. If it is partially being written to,
			 * zero region surrounding the write but leave it to
			 * commit write to do anything else. Finally, if the
			 * buffer is fully being overwritten, do nothing.
			 */
			if (bh_end <= pos || bh_pos >= end) {
				if (!buffer_uptodate(bh)) {
					folio_zero_range(folio, bh_offset(bh),
							blocksize);
					set_buffer_uptodate(bh);
				}
				mark_buffer_dirty(bh);
				continue;
			}
			set_buffer_new(bh);
			if (!buffer_uptodate(bh) &&
					(bh_pos < pos || bh_end > end)) {
				u8 *kaddr;
				unsigned pofs;

				kaddr = kmap_local_folio(folio, 0);
				if (bh_pos < pos) {
					pofs = bh_pos & ~PAGE_MASK;
					memset(kaddr + pofs, 0, pos - bh_pos);
				}
				if (bh_end > end) {
					pofs = end & ~PAGE_MASK;
					memset(kaddr + pofs, 0, bh_end - end);
				}
				kunmap_local(kaddr);
				flush_dcache_folio(folio);
			}
			continue;
		}
		/*
		 * Slow path: this is the first buffer in the cluster. If it
		 * is outside allocated size and is not uptodate, zero it and
		 * set it uptodate.
		 */
		read_lock_irqsave(&ni->size_lock, flags);
		initialized_size = ni->allocated_size;
		read_unlock_irqrestore(&ni->size_lock, flags);
		if (bh_pos > initialized_size) {
			if (folio_test_uptodate(folio)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			} else if (!buffer_uptodate(bh)) {
				folio_zero_range(folio, bh_offset(bh),
						blocksize);
				set_buffer_uptodate(bh);
			}
			continue;
		}
		is_retry = false;
		if (!rl) {
			down_read(&ni->runlist.lock);
retry_remap:
			rl = ni->runlist.rl;
		}
		if (likely(rl != NULL)) {
			/* Seek to element containing target cluster. */
			while (rl->length && rl[1].vcn <= bh_cpos)
				rl++;
			lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
			if (likely(lcn >= 0)) {
				/*
				 * Successful remap, setup the map cache and
				 * use that to deal with the buffer.
				 */
				was_hole = false;
				vcn = bh_cpos;
				vcn_len = rl[1].vcn - vcn;
				lcn_block = lcn << (vol->cluster_size_bits -
						blocksize_bits);
				cdelta = 0;
				/*
				 * If the number of remaining clusters touched
				 * by the write is smaller or equal to the
				 * number of cached clusters, unlock the
				 * runlist as the map cache will be used from
				 * now on.
				 */
				if (likely(vcn + vcn_len >= cend)) {
					if (rl_write_locked) {
						up_write(&ni->runlist.lock);
						rl_write_locked = false;
					} else
						up_read(&ni->runlist.lock);
					rl = NULL;
				}
				goto map_buffer_cached;
			}
		} else
			lcn = LCN_RL_NOT_MAPPED;
		/*
		 * If it is not a hole and not out of bounds, the runlist is
		 * probably unmapped so try to map it now.
		 */
		if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
			if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
				/* Attempt to map runlist. */
				if (!rl_write_locked) {
					/*
					 * We need the runlist locked for
					 * writing, so if it is locked for
					 * reading relock it now and retry in
					 * case it changed whilst we dropped
					 * the lock.
					 */
					up_read(&ni->runlist.lock);
					down_write(&ni->runlist.lock);
					rl_write_locked = true;
					goto retry_remap;
				}
				err = ntfs_map_runlist_nolock(ni, bh_cpos,
						NULL);
				if (likely(!err)) {
					is_retry = true;
					goto retry_remap;
				}
				/*
				 * If @vcn is out of bounds, pretend @lcn is
				 * LCN_ENOENT. As long as the buffer is out
				 * of bounds this will work fine.
				 */
				if (err == -ENOENT) {
					lcn = LCN_ENOENT;
					err = 0;
					goto rl_not_mapped_enoent;
				}
			} else
				err = -EIO;
			/* Failed to map the buffer, even after retrying. */
			bh->b_blocknr = -1;
			ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
					"attribute type 0x%x, vcn 0x%llx, "
					"vcn offset 0x%x, because its "
					"location on disk could not be "
					"determined%s (error code %i).",
					ni->mft_no, ni->type,
					(unsigned long long)bh_cpos,
					(unsigned)bh_pos &
					vol->cluster_size_mask,
					is_retry ? " even after retrying" : "",
					err);
			break;
		}
rl_not_mapped_enoent:
		/*
		 * The buffer is in a hole or out of bounds. We need to fill
		 * the hole, unless the buffer is in a cluster which is not
		 * touched by the write, in which case we just leave the buffer
		 * unmapped. This can only happen when the cluster size is
		 * less than the page cache size.
		 */
		if (unlikely(vol->cluster_size < PAGE_SIZE)) {
			bh_cend = (bh_end + vol->cluster_size - 1) >>
					vol->cluster_size_bits;
			if ((bh_cend <= cpos || bh_cpos >= cend)) {
				bh->b_blocknr = -1;
				/*
				 * If the buffer is uptodate we skip it. If it
				 * is not but the folio is uptodate, we can set
				 * the buffer uptodate. If the folio is not
				 * uptodate, we can clear the buffer and set it
				 * uptodate. Whether this is worthwhile is
				 * debatable and this could be removed.
				 */
				if (folio_test_uptodate(folio)) {
					if (!buffer_uptodate(bh))
						set_buffer_uptodate(bh);
				} else if (!buffer_uptodate(bh)) {
					folio_zero_range(folio, bh_offset(bh),
							blocksize);
					set_buffer_uptodate(bh);
				}
				continue;
			}
		}
		/*
		 * Getting here, the buffer must be in a hole as any truly out
		 * of bounds buffer was dealt with above.
		 */
		BUG_ON(lcn != LCN_HOLE);
		/*
		 * We need the runlist locked for writing, so if it is locked
		 * for reading relock it now and retry in case it changed
		 * whilst we dropped the lock.
		 */
		BUG_ON(!rl);
		if (!rl_write_locked) {
			up_read(&ni->runlist.lock);
			down_write(&ni->runlist.lock);
			rl_write_locked = true;
			goto retry_remap;
		}
		/* Find the previous last allocated cluster. */
		BUG_ON(rl->lcn != LCN_HOLE);
		lcn = -1;
		rl2 = rl;
		while (--rl2 >= ni->runlist.rl) {
			if (rl2->lcn >= 0) {
				lcn = rl2->lcn + rl2->length;
				break;
			}
		}
		rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
				false);
		if (IS_ERR(rl2)) {
			err = PTR_ERR(rl2);
			ntfs_debug("Failed to allocate cluster, error code %i.",
					err);
			break;
		}
		lcn = rl2->lcn;
		rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
		if (IS_ERR(rl)) {
			err = PTR_ERR(rl);
			if (err != -ENOMEM)
				err = -EIO;
			if (ntfs_cluster_free_from_rl(vol, rl2)) {
				ntfs_error(vol->sb, "Failed to release "
						"allocated cluster in error "
						"code path. Run chkdsk to "
						"recover the lost cluster.");
				NVolSetErrors(vol);
			}
			ntfs_free(rl2);
			break;
		}
		ni->runlist.rl = rl;
		status.runlist_merged = 1;
		ntfs_debug("Allocated cluster, lcn 0x%llx.",
				(unsigned long long)lcn);
		/* Map and lock the mft record and get the attribute record. */
		if (!NInoAttr(ni))
			base_ni = ni;
		else
			base_ni = ni->ext.base_ntfs_ino;
		m = map_mft_record(base_ni);
		if (IS_ERR(m)) {
			err = PTR_ERR(m);
			break;
		}
		ctx = ntfs_attr_get_search_ctx(base_ni, m);
		if (unlikely(!ctx)) {
			err = -ENOMEM;
			unmap_mft_record(base_ni);
			break;
		}
		status.mft_attr_mapped = 1;
		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
				CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
		if (unlikely(err)) {
			if (err == -ENOENT)
				err = -EIO;
			break;
		}
		m = ctx->mrec;
		a = ctx->attr;
		/*
		 * Find the runlist element with which the attribute extent
		 * starts. Note, we cannot use the _attr_ version because we
		 * have mapped the mft record. That is ok because we know the
		 * runlist fragment must be mapped already to have ever gotten
		 * here, so we can just use the _rl_ version.
		 */
		vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
		rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
		BUG_ON(!rl2);
		BUG_ON(!rl2->length);
		BUG_ON(rl2->lcn < LCN_HOLE);
		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
		/*
		 * If @highest_vcn is zero, calculate the real highest_vcn
		 * (which can really be zero).
		 */
		if (!highest_vcn)
			highest_vcn = (sle64_to_cpu(
					a->data.non_resident.allocated_size) >>
					vol->cluster_size_bits) - 1;
		/*
		 * Determine the size of the mapping pairs array for the new
		 * extent, i.e. the old extent with the hole filled.
		 */
		mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
				highest_vcn);
		if (unlikely(mp_size <= 0)) {
			if (!(err = mp_size))
				err = -EIO;
			ntfs_debug("Failed to get size for mapping pairs "
					"array, error code %i.", err);
			break;
		}
		/*
		 * Resize the attribute record to fit the new mapping pairs
		 * array.
		 */
		attr_rec_len = le32_to_cpu(a->length);
		err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
				a->data.non_resident.mapping_pairs_offset));
		if (unlikely(err)) {
			BUG_ON(err != -ENOSPC);
			// TODO: Deal with this by using the current attribute
			// and fill it with as much of the mapping pairs
			// array as possible. Then loop over each attribute
			// extent rewriting the mapping pairs arrays as we go
			// along and if, when we reach the end, we do not have
			// enough space, try to resize the last attribute
			// extent and if even that fails, add a new attribute
			// extent.
			// We could also try to resize at each step in the hope
			// that we will not need to rewrite every single extent.
			// Note, we may need to decompress some extents to fill
			// the runlist as we are walking the extents...
			ntfs_error(vol->sb, "Not enough space in the mft "
					"record for the extended attribute "
					"record. This case is not "
					"implemented yet.");
			err = -EOPNOTSUPP;
			break;
		}
		status.mp_rebuilt = 1;
		/*
		 * Generate the mapping pairs array directly into the attribute
		 * record.
		 */
		err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
				a->data.non_resident.mapping_pairs_offset),
				mp_size, rl2, vcn, highest_vcn, NULL);
		if (unlikely(err)) {
			ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
					"attribute type 0x%x, because building "
					"the mapping pairs failed with error "
					"code %i.", vi->i_ino,
					(unsigned)le32_to_cpu(ni->type), err);
			err = -EIO;
			break;
		}
		/* Update the highest_vcn but only if it was not set. */
		if (unlikely(!a->data.non_resident.highest_vcn))
			a->data.non_resident.highest_vcn =
					cpu_to_sle64(highest_vcn);
		/*
		 * If the attribute is sparse/compressed, update the compressed
		 * size in the ntfs_inode structure and the attribute record.
		 */
		if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
			/*
			 * If we are not in the first attribute extent, switch
			 * to it, but first ensure the changes will make it to
			 * disk later.
			 */
			if (a->data.non_resident.lowest_vcn) {
				flush_dcache_mft_record_page(ctx->ntfs_ino);
				mark_mft_record_dirty(ctx->ntfs_ino);
				ntfs_attr_reinit_search_ctx(ctx);
				err = ntfs_attr_lookup(ni->type, ni->name,
						ni->name_len, CASE_SENSITIVE,
						0, NULL, 0, ctx);
				if (unlikely(err)) {
					status.attr_switched = 1;
					break;
				}
				/* @m is not used any more so do not set it. */
				a = ctx->attr;
			}
			write_lock_irqsave(&ni->size_lock, flags);
			ni->itype.compressed.size += vol->cluster_size;
			a->data.non_resident.compressed_size =
					cpu_to_sle64(ni->itype.compressed.size);
			write_unlock_irqrestore(&ni->size_lock, flags);
		}
		/* Ensure the changes make it to disk. */
		flush_dcache_mft_record_page(ctx->ntfs_ino);
		mark_mft_record_dirty(ctx->ntfs_ino);
		ntfs_attr_put_search_ctx(ctx);
		unmap_mft_record(base_ni);
		/* Successfully filled the hole. */
		status.runlist_merged = 0;
		status.mft_attr_mapped = 0;
		status.mp_rebuilt = 0;
		/* Setup the map cache and use that to deal with the buffer. */
		was_hole = true;
		vcn = bh_cpos;
		vcn_len = 1;
		lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
		cdelta = 0;
		/*
		 * If the number of remaining clusters in the @pages is smaller
		 * or equal to the number of cached clusters, unlock the
		 * runlist as the map cache will be used from now on.
		 */
		if (likely(vcn + vcn_len >= cend)) {
			up_write(&ni->runlist.lock);
			rl_write_locked = false;
			rl = NULL;
		}
		goto map_buffer_cached;
	} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
	/* If there are no errors, do the next page. */
	if (likely(!err && ++u < nr_pages))
		goto do_next_folio;
	/* If there are no errors, release the runlist lock if we took it. */
	if (likely(!err)) {
		if (unlikely(rl_write_locked)) {
			up_write(&ni->runlist.lock);
			rl_write_locked = false;
		} else if (unlikely(rl))
			up_read(&ni->runlist.lock);
		rl = NULL;
	}
	/* If we issued read requests, let them complete. */
	read_lock_irqsave(&ni->size_lock, flags);
	initialized_size = ni->initialized_size;
	read_unlock_irqrestore(&ni->size_lock, flags);
	while (wait_bh > wait) {
		bh = *--wait_bh;
		wait_on_buffer(bh);
		if (likely(buffer_uptodate(bh))) {
			folio = bh->b_folio;
			bh_pos = folio_pos(folio) + bh_offset(bh);
			/*
			 * If the buffer overflows the initialized size, need
			 * to zero the overflowing region.
			 */
			if (unlikely(bh_pos + blocksize > initialized_size)) {
				int ofs = 0;

				if (likely(bh_pos < initialized_size))
					ofs = initialized_size - bh_pos;
				folio_zero_segment(folio, bh_offset(bh) + ofs,
						blocksize);
			}
		} else /* if (unlikely(!buffer_uptodate(bh))) */
			err = -EIO;
	}
	if (likely(!err)) {
		/* Clear buffer_new on all buffers. */
		u = 0;
		do {
			bh = head = page_buffers(pages[u]);
			do {
				if (buffer_new(bh))
					clear_buffer_new(bh);
			} while ((bh = bh->b_this_page) != head);
		} while (++u < nr_pages);
		ntfs_debug("Done.");
		return err;
	}
	if (status.attr_switched) {
		/* Get back to the attribute extent we modified. */
		ntfs_attr_reinit_search_ctx(ctx);
		if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
				CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
			ntfs_error(vol->sb, "Failed to find required "
					"attribute extent of attribute in "
					"error code path. Run chkdsk to "
					"recover.");
			write_lock_irqsave(&ni->size_lock, flags);
			ni->itype.compressed.size += vol->cluster_size;
			write_unlock_irqrestore(&ni->size_lock, flags);
			flush_dcache_mft_record_page(ctx->ntfs_ino);
			mark_mft_record_dirty(ctx->ntfs_ino);
			/*
			 * The only thing that is now wrong is the compressed
			 * size of the base attribute extent which chkdsk
			 * should be able to fix.
			 */
			NVolSetErrors(vol);
		} else {
			m = ctx->mrec;
			a = ctx->attr;
			status.attr_switched = 0;
		}
	}
	/*
	 * If the runlist has been modified, need to restore it by punching a
	 * hole into it and we then need to deallocate the on-disk cluster as
	 * well. Note, we only modify the runlist if we are able to generate a
	 * new mapping pairs array, i.e. only when the mapped attribute extent
	 * is not switched.
	 */
	if (status.runlist_merged && !status.attr_switched) {
		BUG_ON(!rl_write_locked);
		/* Make the file cluster we allocated sparse in the runlist. */
		if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
			ntfs_error(vol->sb, "Failed to punch hole into "
					"attribute runlist in error code "
					"path. Run chkdsk to recover the "
					"lost cluster.");
			NVolSetErrors(vol);
		} else /* if (success) */ {
			status.runlist_merged = 0;
			/*
			 * Deallocate the on-disk cluster we allocated but only
			 * if we succeeded in punching its vcn out of the
			 * runlist.
			 */
			down_write(&vol->lcnbmp_lock);
			if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
				ntfs_error(vol->sb, "Failed to release "
						"allocated cluster in error "
						"code path. Run chkdsk to "
						"recover the lost cluster.");
				NVolSetErrors(vol);
			}
			up_write(&vol->lcnbmp_lock);
		}
	}
	/*
	 * Resize the attribute record to its old size and rebuild the mapping
	 * pairs array. Note, we only can do this if the runlist has been
	 * restored to its old state which also implies that the mapped
	 * attribute extent is not switched.
	 */
	if (status.mp_rebuilt && !status.runlist_merged) {
		if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
			ntfs_error(vol->sb, "Failed to restore attribute "
					"record in error code path. Run "
					"chkdsk to recover.");
			NVolSetErrors(vol);
		} else /* if (success) */ {
			if (ntfs_mapping_pairs_build(vol, (u8*)a +
					le16_to_cpu(a->data.non_resident.
					mapping_pairs_offset), attr_rec_len -
					le16_to_cpu(a->data.non_resident.
					mapping_pairs_offset), ni->runlist.rl,
					vcn, highest_vcn, NULL)) {
				ntfs_error(vol->sb, "Failed to restore "
						"mapping pairs array in error "
						"code path. Run chkdsk to "
						"recover.");
				NVolSetErrors(vol);
			}
			flush_dcache_mft_record_page(ctx->ntfs_ino);
			mark_mft_record_dirty(ctx->ntfs_ino);
		}
	}
	/* Release the mft record and the attribute. */
	if (status.mft_attr_mapped) {
		ntfs_attr_put_search_ctx(ctx);
		unmap_mft_record(base_ni);
	}
	/* Release the runlist lock. */
	if (rl_write_locked)
		up_write(&ni->runlist.lock);
	else if (rl)
		up_read(&ni->runlist.lock);
	/*
	 * Zero out any newly allocated blocks to avoid exposing stale data.
	 * If BH_New is set, we know that the block was newly allocated above
	 * and that it has not been fully zeroed and marked dirty yet.
	 */
	nr_pages = u;
	u = 0;
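	/*
	 * First byte of the cluster in which the failure occurred; buffers at
	 * or beyond this offset in the last processed page were never reached.
	 */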
	end = bh_cpos << vol->cluster_size_bits;
	do {
		folio = page_folio(pages[u]);
		bh = head = folio_buffers(folio);
		do {
			if (u == nr_pages &&
					folio_pos(folio) + bh_offset(bh) >= end)
				break;
			if (!buffer_new(bh))
				continue;
			clear_buffer_new(bh);
			if (!buffer_uptodate(bh)) {
				if (folio_test_uptodate(folio))
					set_buffer_uptodate(bh);
				else {
					folio_zero_range(folio, bh_offset(bh),
							blocksize);
					set_buffer_uptodate(bh);
				}
			}
			mark_buffer_dirty(bh);
		} while ((bh = bh->b_this_page) != head);
	} while (++u <= nr_pages);
	ntfs_error(vol->sb, "Failed. Returning error code %i.", err);
	return err;
}

static inline void ntfs_flush_dcache_pages(struct page **pages,
		unsigned nr_pages)
{
	BUG_ON(!nr_pages);
	/*
	 * Warning: Do not do the decrement at the same time as the call to
	 * flush_dcache_page() because it is a NULL macro on i386 and hence the
	 * decrement never happens so the loop never terminates.
	 */
	do {
		--nr_pages;
		flush_dcache_page(pages[nr_pages]);
	} while (nr_pages > 0);
}

/**
 * ntfs_commit_pages_after_non_resident_write - commit the received data
 * @pages: array of destination pages
 * @nr_pages: number of pages in @pages
 * @pos: byte position in file at which the write begins
 * @bytes: number of bytes to be written
 *
 * See description of ntfs_commit_pages_after_write(), below.
 */
static inline int ntfs_commit_pages_after_non_resident_write(
		struct page **pages, const unsigned nr_pages,
		s64 pos, size_t bytes)
{
	s64 end, initialized_size;
	struct inode *vi;
	ntfs_inode *ni, *base_ni;
	struct buffer_head *bh, *head;
	ntfs_attr_search_ctx *ctx;
	MFT_RECORD *m;
	ATTR_RECORD *a;
	unsigned long flags;
	unsigned blocksize, u;
	int err;

	vi = pages[0]->mapping->host;
	ni = NTFS_I(vi);
	blocksize = vi->i_sb->s_blocksize;
	end = pos + bytes;
	u = 0;
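	/*
	 * Walk the buffers in each page, setting uptodate and dirty those
	 * that lie inside the write.
	 */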
	do {
		s64 bh_pos;
		struct page *page;
		bool partial;

		page = pages[u];
		bh_pos = (s64)page->index << PAGE_SHIFT;
		bh = head = page_buffers(page);
		partial = false;
		do {
			s64 bh_end;

			bh_end = bh_pos + blocksize;
			if (bh_end <= pos || bh_pos >= end) {
				if (!buffer_uptodate(bh))
					partial = true;
			} else {
				set_buffer_uptodate(bh);
				mark_buffer_dirty(bh);
			}
		} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
		/*
		 * If all buffers are now uptodate but the page is not, set the
		 * page uptodate.
		 */
		if (!partial && !PageUptodate(page))
			SetPageUptodate(page);
	} while (++u < nr_pages);
	/*
	 * Finally, if we do not need to update initialized_size or i_size we
	 * are finished.
	 */
	read_lock_irqsave(&ni->size_lock, flags);
	initialized_size = ni->initialized_size;
	read_unlock_irqrestore(&ni->size_lock, flags);
	if (end <= initialized_size) {
		ntfs_debug("Done.");
		return 0;
	}
	/*
	 * Update initialized_size/i_size as appropriate, both in the inode and
	 * the mft record.
	 */
	if (!NInoAttr(ni))
		base_ni = ni;
	else
		base_ni = ni->ext.base_ntfs_ino;
	/* Map, pin, and lock the mft record. */
	m = map_mft_record(base_ni);
	if (IS_ERR(m)) {
		err = PTR_ERR(m);
		m = NULL;
		ctx = NULL;
		goto err_out;
	}
	BUG_ON(!NInoNonResident(ni));
	ctx = ntfs_attr_get_search_ctx(base_ni, m);
	if (unlikely(!ctx)) {
		err = -ENOMEM;
		goto err_out;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
	if (unlikely(err)) {
		if (err == -ENOENT)
			err = -EIO;
		goto err_out;
	}
	a = ctx->attr;
	BUG_ON(!a->non_resident);
	write_lock_irqsave(&ni->size_lock, flags);
	BUG_ON(end > ni->allocated_size);
	ni->initialized_size = end;
	a->data.non_resident.initialized_size = cpu_to_sle64(end);
	if (end > i_size_read(vi)) {
		i_size_write(vi, end);
		a->data.non_resident.data_size =
				a->data.non_resident.initialized_size;
	}
	write_unlock_irqrestore(&ni->size_lock, flags);
	/* Mark the mft record dirty, so it gets written back. */
	flush_dcache_mft_record_page(ctx->ntfs_ino);
	mark_mft_record_dirty(ctx->ntfs_ino);
	ntfs_attr_put_search_ctx(ctx);
	unmap_mft_record(base_ni);
	ntfs_debug("Done.");
	return 0;
err_out:
	if (ctx)
		ntfs_attr_put_search_ctx(ctx);
	if (m)
		unmap_mft_record(base_ni);
	ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
			"code %i).", err);
	if (err != -ENOMEM)
		NVolSetErrors(ni->vol);
	return err;
}

/**
 * ntfs_commit_pages_after_write - commit the received data
 * @pages: array of destination pages
 * @nr_pages: number of pages in @pages
 * @pos: byte position in file at which the write begins
 * @bytes: number of bytes to be written
 *
 * This is called from ntfs_file_buffered_write() with i_mutex held on the inode
 * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are
 * locked but not kmap()ped. The source data has already been copied into the
 * @pages. ntfs_prepare_pages_for_non_resident_write() has been called before
1500 | * the data was copied (for non-resident attributes only) and it returned |
1501 | * success. |
1502 | * |
1503 | * Need to set uptodate and mark dirty all buffers within the boundary of the |
1504 | * write. If all buffers in a page are uptodate we set the page uptodate, too. |
1505 | * |
1506 | * Setting the buffers dirty ensures that they get written out later when |
1507 | * ntfs_writepage() is invoked by the VM. |
1508 | * |
1509 | * Finally, we need to update i_size and initialized_size as appropriate both |
1510 | * in the inode and the mft record. |
1511 | * |
1512 | * This is modelled after fs/buffer.c::generic_commit_write(), which marks |
1513 | * buffers uptodate and dirty, sets the page uptodate if all buffers in the |
1514 | * page are uptodate, and updates i_size if the end of io is beyond i_size. In |
1515 | * that case, it also marks the inode dirty. |
1516 | * |
1517 | * If things have gone as outlined in |
1518 | * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page |
 * content modifications here for non-resident attributes. For resident
 * attributes we need to bring the page uptodate here, which we combine with
 * the copy into the mft record, saving one atomic kmap.
1522 | * |
1523 | * Return 0 on success or -errno on error. |
1524 | */ |
1525 | static int ntfs_commit_pages_after_write(struct page **pages, |
1526 | const unsigned nr_pages, s64 pos, size_t bytes) |
1527 | { |
1528 | s64 end, initialized_size; |
1529 | loff_t i_size; |
1530 | struct inode *vi; |
1531 | ntfs_inode *ni, *base_ni; |
1532 | struct page *page; |
1533 | ntfs_attr_search_ctx *ctx; |
1534 | MFT_RECORD *m; |
1535 | ATTR_RECORD *a; |
1536 | char *kattr, *kaddr; |
1537 | unsigned long flags; |
1538 | u32 attr_len; |
1539 | int err; |
1540 | |
1541 | BUG_ON(!nr_pages); |
1542 | BUG_ON(!pages); |
1543 | page = pages[0]; |
1544 | BUG_ON(!page); |
1545 | vi = page->mapping->host; |
	ni = NTFS_I(vi);
	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
			"index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
			vi->i_ino, ni->type, page->index, nr_pages,
			(long long)pos, bytes);
1551 | if (NInoNonResident(ni)) |
1552 | return ntfs_commit_pages_after_non_resident_write(pages, |
1553 | nr_pages, pos, bytes); |
1554 | BUG_ON(nr_pages > 1); |
1555 | /* |
1556 | * Attribute is resident, implying it is not compressed, encrypted, or |
1557 | * sparse. |
1558 | */ |
1559 | if (!NInoAttr(ni)) |
1560 | base_ni = ni; |
1561 | else |
1562 | base_ni = ni->ext.base_ntfs_ino; |
1563 | BUG_ON(NInoNonResident(ni)); |
1564 | /* Map, pin, and lock the mft record. */ |
	m = map_mft_record(base_ni);
	if (IS_ERR(m)) {
		err = PTR_ERR(m);
		m = NULL;
		ctx = NULL;
		goto err_out;
	}
	ctx = ntfs_attr_get_search_ctx(base_ni, m);
	if (unlikely(!ctx)) {
		err = -ENOMEM;
		goto err_out;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
1579 | if (unlikely(err)) { |
1580 | if (err == -ENOENT) |
1581 | err = -EIO; |
1582 | goto err_out; |
1583 | } |
1584 | a = ctx->attr; |
1585 | BUG_ON(a->non_resident); |
1586 | /* The total length of the attribute value. */ |
1587 | attr_len = le32_to_cpu(a->data.resident.value_length); |
	i_size = i_size_read(vi);
1589 | BUG_ON(attr_len != i_size); |
1590 | BUG_ON(pos > attr_len); |
1591 | end = pos + bytes; |
1592 | BUG_ON(end > le32_to_cpu(a->length) - |
1593 | le16_to_cpu(a->data.resident.value_offset)); |
	kattr = (char *)a + le16_to_cpu(a->data.resident.value_offset);
1595 | kaddr = kmap_atomic(page); |
1596 | /* Copy the received data from the page to the mft record. */ |
1597 | memcpy(kattr + pos, kaddr + pos, bytes); |
1598 | /* Update the attribute length if necessary. */ |
1599 | if (end > attr_len) { |
1600 | attr_len = end; |
1601 | a->data.resident.value_length = cpu_to_le32(attr_len); |
1602 | } |
1603 | /* |
1604 | * If the page is not uptodate, bring the out of bounds area(s) |
1605 | * uptodate by copying data from the mft record to the page. |
1606 | */ |
1607 | if (!PageUptodate(page)) { |
1608 | if (pos > 0) |
1609 | memcpy(kaddr, kattr, pos); |
1610 | if (end < attr_len) |
1611 | memcpy(kaddr + end, kattr + end, attr_len - end); |
1612 | /* Zero the region outside the end of the attribute value. */ |
1613 | memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len); |
1614 | flush_dcache_page(page); |
1615 | SetPageUptodate(page); |
1616 | } |
1617 | kunmap_atomic(kaddr); |
1618 | /* Update initialized_size/i_size if necessary. */ |
1619 | read_lock_irqsave(&ni->size_lock, flags); |
1620 | initialized_size = ni->initialized_size; |
1621 | BUG_ON(end > ni->allocated_size); |
1622 | read_unlock_irqrestore(&ni->size_lock, flags); |
1623 | BUG_ON(initialized_size != i_size); |
1624 | if (end > initialized_size) { |
1625 | write_lock_irqsave(&ni->size_lock, flags); |
1626 | ni->initialized_size = end; |
		i_size_write(vi, end);
1628 | write_unlock_irqrestore(&ni->size_lock, flags); |
1629 | } |
1630 | /* Mark the mft record dirty, so it gets written back. */ |
	flush_dcache_mft_record_page(ctx->ntfs_ino);
	mark_mft_record_dirty(ctx->ntfs_ino);
	ntfs_attr_put_search_ctx(ctx);
	unmap_mft_record(base_ni);
	ntfs_debug("Done.");
1636 | return 0; |
1637 | err_out: |
1638 | if (err == -ENOMEM) { |
		ntfs_warning(vi->i_sb, "Error allocating memory required to "
				"commit the write.");
		if (PageUptodate(page)) {
			ntfs_warning(vi->i_sb, "Page is uptodate, setting "
					"dirty so the write will be retried "
					"later on by the VM.");
			/*
			 * Put the page on mapping->dirty_pages, but leave its
			 * buffers' dirty state as-is.
			 */
			__set_page_dirty_nobuffers(page);
			err = 0;
		} else
			ntfs_error(vi->i_sb, "Page is not uptodate. Written "
					"data has been lost.");
	} else {
		ntfs_error(vi->i_sb, "Resident attribute commit write failed "
				"with error %i.", err);
		NVolSetErrors(ni->vol);
1658 | } |
1659 | if (ctx) |
1660 | ntfs_attr_put_search_ctx(ctx); |
1661 | if (m) |
		unmap_mft_record(base_ni);
1663 | return err; |
1664 | } |
1665 | |
1666 | /* |
1667 | * Copy as much as we can into the pages and return the number of bytes which |
1668 | * were successfully copied. If a fault is encountered then clear the pages |
1669 | * out to (ofs + bytes) and return the number of bytes which were copied. |
1670 | */ |
1671 | static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, |
1672 | unsigned ofs, struct iov_iter *i, size_t bytes) |
1673 | { |
1674 | struct page **last_page = pages + nr_pages; |
1675 | size_t total = 0; |
1676 | unsigned len, copied; |
1677 | |
1678 | do { |
1679 | len = PAGE_SIZE - ofs; |
1680 | if (len > bytes) |
1681 | len = bytes; |
		copied = copy_page_from_iter_atomic(*pages, ofs, len, i);
1683 | total += copied; |
1684 | bytes -= copied; |
1685 | if (!bytes) |
1686 | break; |
1687 | if (copied < len) |
1688 | goto err; |
1689 | ofs = 0; |
1690 | } while (++pages < last_page); |
1691 | out: |
1692 | return total; |
1693 | err: |
1694 | /* Zero the rest of the target like __copy_from_user(). */ |
1695 | len = PAGE_SIZE - copied; |
1696 | do { |
1697 | if (len > bytes) |
1698 | len = bytes; |
		zero_user(*pages, copied, len);
1700 | bytes -= len; |
1701 | copied = 0; |
1702 | len = PAGE_SIZE; |
1703 | } while (++pages < last_page); |
1704 | goto out; |
1705 | } |
1706 | |
1707 | /** |
1708 | * ntfs_perform_write - perform buffered write to a file |
1709 | * @file: file to write to |
1710 | * @i: iov_iter with data to write |
1711 | * @pos: byte offset in file at which to begin writing to |
1712 | */ |
1713 | static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, |
1714 | loff_t pos) |
1715 | { |
1716 | struct address_space *mapping = file->f_mapping; |
1717 | struct inode *vi = mapping->host; |
	ntfs_inode *ni = NTFS_I(vi);
1719 | ntfs_volume *vol = ni->vol; |
1720 | struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; |
1721 | struct page *cached_page = NULL; |
1722 | VCN last_vcn; |
1723 | LCN lcn; |
1724 | size_t bytes; |
1725 | ssize_t status, written = 0; |
1726 | unsigned nr_pages; |
1727 | |
1728 | ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " |
1729 | "0x%llx, count 0x%lx." , vi->i_ino, |
1730 | (unsigned)le32_to_cpu(ni->type), |
1731 | (unsigned long long)pos, |
1732 | (unsigned long)iov_iter_count(i)); |
1733 | /* |
1734 | * If a previous ntfs_truncate() failed, repeat it and abort if it |
1735 | * fails again. |
1736 | */ |
1737 | if (unlikely(NInoTruncateFailed(ni))) { |
1738 | int err; |
1739 | |
		inode_dio_wait(vi);
1741 | err = ntfs_truncate(vi); |
1742 | if (err || NInoTruncateFailed(ni)) { |
1743 | if (!err) |
1744 | err = -EIO; |
1745 | ntfs_error(vol->sb, "Cannot perform write to inode " |
1746 | "0x%lx, attribute type 0x%x, because " |
1747 | "ntfs_truncate() failed (error code " |
1748 | "%i)." , vi->i_ino, |
1749 | (unsigned)le32_to_cpu(ni->type), err); |
1750 | return err; |
1751 | } |
1752 | } |
1753 | /* |
1754 | * Determine the number of pages per cluster for non-resident |
1755 | * attributes. |
1756 | */ |
1757 | nr_pages = 1; |
1758 | if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni)) |
1759 | nr_pages = vol->cluster_size >> PAGE_SHIFT; |
1760 | last_vcn = -1; |
1761 | do { |
1762 | VCN vcn; |
1763 | pgoff_t start_idx; |
1764 | unsigned ofs, do_pages, u; |
1765 | size_t copied; |
1766 | |
1767 | start_idx = pos >> PAGE_SHIFT; |
1768 | ofs = pos & ~PAGE_MASK; |
1769 | bytes = PAGE_SIZE - ofs; |
1770 | do_pages = 1; |
1771 | if (nr_pages > 1) { |
1772 | vcn = pos >> vol->cluster_size_bits; |
1773 | if (vcn != last_vcn) { |
1774 | last_vcn = vcn; |
1775 | /* |
1776 | * Get the lcn of the vcn the write is in. If |
1777 | * it is a hole, need to lock down all pages in |
1778 | * the cluster. |
1779 | */ |
				down_read(&ni->runlist.lock);
				lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
						vol->cluster_size_bits, false);
				up_read(&ni->runlist.lock);
1784 | if (unlikely(lcn < LCN_HOLE)) { |
1785 | if (lcn == LCN_ENOMEM) |
1786 | status = -ENOMEM; |
1787 | else { |
1788 | status = -EIO; |
1789 | ntfs_error(vol->sb, "Cannot " |
1790 | "perform write to " |
1791 | "inode 0x%lx, " |
1792 | "attribute type 0x%x, " |
1793 | "because the attribute " |
1794 | "is corrupt." , |
1795 | vi->i_ino, (unsigned) |
1796 | le32_to_cpu(ni->type)); |
1797 | } |
1798 | break; |
1799 | } |
1800 | if (lcn == LCN_HOLE) { |
1801 | start_idx = (pos & ~(s64) |
1802 | vol->cluster_size_mask) |
1803 | >> PAGE_SHIFT; |
1804 | bytes = vol->cluster_size - (pos & |
1805 | vol->cluster_size_mask); |
1806 | do_pages = nr_pages; |
1807 | } |
1808 | } |
1809 | } |
1810 | if (bytes > iov_iter_count(i)) |
1811 | bytes = iov_iter_count(i); |
1812 | again: |
1813 | /* |
1814 | * Bring in the user page(s) that we will copy from _first_. |
1815 | * Otherwise there is a nasty deadlock on copying from the same |
1816 | * page(s) as we are writing to, without it/them being marked |
1817 | * up-to-date. Note, at present there is nothing to stop the |
1818 | * pages being swapped out between us bringing them into memory |
1819 | * and doing the actual copying. |
1820 | */ |
1821 | if (unlikely(fault_in_iov_iter_readable(i, bytes))) { |
1822 | status = -EFAULT; |
1823 | break; |
1824 | } |
1825 | /* Get and lock @do_pages starting at index @start_idx. */ |
		status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
				pages, &cached_page);
1828 | if (unlikely(status)) |
1829 | break; |
1830 | /* |
1831 | * For non-resident attributes, we need to fill any holes with |
		 * actual clusters and ensure all buffers are mapped. We also
1833 | * need to bring uptodate any buffers that are only partially |
1834 | * being written to. |
1835 | */ |
1836 | if (NInoNonResident(ni)) { |
			status = ntfs_prepare_pages_for_non_resident_write(
					pages, do_pages, pos, bytes);
			if (unlikely(status)) {
				do {
					unlock_page(pages[--do_pages]);
					put_page(pages[do_pages]);
1843 | } while (do_pages); |
1844 | break; |
1845 | } |
1846 | } |
1847 | u = (pos >> PAGE_SHIFT) - pages[0]->index; |
		copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
				i, bytes);
		ntfs_flush_dcache_pages(pages + u, do_pages - u);
1851 | status = 0; |
1852 | if (likely(copied == bytes)) { |
			status = ntfs_commit_pages_after_write(pages, do_pages,
					pos, bytes);
1855 | } |
		do {
			unlock_page(pages[--do_pages]);
			put_page(pages[do_pages]);
		} while (do_pages);
		if (unlikely(status < 0)) {
			iov_iter_revert(i, copied);
			break;
		}
		cond_resched();
		if (unlikely(copied < bytes)) {
			iov_iter_revert(i, copied);
1867 | if (copied) |
1868 | bytes = copied; |
1869 | else if (bytes > PAGE_SIZE - ofs) |
1870 | bytes = PAGE_SIZE - ofs; |
1871 | goto again; |
1872 | } |
1873 | pos += copied; |
1874 | written += copied; |
1875 | balance_dirty_pages_ratelimited(mapping); |
1876 | if (fatal_signal_pending(current)) { |
1877 | status = -EINTR; |
1878 | break; |
1879 | } |
1880 | } while (iov_iter_count(i)); |
1881 | if (cached_page) |
		put_page(cached_page);
	ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
			written ? "written" : "status", (unsigned long)written,
			(long)status);
1886 | return written ? written : status; |
1887 | } |
1888 | |
1889 | /** |
1890 | * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock() |
1891 | * @iocb: IO state structure |
1892 | * @from: iov_iter with data to write |
1893 | * |
 * Basically the same as generic_file_write_iter() except that it ends up
 * calling ntfs_perform_write() instead of generic_perform_write() and that
 * O_DIRECT is not implemented.
1897 | */ |
1898 | static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) |
1899 | { |
1900 | struct file *file = iocb->ki_filp; |
	struct inode *vi = file_inode(file);
	ssize_t written = 0;
	ssize_t err;

	inode_lock(vi);
	/* We can write back this queue in page reclaim. */
	err = ntfs_prepare_file_for_write(iocb, from);
	if (iov_iter_count(from) && !err)
		written = ntfs_perform_write(file, from, iocb->ki_pos);
	inode_unlock(vi);
	iocb->ki_pos += written;
	if (likely(written > 0))
		written = generic_write_sync(iocb, written);
1914 | return written ? written : err; |
1915 | } |
1916 | |
1917 | /** |
1918 | * ntfs_file_fsync - sync a file to disk |
 * @filp: file to be synced
 * @start: start offset in bytes of the range to sync
 * @end: end offset in bytes of the range to sync
 * @datasync: if non-zero only flush user data and not metadata
1921 | * |
1922 | * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync |
1923 | * system calls. This function is inspired by fs/buffer.c::file_fsync(). |
1924 | * |
1925 | * If @datasync is false, write the mft record and all associated extent mft |
1926 | * records as well as the $DATA attribute and then sync the block device. |
1927 | * |
1928 | * If @datasync is true and the attribute is non-resident, we skip the writing |
1929 | * of the mft record and all associated extent mft records (this might still |
1930 | * happen due to the write_inode_now() call). |
1931 | * |
1932 | * Also, if @datasync is true, we do not wait on the inode to be written out |
1933 | * but we always wait on the page cache pages to be written out. |
1934 | * |
1935 | * Locking: Caller must hold i_mutex on the inode. |
1936 | * |
1937 | * TODO: We should probably also write all attribute/index inodes associated |
1938 | * with this inode but since we have no simple way of getting to them we ignore |
1939 | * this problem for now. |
1940 | */ |
1941 | static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, |
1942 | int datasync) |
1943 | { |
1944 | struct inode *vi = filp->f_mapping->host; |
1945 | int err, ret = 0; |
1946 | |
1947 | ntfs_debug("Entering for inode 0x%lx." , vi->i_ino); |
1948 | |
1949 | err = file_write_and_wait_range(file: filp, start, end); |
1950 | if (err) |
1951 | return err; |
1952 | inode_lock(inode: vi); |
1953 | |
1954 | BUG_ON(S_ISDIR(vi->i_mode)); |
1955 | if (!datasync || !NInoNonResident(ni: NTFS_I(inode: vi))) |
1956 | ret = __ntfs_write_inode(vi, sync: 1); |
1957 | write_inode_now(vi, sync: !datasync); |
1958 | /* |
1959 | * NOTE: If we were to use mapping->private_list (see ext2 and |
1960 | * fs/buffer.c) for dirty blocks then we could optimize the below to be |
1961 | * sync_mapping_buffers(vi->i_mapping). |
1962 | */ |
	err = sync_blockdev(vi->i_sb->s_bdev);
1964 | if (unlikely(err && !ret)) |
1965 | ret = err; |
1966 | if (likely(!ret)) |
1967 | ntfs_debug("Done." ); |
1968 | else |
		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
	inode_unlock(vi);
1972 | return ret; |
1973 | } |
1974 | |
1975 | #endif /* NTFS_RW */ |
1976 | |
1977 | const struct file_operations ntfs_file_ops = { |
1978 | .llseek = generic_file_llseek, |
1979 | .read_iter = generic_file_read_iter, |
1980 | #ifdef NTFS_RW |
1981 | .write_iter = ntfs_file_write_iter, |
1982 | .fsync = ntfs_file_fsync, |
1983 | #endif /* NTFS_RW */ |
1984 | .mmap = generic_file_mmap, |
1985 | .open = ntfs_file_open, |
1986 | .splice_read = filemap_splice_read, |
1987 | }; |
1988 | |
1989 | const struct inode_operations ntfs_file_inode_ops = { |
1990 | #ifdef NTFS_RW |
1991 | .setattr = ntfs_setattr, |
1992 | #endif /* NTFS_RW */ |
1993 | }; |
1994 | |
1995 | const struct file_operations ntfs_empty_file_ops = {}; |
1996 | |
1997 | const struct inode_operations ntfs_empty_inode_ops = {}; |
1998 | |