aops.c source code [linux/fs/ntfs/aops.c]

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * aops.c - NTFS kernel address space operations and page cache handling.
 *
 * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
 * Copyright (c) 2002 Richard Russon
 */
8
9	#include <linux/errno.h>
10	#include <linux/fs.h>
11	#include <linux/gfp.h>
12	#include <linux/mm.h>
13	#include <linux/pagemap.h>
14	#include <linux/swap.h>
15	#include <linux/buffer_head.h>
16	#include <linux/writeback.h>
17	#include <linux/bit_spinlock.h>
18	#include <linux/bio.h>
19
20	#include "aops.h"
21	#include "attrib.h"
22	#include "debug.h"
23	#include "inode.h"
24	#include "mft.h"
25	#include "runlist.h"
26	#include "types.h"
27	#include "ntfs.h"
28
29	/**
30	* ntfs_end_buffer_async_read - async io completion for reading attributes
31	* @bh: buffer head on which io is completed
32	* @uptodate: whether @bh is now uptodate or not
33	*
34	* Asynchronous I/O completion handler for reading pages belonging to the
35	* attribute address space of an inode. The inodes can either be files or
36	* directories or they can be fake inodes describing some attribute.
37	*
38	* If NInoMstProtected(), perform the post read mst fixups when all IO on the
39	* page has been completed and mark the page uptodate or set the error bit on
40	* the page. To determine the size of the records that need fixing up, we
41	* cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs
42	* record size, and index_block_size_bits, to the log(base 2) of the ntfs
43	* record size.
44	*/
45	static void ntfs_end_buffer_async_read(struct buffer_head bh, int* uptodate)
46	{
47	unsigned long flags;
48	struct buffer_head first, tmp;
49	struct page *page;
50	struct inode *vi;
51	ntfs_inode *ni;
52	int page_uptodate = `1`;
53
54	page = bh->b_page;
55	vi = page->mapping->host;
56	ni = NTFS_I(inode: vi);
57
58	if (likely(uptodate)) {
59	loff_t i_size;
60	s64 file_ofs, init_size;
61
62	set_buffer_uptodate(bh);
63
64	file_ofs = ((s64)page->index << PAGE_SHIFT) +
65	bh_offset(bh);
66	read_lock_irqsave(&ni->size_lock, flags);
67	init_size = ni->initialized_size;
68	i_size = i_size_read(inode: vi);
69	read_unlock_irqrestore(&ni->size_lock, flags);
70	if (unlikely(init_size > i_size)) {
71	/ Race with shrinking truncate. /
72	init_size = i_size;
73	}
74	/ Check for the current buffer head overflowing. /
75	if (unlikely(file_ofs + bh->b_size > init_size)) {
76	int ofs;
77	void *kaddr;
78
79	ofs = `0`;
80	if (file_ofs < init_size)
81	ofs = init_size - file_ofs;
82	kaddr = kmap_atomic(page);
83	memset(kaddr + bh_offset(bh) + ofs, `0`,
84	bh->b_size - ofs);
85	flush_dcache_page(page);
86	kunmap_atomic(kaddr);
87	}
88	} else {
89	clear_buffer_uptodate(bh);
90	SetPageError(page);
91	ntfs_error(ni->vol->sb, "Buffer I/O error, logical block "
92	"0x%llx.", (unsigned long long)bh->b_blocknr);
93	}
94	first = page_buffers(page);
95	spin_lock_irqsave(&first->b_uptodate_lock, flags);
96	clear_buffer_async_read(bh);
97	unlock_buffer(bh);
98	tmp = bh;
99	do {
100	if (!buffer_uptodate(bh: tmp))
101	page_uptodate = `0`;
102	if (buffer_async_read(bh: tmp)) {
103	if (likely(buffer_locked(tmp)))
104	goto still_busy;
105	/ Async buffers must be locked. /
106	BUG();
107	}
108	tmp = tmp->b_this_page;
109	} while (tmp != bh);
110	spin_unlock_irqrestore(lock: &first->b_uptodate_lock, flags);
111	/*
112	* If none of the buffers had errors then we can set the page uptodate,
113	* but we first have to perform the post read mst fixups, if the
114	* attribute is mst protected, i.e. if NInoMstProteced(ni) is true.
115	* Note we ignore fixup errors as those are detected when
116	* map_mft_record() is called which gives us per record granularity
117	* rather than per page granularity.
118	*/
119	if (!NInoMstProtected(ni)) {
120	if (likely(page_uptodate && !PageError(page)))
121	SetPageUptodate(page);
122	} else {
123	u8 *kaddr;
124	unsigned int i, recs;
125	u32 rec_size;
126
127	rec_size = ni->itype.index.block_size;
128	recs = PAGE_SIZE / rec_size;
129	/ Should have been verified before we got here... /
130	BUG_ON(!recs);
131	kaddr = kmap_atomic(page);
132	for (i = `0`; i < recs; i++)
133	post_read_mst_fixup(b: (NTFS_RECORD*)(kaddr +
134	i * rec_size), size: rec_size);
135	kunmap_atomic(kaddr);
136	flush_dcache_page(page);
137	if (likely(page_uptodate && !PageError(page)))
138	SetPageUptodate(page);
139	}
140	unlock_page(page);
141	return;
142	still_busy:
143	spin_unlock_irqrestore(lock: &first->b_uptodate_lock, flags);
144	return;
145	}
146
147	/**
148	* ntfs_read_block - fill a @folio of an address space with data
149	* @folio: page cache folio to fill with data
150	*
151	* We read each buffer asynchronously and when all buffers are read in, our io
152	* completion handler ntfs_end_buffer_read_async(), if required, automatically
153	* applies the mst fixups to the folio before finally marking it uptodate and
154	* unlocking it.
155	*
156	* We only enforce allocated_size limit because i_size is checked for in
157	* generic_file_read().
158	*
159	* Return 0 on success and -errno on error.
160	*
161	* Contains an adapted version of fs/buffer.c::block_read_full_folio().
162	*/
163	static int ntfs_read_block(struct folio *folio)
164	{
165	loff_t i_size;
166	VCN vcn;
167	LCN lcn;
168	s64 init_size;
169	struct inode *vi;
170	ntfs_inode *ni;
171	ntfs_volume *vol;
172	runlist_element *rl;
173	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
174	sector_t iblock, lblock, zblock;
175	unsigned long flags;
176	unsigned int blocksize, vcn_ofs;
177	int i, nr;
178	unsigned char blocksize_bits;
179
180	vi = folio->mapping->host;
181	ni = NTFS_I(inode: vi);
182	vol = ni->vol;
183
184	/ $MFT/$DATA must have its complete runlist in memory at all times. /
185	BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));
186
187	blocksize = vol->sb->s_blocksize;
188	blocksize_bits = vol->sb->s_blocksize_bits;
189
190	head = folio_buffers(folio);
191	if (!head)
192	head = create_empty_buffers(folio, blocksize, b_state: `0`);
193	bh = head;
194
195	/*
196	* We may be racing with truncate. To avoid some of the problems we
197	* now take a snapshot of the various sizes and use those for the whole
198	* of the function. In case of an extending truncate it just means we
199	* may leave some buffers unmapped which are now allocated. This is
200	* not a problem since these buffers will just get mapped when a write
201	* occurs. In case of a shrinking truncate, we will detect this later
202	* on due to the runlist being incomplete and if the folio is being
203	* fully truncated, truncate will throw it away as soon as we unlock
204	* it so no need to worry what we do with it.
205	*/
206	iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
207	read_lock_irqsave(&ni->size_lock, flags);
208	lblock = (ni->allocated_size + blocksize - `1`) >> blocksize_bits;
209	init_size = ni->initialized_size;
210	i_size = i_size_read(inode: vi);
211	read_unlock_irqrestore(&ni->size_lock, flags);
212	if (unlikely(init_size > i_size)) {
213	/ Race with shrinking truncate. /
214	init_size = i_size;
215	}
216	zblock = (init_size + blocksize - `1`) >> blocksize_bits;
217
218	/ Loop through all the buffers in the folio. /
219	rl = NULL;
220	nr = i = `0`;
221	do {
222	int err = `0`;
223
224	if (unlikely(buffer_uptodate(bh)))
225	continue;
226	if (unlikely(buffer_mapped(bh))) {
227	arr[nr++] = bh;
228	continue;
229	}
230	bh->b_bdev = vol->sb->s_bdev;
231	/ Is the block within the allowed limits? /
232	if (iblock < lblock) {
233	bool is_retry = false;
234
235	/ Convert iblock into corresponding vcn and offset. /
236	vcn = (VCN)iblock << blocksize_bits >>
237	vol->cluster_size_bits;
238	vcn_ofs = ((VCN)iblock << blocksize_bits) &
239	vol->cluster_size_mask;
240	if (!rl) {
241	lock_retry_remap:
242	down_read(sem: &ni->runlist.lock);
243	rl = ni->runlist.rl;
244	}
245	if (likely(rl != NULL)) {
246	/ Seek to element containing target vcn. /
247	while (rl->length && rl[`1`].vcn <= vcn)
248	rl++;
249	lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
250	} else
251	lcn = LCN_RL_NOT_MAPPED;
252	/ Successful remap. /
253	if (lcn >= `0`) {
254	/ Setup buffer head to correct block. /
255	bh->b_blocknr = ((lcn << vol->cluster_size_bits)
256	+ vcn_ofs) >> blocksize_bits;
257	set_buffer_mapped(bh);
258	/ Only read initialized data blocks. /
259	if (iblock < zblock) {
260	arr[nr++] = bh;
261	continue;
262	}
263	/ Fully non-initialized data block, zero it. /
264	goto handle_zblock;
265	}
266	/ It is a hole, need to zero it. /
267	if (lcn == LCN_HOLE)
268	goto handle_hole;
269	/ If first try and runlist unmapped, map and retry. /
270	if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
271	is_retry = true;
272	/*
273	* Attempt to map runlist, dropping lock for
274	* the duration.
275	*/
276	up_read(sem: &ni->runlist.lock);
277	err = ntfs_map_runlist(ni, vcn);
278	if (likely(!err))
279	goto lock_retry_remap;
280	rl = NULL;
281	} else if (!rl)
282	up_read(sem: &ni->runlist.lock);
283	/*
284	* If buffer is outside the runlist, treat it as a
285	* hole. This can happen due to concurrent truncate
286	* for example.
287	*/
288	if (err == -ENOENT \|\| lcn == LCN_ENOENT) {
289	err = `0`;
290	goto handle_hole;
291	}
292	/ Hard error, zero out region. /
293	if (!err)
294	err = -EIO;
295	bh->b_blocknr = -`1`;
296	folio_set_error(folio);
297	ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
298	"attribute type 0x%x, vcn 0x%llx, "
299	"offset 0x%x because its location on "
300	"disk could not be determined%s "
301	"(error code %i).", ni->mft_no,
302	ni->type, (unsigned long long)vcn,
303	vcn_ofs, is_retry ? " even after "
304	"retrying" : "", err);
305	}
306	/*
307	* Either iblock was outside lblock limits or
308	* ntfs_rl_vcn_to_lcn() returned error. Just zero that portion
309	* of the folio and set the buffer uptodate.
310	*/
311	handle_hole:
312	bh->b_blocknr = -`1UL`;
313	clear_buffer_mapped(bh);
314	handle_zblock:
315	folio_zero_range(folio, start: i * blocksize, length: blocksize);
316	if (likely(!err))
317	set_buffer_uptodate(bh);
318	} while (i++, iblock++, (bh = bh->b_this_page) != head);
319
320	/ Release the lock if we took it. /
321	if (rl)
322	up_read(sem: &ni->runlist.lock);
323
324	/ Check we have at least one buffer ready for i/o. /
325	if (nr) {
326	struct buffer_head *tbh;
327
328	/ Lock the buffers. /
329	for (i = `0`; i < nr; i++) {
330	tbh = arr[i];
331	lock_buffer(bh: tbh);
332	tbh->b_end_io = ntfs_end_buffer_async_read;
333	set_buffer_async_read(tbh);
334	}
335	/ Finally, start i/o on the buffers. /
336	for (i = `0`; i < nr; i++) {
337	tbh = arr[i];
338	if (likely(!buffer_uptodate(tbh)))
339	submit_bh(REQ_OP_READ, tbh);
340	else
341	ntfs_end_buffer_async_read(bh: tbh, uptodate: `1`);
342	}
343	return `0`;
344	}
345	/ No i/o was scheduled on any of the buffers. /
346	if (likely(!folio_test_error(folio)))
347	folio_mark_uptodate(folio);
348	else / Signal synchronous i/o error. /
349	nr = -EIO;
350	folio_unlock(folio);
351	return nr;
352	}
353
354	/**
355	* ntfs_read_folio - fill a @folio of a @file with data from the device
356	* @file: open file to which the folio @folio belongs or NULL
357	* @folio: page cache folio to fill with data
358	*
359	* For non-resident attributes, ntfs_read_folio() fills the @folio of the open
360	* file @file by calling the ntfs version of the generic block_read_full_folio()
361	* function, ntfs_read_block(), which in turn creates and reads in the buffers
362	* associated with the folio asynchronously.
363	*
364	* For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the
365	* data from the mft record (which at this stage is most likely in memory) and
366	* fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
367	* even if the mft record is not cached at this point in time, we need to wait
368	* for it to be read in before we can do the copy.
369	*
370	* Return 0 on success and -errno on error.
371	*/
372	static int ntfs_read_folio(struct file file, struct* folio *folio)
373	{
374	struct page *page = &folio->page;
375	loff_t i_size;
376	struct inode *vi;
377	ntfs_inode ni, base_ni;
378	u8 *addr;
379	ntfs_attr_search_ctx *ctx;
380	MFT_RECORD *mrec;
381	unsigned long flags;
382	u32 attr_len;
383	int err = `0`;
384
385	retry_readpage:
386	BUG_ON(!PageLocked(page));
387	vi = page->mapping->host;
388	i_size = i_size_read(inode: vi);
389	/ Is the page fully outside i_size? (truncate in progress) /
390	if (unlikely(page->index >= (i_size + PAGE_SIZE - `1`) >>
391	PAGE_SHIFT)) {
392	zero_user(page, start: `0`, PAGE_SIZE);
393	ntfs_debug("Read outside i_size - truncated?");
394	goto done;
395	}
396	/*
397	* This can potentially happen because we clear PageUptodate() during
398	* ntfs_writepage() of MstProtected() attributes.
399	*/
400	if (PageUptodate(page)) {
401	unlock_page(page);
402	return `0`;
403	}
404	ni = NTFS_I(inode: vi);
405	/*
406	* Only $DATA attributes can be encrypted and only unnamed $DATA
407	* attributes can be compressed. Index root can have the flags set but
408	* this means to create compressed/encrypted files, not that the
409	* attribute is compressed/encrypted. Note we need to check for
410	* AT_INDEX_ALLOCATION since this is the type of both directory and
411	* index inodes.
412	*/
413	if (ni->type != AT_INDEX_ALLOCATION) {
414	/ If attribute is encrypted, deny access, just like NT4. /
415	if (NInoEncrypted(ni)) {
416	BUG_ON(ni->type != AT_DATA);
417	err = -EACCES;
418	goto err_out;
419	}
420	/ Compressed data streams are handled in compress.c. /
421	if (NInoNonResident(ni) && NInoCompressed(ni)) {
422	BUG_ON(ni->type != AT_DATA);
423	BUG_ON(ni->name_len);
424	return ntfs_read_compressed_block(page);
425	}
426	}
427	/ NInoNonResident() == NInoIndexAllocPresent() /
428	if (NInoNonResident(ni)) {
429	/ Normal, non-resident data stream. /
430	return ntfs_read_block(folio);
431	}
432	/*
433	* Attribute is resident, implying it is not compressed or encrypted.
434	* This also means the attribute is smaller than an mft record and
435	* hence smaller than a page, so can simply zero out any pages with
436	* index above 0. Note the attribute can actually be marked compressed
437	* but if it is resident the actual data is not compressed so we are
438	* ok to ignore the compressed flag here.
439	*/
440	if (unlikely(page->index > `0`)) {
441	zero_user(page, start: `0`, PAGE_SIZE);
442	goto done;
443	}
444	if (!NInoAttr(ni))
445	base_ni = ni;
446	else
447	base_ni = ni->ext.base_ntfs_ino;
448	/ Map, pin, and lock the mft record. /
449	mrec = map_mft_record(ni: base_ni);
450	if (IS_ERR(ptr: mrec)) {
451	err = PTR_ERR(ptr: mrec);
452	goto err_out;
453	}
454	/*
455	* If a parallel write made the attribute non-resident, drop the mft
456	* record and retry the read_folio.
457	*/
458	if (unlikely(NInoNonResident(ni))) {
459	unmap_mft_record(ni: base_ni);
460	goto retry_readpage;
461	}
462	ctx = ntfs_attr_get_search_ctx(ni: base_ni, mrec);
463	if (unlikely(!ctx)) {
464	err = -ENOMEM;
465	goto unm_err_out;
466	}
467	err = ntfs_attr_lookup(type: ni->type, name: ni->name, name_len: ni->name_len,
468	ic: CASE_SENSITIVE, lowest_vcn: `0`, NULL, val_len: `0`, ctx);
469	if (unlikely(err))
470	goto put_unm_err_out;
471	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
472	read_lock_irqsave(&ni->size_lock, flags);
473	if (unlikely(attr_len > ni->initialized_size))
474	attr_len = ni->initialized_size;
475	i_size = i_size_read(inode: vi);
476	read_unlock_irqrestore(&ni->size_lock, flags);
477	if (unlikely(attr_len > i_size)) {
478	/ Race with shrinking truncate. /
479	attr_len = i_size;
480	}
481	addr = kmap_atomic(page);
482	/ Copy the data to the page. /
483	memcpy(addr, (u8*)ctx->attr +
484	le16_to_cpu(ctx->attr->data.resident.value_offset),
485	attr_len);
486	/ Zero the remainder of the page. /
487	memset(addr + attr_len, `0`, PAGE_SIZE - attr_len);
488	flush_dcache_page(page);
489	kunmap_atomic(addr);
490	put_unm_err_out:
491	ntfs_attr_put_search_ctx(ctx);
492	unm_err_out:
493	unmap_mft_record(ni: base_ni);
494	done:
495	SetPageUptodate(page);
496	err_out:
497	unlock_page(page);
498	return err;
499	}
500
501	#ifdef NTFS_RW
502
503	/**
504	* ntfs_write_block - write a @folio to the backing store
505	* @folio: page cache folio to write out
506	* @wbc: writeback control structure
507	*
508	* This function is for writing folios belonging to non-resident, non-mst
509	* protected attributes to their backing store.
510	*
511	* For a folio with buffers, map and write the dirty buffers asynchronously
512	* under folio writeback. For a folio without buffers, create buffers for the
513	* folio, then proceed as above.
514	*
515	* If a folio doesn't have buffers the folio dirty state is definitive. If
516	* a folio does have buffers, the folio dirty state is just a hint,
517	* and the buffer dirty state is definitive. (A hint which has rules:
518	* dirty buffers against a clean folio is illegal. Other combinations are
519	* legal and need to be handled. In particular a dirty folio containing
520	* clean buffers for example.)
521	*
522	* Return 0 on success and -errno on error.
523	*
524	* Based on ntfs_read_block() and __block_write_full_folio().
525	*/
526	static int ntfs_write_block(struct folio folio, struct* writeback_control *wbc)
527	{
528	VCN vcn;
529	LCN lcn;
530	s64 initialized_size;
531	loff_t i_size;
532	sector_t block, dblock, iblock;
533	struct inode *vi;
534	ntfs_inode *ni;
535	ntfs_volume *vol;
536	runlist_element *rl;
537	struct buffer_head bh, head;
538	unsigned long flags;
539	unsigned int blocksize, vcn_ofs;
540	int err;
541	bool need_end_writeback;
542	unsigned char blocksize_bits;
543
544	vi = folio->mapping->host;
545	ni = NTFS_I(inode: vi);
546	vol = ni->vol;
547
548	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
549	"0x%lx.", ni->mft_no, ni->type, folio->index);
550
551	BUG_ON(!NInoNonResident(ni));
552	BUG_ON(NInoMstProtected(ni));
553	blocksize = vol->sb->s_blocksize;
554	blocksize_bits = vol->sb->s_blocksize_bits;
555	head = folio_buffers(folio);
556	if (!head) {
557	BUG_ON(!folio_test_uptodate(folio));
558	head = create_empty_buffers(folio, blocksize,
559	b_state: (`1` << BH_Uptodate) \| (`1` << BH_Dirty));
560	}
561	bh = head;
562
563	/ NOTE: Different naming scheme to ntfs_read_block()! /
564
565	/ The first block in the folio. /
566	block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
567
568	read_lock_irqsave(&ni->size_lock, flags);
569	i_size = i_size_read(inode: vi);
570	initialized_size = ni->initialized_size;
571	read_unlock_irqrestore(&ni->size_lock, flags);
572
573	/ The first out of bounds block for the data size. /
574	dblock = (i_size + blocksize - `1`) >> blocksize_bits;
575
576	/ The last (fully or partially) initialized block. /
577	iblock = initialized_size >> blocksize_bits;
578
579	/*
580	* Be very careful. We have no exclusion from block_dirty_folio
581	* here, and the (potentially unmapped) buffers may become dirty at
582	* any time. If a buffer becomes dirty here after we've inspected it
583	* then we just miss that fact, and the folio stays dirty.
584	*
585	* Buffers outside i_size may be dirtied by block_dirty_folio;
586	* handle that here by just cleaning them.
587	*/
588
589	/*
590	* Loop through all the buffers in the folio, mapping all the dirty
591	* buffers to disk addresses and handling any aliases from the
592	* underlying block device's mapping.
593	*/
594	rl = NULL;
595	err = `0`;
596	do {
597	bool is_retry = false;
598
599	if (unlikely(block >= dblock)) {
600	/*
601	* Mapped buffers outside i_size will occur, because
602	* this folio can be outside i_size when there is a
603	* truncate in progress. The contents of such buffers
604	* were zeroed by ntfs_writepage().
605	*
606	* FIXME: What about the small race window where
607	* ntfs_writepage() has not done any clearing because
608	* the folio was within i_size but before we get here,
609	* vmtruncate() modifies i_size?
610	*/
611	clear_buffer_dirty(bh);
612	set_buffer_uptodate(bh);
613	continue;
614	}
615
616	/ Clean buffers are not written out, so no need to map them. /
617	if (!buffer_dirty(bh))
618	continue;
619
620	/ Make sure we have enough initialized size. /
621	if (unlikely((block >= iblock) &&
622	(initialized_size < i_size))) {
623	/*
624	* If this folio is fully outside initialized
625	* size, zero out all folios between the current
626	* initialized size and the current folio. Just
627	* use ntfs_read_folio() to do the zeroing
628	* transparently.
629	*/
630	if (block > iblock) {
631	// TODO:
632	// For each folio do:
633	// - read_cache_folio()
634	// Again for each folio do:
635	// - wait_on_folio_locked()
636	// - Check (folio_test_uptodate(folio) &&
637	// !folio_test_error(folio))
638	// Update initialized size in the attribute and
639	// in the inode.
640	// Again, for each folio do:
641	// block_dirty_folio();
642	// folio_put()
643	// We don't need to wait on the writes.
644	// Update iblock.
645	}
646	/*
647	* The current folio straddles initialized size. Zero
648	* all non-uptodate buffers and set them uptodate (and
649	* dirty?). Note, there aren't any non-uptodate buffers
650	* if the folio is uptodate.
651	* FIXME: For an uptodate folio, the buffers may need to
652	* be written out because they were not initialized on
653	* disk before.
654	*/
655	if (!folio_test_uptodate(folio)) {
656	// TODO:
657	// Zero any non-uptodate buffers up to i_size.
658	// Set them uptodate and dirty.
659	}
660	// TODO:
661	// Update initialized size in the attribute and in the
662	// inode (up to i_size).
663	// Update iblock.
664	// FIXME: This is inefficient. Try to batch the two
665	// size changes to happen in one go.
666	ntfs_error(vol->sb, "Writing beyond initialized size "
667	"is not supported yet. Sorry.");
668	err = -EOPNOTSUPP;
669	break;
670	// Do NOT set_buffer_new() BUT DO clear buffer range
671	// outside write request range.
672	// set_buffer_uptodate() on complete buffers as well as
673	// set_buffer_dirty().
674	}
675
676	/ No need to map buffers that are already mapped. /
677	if (buffer_mapped(bh))
678	continue;
679
680	/ Unmapped, dirty buffer. Need to map it. /
681	bh->b_bdev = vol->sb->s_bdev;
682
683	/ Convert block into corresponding vcn and offset. /
684	vcn = (VCN)block << blocksize_bits;
685	vcn_ofs = vcn & vol->cluster_size_mask;
686	vcn >>= vol->cluster_size_bits;
687	if (!rl) {
688	lock_retry_remap:
689	down_read(sem: &ni->runlist.lock);
690	rl = ni->runlist.rl;
691	}
692	if (likely(rl != NULL)) {
693	/ Seek to element containing target vcn. /
694	while (rl->length && rl[`1`].vcn <= vcn)
695	rl++;
696	lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
697	} else
698	lcn = LCN_RL_NOT_MAPPED;
699	/ Successful remap. /
700	if (lcn >= `0`) {
701	/ Setup buffer head to point to correct block. /
702	bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
703	vcn_ofs) >> blocksize_bits;
704	set_buffer_mapped(bh);
705	continue;
706	}
707	/ It is a hole, need to instantiate it. /
708	if (lcn == LCN_HOLE) {
709	u8 *kaddr;
710	unsigned long bpos, bend;
711
712	/ Check if the buffer is zero. /
713	kaddr = kmap_local_folio(folio, offset: bh_offset(bh));
714	bpos = (unsigned long *)kaddr;
715	bend = (unsigned long *)(kaddr + blocksize);
716	do {
717	if (unlikely(*bpos))
718	break;
719	} while (likely(++bpos < bend));
720	kunmap_local(kaddr);
721	if (bpos == bend) {
722	/*
723	* Buffer is zero and sparse, no need to write
724	* it.
725	*/
726	bh->b_blocknr = -`1`;
727	clear_buffer_dirty(bh);
728	continue;
729	}
730	// TODO: Instantiate the hole.
731	// clear_buffer_new(bh);
732	// clean_bdev_bh_alias(bh);
733	ntfs_error(vol->sb, "Writing into sparse regions is "
734	"not supported yet. Sorry.");
735	err = -EOPNOTSUPP;
736	break;
737	}
738	/ If first try and runlist unmapped, map and retry. /
739	if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
740	is_retry = true;
741	/*
742	* Attempt to map runlist, dropping lock for
743	* the duration.
744	*/
745	up_read(sem: &ni->runlist.lock);
746	err = ntfs_map_runlist(ni, vcn);
747	if (likely(!err))
748	goto lock_retry_remap;
749	rl = NULL;
750	} else if (!rl)
751	up_read(sem: &ni->runlist.lock);
752	/*
753	* If buffer is outside the runlist, truncate has cut it out
754	* of the runlist. Just clean and clear the buffer and set it
755	* uptodate so it can get discarded by the VM.
756	*/
757	if (err == -ENOENT \|\| lcn == LCN_ENOENT) {
758	bh->b_blocknr = -`1`;
759	clear_buffer_dirty(bh);
760	folio_zero_range(folio, start: bh_offset(bh), length: blocksize);
761	set_buffer_uptodate(bh);
762	err = `0`;
763	continue;
764	}
765	/ Failed to map the buffer, even after retrying. /
766	if (!err)
767	err = -EIO;
768	bh->b_blocknr = -`1`;
769	ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
770	"attribute type 0x%x, vcn 0x%llx, offset 0x%x "
771	"because its location on disk could not be "
772	"determined%s (error code %i).", ni->mft_no,
773	ni->type, (unsigned long long)vcn,
774	vcn_ofs, is_retry ? " even after "
775	"retrying" : "", err);
776	break;
777	} while (block++, (bh = bh->b_this_page) != head);
778
779	/ Release the lock if we took it. /
780	if (rl)
781	up_read(sem: &ni->runlist.lock);
782
783	/ For the error case, need to reset bh to the beginning. /
784	bh = head;
785
786	/ Just an optimization, so ->read_folio() is not called later. /
787	if (unlikely(!folio_test_uptodate(folio))) {
788	int uptodate = `1`;
789	do {
790	if (!buffer_uptodate(bh)) {
791	uptodate = `0`;
792	bh = head;
793	break;
794	}
795	} while ((bh = bh->b_this_page) != head);
796	if (uptodate)
797	folio_mark_uptodate(folio);
798	}
799
800	/ Setup all mapped, dirty buffers for async write i/o. /
801	do {
802	if (buffer_mapped(bh) && buffer_dirty(bh)) {
803	lock_buffer(bh);
804	if (test_clear_buffer_dirty(bh)) {
805	BUG_ON(!buffer_uptodate(bh));
806	mark_buffer_async_write(bh);
807	} else
808	unlock_buffer(bh);
809	} else if (unlikely(err)) {
810	/*
811	* For the error case. The buffer may have been set
812	* dirty during attachment to a dirty folio.
813	*/
814	if (err != -ENOMEM)
815	clear_buffer_dirty(bh);
816	}
817	} while ((bh = bh->b_this_page) != head);
818
819	if (unlikely(err)) {
820	// TODO: Remove the -EOPNOTSUPP check later on...
821	if (unlikely(err == -EOPNOTSUPP))
822	err = `0`;
823	else if (err == -ENOMEM) {
824	ntfs_warning(vol->sb, "Error allocating memory. "
825	"Redirtying folio so we try again "
826	"later.");
827	/*
828	* Put the folio back on mapping->dirty_pages, but
829	* leave its buffer's dirty state as-is.
830	*/
831	folio_redirty_for_writepage(wbc, folio);
832	err = `0`;
833	} else
834	folio_set_error(folio);
835	}
836
837	BUG_ON(folio_test_writeback(folio));
838	folio_start_writeback(folio); / Keeps try_to_free_buffers() away. /
839
840	/ Submit the prepared buffers for i/o. /
841	need_end_writeback = true;
842	do {
843	struct buffer_head *next = bh->b_this_page;
844	if (buffer_async_write(bh)) {
845	submit_bh(REQ_OP_WRITE, bh);
846	need_end_writeback = false;
847	}
848	bh = next;
849	} while (bh != head);
850	folio_unlock(folio);
851
852	/ If no i/o was started, need to end writeback here. /
853	if (unlikely(need_end_writeback))
854	folio_end_writeback(folio);
855
856	ntfs_debug("Done.");
857	return err;
858	}
859
860	/**
861	* ntfs_write_mst_block - write a @page to the backing store
862	* @page: page cache page to write out
863	* @wbc: writeback control structure
864	*
865	* This function is for writing pages belonging to non-resident, mst protected
866	* attributes to their backing store. The only supported attributes are index
867	* allocation and $MFT/$DATA. Both directory inodes and index inodes are
868	* supported for the index allocation case.
869	*
870	* The page must remain locked for the duration of the write because we apply
871	* the mst fixups, write, and then undo the fixups, so if we were to unlock the
872	* page before undoing the fixups, any other user of the page will see the
873	* page contents as corrupt.
874	*
875	* We clear the page uptodate flag for the duration of the function to ensure
876	* exclusion for the $MFT/$DATA case against someone mapping an mft record we
877	* are about to apply the mst fixups to.
878	*
879	* Return 0 on success and -errno on error.
880	*
881	* Based on ntfs_write_block(), ntfs_mft_writepage(), and
882	* write_mft_record_nolock().
883	*/
884	static int ntfs_write_mst_block(struct page *page,
885	struct writeback_control *wbc)
886	{
887	sector_t block, dblock, rec_block;
888	struct inode *vi = page->mapping->host;
889	ntfs_inode *ni = NTFS_I(inode: vi);
890	ntfs_volume *vol = ni->vol;
891	u8 *kaddr;
892	unsigned int rec_size = ni->itype.index.block_size;
893	ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE];
894	struct buffer_head bh, head, tbh, rec_start_bh;
895	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
896	runlist_element *rl;
897	int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
898	unsigned bh_size, rec_size_bits;
899	bool sync, is_mft, page_is_dirty, rec_is_dirty;
900	unsigned char bh_size_bits;
901
902	if (WARN_ON(rec_size < NTFS_BLOCK_SIZE))
903	return -EINVAL;
904
905	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
906	"0x%lx.", vi->i_ino, ni->type, page->index);
907	BUG_ON(!NInoNonResident(ni));
908	BUG_ON(!NInoMstProtected(ni));
909	is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
910	/*
911	* NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
912	* in its page cache were to be marked dirty. However this should
913	* never happen with the current driver and considering we do not
914	* handle this case here we do want to BUG(), at least for now.
915	*/
916	BUG_ON(!(is_mft \|\| S_ISDIR(vi->i_mode) \|\|
917	(NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
918	bh_size = vol->sb->s_blocksize;
919	bh_size_bits = vol->sb->s_blocksize_bits;
920	max_bhs = PAGE_SIZE / bh_size;
921	BUG_ON(!max_bhs);
922	BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
923
924	/ Were we called for sync purposes? /
925	sync = (wbc->sync_mode == WB_SYNC_ALL);
926
927	/ Make sure we have mapped buffers. /
928	bh = head = page_buffers(page);
929	BUG_ON(!bh);
930
931	rec_size_bits = ni->itype.index.block_size_bits;
932	BUG_ON(!(PAGE_SIZE >> rec_size_bits));
933	bhs_per_rec = rec_size >> bh_size_bits;
934	BUG_ON(!bhs_per_rec);
935
936	/ The first block in the page. /
937	rec_block = block = (sector_t)page->index <<
938	(PAGE_SHIFT - bh_size_bits);
939
940	/ The first out of bounds block for the data size. /
941	dblock = (i_size_read(inode: vi) + bh_size - `1`) >> bh_size_bits;
942
943	rl = NULL;
944	err = err2 = nr_bhs = nr_recs = nr_locked_nis = `0`;
945	page_is_dirty = rec_is_dirty = false;
946	rec_start_bh = NULL;
947	do {
948	bool is_retry = false;
949
950	if (likely(block < rec_block)) {
951	if (unlikely(block >= dblock)) {
952	clear_buffer_dirty(bh);
953	set_buffer_uptodate(bh);
954	continue;
955	}
956	/*
957	* This block is not the first one in the record. We
958	* ignore the buffer's dirty state because we could
959	* have raced with a parallel mark_ntfs_record_dirty().
960	*/
961	if (!rec_is_dirty)
962	continue;
963	if (unlikely(err2)) {
964	if (err2 != -ENOMEM)
965	clear_buffer_dirty(bh);
966	continue;
967	}
968	} else / if (block == rec_block) / {
969	BUG_ON(block > rec_block);
970	/ This block is the first one in the record. /
971	rec_block += bhs_per_rec;
972	err2 = `0`;
973	if (unlikely(block >= dblock)) {
974	clear_buffer_dirty(bh);
975	continue;
976	}
977	if (!buffer_dirty(bh)) {
978	/ Clean records are not written out. /
979	rec_is_dirty = false;
980	continue;
981	}
982	rec_is_dirty = true;
983	rec_start_bh = bh;
984	}
985	/ Need to map the buffer if it is not mapped already. /
986	if (unlikely(!buffer_mapped(bh))) {
987	VCN vcn;
988	LCN lcn;
989	unsigned int vcn_ofs;
990
991	bh->b_bdev = vol->sb->s_bdev;
992	/ Obtain the vcn and offset of the current block. /
993	vcn = (VCN)block << bh_size_bits;
994	vcn_ofs = vcn & vol->cluster_size_mask;
995	vcn >>= vol->cluster_size_bits;
996	if (!rl) {
997	lock_retry_remap:
998	down_read(sem: &ni->runlist.lock);
999	rl = ni->runlist.rl;
1000	}
1001	if (likely(rl != NULL)) {
1002	/ Seek to element containing target vcn. /
1003	while (rl->length && rl[`1`].vcn <= vcn)
1004	rl++;
1005	lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
1006	} else
1007	lcn = LCN_RL_NOT_MAPPED;
1008	/ Successful remap. /
1009	if (likely(lcn >= `0`)) {
1010	/ Setup buffer head to correct block. /
1011	bh->b_blocknr = ((lcn <<
1012	vol->cluster_size_bits) +
1013	vcn_ofs) >> bh_size_bits;
1014	set_buffer_mapped(bh);
1015	} else {
1016	/*
1017	* Remap failed. Retry to map the runlist once
1018	* unless we are working on $MFT which always
1019	* has the whole of its runlist in memory.
1020	*/
1021	if (!is_mft && !is_retry &&
1022	lcn == LCN_RL_NOT_MAPPED) {
1023	is_retry = true;
1024	/*
1025	* Attempt to map runlist, dropping
1026	* lock for the duration.
1027	*/
1028	up_read(sem: &ni->runlist.lock);
1029	err2 = ntfs_map_runlist(ni, vcn);
1030	if (likely(!err2))
1031	goto lock_retry_remap;
1032	if (err2 == -ENOMEM)
1033	page_is_dirty = true;
1034	lcn = err2;
1035	} else {
1036	err2 = -EIO;
1037	if (!rl)
1038	up_read(sem: &ni->runlist.lock);
1039	}
1040	/ Hard error. Abort writing this record. /
1041	if (!err \|\| err == -ENOMEM)
1042	err = err2;
1043	bh->b_blocknr = -`1`;
1044	ntfs_error(vol->sb, "Cannot write ntfs record "
1045	"0x%llx (inode 0x%lx, "
1046	"attribute type 0x%x) because "
1047	"its location on disk could "
1048	"not be determined (error "
1049	"code %lli).",
1050	(long long)block <<
1051	bh_size_bits >>
1052	vol->mft_record_size_bits,
1053	ni->mft_no, ni->type,
1054	(long long)lcn);
1055	/*
1056	* If this is not the first buffer, remove the
1057	* buffers in this record from the list of
1058	* buffers to write and clear their dirty bit
1059	* if not error -ENOMEM.
1060	*/
1061	if (rec_start_bh != bh) {
1062	while (bhs[--nr_bhs] != rec_start_bh)
1063	;
1064	if (err2 != -ENOMEM) {
1065	do {
1066	clear_buffer_dirty(
1067	bh: rec_start_bh);
1068	} while ((rec_start_bh =
1069	rec_start_bh->
1070	b_this_page) !=
1071	bh);
1072	}
1073	}
1074	continue;
1075	}
1076	}
1077	BUG_ON(!buffer_uptodate(bh));
1078	BUG_ON(nr_bhs >= max_bhs);
1079	bhs[nr_bhs++] = bh;
1080	} while (block++, (bh = bh->b_this_page) != head);
1081	if (unlikely(rl))
1082	up_read(sem: &ni->runlist.lock);
1083	/ If there were no dirty buffers, we are done. /
1084	if (!nr_bhs)
1085	goto done;
1086	/ Map the page so we can access its contents. /
1087	kaddr = kmap(page);
1088	/ Clear the page uptodate flag whilst the mst fixups are applied. /
1089	BUG_ON(!PageUptodate(page));
1090	ClearPageUptodate(page);
1091	for (i = `0`; i < nr_bhs; i++) {
1092	unsigned int ofs;
1093
1094	/ Skip buffers which are not at the beginning of records. /
1095	if (i % bhs_per_rec)
1096	continue;
1097	tbh = bhs[i];
1098	ofs = bh_offset(bh: tbh);
1099	if (is_mft) {
1100	ntfs_inode *tni;
1101	unsigned long mft_no;
1102
1103	/ Get the mft record number. /
1104	mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
1105	>> rec_size_bits;
1106	/ Check whether to write this mft record. /
1107	tni = NULL;
1108	if (!ntfs_may_write_mft_record(vol, mft_no,
1109	m: (MFT_RECORD*)(kaddr + ofs), locked_ni: &tni)) {
1110	/*
1111	* The record should not be written. This
1112	* means we need to redirty the page before
1113	* returning.
1114	*/
1115	page_is_dirty = true;
1116	/*
1117	* Remove the buffers in this mft record from
1118	* the list of buffers to write.
1119	*/
1120	do {
1121	bhs[i] = NULL;
1122	} while (++i % bhs_per_rec);
1123	continue;
1124	}
1125	/*
1126	* The record should be written. If a locked ntfs
1127	* inode was returned, add it to the array of locked
1128	* ntfs inodes.
1129	*/
1130	if (tni)
1131	locked_nis[nr_locked_nis++] = tni;
1132	}
1133	/ Apply the mst protection fixups. /
1134	err2 = pre_write_mst_fixup(b: (NTFS_RECORD*)(kaddr + ofs),
1135	size: rec_size);
1136	if (unlikely(err2)) {
1137	if (!err \|\| err == -ENOMEM)
1138	err = -EIO;
1139	ntfs_error(vol->sb, "Failed to apply mst fixups "
1140	"(inode 0x%lx, attribute type 0x%x, "
1141	"page index 0x%lx, page offset 0x%x)!"
1142	" Unmount and run chkdsk.", vi->i_ino,
1143	ni->type, page->index, ofs);
1144	/*
1145	* Mark all the buffers in this record clean as we do
1146	* not want to write corrupt data to disk.
1147	*/
1148	do {
1149	clear_buffer_dirty(bh: bhs[i]);
1150	bhs[i] = NULL;
1151	} while (++i % bhs_per_rec);
1152	continue;
1153	}
1154	nr_recs++;
1155	}
1156	/ If no records are to be written out, we are done. /
1157	if (!nr_recs)
1158	goto unm_done;
1159	flush_dcache_page(page);
1160	/ Lock buffers and start synchronous write i/o on them. /
1161	for (i = `0`; i < nr_bhs; i++) {
1162	tbh = bhs[i];
1163	if (!tbh)
1164	continue;
1165	if (!trylock_buffer(bh: tbh))
1166	BUG();
1167	/ The buffer dirty state is now irrelevant, just clean it. /
1168	clear_buffer_dirty(bh: tbh);
1169	BUG_ON(!buffer_uptodate(tbh));
1170	BUG_ON(!buffer_mapped(tbh));
1171	get_bh(bh: tbh);
1172	tbh->b_end_io = end_buffer_write_sync;
1173	submit_bh(REQ_OP_WRITE, tbh);
1174	}
1175	/ Synchronize the mft mirror now if not @sync. /
1176	if (is_mft && !sync)
1177	goto do_mirror;
1178	do_wait:
1179	/ Wait on i/o completion of buffers. /
1180	for (i = `0`; i < nr_bhs; i++) {
1181	tbh = bhs[i];
1182	if (!tbh)
1183	continue;
1184	wait_on_buffer(bh: tbh);
1185	if (unlikely(!buffer_uptodate(tbh))) {
1186	ntfs_error(vol->sb, "I/O error while writing ntfs "
1187	"record buffer (inode 0x%lx, "
1188	"attribute type 0x%x, page index "
1189	"0x%lx, page offset 0x%lx)! Unmount "
1190	"and run chkdsk.", vi->i_ino, ni->type,
1191	page->index, bh_offset(tbh));
1192	if (!err \|\| err == -ENOMEM)
1193	err = -EIO;
1194	/*
1195	* Set the buffer uptodate so the page and buffer
1196	* states do not become out of sync.
1197	*/
1198	set_buffer_uptodate(tbh);
1199	}
1200	}
1201	/ If @sync, now synchronize the mft mirror. /
1202	if (is_mft && sync) {
1203	do_mirror:
1204	for (i = `0`; i < nr_bhs; i++) {
1205	unsigned long mft_no;
1206	unsigned int ofs;
1207
1208	/*
1209	* Skip buffers which are not at the beginning of
1210	* records.
1211	*/
1212	if (i % bhs_per_rec)
1213	continue;
1214	tbh = bhs[i];
1215	/ Skip removed buffers (and hence records). /
1216	if (!tbh)
1217	continue;
1218	ofs = bh_offset(bh: tbh);
1219	/ Get the mft record number. /
1220	mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
1221	>> rec_size_bits;
1222	if (mft_no < vol->mftmirr_size)
1223	ntfs_sync_mft_mirror(vol, mft_no,
1224	m: (MFT_RECORD*)(kaddr + ofs),
1225	sync);
1226	}
1227	if (!sync)
1228	goto do_wait;
1229	}
1230	/ Remove the mst protection fixups again. /
1231	for (i = `0`; i < nr_bhs; i++) {
1232	if (!(i % bhs_per_rec)) {
1233	tbh = bhs[i];
1234	if (!tbh)
1235	continue;
1236	post_write_mst_fixup(b: (NTFS_RECORD*)(kaddr +
1237	bh_offset(bh: tbh)));
1238	}
1239	}
1240	flush_dcache_page(page);
1241	unm_done:
1242	/ Unlock any locked inodes. /
1243	while (nr_locked_nis-- > `0`) {
1244	ntfs_inode tni, base_tni;
1245
1246	tni = locked_nis[nr_locked_nis];
1247	/ Get the base inode. /
1248	mutex_lock(&tni->extent_lock);
1249	if (tni->nr_extents >= `0`)
1250	base_tni = tni;
1251	else {
1252	base_tni = tni->ext.base_ntfs_ino;
1253	BUG_ON(!base_tni);
1254	}
1255	mutex_unlock(lock: &tni->extent_lock);
1256	ntfs_debug("Unlocking %s inode 0x%lx.",
1257	tni == base_tni ? "base" : "extent",
1258	tni->mft_no);
1259	mutex_unlock(lock: &tni->mrec_lock);
1260	atomic_dec(v: &tni->count);
1261	iput(VFS_I(ni: base_tni));
1262	}
1263	SetPageUptodate(page);
1264	kunmap(page);
1265	done:
1266	if (unlikely(err && err != -ENOMEM)) {
1267	/*
1268	* Set page error if there is only one ntfs record in the page.
1269	* Otherwise we would loose per-record granularity.
1270	*/
1271	if (ni->itype.index.block_size == PAGE_SIZE)
1272	SetPageError(page);
1273	NVolSetErrors(vol);
1274	}
1275	if (page_is_dirty) {
1276	ntfs_debug("Page still contains one or more dirty ntfs "
1277	"records. Redirtying the page starting at "
1278	"record 0x%lx.", page->index <<
1279	(PAGE_SHIFT - rec_size_bits));
1280	redirty_page_for_writepage(wbc, page);
1281	unlock_page(page);
1282	} else {
1283	/*
1284	* Keep the VM happy. This must be done otherwise the
1285	* radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
1286	* the page is clean.
1287	*/
1288	BUG_ON(PageWriteback(page));
1289	set_page_writeback(page);
1290	unlock_page(page);
1291	end_page_writeback(page);
1292	}
1293	if (likely(!err))
1294	ntfs_debug("Done.");
1295	return err;
1296	}
1297
1298	/**
1299	* ntfs_writepage - write a @page to the backing store
1300	* @page: page cache page to write out
1301	* @wbc: writeback control structure
1302	*
1303	* This is called from the VM when it wants to have a dirty ntfs page cache
1304	* page cleaned. The VM has already locked the page and marked it clean.
1305	*
1306	* For non-resident attributes, ntfs_writepage() writes the @page by calling
1307	* the ntfs version of the generic block_write_full_page() function,
1308	* ntfs_write_block(), which in turn if necessary creates and writes the
1309	* buffers associated with the page asynchronously.
1310	*
1311	* For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
1312	* the data to the mft record (which at this stage is most likely in memory).
1313	* The mft record is then marked dirty and written out asynchronously via the
1314	* vfs inode dirty code path for the inode the mft record belongs to or via the
1315	* vm page dirty code path for the page the mft record is in.
1316	*
1317	* Based on ntfs_read_folio() and fs/buffer.c::block_write_full_page().
1318	*
1319	* Return 0 on success and -errno on error.
1320	*/
1321	static int ntfs_writepage(struct page page, struct* writeback_control *wbc)
1322	{
1323	struct folio *folio = page_folio(page);
1324	loff_t i_size;
1325	struct inode *vi = folio->mapping->host;
1326	ntfs_inode base_ni = NULL, ni = NTFS_I(inode: vi);
1327	char *addr;
1328	ntfs_attr_search_ctx *ctx = NULL;
1329	MFT_RECORD *m = NULL;
1330	u32 attr_len;
1331	int err;
1332
1333	retry_writepage:
1334	BUG_ON(!folio_test_locked(folio));
1335	i_size = i_size_read(inode: vi);
1336	/ Is the folio fully outside i_size? (truncate in progress) /
1337	if (unlikely(folio->index >= (i_size + PAGE_SIZE - `1`) >>
1338	PAGE_SHIFT)) {
1339	/*
1340	* The folio may have dirty, unmapped buffers. Make them
1341	* freeable here, so the page does not leak.
1342	*/
1343	block_invalidate_folio(folio, offset: `0`, length: folio_size(folio));
1344	folio_unlock(folio);
1345	ntfs_debug("Write outside i_size - truncated?");
1346	return `0`;
1347	}
1348	/*
1349	* Only $DATA attributes can be encrypted and only unnamed $DATA
1350	* attributes can be compressed. Index root can have the flags set but
1351	* this means to create compressed/encrypted files, not that the
1352	* attribute is compressed/encrypted. Note we need to check for
1353	* AT_INDEX_ALLOCATION since this is the type of both directory and
1354	* index inodes.
1355	*/
1356	if (ni->type != AT_INDEX_ALLOCATION) {
1357	/ If file is encrypted, deny access, just like NT4. /
1358	if (NInoEncrypted(ni)) {
1359	folio_unlock(folio);
1360	BUG_ON(ni->type != AT_DATA);
1361	ntfs_debug("Denying write access to encrypted file.");
1362	return -EACCES;
1363	}
1364	/ Compressed data streams are handled in compress.c. /
1365	if (NInoNonResident(ni) && NInoCompressed(ni)) {
1366	BUG_ON(ni->type != AT_DATA);
1367	BUG_ON(ni->name_len);
1368	// TODO: Implement and replace this with
1369	// return ntfs_write_compressed_block(page);
1370	folio_unlock(folio);
1371	ntfs_error(vi->i_sb, "Writing to compressed files is "
1372	"not supported yet. Sorry.");
1373	return -EOPNOTSUPP;
1374	}
1375	// TODO: Implement and remove this check.
1376	if (NInoNonResident(ni) && NInoSparse(ni)) {
1377	folio_unlock(folio);
1378	ntfs_error(vi->i_sb, "Writing to sparse files is not "
1379	"supported yet. Sorry.");
1380	return -EOPNOTSUPP;
1381	}
1382	}
1383	/ NInoNonResident() == NInoIndexAllocPresent() /
1384	if (NInoNonResident(ni)) {
1385	/ We have to zero every time due to mmap-at-end-of-file. /
1386	if (folio->index >= (i_size >> PAGE_SHIFT)) {
1387	/ The folio straddles i_size. /
1388	unsigned int ofs = i_size & (folio_size(folio) - `1`);
1389	folio_zero_segment(folio, start: ofs, xend: folio_size(folio));
1390	}
1391	/ Handle mst protected attributes. /
1392	if (NInoMstProtected(ni))
1393	return ntfs_write_mst_block(page, wbc);
1394	/ Normal, non-resident data stream. /
1395	return ntfs_write_block(folio, wbc);
1396	}
1397	/*
1398	* Attribute is resident, implying it is not compressed, encrypted, or
1399	* mst protected. This also means the attribute is smaller than an mft
1400	* record and hence smaller than a folio, so can simply return error on
1401	* any folios with index above 0. Note the attribute can actually be
1402	* marked compressed but if it is resident the actual data is not
1403	* compressed so we are ok to ignore the compressed flag here.
1404	*/
1405	BUG_ON(folio_buffers(folio));
1406	BUG_ON(!folio_test_uptodate(folio));
1407	if (unlikely(folio->index > `0`)) {
1408	ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0. "
1409	"Aborting write.", folio->index);
1410	BUG_ON(folio_test_writeback(folio));
1411	folio_start_writeback(folio);
1412	folio_unlock(folio);
1413	folio_end_writeback(folio);
1414	return -EIO;
1415	}
1416	if (!NInoAttr(ni))
1417	base_ni = ni;
1418	else
1419	base_ni = ni->ext.base_ntfs_ino;
1420	/ Map, pin, and lock the mft record. /
1421	m = map_mft_record(ni: base_ni);
1422	if (IS_ERR(ptr: m)) {
1423	err = PTR_ERR(ptr: m);
1424	m = NULL;
1425	ctx = NULL;
1426	goto err_out;
1427	}
1428	/*
1429	* If a parallel write made the attribute non-resident, drop the mft
1430	* record and retry the writepage.
1431	*/
1432	if (unlikely(NInoNonResident(ni))) {
1433	unmap_mft_record(ni: base_ni);
1434	goto retry_writepage;
1435	}
1436	ctx = ntfs_attr_get_search_ctx(ni: base_ni, mrec: m);
1437	if (unlikely(!ctx)) {
1438	err = -ENOMEM;
1439	goto err_out;
1440	}
1441	err = ntfs_attr_lookup(type: ni->type, name: ni->name, name_len: ni->name_len,
1442	ic: CASE_SENSITIVE, lowest_vcn: `0`, NULL, val_len: `0`, ctx);
1443	if (unlikely(err))
1444	goto err_out;
1445	/*
1446	* Keep the VM happy. This must be done otherwise
1447	* PAGECACHE_TAG_DIRTY remains set even though the folio is clean.
1448	*/
1449	BUG_ON(folio_test_writeback(folio));
1450	folio_start_writeback(folio);
1451	folio_unlock(folio);
1452	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
1453	i_size = i_size_read(inode: vi);
1454	if (unlikely(attr_len > i_size)) {
1455	/ Race with shrinking truncate or a failed truncate. /
1456	attr_len = i_size;
1457	/*
1458	* If the truncate failed, fix it up now. If a concurrent
1459	* truncate, we do its job, so it does not have to do anything.
1460	*/
1461	err = ntfs_resident_attr_value_resize(m: ctx->mrec, a: ctx->attr,
1462	new_size: attr_len);
1463	/ Shrinking cannot fail. /
1464	BUG_ON(err);
1465	}
1466	addr = kmap_local_folio(folio, offset: `0`);
1467	/ Copy the data from the folio to the mft record. /
1468	memcpy((u8*)ctx->attr +
1469	le16_to_cpu(ctx->attr->data.resident.value_offset),
1470	addr, attr_len);
1471	/ Zero out of bounds area in the page cache folio. /
1472	memset(addr + attr_len, `0`, folio_size(folio) - attr_len);
1473	kunmap_local(addr);
1474	flush_dcache_folio(folio);
1475	flush_dcache_mft_record_page(ni: ctx->ntfs_ino);
1476	/ We are done with the folio. /
1477	folio_end_writeback(folio);
1478	/ Finally, mark the mft record dirty, so it gets written back. /
1479	mark_mft_record_dirty(ni: ctx->ntfs_ino);
1480	ntfs_attr_put_search_ctx(ctx);
1481	unmap_mft_record(ni: base_ni);
1482	return `0`;
1483	err_out:
1484	if (err == -ENOMEM) {
1485	ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
1486	"page so we try again later.");
1487	/*
1488	* Put the folio back on mapping->dirty_pages, but leave its
1489	* buffers' dirty state as-is.
1490	*/
1491	folio_redirty_for_writepage(wbc, folio);
1492	err = `0`;
1493	} else {
1494	ntfs_error(vi->i_sb, "Resident attribute write failed with "
1495	"error %i.", err);
1496	folio_set_error(folio);
1497	NVolSetErrors(vol: ni->vol);
1498	}
1499	folio_unlock(folio);
1500	if (ctx)
1501	ntfs_attr_put_search_ctx(ctx);
1502	if (m)
1503	unmap_mft_record(ni: base_ni);
1504	return err;
1505	}
1506
1507	#endif /* NTFS_RW */
1508
1509	/**
1510	* ntfs_bmap - map logical file block to physical device block
1511	* @mapping: address space mapping to which the block to be mapped belongs
1512	* @block: logical block to map to its physical device block
1513	*
1514	* For regular, non-resident files (i.e. not compressed and not encrypted), map
1515	* the logical @block belonging to the file described by the address space
1516	* mapping @mapping to its physical device block.
1517	*
1518	* The size of the block is equal to the @s_blocksize field of the super block
1519	* of the mounted file system which is guaranteed to be smaller than or equal
1520	* to the cluster size thus the block is guaranteed to fit entirely inside the
1521	* cluster which means we do not need to care how many contiguous bytes are
1522	* available after the beginning of the block.
1523	*
1524	* Return the physical device block if the mapping succeeded or 0 if the block
1525	* is sparse or there was an error.
1526	*
1527	* Note: This is a problem if someone tries to run bmap() on $Boot system file
1528	* as that really is in block zero but there is nothing we can do. bmap() is
1529	* just broken in that respect (just like it cannot distinguish sparse from
1530	* not available or error).
1531	*/
1532	static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
1533	{
1534	s64 ofs, size;
1535	loff_t i_size;
1536	LCN lcn;
1537	unsigned long blocksize, flags;
1538	ntfs_inode *ni = NTFS_I(inode: mapping->host);
1539	ntfs_volume *vol = ni->vol;
1540	unsigned delta;
1541	unsigned char blocksize_bits, cluster_size_shift;
1542
1543	ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.",
1544	ni->mft_no, (unsigned long long)block);
1545	if (ni->type != AT_DATA \|\| !NInoNonResident(ni) \|\| NInoEncrypted(ni)) {
1546	ntfs_error(vol->sb, "BMAP does not make sense for %s "
1547	"attributes, returning 0.",
1548	(ni->type != AT_DATA) ? "non-data" :
1549	(!NInoNonResident(ni) ? "resident" :
1550	"encrypted"));
1551	return `0`;
1552	}
1553	/ None of these can happen. /
1554	BUG_ON(NInoCompressed(ni));
1555	BUG_ON(NInoMstProtected(ni));
1556	blocksize = vol->sb->s_blocksize;
1557	blocksize_bits = vol->sb->s_blocksize_bits;
1558	ofs = (s64)block << blocksize_bits;
1559	read_lock_irqsave(&ni->size_lock, flags);
1560	size = ni->initialized_size;
1561	i_size = i_size_read(inode: VFS_I(ni));
1562	read_unlock_irqrestore(&ni->size_lock, flags);
1563	/*
1564	* If the offset is outside the initialized size or the block straddles
1565	* the initialized size then pretend it is a hole unless the
1566	* initialized size equals the file size.
1567	*/
1568	if (unlikely(ofs >= size \|\| (ofs + blocksize > size && size < i_size)))
1569	goto hole;
1570	cluster_size_shift = vol->cluster_size_bits;
1571	down_read(sem: &ni->runlist.lock);
1572	lcn = ntfs_attr_vcn_to_lcn_nolock(ni, vcn: ofs >> cluster_size_shift, write_locked: false);
1573	up_read(sem: &ni->runlist.lock);
1574	if (unlikely(lcn < LCN_HOLE)) {
1575	/*
1576	* Step down to an integer to avoid gcc doing a long long
1577	* comparision in the switch when we know @lcn is between
1578	* LCN_HOLE and LCN_EIO (i.e. -1 to -5).
1579	*
1580	* Otherwise older gcc (at least on some architectures) will
1581	* try to use __cmpdi2() which is of course not available in
1582	* the kernel.
1583	*/
1584	switch ((int)lcn) {
1585	case LCN_ENOENT:
1586	/*
1587	* If the offset is out of bounds then pretend it is a
1588	* hole.
1589	*/
1590	goto hole;
1591	case LCN_ENOMEM:
1592	ntfs_error(vol->sb, "Not enough memory to complete "
1593	"mapping for inode 0x%lx. "
1594	"Returning 0.", ni->mft_no);
1595	break;
1596	default:
1597	ntfs_error(vol->sb, "Failed to complete mapping for "
1598	"inode 0x%lx. Run chkdsk. "
1599	"Returning 0.", ni->mft_no);
1600	break;
1601	}
1602	return `0`;
1603	}
1604	if (lcn < `0`) {
1605	/ It is a hole. /
1606	hole:
1607	ntfs_debug("Done (returning hole).");
1608	return `0`;
1609	}
1610	/*
1611	* The block is really allocated and fullfils all our criteria.
1612	* Convert the cluster to units of block size and return the result.
1613	*/
1614	delta = ofs & vol->cluster_size_mask;
1615	if (unlikely(sizeof(block) < sizeof(lcn))) {
1616	block = lcn = ((lcn << cluster_size_shift) + delta) >>
1617	blocksize_bits;
1618	/ If the block number was truncated return 0. /
1619	if (unlikely(block != lcn)) {
1620	ntfs_error(vol->sb, "Physical block 0x%llx is too "
1621	"large to be returned, returning 0.",
1622	(long long)lcn);
1623	return `0`;
1624	}
1625	} else
1626	block = ((lcn << cluster_size_shift) + delta) >>
1627	blocksize_bits;
1628	ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn);
1629	return block;
1630	}
1631
1632	/*
1633	* ntfs_normal_aops - address space operations for normal inodes and attributes
1634	*
1635	* Note these are not used for compressed or mst protected inodes and
1636	* attributes.
1637	*/
1638	const struct address_space_operations ntfs_normal_aops = {
1639	.read_folio = ntfs_read_folio,
1640	#ifdef NTFS_RW
1641	.writepage = ntfs_writepage,
1642	.dirty_folio = block_dirty_folio,
1643	#endif /* NTFS_RW */
1644	.bmap = ntfs_bmap,
1645	.migrate_folio = buffer_migrate_folio,
1646	.is_partially_uptodate = block_is_partially_uptodate,
1647	.error_remove_page = generic_error_remove_page,
1648	};
1649
1650	/*
1651	* ntfs_compressed_aops - address space operations for compressed inodes
1652	*/
1653	const struct address_space_operations ntfs_compressed_aops = {
1654	.read_folio = ntfs_read_folio,
1655	#ifdef NTFS_RW
1656	.writepage = ntfs_writepage,
1657	.dirty_folio = block_dirty_folio,
1658	#endif /* NTFS_RW */
1659	.migrate_folio = buffer_migrate_folio,
1660	.is_partially_uptodate = block_is_partially_uptodate,
1661	.error_remove_page = generic_error_remove_page,
1662	};
1663
1664	/*
1665	* ntfs_mst_aops - general address space operations for mst protecteed inodes
1666	* and attributes
1667	*/
1668	const struct address_space_operations ntfs_mst_aops = {
1669	.read_folio = ntfs_read_folio, / Fill page with data. /
1670	#ifdef NTFS_RW
1671	.writepage = ntfs_writepage, / Write dirty page to disk. /
1672	.dirty_folio = filemap_dirty_folio,
1673	#endif /* NTFS_RW */
1674	.migrate_folio = buffer_migrate_folio,
1675	.is_partially_uptodate = block_is_partially_uptodate,
1676	.error_remove_page = generic_error_remove_page,
1677	};
1678
#ifdef NTFS_RW

/**
 * mark_ntfs_record_dirty - mark an ntfs record dirty
 * @page:	page containing the ntfs record to mark dirty
 * @ofs:	byte offset within @page at which the ntfs record begins
 *
 * Set the buffers and the page in which the ntfs record is located dirty.
 *
 * The latter also marks the vfs inode the ntfs record belongs to dirty
 * (I_DIRTY_PAGES only).
 *
 * If the page does not have buffers, we create them and set them uptodate.
 * The page may not be locked which is why we need to handle the buffers under
 * the mapping->private_lock.  Once the buffers are marked dirty we no longer
 * need the lock since try_to_free_buffers() does not free dirty buffers.
 *
 * The record is taken to span ni->itype.index.block_size bytes starting at
 * @ofs; every buffer overlapping that range is marked dirty.
 */
void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs)
{
	struct address_space *mapping = page->mapping;
	ntfs_inode *ni = NTFS_I(mapping->host);
	struct buffer_head *bh, *head, *buffers_to_free = NULL;
	unsigned int end, bh_size, bh_ofs;

	BUG_ON(!PageUptodate(page));
	end = ofs + ni->itype.index.block_size;
	bh_size = VFS_I(ni)->i_sb->s_blocksize;
	spin_lock(&mapping->private_lock);
	if (unlikely(!page_has_buffers(page))) {
		/* Allocate with the spinlock dropped, then recheck under it. */
		spin_unlock(&mapping->private_lock);
		bh = head = alloc_page_buffers(page, bh_size, true);
		spin_lock(&mapping->private_lock);
		if (likely(!page_has_buffers(page))) {
			struct buffer_head *tail;

			/*
			 * Mark every new buffer uptodate, then close the
			 * NULL-terminated list into a ring and attach it to
			 * the page.
			 */
			do {
				set_buffer_uptodate(bh);
				tail = bh;
				bh = bh->b_this_page;
			} while (bh);
			tail->b_this_page = head;
			attach_page_private(page, head);
		} else
			/*
			 * Buffers were attached while the lock was dropped;
			 * ours are surplus and are freed once the lock is
			 * released.
			 */
			buffers_to_free = bh;
	}
	bh = head = page_buffers(page);
	BUG_ON(!bh);
	do {
		bh_ofs = bh_offset(bh);
		/* Buffer ends at or before the record start: skip it. */
		if (bh_ofs + bh_size <= ofs)
			continue;
		/* Buffer starts at or after the record end: we are done. */
		if (unlikely(bh_ofs >= end))
			break;
		set_buffer_dirty(bh);
	} while ((bh = bh->b_this_page) != head);
	spin_unlock(&mapping->private_lock);
	filemap_dirty_folio(mapping, page_folio(page));
	if (unlikely(buffers_to_free)) {
		/* Walk the still NULL-terminated list, freeing each buffer. */
		do {
			bh = buffers_to_free->b_this_page;
			free_buffer_head(buffers_to_free);
			buffers_to_free = bh;
		} while (buffers_to_free);
	}
}

#endif /* NTFS_RW */
1745

source code of linux/fs/ntfs/aops.c