mft.c source code [linux/fs/ntfs/mft.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
4	*
5	* Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
6	* Copyright (c) 2002 Richard Russon
7	*/
8
9	#include <linux/buffer_head.h>
10	#include <linux/slab.h>
11	#include <linux/swap.h>
12	#include <linux/bio.h>
13
14	#include "attrib.h"
15	#include "aops.h"
16	#include "bitmap.h"
17	#include "debug.h"
18	#include "dir.h"
19	#include "lcnalloc.h"
20	#include "malloc.h"
21	#include "mft.h"
22	#include "ntfs.h"
23
/*
 * Number of NTFS_BLOCK_SIZE sized blocks in one page.  Used in this file to
 * size the on-stack arrays of buffer head pointers.
 */
#define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE)
25
26	/**
27	* map_mft_record_page - map the page in which a specific mft record resides
28	* @ni: ntfs inode whose mft record page to map
29	*
30	* This maps the page in which the mft record of the ntfs inode @ni is situated
31	* and returns a pointer to the mft record within the mapped page.
32	*
33	* Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
34	* contains the negative error code returned.
35	*/
36	static inline MFT_RECORD map_mft_record_page(ntfs_inode ni)
37	{
38	loff_t i_size;
39	ntfs_volume *vol = ni->vol;
40	struct inode *mft_vi = vol->mft_ino;
41	struct page *page;
42	unsigned long index, end_index;
43	unsigned ofs;
44
45	BUG_ON(ni->page);
46	/*
47	* The index into the page cache and the offset within the page cache
48	* page of the wanted mft record. FIXME: We need to check for
49	* overflowing the unsigned long, but I don't think we would ever get
50	* here if the volume was that big...
51	*/
52	index = (u64)ni->mft_no << vol->mft_record_size_bits >>
53	PAGE_SHIFT;
54	ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
55
56	i_size = i_size_read(inode: mft_vi);
57	/ The maximum valid index into the page cache for $MFT's data. /
58	end_index = i_size >> PAGE_SHIFT;
59
60	/ If the wanted index is out of bounds the mft record doesn't exist. /
61	if (unlikely(index >= end_index)) {
62	if (index > end_index \|\| (i_size & ~PAGE_MASK) < ofs +
63	vol->mft_record_size) {
64	page = ERR_PTR(error: -ENOENT);
65	ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
66	"which is beyond the end of the mft. "
67	"This is probably a bug in the ntfs "
68	"driver.", ni->mft_no);
69	goto err_out;
70	}
71	}
72	/ Read, map, and pin the page. /
73	page = ntfs_map_page(mapping: mft_vi->i_mapping, index);
74	if (!IS_ERR(ptr: page)) {
75	/ Catch multi sector transfer fixup errors. /
76	if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
77	ofs)))) {
78	ni->page = page;
79	ni->page_ofs = ofs;
80	return page_address(page) + ofs;
81	}
82	ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. "
83	"Run chkdsk.", ni->mft_no);
84	ntfs_unmap_page(page);
85	page = ERR_PTR(error: -EIO);
86	NVolSetErrors(vol);
87	}
88	err_out:
89	ni->page = NULL;
90	ni->page_ofs = `0`;
91	return (void*)page;
92	}
93
94	/**
95	* map_mft_record - map, pin and lock an mft record
96	* @ni: ntfs inode whose MFT record to map
97	*
98	* First, take the mrec_lock mutex. We might now be sleeping, while waiting
99	* for the mutex if it was already locked by someone else.
100	*
101	* The page of the record is mapped using map_mft_record_page() before being
102	* returned to the caller.
103	*
104	* This in turn uses ntfs_map_page() to get the page containing the wanted mft
105	* record (it in turn calls read_cache_page() which reads it in from disk if
106	* necessary, increments the use count on the page so that it cannot disappear
107	* under us and returns a reference to the page cache page).
108	*
109	* If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
110	* sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
111	* and the post-read mst fixups on each mft record in the page have been
112	* performed, the page gets PG_uptodate set and PG_locked cleared (this is done
113	* in our asynchronous I/O completion handler end_buffer_read_mft_async()).
114	* ntfs_map_page() waits for PG_locked to become clear and checks if
115	* PG_uptodate is set and returns an error code if not. This provides
116	* sufficient protection against races when reading/using the page.
117	*
118	* However there is the write mapping to think about. Doing the above described
119	* checking here will be fine, because when initiating the write we will set
120	* PG_locked and clear PG_uptodate making sure nobody is touching the page
121	* contents. Doing the locking this way means that the commit to disk code in
122	* the page cache code paths is automatically sufficiently locked with us as
123	* we will not touch a page that has been locked or is not uptodate. The only
124	* locking problem then is them locking the page while we are accessing it.
125	*
126	* So that code will end up having to own the mrec_lock of all mft
127	* records/inodes present in the page before I/O can proceed. In that case we
128	* wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
129	* accessing anything without owning the mrec_lock mutex. But we do need to
130	* use them because of the read_cache_page() invocation and the code becomes so
131	* much simpler this way that it is well worth it.
132	*
133	* The mft record is now ours and we return a pointer to it. You need to check
134	* the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
135	* the error code.
136	*
137	* NOTE: Caller is responsible for setting the mft record dirty before calling
138	* unmap_mft_record(). This is obviously only necessary if the caller really
139	* modified the mft record...
140	* Q: Do we want to recycle one of the VFS inode state bits instead?
141	* A: No, the inode ones mean we want to change the mft record, not we want to
142	* write it out.
143	*/
144	MFT_RECORD map_mft_record(ntfs_inode ni)
145	{
146	MFT_RECORD *m;
147
148	ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
149
150	/ Make sure the ntfs inode doesn't go away. /
151	atomic_inc(v: &ni->count);
152
153	/ Serialize access to this mft record. /
154	mutex_lock(&ni->mrec_lock);
155
156	m = map_mft_record_page(ni);
157	if (!IS_ERR(ptr: m))
158	return m;
159
160	mutex_unlock(lock: &ni->mrec_lock);
161	atomic_dec(v: &ni->count);
162	ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
163	return m;
164	}
165
166	/**
167	* unmap_mft_record_page - unmap the page in which a specific mft record resides
168	* @ni: ntfs inode whose mft record page to unmap
169	*
170	* This unmaps the page in which the mft record of the ntfs inode @ni is
171	* situated and returns. This is a NOOP if highmem is not configured.
172	*
173	* The unmap happens via ntfs_unmap_page() which in turn decrements the use
174	* count on the page thus releasing it from the pinned state.
175	*
176	* We do not actually unmap the page from memory of course, as that will be
177	* done by the page cache code itself when memory pressure increases or
178	* whatever.
179	*/
180	static inline void unmap_mft_record_page(ntfs_inode *ni)
181	{
182	BUG_ON(!ni->page);
183
184	// TODO: If dirty, blah...
185	ntfs_unmap_page(page: ni->page);
186	ni->page = NULL;
187	ni->page_ofs = `0`;
188	return;
189	}
190
191	/**
192	* unmap_mft_record - release a mapped mft record
193	* @ni: ntfs inode whose MFT record to unmap
194	*
195	* We release the page mapping and the mrec_lock mutex which unmaps the mft
196	* record and releases it for others to get hold of. We also release the ntfs
197	* inode by decrementing the ntfs inode reference count.
198	*
199	* NOTE: If caller has modified the mft record, it is imperative to set the mft
200	* record dirty BEFORE calling unmap_mft_record().
201	*/
202	void unmap_mft_record(ntfs_inode *ni)
203	{
204	struct page *page = ni->page;
205
206	BUG_ON(!page);
207
208	ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
209
210	unmap_mft_record_page(ni);
211	mutex_unlock(lock: &ni->mrec_lock);
212	atomic_dec(v: &ni->count);
213	/*
214	* If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
215	* ntfs_clear_extent_inode() in the extent inode case, and to the
216	* caller in the non-extent, yet pure ntfs inode case, to do the actual
217	* tear down of all structures and freeing of all allocated memory.
218	*/
219	return;
220	}
221
222	/**
223	* map_extent_mft_record - load an extent inode and attach it to its base
224	* @base_ni: base ntfs inode
225	* @mref: mft reference of the extent inode to load
226	* @ntfs_ino: on successful return, pointer to the ntfs_inode structure
227	*
228	* Load the extent mft record @mref and attach it to its base inode @base_ni.
229	* Return the mapped extent mft record if IS_ERR(result) is false. Otherwise
230	* PTR_ERR(result) gives the negative error code.
231	*
232	* On successful return, @ntfs_ino contains a pointer to the ntfs_inode
233	* structure of the mapped extent inode.
234	*/
235	MFT_RECORD map_extent_mft_record(ntfs_inode base_ni, MFT_REF mref,
236	ntfs_inode **ntfs_ino)
237	{
238	MFT_RECORD *m;
239	ntfs_inode *ni = NULL;
240	ntfs_inode **extent_nis = NULL;
241	int i;
242	unsigned long mft_no = MREF(mref);
243	u16 seq_no = MSEQNO(mref);
244	bool destroy_ni = false;
245
246	ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
247	mft_no, base_ni->mft_no);
248	/ Make sure the base ntfs inode doesn't go away. /
249	atomic_inc(v: &base_ni->count);
250	/*
251	* Check if this extent inode has already been added to the base inode,
252	* in which case just return it. If not found, add it to the base
253	* inode before returning it.
254	*/
255	mutex_lock(&base_ni->extent_lock);
256	if (base_ni->nr_extents > `0`) {
257	extent_nis = base_ni->ext.extent_ntfs_inos;
258	for (i = `0`; i < base_ni->nr_extents; i++) {
259	if (mft_no != extent_nis[i]->mft_no)
260	continue;
261	ni = extent_nis[i];
262	/ Make sure the ntfs inode doesn't go away. /
263	atomic_inc(v: &ni->count);
264	break;
265	}
266	}
267	if (likely(ni != NULL)) {
268	mutex_unlock(lock: &base_ni->extent_lock);
269	atomic_dec(v: &base_ni->count);
270	/ We found the record; just have to map and return it. /
271	m = map_mft_record(ni);
272	/ map_mft_record() has incremented this on success. /
273	atomic_dec(v: &ni->count);
274	if (!IS_ERR(ptr: m)) {
275	/ Verify the sequence number. /
276	if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
277	ntfs_debug("Done 1.");
278	*ntfs_ino = ni;
279	return m;
280	}
281	unmap_mft_record(ni);
282	ntfs_error(base_ni->vol->sb, "Found stale extent mft "
283	"reference! Corrupt filesystem. "
284	"Run chkdsk.");
285	return ERR_PTR(error: -EIO);
286	}
287	map_err_out:
288	ntfs_error(base_ni->vol->sb, "Failed to map extent "
289	"mft record, error code %ld.", -PTR_ERR(m));
290	return m;
291	}
292	/ Record wasn't there. Get a new ntfs inode and initialize it. /
293	ni = ntfs_new_extent_inode(sb: base_ni->vol->sb, mft_no);
294	if (unlikely(!ni)) {
295	mutex_unlock(lock: &base_ni->extent_lock);
296	atomic_dec(v: &base_ni->count);
297	return ERR_PTR(error: -ENOMEM);
298	}
299	ni->vol = base_ni->vol;
300	ni->seq_no = seq_no;
301	ni->nr_extents = -`1`;
302	ni->ext.base_ntfs_ino = base_ni;
303	/ Now map the record. /
304	m = map_mft_record(ni);
305	if (IS_ERR(ptr: m)) {
306	mutex_unlock(lock: &base_ni->extent_lock);
307	atomic_dec(v: &base_ni->count);
308	ntfs_clear_extent_inode(ni);
309	goto map_err_out;
310	}
311	/ Verify the sequence number if it is present. /
312	if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
313	ntfs_error(base_ni->vol->sb, "Found stale extent mft "
314	"reference! Corrupt filesystem. Run chkdsk.");
315	destroy_ni = true;
316	m = ERR_PTR(error: -EIO);
317	goto unm_err_out;
318	}
319	/ Attach extent inode to base inode, reallocating memory if needed. /
320	if (!(base_ni->nr_extents & `3`)) {
321	ntfs_inode **tmp;
322	int new_size = (base_ni->nr_extents + `4`) * sizeof(ntfs_inode *);
323
324	tmp = kmalloc(size: new_size, GFP_NOFS);
325	if (unlikely(!tmp)) {
326	ntfs_error(base_ni->vol->sb, "Failed to allocate "
327	"internal buffer.");
328	destroy_ni = true;
329	m = ERR_PTR(error: -ENOMEM);
330	goto unm_err_out;
331	}
332	if (base_ni->nr_extents) {
333	BUG_ON(!base_ni->ext.extent_ntfs_inos);
334	memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
335	`4` * sizeof(ntfs_inode *));
336	kfree(objp: base_ni->ext.extent_ntfs_inos);
337	}
338	base_ni->ext.extent_ntfs_inos = tmp;
339	}
340	base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
341	mutex_unlock(lock: &base_ni->extent_lock);
342	atomic_dec(v: &base_ni->count);
343	ntfs_debug("Done 2.");
344	*ntfs_ino = ni;
345	return m;
346	unm_err_out:
347	unmap_mft_record(ni);
348	mutex_unlock(lock: &base_ni->extent_lock);
349	atomic_dec(v: &base_ni->count);
350	/*
351	* If the extent inode was not attached to the base inode we need to
352	* release it or we will leak memory.
353	*/
354	if (destroy_ni)
355	ntfs_clear_extent_inode(ni);
356	return m;
357	}
358
359	#ifdef NTFS_RW
360
361	/**
362	* __mark_mft_record_dirty - set the mft record and the page containing it dirty
363	* @ni: ntfs inode describing the mapped mft record
364	*
365	* Internal function. Users should call mark_mft_record_dirty() instead.
366	*
367	* Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
368	* as well as the page containing the mft record, dirty. Also, mark the base
369	* vfs inode dirty. This ensures that any changes to the mft record are
370	* written out to disk.
371	*
372	* NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
373	* on the base vfs inode, because even though file data may have been modified,
374	* it is dirty in the inode meta data rather than the data page cache of the
375	* inode, and thus there are no data pages that need writing out. Therefore, a
376	* full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
377	* other hand, is not sufficient, because ->write_inode needs to be called even
378	* in case of fdatasync. This needs to happen or the file data would not
379	* necessarily hit the device synchronously, even though the vfs inode has the
380	* O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just
381	* I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
382	* which is not what I_DIRTY_SYNC on its own would suggest.
383	*/
384	void __mark_mft_record_dirty(ntfs_inode *ni)
385	{
386	ntfs_inode *base_ni;
387
388	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
389	BUG_ON(NInoAttr(ni));
390	mark_ntfs_record_dirty(page: ni->page, ofs: ni->page_ofs);
391	/ Determine the base vfs inode and mark it dirty, too. /
392	mutex_lock(&ni->extent_lock);
393	if (likely(ni->nr_extents >= `0`))
394	base_ni = ni;
395	else
396	base_ni = ni->ext.base_ntfs_ino;
397	mutex_unlock(lock: &ni->extent_lock);
398	__mark_inode_dirty(VFS_I(ni: base_ni), I_DIRTY_DATASYNC);
399	}
400
/* Standard request appended to messages about unimplemented code paths. */
static const char *ntfs_please_email = "Please email "
		"linux-ntfs-dev@lists.sourceforge.net and say that you saw "
		"this message. Thank you.";
404
405	/**
406	* ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror
407	* @vol: ntfs volume on which the mft record to synchronize resides
408	* @mft_no: mft record number of mft record to synchronize
409	* @m: mapped, mst protected (extent) mft record to synchronize
410	*
411	* Write the mapped, mst protected (extent) mft record @m with mft record
412	* number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol,
413	* bypassing the page cache and the $MFTMirr inode itself.
414	*
415	* This function is only for use at umount time when the mft mirror inode has
416	* already been disposed off. We BUG() if we are called while the mft mirror
417	* inode is still attached to the volume.
418	*
419	* On success return 0. On error return -errno.
420	*
421	* NOTE: This function is not implemented yet as I am not convinced it can
422	* actually be triggered considering the sequence of commits we do in super.c::
423	* ntfs_put_super(). But just in case we provide this place holder as the
424	* alternative would be either to BUG() or to get a NULL pointer dereference
425	* and Oops.
426	*/
427	static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol,
428	const unsigned long mft_no, MFT_RECORD *m)
429	{
430	BUG_ON(vol->mftmirr_ino);
431	ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
432	"implemented yet. %s", ntfs_please_email);
433	return -EOPNOTSUPP;
434	}
435
436	/**
437	* ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
438	* @vol: ntfs volume on which the mft record to synchronize resides
439	* @mft_no: mft record number of mft record to synchronize
440	* @m: mapped, mst protected (extent) mft record to synchronize
441	* @sync: if true, wait for i/o completion
442	*
443	* Write the mapped, mst protected (extent) mft record @m with mft record
444	* number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
445	*
446	* On success return 0. On error return -errno and set the volume errors flag
447	* in the ntfs volume @vol.
448	*
449	* NOTE: We always perform synchronous i/o and ignore the @sync parameter.
450	*
451	* TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
452	* schedule i/o via ->writepage or do it via kntfsd or whatever.
453	*/
454	int ntfs_sync_mft_mirror(ntfs_volume vol, const* unsigned long mft_no,
455	MFT_RECORD m, int* sync)
456	{
457	struct page *page;
458	unsigned int blocksize = vol->sb->s_blocksize;
459	int max_bhs = vol->mft_record_size / blocksize;
460	struct buffer_head *bhs[MAX_BHS];
461	struct buffer_head bh, head;
462	u8 *kmirr;
463	runlist_element *rl;
464	unsigned int block_start, block_end, m_start, m_end, page_ofs;
465	int i_bhs, nr_bhs, err = `0`;
466	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
467
468	ntfs_debug("Entering for inode 0x%lx.", mft_no);
469	BUG_ON(!max_bhs);
470	if (WARN_ON(max_bhs > MAX_BHS))
471	return -EINVAL;
472	if (unlikely(!vol->mftmirr_ino)) {
473	/ This could happen during umount... /
474	err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
475	if (likely(!err))
476	return err;
477	goto err_out;
478	}
479	/ Get the page containing the mirror copy of the mft record @m. /
480	page = ntfs_map_page(mapping: vol->mftmirr_ino->i_mapping, index: mft_no >>
481	(PAGE_SHIFT - vol->mft_record_size_bits));
482	if (IS_ERR(ptr: page)) {
483	ntfs_error(vol->sb, "Failed to map mft mirror page.");
484	err = PTR_ERR(ptr: page);
485	goto err_out;
486	}
487	lock_page(page);
488	BUG_ON(!PageUptodate(page));
489	ClearPageUptodate(page);
490	/ Offset of the mft mirror record inside the page. /
491	page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
492	/ The address in the page of the mirror copy of the mft record @m. /
493	kmirr = page_address(page) + page_ofs;
494	/ Copy the mst protected mft record to the mirror. /
495	memcpy(kmirr, m, vol->mft_record_size);
496	/ Create uptodate buffers if not present. /
497	if (unlikely(!page_has_buffers(page))) {
498	struct buffer_head *tail;
499
500	bh = head = alloc_page_buffers(page, size: blocksize, retry: true);
501	do {
502	set_buffer_uptodate(bh);
503	tail = bh;
504	bh = bh->b_this_page;
505	} while (bh);
506	tail->b_this_page = head;
507	attach_page_private(page, data: head);
508	}
509	bh = head = page_buffers(page);
510	BUG_ON(!bh);
511	rl = NULL;
512	nr_bhs = `0`;
513	block_start = `0`;
514	m_start = kmirr - (u8*)page_address(page);
515	m_end = m_start + vol->mft_record_size;
516	do {
517	block_end = block_start + blocksize;
518	/ If the buffer is outside the mft record, skip it. /
519	if (block_end <= m_start)
520	continue;
521	if (unlikely(block_start >= m_end))
522	break;
523	/ Need to map the buffer if it is not mapped already. /
524	if (unlikely(!buffer_mapped(bh))) {
525	VCN vcn;
526	LCN lcn;
527	unsigned int vcn_ofs;
528
529	bh->b_bdev = vol->sb->s_bdev;
530	/ Obtain the vcn and offset of the current block. /
531	vcn = ((VCN)mft_no << vol->mft_record_size_bits) +
532	(block_start - m_start);
533	vcn_ofs = vcn & vol->cluster_size_mask;
534	vcn >>= vol->cluster_size_bits;
535	if (!rl) {
536	down_read(sem: &NTFS_I(inode: vol->mftmirr_ino)->
537	runlist.lock);
538	rl = NTFS_I(inode: vol->mftmirr_ino)->runlist.rl;
539	/*
540	* $MFTMirr always has the whole of its runlist
541	* in memory.
542	*/
543	BUG_ON(!rl);
544	}
545	/ Seek to element containing target vcn. /
546	while (rl->length && rl[`1`].vcn <= vcn)
547	rl++;
548	lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
549	/ For $MFTMirr, only lcn >= 0 is a successful remap. /
550	if (likely(lcn >= `0`)) {
551	/ Setup buffer head to correct block. /
552	bh->b_blocknr = ((lcn <<
553	vol->cluster_size_bits) +
554	vcn_ofs) >> blocksize_bits;
555	set_buffer_mapped(bh);
556	} else {
557	bh->b_blocknr = -`1`;
558	ntfs_error(vol->sb, "Cannot write mft mirror "
559	"record 0x%lx because its "
560	"location on disk could not "
561	"be determined (error code "
562	"%lli).", mft_no,
563	(long long)lcn);
564	err = -EIO;
565	}
566	}
567	BUG_ON(!buffer_uptodate(bh));
568	BUG_ON(!nr_bhs && (m_start != block_start));
569	BUG_ON(nr_bhs >= max_bhs);
570	bhs[nr_bhs++] = bh;
571	BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
572	} while (block_start = block_end, (bh = bh->b_this_page) != head);
573	if (unlikely(rl))
574	up_read(sem: &NTFS_I(inode: vol->mftmirr_ino)->runlist.lock);
575	if (likely(!err)) {
576	/ Lock buffers and start synchronous write i/o on them. /
577	for (i_bhs = `0`; i_bhs < nr_bhs; i_bhs++) {
578	struct buffer_head *tbh = bhs[i_bhs];
579
580	if (!trylock_buffer(bh: tbh))
581	BUG();
582	BUG_ON(!buffer_uptodate(tbh));
583	clear_buffer_dirty(bh: tbh);
584	get_bh(bh: tbh);
585	tbh->b_end_io = end_buffer_write_sync;
586	submit_bh(REQ_OP_WRITE, tbh);
587	}
588	/ Wait on i/o completion of buffers. /
589	for (i_bhs = `0`; i_bhs < nr_bhs; i_bhs++) {
590	struct buffer_head *tbh = bhs[i_bhs];
591
592	wait_on_buffer(bh: tbh);
593	if (unlikely(!buffer_uptodate(tbh))) {
594	err = -EIO;
595	/*
596	* Set the buffer uptodate so the page and
597	* buffer states do not become out of sync.
598	*/
599	set_buffer_uptodate(tbh);
600	}
601	}
602	} else / if (unlikely(err)) / {
603	/ Clean the buffers. /
604	for (i_bhs = `0`; i_bhs < nr_bhs; i_bhs++)
605	clear_buffer_dirty(bh: bhs[i_bhs]);
606	}
607	/ Current state: all buffers are clean, unlocked, and uptodate. /
608	/ Remove the mst protection fixups again. /
609	post_write_mst_fixup(b: (NTFS_RECORD*)kmirr);
610	flush_dcache_page(page);
611	SetPageUptodate(page);
612	unlock_page(page);
613	ntfs_unmap_page(page);
614	if (likely(!err)) {
615	ntfs_debug("Done.");
616	} else {
617	ntfs_error(vol->sb, "I/O error while writing mft mirror "
618	"record 0x%lx!", mft_no);
619	err_out:
620	ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error "
621	"code %i). Volume will be left marked dirty "
622	"on umount. Run ntfsfix on the partition "
623	"after umounting to correct this.", -err);
624	NVolSetErrors(vol);
625	}
626	return err;
627	}
628
629	/**
630	* write_mft_record_nolock - write out a mapped (extent) mft record
631	* @ni: ntfs inode describing the mapped (extent) mft record
632	* @m: mapped (extent) mft record to write
633	* @sync: if true, wait for i/o completion
634	*
635	* Write the mapped (extent) mft record @m described by the (regular or extent)
636	* ntfs inode @ni to backing store. If the mft record @m has a counterpart in
637	* the mft mirror, that is also updated.
638	*
639	* We only write the mft record if the ntfs inode @ni is dirty and the first
640	* buffer belonging to its mft record is dirty, too. We ignore the dirty state
641	* of subsequent buffers because we could have raced with
642	* fs/ntfs/aops.c::mark_ntfs_record_dirty().
643	*
644	* On success, clean the mft record and return 0. On error, leave the mft
645	* record dirty and return -errno.
646	*
647	* NOTE: We always perform synchronous i/o and ignore the @sync parameter.
648	* However, if the mft record has a counterpart in the mft mirror and @sync is
649	* true, we write the mft record, wait for i/o completion, and only then write
650	* the mft mirror copy. This ensures that if the system crashes either the mft
651	* or the mft mirror will contain a self-consistent mft record @m. If @sync is
652	* false on the other hand, we start i/o on both and then wait for completion
653	* on them. This provides a speedup but no longer guarantees that you will end
654	* up with a self-consistent mft record in the case of a crash but if you asked
655	* for asynchronous writing you probably do not care about that anyway.
656	*
657	* TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
658	* schedule i/o via ->writepage or do it via kntfsd or whatever.
659	*/
660	int write_mft_record_nolock(ntfs_inode ni, MFT_RECORD m, int sync)
661	{
662	ntfs_volume *vol = ni->vol;
663	struct page *page = ni->page;
664	unsigned int blocksize = vol->sb->s_blocksize;
665	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
666	int max_bhs = vol->mft_record_size / blocksize;
667	struct buffer_head *bhs[MAX_BHS];
668	struct buffer_head bh, head;
669	runlist_element *rl;
670	unsigned int block_start, block_end, m_start, m_end;
671	int i_bhs, nr_bhs, err = `0`;
672
673	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
674	BUG_ON(NInoAttr(ni));
675	BUG_ON(!max_bhs);
676	BUG_ON(!PageLocked(page));
677	if (WARN_ON(max_bhs > MAX_BHS)) {
678	err = -EINVAL;
679	goto err_out;
680	}
681	/*
682	* If the ntfs_inode is clean no need to do anything. If it is dirty,
683	* mark it as clean now so that it can be redirtied later on if needed.
684	* There is no danger of races since the caller is holding the locks
685	* for the mft record @m and the page it is in.
686	*/
687	if (!NInoTestClearDirty(ni))
688	goto done;
689	bh = head = page_buffers(page);
690	BUG_ON(!bh);
691	rl = NULL;
692	nr_bhs = `0`;
693	block_start = `0`;
694	m_start = ni->page_ofs;
695	m_end = m_start + vol->mft_record_size;
696	do {
697	block_end = block_start + blocksize;
698	/ If the buffer is outside the mft record, skip it. /
699	if (block_end <= m_start)
700	continue;
701	if (unlikely(block_start >= m_end))
702	break;
703	/*
704	* If this block is not the first one in the record, we ignore
705	* the buffer's dirty state because we could have raced with a
706	* parallel mark_ntfs_record_dirty().
707	*/
708	if (block_start == m_start) {
709	/ This block is the first one in the record. /
710	if (!buffer_dirty(bh)) {
711	BUG_ON(nr_bhs);
712	/ Clean records are not written out. /
713	break;
714	}
715	}
716	/ Need to map the buffer if it is not mapped already. /
717	if (unlikely(!buffer_mapped(bh))) {
718	VCN vcn;
719	LCN lcn;
720	unsigned int vcn_ofs;
721
722	bh->b_bdev = vol->sb->s_bdev;
723	/ Obtain the vcn and offset of the current block. /
724	vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) +
725	(block_start - m_start);
726	vcn_ofs = vcn & vol->cluster_size_mask;
727	vcn >>= vol->cluster_size_bits;
728	if (!rl) {
729	down_read(sem: &NTFS_I(inode: vol->mft_ino)->runlist.lock);
730	rl = NTFS_I(inode: vol->mft_ino)->runlist.rl;
731	BUG_ON(!rl);
732	}
733	/ Seek to element containing target vcn. /
734	while (rl->length && rl[`1`].vcn <= vcn)
735	rl++;
736	lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
737	/ For $MFT, only lcn >= 0 is a successful remap. /
738	if (likely(lcn >= `0`)) {
739	/ Setup buffer head to correct block. /
740	bh->b_blocknr = ((lcn <<
741	vol->cluster_size_bits) +
742	vcn_ofs) >> blocksize_bits;
743	set_buffer_mapped(bh);
744	} else {
745	bh->b_blocknr = -`1`;
746	ntfs_error(vol->sb, "Cannot write mft record "
747	"0x%lx because its location "
748	"on disk could not be "
749	"determined (error code %lli).",
750	ni->mft_no, (long long)lcn);
751	err = -EIO;
752	}
753	}
754	BUG_ON(!buffer_uptodate(bh));
755	BUG_ON(!nr_bhs && (m_start != block_start));
756	BUG_ON(nr_bhs >= max_bhs);
757	bhs[nr_bhs++] = bh;
758	BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
759	} while (block_start = block_end, (bh = bh->b_this_page) != head);
760	if (unlikely(rl))
761	up_read(sem: &NTFS_I(inode: vol->mft_ino)->runlist.lock);
762	if (!nr_bhs)
763	goto done;
764	if (unlikely(err))
765	goto cleanup_out;
766	/ Apply the mst protection fixups. /
767	err = pre_write_mst_fixup(b: (NTFS_RECORD*)m, size: vol->mft_record_size);
768	if (err) {
769	ntfs_error(vol->sb, "Failed to apply mst fixups!");
770	goto cleanup_out;
771	}
772	flush_dcache_mft_record_page(ni);
773	/ Lock buffers and start synchronous write i/o on them. /
774	for (i_bhs = `0`; i_bhs < nr_bhs; i_bhs++) {
775	struct buffer_head *tbh = bhs[i_bhs];
776
777	if (!trylock_buffer(bh: tbh))
778	BUG();
779	BUG_ON(!buffer_uptodate(tbh));
780	clear_buffer_dirty(bh: tbh);
781	get_bh(bh: tbh);
782	tbh->b_end_io = end_buffer_write_sync;
783	submit_bh(REQ_OP_WRITE, tbh);
784	}
785	/ Synchronize the mft mirror now if not @sync. /
786	if (!sync && ni->mft_no < vol->mftmirr_size)
787	ntfs_sync_mft_mirror(vol, mft_no: ni->mft_no, m, sync);
788	/ Wait on i/o completion of buffers. /
789	for (i_bhs = `0`; i_bhs < nr_bhs; i_bhs++) {
790	struct buffer_head *tbh = bhs[i_bhs];
791
792	wait_on_buffer(bh: tbh);
793	if (unlikely(!buffer_uptodate(tbh))) {
794	err = -EIO;
795	/*
796	* Set the buffer uptodate so the page and buffer
797	* states do not become out of sync.
798	*/
799	if (PageUptodate(page))
800	set_buffer_uptodate(tbh);
801	}
802	}
803	/ If @sync, now synchronize the mft mirror. /
804	if (sync && ni->mft_no < vol->mftmirr_size)
805	ntfs_sync_mft_mirror(vol, mft_no: ni->mft_no, m, sync);
806	/ Remove the mst protection fixups again. /
807	post_write_mst_fixup(b: (NTFS_RECORD*)m);
808	flush_dcache_mft_record_page(ni);
809	if (unlikely(err)) {
810	/ I/O error during writing. This is really bad! /
811	ntfs_error(vol->sb, "I/O error while writing mft record "
812	"0x%lx! Marking base inode as bad. You "
813	"should unmount the volume and run chkdsk.",
814	ni->mft_no);
815	goto err_out;
816	}
817	done:
818	ntfs_debug("Done.");
819	return `0`;
820	cleanup_out:
821	/ Clean the buffers. /
822	for (i_bhs = `0`; i_bhs < nr_bhs; i_bhs++)
823	clear_buffer_dirty(bh: bhs[i_bhs]);
824	err_out:
825	/*
826	* Current state: all buffers are clean, unlocked, and uptodate.
827	* The caller should mark the base inode as bad so that no more i/o
828	* happens. ->clear_inode() will still be invoked so all extent inodes
829	* and other allocated memory will be freed.
830	*/
831	if (err == -ENOMEM) {
832	ntfs_error(vol->sb, "Not enough memory to write mft record. "
833	"Redirtying so the write is retried later.");
834	mark_mft_record_dirty(ni);
835	err = `0`;
836	} else
837	NVolSetErrors(vol);
838	return err;
839	}
840
841	/**
842	* ntfs_may_write_mft_record - check if an mft record may be written out
843	* @vol: [IN] ntfs volume on which the mft record to check resides
844	* @mft_no: [IN] mft record number of the mft record to check
845	* @m: [IN] mapped mft record to check
846	* @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned
847	*
848	* Check if the mapped (base or extent) mft record @m with mft record number
849	* @mft_no belonging to the ntfs volume @vol may be written out. If necessary
850	* and possible the ntfs inode of the mft record is locked and the base vfs
851	* inode is pinned. The locked ntfs inode is then returned in @locked_ni. The
852	* caller is responsible for unlocking the ntfs inode and unpinning the base
853	* vfs inode.
854	*
855	* Return 'true' if the mft record may be written out and 'false' if not.
856	*
857	* The caller has locked the page and cleared the uptodate flag on it which
858	* means that we can safely write out any dirty mft records that do not have
859	* their inodes in icache as determined by ilookup5() as anyone
860	* opening/creating such an inode would block when attempting to map the mft
861	* record in read_cache_page() until we are finished with the write out.
862	*
863	* Here is a description of the tests we perform:
864	*
865	* If the inode is found in icache we know the mft record must be a base mft
866	* record. If it is dirty, we do not write it and return 'false' as the vfs
867	* inode write paths will result in the access times being updated which would
868	* cause the base mft record to be redirtied and written out again. (We know
869	* the access time update will modify the base mft record because Windows
870	* chkdsk complains if the standard information attribute is not in the base
871	* mft record.)
872	*
873	* If the inode is in icache and not dirty, we attempt to lock the mft record
874	* and if we find the lock was already taken, it is not safe to write the mft
875	* record and we return 'false'.
876	*
877	* If we manage to obtain the lock we have exclusive access to the mft record,
878	* which also allows us safe writeout of the mft record. We then set
879	* @locked_ni to the locked ntfs inode and return 'true'.
880	*
881	* Note we cannot just lock the mft record and sleep while waiting for the lock
882	* because this would deadlock due to lock reversal (normally the mft record is
883	* locked before the page is locked but we already have the page locked here
884	* when we try to lock the mft record).
885	*
886	* If the inode is not in icache we need to perform further checks.
887	*
888	* If the mft record is not a FILE record or it is a base mft record, we can
889	* safely write it and return 'true'.
890	*
891	* We now know the mft record is an extent mft record. We check if the inode
892	* corresponding to its base mft record is in icache and obtain a reference to
893	* it if it is. If it is not, we can safely write it and return 'true'.
894	*
895	* We now have the base inode for the extent mft record. We check if it has an
896	* ntfs inode for the extent mft record attached and if not it is safe to write
897	* the extent mft record and we return 'true'.
898	*
899	* The ntfs inode for the extent mft record is attached to the base inode so we
900	* attempt to lock the extent mft record and if we find the lock was already
901	* taken, it is not safe to write the extent mft record and we return 'false'.
902	*
903	* If we manage to obtain the lock we have exclusive access to the extent mft
904	* record, which also allows us safe writeout of the extent mft record. We
905	* set the ntfs inode of the extent mft record clean and then set @locked_ni to
906	* the now locked ntfs inode and return 'true'.
907	*
908	* Note, the reason for actually writing dirty mft records here and not just
909	* relying on the vfs inode dirty code paths is that we can have mft records
910	* modified without them ever having actual inodes in memory. Also we can have
911	* dirty mft records with clean ntfs inodes in memory. None of the described
912	* cases would result in the dirty mft records being written out if we only
913	* relied on the vfs inode dirty code paths. And these cases can really occur
914	* during allocation of new mft records and in particular when the
915	* initialized_size of the $MFT/$DATA attribute is extended and the new space
916	* is initialized using ntfs_mft_record_format(). The clean inode can then
917	* appear if the mft record is reused for a new inode before it got written
918	* out.
919	*/
920	bool ntfs_may_write_mft_record(ntfs_volume vol, const* unsigned long mft_no,
921	const MFT_RECORD m, ntfs_inode *locked_ni)
922	{
923	struct super_block *sb = vol->sb;
924	struct inode *mft_vi = vol->mft_ino;
925	struct inode *vi;
926	ntfs_inode ni, eni, **extent_nis;
927	int i;
928	ntfs_attr na;
929
930	ntfs_debug("Entering for inode 0x%lx.", mft_no);
931	/*
932	* Normally we do not return a locked inode so set @locked_ni to NULL.
933	*/
934	BUG_ON(!locked_ni);
935	*locked_ni = NULL;
936	/*
937	* Check if the inode corresponding to this mft record is in the VFS
938	* inode cache and obtain a reference to it if it is.
939	*/
940	ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
941	na.mft_no = mft_no;
942	na.name = NULL;
943	na.name_len = `0`;
944	na.type = AT_UNUSED;
945	/*
946	* Optimize inode 0, i.e. $MFT itself, since we have it in memory and
947	* we get here for it rather often.
948	*/
949	if (!mft_no) {
950	/ Balance the below iput(). /
951	vi = igrab(mft_vi);
952	BUG_ON(vi != mft_vi);
953	} else {
954	/*
955	* Have to use ilookup5_nowait() since ilookup5() waits for the
956	* inode lock which causes ntfs to deadlock when a concurrent
957	* inode write via the inode dirty code paths and the page
958	* dirty code path of the inode dirty code path when writing
959	* $MFT occurs.
960	*/
961	vi = ilookup5_nowait(sb, hashval: mft_no, test: ntfs_test_inode, data: &na);
962	}
963	if (vi) {
964	ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
965	/ The inode is in icache. /
966	ni = NTFS_I(inode: vi);
967	/ Take a reference to the ntfs inode. /
968	atomic_inc(v: &ni->count);
969	/ If the inode is dirty, do not write this record. /
970	if (NInoDirty(ni)) {
971	ntfs_debug("Inode 0x%lx is dirty, do not write it.",
972	mft_no);
973	atomic_dec(v: &ni->count);
974	iput(vi);
975	return false;
976	}
977	ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
978	/ The inode is not dirty, try to take the mft record lock. /
979	if (unlikely(!mutex_trylock(&ni->mrec_lock))) {
980	ntfs_debug("Mft record 0x%lx is already locked, do "
981	"not write it.", mft_no);
982	atomic_dec(v: &ni->count);
983	iput(vi);
984	return false;
985	}
986	ntfs_debug("Managed to lock mft record 0x%lx, write it.",
987	mft_no);
988	/*
989	* The write has to occur while we hold the mft record lock so
990	* return the locked ntfs inode.
991	*/
992	*locked_ni = ni;
993	return true;
994	}
995	ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
996	/ The inode is not in icache. /
997	/ Write the record if it is not a mft record (type "FILE"). /
998	if (!ntfs_is_mft_record(m->magic)) {
999	ntfs_debug("Mft record 0x%lx is not a FILE record, write it.",
1000	mft_no);
1001	return true;
1002	}
1003	/ Write the mft record if it is a base inode. /
1004	if (!m->base_mft_record) {
1005	ntfs_debug("Mft record 0x%lx is a base record, write it.",
1006	mft_no);
1007	return true;
1008	}
1009	/*
1010	* This is an extent mft record. Check if the inode corresponding to
1011	* its base mft record is in icache and obtain a reference to it if it
1012	* is.
1013	*/
1014	na.mft_no = MREF_LE(m->base_mft_record);
1015	ntfs_debug("Mft record 0x%lx is an extent record. Looking for base "
1016	"inode 0x%lx in icache.", mft_no, na.mft_no);
1017	if (!na.mft_no) {
1018	/ Balance the below iput(). /
1019	vi = igrab(mft_vi);
1020	BUG_ON(vi != mft_vi);
1021	} else
1022	vi = ilookup5_nowait(sb, hashval: na.mft_no, test: ntfs_test_inode,
1023	data: &na);
1024	if (!vi) {
1025	/*
1026	* The base inode is not in icache, write this extent mft
1027	* record.
1028	*/
1029	ntfs_debug("Base inode 0x%lx is not in icache, write the "
1030	"extent record.", na.mft_no);
1031	return true;
1032	}
1033	ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
1034	/*
1035	* The base inode is in icache. Check if it has the extent inode
1036	* corresponding to this extent mft record attached.
1037	*/
1038	ni = NTFS_I(inode: vi);
1039	mutex_lock(&ni->extent_lock);
1040	if (ni->nr_extents <= `0`) {
1041	/*
1042	* The base inode has no attached extent inodes, write this
1043	* extent mft record.
1044	*/
1045	mutex_unlock(lock: &ni->extent_lock);
1046	iput(vi);
1047	ntfs_debug("Base inode 0x%lx has no attached extent inodes, "
1048	"write the extent record.", na.mft_no);
1049	return true;
1050	}
1051	/ Iterate over the attached extent inodes. /
1052	extent_nis = ni->ext.extent_ntfs_inos;
1053	for (eni = NULL, i = `0`; i < ni->nr_extents; ++i) {
1054	if (mft_no == extent_nis[i]->mft_no) {
1055	/*
1056	* Found the extent inode corresponding to this extent
1057	* mft record.
1058	*/
1059	eni = extent_nis[i];
1060	break;
1061	}
1062	}
1063	/*
1064	* If the extent inode was not attached to the base inode, write this
1065	* extent mft record.
1066	*/
1067	if (!eni) {
1068	mutex_unlock(lock: &ni->extent_lock);
1069	iput(vi);
1070	ntfs_debug("Extent inode 0x%lx is not attached to its base "
1071	"inode 0x%lx, write the extent record.",
1072	mft_no, na.mft_no);
1073	return true;
1074	}
1075	ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.",
1076	mft_no, na.mft_no);
1077	/ Take a reference to the extent ntfs inode. /
1078	atomic_inc(v: &eni->count);
1079	mutex_unlock(lock: &ni->extent_lock);
1080	/*
1081	* Found the extent inode coresponding to this extent mft record.
1082	* Try to take the mft record lock.
1083	*/
1084	if (unlikely(!mutex_trylock(&eni->mrec_lock))) {
1085	atomic_dec(v: &eni->count);
1086	iput(vi);
1087	ntfs_debug("Extent mft record 0x%lx is already locked, do "
1088	"not write it.", mft_no);
1089	return false;
1090	}
1091	ntfs_debug("Managed to lock extent mft record 0x%lx, write it.",
1092	mft_no);
1093	if (NInoTestClearDirty(ni: eni))
1094	ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.",
1095	mft_no);
1096	/*
1097	* The write has to occur while we hold the mft record lock so return
1098	* the locked extent ntfs inode.
1099	*/
1100	*locked_ni = eni;
1101	return true;
1102	}
1103
/*
 * Common suffix appended (via "%s") to error messages that are emitted when a
 * failed operation could not be rolled back cleanly.
 */
static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
		"chkdsk.";
1106
1107	/**
1108	* ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name
1109	* @vol: volume on which to search for a free mft record
1110	* @base_ni: open base inode if allocating an extent mft record or NULL
1111	*
1112	* Search for a free mft record in the mft bitmap attribute on the ntfs volume
1113	* @vol.
1114	*
1115	* If @base_ni is NULL start the search at the default allocator position.
1116	*
1117	* If @base_ni is not NULL start the search at the mft record after the base
1118	* mft record @base_ni.
1119	*
1120	* Return the free mft record on success and -errno on error. An error code of
1121	* -ENOSPC means that there are no free mft records in the currently
1122	* initialized mft bitmap.
1123	*
1124	* Locking: Caller must hold vol->mftbmp_lock for writing.
1125	*/
1126	static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
1127	ntfs_inode *base_ni)
1128	{
1129	s64 pass_end, ll, data_pos, pass_start, ofs, bit;
1130	unsigned long flags;
1131	struct address_space *mftbmp_mapping;
1132	u8 buf, byte;
1133	struct page *page;
1134	unsigned int page_ofs, size;
1135	u8 pass, b;
1136
1137	ntfs_debug("Searching for free mft record in the currently "
1138	"initialized mft bitmap.");
1139	mftbmp_mapping = vol->mftbmp_ino->i_mapping;
1140	/*
1141	* Set the end of the pass making sure we do not overflow the mft
1142	* bitmap.
1143	*/
1144	read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags);
1145	pass_end = NTFS_I(inode: vol->mft_ino)->allocated_size >>
1146	vol->mft_record_size_bits;
1147	read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags);
1148	read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
1149	ll = NTFS_I(inode: vol->mftbmp_ino)->initialized_size << `3`;
1150	read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
1151	if (pass_end > ll)
1152	pass_end = ll;
1153	pass = `1`;
1154	if (!base_ni)
1155	data_pos = vol->mft_data_pos;
1156	else
1157	data_pos = base_ni->mft_no + `1`;
1158	if (data_pos < `24`)
1159	data_pos = `24`;
1160	if (data_pos >= pass_end) {
1161	data_pos = `24`;
1162	pass = `2`;
1163	/ This happens on a freshly formatted volume. /
1164	if (data_pos >= pass_end)
1165	return -ENOSPC;
1166	}
1167	pass_start = data_pos;
1168	ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, "
1169	"pass_end 0x%llx, data_pos 0x%llx.", pass,
1170	(long long)pass_start, (long long)pass_end,
1171	(long long)data_pos);
1172	/ Loop until a free mft record is found. /
1173	for (; pass <= `2`;) {
1174	/ Cap size to pass_end. /
1175	ofs = data_pos >> `3`;
1176	page_ofs = ofs & ~PAGE_MASK;
1177	size = PAGE_SIZE - page_ofs;
1178	ll = ((pass_end + `7`) >> `3`) - ofs;
1179	if (size > ll)
1180	size = ll;
1181	size <<= `3`;
1182	/*
1183	* If we are still within the active pass, search the next page
1184	* for a zero bit.
1185	*/
1186	if (size) {
1187	page = ntfs_map_page(mapping: mftbmp_mapping,
1188	index: ofs >> PAGE_SHIFT);
1189	if (IS_ERR(ptr: page)) {
1190	ntfs_error(vol->sb, "Failed to read mft "
1191	"bitmap, aborting.");
1192	return PTR_ERR(ptr: page);
1193	}
1194	buf = (u8*)page_address(page) + page_ofs;
1195	bit = data_pos & `7`;
1196	data_pos &= ~`7ull`;
1197	ntfs_debug("Before inner for loop: size 0x%x, "
1198	"data_pos 0x%llx, bit 0x%llx", size,
1199	(long long)data_pos, (long long)bit);
1200	for (; bit < size && data_pos + bit < pass_end;
1201	bit &= ~`7ull`, bit += `8`) {
1202	byte = buf + (bit >> `3`);
1203	if (*byte == `0xff`)
1204	continue;
1205	b = ffz((unsigned long)*byte);
1206	if (b < `8` && b >= (bit & `7`)) {
1207	ll = data_pos + (bit & ~`7ull`) + b;
1208	if (unlikely(ll > (`1ll` << `32`))) {
1209	ntfs_unmap_page(page);
1210	return -ENOSPC;
1211	}
1212	*byte \|= `1` << b;
1213	flush_dcache_page(page);
1214	set_page_dirty(page);
1215	ntfs_unmap_page(page);
1216	ntfs_debug("Done. (Found and "
1217	"allocated mft record "
1218	"0x%llx.)",
1219	(long long)ll);
1220	return ll;
1221	}
1222	}
1223	ntfs_debug("After inner for loop: size 0x%x, "
1224	"data_pos 0x%llx, bit 0x%llx", size,
1225	(long long)data_pos, (long long)bit);
1226	data_pos += size;
1227	ntfs_unmap_page(page);
1228	/*
1229	* If the end of the pass has not been reached yet,
1230	* continue searching the mft bitmap for a zero bit.
1231	*/
1232	if (data_pos < pass_end)
1233	continue;
1234	}
1235	/ Do the next pass. /
1236	if (++pass == `2`) {
1237	/*
1238	* Starting the second pass, in which we scan the first
1239	* part of the zone which we omitted earlier.
1240	*/
1241	pass_end = pass_start;
1242	data_pos = pass_start = `24`;
1243	ntfs_debug("pass %i, pass_start 0x%llx, pass_end "
1244	"0x%llx.", pass, (long long)pass_start,
1245	(long long)pass_end);
1246	if (data_pos >= pass_end)
1247	break;
1248	}
1249	}
1250	/ No free mft records in currently initialized mft bitmap. /
1251	ntfs_debug("Done. (No free mft records left in currently initialized "
1252	"mft bitmap.)");
1253	return -ENOSPC;
1254	}
1255
1256	/**
1257	* ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
1258	* @vol: volume on which to extend the mft bitmap attribute
1259	*
1260	* Extend the mft bitmap attribute on the ntfs volume @vol by one cluster.
1261	*
1262	* Note: Only changes allocated_size, i.e. does not touch initialized_size or
1263	* data_size.
1264	*
1265	* Return 0 on success and -errno on error.
1266	*
1267	* Locking: - Caller must hold vol->mftbmp_lock for writing.
1268	* - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for
1269	* writing and releases it before returning.
1270	* - This function takes vol->lcnbmp_lock for writing and releases it
1271	* before returning.
1272	*/
1273	static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
1274	{
1275	LCN lcn;
1276	s64 ll;
1277	unsigned long flags;
1278	struct page *page;
1279	ntfs_inode mft_ni, mftbmp_ni;
1280	runlist_element rl, rl2 = NULL;
1281	ntfs_attr_search_ctx *ctx = NULL;
1282	MFT_RECORD *mrec;
1283	ATTR_RECORD *a = NULL;
1284	int ret, mp_size;
1285	u32 old_alen = `0`;
1286	u8 *b, tb;
1287	struct {
1288	u8 added_cluster:`1`;
1289	u8 added_run:`1`;
1290	u8 mp_rebuilt:`1`;
1291	} status = { `0`, `0`, `0` };
1292
1293	ntfs_debug("Extending mft bitmap allocation.");
1294	mft_ni = NTFS_I(inode: vol->mft_ino);
1295	mftbmp_ni = NTFS_I(inode: vol->mftbmp_ino);
1296	/*
1297	* Determine the last lcn of the mft bitmap. The allocated size of the
1298	* mft bitmap cannot be zero so we are ok to do this.
1299	*/
1300	down_write(sem: &mftbmp_ni->runlist.lock);
1301	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
1302	ll = mftbmp_ni->allocated_size;
1303	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1304	rl = ntfs_attr_find_vcn_nolock(ni: mftbmp_ni,
1305	vcn: (ll - `1`) >> vol->cluster_size_bits, NULL);
1306	if (IS_ERR(ptr: rl) \|\| unlikely(!rl->length \|\| rl->lcn < `0`)) {
1307	up_write(sem: &mftbmp_ni->runlist.lock);
1308	ntfs_error(vol->sb, "Failed to determine last allocated "
1309	"cluster of mft bitmap attribute.");
1310	if (!IS_ERR(ptr: rl))
1311	ret = -EIO;
1312	else
1313	ret = PTR_ERR(ptr: rl);
1314	return ret;
1315	}
1316	lcn = rl->lcn + rl->length;
1317	ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
1318	(long long)lcn);
1319	/*
1320	* Attempt to get the cluster following the last allocated cluster by
1321	* hand as it may be in the MFT zone so the allocator would not give it
1322	* to us.
1323	*/
1324	ll = lcn >> `3`;
1325	page = ntfs_map_page(mapping: vol->lcnbmp_ino->i_mapping,
1326	index: ll >> PAGE_SHIFT);
1327	if (IS_ERR(ptr: page)) {
1328	up_write(sem: &mftbmp_ni->runlist.lock);
1329	ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
1330	return PTR_ERR(ptr: page);
1331	}
1332	b = (u8*)page_address(page) + (ll & ~PAGE_MASK);
1333	tb = `1` << (lcn & `7ull`);
1334	down_write(sem: &vol->lcnbmp_lock);
1335	if (b != `0xff` && !(b & tb)) {
1336	/ Next cluster is free, allocate it. /
1337	*b \|= tb;
1338	flush_dcache_page(page);
1339	set_page_dirty(page);
1340	up_write(sem: &vol->lcnbmp_lock);
1341	ntfs_unmap_page(page);
1342	/ Update the mft bitmap runlist. /
1343	rl->length++;
1344	rl[`1`].vcn++;
1345	status.added_cluster = `1`;
1346	ntfs_debug("Appending one cluster to mft bitmap.");
1347	} else {
1348	up_write(sem: &vol->lcnbmp_lock);
1349	ntfs_unmap_page(page);
1350	/ Allocate a cluster from the DATA_ZONE. /
1351	rl2 = ntfs_cluster_alloc(vol, start_vcn: rl[`1`].vcn, count: `1`, start_lcn: lcn, zone: DATA_ZONE,
1352	is_extension: true);
1353	if (IS_ERR(ptr: rl2)) {
1354	up_write(sem: &mftbmp_ni->runlist.lock);
1355	ntfs_error(vol->sb, "Failed to allocate a cluster for "
1356	"the mft bitmap.");
1357	return PTR_ERR(ptr: rl2);
1358	}
1359	rl = ntfs_runlists_merge(drl: mftbmp_ni->runlist.rl, srl: rl2);
1360	if (IS_ERR(ptr: rl)) {
1361	up_write(sem: &mftbmp_ni->runlist.lock);
1362	ntfs_error(vol->sb, "Failed to merge runlists for mft "
1363	"bitmap.");
1364	if (ntfs_cluster_free_from_rl(vol, rl: rl2)) {
1365	ntfs_error(vol->sb, "Failed to deallocate "
1366	"allocated cluster.%s", es);
1367	NVolSetErrors(vol);
1368	}
1369	ntfs_free(addr: rl2);
1370	return PTR_ERR(ptr: rl);
1371	}
1372	mftbmp_ni->runlist.rl = rl;
1373	status.added_run = `1`;
1374	ntfs_debug("Adding one run to mft bitmap.");
1375	/ Find the last run in the new runlist. /
1376	for (; rl[`1`].length; rl++)
1377	;
1378	}
1379	/*
1380	* Update the attribute record as well. Note: @rl is the last
1381	* (non-terminator) runlist element of mft bitmap.
1382	*/
1383	mrec = map_mft_record(ni: mft_ni);
1384	if (IS_ERR(ptr: mrec)) {
1385	ntfs_error(vol->sb, "Failed to map mft record.");
1386	ret = PTR_ERR(ptr: mrec);
1387	goto undo_alloc;
1388	}
1389	ctx = ntfs_attr_get_search_ctx(ni: mft_ni, mrec);
1390	if (unlikely(!ctx)) {
1391	ntfs_error(vol->sb, "Failed to get search context.");
1392	ret = -ENOMEM;
1393	goto undo_alloc;
1394	}
1395	ret = ntfs_attr_lookup(type: mftbmp_ni->type, name: mftbmp_ni->name,
1396	name_len: mftbmp_ni->name_len, ic: CASE_SENSITIVE, lowest_vcn: rl[`1`].vcn, NULL,
1397	val_len: `0`, ctx);
1398	if (unlikely(ret)) {
1399	ntfs_error(vol->sb, "Failed to find last attribute extent of "
1400	"mft bitmap attribute.");
1401	if (ret == -ENOENT)
1402	ret = -EIO;
1403	goto undo_alloc;
1404	}
1405	a = ctx->attr;
1406	ll = sle64_to_cpu(x: a->data.non_resident.lowest_vcn);
1407	/ Search back for the previous last allocated cluster of mft bitmap. /
1408	for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) {
1409	if (ll >= rl2->vcn)
1410	break;
1411	}
1412	BUG_ON(ll < rl2->vcn);
1413	BUG_ON(ll >= rl2->vcn + rl2->length);
1414	/ Get the size for the new mapping pairs array for this extent. /
1415	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl: rl2, first_vcn: ll, last_vcn: -`1`);
1416	if (unlikely(mp_size <= `0`)) {
1417	ntfs_error(vol->sb, "Get size for mapping pairs failed for "
1418	"mft bitmap attribute extent.");
1419	ret = mp_size;
1420	if (!ret)
1421	ret = -EIO;
1422	goto undo_alloc;
1423	}
1424	/ Expand the attribute record if necessary. /
1425	old_alen = le32_to_cpu(a->length);
1426	ret = ntfs_attr_record_resize(m: ctx->mrec, a, new_size: mp_size +
1427	le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
1428	if (unlikely(ret)) {
1429	if (ret != -ENOSPC) {
1430	ntfs_error(vol->sb, "Failed to resize attribute "
1431	"record for mft bitmap attribute.");
1432	goto undo_alloc;
1433	}
1434	// TODO: Deal with this by moving this extent to a new mft
1435	// record or by starting a new extent in a new mft record or by
1436	// moving other attributes out of this mft record.
1437	// Note: It will need to be a special mft record and if none of
1438	// those are available it gets rather complicated...
1439	ntfs_error(vol->sb, "Not enough space in this mft record to "
1440	"accommodate extended mft bitmap attribute "
1441	"extent. Cannot handle this yet.");
1442	ret = -EOPNOTSUPP;
1443	goto undo_alloc;
1444	}
1445	status.mp_rebuilt = `1`;
1446	/ Generate the mapping pairs array directly into the attr record. /
1447	ret = ntfs_mapping_pairs_build(vol, dst: (u8*)a +
1448	le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
1449	dst_len: mp_size, rl: rl2, first_vcn: ll, last_vcn: -`1`, NULL);
1450	if (unlikely(ret)) {
1451	ntfs_error(vol->sb, "Failed to build mapping pairs array for "
1452	"mft bitmap attribute.");
1453	goto undo_alloc;
1454	}
1455	/ Update the highest_vcn. /
1456	a->data.non_resident.highest_vcn = cpu_to_sle64(x: rl[`1`].vcn - `1`);
1457	/*
1458	* We now have extended the mft bitmap allocated_size by one cluster.
1459	* Reflect this in the ntfs_inode structure and the attribute record.
1460	*/
1461	if (a->data.non_resident.lowest_vcn) {
1462	/*
1463	* We are not in the first attribute extent, switch to it, but
1464	* first ensure the changes will make it to disk later.
1465	*/
1466	flush_dcache_mft_record_page(ni: ctx->ntfs_ino);
1467	mark_mft_record_dirty(ni: ctx->ntfs_ino);
1468	ntfs_attr_reinit_search_ctx(ctx);
1469	ret = ntfs_attr_lookup(type: mftbmp_ni->type, name: mftbmp_ni->name,
1470	name_len: mftbmp_ni->name_len, ic: CASE_SENSITIVE, lowest_vcn: `0`, NULL,
1471	val_len: `0`, ctx);
1472	if (unlikely(ret)) {
1473	ntfs_error(vol->sb, "Failed to find first attribute "
1474	"extent of mft bitmap attribute.");
1475	goto restore_undo_alloc;
1476	}
1477	a = ctx->attr;
1478	}
1479	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1480	mftbmp_ni->allocated_size += vol->cluster_size;
1481	a->data.non_resident.allocated_size =
1482	cpu_to_sle64(x: mftbmp_ni->allocated_size);
1483	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1484	/ Ensure the changes make it to disk. /
1485	flush_dcache_mft_record_page(ni: ctx->ntfs_ino);
1486	mark_mft_record_dirty(ni: ctx->ntfs_ino);
1487	ntfs_attr_put_search_ctx(ctx);
1488	unmap_mft_record(ni: mft_ni);
1489	up_write(sem: &mftbmp_ni->runlist.lock);
1490	ntfs_debug("Done.");
1491	return `0`;
1492	restore_undo_alloc:
1493	ntfs_attr_reinit_search_ctx(ctx);
1494	if (ntfs_attr_lookup(type: mftbmp_ni->type, name: mftbmp_ni->name,
1495	name_len: mftbmp_ni->name_len, ic: CASE_SENSITIVE, lowest_vcn: rl[`1`].vcn, NULL,
1496	val_len: `0`, ctx)) {
1497	ntfs_error(vol->sb, "Failed to find last attribute extent of "
1498	"mft bitmap attribute.%s", es);
1499	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1500	mftbmp_ni->allocated_size += vol->cluster_size;
1501	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1502	ntfs_attr_put_search_ctx(ctx);
1503	unmap_mft_record(ni: mft_ni);
1504	up_write(sem: &mftbmp_ni->runlist.lock);
1505	/*
1506	* The only thing that is now wrong is ->allocated_size of the
1507	* base attribute extent which chkdsk should be able to fix.
1508	*/
1509	NVolSetErrors(vol);
1510	return ret;
1511	}
1512	a = ctx->attr;
1513	a->data.non_resident.highest_vcn = cpu_to_sle64(x: rl[`1`].vcn - `2`);
1514	undo_alloc:
1515	if (status.added_cluster) {
1516	/ Truncate the last run in the runlist by one cluster. /
1517	rl->length--;
1518	rl[`1`].vcn--;
1519	} else if (status.added_run) {
1520	lcn = rl->lcn;
1521	/ Remove the last run from the runlist. /
1522	rl->lcn = rl[`1`].lcn;
1523	rl->length = `0`;
1524	}
1525	/ Deallocate the cluster. /
1526	down_write(sem: &vol->lcnbmp_lock);
1527	if (ntfs_bitmap_clear_bit(vi: vol->lcnbmp_ino, bit: lcn)) {
1528	ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
1529	NVolSetErrors(vol);
1530	}
1531	up_write(sem: &vol->lcnbmp_lock);
1532	if (status.mp_rebuilt) {
1533	if (ntfs_mapping_pairs_build(vol, dst: (u8*)a + le16_to_cpu(
1534	a->data.non_resident.mapping_pairs_offset),
1535	dst_len: old_alen - le16_to_cpu(
1536	a->data.non_resident.mapping_pairs_offset),
1537	rl: rl2, first_vcn: ll, last_vcn: -`1`, NULL)) {
1538	ntfs_error(vol->sb, "Failed to restore mapping pairs "
1539	"array.%s", es);
1540	NVolSetErrors(vol);
1541	}
1542	if (ntfs_attr_record_resize(m: ctx->mrec, a, new_size: old_alen)) {
1543	ntfs_error(vol->sb, "Failed to restore attribute "
1544	"record.%s", es);
1545	NVolSetErrors(vol);
1546	}
1547	flush_dcache_mft_record_page(ni: ctx->ntfs_ino);
1548	mark_mft_record_dirty(ni: ctx->ntfs_ino);
1549	}
1550	if (ctx)
1551	ntfs_attr_put_search_ctx(ctx);
1552	if (!IS_ERR(ptr: mrec))
1553	unmap_mft_record(ni: mft_ni);
1554	up_write(sem: &mftbmp_ni->runlist.lock);
1555	return ret;
1556	}
1557
1558	/**
1559	* ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
1560	* @vol: volume on which to extend the mft bitmap attribute
1561	*
1562	* Extend the initialized portion of the mft bitmap attribute on the ntfs
1563	* volume @vol by 8 bytes.
1564	*
1565	* Note: Only changes initialized_size and data_size, i.e. requires that
1566	* allocated_size is big enough to fit the new initialized_size.
1567	*
1568	* Return 0 on success and -error on error.
1569	*
1570	* Locking: Caller must hold vol->mftbmp_lock for writing.
1571	*/
1572	static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
1573	{
1574	s64 old_data_size, old_initialized_size;
1575	unsigned long flags;
1576	struct inode *mftbmp_vi;
1577	ntfs_inode mft_ni, mftbmp_ni;
1578	ntfs_attr_search_ctx *ctx;
1579	MFT_RECORD *mrec;
1580	ATTR_RECORD *a;
1581	int ret;
1582
1583	ntfs_debug("Extending mft bitmap initiailized (and data) size.");
1584	mft_ni = NTFS_I(inode: vol->mft_ino);
1585	mftbmp_vi = vol->mftbmp_ino;
1586	mftbmp_ni = NTFS_I(inode: mftbmp_vi);
1587	/ Get the attribute record. /
1588	mrec = map_mft_record(ni: mft_ni);
1589	if (IS_ERR(ptr: mrec)) {
1590	ntfs_error(vol->sb, "Failed to map mft record.");
1591	return PTR_ERR(ptr: mrec);
1592	}
1593	ctx = ntfs_attr_get_search_ctx(ni: mft_ni, mrec);
1594	if (unlikely(!ctx)) {
1595	ntfs_error(vol->sb, "Failed to get search context.");
1596	ret = -ENOMEM;
1597	goto unm_err_out;
1598	}
1599	ret = ntfs_attr_lookup(type: mftbmp_ni->type, name: mftbmp_ni->name,
1600	name_len: mftbmp_ni->name_len, ic: CASE_SENSITIVE, lowest_vcn: `0`, NULL, val_len: `0`, ctx);
1601	if (unlikely(ret)) {
1602	ntfs_error(vol->sb, "Failed to find first attribute extent of "
1603	"mft bitmap attribute.");
1604	if (ret == -ENOENT)
1605	ret = -EIO;
1606	goto put_err_out;
1607	}
1608	a = ctx->attr;
1609	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1610	old_data_size = i_size_read(inode: mftbmp_vi);
1611	old_initialized_size = mftbmp_ni->initialized_size;
1612	/*
1613	* We can simply update the initialized_size before filling the space
1614	* with zeroes because the caller is holding the mft bitmap lock for
1615	* writing which ensures that no one else is trying to access the data.
1616	*/
1617	mftbmp_ni->initialized_size += `8`;
1618	a->data.non_resident.initialized_size =
1619	cpu_to_sle64(x: mftbmp_ni->initialized_size);
1620	if (mftbmp_ni->initialized_size > old_data_size) {
1621	i_size_write(inode: mftbmp_vi, i_size: mftbmp_ni->initialized_size);
1622	a->data.non_resident.data_size =
1623	cpu_to_sle64(x: mftbmp_ni->initialized_size);
1624	}
1625	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1626	/ Ensure the changes make it to disk. /
1627	flush_dcache_mft_record_page(ni: ctx->ntfs_ino);
1628	mark_mft_record_dirty(ni: ctx->ntfs_ino);
1629	ntfs_attr_put_search_ctx(ctx);
1630	unmap_mft_record(ni: mft_ni);
1631	/ Initialize the mft bitmap attribute value with zeroes. /
1632	ret = ntfs_attr_set(ni: mftbmp_ni, ofs: old_initialized_size, cnt: `8`, val: `0`);
1633	if (likely(!ret)) {
1634	ntfs_debug("Done. (Wrote eight initialized bytes to mft "
1635	"bitmap.");
1636	return `0`;
1637	}
1638	ntfs_error(vol->sb, "Failed to write to mft bitmap.");
1639	/ Try to recover from the error. /
1640	mrec = map_mft_record(ni: mft_ni);
1641	if (IS_ERR(ptr: mrec)) {
1642	ntfs_error(vol->sb, "Failed to map mft record.%s", es);
1643	NVolSetErrors(vol);
1644	return ret;
1645	}
1646	ctx = ntfs_attr_get_search_ctx(ni: mft_ni, mrec);
1647	if (unlikely(!ctx)) {
1648	ntfs_error(vol->sb, "Failed to get search context.%s", es);
1649	NVolSetErrors(vol);
1650	goto unm_err_out;
1651	}
1652	if (ntfs_attr_lookup(type: mftbmp_ni->type, name: mftbmp_ni->name,
1653	name_len: mftbmp_ni->name_len, ic: CASE_SENSITIVE, lowest_vcn: `0`, NULL, val_len: `0`, ctx)) {
1654	ntfs_error(vol->sb, "Failed to find first attribute extent of "
1655	"mft bitmap attribute.%s", es);
1656	NVolSetErrors(vol);
1657	put_err_out:
1658	ntfs_attr_put_search_ctx(ctx);
1659	unm_err_out:
1660	unmap_mft_record(ni: mft_ni);
1661	goto err_out;
1662	}
1663	a = ctx->attr;
1664	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
1665	mftbmp_ni->initialized_size = old_initialized_size;
1666	a->data.non_resident.initialized_size =
1667	cpu_to_sle64(x: old_initialized_size);
1668	if (i_size_read(inode: mftbmp_vi) != old_data_size) {
1669	i_size_write(inode: mftbmp_vi, i_size: old_data_size);
1670	a->data.non_resident.data_size = cpu_to_sle64(x: old_data_size);
1671	}
1672	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1673	flush_dcache_mft_record_page(ni: ctx->ntfs_ino);
1674	mark_mft_record_dirty(ni: ctx->ntfs_ino);
1675	ntfs_attr_put_search_ctx(ctx);
1676	unmap_mft_record(ni: mft_ni);
1677	#ifdef DEBUG
1678	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
1679	ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
1680	"data_size 0x%llx, initialized_size 0x%llx.",
1681	(long long)mftbmp_ni->allocated_size,
1682	(long long)i_size_read(mftbmp_vi),
1683	(long long)mftbmp_ni->initialized_size);
1684	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
1685	#endif /* DEBUG */
1686	err_out:
1687	return ret;
1688	}
1689
1690	/**
1691	* ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
1692	* @vol: volume on which to extend the mft data attribute
1693	*
1694	* Extend the mft data attribute on the ntfs volume @vol by 16 mft records
1695	* worth of clusters or if not enough space for this by one mft record worth
1696	* of clusters.
1697	*
1698	* Note: Only changes allocated_size, i.e. does not touch initialized_size or
1699	* data_size.
1700	*
1701	* Return 0 on success and -errno on error.
1702	*
1703	* Locking: - Caller must hold vol->mftbmp_lock for writing.
1704	* - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
1705	* writing and releases it before returning.
1706	* - This function calls functions which take vol->lcnbmp_lock for
1707	* writing and release it before returning.
1708	*/
1709	static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
1710	{
1711	LCN lcn;
1712	VCN old_last_vcn;
1713	s64 min_nr, nr, ll;
1714	unsigned long flags;
1715	ntfs_inode *mft_ni;
1716	runlist_element rl, rl2;
1717	ntfs_attr_search_ctx *ctx = NULL;
1718	MFT_RECORD *mrec;
1719	ATTR_RECORD *a = NULL;
1720	int ret, mp_size;
1721	u32 old_alen = `0`;
1722	bool mp_rebuilt = false;
1723
1724	ntfs_debug("Extending mft data allocation.");
1725	mft_ni = NTFS_I(inode: vol->mft_ino);
1726	/*
1727	* Determine the preferred allocation location, i.e. the last lcn of
1728	* the mft data attribute. The allocated size of the mft data
1729	* attribute cannot be zero so we are ok to do this.
1730	*/
1731	down_write(sem: &mft_ni->runlist.lock);
1732	read_lock_irqsave(&mft_ni->size_lock, flags);
1733	ll = mft_ni->allocated_size;
1734	read_unlock_irqrestore(&mft_ni->size_lock, flags);
1735	rl = ntfs_attr_find_vcn_nolock(ni: mft_ni,
1736	vcn: (ll - `1`) >> vol->cluster_size_bits, NULL);
1737	if (IS_ERR(ptr: rl) \|\| unlikely(!rl->length \|\| rl->lcn < `0`)) {
1738	up_write(sem: &mft_ni->runlist.lock);
1739	ntfs_error(vol->sb, "Failed to determine last allocated "
1740	"cluster of mft data attribute.");
1741	if (!IS_ERR(ptr: rl))
1742	ret = -EIO;
1743	else
1744	ret = PTR_ERR(ptr: rl);
1745	return ret;
1746	}
1747	lcn = rl->lcn + rl->length;
1748	ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn);
1749	/ Minimum allocation is one mft record worth of clusters. /
1750	min_nr = vol->mft_record_size >> vol->cluster_size_bits;
1751	if (!min_nr)
1752	min_nr = `1`;
1753	/ Want to allocate 16 mft records worth of clusters. /
1754	nr = vol->mft_record_size << `4` >> vol->cluster_size_bits;
1755	if (!nr)
1756	nr = min_nr;
1757	/ Ensure we do not go above 2^32-1 mft records. /
1758	read_lock_irqsave(&mft_ni->size_lock, flags);
1759	ll = mft_ni->allocated_size;
1760	read_unlock_irqrestore(&mft_ni->size_lock, flags);
1761	if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
1762	vol->mft_record_size_bits >= (`1ll` << `32`))) {
1763	nr = min_nr;
1764	if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
1765	vol->mft_record_size_bits >= (`1ll` << `32`))) {
1766	ntfs_warning(vol->sb, "Cannot allocate mft record "
1767	"because the maximum number of inodes "
1768	"(2^32) has already been reached.");
1769	up_write(sem: &mft_ni->runlist.lock);
1770	return -ENOSPC;
1771	}
1772	}
1773	ntfs_debug("Trying mft data allocation with %s cluster count %lli.",
1774	nr > min_nr ? "default" : "minimal", (long long)nr);
1775	old_last_vcn = rl[`1`].vcn;
1776	do {
1777	rl2 = ntfs_cluster_alloc(vol, start_vcn: old_last_vcn, count: nr, start_lcn: lcn, zone: MFT_ZONE,
1778	is_extension: true);
1779	if (!IS_ERR(ptr: rl2))
1780	break;
1781	if (PTR_ERR(ptr: rl2) != -ENOSPC \|\| nr == min_nr) {
1782	ntfs_error(vol->sb, "Failed to allocate the minimal "
1783	"number of clusters (%lli) for the "
1784	"mft data attribute.", (long long)nr);
1785	up_write(sem: &mft_ni->runlist.lock);
1786	return PTR_ERR(ptr: rl2);
1787	}
1788	/*
1789	* There is not enough space to do the allocation, but there
1790	* might be enough space to do a minimal allocation so try that
1791	* before failing.
1792	*/
1793	nr = min_nr;
1794	ntfs_debug("Retrying mft data allocation with minimal cluster "
1795	"count %lli.", (long long)nr);
1796	} while (`1`);
1797	rl = ntfs_runlists_merge(drl: mft_ni->runlist.rl, srl: rl2);
1798	if (IS_ERR(ptr: rl)) {
1799	up_write(sem: &mft_ni->runlist.lock);
1800	ntfs_error(vol->sb, "Failed to merge runlists for mft data "
1801	"attribute.");
1802	if (ntfs_cluster_free_from_rl(vol, rl: rl2)) {
1803	ntfs_error(vol->sb, "Failed to deallocate clusters "
1804	"from the mft data attribute.%s", es);
1805	NVolSetErrors(vol);
1806	}
1807	ntfs_free(addr: rl2);
1808	return PTR_ERR(ptr: rl);
1809	}
1810	mft_ni->runlist.rl = rl;
1811	ntfs_debug("Allocated %lli clusters.", (long long)nr);
1812	/ Find the last run in the new runlist. /
1813	for (; rl[`1`].length; rl++)
1814	;
1815	/ Update the attribute record as well. /
1816	mrec = map_mft_record(ni: mft_ni);
1817	if (IS_ERR(ptr: mrec)) {
1818	ntfs_error(vol->sb, "Failed to map mft record.");
1819	ret = PTR_ERR(ptr: mrec);
1820	goto undo_alloc;
1821	}
1822	ctx = ntfs_attr_get_search_ctx(ni: mft_ni, mrec);
1823	if (unlikely(!ctx)) {
1824	ntfs_error(vol->sb, "Failed to get search context.");
1825	ret = -ENOMEM;
1826	goto undo_alloc;
1827	}
1828	ret = ntfs_attr_lookup(type: mft_ni->type, name: mft_ni->name, name_len: mft_ni->name_len,
1829	ic: CASE_SENSITIVE, lowest_vcn: rl[`1`].vcn, NULL, val_len: `0`, ctx);
1830	if (unlikely(ret)) {
1831	ntfs_error(vol->sb, "Failed to find last attribute extent of "
1832	"mft data attribute.");
1833	if (ret == -ENOENT)
1834	ret = -EIO;
1835	goto undo_alloc;
1836	}
1837	a = ctx->attr;
1838	ll = sle64_to_cpu(x: a->data.non_resident.lowest_vcn);
1839	/ Search back for the previous last allocated cluster of mft bitmap. /
1840	for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
1841	if (ll >= rl2->vcn)
1842	break;
1843	}
1844	BUG_ON(ll < rl2->vcn);
1845	BUG_ON(ll >= rl2->vcn + rl2->length);
1846	/ Get the size for the new mapping pairs array for this extent. /
1847	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl: rl2, first_vcn: ll, last_vcn: -`1`);
1848	if (unlikely(mp_size <= `0`)) {
1849	ntfs_error(vol->sb, "Get size for mapping pairs failed for "
1850	"mft data attribute extent.");
1851	ret = mp_size;
1852	if (!ret)
1853	ret = -EIO;
1854	goto undo_alloc;
1855	}
1856	/ Expand the attribute record if necessary. /
1857	old_alen = le32_to_cpu(a->length);
1858	ret = ntfs_attr_record_resize(m: ctx->mrec, a, new_size: mp_size +
1859	le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
1860	if (unlikely(ret)) {
1861	if (ret != -ENOSPC) {
1862	ntfs_error(vol->sb, "Failed to resize attribute "
1863	"record for mft data attribute.");
1864	goto undo_alloc;
1865	}
1866	// TODO: Deal with this by moving this extent to a new mft
1867	// record or by starting a new extent in a new mft record or by
1868	// moving other attributes out of this mft record.
1869	// Note: Use the special reserved mft records and ensure that
1870	// this extent is not required to find the mft record in
1871	// question. If no free special records left we would need to
1872	// move an existing record away, insert ours in its place, and
1873	// then place the moved record into the newly allocated space
1874	// and we would then need to update all references to this mft
1875	// record appropriately. This is rather complicated...
1876	ntfs_error(vol->sb, "Not enough space in this mft record to "
1877	"accommodate extended mft data attribute "
1878	"extent. Cannot handle this yet.");
1879	ret = -EOPNOTSUPP;
1880	goto undo_alloc;
1881	}
1882	mp_rebuilt = true;
1883	/ Generate the mapping pairs array directly into the attr record. /
1884	ret = ntfs_mapping_pairs_build(vol, dst: (u8*)a +
1885	le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
1886	dst_len: mp_size, rl: rl2, first_vcn: ll, last_vcn: -`1`, NULL);
1887	if (unlikely(ret)) {
1888	ntfs_error(vol->sb, "Failed to build mapping pairs array of "
1889	"mft data attribute.");
1890	goto undo_alloc;
1891	}
1892	/ Update the highest_vcn. /
1893	a->data.non_resident.highest_vcn = cpu_to_sle64(x: rl[`1`].vcn - `1`);
1894	/*
1895	* We now have extended the mft data allocated_size by nr clusters.
1896	* Reflect this in the ntfs_inode structure and the attribute record.
1897	* @rl is the last (non-terminator) runlist element of mft data
1898	* attribute.
1899	*/
1900	if (a->data.non_resident.lowest_vcn) {
1901	/*
1902	* We are not in the first attribute extent, switch to it, but
1903	* first ensure the changes will make it to disk later.
1904	*/
1905	flush_dcache_mft_record_page(ni: ctx->ntfs_ino);
1906	mark_mft_record_dirty(ni: ctx->ntfs_ino);
1907	ntfs_attr_reinit_search_ctx(ctx);
1908	ret = ntfs_attr_lookup(type: mft_ni->type, name: mft_ni->name,
1909	name_len: mft_ni->name_len, ic: CASE_SENSITIVE, lowest_vcn: `0`, NULL, val_len: `0`,
1910	ctx);
1911	if (unlikely(ret)) {
1912	ntfs_error(vol->sb, "Failed to find first attribute "
1913	"extent of mft data attribute.");
1914	goto restore_undo_alloc;
1915	}
1916	a = ctx->attr;
1917	}
1918	write_lock_irqsave(&mft_ni->size_lock, flags);
1919	mft_ni->allocated_size += nr << vol->cluster_size_bits;
1920	a->data.non_resident.allocated_size =
1921	cpu_to_sle64(x: mft_ni->allocated_size);
1922	write_unlock_irqrestore(&mft_ni->size_lock, flags);
1923	/ Ensure the changes make it to disk. /
1924	flush_dcache_mft_record_page(ni: ctx->ntfs_ino);
1925	mark_mft_record_dirty(ni: ctx->ntfs_ino);
1926	ntfs_attr_put_search_ctx(ctx);
1927	unmap_mft_record(ni: mft_ni);
1928	up_write(sem: &mft_ni->runlist.lock);
1929	ntfs_debug("Done.");
1930	return `0`;
1931	restore_undo_alloc:
1932	ntfs_attr_reinit_search_ctx(ctx);
1933	if (ntfs_attr_lookup(type: mft_ni->type, name: mft_ni->name, name_len: mft_ni->name_len,
1934	ic: CASE_SENSITIVE, lowest_vcn: rl[`1`].vcn, NULL, val_len: `0`, ctx)) {
1935	ntfs_error(vol->sb, "Failed to find last attribute extent of "
1936	"mft data attribute.%s", es);
1937	write_lock_irqsave(&mft_ni->size_lock, flags);
1938	mft_ni->allocated_size += nr << vol->cluster_size_bits;
1939	write_unlock_irqrestore(&mft_ni->size_lock, flags);
1940	ntfs_attr_put_search_ctx(ctx);
1941	unmap_mft_record(ni: mft_ni);
1942	up_write(sem: &mft_ni->runlist.lock);
1943	/*
1944	* The only thing that is now wrong is ->allocated_size of the
1945	* base attribute extent which chkdsk should be able to fix.
1946	*/
1947	NVolSetErrors(vol);
1948	return ret;
1949	}
1950	ctx->attr->data.non_resident.highest_vcn =
1951	cpu_to_sle64(x: old_last_vcn - `1`);
1952	undo_alloc:
1953	if (ntfs_cluster_free(ni: mft_ni, start_vcn: old_last_vcn, count: -`1`, ctx) < `0`) {
1954	ntfs_error(vol->sb, "Failed to free clusters from mft data "
1955	"attribute.%s", es);
1956	NVolSetErrors(vol);
1957	}
1958
1959	if (ntfs_rl_truncate_nolock(vol, runlist: &mft_ni->runlist, new_length: old_last_vcn)) {
1960	ntfs_error(vol->sb, "Failed to truncate mft data attribute "
1961	"runlist.%s", es);
1962	NVolSetErrors(vol);
1963	}
1964	if (ctx) {
1965	a = ctx->attr;
1966	if (mp_rebuilt && !IS_ERR(ptr: ctx->mrec)) {
1967	if (ntfs_mapping_pairs_build(vol, dst: (u8 *)a + le16_to_cpu(
1968	a->data.non_resident.mapping_pairs_offset),
1969	dst_len: old_alen - le16_to_cpu(
1970	a->data.non_resident.mapping_pairs_offset),
1971	rl: rl2, first_vcn: ll, last_vcn: -`1`, NULL)) {
1972	ntfs_error(vol->sb, "Failed to restore mapping pairs "
1973	"array.%s", es);
1974	NVolSetErrors(vol);
1975	}
1976	if (ntfs_attr_record_resize(m: ctx->mrec, a, new_size: old_alen)) {
1977	ntfs_error(vol->sb, "Failed to restore attribute "
1978	"record.%s", es);
1979	NVolSetErrors(vol);
1980	}
1981	flush_dcache_mft_record_page(ni: ctx->ntfs_ino);
1982	mark_mft_record_dirty(ni: ctx->ntfs_ino);
1983	} else if (IS_ERR(ptr: ctx->mrec)) {
1984	ntfs_error(vol->sb, "Failed to restore attribute search "
1985	"context.%s", es);
1986	NVolSetErrors(vol);
1987	}
1988	ntfs_attr_put_search_ctx(ctx);
1989	}
1990	if (!IS_ERR(ptr: mrec))
1991	unmap_mft_record(ni: mft_ni);
1992	up_write(sem: &mft_ni->runlist.lock);
1993	return ret;
1994	}
1995
1996	/**
1997	* ntfs_mft_record_layout - layout an mft record into a memory buffer
1998	* @vol: volume to which the mft record will belong
1999	* @mft_no: mft reference specifying the mft record number
2000	* @m: destination buffer of size >= @vol->mft_record_size bytes
2001	*
2002	* Layout an empty, unused mft record with the mft record number @mft_no into
2003	* the buffer @m. The volume @vol is needed because the mft record structure
2004	* was modified in NTFS 3.1 so we need to know which volume version this mft
2005	* record will be used on.
2006	*
2007	* Return 0 on success and -errno on error.
2008	*/
2009	static int ntfs_mft_record_layout(const ntfs_volume vol, const* s64 mft_no,
2010	MFT_RECORD *m)
2011	{
2012	ATTR_RECORD *a;
2013
2014	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
2015	if (mft_no >= (`1ll` << `32`)) {
2016	ntfs_error(vol->sb, "Mft record number 0x%llx exceeds "
2017	"maximum of 2^32.", (long long)mft_no);
2018	return -ERANGE;
2019	}
2020	/ Start by clearing the whole mft record to gives us a clean slate. /
2021	memset(m, `0`, vol->mft_record_size);
2022	/ Aligned to 2-byte boundary. /
2023	if (vol->major_ver < `3` \|\| (vol->major_ver == `3` && !vol->minor_ver))
2024	m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + `1`) & ~`1`);
2025	else {
2026	m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + `1`) & ~`1`);
2027	/*
2028	* Set the NTFS 3.1+ specific fields while we know that the
2029	* volume version is 3.1+.
2030	*/
2031	m->reserved = `0`;
2032	m->mft_record_number = cpu_to_le32((u32)mft_no);
2033	}
2034	m->magic = magic_FILE;
2035	if (vol->mft_record_size >= NTFS_BLOCK_SIZE)
2036	m->usa_count = cpu_to_le16(vol->mft_record_size /
2037	NTFS_BLOCK_SIZE + `1`);
2038	else {
2039	m->usa_count = cpu_to_le16(`1`);
2040	ntfs_warning(vol->sb, "Sector size is bigger than mft record "
2041	"size. Setting usa_count to 1. If chkdsk "
2042	"reports this as corruption, please email "
2043	"linux-ntfs-dev@lists.sourceforge.net stating "
2044	"that you saw this message and that the "
2045	"modified filesystem created was corrupt. "
2046	"Thank you.");
2047	}
2048	/ Set the update sequence number to 1. /
2049	(le16)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(`1`);
2050	m->lsn = `0`;
2051	m->sequence_number = cpu_to_le16(`1`);
2052	m->link_count = `0`;
2053	/*
2054	* Place the attributes straight after the update sequence array,
2055	* aligned to 8-byte boundary.
2056	*/
2057	m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
2058	(le16_to_cpu(m->usa_count) << `1`) + `7`) & ~`7`);
2059	m->flags = `0`;
2060	/*
2061	* Using attrs_offset plus eight bytes (for the termination attribute).
2062	* attrs_offset is already aligned to 8-byte boundary, so no need to
2063	* align again.
2064	*/
2065	m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + `8`);
2066	m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
2067	m->base_mft_record = `0`;
2068	m->next_attr_instance = `0`;
2069	/ Add the termination attribute. /
2070	a = (ATTR_RECORD)((u8)m + le16_to_cpu(m->attrs_offset));
2071	a->type = AT_END;
2072	a->length = `0`;
2073	ntfs_debug("Done.");
2074	return `0`;
2075	}
2076
2077	/**
2078	* ntfs_mft_record_format - format an mft record on an ntfs volume
2079	* @vol: volume on which to format the mft record
2080	* @mft_no: mft record number to format
2081	*
2082	* Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
2083	* mft record into the appropriate place of the mft data attribute. This is
2084	* used when extending the mft data attribute.
2085	*
2086	* Return 0 on success and -errno on error.
2087	*/
2088	static int ntfs_mft_record_format(const ntfs_volume vol, const* s64 mft_no)
2089	{
2090	loff_t i_size;
2091	struct inode *mft_vi = vol->mft_ino;
2092	struct page *page;
2093	MFT_RECORD *m;
2094	pgoff_t index, end_index;
2095	unsigned int ofs;
2096	int err;
2097
2098	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
2099	/*
2100	* The index into the page cache and the offset within the page cache
2101	* page of the wanted mft record.
2102	*/
2103	index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT;
2104	ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
2105	/ The maximum valid index into the page cache for $MFT's data. /
2106	i_size = i_size_read(inode: mft_vi);
2107	end_index = i_size >> PAGE_SHIFT;
2108	if (unlikely(index >= end_index)) {
2109	if (unlikely(index > end_index \|\| ofs + vol->mft_record_size >=
2110	(i_size & ~PAGE_MASK))) {
2111	ntfs_error(vol->sb, "Tried to format non-existing mft "
2112	"record 0x%llx.", (long long)mft_no);
2113	return -ENOENT;
2114	}
2115	}
2116	/ Read, map, and pin the page containing the mft record. /
2117	page = ntfs_map_page(mapping: mft_vi->i_mapping, index);
2118	if (IS_ERR(ptr: page)) {
2119	ntfs_error(vol->sb, "Failed to map page containing mft record "
2120	"to format 0x%llx.", (long long)mft_no);
2121	return PTR_ERR(ptr: page);
2122	}
2123	lock_page(page);
2124	BUG_ON(!PageUptodate(page));
2125	ClearPageUptodate(page);
2126	m = (MFT_RECORD)((u8)page_address(page) + ofs);
2127	err = ntfs_mft_record_layout(vol, mft_no, m);
2128	if (unlikely(err)) {
2129	ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
2130	(long long)mft_no);
2131	SetPageUptodate(page);
2132	unlock_page(page);
2133	ntfs_unmap_page(page);
2134	return err;
2135	}
2136	flush_dcache_page(page);
2137	SetPageUptodate(page);
2138	unlock_page(page);
2139	/*
2140	* Make sure the mft record is written out to disk. We could use
2141	* ilookup5() to check if an inode is in icache and so on but this is
2142	* unnecessary as ntfs_writepage() will write the dirty record anyway.
2143	*/
2144	mark_ntfs_record_dirty(page, ofs);
2145	ntfs_unmap_page(page);
2146	ntfs_debug("Done.");
2147	return `0`;
2148	}
2149
2150	/**
2151	* ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
2152	* @vol: [IN] volume on which to allocate the mft record
2153	* @mode: [IN] mode if want a file or directory, i.e. base inode or 0
2154	* @base_ni: [IN] open base inode if allocating an extent mft record or NULL
2155	* @mrec: [OUT] on successful return this is the mapped mft record
2156	*
2157	* Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
2158	*
2159	* If @base_ni is NULL make the mft record a base mft record, i.e. a file or
2160	* directory inode, and allocate it at the default allocator position. In
2161	* this case @mode is the file mode as given to us by the caller. We in
2162	* particular use @mode to distinguish whether a file or a directory is being
2163	* created (S_ISREG(mode) and S_ISDIR(mode), respectively).
2164	*
2165	* If @base_ni is not NULL make the allocated mft record an extent record,
2166	* allocate it starting at the mft record after the base mft record and attach
2167	* the allocated and opened ntfs inode to the base inode @base_ni. In this
2168	* case @mode must be 0 as it is meaningless for extent inodes.
2169	*
2170	* You need to check the return value with IS_ERR(). If false, the function
2171	* was successful and the return value is the now opened ntfs inode of the
2172	* allocated mft record. *@mrec is then set to the allocated, mapped, pinned,
2173	* and locked mft record. If IS_ERR() is true, the function failed and the
2174	* error code is obtained from PTR_ERR(return value). *@mrec is undefined in
2175	* this case.
2176	*
2177	* Allocation strategy:
2178	*
2179	* To find a free mft record, we scan the mft bitmap for a zero bit. To
2180	* optimize this we start scanning at the place specified by @base_ni or if
2181	* @base_ni is NULL we start where we last stopped and we perform wrap around
2182	* when we reach the end. Note, we do not try to allocate mft records below
2183	* number 24 because numbers 0 to 15 are the defined system files anyway and 16
2184	* to 24 are special in that they are used for storing extension mft records
2185	* for the $DATA attribute of $MFT. This is required to avoid the possibility
2186	* of creating a runlist with a circular dependency which once written to disk
2187	* can never be read in again. Windows will only use records 16 to 24 for
2188	* normal files if the volume is completely out of space. We never use them
2189	* which means that when the volume is really out of space we cannot create any
2190	* more files while Windows can still create up to 8 small files. We can start
2191	* doing this at some later time, it does not matter much for now.
2192	*
2193	* When scanning the mft bitmap, we only search up to the last allocated mft
2194	* record. If there are no free records left in the range 24 to number of
2195	* allocated mft records, then we extend the $MFT/$DATA attribute in order to
2196	* create free mft records. We extend the allocated size of $MFT/$DATA by 16
2197	* records at a time or one cluster, if cluster size is above 16kiB. If there
2198	* is not sufficient space to do this, we try to extend by a single mft record
2199	* or one cluster, if cluster size is above the mft record size.
2200	*
2201	* No matter how many mft records we allocate, we initialize only the first
2202	* allocated mft record, incrementing mft data size and initialized size
2203	* accordingly, open an ntfs_inode for it and return it to the caller, unless
2204	* there are less than 24 mft records, in which case we allocate and initialize
2205	* mft records until we reach record 24 which we consider as the first free mft
2206	* record for use by normal files.
2207	*
2208	* If during any stage we overflow the initialized data in the mft bitmap, we
2209	* extend the initialized size (and data size) by 8 bytes, allocating another
2210	* cluster if required. The bitmap data size has to be at least equal to the
2211	* number of mft records in the mft, but it can be bigger, in which case the
2212	* superfluous bits are padded with zeroes.
2213	*
2214	* Thus, when we return successfully (IS_ERR() is false), we will have:
2215	* - initialized / extended the mft bitmap if necessary,
2216	* - initialized / extended the mft data if necessary,
2217	* - set the bit corresponding to the mft record being allocated in the
2218	* mft bitmap,
2219	* - opened an ntfs_inode for the allocated mft record, and we will have
2220	* - returned the ntfs_inode as well as the allocated mapped, pinned, and
2221	* locked mft record.
2222	*
2223	* On error, the volume will be left in a consistent state and no record will
2224	* be allocated. If rolling back a partial operation fails, we may leave some
2225	* inconsistent metadata in which case we set NVolErrors() so the volume is
2226	* left dirty when unmounted.
2227	*
2228	* Note, this function cannot make use of most of the normal functions, like
2229	* for example for attribute resizing, etc, because when the run list overflows
2230	* the base mft record and an attribute list is used, it is very important that
2231	* the extension mft records used to store the $DATA attribute of $MFT can be
2232	* reached without having to read the information contained inside them, as
2233	* this would make it impossible to find them in the first place after the
2234	* volume is unmounted. $MFT/$BITMAP probably does not need to follow this
2235	* rule because the bitmap is not essential for finding the mft records, but on
2236	* the other hand, handling the bitmap in this special way would make life
2237	* easier because otherwise there might be circular invocations of functions
2238	* when reading the bitmap.
2239	*/
2240	ntfs_inode ntfs_mft_record_alloc(ntfs_volume vol, const int mode,
2241	ntfs_inode base_ni, MFT_RECORD *mrec)
2242	{
2243	s64 ll, bit, old_data_initialized, old_data_size;
2244	unsigned long flags;
2245	struct inode *vi;
2246	struct page *page;
2247	ntfs_inode mft_ni, mftbmp_ni, *ni;
2248	ntfs_attr_search_ctx *ctx;
2249	MFT_RECORD *m;
2250	ATTR_RECORD *a;
2251	pgoff_t index;
2252	unsigned int ofs;
2253	int err;
2254	le16 seq_no, usn;
2255	bool record_formatted = false;
2256
2257	if (base_ni) {
2258	ntfs_debug("Entering (allocating an extent mft record for "
2259	"base mft record 0x%llx).",
2260	(long long)base_ni->mft_no);
2261	/ @mode and @base_ni are mutually exclusive. /
2262	BUG_ON(mode);
2263	} else
2264	ntfs_debug("Entering (allocating a base mft record).");
2265	if (mode) {
2266	/ @mode and @base_ni are mutually exclusive. /
2267	BUG_ON(base_ni);
2268	/ We only support creation of normal files and directories. /
2269	if (!S_ISREG(mode) && !S_ISDIR(mode))
2270	return ERR_PTR(error: -EOPNOTSUPP);
2271	}
2272	BUG_ON(!mrec);
2273	mft_ni = NTFS_I(inode: vol->mft_ino);
2274	mftbmp_ni = NTFS_I(inode: vol->mftbmp_ino);
2275	down_write(sem: &vol->mftbmp_lock);
2276	bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni);
2277	if (bit >= `0`) {
2278	ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
2279	(long long)bit);
2280	goto have_alloc_rec;
2281	}
2282	if (bit != -ENOSPC) {
2283	up_write(sem: &vol->mftbmp_lock);
2284	return ERR_PTR(error: bit);
2285	}
2286	/*
2287	* No free mft records left. If the mft bitmap already covers more
2288	* than the currently used mft records, the next records are all free,
2289	* so we can simply allocate the first unused mft record.
2290	* Note: We also have to make sure that the mft bitmap at least covers
2291	* the first 24 mft records as they are special and whilst they may not
2292	* be in use, we do not allocate from them.
2293	*/
2294	read_lock_irqsave(&mft_ni->size_lock, flags);
2295	ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
2296	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2297	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2298	old_data_initialized = mftbmp_ni->initialized_size;
2299	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2300	if (old_data_initialized << `3` > ll && old_data_initialized > `3`) {
2301	bit = ll;
2302	if (bit < `24`)
2303	bit = `24`;
2304	if (unlikely(bit >= (`1ll` << `32`)))
2305	goto max_err_out;
2306	ntfs_debug("Found free record (#2), bit 0x%llx.",
2307	(long long)bit);
2308	goto found_free_rec;
2309	}
2310	/*
2311	* The mft bitmap needs to be expanded until it covers the first unused
2312	* mft record that we can allocate.
2313	* Note: The smallest mft record we allocate is mft record 24.
2314	*/
2315	bit = old_data_initialized << `3`;
2316	if (unlikely(bit >= (`1ll` << `32`)))
2317	goto max_err_out;
2318	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2319	old_data_size = mftbmp_ni->allocated_size;
2320	ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
2321	"data_size 0x%llx, initialized_size 0x%llx.",
2322	(long long)old_data_size,
2323	(long long)i_size_read(vol->mftbmp_ino),
2324	(long long)old_data_initialized);
2325	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2326	if (old_data_initialized + `8` > old_data_size) {
2327	/ Need to extend bitmap by one more cluster. /
2328	ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
2329	err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
2330	if (unlikely(err)) {
2331	up_write(sem: &vol->mftbmp_lock);
2332	goto err_out;
2333	}
2334	#ifdef DEBUG
2335	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2336	ntfs_debug("Status of mftbmp after allocation extension: "
2337	"allocated_size 0x%llx, data_size 0x%llx, "
2338	"initialized_size 0x%llx.",
2339	(long long)mftbmp_ni->allocated_size,
2340	(long long)i_size_read(vol->mftbmp_ino),
2341	(long long)mftbmp_ni->initialized_size);
2342	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2343	#endif /* DEBUG */
2344	}
2345	/*
2346	* We now have sufficient allocated space, extend the initialized_size
2347	* as well as the data_size if necessary and fill the new space with
2348	* zeroes.
2349	*/
2350	err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
2351	if (unlikely(err)) {
2352	up_write(sem: &vol->mftbmp_lock);
2353	goto err_out;
2354	}
2355	#ifdef DEBUG
2356	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2357	ntfs_debug("Status of mftbmp after initialized extension: "
2358	"allocated_size 0x%llx, data_size 0x%llx, "
2359	"initialized_size 0x%llx.",
2360	(long long)mftbmp_ni->allocated_size,
2361	(long long)i_size_read(vol->mftbmp_ino),
2362	(long long)mftbmp_ni->initialized_size);
2363	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
2364	#endif /* DEBUG */
2365	ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
2366	found_free_rec:
2367	/ @bit is the found free mft record, allocate it in the mft bitmap. /
2368	ntfs_debug("At found_free_rec.");
2369	err = ntfs_bitmap_set_bit(vi: vol->mftbmp_ino, bit);
2370	if (unlikely(err)) {
2371	ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap.");
2372	up_write(sem: &vol->mftbmp_lock);
2373	goto err_out;
2374	}
2375	ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit);
2376	have_alloc_rec:
2377	/*
2378	* The mft bitmap is now uptodate. Deal with mft data attribute now.
2379	* Note, we keep hold of the mft bitmap lock for writing until all
2380	* modifications to the mft data attribute are complete, too, as they
2381	* will impact decisions for mft bitmap and mft record allocation done
2382	* by a parallel allocation and if the lock is not maintained a
2383	* parallel allocation could allocate the same mft record as this one.
2384	*/
2385	ll = (bit + `1`) << vol->mft_record_size_bits;
2386	read_lock_irqsave(&mft_ni->size_lock, flags);
2387	old_data_initialized = mft_ni->initialized_size;
2388	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2389	if (ll <= old_data_initialized) {
2390	ntfs_debug("Allocated mft record already initialized.");
2391	goto mft_rec_already_initialized;
2392	}
2393	ntfs_debug("Initializing allocated mft record.");
2394	/*
2395	* The mft record is outside the initialized data. Extend the mft data
2396	* attribute until it covers the allocated record. The loop is only
2397	* actually traversed more than once when a freshly formatted volume is
2398	* first written to so it optimizes away nicely in the common case.
2399	*/
2400	read_lock_irqsave(&mft_ni->size_lock, flags);
2401	ntfs_debug("Status of mft data before extension: "
2402	"allocated_size 0x%llx, data_size 0x%llx, "
2403	"initialized_size 0x%llx.",
2404	(long long)mft_ni->allocated_size,
2405	(long long)i_size_read(vol->mft_ino),
2406	(long long)mft_ni->initialized_size);
2407	while (ll > mft_ni->allocated_size) {
2408	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2409	err = ntfs_mft_data_extend_allocation_nolock(vol);
2410	if (unlikely(err)) {
2411	ntfs_error(vol->sb, "Failed to extend mft data "
2412	"allocation.");
2413	goto undo_mftbmp_alloc_nolock;
2414	}
2415	read_lock_irqsave(&mft_ni->size_lock, flags);
2416	ntfs_debug("Status of mft data after allocation extension: "
2417	"allocated_size 0x%llx, data_size 0x%llx, "
2418	"initialized_size 0x%llx.",
2419	(long long)mft_ni->allocated_size,
2420	(long long)i_size_read(vol->mft_ino),
2421	(long long)mft_ni->initialized_size);
2422	}
2423	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2424	/*
2425	* Extend mft data initialized size (and data size of course) to reach
2426	* the allocated mft record, formatting the mft records allong the way.
2427	* Note: We only modify the ntfs_inode structure as that is all that is
2428	* needed by ntfs_mft_record_format(). We will update the attribute
2429	* record itself in one fell swoop later on.
2430	*/
2431	write_lock_irqsave(&mft_ni->size_lock, flags);
2432	old_data_initialized = mft_ni->initialized_size;
2433	old_data_size = vol->mft_ino->i_size;
2434	while (ll > mft_ni->initialized_size) {
2435	s64 new_initialized_size, mft_no;
2436
2437	new_initialized_size = mft_ni->initialized_size +
2438	vol->mft_record_size;
2439	mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
2440	if (new_initialized_size > i_size_read(inode: vol->mft_ino))
2441	i_size_write(inode: vol->mft_ino, i_size: new_initialized_size);
2442	write_unlock_irqrestore(&mft_ni->size_lock, flags);
2443	ntfs_debug("Initializing mft record 0x%llx.",
2444	(long long)mft_no);
2445	err = ntfs_mft_record_format(vol, mft_no);
2446	if (unlikely(err)) {
2447	ntfs_error(vol->sb, "Failed to format mft record.");
2448	goto undo_data_init;
2449	}
2450	write_lock_irqsave(&mft_ni->size_lock, flags);
2451	mft_ni->initialized_size = new_initialized_size;
2452	}
2453	write_unlock_irqrestore(&mft_ni->size_lock, flags);
2454	record_formatted = true;
2455	/ Update the mft data attribute record to reflect the new sizes. /
2456	m = map_mft_record(ni: mft_ni);
2457	if (IS_ERR(ptr: m)) {
2458	ntfs_error(vol->sb, "Failed to map mft record.");
2459	err = PTR_ERR(ptr: m);
2460	goto undo_data_init;
2461	}
2462	ctx = ntfs_attr_get_search_ctx(ni: mft_ni, mrec: m);
2463	if (unlikely(!ctx)) {
2464	ntfs_error(vol->sb, "Failed to get search context.");
2465	err = -ENOMEM;
2466	unmap_mft_record(ni: mft_ni);
2467	goto undo_data_init;
2468	}
2469	err = ntfs_attr_lookup(type: mft_ni->type, name: mft_ni->name, name_len: mft_ni->name_len,
2470	ic: CASE_SENSITIVE, lowest_vcn: `0`, NULL, val_len: `0`, ctx);
2471	if (unlikely(err)) {
2472	ntfs_error(vol->sb, "Failed to find first attribute extent of "
2473	"mft data attribute.");
2474	ntfs_attr_put_search_ctx(ctx);
2475	unmap_mft_record(ni: mft_ni);
2476	goto undo_data_init;
2477	}
2478	a = ctx->attr;
2479	read_lock_irqsave(&mft_ni->size_lock, flags);
2480	a->data.non_resident.initialized_size =
2481	cpu_to_sle64(x: mft_ni->initialized_size);
2482	a->data.non_resident.data_size =
2483	cpu_to_sle64(x: i_size_read(inode: vol->mft_ino));
2484	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2485	/ Ensure the changes make it to disk. /
2486	flush_dcache_mft_record_page(ni: ctx->ntfs_ino);
2487	mark_mft_record_dirty(ni: ctx->ntfs_ino);
2488	ntfs_attr_put_search_ctx(ctx);
2489	unmap_mft_record(ni: mft_ni);
2490	read_lock_irqsave(&mft_ni->size_lock, flags);
2491	ntfs_debug("Status of mft data after mft record initialization: "
2492	"allocated_size 0x%llx, data_size 0x%llx, "
2493	"initialized_size 0x%llx.",
2494	(long long)mft_ni->allocated_size,
2495	(long long)i_size_read(vol->mft_ino),
2496	(long long)mft_ni->initialized_size);
2497	BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size);
2498	BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino));
2499	read_unlock_irqrestore(&mft_ni->size_lock, flags);
2500	mft_rec_already_initialized:
2501	/*
2502	* We can finally drop the mft bitmap lock as the mft data attribute
2503	* has been fully updated. The only disparity left is that the
2504	* allocated mft record still needs to be marked as in use to match the
2505	* set bit in the mft bitmap but this is actually not a problem since
2506	* this mft record is not referenced from anywhere yet and the fact
2507	* that it is allocated in the mft bitmap means that no-one will try to
2508	* allocate it either.
2509	*/
2510	up_write(sem: &vol->mftbmp_lock);
2511	/*
2512	* We now have allocated and initialized the mft record. Calculate the
2513	* index of and the offset within the page cache page the record is in.
2514	*/
2515	index = bit << vol->mft_record_size_bits >> PAGE_SHIFT;
2516	ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK;
2517	/ Read, map, and pin the page containing the mft record. /
2518	page = ntfs_map_page(mapping: vol->mft_ino->i_mapping, index);
2519	if (IS_ERR(ptr: page)) {
2520	ntfs_error(vol->sb, "Failed to map page containing allocated "
2521	"mft record 0x%llx.", (long long)bit);
2522	err = PTR_ERR(ptr: page);
2523	goto undo_mftbmp_alloc;
2524	}
2525	lock_page(page);
2526	BUG_ON(!PageUptodate(page));
2527	ClearPageUptodate(page);
2528	m = (MFT_RECORD)((u8)page_address(page) + ofs);
2529	/ If we just formatted the mft record no need to do it again. /
2530	if (!record_formatted) {
2531	/ Sanity check that the mft record is really not in use. /
2532	if (ntfs_is_file_record(m->magic) &&
2533	(m->flags & MFT_RECORD_IN_USE)) {
2534	ntfs_error(vol->sb, "Mft record 0x%llx was marked "
2535	"free in mft bitmap but is marked "
2536	"used itself. Corrupt filesystem. "
2537	"Unmount and run chkdsk.",
2538	(long long)bit);
2539	err = -EIO;
2540	SetPageUptodate(page);
2541	unlock_page(page);
2542	ntfs_unmap_page(page);
2543	NVolSetErrors(vol);
2544	goto undo_mftbmp_alloc;
2545	}
2546	/*
2547	* We need to (re-)format the mft record, preserving the
2548	* sequence number if it is not zero as well as the update
2549	* sequence number if it is not zero or -1 (0xffff). This
2550	* means we do not need to care whether or not something went
2551	* wrong with the previous mft record.
2552	*/
2553	seq_no = m->sequence_number;
2554	usn = (le16)((u8*)m + le16_to_cpu(m->usa_ofs));
2555	err = ntfs_mft_record_layout(vol, mft_no: bit, m);
2556	if (unlikely(err)) {
2557	ntfs_error(vol->sb, "Failed to layout allocated mft "
2558	"record 0x%llx.", (long long)bit);
2559	SetPageUptodate(page);
2560	unlock_page(page);
2561	ntfs_unmap_page(page);
2562	goto undo_mftbmp_alloc;
2563	}
2564	if (seq_no)
2565	m->sequence_number = seq_no;
2566	if (usn && le16_to_cpu(usn) != `0xffff`)
2567	(le16)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn;
2568	}
2569	/ Set the mft record itself in use. /
2570	m->flags \|= MFT_RECORD_IN_USE;
2571	if (S_ISDIR(mode))
2572	m->flags \|= MFT_RECORD_IS_DIRECTORY;
2573	flush_dcache_page(page);
2574	SetPageUptodate(page);
2575	if (base_ni) {
2576	MFT_RECORD *m_tmp;
2577
2578	/*
2579	* Setup the base mft record in the extent mft record. This
2580	* completes initialization of the allocated extent mft record
2581	* and we can simply use it with map_extent_mft_record().
2582	*/
2583	m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
2584	base_ni->seq_no);
2585	/*
2586	* Allocate an extent inode structure for the new mft record,
2587	* attach it to the base inode @base_ni and map, pin, and lock
2588	* its, i.e. the allocated, mft record.
2589	*/
2590	m_tmp = map_extent_mft_record(base_ni, mref: bit, ntfs_ino: &ni);
2591	if (IS_ERR(ptr: m_tmp)) {
2592	ntfs_error(vol->sb, "Failed to map allocated extent "
2593	"mft record 0x%llx.", (long long)bit);
2594	err = PTR_ERR(ptr: m_tmp);
2595	/ Set the mft record itself not in use. /
2596	m->flags &= cpu_to_le16(
2597	~le16_to_cpu(MFT_RECORD_IN_USE));
2598	flush_dcache_page(page);
2599	/ Make sure the mft record is written out to disk. /
2600	mark_ntfs_record_dirty(page, ofs);
2601	unlock_page(page);
2602	ntfs_unmap_page(page);
2603	goto undo_mftbmp_alloc;
2604	}
2605	BUG_ON(m != m_tmp);
2606	/*
2607	* Make sure the allocated mft record is written out to disk.
2608	* No need to set the inode dirty because the caller is going
2609	* to do that anyway after finishing with the new extent mft
2610	* record (e.g. at a minimum a new attribute will be added to
2611	* the mft record.
2612	*/
2613	mark_ntfs_record_dirty(page, ofs);
2614	unlock_page(page);
2615	/*
2616	* Need to unmap the page since map_extent_mft_record() mapped
2617	* it as well so we have it mapped twice at the moment.
2618	*/
2619	ntfs_unmap_page(page);
2620	} else {
2621	/*
2622	* Allocate a new VFS inode and set it up. NOTE: @vi->i_nlink
2623	* is set to 1 but the mft record->link_count is 0. The caller
2624	* needs to bear this in mind.
2625	*/
2626	vi = new_inode(sb: vol->sb);
2627	if (unlikely(!vi)) {
2628	err = -ENOMEM;
2629	/ Set the mft record itself not in use. /
2630	m->flags &= cpu_to_le16(
2631	~le16_to_cpu(MFT_RECORD_IN_USE));
2632	flush_dcache_page(page);
2633	/ Make sure the mft record is written out to disk. /
2634	mark_ntfs_record_dirty(page, ofs);
2635	unlock_page(page);
2636	ntfs_unmap_page(page);
2637	goto undo_mftbmp_alloc;
2638	}
2639	vi->i_ino = bit;
2640
2641	/ The owner and group come from the ntfs volume. /
2642	vi->i_uid = vol->uid;
2643	vi->i_gid = vol->gid;
2644
2645	/ Initialize the ntfs specific part of @vi. /
2646	ntfs_init_big_inode(vi);
2647	ni = NTFS_I(inode: vi);
2648	/*
2649	* Set the appropriate mode, attribute type, and name. For
2650	* directories, also setup the index values to the defaults.
2651	*/
2652	if (S_ISDIR(mode)) {
2653	vi->i_mode = S_IFDIR \| S_IRWXUGO;
2654	vi->i_mode &= ~vol->dmask;
2655
2656	NInoSetMstProtected(ni);
2657	ni->type = AT_INDEX_ALLOCATION;
2658	ni->name = I30;
2659	ni->name_len = `4`;
2660
2661	ni->itype.index.block_size = `4096`;
2662	ni->itype.index.block_size_bits = ntfs_ffs(x: `4096`) - `1`;
2663	ni->itype.index.collation_rule = COLLATION_FILE_NAME;
2664	if (vol->cluster_size <= ni->itype.index.block_size) {
2665	ni->itype.index.vcn_size = vol->cluster_size;
2666	ni->itype.index.vcn_size_bits =
2667	vol->cluster_size_bits;
2668	} else {
2669	ni->itype.index.vcn_size = vol->sector_size;
2670	ni->itype.index.vcn_size_bits =
2671	vol->sector_size_bits;
2672	}
2673	} else {
2674	vi->i_mode = S_IFREG \| S_IRWXUGO;
2675	vi->i_mode &= ~vol->fmask;
2676
2677	ni->type = AT_DATA;
2678	ni->name = NULL;
2679	ni->name_len = `0`;
2680	}
2681	if (IS_RDONLY(vi))
2682	vi->i_mode &= ~S_IWUGO;
2683
2684	/ Set the inode times to the current time. /
2685	simple_inode_init_ts(inode: vi);
2686	/*
2687	* Set the file size to 0, the ntfs inode sizes are set to 0 by
2688	* the call to ntfs_init_big_inode() below.
2689	*/
2690	vi->i_size = `0`;
2691	vi->i_blocks = `0`;
2692
2693	/ Set the sequence number. /
2694	vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
2695	/*
2696	* Manually map, pin, and lock the mft record as we already
2697	* have its page mapped and it is very easy to do.
2698	*/
2699	atomic_inc(v: &ni->count);
2700	mutex_lock(&ni->mrec_lock);
2701	ni->page = page;
2702	ni->page_ofs = ofs;
2703	/*
2704	* Make sure the allocated mft record is written out to disk.
2705	* NOTE: We do not set the ntfs inode dirty because this would
2706	* fail in ntfs_write_inode() because the inode does not have a
2707	* standard information attribute yet. Also, there is no need
2708	* to set the inode dirty because the caller is going to do
2709	* that anyway after finishing with the new mft record (e.g. at
2710	* a minimum some new attributes will be added to the mft
2711	* record.
2712	*/
2713	mark_ntfs_record_dirty(page, ofs);
2714	unlock_page(page);
2715
2716	/ Add the inode to the inode hash for the superblock. /
2717	insert_inode_hash(inode: vi);
2718
2719	/ Update the default mft allocation position. /
2720	vol->mft_data_pos = bit + `1`;
2721	}
2722	/*
2723	* Return the opened, allocated inode of the allocated mft record as
2724	* well as the mapped, pinned, and locked mft record.
2725	*/
2726	ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
2727	base_ni ? "extent " : "", (long long)bit);
2728	*mrec = m;
2729	return ni;
2730	undo_data_init:
2731	write_lock_irqsave(&mft_ni->size_lock, flags);
2732	mft_ni->initialized_size = old_data_initialized;
2733	i_size_write(inode: vol->mft_ino, i_size: old_data_size);
2734	write_unlock_irqrestore(&mft_ni->size_lock, flags);
2735	goto undo_mftbmp_alloc_nolock;
2736	undo_mftbmp_alloc:
2737	down_write(sem: &vol->mftbmp_lock);
2738	undo_mftbmp_alloc_nolock:
2739	if (ntfs_bitmap_clear_bit(vi: vol->mftbmp_ino, bit)) {
2740	ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
2741	NVolSetErrors(vol);
2742	}
2743	up_write(sem: &vol->mftbmp_lock);
2744	err_out:
2745	return ERR_PTR(error: err);
2746	max_err_out:
2747	ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum "
2748	"number of inodes (2^32) has already been reached.");
2749	up_write(sem: &vol->mftbmp_lock);
2750	return ERR_PTR(error: -ENOSPC);
2751	}
2752
2753	/**
2754	* ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume
2755	* @ni: ntfs inode of the mapped extent mft record to free
2756	* @m: mapped extent mft record of the ntfs inode @ni
2757	*
2758	* Free the mapped extent mft record @m of the extent ntfs inode @ni.
2759	*
2760	* Note that this function unmaps the mft record and closes and destroys @ni
2761	* internally and hence you cannot use either @ni nor @m any more after this
2762	* function returns success.
2763	*
2764	* On success return 0 and on error return -errno. @ni and @m are still valid
2765	* in this case and have not been freed.
2766	*
2767	* For some errors an error message is displayed and the success code 0 is
2768	* returned and the volume is then left dirty on umount. This makes sense in
2769	* case we could not rollback the changes that were already done since the
2770	* caller no longer wants to reference this mft record so it does not matter to
2771	* the caller if something is wrong with it as long as it is properly detached
2772	* from the base inode.
2773	*/
2774	int ntfs_extent_mft_record_free(ntfs_inode ni, MFT_RECORD m)
2775	{
2776	unsigned long mft_no = ni->mft_no;
2777	ntfs_volume *vol = ni->vol;
2778	ntfs_inode *base_ni;
2779	ntfs_inode **extent_nis;
2780	int i, err;
2781	le16 old_seq_no;
2782	u16 seq_no;
2783
2784	BUG_ON(NInoAttr(ni));
2785	BUG_ON(ni->nr_extents != -`1`);
2786
2787	mutex_lock(&ni->extent_lock);
2788	base_ni = ni->ext.base_ntfs_ino;
2789	mutex_unlock(lock: &ni->extent_lock);
2790
2791	BUG_ON(base_ni->nr_extents <= `0`);
2792
2793	ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n",
2794	mft_no, base_ni->mft_no);
2795
2796	mutex_lock(&base_ni->extent_lock);
2797
2798	/ Make sure we are holding the only reference to the extent inode. /
2799	if (atomic_read(v: &ni->count) > `2`) {
2800	ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, "
2801	"not freeing.", base_ni->mft_no);
2802	mutex_unlock(lock: &base_ni->extent_lock);
2803	return -EBUSY;
2804	}
2805
2806	/ Dissociate the ntfs inode from the base inode. /
2807	extent_nis = base_ni->ext.extent_ntfs_inos;
2808	err = -ENOENT;
2809	for (i = `0`; i < base_ni->nr_extents; i++) {
2810	if (ni != extent_nis[i])
2811	continue;
2812	extent_nis += i;
2813	base_ni->nr_extents--;
2814	memmove(extent_nis, extent_nis + `1`, (base_ni->nr_extents - i) *
2815	sizeof(ntfs_inode*));
2816	err = `0`;
2817	break;
2818	}
2819
2820	mutex_unlock(lock: &base_ni->extent_lock);
2821
2822	if (unlikely(err)) {
2823	ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to "
2824	"its base inode 0x%lx.", mft_no,
2825	base_ni->mft_no);
2826	BUG();
2827	}
2828
2829	/*
2830	* The extent inode is no longer attached to the base inode so no one
2831	* can get a reference to it any more.
2832	*/
2833
2834	/ Mark the mft record as not in use. /
2835	m->flags &= ~MFT_RECORD_IN_USE;
2836
2837	/ Increment the sequence number, skipping zero, if it is not zero. /
2838	old_seq_no = m->sequence_number;
2839	seq_no = le16_to_cpu(old_seq_no);
2840	if (seq_no == `0xffff`)
2841	seq_no = `1`;
2842	else if (seq_no)
2843	seq_no++;
2844	m->sequence_number = cpu_to_le16(seq_no);
2845
2846	/*
2847	* Set the ntfs inode dirty and write it out. We do not need to worry
2848	* about the base inode here since whatever caused the extent mft
2849	* record to be freed is guaranteed to do it already.
2850	*/
2851	NInoSetDirty(ni);
2852	err = write_mft_record(ni, m, sync: `0`);
2853	if (unlikely(err)) {
2854	ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not "
2855	"freeing.", mft_no);
2856	goto rollback;
2857	}
2858	rollback_error:
2859	/ Unmap and throw away the now freed extent inode. /
2860	unmap_extent_mft_record(ni);
2861	ntfs_clear_extent_inode(ni);
2862
2863	/ Clear the bit in the $MFT/$BITMAP corresponding to this record. /
2864	down_write(sem: &vol->mftbmp_lock);
2865	err = ntfs_bitmap_clear_bit(vi: vol->mftbmp_ino, bit: mft_no);
2866	up_write(sem: &vol->mftbmp_lock);
2867	if (unlikely(err)) {
2868	/*
2869	* The extent inode is gone but we failed to deallocate it in
2870	* the mft bitmap. Just emit a warning and leave the volume
2871	* dirty on umount.
2872	*/
2873	ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
2874	NVolSetErrors(vol);
2875	}
2876	return `0`;
2877	rollback:
2878	/ Rollback what we did... /
2879	mutex_lock(&base_ni->extent_lock);
2880	extent_nis = base_ni->ext.extent_ntfs_inos;
2881	if (!(base_ni->nr_extents & `3`)) {
2882	int new_size = (base_ni->nr_extents + `4`) * sizeof(ntfs_inode*);
2883
2884	extent_nis = kmalloc(size: new_size, GFP_NOFS);
2885	if (unlikely(!extent_nis)) {
2886	ntfs_error(vol->sb, "Failed to allocate internal "
2887	"buffer during rollback.%s", es);
2888	mutex_unlock(lock: &base_ni->extent_lock);
2889	NVolSetErrors(vol);
2890	goto rollback_error;
2891	}
2892	if (base_ni->nr_extents) {
2893	BUG_ON(!base_ni->ext.extent_ntfs_inos);
2894	memcpy(extent_nis, base_ni->ext.extent_ntfs_inos,
2895	new_size - `4` * sizeof(ntfs_inode*));
2896	kfree(objp: base_ni->ext.extent_ntfs_inos);
2897	}
2898	base_ni->ext.extent_ntfs_inos = extent_nis;
2899	}
2900	m->flags \|= MFT_RECORD_IN_USE;
2901	m->sequence_number = old_seq_no;
2902	extent_nis[base_ni->nr_extents++] = ni;
2903	mutex_unlock(lock: &base_ni->extent_lock);
2904	mark_mft_record_dirty(ni);
2905	return err;
2906	}
2907	#endif /* NTFS_RW */
2908

source code of linux/fs/ntfs/mft.c