// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include <linux/rmap.h>
#include <asm/pgalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/* The 'colour' (ie low bits) within a PMD of a page offset. */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)
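
/*
 * For example, with 4K pages and 2M PMDs (as on x86-64), PG_PMD_NR is 512
 * and PG_PMD_COLOUR is 0x1ff, i.e. the low nine bits of a page offset.
 */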

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

/*
 * DAX pagecache entries use XArray value entries so they can't be mistaken
 * for pages.  We use one bit for locking, one bit for the entry size (PMD)
 * and two more to tell us if the entry is a zero page or an empty entry that
 * is just used for locking.  In total four special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 */
#define DAX_SHIFT	(4)
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)

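/*
 * Illustrative encoding: a PTE-sized entry for pfn 0x1234 is stored as the
 * xarray value (0x1234 << DAX_SHIFT); locking it sets DAX_LOCKED in the low
 * bits, giving (0x1234 << 4) | 1.
 */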
static unsigned long dax_to_pfn(void *entry)
{
	return xa_to_value(entry) >> DAX_SHIFT;
}

static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
}

static bool dax_is_locked(void *entry)
{
	return xa_to_value(entry) & DAX_LOCKED;
}

static unsigned int dax_entry_order(void *entry)
{
	if (xa_to_value(entry) & DAX_PMD)
		return PMD_ORDER;
	return 0;
}

static unsigned long dax_is_pmd_entry(void *entry)
{
	return xa_to_value(entry) & DAX_PMD;
}

static bool dax_is_pte_entry(void *entry)
{
	return !(xa_to_value(entry) & DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return xa_to_value(entry) & DAX_ZERO_PAGE;
}

static int dax_is_empty_entry(void *entry)
{
	return xa_to_value(entry) & DAX_EMPTY;
}

/*
 * true if the entry that was found is of a smaller order than the entry
 * we were looking for
 */
static bool dax_is_conflict(void *entry)
{
	return entry == XA_RETRY_ENTRY;
}

/*
 * DAX page cache entry locking
 */
struct exceptional_entry_key {
	struct xarray *xa;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_entry_t wait;
	struct exceptional_entry_key key;
};

/**
 * enum dax_wake_mode: waitqueue wakeup behaviour
 * @WAKE_ALL: wake all waiters in the waitqueue
 * @WAKE_NEXT: wake only the first waiter in the waitqueue
 */
enum dax_wake_mode {
	WAKE_ALL,
	WAKE_NEXT,
};

static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
		void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;
	unsigned long index = xas->xa_index;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD.  This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~PG_PMD_COLOUR;
	key->xa = xas->xa;
	key->entry_start = index;

	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
		unsigned int mode, int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->xa != ewait->key.xa ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * @entry may no longer be the entry at the index in the mapping.
 * The important information it's conveying is whether the entry at
 * this index used to be a PMD entry.
 */
static void dax_wake_entry(struct xa_state *xas, void *entry,
			   enum dax_wake_mode mode)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(xas, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under the i_pages lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
}

/*
 * Look up entry in page cache, wait for it to become unlocked if it
 * is a DAX entry and return it.  The caller must subsequently call
 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
 * if it did.  The entry returned may have a larger order than @order.
 * If @order is larger than the order of the entry found in i_pages, this
 * function returns a dax_is_conflict entry.
 *
 * Must be called with the i_pages lock held.
 */
static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
{
	void *entry;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = xas_find_conflict(xas);
		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
			return entry;
		if (dax_entry_order(entry) < order)
			return XA_RETRY_ENTRY;
		if (!dax_is_locked(entry))
			return entry;

		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		xas_unlock_irq(xas);
		xas_reset(xas);
		schedule();
		finish_wait(wq, &ewait.wait);
		xas_lock_irq(xas);
	}
}

/*
 * The only thing keeping the address space around is the i_pages lock
 * (it's cycled in clear_inode() after removing the entries from i_pages)
 * After we call xas_unlock_irq(), we cannot touch xas->xa.
 */
static void wait_entry_unlocked(struct xa_state *xas, void *entry)
{
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
	/*
	 * Unlike get_unlocked_entry() there is no guarantee that this
	 * path ever successfully retrieves an unlocked entry before an
	 * inode dies. Perform a non-exclusive wait in case this path
	 * never successfully performs its own wake up.
	 */
	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
	xas_unlock_irq(xas);
	schedule();
	finish_wait(wq, &ewait.wait);
}

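/*
 * Wake waiters for an entry obtained via get_unlocked_entry() that the
 * caller did not end up locking; conflict entries carry no waiters to wake.
 */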
static void put_unlocked_entry(struct xa_state *xas, void *entry,
			       enum dax_wake_mode mode)
{
	if (entry && !dax_is_conflict(entry))
		dax_wake_entry(xas, entry, mode);
}

/*
 * We used the xa_state to get the entry, but then we locked the entry and
 * dropped the xa_lock, so we know the xa_state is stale and must be reset
 * before use.
 */
static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
	void *old;

	BUG_ON(dax_is_locked(entry));
	xas_reset(xas);
	xas_lock_irq(xas);
	old = xas_store(xas, entry);
	xas_unlock_irq(xas);
	BUG_ON(!dax_is_locked(old));
	dax_wake_entry(xas, entry, WAKE_NEXT);
}

/*
 * Return: The entry stored at this location before it was locked.
 */
static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
	unsigned long v = xa_to_value(entry);
	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}

static unsigned long dax_entry_size(void *entry)
{
	if (dax_is_zero_entry(entry))
		return 0;
	else if (dax_is_empty_entry(entry))
		return 0;
	else if (dax_is_pmd_entry(entry))
		return PMD_SIZE;
	else
		return PAGE_SIZE;
}

static unsigned long dax_end_pfn(void *entry)
{
	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
}

/*
 * Iterate through all mapped pfns represented by an entry, i.e. skip
 * 'empty' and 'zero' entries.
 */
#define for_each_mapped_pfn(entry, pfn) \
	for (pfn = dax_to_pfn(entry); \
			pfn < dax_end_pfn(entry); pfn++)

static inline bool dax_page_is_shared(struct page *page)
{
	return page->mapping == PAGE_MAPPING_DAX_SHARED;
}

/*
 * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
 * refcount.
 */
static inline void dax_page_share_get(struct page *page)
{
	if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
		/*
		 * Reset the index if the page was already mapped
		 * regularly before.
		 */
		if (page->mapping)
			page->share = 1;
		page->mapping = PAGE_MAPPING_DAX_SHARED;
	}
	page->share++;
}

static inline unsigned long dax_page_share_put(struct page *page)
{
	return --page->share;
}

/*
 * When called from dax_insert_entry(), the shared flag indicates whether
 * this entry is shared by multiple files.  If so, set page->mapping to
 * PAGE_MAPPING_DAX_SHARED, and use page->share as a refcount.
 */
static void dax_associate_entry(void *entry, struct address_space *mapping,
		struct vm_area_struct *vma, unsigned long address, bool shared)
{
	unsigned long size = dax_entry_size(entry), pfn, index;
	int i = 0;

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;

	index = linear_page_index(vma, address & ~(size - 1));
	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		if (shared) {
			dax_page_share_get(page);
		} else {
			WARN_ON_ONCE(page->mapping);
			page->mapping = mapping;
			page->index = index + i++;
		}
	}
}

static void dax_disassociate_entry(void *entry, struct address_space *mapping,
		bool trunc)
{
	unsigned long pfn;

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;

	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
		if (dax_page_is_shared(page)) {
			/* keep the shared flag if this page is still shared */
			if (dax_page_share_put(page) > 0)
				continue;
		} else
			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
		page->mapping = NULL;
		page->index = 0;
	}
}

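/*
 * Return the first page in the entry that is still in use (refcount above
 * the idle count of one for ZONE_DEVICE pages), or NULL if none is.
 */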
static struct page *dax_busy_page(void *entry)
{
	unsigned long pfn;

	for_each_mapped_pfn(entry, pfn) {
		struct page *page = pfn_to_page(pfn);

		if (page_ref_count(page) > 1)
			return page;
	}
	return NULL;
}

/**
 * dax_lock_folio - Lock the DAX entry corresponding to a folio
 * @folio: The folio whose entry we want to lock
 *
 * Context: Process context.
 * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
 * not be locked.
 */
dax_entry_t dax_lock_folio(struct folio *folio)
{
	XA_STATE(xas, NULL, 0);
	void *entry;

	/* Ensure folio->mapping isn't freed while we look at it */
	rcu_read_lock();
	for (;;) {
		struct address_space *mapping = READ_ONCE(folio->mapping);

		entry = NULL;
		if (!mapping || !dax_mapping(mapping))
			break;

		/*
		 * In the device-dax case there's no need to lock, a
		 * struct dev_pagemap pin is sufficient to keep the
		 * inode alive, and we assume we have dev_pagemap pin
		 * otherwise we would not have a valid pfn_to_page()
		 * translation.
		 */
		entry = (void *)~0UL;
		if (S_ISCHR(mapping->host->i_mode))
			break;

		xas.xa = &mapping->i_pages;
		xas_lock_irq(&xas);
		if (mapping != folio->mapping) {
			xas_unlock_irq(&xas);
			continue;
		}
		xas_set(&xas, folio->index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
			rcu_read_lock();
			continue;
		}
		dax_lock_entry(&xas, entry);
		xas_unlock_irq(&xas);
		break;
	}
	rcu_read_unlock();
	return (dax_entry_t)entry;
}

void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{
	struct address_space *mapping = folio->mapping;
	XA_STATE(xas, &mapping->i_pages, folio->index);

	if (S_ISCHR(mapping->host->i_mode))
		return;

	dax_unlock_entry(&xas, (void *)cookie);
}

/*
 * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
 * @mapping: the file's mapping whose entry we want to lock
 * @index: the offset within this file
 * @page: output the dax page corresponding to this dax entry
 *
 * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
 * could not be locked.
 */
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
		struct page **page)
{
	XA_STATE(xas, NULL, 0);
	void *entry;

	rcu_read_lock();
	for (;;) {
		entry = NULL;
		if (!dax_mapping(mapping))
			break;

		xas.xa = &mapping->i_pages;
		xas_lock_irq(&xas);
		xas_set(&xas, index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
			rcu_read_lock();
			continue;
		}
		if (!entry ||
		    dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
			/*
			 * Because we are looking up the entry via the file's
			 * mapping and index, it may not have been inserted
			 * yet, or may be a zero/empty entry.  We don't
			 * consider this an error case.  So, return a special
			 * value and do not output @page.
			 */
			entry = (void *)~0UL;
		} else {
			*page = pfn_to_page(dax_to_pfn(entry));
			dax_lock_entry(&xas, entry);
		}
		xas_unlock_irq(&xas);
		break;
	}
	rcu_read_unlock();
	return (dax_entry_t)entry;
}

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
		dax_entry_t cookie)
{
	XA_STATE(xas, &mapping->i_pages, index);

	if (cookie == ~0UL)
		return;

	dax_unlock_entry(&xas, (void *)cookie);
}

/*
 * Find page cache entry at given index. If it is a DAX entry, return it
 * with the entry locked. If the page cache doesn't contain an entry at
 * that index, add a locked empty entry.
 *
 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return VM_FAULT_FALLBACK.
 * This will happen if there are any PTE entries within the PMD range
 * that we are requesting.
 *
 * We always favor PTE entries over PMD entries. There isn't a flow where we
 * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
 * insertion will fail if it finds any PTE entries already in the tree, and a
 * PTE insertion will cause an existing PMD entry to be unmapped and
 * downgraded to PTE entries.  This happens for both PMD zero pages as
 * well as PMD empty entries.
 *
 * The exception to this downgrade path is for PMD entries that have
 * real storage backing them.  We will leave these real PMD entries in
 * the tree, and PTE writes will simply dirty the entire PMD entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 *
 * On error, this function does not return an ERR_PTR.  Instead it returns
 * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
 * overlap with xarray value entries.
 */
static void *grab_mapping_entry(struct xa_state *xas,
		struct address_space *mapping, unsigned int order)
{
	unsigned long index = xas->xa_index;
	bool pmd_downgrade;	/* splitting PMD entry into PTE entries? */
	void *entry;

retry:
	pmd_downgrade = false;
	xas_lock_irq(xas);
	entry = get_unlocked_entry(xas, order);

	if (entry) {
		if (dax_is_conflict(entry))
			goto fallback;
		if (!xa_is_value(entry)) {
			xas_set_err(xas, -EIO);
			goto out_unlock;
		}

		if (order == 0) {
			if (dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	if (pmd_downgrade) {
		/*
		 * Make sure 'entry' remains valid while we drop
		 * the i_pages lock.
		 */
		dax_lock_entry(xas, entry);

		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
		 */
		if (dax_is_zero_entry(entry)) {
			xas_unlock_irq(xas);
			unmap_mapping_pages(mapping,
					xas->xa_index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
			xas_reset(xas);
			xas_lock_irq(xas);
		}

		dax_disassociate_entry(entry, mapping, false);
		xas_store(xas, NULL);	/* undo the PMD join */
		dax_wake_entry(xas, entry, WAKE_ALL);
		mapping->nrpages -= PG_PMD_NR;
		entry = NULL;
		xas_set(xas, index);
	}

	if (entry) {
		dax_lock_entry(xas, entry);
	} else {
		unsigned long flags = DAX_EMPTY;

		if (order > 0)
			flags |= DAX_PMD;
		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
		dax_lock_entry(xas, entry);
		if (xas_error(xas))
			goto out_unlock;
		mapping->nrpages += 1UL << order;
	}

out_unlock:
	xas_unlock_irq(xas);
	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
		goto retry;
	if (xas->xa_node == XA_ERROR(-ENOMEM))
		return xa_mk_internal(VM_FAULT_OOM);
	if (xas_error(xas))
		return xa_mk_internal(VM_FAULT_SIGBUS);
	return entry;
fallback:
	xas_unlock_irq(xas);
	return xa_mk_internal(VM_FAULT_FALLBACK);
}

/**
 * dax_layout_busy_page_range - find first pinned page in @mapping
 * @mapping: address space to scan for a page with ref count > 1
 * @start: Starting offset. Page containing 'start' is included.
 * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
 *       pages from 'start' till the end of file are included.
 *
 * DAX requires ZONE_DEVICE mapped pages. These pages are never
 * 'onlined' to the page allocator so they are considered idle when
 * page->count == 1. A filesystem uses this interface to determine if
 * any page in the mapping is busy, i.e. for DMA, or other
 * get_user_pages() usages.
 *
 * It is expected that the filesystem is holding locks to block the
 * establishment of new mappings in this address_space. I.e. it expects
 * to be able to run unmap_mapping_range() and subsequently not race
 * mapping_mapped() becoming true.
 */
struct page *dax_layout_busy_page_range(struct address_space *mapping,
					loff_t start, loff_t end)
{
	void *entry;
	unsigned int scanned = 0;
	struct page *page = NULL;
	pgoff_t start_idx = start >> PAGE_SHIFT;
	pgoff_t end_idx;
	XA_STATE(xas, &mapping->i_pages, start_idx);

	/*
	 * In the 'limited' case get_user_pages() for dax is disabled.
	 */
	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return NULL;

	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
		return NULL;

	/* If end == LLONG_MAX, all pages from start till end of file */
	if (end == LLONG_MAX)
		end_idx = ULONG_MAX;
	else
		end_idx = end >> PAGE_SHIFT;
	/*
	 * If we race get_user_pages_fast() here either we'll see the
	 * elevated page count in the iteration and wait, or
	 * get_user_pages_fast() will see that the page it took a reference
	 * against is no longer mapped in the page tables and bail to the
	 * get_user_pages() slow path.  The slow path is protected by
	 * pte_lock() and pmd_lock(). New references are not taken without
	 * holding those locks, and unmap_mapping_pages() will not zero the
	 * pte or pmd without holding the respective lock, so we are
	 * guaranteed to either see new references or prevent new
	 * references from being established.
	 */
	unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end_idx) {
		if (WARN_ON_ONCE(!xa_is_value(entry)))
			continue;
		if (unlikely(dax_is_locked(entry)))
			entry = get_unlocked_entry(&xas, 0);
		if (entry)
			page = dax_busy_page(entry);
		put_unlocked_entry(&xas, entry, WAKE_NEXT);
		if (page)
			break;
		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);

struct page *dax_layout_busy_page(struct address_space *mapping)
{
	return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);

static int __dax_invalidate_entry(struct address_space *mapping,
				  pgoff_t index, bool trunc)
{
	XA_STATE(xas, &mapping->i_pages, index);
	int ret = 0;
	void *entry;

	xas_lock_irq(&xas);
	entry = get_unlocked_entry(&xas, 0);
	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
		goto out;
	if (!trunc &&
	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
		goto out;
	dax_disassociate_entry(entry, mapping, trunc);
	xas_store(&xas, NULL);
	mapping->nrpages -= 1UL << dax_entry_order(entry);
	ret = 1;
out:
	put_unlocked_entry(&xas, entry, WAKE_ALL);
	xas_unlock_irq(&xas);
	return ret;
}

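/*
 * Clear the dirty and towrite marks on all entries in [start, end] so that
 * a subsequent invalidation of the range is not blocked by dirty entries.
 */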
static int __dax_clear_dirty_range(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	unsigned int scanned = 0;
	void *entry;

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end) {
		entry = get_unlocked_entry(&xas, 0);
		xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
		xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
		put_unlocked_entry(&xas, entry, WAKE_NEXT);

		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);

	return 0;
}

/*
 * Delete DAX entry at @index from @mapping.  Wait for it
 * to be unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_entry(mapping, index, true);

	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * page cache (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen a DAX entry for this index, we better find it
	 * at that index as well...
	 */
	WARN_ON_ONCE(!ret);
	return ret;
}

/*
 * Invalidate DAX entry if it is clean.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_entry(mapping, index, false);
}

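/*
 * Translate a file position into a page offset within the dax device
 * backing this extent of the file.
 */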
static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
{
	return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
}

static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
{
	pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
	void *vto, *kaddr;
	long rc;
	int id;

	id = dax_read_lock();
	rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
				&kaddr, NULL);
	if (rc < 0) {
		dax_read_unlock(id);
		return rc;
	}
	vto = kmap_atomic(vmf->cow_page);
	copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
	kunmap_atomic(vto);
	dax_read_unlock(id);
	return 0;
}

/*
 * MAP_SYNC on a dax mapping guarantees dirty metadata is
 * flushed on write-faults (non-cow), but not read-faults.
 */
static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
		struct vm_area_struct *vma)
{
	return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
		(iter->iomap.flags & IOMAP_F_DIRTY);
}

/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 */
static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void *entry, pfn_t pfn,
		unsigned long flags)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	void *new_entry = dax_make_entry(pfn, flags);
	bool write = iter->flags & IOMAP_WRITE;
	bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
	bool shared = iter->iomap.flags & IOMAP_F_SHARED;

	if (dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
		unsigned long index = xas->xa_index;
		/* we are replacing a zero page with block mapping */
		if (dax_is_pmd_entry(entry))
			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
		else /* pte entry */
			unmap_mapping_pages(mapping, index, 1, false);
	}

	xas_reset(xas);
	xas_lock_irq(xas);
	if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		void *old;

		dax_disassociate_entry(entry, mapping, false);
		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
				shared);
		/*
		 * Only swap our new entry into the page cache if the current
		 * entry is a zero page or an empty entry.  If a normal PTE or
		 * PMD entry is already in the cache, we leave it alone.  This
		 * means that if we are trying to insert a PTE and the
		 * existing entry is a PMD, we will just leave the PMD in the
		 * tree and dirty it if necessary.
		 */
		old = dax_lock_entry(xas, new_entry);
		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
					DAX_LOCKED));
		entry = new_entry;
	} else {
		xas_load(xas);	/* Walk the xa_state */
	}

	if (dirty)
		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);

	if (write && shared)
		xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);

	xas_unlock_irq(xas);
	return entry;
}

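/*
 * Flush a single dirty entry to the persistent domain: write-protect all
 * mappings of its pfns, flush the CPU caches, then clear the dirty tag.
 * Called with the xa_lock held; drops and reacquires it around the flush.
 */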
static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
		struct address_space *mapping, void *entry)
{
	unsigned long pfn, index, count, end;
	long ret = 0;
	struct vm_area_struct *vma;

	/*
	 * A page got tagged dirty in DAX mapping? Something is seriously
	 * wrong.
	 */
	if (WARN_ON(!xa_is_value(entry)))
		return -EIO;

	if (unlikely(dax_is_locked(entry))) {
		void *old_entry = entry;

		entry = get_unlocked_entry(xas, 0);

		/* Entry got punched out / reallocated? */
		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
			goto put_unlocked;
		/*
		 * Entry got reallocated elsewhere? No need to writeback.
		 * We have to compare pfns as we must not bail out due to
		 * difference in lockbit or entry type.
		 */
		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
			goto put_unlocked;
		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
					dax_is_zero_entry(entry))) {
			ret = -EIO;
			goto put_unlocked;
		}

		/* Another fsync thread may have already done this entry */
		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
			goto put_unlocked;
	}

	/* Lock the entry to serialize with page faults */
	dax_lock_entry(xas, entry);

	/*
	 * We can clear the tag now but we have to be careful so that concurrent
	 * dax_writeback_one() calls for the same index cannot finish before we
	 * actually flush the caches. This is achieved as the calls will look
	 * at the entry only under the i_pages lock and once they do that
	 * they will see the entry locked and wait for it to unlock.
	 */
	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
	xas_unlock_irq(xas);

	/*
	 * If dax_writeback_mapping_range() was given a wbc->range_start
	 * in the middle of a PMD, the 'index' we use needs to be
	 * aligned to the start of the PMD.
	 * This allows us to flush for PMD_SIZE and not have to worry about
	 * partial PMD writebacks.
	 */
	pfn = dax_to_pfn(entry);
	count = 1UL << dax_entry_order(entry);
	index = xas->xa_index & ~(count - 1);
	end = index + count - 1;

	/* Walk all mappings of a given index of a file and writeprotect them */
	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
		pfn_mkclean_range(pfn, count, index, vma);
		cond_resched();
	}
	i_mmap_unlock_read(mapping);

	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
	/*
	 * After we have flushed the cache, we can clear the dirty tag. There
	 * cannot be new dirty data in the pfn after the flush has completed as
	 * the pfn mappings are writeprotected and fault waits for mapping
	 * entry lock.
	 */
	xas_reset(xas);
	xas_lock_irq(xas);
	xas_store(xas, entry);
	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
	dax_wake_entry(xas, entry, WAKE_NEXT);

	trace_dax_writeback_one(mapping->host, index, count);
	return ret;

put_unlocked:
	put_unlocked_entry(xas, entry, WAKE_NEXT);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct dax_device *dax_dev, struct writeback_control *wbc)
{
	XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
	struct inode *inode = mapping->host;
	pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
	void *entry;
	int ret = 0;
	unsigned int scanned = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	trace_dax_writeback_range(inode, xas.xa_index, end_index);

	tag_pages_for_writeback(mapping, xas.xa_index, end_index);

	xas_lock_irq(&xas);
	xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
		ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
		if (ret < 0) {
			mapping_set_error(mapping, ret);
			break;
		}
		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

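/*
 * Map @size bytes at @pos through the dax device.  When a pfn is requested,
 * also verify that the returned mapping is large enough and suitably
 * aligned for the fault size being handled.
 */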
static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
		size_t size, void **kaddr, pfn_t *pfnp)
{
	pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
	int id, rc = 0;
	long length;

	id = dax_read_lock();
	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
				   DAX_ACCESS, kaddr, pfnp);
	if (length < 0) {
		rc = length;
		goto out;
	}
	if (!pfnp)
		goto out_check_addr;
	rc = -EINVAL;
	if (PFN_PHYS(length) < size)
		goto out;
	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size) - 1))
		goto out;
	/* For larger pages we need devmap */
	if (length > 1 && !pfn_t_devmap(*pfnp))
		goto out;
	rc = 0;

out_check_addr:
	if (!kaddr)
		goto out;
	if (!*kaddr)
		rc = -EFAULT;
out:
	dax_read_unlock(id);
	return rc;
}

/**
 * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
 * by copying the data before and after the range to be written.
 * @pos:	address to do copy from.
 * @length:	size of copy operation.
 * @align_size:	aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
 * @srcmap:	iomap srcmap
 * @daddr:	destination address to copy to.
 *
 * This can be called from two places. Either during DAX write fault (page
 * aligned), to copy the length size data to daddr. Or, while doing normal DAX
 * write operation, dax_iomap_iter() might call this to do the copy of either
 * start or end unaligned address. In the latter case the rest of the copy of
 * aligned ranges is taken care of by dax_iomap_iter() itself.
 * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
 * area to make sure no old data remains.
 */
static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
		const struct iomap *srcmap, void *daddr)
{
	loff_t head_off = pos & (align_size - 1);
	size_t size = ALIGN(head_off + length, align_size);
	loff_t end = pos + length;
	loff_t pg_end = round_up(end, align_size);
	/* copy_all is usually in page fault case */
	bool copy_all = head_off == 0 && end == pg_end;
	/* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
	bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
			 srcmap->type == IOMAP_UNWRITTEN;
	void *saddr = NULL;
	int ret = 0;

	if (!zero_edge) {
		ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
		if (ret)
			return dax_mem2blk_err(ret);
	}

	if (copy_all) {
		if (zero_edge)
			memset(daddr, 0, size);
		else
			ret = copy_mc_to_kernel(daddr, saddr, length);
		goto out;
	}

	/* Copy the head part of the range */
	if (head_off) {
		if (zero_edge)
			memset(daddr, 0, head_off);
		else {
			ret = copy_mc_to_kernel(daddr, saddr, head_off);
			if (ret)
				return -EIO;
		}
	}

	/* Copy the tail part of the range */
	if (end < pg_end) {
		loff_t tail_off = head_off + length;
		loff_t tail_len = pg_end - end;

		if (zero_edge)
			memset(daddr + tail_off, 0, tail_len);
		else {
			ret = copy_mc_to_kernel(daddr + tail_off,
					saddr + tail_off, tail_len);
			if (ret)
				return -EIO;
		}
	}
out:
	if (zero_edge)
		dax_flush(srcmap->dax_dev, daddr, size);
	return ret ? -EIO : 0;
}

/*
 * The user has performed a load from a hole in the file.  Allocating a new
 * page in the file would cause excessive storage usage for workloads with
 * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
 * If this page is ever written to we will re-fault and change the mapping to
 * point to real DAX storage instead.
 */
static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	struct inode *inode = iter->inode;
	unsigned long vaddr = vmf->address;
	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
	vm_fault_t ret;

	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);

	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
	trace_dax_load_hole(inode, vmf, ret);
	return ret;
}

#ifdef CONFIG_FS_DAX_PMD
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	struct inode *inode = mapping->host;
	pgtable_t pgtable = NULL;
	struct page *zero_page;
	spinlock_t *ptl;
	pmd_t pmd_entry;
	pfn_t pfn;

	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);

	if (unlikely(!zero_page))
		goto fallback;

	pfn = page_to_pfn_t(zero_page);
	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
				  DAX_PMD | DAX_ZERO_PAGE);

	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (!pmd_none(*(vmf->pmd))) {
		spin_unlock(ptl);
		goto fallback;
	}

	if (pgtable) {
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		mm_inc_nr_ptes(vma->vm_mm);
	}
	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
	spin_unlock(ptl);
	trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
	return VM_FAULT_NOPAGE;

fallback:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
	return VM_FAULT_FALLBACK;
}
#else
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

static s64 dax_unshare_iter(struct iomap_iter *iter)
{
	struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos = iter->pos;
	loff_t length = iomap_length(iter);
	int id = 0;
	s64 ret = 0;
	void *daddr = NULL, *saddr = NULL;

	/* don't bother with blocks that are not shared to start with */
	if (!(iomap->flags & IOMAP_F_SHARED))
		return length;

	id = dax_read_lock();
	ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL);
	if (ret < 0)
		goto out_unlock;

	/* zero the distance if srcmap is HOLE or UNWRITTEN */
	if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) {
		memset(daddr, 0, length);
		dax_flush(iomap->dax_dev, daddr, length);
		ret = length;
		goto out_unlock;
	}

	ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL);
	if (ret < 0)
		goto out_unlock;

	if (copy_mc_to_kernel(daddr, saddr, length) == 0)
		ret = length;
	else
		ret = -EIO;

out_unlock:
	dax_read_unlock(id);
	return dax_mem2blk_err(ret);
}

int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= inode,
		.pos		= pos,
		.len		= len,
		.flags		= IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = dax_unshare_iter(&iter);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_file_unshare);

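/*
 * Zero a sub-page range through the direct mapping, copying around the
 * zeroed bytes first when the extent is shared (CoW).
 */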
static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	unsigned offset = offset_in_page(pos);
	pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
	void *kaddr;
	long ret;

	ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
				NULL);
	if (ret < 0)
		return dax_mem2blk_err(ret);

	memset(kaddr + offset, 0, size);
	if (iomap->flags & IOMAP_F_SHARED)
		ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
					    kaddr);
	else
		dax_flush(iomap->dax_dev, kaddr + offset, size);
	return ret;
}

static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos = iter->pos;
	u64 length = iomap_length(iter);
	s64 written = 0;

	/* already zeroed?  we're done. */
	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
		return length;

	/*
	 * invalidate the pages whose sharing state is to be changed
	 * because of CoW.
	 */
	if (iomap->flags & IOMAP_F_SHARED)
		invalidate_inode_pages2_range(iter->inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (pos + length - 1) >> PAGE_SHIFT);

	do {
		unsigned offset = offset_in_page(pos);
		unsigned size = min_t(u64, PAGE_SIZE - offset, length);
		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
		long rc;
		int id;

		id = dax_read_lock();
		if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
			rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
		else
			rc = dax_memzero(iter, pos, size);
		dax_read_unlock(id);

		if (rc < 0)
			return rc;
		pos += size;
		length -= size;
		written += size;
	} while (length > 0);

	if (did_zero)
		*did_zero = true;
	return written;
}

int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= inode,
		.pos		= pos,
		.len		= len,
		.flags		= IOMAP_DAX | IOMAP_ZERO,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = dax_zero_iter(&iter, did_zero);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_zero_range);

int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops)
{
	unsigned int blocksize = i_blocksize(inode);
	unsigned int off = pos & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!off)
		return 0;
	return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);

static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
		struct iov_iter *iter)
{
	const struct iomap *iomap = &iomi->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iomi);
	loff_t length = iomap_length(iomi);
	loff_t pos = iomi->pos;
	struct dax_device *dax_dev = iomap->dax_dev;
	loff_t end = pos + length, done = 0;
	bool write = iov_iter_rw(iter) == WRITE;
	bool cow = write && iomap->flags & IOMAP_F_SHARED;
	ssize_t ret = 0;
	size_t xfer;
	int id;

	if (!write) {
		end = min(end, i_size_read(iomi->inode));
		if (pos >= end)
			return 0;

		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
			return iov_iter_zero(min(length, end - pos), iter);
	}

	/*
	 * In DAX mode, enforce either pure overwrites of written extents, or
	 * writes to unwritten extents as part of a copy-on-write operation.
	 */
	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
			!(iomap->flags & IOMAP_F_SHARED)))
		return -EIO;

	/*
	 * Write can allocate block for an area which has a hole page mapped
	 * into page tables. We have to tear down these mappings so that data
	 * written by write(2) is visible in mmap.
	 */
	if (iomap->flags & IOMAP_F_NEW || cow) {
		/*
		 * Filesystem allows CoW on non-shared extents. The src extents
		 * may have been mmapped with dirty mark before. To be able to
		 * invalidate its dax entries, we need to clear the dirty mark
		 * in advance.
		 */
		if (cow)
			__dax_clear_dirty_range(iomi->inode->i_mapping,
						pos >> PAGE_SHIFT,
						(end - 1) >> PAGE_SHIFT);
		invalidate_inode_pages2_range(iomi->inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (end - 1) >> PAGE_SHIFT);
	}

	id = dax_read_lock();
	while (pos < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		const size_t size = ALIGN(length + offset, PAGE_SIZE);
		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
		ssize_t map_len;
		bool recovery = false;
		void *kaddr;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
				DAX_ACCESS, &kaddr, NULL);
		if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
			map_len = dax_direct_access(dax_dev, pgoff,
					PHYS_PFN(size), DAX_RECOVERY_WRITE,
					&kaddr, NULL);
			if (map_len > 0)
				recovery = true;
		}
		if (map_len < 0) {
			ret = dax_mem2blk_err(map_len);
			break;
		}

		if (cow) {
			ret = dax_iomap_copy_around(pos, length, PAGE_SIZE,
						    srcmap, kaddr);
			if (ret)
				break;
		}

		map_len = PFN_PHYS(map_len);
		kaddr += offset;
		map_len -= offset;
		if (map_len > end - pos)
			map_len = end - pos;

		if (recovery)
			xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
					map_len, iter);
		else if (write)
			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
					map_len, iter);
		else
			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
					map_len, iter);

		pos += xfer;
		length -= xfer;
		done += xfer;

		if (xfer == 0)
			ret = -EFAULT;
		if (xfer < map_len)
			break;
	}
	dax_read_unlock(id);

	return done ? done : ret;
}

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb:	The control block for this I/O
 * @iter:	The addresses to do I/O from or to
 * @ops:	iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{
	struct iomap_iter iomi = {
		.inode		= iocb->ki_filp->f_mapping->host,
		.pos		= iocb->ki_pos,
		.len		= iov_iter_count(iter),
		.flags		= IOMAP_DAX,
	};
	loff_t done = 0;
	int ret;

	if (!iomi.len)
		return 0;

	if (iov_iter_rw(iter) == WRITE) {
		lockdep_assert_held_write(&iomi.inode->i_rwsem);
		iomi.flags |= IOMAP_WRITE;
	} else {
		lockdep_assert_held(&iomi.inode->i_rwsem);
	}

	if (iocb->ki_flags & IOCB_NOWAIT)
		iomi.flags |= IOMAP_NOWAIT;

	while ((ret = iomap_iter(&iomi, ops)) > 0)
		iomi.processed = dax_iomap_iter(&iomi, iter);

	done = iomi.pos - iocb->ki_pos;
	iocb->ki_pos = iomi.pos;
	return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);

1582 | static vm_fault_t dax_fault_return(int error) |
1583 | { |
1584 | if (error == 0) |
1585 | return VM_FAULT_NOPAGE; |
1586 | return vmf_error(err: error); |
1587 | } |
1588 | |
1589 | /* |
1590 | * When handling a synchronous page fault and the inode need a fsync, we can |
1591 | * insert the PTE/PMD into page tables only after that fsync happened. Skip |
1592 | * insertion for now and return the pfn so that caller can insert it after the |
1593 | * fsync is done. |
1594 | */ |
static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
{
	if (WARN_ON_ONCE(!pfnp))
		return VM_FAULT_SIGBUS;
	*pfnp = pfn;
	return VM_FAULT_NEEDDSYNC;
}

static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
		const struct iomap_iter *iter)
{
	vm_fault_t ret;
	int error = 0;

	switch (iter->iomap.type) {
	case IOMAP_HOLE:
	case IOMAP_UNWRITTEN:
		clear_user_highpage(vmf->cow_page, vmf->address);
		break;
	case IOMAP_MAPPED:
		error = copy_cow_page_dax(vmf, iter);
		break;
	default:
		WARN_ON_ONCE(1);
		error = -EIO;
		break;
	}

	if (error)
		return dax_fault_return(error);

	__SetPageUptodate(vmf->cow_page);
	ret = finish_fault(vmf);
	if (!ret)
		return VM_FAULT_DONE_COW;
	return ret;
}

/**
 * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
 * @vmf: vm fault instance
 * @iter: iomap iter
 * @pfnp: pfn to be returned
 * @xas: the dax mapping tree of a file
 * @entry: an unlocked dax entry to be inserted
 * @pmd: distinguish whether it is a pmd fault
 */
static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
		const struct iomap_iter *iter, pfn_t *pfnp,
		struct xa_state *xas, void **entry, bool pmd)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
	loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
	bool write = iter->flags & IOMAP_WRITE;
	unsigned long entry_flags = pmd ? DAX_PMD : 0;
	int err = 0;
	pfn_t pfn;
	void *kaddr;

	if (!pmd && vmf->cow_page)
		return dax_fault_cow_page(vmf, iter);

	/* if we are reading UNWRITTEN or HOLE, return a hole. */
	if (!write &&
	    (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
		if (!pmd)
			return dax_load_hole(xas, vmf, iter, entry);
		return dax_pmd_load_hole(xas, vmf, iter, entry);
	}

	if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
		WARN_ON_ONCE(1);
		return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
	}

	err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
	if (err)
		return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);

	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);

	if (write && iomap->flags & IOMAP_F_SHARED) {
		err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr);
		if (err)
			return dax_fault_return(err);
	}

	if (dax_fault_is_synchronous(iter, vmf->vma))
		return dax_fault_synchronous_pfnp(pfnp, pfn);

	/* insert PMD pfn */
	if (pmd)
		return vmf_insert_pfn_pmd(vmf, pfn, write);

	/* insert PTE pfn */
	if (write)
		return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
	return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
}

static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       int *iomap_errp, const struct iomap_ops *ops)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
	struct iomap_iter iter = {
		.inode = mapping->host,
		.pos = (loff_t)vmf->pgoff << PAGE_SHIFT,
		.len = PAGE_SIZE,
		.flags = IOMAP_DAX | IOMAP_FAULT,
	};
	vm_fault_t ret = 0;
	void *entry;
	int error;

	trace_dax_pte_fault(iter.inode, vmf, ret);
	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	if (iter.pos >= i_size_read(iter.inode)) {
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		iter.flags |= IOMAP_WRITE;

	entry = grab_mapping_entry(&xas, mapping, 0);
	if (xa_is_internal(entry)) {
		ret = xa_to_internal(entry);
		goto out;
	}

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PMD fault that overlaps with
	 * the PTE we need to set up. If so just return and the fault will be
	 * retried.
	 */
	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
		ret = VM_FAULT_NOPAGE;
		goto unlock_entry;
	}

	while ((error = iomap_iter(&iter, ops)) > 0) {
		if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
			iter.processed = -EIO;	/* fs corruption? */
			continue;
		}

		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
		if (ret != VM_FAULT_SIGBUS &&
		    (iter.iomap.flags & IOMAP_F_NEW)) {
			count_vm_event(PGMAJFAULT);
			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
			ret |= VM_FAULT_MAJOR;
		}

		if (!(ret & VM_FAULT_ERROR))
			iter.processed = PAGE_SIZE;
	}

	if (iomap_errp)
		*iomap_errp = error;
	if (!ret && error)
		ret = dax_fault_return(error);

unlock_entry:
	dax_unlock_entry(&xas, entry);
out:
	trace_dax_pte_fault_done(iter.inode, vmf, ret);
	return ret;
}

#ifdef CONFIG_FS_DAX_PMD
static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
		pgoff_t max_pgoff)
{
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	/*
	 * Make sure that the faulting address's PMD offset (color) matches
	 * the PMD offset from the start of the file. This is necessary so
	 * that a PMD range in the page table overlaps exactly with a PMD
	 * range in the page cache.
	 */
	if ((vmf->pgoff & PG_PMD_COLOUR) !=
	    ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
		return true;
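
	/*
	 * Worked example (editor's note, assuming 4KiB pages and 2MiB
	 * PMDs, so PG_PMD_COLOUR == 511): a fault at virtual address
	 * 0x201000 (page colour 1) mapping file pgoff 0 (colour 0)
	 * must fall back to PTEs, because the 2MiB-aligned range in
	 * the page table and the 2MiB-aligned range in the page cache
	 * would be offset from each other by one page, so no single
	 * PMD entry could cover both.
	 */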

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vmf->vma->vm_flags & VM_SHARED))
		return true;

	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vmf->vma->vm_start)
		return true;
	if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
		return true;

	/* If the PMD would extend beyond the file size */
	if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
		return true;

	return false;
}

static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       const struct iomap_ops *ops)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
	struct iomap_iter iter = {
		.inode = mapping->host,
		.len = PMD_SIZE,
		.flags = IOMAP_DAX | IOMAP_FAULT,
	};
	vm_fault_t ret = VM_FAULT_FALLBACK;
	pgoff_t max_pgoff;
	void *entry;

	if (vmf->flags & FAULT_FLAG_WRITE)
		iter.flags |= IOMAP_WRITE;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is
	 * supposed to hold locks serializing us with truncate / punch hole so
	 * this is a reliable test.
	 */
	max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);

	trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);

	if (xas.xa_index >= max_pgoff) {
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
		goto fallback;

	/*
	 * grab_mapping_entry() will make sure we get an empty PMD entry,
	 * a zero PMD entry or a DAX PMD. If it can't (because a PTE
	 * entry is already in the array, for instance), it will return
	 * VM_FAULT_FALLBACK.
	 */
	entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
	if (xa_is_internal(entry)) {
		ret = xa_to_internal(entry);
		goto fallback;
	}

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PTE fault that overlaps with
	 * the PMD we need to set up. If so just return and the fault will be
	 * retried.
	 */
	if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
	    !pmd_devmap(*vmf->pmd)) {
		ret = 0;
		goto unlock_entry;
	}

	iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
	while (iomap_iter(&iter, ops) > 0) {
		if (iomap_length(&iter) < PMD_SIZE)
			continue; /* actually breaks out of the loop */

		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
		if (ret != VM_FAULT_FALLBACK)
			iter.processed = PMD_SIZE;
	}

unlock_entry:
	dax_unlock_entry(&xas, entry);
fallback:
	if (ret == VM_FAULT_FALLBACK) {
		split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
		count_vm_event(THP_FAULT_FALLBACK);
	}
out:
	trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
	return ret;
}
#else
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       const struct iomap_ops *ops)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @order: Order of the page to fault in
 * @pfnp: PFN to insert for synchronous faults if fsync is required
 * @iomap_errp: Storage for a detailed error code in case of error
 * @ops: Iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
 * has done all the necessary locking for the page fault to proceed
 * successfully.
 */
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
		    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{
	if (order == 0)
		return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
	else if (order == PMD_ORDER)
		return dax_iomap_pmd_fault(vmf, pfnp, ops);
	else
		return VM_FAULT_FALLBACK;
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
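
/*
 * Example (editor's sketch, hypothetical names): a filesystem typically
 * calls dax_iomap_fault() from its vm_operations_struct handlers.  A
 * minimal ->huge_fault, assuming an iomap_ops instance named
 * "example_iomap_ops", might look like:
 *
 *	static vm_fault_t example_dax_huge_fault(struct vm_fault *vmf,
 *						 unsigned int order)
 *	{
 *		return dax_iomap_fault(vmf, order, NULL, NULL,
 *				       &example_iomap_ops);
 *	}
 *
 * Real handlers also take the filesystem lock that serializes against
 * truncate and wrap write faults in sb_start_pagefault() /
 * sb_end_pagefault().  Filesystems supporting MAP_SYNC pass a pfn and
 * handle VM_FAULT_NEEDDSYNC; see the sketch after
 * dax_finish_sync_fault() below.
 */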

/*
 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
 * @vmf: The description of the fault
 * @pfn: PFN to insert
 * @order: Order of entry to insert.
 *
 * This function inserts a writeable PTE or PMD entry into the page tables
 * for an mmapped DAX file. It also marks the page cache entry as dirty.
 */
static vm_fault_t
dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
	void *entry;
	vm_fault_t ret;

	xas_lock_irq(&xas);
	entry = get_unlocked_entry(&xas, order);
	/* Did we race with someone splitting entry or so? */
	if (!entry || dax_is_conflict(entry) ||
	    (order == 0 && !dax_is_pte_entry(entry))) {
		put_unlocked_entry(&xas, entry, WAKE_NEXT);
		xas_unlock_irq(&xas);
		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
						      VM_FAULT_NOPAGE);
		return VM_FAULT_NOPAGE;
	}
	xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
	dax_lock_entry(&xas, entry);
	xas_unlock_irq(&xas);
	if (order == 0)
		ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
#ifdef CONFIG_FS_DAX_PMD
	else if (order == PMD_ORDER)
		ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
#endif
	else
		ret = VM_FAULT_FALLBACK;
	dax_unlock_entry(&xas, entry);
	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
	return ret;
}

/**
 * dax_finish_sync_fault - finish synchronous page fault
 * @vmf: The description of the fault
 * @order: Order of entry to be inserted
 * @pfn: PFN to insert
 *
 * This function ensures that the file range touched by the page fault is
 * stored persistently on the media and then inserts the appropriate page
 * table entry.
 */
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
		pfn_t pfn)
{
	int err;
	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
	size_t len = PAGE_SIZE << order;

	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
	if (err)
		return VM_FAULT_SIGBUS;
	return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
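
/*
 * Example (editor's sketch): completing a synchronous fault.  When a
 * fault on a MAP_SYNC mapping returns VM_FAULT_NEEDDSYNC, the caller
 * fsyncs the range and only then inserts the page table entry.  With
 * the placeholder "example_iomap_ops":
 *
 *	ret = dax_iomap_fault(vmf, order, &pfn, NULL, &example_iomap_ops);
 *	if (ret & VM_FAULT_NEEDDSYNC)
 *		ret = dax_finish_sync_fault(vmf, order, pfn);
 *
 * This mirrors the pattern used by ext4 and xfs for write faults on
 * synchronous mappings.
 */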

static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
		struct iomap_iter *it_dest, u64 len, bool *same)
{
	const struct iomap *smap = &it_src->iomap;
	const struct iomap *dmap = &it_dest->iomap;
	loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
	void *saddr, *daddr;
	int id, ret;

	len = min(len, min(smap->length, dmap->length));

	if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
		*same = true;
		return len;
	}

	if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
		*same = false;
		return 0;
	}

	id = dax_read_lock();
	ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
				      &saddr, NULL);
	if (ret < 0)
		goto out_unlock;

	ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
				      &daddr, NULL);
	if (ret < 0)
		goto out_unlock;

	*same = !memcmp(saddr, daddr, len);
	if (!*same)
		len = 0;
	dax_read_unlock(id);
	return len;

out_unlock:
	dax_read_unlock(id);
	return -EIO;
}

int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
		struct inode *dst, loff_t dstoff, loff_t len, bool *same,
		const struct iomap_ops *ops)
{
	struct iomap_iter src_iter = {
		.inode = src,
		.pos = srcoff,
		.len = len,
		.flags = IOMAP_DAX,
	};
	struct iomap_iter dst_iter = {
		.inode = dst,
		.pos = dstoff,
		.len = len,
		.flags = IOMAP_DAX,
	};
	int ret, compared = 0;

	while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
	       (ret = iomap_iter(&dst_iter, ops)) > 0) {
		compared = dax_range_compare_iter(&src_iter, &dst_iter,
				min(src_iter.len, dst_iter.len), same);
		/* Propagate the comparison error, not the iterator state. */
		if (compared < 0)
			return compared;
		src_iter.processed = dst_iter.processed = compared;
	}
	return ret;
}

int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
			      struct file *file_out, loff_t pos_out,
			      loff_t *len, unsigned int remap_flags,
			      const struct iomap_ops *ops)
{
	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
					       pos_out, len, remap_flags, ops);
}
EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
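
/*
 * Example (editor's sketch): a filesystem's remap_file_range()
 * preparation step might pick this DAX-aware variant over
 * generic_remap_file_range_prep() when both inodes are DAX, so that the
 * dedupe comparison above reads through dax_iomap_direct_access()
 * rather than the page cache.  "example_iomap_ops" is a placeholder:
 *
 *	if (IS_DAX(inode_in) && IS_DAX(inode_out))
 *		ret = dax_remap_file_range_prep(file_in, pos_in,
 *				file_out, pos_out, &len, remap_flags,
 *				&example_iomap_ops);
 *	else
 *		ret = generic_remap_file_range_prep(file_in, pos_in,
 *				file_out, pos_out, &len, remap_flags);
 *
 * Remapping between a DAX and a non-DAX inode is typically rejected
 * before this point.
 */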