readahead.c source code [linux/mm/readahead.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* mm/readahead.c - address_space-level file readahead.
4	*
5	* Copyright (C) 2002, Linus Torvalds
6	*
7	* 09Apr2002 Andrew Morton
8	* Initial version.
9	*/
10
/**
 * DOC: Readahead Overview
 *
 * Readahead is used to read content into the page cache before it is
 * explicitly requested by the application. Readahead only ever
 * attempts to read folios that are not yet in the page cache. If a
 * folio is present but not up-to-date, readahead will not try to read
 * it. In that case a simple ->read_folio() will be requested.
 *
 * Readahead is triggered when an application read request (whether a
 * system call or a page fault) finds that the requested folio is not in
 * the page cache, or that it is in the page cache and has the
 * readahead flag set. This flag indicates that the folio was read
 * as part of a previous readahead request and now that it has been
 * accessed, it is time for the next readahead.
 *
 * Each readahead request is partly synchronous read, and partly async
 * readahead. This is reflected in the struct file_ra_state which
 * contains ->size being the total number of pages, and ->async_size
 * which is the number of pages in the async section. The readahead
 * flag will be set on the first folio in this async section to trigger
 * a subsequent readahead. Once a series of sequential reads has been
 * established, there should be no need for a synchronous component and
 * all readahead requests will be fully asynchronous.
 *
 * When either of the triggers causes a readahead, three numbers need
 * to be determined: the start of the region to read, the size of the
 * region, and the size of the async tail.
 *
 * The start of the region is simply the first page address at or after
 * the accessed address, which is not currently populated in the page
 * cache. This is found with a simple search in the page cache.
 *
 * The size of the async tail is determined by subtracting the size that
 * was explicitly requested from the determined request size, unless
 * this would be less than zero - then zero is used. NOTE THIS
 * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED
 * PAGE. ALSO THIS CALCULATION IS NOT USED CONSISTENTLY.
 *
 * The size of the region is normally determined from the size of the
 * previous readahead which loaded the preceding pages. This may be
 * discovered from the struct file_ra_state for simple sequential reads,
 * or from examining the state of the page cache when multiple
 * sequential reads are interleaved. Specifically: where the readahead
 * was triggered by the readahead flag, the size of the previous
 * readahead is assumed to be the number of pages from the triggering
 * page to the start of the new readahead. In these cases, the size of
 * the previous readahead is scaled, often doubled, for the new
 * readahead, though see get_next_ra_size() for details.
 *
 * If the size of the previous read cannot be determined, the number of
 * preceding pages in the page cache is used to estimate the size of
 * a previous read. This estimate could easily be misled by random
 * reads being coincidentally adjacent, so it is ignored unless it is
 * larger than the current request, and it is not scaled up, unless it
 * is at the start of file.
 *
 * In general readahead is accelerated at the start of the file, as
 * reads from there are often sequential. There are other minor
 * adjustments to the readahead size in various special cases and these
 * are best discovered by reading the code.
 *
 * The above calculation, based on the previous readahead size,
 * determines the size of the readahead, to which any requested read
 * size may be added.
 *
 * Readahead requests are sent to the filesystem using the ->readahead()
 * address space operation, for which mpage_readahead() is a canonical
 * implementation. ->readahead() should normally initiate reads on all
 * folios, but may fail to read any or all folios without causing an I/O
 * error. The page cache reading code will issue a ->read_folio() request
 * for any folio which ->readahead() did not read, and only an error
 * from this will be final.
 *
 * ->readahead() will generally call readahead_folio() repeatedly to get
 * each folio from those prepared for readahead. It may fail to read a
 * folio by:
 *
 * * not calling readahead_folio() sufficiently many times, effectively
 *   ignoring some folios, as might be appropriate if the path to
 *   storage is congested.
 *
 * * failing to actually submit a read request for a given folio,
 *   possibly due to insufficient resources, or
 *
 * * getting an error during subsequent processing of a request.
 *
 * In the last two cases, the folio should be unlocked by the filesystem
 * to indicate that the read attempt has failed. In the first case the
 * folio will be unlocked by the VFS.
 *
 * Those folios not in the final ``async_size`` of the request should be
 * considered to be important and ->readahead() should not fail them due
 * to congestion or temporary resource unavailability, but should wait
 * for necessary resources (e.g. memory or indexing information) to
 * become available. Folios in the final ``async_size`` may be
 * considered less urgent and failure to read them is more acceptable.
 * In this case it is best to use filemap_remove_folio() to remove the
 * folios from the page cache as is automatically done for folios that
 * were not fetched with readahead_folio(). This will allow a
 * subsequent synchronous readahead request to try them again. If they
 * are left in the page cache, then they will be read individually using
 * ->read_folio() which may be less efficient.
 */
115
116	#include <linux/blkdev.h>
117	#include <linux/kernel.h>
118	#include <linux/dax.h>
119	#include <linux/gfp.h>
120	#include <linux/export.h>
121	#include <linux/backing-dev.h>
122	#include <linux/task_io_accounting_ops.h>
123	#include <linux/pagemap.h>
124	#include <linux/psi.h>
125	#include <linux/syscalls.h>
126	#include <linux/file.h>
127	#include <linux/mm_inline.h>
128	#include <linux/blk-cgroup.h>
129	#include <linux/fadvise.h>
130	#include <linux/sched/mm.h>
131
132	#include "internal.h"
133
134	/*
135	* Initialise a struct file's readahead state. Assumes that the caller has
136	* memset *ra to zero.
137	*/
138	void
139	file_ra_state_init(struct file_ra_state ra, struct* address_space *mapping)
140	{
141	ra->ra_pages = inode_to_bdi(inode: mapping->host)->ra_pages;
142	ra->prev_pos = -`1`;
143	}
144	EXPORT_SYMBOL_GPL(file_ra_state_init);
145
146	static void read_pages(struct readahead_control *rac)
147	{
148	const struct address_space_operations *aops = rac->mapping->a_ops;
149	struct folio *folio;
150	struct blk_plug plug;
151
152	if (!readahead_count(rac))
153	return;
154
155	if (unlikely(rac->_workingset))
156	psi_memstall_enter(flags: &rac->_pflags);
157	blk_start_plug(&plug);
158
159	if (aops->readahead) {
160	aops->readahead(rac);
161	/*
162	* Clean up the remaining folios. The sizes in ->ra
163	* may be used to size the next readahead, so make sure
164	* they accurately reflect what happened.
165	*/
166	while ((folio = readahead_folio(ractl: rac)) != NULL) {
167	unsigned long nr = folio_nr_pages(folio);
168
169	folio_get(folio);
170	rac->ra->size -= nr;
171	if (rac->ra->async_size >= nr) {
172	rac->ra->async_size -= nr;
173	filemap_remove_folio(folio);
174	}
175	folio_unlock(folio);
176	folio_put(folio);
177	}
178	} else {
179	while ((folio = readahead_folio(ractl: rac)) != NULL)
180	aops->read_folio(rac->file, folio);
181	}
182
183	blk_finish_plug(&plug);
184	if (unlikely(rac->_workingset))
185	psi_memstall_leave(flags: &rac->_pflags);
186	rac->_workingset = false;
187
188	BUG_ON(readahead_count(rac));
189	}
190
191	/**
192	* page_cache_ra_unbounded - Start unchecked readahead.
193	* @ractl: Readahead control.
194	* @nr_to_read: The number of pages to read.
195	* @lookahead_size: Where to start the next readahead.
196	*
197	* This function is for filesystems to call when they want to start
198	* readahead beyond a file's stated i_size. This is almost certainly
199	* not the function you want to call. Use page_cache_async_readahead()
200	* or page_cache_sync_readahead() instead.
201	*
202	* Context: File is referenced by caller. Mutexes may be held by caller.
203	* May sleep, but will not reenter filesystem to reclaim memory.
204	*/
205	void page_cache_ra_unbounded(struct readahead_control *ractl,
206	unsigned long nr_to_read, unsigned long lookahead_size)
207	{
208	struct address_space *mapping = ractl->mapping;
209	unsigned long index = readahead_index(rac: ractl);
210	gfp_t gfp_mask = readahead_gfp_mask(x: mapping);
211	unsigned long i;
212
213	/*
214	* Partway through the readahead operation, we will have added
215	* locked pages to the page cache, but will not yet have submitted
216	* them for I/O. Adding another page may need to allocate memory,
217	* which can trigger memory reclaim. Telling the VM we're in
218	* the middle of a filesystem operation will cause it to not
219	* touch file-backed pages, preventing a deadlock. Most (all?)
220	* filesystems already specify __GFP_NOFS in their mapping's
221	* gfp_mask, but let's be explicit here.
222	*/
223	unsigned int nofs = memalloc_nofs_save();
224
225	filemap_invalidate_lock_shared(mapping);
226	/*
227	* Preallocate as many pages as we will need.
228	*/
229	for (i = `0`; i < nr_to_read; i++) {
230	struct folio *folio = xa_load(&mapping->i_pages, index: index + i);
231
232	if (folio && !xa_is_value(entry: folio)) {
233	/*
234	* Page already present? Kick off the current batch
235	* of contiguous pages before continuing with the
236	* next batch. This page may be the one we would
237	* have intended to mark as Readahead, but we don't
238	* have a stable reference to this page, and it's
239	* not worth getting one just for that.
240	*/
241	read_pages(rac: ractl);
242	ractl->_index++;
243	i = ractl->_index + ractl->_nr_pages - index - `1`;
244	continue;
245	}
246
247	folio = filemap_alloc_folio(gfp: gfp_mask, order: `0`);
248	if (!folio)
249	break;
250	if (filemap_add_folio(mapping, folio, index: index + i,
251	gfp: gfp_mask) < `0`) {
252	folio_put(folio);
253	read_pages(rac: ractl);
254	ractl->_index++;
255	i = ractl->_index + ractl->_nr_pages - index - `1`;
256	continue;
257	}
258	if (i == nr_to_read - lookahead_size)
259	folio_set_readahead(folio);
260	ractl->_workingset \|= folio_test_workingset(folio);
261	ractl->_nr_pages++;
262	}
263
264	/*
265	* Now start the IO. We ignore I/O errors - if the folio is not
266	* uptodate then the caller will launch read_folio again, and
267	* will then handle the error.
268	*/
269	read_pages(rac: ractl);
270	filemap_invalidate_unlock_shared(mapping);
271	memalloc_nofs_restore(flags: nofs);
272	}
273	EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
274
275	/*
276	* do_page_cache_ra() actually reads a chunk of disk. It allocates
277	* the pages first, then submits them for I/O. This avoids the very bad
278	* behaviour which would occur if page allocations are causing VM writeback.
279	* We really don't want to intermingle reads and writes like that.
280	*/
281	static void do_page_cache_ra(struct readahead_control *ractl,
282	unsigned long nr_to_read, unsigned long lookahead_size)
283	{
284	struct inode *inode = ractl->mapping->host;
285	unsigned long index = readahead_index(rac: ractl);
286	loff_t isize = i_size_read(inode);
287	pgoff_t end_index; / The last page we want to read /
288
289	if (isize == `0`)
290	return;
291
292	end_index = (isize - `1`) >> PAGE_SHIFT;
293	if (index > end_index)
294	return;
295	/ Don't read past the page containing the last byte of the file /
296	if (nr_to_read > end_index - index)
297	nr_to_read = end_index - index + `1`;
298
299	page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
300	}
301
302	/*
303	* Chunk the readahead into 2 megabyte units, so that we don't pin too much
304	* memory at once.
305	*/
306	void force_page_cache_ra(struct readahead_control *ractl,
307	unsigned long nr_to_read)
308	{
309	struct address_space *mapping = ractl->mapping;
310	struct file_ra_state *ra = ractl->ra;
311	struct backing_dev_info *bdi = inode_to_bdi(inode: mapping->host);
312	unsigned long max_pages, index;
313
314	if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead))
315	return;
316
317	/*
318	* If the request exceeds the readahead window, allow the read to
319	* be up to the optimal hardware IO size
320	*/
321	index = readahead_index(rac: ractl);
322	max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
323	nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
324	while (nr_to_read) {
325	unsigned long this_chunk = (`2` * `1024` * `1024`) / PAGE_SIZE;
326
327	if (this_chunk > nr_to_read)
328	this_chunk = nr_to_read;
329	ractl->_index = index;
330	do_page_cache_ra(ractl, nr_to_read: this_chunk, lookahead_size: `0`);
331
332	index += this_chunk;
333	nr_to_read -= this_chunk;
334	}
335	}
336
/*
 * Set the initial window size: round the request up to the next power of
 * two, then scale it x 4 for small sizes (<= max/32), x 2 for medium sizes
 * (<= max/4), and clamp to max for anything larger.
 *
 * For a 128k (32 page) max ra with 4k pages this gives:
 * 1-2 page = 16k, 3-4 page 32k, 5-8 page = 64k, > 8 page = 128k initial
 *
 * @size: requested size in pages.
 * @max:  maximum readahead window in pages.
 *
 * Returns the initial window size in pages.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;

	return newsize;
}
356
357	/*
358	* Get the previous window size, ramp it up, and
359	* return it as the new window size.
360	*/
361	static unsigned long get_next_ra_size(struct file_ra_state *ra,
362	unsigned long max)
363	{
364	unsigned long cur = ra->size;
365
366	if (cur < max / `16`)
367	return `4` * cur;
368	if (cur <= max / `2`)
369	return `2` * cur;
370	return max;
371	}
372
/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * be invalidating each other's readahead state. So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
 * indicator. The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special-case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial size
 * based on I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down as
 * it approaches max_readahead.
 */
411
412	/*
413	* Count contiguously cached pages from @index-1 to @index-@max,
414	* this count is a conservative estimation of
415	* - length of the sequential read sequence, or
416	* - thrashing threshold in memory tight systems
417	*/
418	static pgoff_t count_history_pages(struct address_space *mapping,
419	pgoff_t index, unsigned long max)
420	{
421	pgoff_t head;
422
423	rcu_read_lock();
424	head = page_cache_prev_miss(mapping, index: index - `1`, max_scan: max);
425	rcu_read_unlock();
426
427	return index - `1` - head;
428	}
429
430	/*
431	* page cache context based readahead
432	*/
433	static int try_context_readahead(struct address_space *mapping,
434	struct file_ra_state *ra,
435	pgoff_t index,
436	unsigned long req_size,
437	unsigned long max)
438	{
439	pgoff_t size;
440
441	size = count_history_pages(mapping, index, max);
442
443	/*
444	* not enough history pages:
445	* it could be a random read
446	*/
447	if (size <= req_size)
448	return `0`;
449
450	/*
451	* starts from beginning of file:
452	* it is a strong indication of long-run stream (or whole-file-read)
453	*/
454	if (size >= index)
455	size *= `2`;
456
457	ra->start = index;
458	ra->size = min(size + req_size, max);
459	ra->async_size = `1`;
460
461	return `1`;
462	}
463
464	static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
465	pgoff_t mark, unsigned int order, gfp_t gfp)
466	{
467	int err;
468	struct folio *folio = filemap_alloc_folio(gfp, order);
469
470	if (!folio)
471	return -ENOMEM;
472	mark = round_up(mark, `1UL` << order);
473	if (index == mark)
474	folio_set_readahead(folio);
475	err = filemap_add_folio(mapping: ractl->mapping, folio, index, gfp);
476	if (err) {
477	folio_put(folio);
478	return err;
479	}
480
481	ractl->_nr_pages += `1UL` << order;
482	ractl->_workingset \|= folio_test_workingset(folio);
483	return `0`;
484	}
485
486	void page_cache_ra_order(struct readahead_control *ractl,
487	struct file_ra_state ra, unsigned* int new_order)
488	{
489	struct address_space *mapping = ractl->mapping;
490	pgoff_t index = readahead_index(rac: ractl);
491	pgoff_t limit = (i_size_read(inode: mapping->host) - `1`) >> PAGE_SHIFT;
492	pgoff_t mark = index + ra->size - ra->async_size;
493	int err = `0`;
494	gfp_t gfp = readahead_gfp_mask(x: mapping);
495
496	if (!mapping_large_folio_support(mapping) \|\| ra->size < `4`)
497	goto fallback;
498
499	limit = min(limit, index + ra->size - `1`);
500
501	if (new_order < MAX_PAGECACHE_ORDER) {
502	new_order += `2`;
503	if (new_order > MAX_PAGECACHE_ORDER)
504	new_order = MAX_PAGECACHE_ORDER;
505	while ((`1` << new_order) > ra->size)
506	new_order--;
507	}
508
509	filemap_invalidate_lock_shared(mapping);
510	while (index <= limit) {
511	unsigned int order = new_order;
512
513	/ Align with smaller pages if needed /
514	if (index & ((`1UL` << order) - `1`)) {
515	order = __ffs(index);
516	if (order == `1`)
517	order = `0`;
518	}
519	/ Don't allocate pages past EOF /
520	while (index + (`1UL` << order) - `1` > limit) {
521	if (--order == `1`)
522	order = `0`;
523	}
524	err = ra_alloc_folio(ractl, index, mark, order, gfp);
525	if (err)
526	break;
527	index += `1UL` << order;
528	}
529
530	if (index > limit) {
531	ra->size += index - limit - `1`;
532	ra->async_size += index - limit - `1`;
533	}
534
535	read_pages(rac: ractl);
536	filemap_invalidate_unlock_shared(mapping);
537
538	/*
539	* If there were already pages in the page cache, then we may have
540	* left some gaps. Let the regular readahead code take care of this
541	* situation.
542	*/
543	if (!err)
544	return;
545	fallback:
546	do_page_cache_ra(ractl, nr_to_read: ra->size, lookahead_size: ra->async_size);
547	}
548
549	/*
550	* A minimal readahead algorithm for trivial sequential/random reads.
551	*/
552	static void ondemand_readahead(struct readahead_control *ractl,
553	struct folio folio, unsigned* long req_size)
554	{
555	struct backing_dev_info *bdi = inode_to_bdi(inode: ractl->mapping->host);
556	struct file_ra_state *ra = ractl->ra;
557	unsigned long max_pages = ra->ra_pages;
558	unsigned long add_pages;
559	pgoff_t index = readahead_index(rac: ractl);
560	pgoff_t expected, prev_index;
561	unsigned int order = folio ? folio_order(folio) : `0`;
562
563	/*
564	* If the request exceeds the readahead window, allow the read to
565	* be up to the optimal hardware IO size
566	*/
567	if (req_size > max_pages && bdi->io_pages > max_pages)
568	max_pages = min(req_size, bdi->io_pages);
569
570	/*
571	* start of file
572	*/
573	if (!index)
574	goto initial_readahead;
575
576	/*
577	* It's the expected callback index, assume sequential access.
578	* Ramp up sizes, and push forward the readahead window.
579	*/
580	expected = round_up(ra->start + ra->size - ra->async_size,
581	`1UL` << order);
582	if (index == expected \|\| index == (ra->start + ra->size)) {
583	ra->start += ra->size;
584	ra->size = get_next_ra_size(ra, max: max_pages);
585	ra->async_size = ra->size;
586	goto readit;
587	}
588
589	/*
590	* Hit a marked folio without valid readahead state.
591	* E.g. interleaved reads.
592	* Query the pagecache for async_size, which normally equals to
593	* readahead size. Ramp it up and use it as the new readahead size.
594	*/
595	if (folio) {
596	pgoff_t start;
597
598	rcu_read_lock();
599	start = page_cache_next_miss(mapping: ractl->mapping, index: index + `1`,
600	max_scan: max_pages);
601	rcu_read_unlock();
602
603	if (!start \|\| start - index > max_pages)
604	return;
605
606	ra->start = start;
607	ra->size = start - index; / old async_size /
608	ra->size += req_size;
609	ra->size = get_next_ra_size(ra, max: max_pages);
610	ra->async_size = ra->size;
611	goto readit;
612	}
613
614	/*
615	* oversize read
616	*/
617	if (req_size > max_pages)
618	goto initial_readahead;
619
620	/*
621	* sequential cache miss
622	* trivial case: (index - prev_index) == 1
623	* unaligned reads: (index - prev_index) == 0
624	*/
625	prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
626	if (index - prev_index <= `1UL`)
627	goto initial_readahead;
628
629	/*
630	* Query the page cache and look for the traces(cached history pages)
631	* that a sequential stream would leave behind.
632	*/
633	if (try_context_readahead(mapping: ractl->mapping, ra, index, req_size,
634	max: max_pages))
635	goto readit;
636
637	/*
638	* standalone, small random read
639	* Read as is, and do not pollute the readahead state.
640	*/
641	do_page_cache_ra(ractl, nr_to_read: req_size, lookahead_size: `0`);
642	return;
643
644	initial_readahead:
645	ra->start = index;
646	ra->size = get_init_ra_size(size: req_size, max: max_pages);
647	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
648
649	readit:
650	/*
651	* Will this read hit the readahead marker made by itself?
652	* If so, trigger the readahead marker hit now, and merge
653	* the resulted next readahead window into the current one.
654	* Take care of maximum IO pages as above.
655	*/
656	if (index == ra->start && ra->size == ra->async_size) {
657	add_pages = get_next_ra_size(ra, max: max_pages);
658	if (ra->size + add_pages <= max_pages) {
659	ra->async_size = add_pages;
660	ra->size += add_pages;
661	} else {
662	ra->size = max_pages;
663	ra->async_size = max_pages >> `1`;
664	}
665	}
666
667	ractl->_index = ra->start;
668	page_cache_ra_order(ractl, ra, new_order: order);
669	}
670
671	void page_cache_sync_ra(struct readahead_control *ractl,
672	unsigned long req_count)
673	{
674	bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
675
676	/*
677	* Even if readahead is disabled, issue this request as readahead
678	* as we'll need it to satisfy the requested range. The forced
679	* readahead will do the right thing and limit the read to just the
680	* requested range, which we'll set to 1 page for this case.
681	*/
682	if (!ractl->ra->ra_pages \|\| blk_cgroup_congested()) {
683	if (!ractl->file)
684	return;
685	req_count = `1`;
686	do_forced_ra = true;
687	}
688
689	/ be dumb /
690	if (do_forced_ra) {
691	force_page_cache_ra(ractl, nr_to_read: req_count);
692	return;
693	}
694
695	ondemand_readahead(ractl, NULL, req_size: req_count);
696	}
697	EXPORT_SYMBOL_GPL(page_cache_sync_ra);
698
699	void page_cache_async_ra(struct readahead_control *ractl,
700	struct folio folio, unsigned* long req_count)
701	{
702	/ no readahead /
703	if (!ractl->ra->ra_pages)
704	return;
705
706	/*
707	* Same bit is used for PG_readahead and PG_reclaim.
708	*/
709	if (folio_test_writeback(folio))
710	return;
711
712	folio_clear_readahead(folio);
713
714	if (blk_cgroup_congested())
715	return;
716
717	ondemand_readahead(ractl, folio, req_size: req_count);
718	}
719	EXPORT_SYMBOL_GPL(page_cache_async_ra);
720
721	ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
722	{
723	ssize_t ret;
724	struct fd f;
725
726	ret = -EBADF;
727	f = fdget(fd);
728	if (!f.file \|\| !(f.file->f_mode & FMODE_READ))
729	goto out;
730
731	/*
732	* The readahead() syscall is intended to run only on files
733	* that can execute readahead. If readahead is not possible
734	* on this file, then we must return -EINVAL.
735	*/
736	ret = -EINVAL;
737	if (!f.file->f_mapping \|\| !f.file->f_mapping->a_ops \|\|
738	(!S_ISREG(file_inode(f.file)->i_mode) &&
739	!S_ISBLK(file_inode(f.file)->i_mode)))
740	goto out;
741
742	ret = vfs_fadvise(file: f.file, offset, len: count, POSIX_FADV_WILLNEED);
743	out:
744	fdput(fd: f);
745	return ret;
746	}
747
748	SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
749	{
750	return ksys_readahead(fd, offset, count);
751	}
752
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD)
/*
 * Compat readahead() entry point: the 64-bit offset arrives as a pair of
 * arguments and is glued back together before calling ksys_readahead().
 */
COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count)
{
	return ksys_readahead(fd, compat_arg_u64_glue(offset), count);
}
#endif
759
760	/**
761	* readahead_expand - Expand a readahead request
762	* @ractl: The request to be expanded
763	* @new_start: The revised start
764	* @new_len: The revised size of the request
765	*
766	* Attempt to expand a readahead request outwards from the current size to the
767	* specified size by inserting locked pages before and after the current window
768	* to increase the size to the new window. This may involve the insertion of
769	* THPs, in which case the window may get expanded even beyond what was
770	* requested.
771	*
772	* The algorithm will stop if it encounters a conflicting page already in the
773	* pagecache and leave a smaller expansion than requested.
774	*
775	* The caller must check for this by examining the revised @ractl object for a
776	* different expansion than was requested.
777	*/
778	void readahead_expand(struct readahead_control *ractl,
779	loff_t new_start, size_t new_len)
780	{
781	struct address_space *mapping = ractl->mapping;
782	struct file_ra_state *ra = ractl->ra;
783	pgoff_t new_index, new_nr_pages;
784	gfp_t gfp_mask = readahead_gfp_mask(x: mapping);
785
786	new_index = new_start / PAGE_SIZE;
787
788	/ Expand the leading edge downwards /
789	while (ractl->_index > new_index) {
790	unsigned long index = ractl->_index - `1`;
791	struct folio *folio = xa_load(&mapping->i_pages, index);
792
793	if (folio && !xa_is_value(entry: folio))
794	return; / Folio apparently present /
795
796	folio = filemap_alloc_folio(gfp: gfp_mask, order: `0`);
797	if (!folio)
798	return;
799	if (filemap_add_folio(mapping, folio, index, gfp: gfp_mask) < `0`) {
800	folio_put(folio);
801	return;
802	}
803	if (unlikely(folio_test_workingset(folio)) &&
804	!ractl->_workingset) {
805	ractl->_workingset = true;
806	psi_memstall_enter(flags: &ractl->_pflags);
807	}
808	ractl->_nr_pages++;
809	ractl->_index = folio->index;
810	}
811
812	new_len += new_start - readahead_pos(rac: ractl);
813	new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE);
814
815	/ Expand the trailing edge upwards /
816	while (ractl->_nr_pages < new_nr_pages) {
817	unsigned long index = ractl->_index + ractl->_nr_pages;
818	struct folio *folio = xa_load(&mapping->i_pages, index);
819
820	if (folio && !xa_is_value(entry: folio))
821	return; / Folio apparently present /
822
823	folio = filemap_alloc_folio(gfp: gfp_mask, order: `0`);
824	if (!folio)
825	return;
826	if (filemap_add_folio(mapping, folio, index, gfp: gfp_mask) < `0`) {
827	folio_put(folio);
828	return;
829	}
830	if (unlikely(folio_test_workingset(folio)) &&
831	!ractl->_workingset) {
832	ractl->_workingset = true;
833	psi_memstall_enter(flags: &ractl->_pflags);
834	}
835	ractl->_nr_pages++;
836	if (ra) {
837	ra->size++;
838	ra->async_size++;
839	}
840	}
841	}
842	EXPORT_SYMBOL(readahead_expand);
843

source code of linux/mm/readahead.c