1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Copyright (c) 2023-2024 Oracle. All Rights Reserved. |
4 | * Author: Darrick J. Wong <djwong@kernel.org> |
5 | */ |
6 | #include "xfs.h" |
7 | #include "xfs_fs.h" |
8 | #include "xfs_buf.h" |
9 | #include "xfs_buf_mem.h" |
10 | #include "xfs_trace.h" |
11 | #include <linux/shmem_fs.h> |
12 | #include "xfs_log_format.h" |
13 | #include "xfs_trans.h" |
14 | #include "xfs_buf_item.h" |
15 | #include "xfs_error.h" |
16 | |
17 | /* |
18 | * Buffer Cache for In-Memory Files |
19 | * ================================ |
20 | * |
21 | * Online fsck wants to create ephemeral ordered recordsets. The existing |
22 | * btree infrastructure can do this, but we need the buffer cache to target |
23 | * memory instead of block devices. |
24 | * |
25 | * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those |
26 | * requirements. Therefore, the xmbuf mechanism uses an unlinked shmem file to |
27 | * store our staging data. This file is not installed in the file descriptor |
28 | * table so that user programs cannot access the data, which means that the |
29 | * xmbuf must be freed with xmbuf_destroy. |
30 | * |
31 | * xmbufs assume that the caller will handle all required concurrency |
32 | * management; standard vfs locks (freezer and inode) are not taken. Reads |
33 | * and writes are satisfied directly from the page cache. |
34 | * |
35 | * The only supported block size is PAGE_SIZE, and we cannot use highmem. |
36 | */ |
37 | |
/*
 * shmem files used to back an in-memory buffer cache must not be exposed to
 * userspace. Upper layers must coordinate access to the one handle returned
 * by the constructor, so establish a separate lock class for xmbufs to avoid
 * confusing lockdep.
 */
static struct lock_class_key xmbuf_i_mutex_key;
45 | |
46 | /* |
47 | * Allocate a buffer cache target for a memory-backed file and set up the |
48 | * buffer target. |
49 | */ |
50 | int |
51 | xmbuf_alloc( |
52 | struct xfs_mount *mp, |
53 | const char *descr, |
54 | struct xfs_buftarg **btpp) |
55 | { |
56 | struct file *file; |
57 | struct inode *inode; |
58 | struct xfs_buftarg *btp; |
59 | int error; |
60 | |
61 | btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL); |
62 | if (!btp) |
63 | return -ENOMEM; |
64 | |
65 | file = shmem_kernel_file_setup(name: descr, size: 0, flags: 0); |
66 | if (IS_ERR(ptr: file)) { |
67 | error = PTR_ERR(ptr: file); |
68 | goto out_free_btp; |
69 | } |
70 | inode = file_inode(f: file); |
71 | |
72 | /* private file, private locking */ |
73 | lockdep_set_class(&inode->i_rwsem, &xmbuf_i_mutex_key); |
74 | |
75 | /* |
76 | * We don't want to bother with kmapping data during repair, so don't |
77 | * allow highmem pages to back this mapping. |
78 | */ |
79 | mapping_set_gfp_mask(m: inode->i_mapping, GFP_KERNEL); |
80 | |
81 | /* ensure all writes are below EOF to avoid pagecache zeroing */ |
82 | i_size_write(inode, i_size: inode->i_sb->s_maxbytes); |
83 | |
84 | error = xfs_buf_cache_init(bch: btp->bt_cache); |
85 | if (error) |
86 | goto out_file; |
87 | |
88 | /* Initialize buffer target */ |
89 | btp->bt_mount = mp; |
90 | btp->bt_dev = (dev_t)-1U; |
91 | btp->bt_bdev = NULL; /* in-memory buftargs have no bdev */ |
92 | btp->bt_file = file; |
93 | btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE; |
94 | btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1; |
95 | |
96 | error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr); |
97 | if (error) |
98 | goto out_bcache; |
99 | |
100 | trace_xmbuf_create(btp); |
101 | |
102 | *btpp = btp; |
103 | return 0; |
104 | |
105 | out_bcache: |
106 | xfs_buf_cache_destroy(bch: btp->bt_cache); |
107 | out_file: |
108 | fput(file); |
109 | out_free_btp: |
110 | kfree(objp: btp); |
111 | return error; |
112 | } |
113 | |
114 | /* Free a buffer cache target for a memory-backed buffer cache. */ |
115 | void |
116 | xmbuf_free( |
117 | struct xfs_buftarg *btp) |
118 | { |
119 | ASSERT(xfs_buftarg_is_mem(btp)); |
120 | ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); |
121 | |
122 | trace_xmbuf_free(btp); |
123 | |
124 | xfs_destroy_buftarg(btp); |
125 | xfs_buf_cache_destroy(bch: btp->bt_cache); |
126 | fput(btp->bt_file); |
127 | kfree(objp: btp); |
128 | } |
129 | |
130 | /* Directly map a shmem page into the buffer cache. */ |
131 | int |
132 | xmbuf_map_page( |
133 | struct xfs_buf *bp) |
134 | { |
135 | struct inode *inode = file_inode(f: bp->b_target->bt_file); |
136 | struct folio *folio = NULL; |
137 | struct page *page; |
138 | loff_t pos = BBTOB(xfs_buf_daddr(bp)); |
139 | int error; |
140 | |
141 | ASSERT(xfs_buftarg_is_mem(bp->b_target)); |
142 | |
143 | if (bp->b_map_count != 1) |
144 | return -ENOMEM; |
145 | if (BBTOB(bp->b_length) != XMBUF_BLOCKSIZE) |
146 | return -ENOMEM; |
147 | if (offset_in_page(pos) != 0) { |
148 | ASSERT(offset_in_page(pos)); |
149 | return -ENOMEM; |
150 | } |
151 | |
152 | error = shmem_get_folio(inode, index: pos >> PAGE_SHIFT, foliop: &folio, sgp: SGP_CACHE); |
153 | if (error) |
154 | return error; |
155 | |
156 | if (filemap_check_wb_err(mapping: inode->i_mapping, since: 0)) { |
157 | folio_unlock(folio); |
158 | folio_put(folio); |
159 | return -EIO; |
160 | } |
161 | |
162 | page = folio_file_page(folio, index: pos >> PAGE_SHIFT); |
163 | |
164 | /* |
165 | * Mark the page dirty so that it won't be reclaimed once we drop the |
166 | * (potentially last) reference in xmbuf_unmap_page. |
167 | */ |
168 | set_page_dirty(page); |
169 | unlock_page(page); |
170 | |
171 | bp->b_addr = page_address(page); |
172 | bp->b_pages = bp->b_page_array; |
173 | bp->b_pages[0] = page; |
174 | bp->b_page_count = 1; |
175 | return 0; |
176 | } |
177 | |
178 | /* Unmap a shmem page that was mapped into the buffer cache. */ |
179 | void |
180 | xmbuf_unmap_page( |
181 | struct xfs_buf *bp) |
182 | { |
183 | struct page *page = bp->b_pages[0]; |
184 | |
185 | ASSERT(xfs_buftarg_is_mem(bp->b_target)); |
186 | |
187 | put_page(page); |
188 | |
189 | bp->b_addr = NULL; |
190 | bp->b_pages[0] = NULL; |
191 | bp->b_pages = NULL; |
192 | bp->b_page_count = 0; |
193 | } |
194 | |
195 | /* Is this a valid daddr within the buftarg? */ |
196 | bool |
197 | xmbuf_verify_daddr( |
198 | struct xfs_buftarg *btp, |
199 | xfs_daddr_t daddr) |
200 | { |
201 | struct inode *inode = file_inode(f: btp->bt_file); |
202 | |
203 | ASSERT(xfs_buftarg_is_mem(btp)); |
204 | |
205 | return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT); |
206 | } |
207 | |
208 | /* Discard the page backing this buffer. */ |
209 | static void |
210 | xmbuf_stale( |
211 | struct xfs_buf *bp) |
212 | { |
213 | struct inode *inode = file_inode(f: bp->b_target->bt_file); |
214 | loff_t pos; |
215 | |
216 | ASSERT(xfs_buftarg_is_mem(bp->b_target)); |
217 | |
218 | pos = BBTOB(xfs_buf_daddr(bp)); |
219 | shmem_truncate_range(inode, start: pos, end: pos + BBTOB(bp->b_length) - 1); |
220 | } |
221 | |
222 | /* |
223 | * Finalize a buffer -- discard the backing page if it's stale, or run the |
224 | * write verifier to detect problems. |
225 | */ |
226 | int |
227 | xmbuf_finalize( |
228 | struct xfs_buf *bp) |
229 | { |
230 | xfs_failaddr_t fa; |
231 | int error = 0; |
232 | |
233 | if (bp->b_flags & XBF_STALE) { |
234 | xmbuf_stale(bp); |
235 | return 0; |
236 | } |
237 | |
238 | /* |
239 | * Although this btree is ephemeral, validate the buffer structure so |
240 | * that we can detect memory corruption errors and software bugs. |
241 | */ |
242 | fa = bp->b_ops->verify_struct(bp); |
243 | if (fa) { |
244 | error = -EFSCORRUPTED; |
245 | xfs_verifier_error(bp, error, fa); |
246 | } |
247 | |
248 | return error; |
249 | } |
250 | |
251 | /* |
252 | * Detach this xmbuf buffer from the transaction by any means necessary. |
253 | * All buffers are direct-mapped, so they do not need bwrite. |
254 | */ |
255 | void |
256 | xmbuf_trans_bdetach( |
257 | struct xfs_trans *tp, |
258 | struct xfs_buf *bp) |
259 | { |
260 | struct xfs_buf_log_item *bli = bp->b_log_item; |
261 | |
262 | ASSERT(bli != NULL); |
263 | |
264 | bli->bli_flags &= ~(XFS_BLI_DIRTY | XFS_BLI_ORDERED | |
265 | XFS_BLI_LOGGED | XFS_BLI_STALE); |
266 | clear_bit(XFS_LI_DIRTY, addr: &bli->bli_item.li_flags); |
267 | |
268 | while (bp->b_log_item != NULL) |
269 | xfs_trans_bdetach(tp, bp); |
270 | } |
271 | |