1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Copyright (c) 2014 Christoph Hellwig. |
4 | */ |
5 | #include "xfs.h" |
6 | #include "xfs_shared.h" |
7 | #include "xfs_format.h" |
8 | #include "xfs_log_format.h" |
9 | #include "xfs_trans_resv.h" |
10 | #include "xfs_mount.h" |
11 | #include "xfs_inode.h" |
12 | #include "xfs_trans.h" |
13 | #include "xfs_bmap.h" |
14 | #include "xfs_iomap.h" |
15 | #include "xfs_pnfs.h" |
16 | |
17 | /* |
18 | * Ensure that we do not have any outstanding pNFS layouts that can be used by |
19 | * clients to directly read from or write to this inode. This must be called |
20 | * before every operation that can remove blocks from the extent map. |
21 | * Additionally we call it during the write operation, where aren't concerned |
22 | * about exposing unallocated blocks but just want to provide basic |
23 | * synchronization between a local writer and pNFS clients. mmap writes would |
24 | * also benefit from this sort of synchronization, but due to the tricky locking |
25 | * rules in the page fault path we don't bother. |
26 | */ |
27 | int |
28 | xfs_break_leased_layouts( |
29 | struct inode *inode, |
30 | uint *iolock, |
31 | bool *did_unlock) |
32 | { |
33 | struct xfs_inode *ip = XFS_I(inode); |
34 | int error; |
35 | |
36 | while ((error = break_layout(inode, wait: false)) == -EWOULDBLOCK) { |
37 | xfs_iunlock(ip, *iolock); |
38 | *did_unlock = true; |
39 | error = break_layout(inode, wait: true); |
40 | *iolock &= ~XFS_IOLOCK_SHARED; |
41 | *iolock |= XFS_IOLOCK_EXCL; |
42 | xfs_ilock(ip, *iolock); |
43 | } |
44 | |
45 | return error; |
46 | } |
47 | |
48 | /* |
49 | * Get a unique ID including its location so that the client can identify |
50 | * the exported device. |
51 | */ |
52 | int |
53 | xfs_fs_get_uuid( |
54 | struct super_block *sb, |
55 | u8 *buf, |
56 | u32 *len, |
57 | u64 *offset) |
58 | { |
59 | struct xfs_mount *mp = XFS_M(sb); |
60 | |
61 | xfs_notice_once(mp, |
62 | "Using experimental pNFS feature, use at your own risk!" ); |
63 | |
64 | if (*len < sizeof(uuid_t)) |
65 | return -EINVAL; |
66 | |
67 | memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t)); |
68 | *len = sizeof(uuid_t); |
69 | *offset = offsetof(struct xfs_dsb, sb_uuid); |
70 | return 0; |
71 | } |
72 | |
73 | /* |
74 | * We cannot use file based VFS helpers such as file_modified() to update |
75 | * inode state as we modify the data/metadata in the inode here. Hence we have |
76 | * to open code the timestamp updates and SUID/SGID stripping. We also need |
77 | * to set the inode prealloc flag to ensure that the extents we allocate are not |
78 | * removed if the inode is reclaimed from memory before xfs_fs_block_commit() |
79 | * is from the client to indicate that data has been written and the file size |
80 | * can be extended. |
81 | */ |
82 | static int |
83 | xfs_fs_map_update_inode( |
84 | struct xfs_inode *ip) |
85 | { |
86 | struct xfs_trans *tp; |
87 | int error; |
88 | |
89 | error = xfs_trans_alloc(mp: ip->i_mount, resp: &M_RES(ip->i_mount)->tr_writeid, |
90 | blocks: 0, rtextents: 0, flags: 0, tpp: &tp); |
91 | if (error) |
92 | return error; |
93 | |
94 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
95 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); |
96 | |
97 | VFS_I(ip)->i_mode &= ~S_ISUID; |
98 | if (VFS_I(ip)->i_mode & S_IXGRP) |
99 | VFS_I(ip)->i_mode &= ~S_ISGID; |
100 | xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
101 | ip->i_diflags |= XFS_DIFLAG_PREALLOC; |
102 | |
103 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
104 | return xfs_trans_commit(tp); |
105 | } |
106 | |
107 | /* |
108 | * Get a layout for the pNFS client. |
109 | */ |
110 | int |
111 | xfs_fs_map_blocks( |
112 | struct inode *inode, |
113 | loff_t offset, |
114 | u64 length, |
115 | struct iomap *iomap, |
116 | bool write, |
117 | u32 *device_generation) |
118 | { |
119 | struct xfs_inode *ip = XFS_I(inode); |
120 | struct xfs_mount *mp = ip->i_mount; |
121 | struct xfs_bmbt_irec imap; |
122 | xfs_fileoff_t offset_fsb, end_fsb; |
123 | loff_t limit; |
124 | int bmapi_flags = XFS_BMAPI_ENTIRE; |
125 | int nimaps = 1; |
126 | uint lock_flags; |
127 | int error = 0; |
128 | u64 seq; |
129 | |
130 | if (xfs_is_shutdown(mp)) |
131 | return -EIO; |
132 | |
133 | /* |
134 | * We can't export inodes residing on the realtime device. The realtime |
135 | * device doesn't have a UUID to identify it, so the client has no way |
136 | * to find it. |
137 | */ |
138 | if (XFS_IS_REALTIME_INODE(ip)) |
139 | return -ENXIO; |
140 | |
141 | /* |
142 | * The pNFS block layout spec actually supports reflink like |
143 | * functionality, but the Linux pNFS server doesn't implement it yet. |
144 | */ |
145 | if (xfs_is_reflink_inode(ip)) |
146 | return -ENXIO; |
147 | |
148 | /* |
149 | * Lock out any other I/O before we flush and invalidate the pagecache, |
150 | * and then hand out a layout to the remote system. This is very |
151 | * similar to direct I/O, except that the synchronization is much more |
152 | * complicated. See the comment near xfs_break_leased_layouts |
153 | * for a detailed explanation. |
154 | */ |
155 | xfs_ilock(ip, XFS_IOLOCK_EXCL); |
156 | |
157 | error = -EINVAL; |
158 | limit = mp->m_super->s_maxbytes; |
159 | if (!write) |
160 | limit = max(limit, round_up(i_size_read(inode), |
161 | inode->i_sb->s_blocksize)); |
162 | if (offset > limit) |
163 | goto out_unlock; |
164 | if (offset > limit - length) |
165 | length = limit - offset; |
166 | |
167 | error = filemap_write_and_wait(mapping: inode->i_mapping); |
168 | if (error) |
169 | goto out_unlock; |
170 | error = invalidate_inode_pages2(mapping: inode->i_mapping); |
171 | if (WARN_ON_ONCE(error)) |
172 | goto out_unlock; |
173 | |
174 | end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length); |
175 | offset_fsb = XFS_B_TO_FSBT(mp, offset); |
176 | |
177 | lock_flags = xfs_ilock_data_map_shared(ip); |
178 | error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, |
179 | &imap, &nimaps, bmapi_flags); |
180 | seq = xfs_iomap_inode_sequence(ip, iomap_flags: 0); |
181 | |
182 | ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK); |
183 | |
184 | if (!error && write && |
185 | (!nimaps || imap.br_startblock == HOLESTARTBLOCK)) { |
186 | if (offset + length > XFS_ISIZE(ip)) |
187 | end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb); |
188 | else if (nimaps && imap.br_startblock == HOLESTARTBLOCK) |
189 | end_fsb = min(end_fsb, imap.br_startoff + |
190 | imap.br_blockcount); |
191 | xfs_iunlock(ip, lock_flags); |
192 | |
193 | error = xfs_iomap_write_direct(ip, offset_fsb, |
194 | end_fsb - offset_fsb, 0, &imap, &seq); |
195 | if (error) |
196 | goto out_unlock; |
197 | |
198 | /* |
199 | * Ensure the next transaction is committed synchronously so |
200 | * that the blocks allocated and handed out to the client are |
201 | * guaranteed to be present even after a server crash. |
202 | */ |
203 | error = xfs_fs_map_update_inode(ip); |
204 | if (!error) |
205 | error = xfs_log_force_inode(ip); |
206 | if (error) |
207 | goto out_unlock; |
208 | |
209 | } else { |
210 | xfs_iunlock(ip, lock_flags); |
211 | } |
212 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); |
213 | |
214 | error = xfs_bmbt_to_iomap(ip, iomap, imap: &imap, mapping_flags: 0, iomap_flags: 0, sequence_cookie: seq); |
215 | *device_generation = mp->m_generation; |
216 | return error; |
217 | out_unlock: |
218 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); |
219 | return error; |
220 | } |
221 | |
222 | /* |
223 | * Ensure the size update falls into a valid allocated block. |
224 | */ |
225 | static int |
226 | xfs_pnfs_validate_isize( |
227 | struct xfs_inode *ip, |
228 | xfs_off_t isize) |
229 | { |
230 | struct xfs_bmbt_irec imap; |
231 | int nimaps = 1; |
232 | int error = 0; |
233 | |
234 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
235 | error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1, |
236 | &imap, &nimaps, 0); |
237 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
238 | if (error) |
239 | return error; |
240 | |
241 | if (imap.br_startblock == HOLESTARTBLOCK || |
242 | imap.br_startblock == DELAYSTARTBLOCK || |
243 | imap.br_state == XFS_EXT_UNWRITTEN) |
244 | return -EIO; |
245 | return 0; |
246 | } |
247 | |
248 | /* |
249 | * Make sure the blocks described by maps are stable on disk. This includes |
250 | * converting any unwritten extents, flushing the disk cache and updating the |
251 | * time stamps. |
252 | * |
253 | * Note that we rely on the caller to always send us a timestamp update so that |
254 | * we always commit a transaction here. If that stops being true we will have |
255 | * to manually flush the cache here similar to what the fsync code path does |
256 | * for datasyncs on files that have no dirty metadata. |
257 | */ |
258 | int |
259 | xfs_fs_commit_blocks( |
260 | struct inode *inode, |
261 | struct iomap *maps, |
262 | int nr_maps, |
263 | struct iattr *iattr) |
264 | { |
265 | struct xfs_inode *ip = XFS_I(inode); |
266 | struct xfs_mount *mp = ip->i_mount; |
267 | struct xfs_trans *tp; |
268 | bool update_isize = false; |
269 | int error, i; |
270 | loff_t size; |
271 | |
272 | ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)); |
273 | |
274 | xfs_ilock(ip, XFS_IOLOCK_EXCL); |
275 | |
276 | size = i_size_read(inode); |
277 | if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) { |
278 | update_isize = true; |
279 | size = iattr->ia_size; |
280 | } |
281 | |
282 | for (i = 0; i < nr_maps; i++) { |
283 | u64 start, length, end; |
284 | |
285 | start = maps[i].offset; |
286 | if (start > size) |
287 | continue; |
288 | |
289 | end = start + maps[i].length; |
290 | if (end > size) |
291 | end = size; |
292 | |
293 | length = end - start; |
294 | if (!length) |
295 | continue; |
296 | |
297 | /* |
298 | * Make sure reads through the pagecache see the new data. |
299 | */ |
300 | error = invalidate_inode_pages2_range(mapping: inode->i_mapping, |
301 | start: start >> PAGE_SHIFT, |
302 | end: (end - 1) >> PAGE_SHIFT); |
303 | WARN_ON_ONCE(error); |
304 | |
305 | error = xfs_iomap_write_unwritten(ip, start, length, false); |
306 | if (error) |
307 | goto out_drop_iolock; |
308 | } |
309 | |
310 | if (update_isize) { |
311 | error = xfs_pnfs_validate_isize(ip, isize: size); |
312 | if (error) |
313 | goto out_drop_iolock; |
314 | } |
315 | |
316 | error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_ichange, blocks: 0, rtextents: 0, flags: 0, tpp: &tp); |
317 | if (error) |
318 | goto out_drop_iolock; |
319 | |
320 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
321 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); |
322 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
323 | |
324 | ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); |
325 | setattr_copy(&nop_mnt_idmap, inode, attr: iattr); |
326 | if (update_isize) { |
327 | i_size_write(inode, i_size: iattr->ia_size); |
328 | ip->i_disk_size = iattr->ia_size; |
329 | } |
330 | |
331 | xfs_trans_set_sync(tp); |
332 | error = xfs_trans_commit(tp); |
333 | |
334 | out_drop_iolock: |
335 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); |
336 | return error; |
337 | } |
338 | |