// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data. Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols. Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache. A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache. The client does not
 * correct unaligned requests from applications. All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files. Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 * 04 May 2005	support O_DIRECT with aio  --cel
 *
 */

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/module.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <linux/uaccess.h>
#include <linux/atomic.h>

#include "internal.h"
#include "iostat.h"
#include "pnfs.h"
#include "fscache.h"
#include "nfstrace.h"

#define NFSDBG_FACILITY		NFSDBG_VFS

static struct kmem_cache *nfs_direct_cachep;

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
static void nfs_direct_write_schedule_work(struct work_struct *work);

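/*
 * A dreq's io_count holds one reference per outstanding pgio header
 * plus one for the issuer; whoever drops it to zero (put_dreq()
 * returning true) is responsible for completing the request.
 */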
static inline void get_dreq(struct nfs_direct_req *dreq)
{
	atomic_inc(&dreq->io_count);
}

static inline int put_dreq(struct nfs_direct_req *dreq)
{
	return atomic_dec_and_test(&dreq->io_count);
}

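/*
 * Clamp the request's byte counts when a header reports an error or a
 * short transfer (EOF), and latch the first error seen.
 */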
static void
nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
			    const struct nfs_pgio_header *hdr,
			    ssize_t dreq_len)
{
	if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
	      test_bit(NFS_IOHDR_EOF, &hdr->flags)))
		return;
	if (dreq->max_count >= dreq_len) {
		dreq->max_count = dreq_len;
		if (dreq->count > dreq_len)
			dreq->count = dreq_len;
	}

	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error)
		dreq->error = hdr->error;
}

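/*
 * Fold a completed header into the request-wide byte count:
 * dreq->count records how far completed data extends past io_start,
 * clamped to dreq->max_count.
 */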
static void
nfs_direct_count_bytes(struct nfs_direct_req *dreq,
		       const struct nfs_pgio_header *hdr)
{
	loff_t hdr_end = hdr->io_start + hdr->good_bytes;
	ssize_t dreq_len = 0;

	if (hdr_end > dreq->io_start)
		dreq_len = hdr_end - dreq->io_start;

	nfs_direct_handle_truncated(dreq, hdr, dreq_len);

	if (dreq_len > dreq->max_count)
		dreq_len = dreq->max_count;

	if (dreq->count < dreq_len)
		dreq->count = dreq_len;
}

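/*
 * Truncate the direct request at the offset of @req; bytes from that
 * request onward no longer count as completed.
 */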
static void nfs_direct_truncate_request(struct nfs_direct_req *dreq,
					struct nfs_page *req)
{
	loff_t offs = req_offset(req);
	size_t req_start = (size_t)(offs - dreq->io_start);

	if (req_start < dreq->max_count)
		dreq->max_count = req_start;
	if (req_start < dreq->count)
		dreq->count = req_start;
}

/**
 * nfs_swap_rw - NFS address space operation for swap I/O
 * @iocb: target I/O control block
 * @iter: I/O buffer
 *
 * Perform IO to the swap-file. This is much like direct IO.
 */
int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t ret;

	VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);

	if (iov_iter_rw(iter) == READ)
		ret = nfs_file_direct_read(iocb, iter, true);
	else
		ret = nfs_file_direct_write(iocb, iter, true);
	if (ret < 0)
		return ret;
	return 0;
}

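/* Drop the page references taken by iov_iter_get_pages_alloc2(). */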
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
	unsigned int i;
	for (i = 0; i < npages; i++)
		put_page(pages[i]);
}

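/*
 * Set up an nfs_commit_info whose lists and completion ops belong to
 * this direct request, so commit completions are routed back to it.
 */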
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
			      struct nfs_direct_req *dreq)
{
	cinfo->inode = dreq->inode;
	cinfo->mds = &dreq->mds_cinfo;
	cinfo->ds = &dreq->ds_cinfo;
	cinfo->dreq = dreq;
	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
}

static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
	struct nfs_direct_req *dreq;

	dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
	kref_get(&dreq->kref);
	init_completion(&dreq->completion);
	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
	pnfs_init_ds_commit_info(&dreq->ds_cinfo);
	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
	spin_lock_init(&dreq->lock);

	return dreq;
}

static void nfs_direct_req_free(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);

	pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode);
	if (dreq->l_ctx != NULL)
		nfs_put_lock_context(dreq->l_ctx);
	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
	kmem_cache_free(nfs_direct_cachep, dreq);
}

static void nfs_direct_req_release(struct nfs_direct_req *dreq)
{
	kref_put(&dreq->kref, nfs_direct_req_free);
}

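/* Number of bytes remaining in the direct request at @offset. */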
ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset)
{
	loff_t start = offset - dreq->io_start;
	return dreq->max_count - start;
}
EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);

/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
	ssize_t result = -EIOCBQUEUED;

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

	result = wait_for_completion_killable(&dreq->completion);

	if (!result) {
		result = dreq->count;
		WARN_ON_ONCE(dreq->count < 0);
	}
	if (!result)
		result = dreq->error;

out:
	return (ssize_t) result;
}

/*
 * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
 * the iocb is still valid here if this is a synchronous request.
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
	struct inode *inode = dreq->inode;

	inode_dio_end(inode);

	if (dreq->iocb) {
		long res = (long) dreq->error;
		if (dreq->count != 0) {
			res = (long) dreq->count;
			WARN_ON_ONCE(dreq->count < 0);
		}
		dreq->iocb->ki_complete(dreq->iocb, res);
	}

	complete(&dreq->completion);

	nfs_direct_req_release(dreq);
}

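/*
 * Read-side pgio completion: account the bytes received, dirty any
 * user pages that now hold data, and release the page requests.
 */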
static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
{
	unsigned long bytes = 0;
	struct nfs_direct_req *dreq = hdr->dreq;

	spin_lock(&dreq->lock);
	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		spin_unlock(&dreq->lock);
		goto out_put;
	}

	nfs_direct_count_bytes(dreq, hdr);
	spin_unlock(&dreq->lock);

	while (!list_empty(&hdr->pages)) {
		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
		struct page *page = req->wb_page;

		if (!PageCompound(page) && bytes < hdr->good_bytes &&
		    (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY))
			set_page_dirty(page);
		bytes += req->wb_bytes;
		nfs_list_remove_request(req);
		nfs_release_request(req);
	}
out_put:
	if (put_dreq(dreq))
		nfs_direct_complete(dreq);
	hdr->release(hdr);
}

static void nfs_read_sync_pgio_error(struct list_head *head, int error)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_release_request(req);
	}
}

static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
{
	get_dreq(hdr->dreq);
}

static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
	.error_cleanup = nfs_read_sync_pgio_error,
	.init_hdr = nfs_direct_pgio_init,
	.completion = nfs_direct_read_completion,
};

/*
 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
 * operation. If nfs_readdata_alloc() or get_user_pages() fails,
 * bail and stop sending more reads. Read length accounting is
 * handled automatically by nfs_direct_read_result(). Otherwise, if
 * no requests have been sent, just return an error.
 */

static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
					      struct iov_iter *iter,
					      loff_t pos)
{
	struct nfs_pageio_descriptor desc;
	struct inode *inode = dreq->inode;
	ssize_t result = -EINVAL;
	size_t requested_bytes = 0;
	size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);

	nfs_pageio_init_read(&desc, dreq->inode, false,
			     &nfs_direct_read_completion_ops);
	get_dreq(dreq);
	desc.pg_dreq = dreq;
	inode_dio_begin(inode);

	while (iov_iter_count(iter)) {
		struct page **pagevec;
		size_t bytes;
		size_t pgbase;
		unsigned npages, i;

		result = iov_iter_get_pages_alloc2(iter, &pagevec,
						   rsize, &pgbase);
		if (result < 0)
			break;

		bytes = result;
		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
			/* XXX do we need to do the eof zeroing found in async_filler? */
			req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
							pgbase, pos, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}
			if (!nfs_pageio_add_request(&desc, req)) {
				result = desc.pg_error;
				nfs_release_request(req);
				break;
			}
			pgbase = 0;
			bytes -= req_len;
			requested_bytes += req_len;
			pos += req_len;
		}
		nfs_direct_release_pages(pagevec, npages);
		kvfree(pagevec);
		if (result < 0)
			break;
	}

	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		inode_dio_end(inode);
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	if (put_dreq(dreq))
		nfs_direct_complete(dreq);
	return requested_bytes;
}

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @iter: vector of user buffers into which to read data
 * @swap: flag indicating this is swap IO, not O_DIRECT IO
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file. For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
 * READ where the file size could change. Our preference is simply
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
 *
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
			     bool swap)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct nfs_direct_req *dreq;
	struct nfs_lock_context *l_ctx;
	ssize_t result, requested;
	size_t count = iov_iter_count(iter);
	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);

	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
		file, count, (long long) iocb->ki_pos);

	result = 0;
	if (!count)
		goto out;

	task_io_account_read(count);

	result = -ENOMEM;
	dreq = nfs_direct_req_alloc();
	if (dreq == NULL)
		goto out;

	dreq->inode = inode;
	dreq->max_count = count;
	dreq->io_start = iocb->ki_pos;
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	l_ctx = nfs_get_lock_context(dreq->ctx);
	if (IS_ERR(l_ctx)) {
		result = PTR_ERR(l_ctx);
		nfs_direct_req_release(dreq);
		goto out_release;
	}
	dreq->l_ctx = l_ctx;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	if (user_backed_iter(iter))
		dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;

	if (!swap)
		nfs_start_io_direct(inode);

	NFS_I(inode)->read_io += count;
	requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);

	if (!swap)
		nfs_end_io_direct(inode);

	if (requested > 0) {
		result = nfs_direct_wait(dreq);
		if (result > 0) {
			requested -= result;
			iocb->ki_pos += result;
		}
		iov_iter_revert(iter, requested);
	} else {
		result = requested;
	}

out_release:
	nfs_direct_req_release(dreq);
out:
	return result;
}

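/*
 * If @req is a subrequest, lock the group head and queue it on @list
 * for rejoining; two kref references are taken on the head, matching
 * the drops in the join and release paths.
 */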
static void nfs_direct_add_page_head(struct list_head *list,
				     struct nfs_page *req)
{
	struct nfs_page *head = req->wb_head;

	if (!list_empty(&head->wb_list) || !nfs_lock_request(head))
		return;
	if (!list_empty(&head->wb_list)) {
		nfs_unlock_request(head);
		return;
	}
	list_add(&head->wb_list, list);
	kref_get(&head->wb_kref);
	kref_get(&head->wb_kref);
}

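/*
 * Walk @list and merge each page group back into a single request via
 * nfs_join_page_group(), dropping subrequests from the list first.
 */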
static void nfs_direct_join_group(struct list_head *list,
				  struct nfs_commit_info *cinfo,
				  struct inode *inode)
{
	struct nfs_page *req, *subreq;

	list_for_each_entry(req, list, wb_list) {
		if (req->wb_head != req) {
			nfs_direct_add_page_head(&req->wb_list, req);
			continue;
		}
		subreq = req->wb_this_page;
		if (subreq == req)
			continue;
		do {
			/*
			 * Remove subrequests from this list before freeing
			 * them in the call to nfs_join_page_group().
			 */
			if (!list_empty(&subreq->wb_list)) {
				nfs_list_remove_request(subreq);
				nfs_release_request(subreq);
			}
		} while ((subreq = subreq->wb_this_page) != req);
		nfs_join_page_group(req, cinfo, inode);
	}
}

static void
nfs_direct_write_scan_commit_list(struct inode *inode,
				  struct list_head *list,
				  struct nfs_commit_info *cinfo)
{
	mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
	pnfs_recover_commit_reqs(list, cinfo);
	nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
	mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}

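/*
 * Resend all writes that are still on the commit lists, bumping each
 * request's retransmission count (wb_nio) along the way.
 */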
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
	struct nfs_pageio_descriptor desc;
	struct nfs_page *req;
	LIST_HEAD(reqs);
	struct nfs_commit_info cinfo;

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);

	nfs_direct_join_group(&reqs, &cinfo, dreq->inode);

	nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
	get_dreq(dreq);

	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;

	while (!list_empty(&reqs)) {
		req = nfs_list_entry(reqs.next);
		/* Bump the transmission count */
		req->wb_nio++;
		if (!nfs_pageio_add_request(&desc, req)) {
			spin_lock(&dreq->lock);
			if (dreq->error < 0) {
				desc.pg_error = dreq->error;
			} else if (desc.pg_error != -EAGAIN) {
				dreq->flags = 0;
				if (!desc.pg_error)
					desc.pg_error = -EIO;
				dreq->error = desc.pg_error;
			} else
				dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
			spin_unlock(&dreq->lock);
			break;
		}
		nfs_release_request(req);
	}
	nfs_pageio_complete(&desc);

	while (!list_empty(&reqs)) {
		req = nfs_list_entry(reqs.next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
		if (desc.pg_error == -EAGAIN) {
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		} else {
			spin_lock(&dreq->lock);
			nfs_direct_truncate_request(dreq, req);
			spin_unlock(&dreq->lock);
			nfs_release_request(req);
		}
	}

	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
}

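/*
 * COMMIT completion: a commit error fails the whole direct request;
 * otherwise any request whose write verifier no longer matches is
 * queued to be rewritten.
 */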
static void nfs_direct_commit_complete(struct nfs_commit_data *data)
{
	const struct nfs_writeverf *verf = data->res.verf;
	struct nfs_direct_req *dreq = data->dreq;
	struct nfs_commit_info cinfo;
	struct nfs_page *req;
	int status = data->task.tk_status;

	trace_nfs_direct_commit_complete(dreq);

	spin_lock(&dreq->lock);
	if (status < 0) {
		/* Errors in commit are fatal */
		dreq->error = status;
		dreq->flags = NFS_ODIRECT_DONE;
	} else {
		status = dreq->error;
	}
	spin_unlock(&dreq->lock);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);

	while (!list_empty(&data->pages)) {
		req = nfs_list_entry(data->pages.next);
		nfs_list_remove_request(req);
		if (status < 0) {
			spin_lock(&dreq->lock);
			nfs_direct_truncate_request(dreq, req);
			spin_unlock(&dreq->lock);
			nfs_release_request(req);
		} else if (!nfs_write_match_verf(verf, req)) {
			spin_lock(&dreq->lock);
			if (dreq->flags == 0)
				dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
			spin_unlock(&dreq->lock);
			/*
			 * Despite the reboot, the write was successful,
			 * so reset wb_nio.
			 */
			req->wb_nio = 0;
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		} else
			nfs_release_request(req);
		nfs_unlock_and_release_request(req);
	}

	if (nfs_commit_end(cinfo.mds))
		nfs_direct_write_complete(dreq);
}

static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
				     struct nfs_page *req)
{
	struct nfs_direct_req *dreq = cinfo->dreq;

	trace_nfs_direct_resched_write(dreq);

	spin_lock(&dreq->lock);
	if (dreq->flags != NFS_ODIRECT_DONE)
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	spin_unlock(&dreq->lock);
	nfs_mark_request_commit(req, NULL, cinfo, 0);
}

static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
	.completion = nfs_direct_commit_complete,
	.resched_write = nfs_direct_resched_write,
};

static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
	int res;
	struct nfs_commit_info cinfo;
	LIST_HEAD(mds_list);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_commit_begin(cinfo.mds);
	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
	if (res < 0) { /* res == -ENOMEM */
		spin_lock(&dreq->lock);
		if (dreq->flags == 0)
			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
		spin_unlock(&dreq->lock);
	}
	if (nfs_commit_end(cinfo.mds))
		nfs_direct_write_complete(dreq);
}

static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
{
	struct nfs_commit_info cinfo;
	struct nfs_page *req;
	LIST_HEAD(reqs);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);

	while (!list_empty(&reqs)) {
		req = nfs_list_entry(reqs.next);
		nfs_list_remove_request(req);
		nfs_direct_truncate_request(dreq, req);
		nfs_release_request(req);
		nfs_unlock_and_release_request(req);
	}
}

static void nfs_direct_write_schedule_work(struct work_struct *work)
{
	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
	int flags = dreq->flags;

	dreq->flags = 0;
	switch (flags) {
	case NFS_ODIRECT_DO_COMMIT:
		nfs_direct_commit_schedule(dreq);
		break;
	case NFS_ODIRECT_RESCHED_WRITES:
		nfs_direct_write_reschedule(dreq);
		break;
	default:
		nfs_direct_write_clear_reqs(dreq);
		nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
		nfs_direct_complete(dreq);
	}
}

static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
{
	trace_nfs_direct_write_complete(dreq);
	queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */
}

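/*
 * Write-side pgio completion: account the completed bytes, then either
 * queue each request for COMMIT (unstable writes) or release it.
 */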
static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
	struct nfs_direct_req *dreq = hdr->dreq;
	struct nfs_commit_info cinfo;
	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
	int flags = NFS_ODIRECT_DONE;

	trace_nfs_direct_write_completion(dreq);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);

	spin_lock(&dreq->lock);
	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		spin_unlock(&dreq->lock);
		goto out_put;
	}

	nfs_direct_count_bytes(dreq, hdr);
	if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) &&
	    !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
		if (!dreq->flags)
			dreq->flags = NFS_ODIRECT_DO_COMMIT;
		flags = dreq->flags;
	}
	spin_unlock(&dreq->lock);

	while (!list_empty(&hdr->pages)) {
		req = nfs_list_entry(hdr->pages.next);
		nfs_list_remove_request(req);
		if (flags == NFS_ODIRECT_DO_COMMIT) {
			kref_get(&req->wb_kref);
			memcpy(&req->wb_verf, &hdr->verf.verifier,
			       sizeof(req->wb_verf));
			nfs_mark_request_commit(req, hdr->lseg, &cinfo,
				hdr->ds_commit_idx);
		} else if (flags == NFS_ODIRECT_RESCHED_WRITES) {
			kref_get(&req->wb_kref);
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		}
		nfs_unlock_and_release_request(req);
	}

out_put:
	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
	hdr->release(hdr);
}

static void nfs_write_sync_pgio_error(struct list_head *head, int error)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
	}
}

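/*
 * The write needs to be retried: flag the dreq for rescheduling and
 * park this header's requests on the commit list, where
 * nfs_direct_write_reschedule() will pick them up.
 */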
static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
{
	struct nfs_direct_req *dreq = hdr->dreq;
	struct nfs_page *req;
	struct nfs_commit_info cinfo;

	trace_nfs_direct_write_reschedule_io(dreq);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	spin_lock(&dreq->lock);
	if (dreq->error == 0)
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	set_bit(NFS_IOHDR_REDO, &hdr->flags);
	spin_unlock(&dreq->lock);
	while (!list_empty(&hdr->pages)) {
		req = nfs_list_entry(hdr->pages.next);
		nfs_list_remove_request(req);
		nfs_unlock_request(req);
		nfs_mark_request_commit(req, NULL, &cinfo, 0);
	}
}

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
	.error_cleanup = nfs_write_sync_pgio_error,
	.init_hdr = nfs_direct_pgio_init,
	.completion = nfs_direct_write_completion,
	.reschedule_io = nfs_direct_write_reschedule_io,
};


/*
 * NB: Return the value of the first error return code. Subsequent
 *     errors after the first one are ignored.
 */
/*
 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
 * operation. If nfs_writedata_alloc() or get_user_pages() fails,
 * bail and stop sending more writes. Write length accounting is
 * handled automatically by nfs_direct_write_result(). Otherwise, if
 * no requests have been sent, just return an error.
 */
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
					       struct iov_iter *iter,
					       loff_t pos, int ioflags)
{
	struct nfs_pageio_descriptor desc;
	struct inode *inode = dreq->inode;
	struct nfs_commit_info cinfo;
	ssize_t result = 0;
	size_t requested_bytes = 0;
	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
	bool defer = false;

	trace_nfs_direct_write_schedule_iovec(dreq);

	nfs_pageio_init_write(&desc, inode, ioflags, false,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;
	get_dreq(dreq);
	inode_dio_begin(inode);

	NFS_I(inode)->write_io += iov_iter_count(iter);
	while (iov_iter_count(iter)) {
		struct page **pagevec;
		size_t bytes;
		size_t pgbase;
		unsigned npages, i;

		result = iov_iter_get_pages_alloc2(iter, &pagevec,
						   wsize, &pgbase);
		if (result < 0)
			break;

		bytes = result;
		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);

			req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
							pgbase, pos, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}

			if (desc.pg_error < 0) {
				nfs_free_request(req);
				result = desc.pg_error;
				break;
			}

			pgbase = 0;
			bytes -= req_len;
			requested_bytes += req_len;
			pos += req_len;

			if (defer) {
				nfs_mark_request_commit(req, NULL, &cinfo, 0);
				continue;
			}

			nfs_lock_request(req);
			if (nfs_pageio_add_request(&desc, req))
				continue;

			/* Exit on hard errors */
			if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) {
				result = desc.pg_error;
				nfs_unlock_and_release_request(req);
				break;
			}

			/* If the error is soft, defer remaining requests */
			nfs_init_cinfo_from_dreq(&cinfo, dreq);
			spin_lock(&dreq->lock);
			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
			spin_unlock(&dreq->lock);
			nfs_unlock_request(req);
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
			desc.pg_error = 0;
			defer = true;
		}
		nfs_direct_release_pages(pagevec, npages);
		kvfree(pagevec);
		if (result < 0)
			break;
	}
	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		inode_dio_end(inode);
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
	return requested_bytes;
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @iter: vector of user buffers from which to write data
 * @swap: flag indicating this is swap IO, not O_DIRECT IO
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size. The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache. We let the server do generic write
 * parameter checking and report problems.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
			      bool swap)
{
	ssize_t result, requested;
	size_t count;
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct nfs_direct_req *dreq;
	struct nfs_lock_context *l_ctx;
	loff_t pos, end;

	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
		file, iov_iter_count(iter), (long long) iocb->ki_pos);

	if (swap)
		/* bypass generic checks */
		result = iov_iter_count(iter);
	else
		result = generic_write_checks(iocb, iter);
	if (result <= 0)
		return result;
	count = result;
	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);

	pos = iocb->ki_pos;
	end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;

	task_io_account_write(count);

	result = -ENOMEM;
	dreq = nfs_direct_req_alloc();
	if (!dreq)
		goto out;

	dreq->inode = inode;
	dreq->max_count = count;
	dreq->io_start = pos;
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	l_ctx = nfs_get_lock_context(dreq->ctx);
	if (IS_ERR(l_ctx)) {
		result = PTR_ERR(l_ctx);
		nfs_direct_req_release(dreq);
		goto out_release;
	}
	dreq->l_ctx = l_ctx;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;
	pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);

	if (swap) {
		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
							    FLUSH_STABLE);
	} else {
		nfs_start_io_direct(inode);

		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
							    FLUSH_COND_STABLE);

		if (mapping->nrpages) {
			invalidate_inode_pages2_range(mapping,
						      pos >> PAGE_SHIFT, end);
		}

		nfs_end_io_direct(inode);
	}

	if (requested > 0) {
		result = nfs_direct_wait(dreq);
		if (result > 0) {
			requested -= result;
			iocb->ki_pos = pos + result;
			/* XXX: should check the generic_write_sync retval */
			generic_write_sync(iocb, result);
		}
		iov_iter_revert(iter, requested);
	} else {
		result = requested;
	}
	nfs_fscache_invalidate(inode, FSCACHE_INVAL_DIO_WRITE);
out_release:
	nfs_direct_req_release(dreq);
out:
	return result;
}

/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
int __init nfs_init_directcache(void)
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
					      sizeof(struct nfs_direct_req),
					      0, SLAB_RECLAIM_ACCOUNT,
					      NULL);
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
 *
 */
void nfs_destroy_directcache(void)
{
	kmem_cache_destroy(nfs_direct_cachep);
}