1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. |
4 | */ |
5 | |
6 | #include <linux/fs.h> |
7 | #include <linux/filelock.h> |
8 | #include <linux/miscdevice.h> |
9 | #include <linux/poll.h> |
10 | #include <linux/dlm.h> |
11 | #include <linux/dlm_plock.h> |
12 | #include <linux/slab.h> |
13 | |
14 | #include <trace/events/dlm.h> |
15 | |
16 | #include "dlm_internal.h" |
17 | #include "lockspace.h" |
18 | |
/* Serializes every access to send_list and recv_list. */
static DEFINE_SPINLOCK(ops_lock);

/* Requests queued by send_op() that dev_read() has not handed out yet. */
static LIST_HEAD(send_list);

/* Requests handed out by dev_read() that are waiting for a dev_write() reply. */
static LIST_HEAD(recv_list);

/* Woken by send_op(); dev_poll() registers on it. */
static DECLARE_WAIT_QUEUE_HEAD(send_wq);

/* Woken by dev_write(); synchronous requesters sleep here until op->done. */
static DECLARE_WAIT_QUEUE_HEAD(recv_wq);
24 | |
25 | struct plock_async_data { |
26 | void *fl; |
27 | void *file; |
28 | struct file_lock flc; |
29 | int (*callback)(struct file_lock *fl, int result); |
30 | }; |
31 | |
32 | struct plock_op { |
33 | struct list_head list; |
34 | int done; |
35 | struct dlm_plock_info info; |
36 | /* if set indicates async handling */ |
37 | struct plock_async_data *data; |
38 | }; |
39 | |
40 | static inline void set_version(struct dlm_plock_info *info) |
41 | { |
42 | info->version[0] = DLM_PLOCK_VERSION_MAJOR; |
43 | info->version[1] = DLM_PLOCK_VERSION_MINOR; |
44 | info->version[2] = DLM_PLOCK_VERSION_PATCH; |
45 | } |
46 | |
47 | static struct plock_op *plock_lookup_waiter(const struct dlm_plock_info *info) |
48 | { |
49 | struct plock_op *op = NULL, *iter; |
50 | |
51 | list_for_each_entry(iter, &recv_list, list) { |
52 | if (iter->info.fsid == info->fsid && |
53 | iter->info.number == info->number && |
54 | iter->info.owner == info->owner && |
55 | iter->info.pid == info->pid && |
56 | iter->info.start == info->start && |
57 | iter->info.end == info->end && |
58 | iter->info.ex == info->ex && |
59 | iter->info.wait) { |
60 | op = iter; |
61 | break; |
62 | } |
63 | } |
64 | |
65 | return op; |
66 | } |
67 | |
68 | static int check_version(struct dlm_plock_info *info) |
69 | { |
70 | if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) || |
71 | (DLM_PLOCK_VERSION_MINOR < info->version[1])) { |
72 | log_print("plock device version mismatch: " |
73 | "kernel (%u.%u.%u), user (%u.%u.%u)" , |
74 | DLM_PLOCK_VERSION_MAJOR, |
75 | DLM_PLOCK_VERSION_MINOR, |
76 | DLM_PLOCK_VERSION_PATCH, |
77 | info->version[0], |
78 | info->version[1], |
79 | info->version[2]); |
80 | return -EINVAL; |
81 | } |
82 | return 0; |
83 | } |
84 | |
85 | static void dlm_release_plock_op(struct plock_op *op) |
86 | { |
87 | kfree(objp: op->data); |
88 | kfree(objp: op); |
89 | } |
90 | |
91 | static void send_op(struct plock_op *op) |
92 | { |
93 | set_version(&op->info); |
94 | spin_lock(lock: &ops_lock); |
95 | list_add_tail(new: &op->list, head: &send_list); |
96 | spin_unlock(lock: &ops_lock); |
97 | wake_up(&send_wq); |
98 | } |
99 | |
100 | static int do_lock_cancel(const struct dlm_plock_info *orig_info) |
101 | { |
102 | struct plock_op *op; |
103 | int rv; |
104 | |
105 | op = kzalloc(size: sizeof(*op), GFP_NOFS); |
106 | if (!op) |
107 | return -ENOMEM; |
108 | |
109 | op->info = *orig_info; |
110 | op->info.optype = DLM_PLOCK_OP_CANCEL; |
111 | op->info.wait = 0; |
112 | |
113 | send_op(op); |
114 | wait_event(recv_wq, (op->done != 0)); |
115 | |
116 | rv = op->info.rv; |
117 | |
118 | dlm_release_plock_op(op); |
119 | return rv; |
120 | } |
121 | |
122 | int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, |
123 | int cmd, struct file_lock *fl) |
124 | { |
125 | struct plock_async_data *op_data; |
126 | struct dlm_ls *ls; |
127 | struct plock_op *op; |
128 | int rv; |
129 | |
130 | ls = dlm_find_lockspace_local(id: lockspace); |
131 | if (!ls) |
132 | return -EINVAL; |
133 | |
134 | op = kzalloc(size: sizeof(*op), GFP_NOFS); |
135 | if (!op) { |
136 | rv = -ENOMEM; |
137 | goto out; |
138 | } |
139 | |
140 | op->info.optype = DLM_PLOCK_OP_LOCK; |
141 | op->info.pid = fl->fl_pid; |
142 | op->info.ex = (fl->fl_type == F_WRLCK); |
143 | op->info.wait = IS_SETLKW(cmd); |
144 | op->info.fsid = ls->ls_global_id; |
145 | op->info.number = number; |
146 | op->info.start = fl->fl_start; |
147 | op->info.end = fl->fl_end; |
148 | /* async handling */ |
149 | if (fl->fl_lmops && fl->fl_lmops->lm_grant) { |
150 | op_data = kzalloc(size: sizeof(*op_data), GFP_NOFS); |
151 | if (!op_data) { |
152 | dlm_release_plock_op(op); |
153 | rv = -ENOMEM; |
154 | goto out; |
155 | } |
156 | |
157 | /* fl_owner is lockd which doesn't distinguish |
158 | processes on the nfs client */ |
159 | op->info.owner = (__u64) fl->fl_pid; |
160 | op_data->callback = fl->fl_lmops->lm_grant; |
161 | locks_init_lock(&op_data->flc); |
162 | locks_copy_lock(&op_data->flc, fl); |
163 | op_data->fl = fl; |
164 | op_data->file = file; |
165 | |
166 | op->data = op_data; |
167 | |
168 | send_op(op); |
169 | rv = FILE_LOCK_DEFERRED; |
170 | goto out; |
171 | } else { |
172 | op->info.owner = (__u64)(long) fl->fl_owner; |
173 | } |
174 | |
175 | send_op(op); |
176 | |
177 | if (op->info.wait) { |
178 | rv = wait_event_interruptible(recv_wq, (op->done != 0)); |
179 | if (rv == -ERESTARTSYS) { |
180 | spin_lock(lock: &ops_lock); |
181 | /* recheck under ops_lock if we got a done != 0, |
182 | * if so this interrupt case should be ignored |
183 | */ |
184 | if (op->done != 0) { |
185 | spin_unlock(lock: &ops_lock); |
186 | goto do_lock_wait; |
187 | } |
188 | spin_unlock(lock: &ops_lock); |
189 | |
190 | rv = do_lock_cancel(orig_info: &op->info); |
191 | switch (rv) { |
192 | case 0: |
193 | /* waiter was deleted in user space, answer will never come |
194 | * remove original request. The original request must be |
195 | * on recv_list because the answer of do_lock_cancel() |
196 | * synchronized it. |
197 | */ |
198 | spin_lock(lock: &ops_lock); |
199 | list_del(entry: &op->list); |
200 | spin_unlock(lock: &ops_lock); |
201 | rv = -EINTR; |
202 | break; |
203 | case -ENOENT: |
204 | /* cancellation wasn't successful but op should be done */ |
205 | fallthrough; |
206 | default: |
207 | /* internal error doing cancel we need to wait */ |
208 | goto wait; |
209 | } |
210 | |
211 | log_debug(ls, "%s: wait interrupted %x %llx pid %d" , |
212 | __func__, ls->ls_global_id, |
213 | (unsigned long long)number, op->info.pid); |
214 | dlm_release_plock_op(op); |
215 | goto out; |
216 | } |
217 | } else { |
218 | wait: |
219 | wait_event(recv_wq, (op->done != 0)); |
220 | } |
221 | |
222 | do_lock_wait: |
223 | |
224 | WARN_ON(!list_empty(&op->list)); |
225 | |
226 | rv = op->info.rv; |
227 | |
228 | if (!rv) { |
229 | if (locks_lock_file_wait(filp: file, fl) < 0) |
230 | log_error(ls, "dlm_posix_lock: vfs lock error %llx" , |
231 | (unsigned long long)number); |
232 | } |
233 | |
234 | dlm_release_plock_op(op); |
235 | out: |
236 | dlm_put_lockspace(ls); |
237 | return rv; |
238 | } |
239 | EXPORT_SYMBOL_GPL(dlm_posix_lock); |
240 | |
241 | /* Returns failure iff a successful lock operation should be canceled */ |
242 | static int dlm_plock_callback(struct plock_op *op) |
243 | { |
244 | struct plock_async_data *op_data = op->data; |
245 | struct file *file; |
246 | struct file_lock *fl; |
247 | struct file_lock *flc; |
248 | int (*notify)(struct file_lock *fl, int result) = NULL; |
249 | int rv = 0; |
250 | |
251 | WARN_ON(!list_empty(&op->list)); |
252 | |
253 | /* check if the following 2 are still valid or make a copy */ |
254 | file = op_data->file; |
255 | flc = &op_data->flc; |
256 | fl = op_data->fl; |
257 | notify = op_data->callback; |
258 | |
259 | if (op->info.rv) { |
260 | notify(fl, op->info.rv); |
261 | goto out; |
262 | } |
263 | |
264 | /* got fs lock; bookkeep locally as well: */ |
265 | flc->fl_flags &= ~FL_SLEEP; |
266 | if (posix_lock_file(file, flc, NULL)) { |
267 | /* |
268 | * This can only happen in the case of kmalloc() failure. |
269 | * The filesystem's own lock is the authoritative lock, |
270 | * so a failure to get the lock locally is not a disaster. |
271 | * As long as the fs cannot reliably cancel locks (especially |
272 | * in a low-memory situation), we're better off ignoring |
273 | * this failure than trying to recover. |
274 | */ |
275 | log_print("dlm_plock_callback: vfs lock error %llx file %p fl %p" , |
276 | (unsigned long long)op->info.number, file, fl); |
277 | } |
278 | |
279 | rv = notify(fl, 0); |
280 | if (rv) { |
281 | /* XXX: We need to cancel the fs lock here: */ |
282 | log_print("%s: lock granted after lock request failed; dangling lock!" , |
283 | __func__); |
284 | goto out; |
285 | } |
286 | |
287 | out: |
288 | dlm_release_plock_op(op); |
289 | return rv; |
290 | } |
291 | |
292 | int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, |
293 | struct file_lock *fl) |
294 | { |
295 | struct dlm_ls *ls; |
296 | struct plock_op *op; |
297 | int rv; |
298 | unsigned char fl_flags = fl->fl_flags; |
299 | |
300 | ls = dlm_find_lockspace_local(id: lockspace); |
301 | if (!ls) |
302 | return -EINVAL; |
303 | |
304 | op = kzalloc(size: sizeof(*op), GFP_NOFS); |
305 | if (!op) { |
306 | rv = -ENOMEM; |
307 | goto out; |
308 | } |
309 | |
310 | /* cause the vfs unlock to return ENOENT if lock is not found */ |
311 | fl->fl_flags |= FL_EXISTS; |
312 | |
313 | rv = locks_lock_file_wait(filp: file, fl); |
314 | if (rv == -ENOENT) { |
315 | rv = 0; |
316 | goto out_free; |
317 | } |
318 | if (rv < 0) { |
319 | log_error(ls, "dlm_posix_unlock: vfs unlock error %d %llx" , |
320 | rv, (unsigned long long)number); |
321 | } |
322 | |
323 | op->info.optype = DLM_PLOCK_OP_UNLOCK; |
324 | op->info.pid = fl->fl_pid; |
325 | op->info.fsid = ls->ls_global_id; |
326 | op->info.number = number; |
327 | op->info.start = fl->fl_start; |
328 | op->info.end = fl->fl_end; |
329 | if (fl->fl_lmops && fl->fl_lmops->lm_grant) |
330 | op->info.owner = (__u64) fl->fl_pid; |
331 | else |
332 | op->info.owner = (__u64)(long) fl->fl_owner; |
333 | |
334 | if (fl->fl_flags & FL_CLOSE) { |
335 | op->info.flags |= DLM_PLOCK_FL_CLOSE; |
336 | send_op(op); |
337 | rv = 0; |
338 | goto out; |
339 | } |
340 | |
341 | send_op(op); |
342 | wait_event(recv_wq, (op->done != 0)); |
343 | |
344 | WARN_ON(!list_empty(&op->list)); |
345 | |
346 | rv = op->info.rv; |
347 | |
348 | if (rv == -ENOENT) |
349 | rv = 0; |
350 | |
351 | out_free: |
352 | dlm_release_plock_op(op); |
353 | out: |
354 | dlm_put_lockspace(ls); |
355 | fl->fl_flags = fl_flags; |
356 | return rv; |
357 | } |
358 | EXPORT_SYMBOL_GPL(dlm_posix_unlock); |
359 | |
360 | /* |
361 | * NOTE: This implementation can only handle async lock requests as nfs |
362 | * do it. It cannot handle cancellation of a pending lock request sitting |
363 | * in wait_event(), but for now only nfs is the only user local kernel |
364 | * user. |
365 | */ |
366 | int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file, |
367 | struct file_lock *fl) |
368 | { |
369 | struct dlm_plock_info info; |
370 | struct plock_op *op; |
371 | struct dlm_ls *ls; |
372 | int rv; |
373 | |
374 | /* this only works for async request for now and nfs is the only |
375 | * kernel user right now. |
376 | */ |
377 | if (WARN_ON_ONCE(!fl->fl_lmops || !fl->fl_lmops->lm_grant)) |
378 | return -EOPNOTSUPP; |
379 | |
380 | ls = dlm_find_lockspace_local(id: lockspace); |
381 | if (!ls) |
382 | return -EINVAL; |
383 | |
384 | memset(&info, 0, sizeof(info)); |
385 | info.pid = fl->fl_pid; |
386 | info.ex = (fl->fl_type == F_WRLCK); |
387 | info.fsid = ls->ls_global_id; |
388 | dlm_put_lockspace(ls); |
389 | info.number = number; |
390 | info.start = fl->fl_start; |
391 | info.end = fl->fl_end; |
392 | info.owner = (__u64)fl->fl_pid; |
393 | |
394 | rv = do_lock_cancel(orig_info: &info); |
395 | switch (rv) { |
396 | case 0: |
397 | spin_lock(lock: &ops_lock); |
398 | /* lock request to cancel must be on recv_list because |
399 | * do_lock_cancel() synchronizes it. |
400 | */ |
401 | op = plock_lookup_waiter(info: &info); |
402 | if (WARN_ON_ONCE(!op)) { |
403 | spin_unlock(lock: &ops_lock); |
404 | rv = -ENOLCK; |
405 | break; |
406 | } |
407 | |
408 | list_del(entry: &op->list); |
409 | spin_unlock(lock: &ops_lock); |
410 | WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK); |
411 | op->data->callback(op->data->fl, -EINTR); |
412 | dlm_release_plock_op(op); |
413 | rv = -EINTR; |
414 | break; |
415 | case -ENOENT: |
416 | /* if cancel wasn't successful we probably were to late |
417 | * or it was a non-blocking lock request, so just unlock it. |
418 | */ |
419 | rv = dlm_posix_unlock(lockspace, number, file, fl); |
420 | break; |
421 | default: |
422 | break; |
423 | } |
424 | |
425 | return rv; |
426 | } |
427 | EXPORT_SYMBOL_GPL(dlm_posix_cancel); |
428 | |
429 | int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, |
430 | struct file_lock *fl) |
431 | { |
432 | struct dlm_ls *ls; |
433 | struct plock_op *op; |
434 | int rv; |
435 | |
436 | ls = dlm_find_lockspace_local(id: lockspace); |
437 | if (!ls) |
438 | return -EINVAL; |
439 | |
440 | op = kzalloc(size: sizeof(*op), GFP_NOFS); |
441 | if (!op) { |
442 | rv = -ENOMEM; |
443 | goto out; |
444 | } |
445 | |
446 | op->info.optype = DLM_PLOCK_OP_GET; |
447 | op->info.pid = fl->fl_pid; |
448 | op->info.ex = (fl->fl_type == F_WRLCK); |
449 | op->info.fsid = ls->ls_global_id; |
450 | op->info.number = number; |
451 | op->info.start = fl->fl_start; |
452 | op->info.end = fl->fl_end; |
453 | if (fl->fl_lmops && fl->fl_lmops->lm_grant) |
454 | op->info.owner = (__u64) fl->fl_pid; |
455 | else |
456 | op->info.owner = (__u64)(long) fl->fl_owner; |
457 | |
458 | send_op(op); |
459 | wait_event(recv_wq, (op->done != 0)); |
460 | |
461 | WARN_ON(!list_empty(&op->list)); |
462 | |
463 | /* info.rv from userspace is 1 for conflict, 0 for no-conflict, |
464 | -ENOENT if there are no locks on the file */ |
465 | |
466 | rv = op->info.rv; |
467 | |
468 | fl->fl_type = F_UNLCK; |
469 | if (rv == -ENOENT) |
470 | rv = 0; |
471 | else if (rv > 0) { |
472 | locks_init_lock(fl); |
473 | fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; |
474 | fl->fl_flags = FL_POSIX; |
475 | fl->fl_pid = op->info.pid; |
476 | if (op->info.nodeid != dlm_our_nodeid()) |
477 | fl->fl_pid = -fl->fl_pid; |
478 | fl->fl_start = op->info.start; |
479 | fl->fl_end = op->info.end; |
480 | rv = 0; |
481 | } |
482 | |
483 | dlm_release_plock_op(op); |
484 | out: |
485 | dlm_put_lockspace(ls); |
486 | return rv; |
487 | } |
488 | EXPORT_SYMBOL_GPL(dlm_posix_get); |
489 | |
490 | /* a read copies out one plock request from the send list */ |
491 | static ssize_t dev_read(struct file *file, char __user *u, size_t count, |
492 | loff_t *ppos) |
493 | { |
494 | struct dlm_plock_info info; |
495 | struct plock_op *op = NULL; |
496 | |
497 | if (count < sizeof(info)) |
498 | return -EINVAL; |
499 | |
500 | spin_lock(lock: &ops_lock); |
501 | if (!list_empty(head: &send_list)) { |
502 | op = list_first_entry(&send_list, struct plock_op, list); |
503 | if (op->info.flags & DLM_PLOCK_FL_CLOSE) |
504 | list_del(entry: &op->list); |
505 | else |
506 | list_move_tail(list: &op->list, head: &recv_list); |
507 | memcpy(&info, &op->info, sizeof(info)); |
508 | } |
509 | spin_unlock(lock: &ops_lock); |
510 | |
511 | if (!op) |
512 | return -EAGAIN; |
513 | |
514 | trace_dlm_plock_read(info: &info); |
515 | |
516 | /* there is no need to get a reply from userspace for unlocks |
517 | that were generated by the vfs cleaning up for a close |
518 | (the process did not make an unlock call). */ |
519 | |
520 | if (op->info.flags & DLM_PLOCK_FL_CLOSE) |
521 | dlm_release_plock_op(op); |
522 | |
523 | if (copy_to_user(to: u, from: &info, n: sizeof(info))) |
524 | return -EFAULT; |
525 | return sizeof(info); |
526 | } |
527 | |
528 | /* a write copies in one plock result that should match a plock_op |
529 | on the recv list */ |
530 | static ssize_t dev_write(struct file *file, const char __user *u, size_t count, |
531 | loff_t *ppos) |
532 | { |
533 | struct plock_op *op = NULL, *iter; |
534 | struct dlm_plock_info info; |
535 | int do_callback = 0; |
536 | |
537 | if (count != sizeof(info)) |
538 | return -EINVAL; |
539 | |
540 | if (copy_from_user(to: &info, from: u, n: sizeof(info))) |
541 | return -EFAULT; |
542 | |
543 | trace_dlm_plock_write(info: &info); |
544 | |
545 | if (check_version(info: &info)) |
546 | return -EINVAL; |
547 | |
548 | /* |
549 | * The results for waiting ops (SETLKW) can be returned in any |
550 | * order, so match all fields to find the op. The results for |
551 | * non-waiting ops are returned in the order that they were sent |
552 | * to userspace, so match the result with the first non-waiting op. |
553 | */ |
554 | spin_lock(lock: &ops_lock); |
555 | if (info.wait) { |
556 | op = plock_lookup_waiter(info: &info); |
557 | } else { |
558 | list_for_each_entry(iter, &recv_list, list) { |
559 | if (!iter->info.wait && |
560 | iter->info.fsid == info.fsid) { |
561 | op = iter; |
562 | break; |
563 | } |
564 | } |
565 | } |
566 | |
567 | if (op) { |
568 | /* Sanity check that op and info match. */ |
569 | if (info.wait) |
570 | WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK); |
571 | else |
572 | WARN_ON(op->info.number != info.number || |
573 | op->info.owner != info.owner || |
574 | op->info.optype != info.optype); |
575 | |
576 | list_del_init(entry: &op->list); |
577 | memcpy(&op->info, &info, sizeof(info)); |
578 | if (op->data) |
579 | do_callback = 1; |
580 | else |
581 | op->done = 1; |
582 | } |
583 | spin_unlock(lock: &ops_lock); |
584 | |
585 | if (op) { |
586 | if (do_callback) |
587 | dlm_plock_callback(op); |
588 | else |
589 | wake_up(&recv_wq); |
590 | } else |
591 | pr_debug("%s: no op %x %llx" , __func__, |
592 | info.fsid, (unsigned long long)info.number); |
593 | return count; |
594 | } |
595 | |
596 | static __poll_t dev_poll(struct file *file, poll_table *wait) |
597 | { |
598 | __poll_t mask = 0; |
599 | |
600 | poll_wait(filp: file, wait_address: &send_wq, p: wait); |
601 | |
602 | spin_lock(lock: &ops_lock); |
603 | if (!list_empty(head: &send_list)) |
604 | mask = EPOLLIN | EPOLLRDNORM; |
605 | spin_unlock(lock: &ops_lock); |
606 | |
607 | return mask; |
608 | } |
609 | |
610 | static const struct file_operations dev_fops = { |
611 | .read = dev_read, |
612 | .write = dev_write, |
613 | .poll = dev_poll, |
614 | .owner = THIS_MODULE, |
615 | .llseek = noop_llseek, |
616 | }; |
617 | |
618 | static struct miscdevice plock_dev_misc = { |
619 | .minor = MISC_DYNAMIC_MINOR, |
620 | .name = DLM_PLOCK_MISC_NAME, |
621 | .fops = &dev_fops |
622 | }; |
623 | |
624 | int dlm_plock_init(void) |
625 | { |
626 | int rv; |
627 | |
628 | rv = misc_register(misc: &plock_dev_misc); |
629 | if (rv) |
630 | log_print("dlm_plock_init: misc_register failed %d" , rv); |
631 | return rv; |
632 | } |
633 | |
634 | void dlm_plock_exit(void) |
635 | { |
636 | misc_deregister(misc: &plock_dev_misc); |
637 | WARN_ON(!list_empty(&send_list)); |
638 | WARN_ON(!list_empty(&recv_list)); |
639 | } |
640 | |
641 | |