1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. |
4 | */ |
5 | |
6 | #include <linux/fs.h> |
7 | #include <linux/filelock.h> |
8 | #include <linux/miscdevice.h> |
9 | #include <linux/poll.h> |
10 | #include <linux/dlm.h> |
11 | #include <linux/dlm_plock.h> |
12 | #include <linux/slab.h> |
13 | |
14 | #include <trace/events/dlm.h> |
15 | |
16 | #include "dlm_internal.h" |
17 | #include "lockspace.h" |
18 | |
static DEFINE_SPINLOCK(ops_lock);		/* protects send_list and recv_list */
static LIST_HEAD(send_list);			/* ops waiting to be read by userspace */
static LIST_HEAD(recv_list);			/* ops read by userspace, awaiting a reply */
static DECLARE_WAIT_QUEUE_HEAD(send_wq);	/* woken when send_list gains an op */
static DECLARE_WAIT_QUEUE_HEAD(recv_wq);	/* woken when an op's result arrives */
24 | |
/* Extra state carried by an async lock request (fl_lmops->lm_grant set). */
struct plock_async_data {
	void *fl;	/* caller's struct file_lock *, passed back to the callback */
	void *file;	/* struct file * the lock was requested on */
	struct file_lock flc;	/* private copy used for local vfs bookkeeping */
	int (*callback)(struct file_lock *fl, int result);	/* lm_grant hook */
};
31 | |
/* One request/reply exchanged with userspace via the plock misc device. */
struct plock_op {
	struct list_head list;	/* entry on send_list or recv_list */
	int done;	/* nonzero once dev_write() has stored the result */
	struct dlm_plock_info info;	/* request out / result in */
	/* if set indicates async handling */
	struct plock_async_data *data;
};
39 | |
40 | static inline void set_version(struct dlm_plock_info *info) |
41 | { |
42 | info->version[0] = DLM_PLOCK_VERSION_MAJOR; |
43 | info->version[1] = DLM_PLOCK_VERSION_MINOR; |
44 | info->version[2] = DLM_PLOCK_VERSION_PATCH; |
45 | } |
46 | |
47 | static struct plock_op *plock_lookup_waiter(const struct dlm_plock_info *info) |
48 | { |
49 | struct plock_op *op = NULL, *iter; |
50 | |
51 | list_for_each_entry(iter, &recv_list, list) { |
52 | if (iter->info.fsid == info->fsid && |
53 | iter->info.number == info->number && |
54 | iter->info.owner == info->owner && |
55 | iter->info.pid == info->pid && |
56 | iter->info.start == info->start && |
57 | iter->info.end == info->end && |
58 | iter->info.ex == info->ex && |
59 | iter->info.wait) { |
60 | op = iter; |
61 | break; |
62 | } |
63 | } |
64 | |
65 | return op; |
66 | } |
67 | |
68 | static int check_version(struct dlm_plock_info *info) |
69 | { |
70 | if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) || |
71 | (DLM_PLOCK_VERSION_MINOR < info->version[1])) { |
72 | log_print("plock device version mismatch: " |
73 | "kernel (%u.%u.%u), user (%u.%u.%u)" , |
74 | DLM_PLOCK_VERSION_MAJOR, |
75 | DLM_PLOCK_VERSION_MINOR, |
76 | DLM_PLOCK_VERSION_PATCH, |
77 | info->version[0], |
78 | info->version[1], |
79 | info->version[2]); |
80 | return -EINVAL; |
81 | } |
82 | return 0; |
83 | } |
84 | |
85 | static void dlm_release_plock_op(struct plock_op *op) |
86 | { |
87 | kfree(objp: op->data); |
88 | kfree(objp: op); |
89 | } |
90 | |
91 | static void send_op(struct plock_op *op) |
92 | { |
93 | set_version(&op->info); |
94 | spin_lock(lock: &ops_lock); |
95 | list_add_tail(new: &op->list, head: &send_list); |
96 | spin_unlock(lock: &ops_lock); |
97 | wake_up(&send_wq); |
98 | } |
99 | |
100 | static int do_lock_cancel(const struct dlm_plock_info *orig_info) |
101 | { |
102 | struct plock_op *op; |
103 | int rv; |
104 | |
105 | op = kzalloc(size: sizeof(*op), GFP_NOFS); |
106 | if (!op) |
107 | return -ENOMEM; |
108 | |
109 | op->info = *orig_info; |
110 | op->info.optype = DLM_PLOCK_OP_CANCEL; |
111 | op->info.wait = 0; |
112 | |
113 | send_op(op); |
114 | wait_event(recv_wq, (op->done != 0)); |
115 | |
116 | rv = op->info.rv; |
117 | |
118 | dlm_release_plock_op(op); |
119 | return rv; |
120 | } |
121 | |
122 | int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, |
123 | int cmd, struct file_lock *fl) |
124 | { |
125 | struct plock_async_data *op_data; |
126 | struct dlm_ls *ls; |
127 | struct plock_op *op; |
128 | int rv; |
129 | |
130 | ls = dlm_find_lockspace_local(id: lockspace); |
131 | if (!ls) |
132 | return -EINVAL; |
133 | |
134 | op = kzalloc(size: sizeof(*op), GFP_NOFS); |
135 | if (!op) { |
136 | rv = -ENOMEM; |
137 | goto out; |
138 | } |
139 | |
140 | op->info.optype = DLM_PLOCK_OP_LOCK; |
141 | op->info.pid = fl->c.flc_pid; |
142 | op->info.ex = lock_is_write(fl); |
143 | op->info.wait = !!(fl->c.flc_flags & FL_SLEEP); |
144 | op->info.fsid = ls->ls_global_id; |
145 | op->info.number = number; |
146 | op->info.start = fl->fl_start; |
147 | op->info.end = fl->fl_end; |
148 | op->info.owner = (__u64)(long) fl->c.flc_owner; |
149 | /* async handling */ |
150 | if (fl->fl_lmops && fl->fl_lmops->lm_grant) { |
151 | op_data = kzalloc(size: sizeof(*op_data), GFP_NOFS); |
152 | if (!op_data) { |
153 | dlm_release_plock_op(op); |
154 | rv = -ENOMEM; |
155 | goto out; |
156 | } |
157 | |
158 | op_data->callback = fl->fl_lmops->lm_grant; |
159 | locks_init_lock(&op_data->flc); |
160 | locks_copy_lock(&op_data->flc, fl); |
161 | op_data->fl = fl; |
162 | op_data->file = file; |
163 | |
164 | op->data = op_data; |
165 | |
166 | send_op(op); |
167 | rv = FILE_LOCK_DEFERRED; |
168 | goto out; |
169 | } |
170 | |
171 | send_op(op); |
172 | |
173 | if (op->info.wait) { |
174 | rv = wait_event_interruptible(recv_wq, (op->done != 0)); |
175 | if (rv == -ERESTARTSYS) { |
176 | spin_lock(lock: &ops_lock); |
177 | /* recheck under ops_lock if we got a done != 0, |
178 | * if so this interrupt case should be ignored |
179 | */ |
180 | if (op->done != 0) { |
181 | spin_unlock(lock: &ops_lock); |
182 | goto do_lock_wait; |
183 | } |
184 | spin_unlock(lock: &ops_lock); |
185 | |
186 | rv = do_lock_cancel(orig_info: &op->info); |
187 | switch (rv) { |
188 | case 0: |
189 | /* waiter was deleted in user space, answer will never come |
190 | * remove original request. The original request must be |
191 | * on recv_list because the answer of do_lock_cancel() |
192 | * synchronized it. |
193 | */ |
194 | spin_lock(lock: &ops_lock); |
195 | list_del(entry: &op->list); |
196 | spin_unlock(lock: &ops_lock); |
197 | rv = -EINTR; |
198 | break; |
199 | case -ENOENT: |
200 | /* cancellation wasn't successful but op should be done */ |
201 | fallthrough; |
202 | default: |
203 | /* internal error doing cancel we need to wait */ |
204 | goto wait; |
205 | } |
206 | |
207 | log_debug(ls, "%s: wait interrupted %x %llx pid %d" , |
208 | __func__, ls->ls_global_id, |
209 | (unsigned long long)number, op->info.pid); |
210 | dlm_release_plock_op(op); |
211 | goto out; |
212 | } |
213 | } else { |
214 | wait: |
215 | wait_event(recv_wq, (op->done != 0)); |
216 | } |
217 | |
218 | do_lock_wait: |
219 | |
220 | WARN_ON(!list_empty(&op->list)); |
221 | |
222 | rv = op->info.rv; |
223 | |
224 | if (!rv) { |
225 | if (locks_lock_file_wait(filp: file, fl) < 0) |
226 | log_error(ls, "dlm_posix_lock: vfs lock error %llx" , |
227 | (unsigned long long)number); |
228 | } |
229 | |
230 | dlm_release_plock_op(op); |
231 | out: |
232 | dlm_put_lockspace(ls); |
233 | return rv; |
234 | } |
235 | EXPORT_SYMBOL_GPL(dlm_posix_lock); |
236 | |
237 | /* Returns failure iff a successful lock operation should be canceled */ |
238 | static int dlm_plock_callback(struct plock_op *op) |
239 | { |
240 | struct plock_async_data *op_data = op->data; |
241 | struct file *file; |
242 | struct file_lock *fl; |
243 | struct file_lock *flc; |
244 | int (*notify)(struct file_lock *fl, int result) = NULL; |
245 | int rv = 0; |
246 | |
247 | WARN_ON(!list_empty(&op->list)); |
248 | |
249 | /* check if the following 2 are still valid or make a copy */ |
250 | file = op_data->file; |
251 | flc = &op_data->flc; |
252 | fl = op_data->fl; |
253 | notify = op_data->callback; |
254 | |
255 | if (op->info.rv) { |
256 | notify(fl, op->info.rv); |
257 | goto out; |
258 | } |
259 | |
260 | /* got fs lock; bookkeep locally as well: */ |
261 | flc->c.flc_flags &= ~FL_SLEEP; |
262 | if (posix_lock_file(file, flc, NULL)) { |
263 | /* |
264 | * This can only happen in the case of kmalloc() failure. |
265 | * The filesystem's own lock is the authoritative lock, |
266 | * so a failure to get the lock locally is not a disaster. |
267 | * As long as the fs cannot reliably cancel locks (especially |
268 | * in a low-memory situation), we're better off ignoring |
269 | * this failure than trying to recover. |
270 | */ |
271 | log_print("dlm_plock_callback: vfs lock error %llx file %p fl %p" , |
272 | (unsigned long long)op->info.number, file, fl); |
273 | } |
274 | |
275 | rv = notify(fl, 0); |
276 | if (rv) { |
277 | /* XXX: We need to cancel the fs lock here: */ |
278 | log_print("%s: lock granted after lock request failed; dangling lock!" , |
279 | __func__); |
280 | goto out; |
281 | } |
282 | |
283 | out: |
284 | dlm_release_plock_op(op); |
285 | return rv; |
286 | } |
287 | |
288 | int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, |
289 | struct file_lock *fl) |
290 | { |
291 | struct dlm_ls *ls; |
292 | struct plock_op *op; |
293 | int rv; |
294 | unsigned char saved_flags = fl->c.flc_flags; |
295 | |
296 | ls = dlm_find_lockspace_local(id: lockspace); |
297 | if (!ls) |
298 | return -EINVAL; |
299 | |
300 | op = kzalloc(size: sizeof(*op), GFP_NOFS); |
301 | if (!op) { |
302 | rv = -ENOMEM; |
303 | goto out; |
304 | } |
305 | |
306 | /* cause the vfs unlock to return ENOENT if lock is not found */ |
307 | fl->c.flc_flags |= FL_EXISTS; |
308 | |
309 | rv = locks_lock_file_wait(filp: file, fl); |
310 | if (rv == -ENOENT) { |
311 | rv = 0; |
312 | goto out_free; |
313 | } |
314 | if (rv < 0) { |
315 | log_error(ls, "dlm_posix_unlock: vfs unlock error %d %llx" , |
316 | rv, (unsigned long long)number); |
317 | } |
318 | |
319 | op->info.optype = DLM_PLOCK_OP_UNLOCK; |
320 | op->info.pid = fl->c.flc_pid; |
321 | op->info.fsid = ls->ls_global_id; |
322 | op->info.number = number; |
323 | op->info.start = fl->fl_start; |
324 | op->info.end = fl->fl_end; |
325 | op->info.owner = (__u64)(long) fl->c.flc_owner; |
326 | |
327 | if (fl->c.flc_flags & FL_CLOSE) { |
328 | op->info.flags |= DLM_PLOCK_FL_CLOSE; |
329 | send_op(op); |
330 | rv = 0; |
331 | goto out; |
332 | } |
333 | |
334 | send_op(op); |
335 | wait_event(recv_wq, (op->done != 0)); |
336 | |
337 | WARN_ON(!list_empty(&op->list)); |
338 | |
339 | rv = op->info.rv; |
340 | |
341 | if (rv == -ENOENT) |
342 | rv = 0; |
343 | |
344 | out_free: |
345 | dlm_release_plock_op(op); |
346 | out: |
347 | dlm_put_lockspace(ls); |
348 | fl->c.flc_flags = saved_flags; |
349 | return rv; |
350 | } |
351 | EXPORT_SYMBOL_GPL(dlm_posix_unlock); |
352 | |
353 | /* |
354 | * NOTE: This implementation can only handle async lock requests as nfs |
355 | * do it. It cannot handle cancellation of a pending lock request sitting |
356 | * in wait_event(), but for now only nfs is the only user local kernel |
357 | * user. |
358 | */ |
359 | int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file, |
360 | struct file_lock *fl) |
361 | { |
362 | struct dlm_plock_info info; |
363 | struct plock_op *op; |
364 | struct dlm_ls *ls; |
365 | int rv; |
366 | |
367 | /* this only works for async request for now and nfs is the only |
368 | * kernel user right now. |
369 | */ |
370 | if (WARN_ON_ONCE(!fl->fl_lmops || !fl->fl_lmops->lm_grant)) |
371 | return -EOPNOTSUPP; |
372 | |
373 | ls = dlm_find_lockspace_local(id: lockspace); |
374 | if (!ls) |
375 | return -EINVAL; |
376 | |
377 | memset(&info, 0, sizeof(info)); |
378 | info.pid = fl->c.flc_pid; |
379 | info.ex = lock_is_write(fl); |
380 | info.fsid = ls->ls_global_id; |
381 | dlm_put_lockspace(ls); |
382 | info.number = number; |
383 | info.start = fl->fl_start; |
384 | info.end = fl->fl_end; |
385 | info.owner = (__u64)(long) fl->c.flc_owner; |
386 | |
387 | rv = do_lock_cancel(orig_info: &info); |
388 | switch (rv) { |
389 | case 0: |
390 | spin_lock(lock: &ops_lock); |
391 | /* lock request to cancel must be on recv_list because |
392 | * do_lock_cancel() synchronizes it. |
393 | */ |
394 | op = plock_lookup_waiter(info: &info); |
395 | if (WARN_ON_ONCE(!op)) { |
396 | spin_unlock(lock: &ops_lock); |
397 | rv = -ENOLCK; |
398 | break; |
399 | } |
400 | |
401 | list_del(entry: &op->list); |
402 | spin_unlock(lock: &ops_lock); |
403 | WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK); |
404 | op->data->callback(op->data->fl, -EINTR); |
405 | dlm_release_plock_op(op); |
406 | rv = -EINTR; |
407 | break; |
408 | case -ENOENT: |
409 | /* if cancel wasn't successful we probably were to late |
410 | * or it was a non-blocking lock request, so just unlock it. |
411 | */ |
412 | rv = dlm_posix_unlock(lockspace, number, file, fl); |
413 | break; |
414 | default: |
415 | break; |
416 | } |
417 | |
418 | return rv; |
419 | } |
420 | EXPORT_SYMBOL_GPL(dlm_posix_cancel); |
421 | |
422 | int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, |
423 | struct file_lock *fl) |
424 | { |
425 | struct dlm_ls *ls; |
426 | struct plock_op *op; |
427 | int rv; |
428 | |
429 | ls = dlm_find_lockspace_local(id: lockspace); |
430 | if (!ls) |
431 | return -EINVAL; |
432 | |
433 | op = kzalloc(size: sizeof(*op), GFP_NOFS); |
434 | if (!op) { |
435 | rv = -ENOMEM; |
436 | goto out; |
437 | } |
438 | |
439 | op->info.optype = DLM_PLOCK_OP_GET; |
440 | op->info.pid = fl->c.flc_pid; |
441 | op->info.ex = lock_is_write(fl); |
442 | op->info.fsid = ls->ls_global_id; |
443 | op->info.number = number; |
444 | op->info.start = fl->fl_start; |
445 | op->info.end = fl->fl_end; |
446 | op->info.owner = (__u64)(long) fl->c.flc_owner; |
447 | |
448 | send_op(op); |
449 | wait_event(recv_wq, (op->done != 0)); |
450 | |
451 | WARN_ON(!list_empty(&op->list)); |
452 | |
453 | /* info.rv from userspace is 1 for conflict, 0 for no-conflict, |
454 | -ENOENT if there are no locks on the file */ |
455 | |
456 | rv = op->info.rv; |
457 | |
458 | fl->c.flc_type = F_UNLCK; |
459 | if (rv == -ENOENT) |
460 | rv = 0; |
461 | else if (rv > 0) { |
462 | locks_init_lock(fl); |
463 | fl->c.flc_type = (op->info.ex) ? F_WRLCK : F_RDLCK; |
464 | fl->c.flc_flags = FL_POSIX; |
465 | fl->c.flc_pid = op->info.pid; |
466 | if (op->info.nodeid != dlm_our_nodeid()) |
467 | fl->c.flc_pid = -fl->c.flc_pid; |
468 | fl->fl_start = op->info.start; |
469 | fl->fl_end = op->info.end; |
470 | rv = 0; |
471 | } |
472 | |
473 | dlm_release_plock_op(op); |
474 | out: |
475 | dlm_put_lockspace(ls); |
476 | return rv; |
477 | } |
478 | EXPORT_SYMBOL_GPL(dlm_posix_get); |
479 | |
480 | /* a read copies out one plock request from the send list */ |
481 | static ssize_t dev_read(struct file *file, char __user *u, size_t count, |
482 | loff_t *ppos) |
483 | { |
484 | struct dlm_plock_info info; |
485 | struct plock_op *op = NULL; |
486 | |
487 | if (count < sizeof(info)) |
488 | return -EINVAL; |
489 | |
490 | spin_lock(lock: &ops_lock); |
491 | if (!list_empty(head: &send_list)) { |
492 | op = list_first_entry(&send_list, struct plock_op, list); |
493 | if (op->info.flags & DLM_PLOCK_FL_CLOSE) |
494 | list_del(entry: &op->list); |
495 | else |
496 | list_move_tail(list: &op->list, head: &recv_list); |
497 | memcpy(&info, &op->info, sizeof(info)); |
498 | } |
499 | spin_unlock(lock: &ops_lock); |
500 | |
501 | if (!op) |
502 | return -EAGAIN; |
503 | |
504 | trace_dlm_plock_read(info: &info); |
505 | |
506 | /* there is no need to get a reply from userspace for unlocks |
507 | that were generated by the vfs cleaning up for a close |
508 | (the process did not make an unlock call). */ |
509 | |
510 | if (op->info.flags & DLM_PLOCK_FL_CLOSE) |
511 | dlm_release_plock_op(op); |
512 | |
513 | if (copy_to_user(to: u, from: &info, n: sizeof(info))) |
514 | return -EFAULT; |
515 | return sizeof(info); |
516 | } |
517 | |
518 | /* a write copies in one plock result that should match a plock_op |
519 | on the recv list */ |
520 | static ssize_t dev_write(struct file *file, const char __user *u, size_t count, |
521 | loff_t *ppos) |
522 | { |
523 | struct plock_op *op = NULL, *iter; |
524 | struct dlm_plock_info info; |
525 | int do_callback = 0; |
526 | |
527 | if (count != sizeof(info)) |
528 | return -EINVAL; |
529 | |
530 | if (copy_from_user(to: &info, from: u, n: sizeof(info))) |
531 | return -EFAULT; |
532 | |
533 | trace_dlm_plock_write(info: &info); |
534 | |
535 | if (check_version(info: &info)) |
536 | return -EINVAL; |
537 | |
538 | /* |
539 | * The results for waiting ops (SETLKW) can be returned in any |
540 | * order, so match all fields to find the op. The results for |
541 | * non-waiting ops are returned in the order that they were sent |
542 | * to userspace, so match the result with the first non-waiting op. |
543 | */ |
544 | spin_lock(lock: &ops_lock); |
545 | if (info.wait) { |
546 | op = plock_lookup_waiter(info: &info); |
547 | } else { |
548 | list_for_each_entry(iter, &recv_list, list) { |
549 | if (!iter->info.wait && |
550 | iter->info.fsid == info.fsid) { |
551 | op = iter; |
552 | break; |
553 | } |
554 | } |
555 | } |
556 | |
557 | if (op) { |
558 | /* Sanity check that op and info match. */ |
559 | if (info.wait) |
560 | WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK); |
561 | else |
562 | WARN_ON(op->info.number != info.number || |
563 | op->info.owner != info.owner || |
564 | op->info.optype != info.optype); |
565 | |
566 | list_del_init(entry: &op->list); |
567 | memcpy(&op->info, &info, sizeof(info)); |
568 | if (op->data) |
569 | do_callback = 1; |
570 | else |
571 | op->done = 1; |
572 | } |
573 | spin_unlock(lock: &ops_lock); |
574 | |
575 | if (op) { |
576 | if (do_callback) |
577 | dlm_plock_callback(op); |
578 | else |
579 | wake_up(&recv_wq); |
580 | } else |
581 | pr_debug("%s: no op %x %llx" , __func__, |
582 | info.fsid, (unsigned long long)info.number); |
583 | return count; |
584 | } |
585 | |
586 | static __poll_t dev_poll(struct file *file, poll_table *wait) |
587 | { |
588 | __poll_t mask = 0; |
589 | |
590 | poll_wait(filp: file, wait_address: &send_wq, p: wait); |
591 | |
592 | spin_lock(lock: &ops_lock); |
593 | if (!list_empty(head: &send_list)) |
594 | mask = EPOLLIN | EPOLLRDNORM; |
595 | spin_unlock(lock: &ops_lock); |
596 | |
597 | return mask; |
598 | } |
599 | |
/* File operations for the plock misc device used by the userspace daemon. */
static const struct file_operations dev_fops = {
	.read = dev_read,
	.write = dev_write,
	.poll = dev_poll,
	.owner = THIS_MODULE,
	.llseek = noop_llseek,
};
607 | |
/* Misc character device through which plock ops are exchanged. */
static struct miscdevice plock_dev_misc = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = DLM_PLOCK_MISC_NAME,
	.fops = &dev_fops
};
613 | |
614 | int dlm_plock_init(void) |
615 | { |
616 | int rv; |
617 | |
618 | rv = misc_register(misc: &plock_dev_misc); |
619 | if (rv) |
620 | log_print("dlm_plock_init: misc_register failed %d" , rv); |
621 | return rv; |
622 | } |
623 | |
624 | void dlm_plock_exit(void) |
625 | { |
626 | misc_deregister(misc: &plock_dev_misc); |
627 | WARN_ON(!list_empty(&send_list)); |
628 | WARN_ON(!list_empty(&recv_list)); |
629 | } |
630 | |
631 | |