// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
#include <trace/events/btrfs.h>
#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
#include "accessors.h"

/*
 * Lockdep class keys for extent_buffer->lock, per tree root. For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes, may nest inside each
 * other, and therefore require separate keysets. As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->root_key.objectid. This ensures that all special purpose
 * roots have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked, thus preventing deadlock. As lockdep doesn't know this, use
 * subclass to avoid triggering lockdep warnings in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked. It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
 * needs to be updated as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#if BTRFS_MAX_LEVEL != 8
#error "DEFINE_NAME() and btrfs_lockdep_keysets below assume BTRFS_MAX_LEVEL == 8"
#endif

#define DEFINE_LEVEL(stem, level)					\
	.names[level] = "btrfs-" stem "-0" #level,

#define DEFINE_NAME(stem)						\
	DEFINE_LEVEL(stem, 0)						\
	DEFINE_LEVEL(stem, 1)						\
	DEFINE_LEVEL(stem, 2)						\
	DEFINE_LEVEL(stem, 3)						\
	DEFINE_LEVEL(stem, 4)						\
	DEFINE_LEVEL(stem, 5)						\
	DEFINE_LEVEL(stem, 6)						\
	DEFINE_LEVEL(stem, 7)
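
/*
 * For illustration, DEFINE_NAME("root") expands to the designated
 * initializers
 *
 *	.names[0] = "btrfs-root-00",
 *	.names[1] = "btrfs-root-01",
 *	...
 *	.names[7] = "btrfs-root-07",
 *
 * so every tree level of every keyset gets its own lockdep class name.
 */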

static struct btrfs_lockdep_keyset {
	u64			id;		/* root objectid */
	/* Longest entry: btrfs-block-group-00 */
	char			names[BTRFS_MAX_LEVEL][24];
	struct lock_class_key	keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
	{ .id = BTRFS_ROOT_TREE_OBJECTID,	 DEFINE_NAME("root")	    },
	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	 DEFINE_NAME("extent")	    },
	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	 DEFINE_NAME("chunk")	    },
	{ .id = BTRFS_DEV_TREE_OBJECTID,	 DEFINE_NAME("dev")	    },
	{ .id = BTRFS_CSUM_TREE_OBJECTID,	 DEFINE_NAME("csum")	    },
	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	 DEFINE_NAME("quota")	    },
	{ .id = BTRFS_TREE_LOG_OBJECTID,	 DEFINE_NAME("log")	    },
	{ .id = BTRFS_TREE_RELOC_OBJECTID,	 DEFINE_NAME("treloc")	    },
	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	 DEFINE_NAME("dreloc")	    },
	{ .id = BTRFS_UUID_TREE_OBJECTID,	 DEFINE_NAME("uuid")	    },
	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	 DEFINE_NAME("free-space")  },
	{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
	{ .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
	{ .id = 0,				 DEFINE_NAME("tree")	    },
};

#undef DEFINE_LEVEL
#undef DEFINE_NAME

void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level)
{
	struct btrfs_lockdep_keyset *ks;

	BUG_ON(level >= ARRAY_SIZE(ks->keys));

	/* Find the matching keyset, id 0 is the default entry */
	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
		if (ks->id == objectid)
			break;

	lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]);
}

void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
{
	if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
		btrfs_set_buffer_lockdep_class(root->root_key.objectid,
					       eb, btrfs_header_level(eb));
}

#endif

#ifdef CONFIG_BTRFS_DEBUG
static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner)
{
	eb->lock_owner = owner;
}
#else
static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
#endif

/*
 * Extent buffer locking
 * =====================
 *
 * We use a rw_semaphore for tree locking, and the semantics are exactly the
 * same:
 *
 * - reader/writer exclusion
 * - writer/writer exclusion
 * - reader/reader sharing
 * - try-lock semantics for readers and writers
 *
 * The rwsem implementation does opportunistic spinning, which reduces the
 * number of times the locking task needs to sleep.
 */
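
/*
 * Illustrative sketch (hypothetical caller, not part of this file): the
 * lock/unlock helpers below map 1:1 onto the rwsem primitives, so typical
 * read-side and write-side critical sections look like
 *
 *	btrfs_tree_read_lock(eb);	// down_read(&eb->lock)
 *	... read-only access to eb ...
 *	btrfs_tree_read_unlock(eb);	// up_read(&eb->lock)
 *
 *	btrfs_tree_lock(eb);		// down_write(&eb->lock)
 *	... modify eb ...
 *	btrfs_tree_unlock(eb);		// up_write(&eb->lock)
 */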

/*
 * __btrfs_tree_read_lock - lock extent buffer for read
 * @eb:   the eb to be locked
 * @nest: the nesting level to be used for lockdep
 *
 * This takes the read lock on the extent buffer, using the specified nesting
 * level for lockdep purposes.
 */
void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
	u64 start_ns = 0;

	if (trace_btrfs_tree_read_lock_enabled())
		start_ns = ktime_get_ns();

	down_read_nested(&eb->lock, nest);
	trace_btrfs_tree_read_lock(eb, start_ns);
}

void btrfs_tree_read_lock(struct extent_buffer *eb)
{
	__btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL);
}

/*
 * Try-lock for read.
 *
 * Return 1 if the rwsem has been taken, 0 otherwise.
 */
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
	if (down_read_trylock(&eb->lock)) {
		trace_btrfs_try_tree_read_lock(eb);
		return 1;
	}
	return 0;
}

/*
 * Try-lock for write.
 *
 * Return 1 if the rwsem has been taken, 0 otherwise.
 */
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
	if (down_write_trylock(&eb->lock)) {
		btrfs_set_eb_lock_owner(eb, current->pid);
		trace_btrfs_try_tree_write_lock(eb);
		return 1;
	}
	return 0;
}

/*
 * Release the read lock.
 */
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
	trace_btrfs_tree_read_unlock(eb);
	up_read(&eb->lock);
}

/*
 * Lock eb for write.
 *
 * @eb:   the eb to lock
 * @nest: the nesting to use for the lock
 *
 * Returns with the eb->lock write locked.
 */
void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
	__acquires(&eb->lock)
{
	u64 start_ns = 0;

	if (trace_btrfs_tree_lock_enabled())
		start_ns = ktime_get_ns();

	down_write_nested(&eb->lock, nest);
	btrfs_set_eb_lock_owner(eb, current->pid);
	trace_btrfs_tree_lock(eb, start_ns);
}

void btrfs_tree_lock(struct extent_buffer *eb)
{
	__btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
}

/*
 * Release the write lock.
 */
void btrfs_tree_unlock(struct extent_buffer *eb)
{
	trace_btrfs_tree_unlock(eb);
	btrfs_set_eb_lock_owner(eb, 0);
	up_write(&eb->lock);
}

/*
 * This releases any locks held in the path starting at level and going all the
 * way up to the root.
 *
 * btrfs_search_slot will keep the lock held on higher nodes in a few corner
 * cases, such as COW of the block at slot zero in the node. This ignores
 * those rules, and it should only be called when there are no more updates to
 * be done higher up in the tree.
 */
void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
	int i;

	if (path->keep_locks)
		return;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			continue;
		if (!path->locks[i])
			continue;
		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
		path->locks[i] = 0;
	}
}
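
/*
 * Illustrative sketch (hypothetical caller, not part of this file): once all
 * modifications below a given level are finished, a caller can drop every
 * lock above it in one call:
 *
 *	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 *	... modify the leaf at path->nodes[0] ...
 *	btrfs_unlock_up_safe(path, 1);	// drop locks on levels 1 and up
 */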

/*
 * Loop around taking references on and locking the root node of the tree until
 * we end up with a lock on the root node.
 *
 * Return: root extent buffer with write lock held
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);

		btrfs_maybe_reset_lockdep_class(root, eb);
		btrfs_tree_lock(eb);
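		/*
		 * While we slept waiting for the lock, a concurrent COW may
		 * have replaced the root, in which case root->node no longer
		 * points at eb: unlock, drop our reference and retry.
		 */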
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/*
 * Loop around taking references on and locking the root node of the tree until
 * we end up with a lock on the root node.
 *
 * Return: root extent buffer with read lock held
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);

		btrfs_maybe_reset_lockdep_class(root, eb);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/*
 * Loop around taking references on and locking the root node of the tree in
 * nowait mode until we either end up with a lock on the root node, or give up
 * and return early to avoid blocking.
 *
 * Return: root extent buffer with read lock held, or ERR_PTR(-EAGAIN)
 */
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		if (!btrfs_try_tree_read_lock(eb)) {
			free_extent_buffer(eb);
			return ERR_PTR(-EAGAIN);
		}
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/*
 * DREW locks
 * ==========
 *
 * DREW stands for double-reader-writer-exclusion lock. It's used in
 * situations where you want to provide A-B exclusion but not AA or BB.
 *
 * The current implementation gives more priority to readers. If a reader and
 * a writer both race to acquire their respective sides of the lock, the
 * writer yields as soon as it detects a concurrent reader. Additionally, if
 * there are pending readers, no new writers are allowed to come in and
 * acquire the lock.
 */
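
/*
 * Illustrative sketch (hypothetical users, not part of this file): any
 * number of tasks may hold the writer side at once, and any number may hold
 * the reader side at once, but the two sides exclude each other:
 *
 *	// "A" side				// "B" side
 *	btrfs_drew_write_lock(&lock);		btrfs_drew_read_lock(&lock);
 *	... excluded from all readers ...	... excluded from all writers ...
 *	btrfs_drew_write_unlock(&lock);		btrfs_drew_read_unlock(&lock);
 */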

void btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
{
	atomic_set(&lock->readers, 0);
	atomic_set(&lock->writers, 0);
	init_waitqueue_head(&lock->pending_readers);
	init_waitqueue_head(&lock->pending_writers);
}

/* Return true if acquisition is successful, false otherwise */
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
{
	if (atomic_read(&lock->readers))
		return false;

	atomic_inc(&lock->writers);

	/*
	 * Ensure the writers count is updated before we check for pending
	 * readers; pairs with the barrier in btrfs_drew_read_lock.
	 */
	smp_mb__after_atomic();
	if (atomic_read(&lock->readers)) {
		btrfs_drew_write_unlock(lock);
		return false;
	}

	return true;
}

void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
{
	while (true) {
		if (btrfs_drew_try_write_lock(lock))
			return;
		wait_event(lock->pending_writers, !atomic_read(&lock->readers));
	}
}

void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
	atomic_dec(&lock->writers);
	cond_wake_up(&lock->pending_readers);
}

void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
{
	atomic_inc(&lock->readers);

	/*
	 * Ensure the pending reader count is perceived BEFORE this reader
	 * goes to sleep in case of active writers. This guarantees new writers
	 * won't be allowed and that the current reader will be woken up when
	 * the last active writer finishes its job.
	 */
	smp_mb__after_atomic();

	wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0);
}

void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
{
	/*
	 * atomic_dec_and_test() implies a full barrier, so woken up writers
	 * are guaranteed to see the decrement.
	 */
	if (atomic_dec_and_test(&lock->readers))
		wake_up(&lock->pending_writers);
}