1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | /* erasure coding */ |
4 | |
5 | #include "bcachefs.h" |
6 | #include "alloc_background.h" |
7 | #include "alloc_foreground.h" |
8 | #include "backpointers.h" |
9 | #include "bkey_buf.h" |
10 | #include "bset.h" |
11 | #include "btree_gc.h" |
12 | #include "btree_update.h" |
13 | #include "btree_write_buffer.h" |
14 | #include "buckets.h" |
15 | #include "checksum.h" |
16 | #include "disk_groups.h" |
17 | #include "ec.h" |
18 | #include "error.h" |
19 | #include "io_read.h" |
20 | #include "keylist.h" |
21 | #include "recovery.h" |
22 | #include "replicas.h" |
23 | #include "super-io.h" |
24 | #include "util.h" |
25 | |
26 | #include <linux/sort.h> |
27 | |
28 | #ifdef __KERNEL__ |
29 | |
30 | #include <linux/raid/pq.h> |
31 | #include <linux/raid/xor.h> |
32 | |
33 | static void raid5_recov(unsigned disks, unsigned failed_idx, |
34 | size_t size, void **data) |
35 | { |
36 | unsigned i = 2, nr; |
37 | |
38 | BUG_ON(failed_idx >= disks); |
39 | |
40 | swap(data[0], data[failed_idx]); |
41 | memcpy(data[0], data[1], size); |
42 | |
43 | while (i < disks) { |
44 | nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); |
45 | xor_blocks(count: nr, bytes: size, dest: data[0], srcs: data + i); |
46 | i += nr; |
47 | } |
48 | |
49 | swap(data[0], data[failed_idx]); |
50 | } |
51 | |
52 | static void raid_gen(int nd, int np, size_t size, void **v) |
53 | { |
54 | if (np >= 1) |
55 | raid5_recov(disks: nd + np, failed_idx: nd, size, data: v); |
56 | if (np >= 2) |
57 | raid6_call.gen_syndrome(nd + np, size, v); |
58 | BUG_ON(np > 2); |
59 | } |
60 | |
61 | static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) |
62 | { |
63 | switch (nr) { |
64 | case 0: |
65 | break; |
66 | case 1: |
67 | if (ir[0] < nd + 1) |
68 | raid5_recov(disks: nd + 1, failed_idx: ir[0], size, data: v); |
69 | else |
70 | raid6_call.gen_syndrome(nd + np, size, v); |
71 | break; |
72 | case 2: |
73 | if (ir[1] < nd) { |
74 | /* data+data failure. */ |
75 | raid6_2data_recov(nd + np, size, ir[0], ir[1], v); |
76 | } else if (ir[0] < nd) { |
77 | /* data + p/q failure */ |
78 | |
79 | if (ir[1] == nd) /* data + p failure */ |
80 | raid6_datap_recov(nd + np, size, ir[0], v); |
81 | else { /* data + q failure */ |
82 | raid5_recov(disks: nd + 1, failed_idx: ir[0], size, data: v); |
83 | raid6_call.gen_syndrome(nd + np, size, v); |
84 | } |
85 | } else { |
86 | raid_gen(nd, np, size, v); |
87 | } |
88 | break; |
89 | default: |
90 | BUG(); |
91 | } |
92 | } |
93 | |
94 | #else |
95 | |
96 | #include <raid/raid.h> |
97 | |
98 | #endif |
99 | |
/*
 * Per-device bio for reading/writing one block of a stripe buffer;
 * @idx is the block index within the stripe. The embedded bio must be
 * the last member (allocated from c->ec_bioset with a bio tail).
 */
struct ec_bio {
	struct bch_dev *ca;
	struct ec_stripe_buf *buf;
	size_t idx;
	struct bio bio;
};
106 | |
107 | /* Stripes btree keys: */ |
108 | |
/*
 * Validate a stripe key: checks position range, value size, and the
 * embedded extent pointers. bkey_fsck_err_on() jumps to fsck_err on
 * failure, setting ret.
 */
int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
			enum bkey_invalid_flags flags,
			struct printbuf *err)
{
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
	int ret = 0;

	/* stripe indices live at inode 0, offset 1..U32_MAX */
	bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
			 bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
			 stripe_pos_bad,
			 "stripe at bad pos" );

	bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
			 stripe_val_size_bad,
			 "incorrect value size (%zu < %u)" ,
			 bkey_val_u64s(k.k), stripe_val_u64s(s));

	ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
	return ret;
}
130 | |
131 | void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, |
132 | struct bkey_s_c k) |
133 | { |
134 | const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v; |
135 | struct bch_stripe s = {}; |
136 | |
137 | memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k))); |
138 | |
139 | unsigned nr_data = s.nr_blocks - s.nr_redundant; |
140 | |
141 | prt_printf(out, "algo %u sectors %u blocks %u:%u csum " , |
142 | s.algorithm, |
143 | le16_to_cpu(s.sectors), |
144 | nr_data, |
145 | s.nr_redundant); |
146 | bch2_prt_csum_type(out, s.csum_type); |
147 | prt_printf(out, " gran %u" , 1U << s.csum_granularity_bits); |
148 | |
149 | for (unsigned i = 0; i < s.nr_blocks; i++) { |
150 | const struct bch_extent_ptr *ptr = sp->ptrs + i; |
151 | |
152 | if ((void *) ptr >= bkey_val_end(k)) |
153 | break; |
154 | |
155 | bch2_extent_ptr_to_text(out, c, ptr); |
156 | |
157 | if (s.csum_type < BCH_CSUM_NR && |
158 | i < nr_data && |
159 | stripe_blockcount_offset(s: &s, idx: i) < bkey_val_bytes(k: k.k)) |
160 | prt_printf(out, "#%u" , stripe_blockcount_get(sp, i)); |
161 | } |
162 | } |
163 | |
164 | /* Triggers: */ |
165 | |
166 | static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, |
167 | struct bkey_s_c_stripe s, |
168 | unsigned idx, bool deleting) |
169 | { |
170 | struct bch_fs *c = trans->c; |
171 | const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; |
172 | struct btree_iter iter; |
173 | struct bkey_i_alloc_v4 *a; |
174 | enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant |
175 | ? BCH_DATA_parity : 0; |
176 | s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; |
177 | int ret = 0; |
178 | |
179 | if (deleting) |
180 | sectors = -sectors; |
181 | |
182 | a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); |
183 | if (IS_ERR(ptr: a)) |
184 | return PTR_ERR(ptr: a); |
185 | |
186 | ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, |
187 | a->v.gen, a->v.data_type, |
188 | a->v.dirty_sectors); |
189 | if (ret) |
190 | goto err; |
191 | |
192 | if (!deleting) { |
193 | if (bch2_trans_inconsistent_on(a->v.stripe || |
194 | a->v.stripe_redundancy, trans, |
195 | "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)" , |
196 | iter.pos.inode, iter.pos.offset, a->v.gen, |
197 | bch2_data_type_str(a->v.data_type), |
198 | a->v.dirty_sectors, |
199 | a->v.stripe, s.k->p.offset)) { |
200 | ret = -EIO; |
201 | goto err; |
202 | } |
203 | |
204 | if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, |
205 | "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu" , |
206 | iter.pos.inode, iter.pos.offset, a->v.gen, |
207 | bch2_data_type_str(a->v.data_type), |
208 | a->v.dirty_sectors, |
209 | s.k->p.offset)) { |
210 | ret = -EIO; |
211 | goto err; |
212 | } |
213 | |
214 | a->v.stripe = s.k->p.offset; |
215 | a->v.stripe_redundancy = s.v->nr_redundant; |
216 | a->v.data_type = BCH_DATA_stripe; |
217 | } else { |
218 | if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || |
219 | a->v.stripe_redundancy != s.v->nr_redundant, trans, |
220 | "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)" , |
221 | iter.pos.inode, iter.pos.offset, a->v.gen, |
222 | s.k->p.offset, a->v.stripe)) { |
223 | ret = -EIO; |
224 | goto err; |
225 | } |
226 | |
227 | a->v.stripe = 0; |
228 | a->v.stripe_redundancy = 0; |
229 | a->v.data_type = alloc_data_type(a: a->v, data_type: BCH_DATA_user); |
230 | } |
231 | |
232 | a->v.dirty_sectors += sectors; |
233 | if (data_type) |
234 | a->v.data_type = !deleting ? data_type : 0; |
235 | |
236 | ret = bch2_trans_update(trans, &iter, &a->k_i, 0); |
237 | if (ret) |
238 | goto err; |
239 | err: |
240 | bch2_trans_iter_exit(trans, &iter); |
241 | return ret; |
242 | } |
243 | |
244 | static int mark_stripe_bucket(struct btree_trans *trans, |
245 | struct bkey_s_c k, |
246 | unsigned ptr_idx, |
247 | unsigned flags) |
248 | { |
249 | struct bch_fs *c = trans->c; |
250 | const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; |
251 | unsigned nr_data = s->nr_blocks - s->nr_redundant; |
252 | bool parity = ptr_idx >= nr_data; |
253 | enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; |
254 | s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; |
255 | const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; |
256 | struct bch_dev *ca = bch_dev_bkey_exists(c, idx: ptr->dev); |
257 | struct bucket old, new, *g; |
258 | struct printbuf buf = PRINTBUF; |
259 | int ret = 0; |
260 | |
261 | BUG_ON(!(flags & BTREE_TRIGGER_GC)); |
262 | |
263 | /* * XXX doesn't handle deletion */ |
264 | |
265 | percpu_down_read(sem: &c->mark_lock); |
266 | g = PTR_GC_BUCKET(ca, ptr); |
267 | |
268 | if (g->dirty_sectors || |
269 | (g->stripe && g->stripe != k.k->p.offset)) { |
270 | bch2_fs_inconsistent(c, |
271 | "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s" , |
272 | ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, |
273 | (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); |
274 | ret = -EINVAL; |
275 | goto err; |
276 | } |
277 | |
278 | bucket_lock(b: g); |
279 | old = *g; |
280 | |
281 | ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type, |
282 | g->gen, g->data_type, |
283 | g->dirty_sectors); |
284 | if (ret) |
285 | goto err; |
286 | |
287 | g->data_type = data_type; |
288 | g->dirty_sectors += sectors; |
289 | |
290 | g->stripe = k.k->p.offset; |
291 | g->stripe_redundancy = s->nr_redundant; |
292 | new = *g; |
293 | err: |
294 | bucket_unlock(b: g); |
295 | if (!ret) |
296 | bch2_dev_usage_update_m(c, ca, &old, &new); |
297 | percpu_up_read(sem: &c->mark_lock); |
298 | printbuf_exit(&buf); |
299 | return ret; |
300 | } |
301 | |
302 | int bch2_trigger_stripe(struct btree_trans *trans, |
303 | enum btree_id btree_id, unsigned level, |
304 | struct bkey_s_c old, struct bkey_s _new, |
305 | unsigned flags) |
306 | { |
307 | struct bkey_s_c new = _new.s_c; |
308 | struct bch_fs *c = trans->c; |
309 | u64 idx = new.k->p.offset; |
310 | const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe |
311 | ? bkey_s_c_to_stripe(k: old).v : NULL; |
312 | const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe |
313 | ? bkey_s_c_to_stripe(k: new).v : NULL; |
314 | |
315 | if (flags & BTREE_TRIGGER_TRANSACTIONAL) { |
316 | /* |
317 | * If the pointers aren't changing, we don't need to do anything: |
318 | */ |
319 | if (new_s && old_s && |
320 | new_s->nr_blocks == old_s->nr_blocks && |
321 | new_s->nr_redundant == old_s->nr_redundant && |
322 | !memcmp(p: old_s->ptrs, q: new_s->ptrs, |
323 | size: new_s->nr_blocks * sizeof(struct bch_extent_ptr))) |
324 | return 0; |
325 | |
326 | BUG_ON(new_s && old_s && |
327 | (new_s->nr_blocks != old_s->nr_blocks || |
328 | new_s->nr_redundant != old_s->nr_redundant)); |
329 | |
330 | if (new_s) { |
331 | s64 sectors = le16_to_cpu(new_s->sectors); |
332 | |
333 | struct bch_replicas_padded r; |
334 | bch2_bkey_to_replicas(&r.e, new); |
335 | int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); |
336 | if (ret) |
337 | return ret; |
338 | } |
339 | |
340 | if (old_s) { |
341 | s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); |
342 | |
343 | struct bch_replicas_padded r; |
344 | bch2_bkey_to_replicas(&r.e, old); |
345 | int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); |
346 | if (ret) |
347 | return ret; |
348 | } |
349 | |
350 | unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; |
351 | for (unsigned i = 0; i < nr_blocks; i++) { |
352 | if (new_s && old_s && |
353 | !memcmp(p: &new_s->ptrs[i], |
354 | q: &old_s->ptrs[i], |
355 | size: sizeof(new_s->ptrs[i]))) |
356 | continue; |
357 | |
358 | if (new_s) { |
359 | int ret = bch2_trans_mark_stripe_bucket(trans, |
360 | s: bkey_s_c_to_stripe(k: new), idx: i, deleting: false); |
361 | if (ret) |
362 | return ret; |
363 | } |
364 | |
365 | if (old_s) { |
366 | int ret = bch2_trans_mark_stripe_bucket(trans, |
367 | s: bkey_s_c_to_stripe(k: old), idx: i, deleting: true); |
368 | if (ret) |
369 | return ret; |
370 | } |
371 | } |
372 | } |
373 | |
374 | if (flags & BTREE_TRIGGER_ATOMIC) { |
375 | struct stripe *m = genradix_ptr(&c->stripes, idx); |
376 | |
377 | if (!m) { |
378 | struct printbuf buf1 = PRINTBUF; |
379 | struct printbuf buf2 = PRINTBUF; |
380 | |
381 | bch2_bkey_val_to_text(&buf1, c, old); |
382 | bch2_bkey_val_to_text(&buf2, c, new); |
383 | bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" |
384 | "old %s\n" |
385 | "new %s" , idx, buf1.buf, buf2.buf); |
386 | printbuf_exit(&buf2); |
387 | printbuf_exit(&buf1); |
388 | bch2_inconsistent_error(c); |
389 | return -1; |
390 | } |
391 | |
392 | if (!new_s) { |
393 | bch2_stripes_heap_del(c, m, idx); |
394 | |
395 | memset(m, 0, sizeof(*m)); |
396 | } else { |
397 | m->sectors = le16_to_cpu(new_s->sectors); |
398 | m->algorithm = new_s->algorithm; |
399 | m->nr_blocks = new_s->nr_blocks; |
400 | m->nr_redundant = new_s->nr_redundant; |
401 | m->blocks_nonempty = 0; |
402 | |
403 | for (unsigned i = 0; i < new_s->nr_blocks; i++) |
404 | m->blocks_nonempty += !!stripe_blockcount_get(s: new_s, idx: i); |
405 | |
406 | if (!old_s) |
407 | bch2_stripes_heap_insert(c, m, idx); |
408 | else |
409 | bch2_stripes_heap_update(c, m, idx); |
410 | } |
411 | } |
412 | |
413 | if (flags & BTREE_TRIGGER_GC) { |
414 | struct gc_stripe *m = |
415 | genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); |
416 | |
417 | if (!m) { |
418 | bch_err(c, "error allocating memory for gc_stripes, idx %llu" , |
419 | idx); |
420 | return -BCH_ERR_ENOMEM_mark_stripe; |
421 | } |
422 | /* |
423 | * This will be wrong when we bring back runtime gc: we should |
424 | * be unmarking the old key and then marking the new key |
425 | */ |
426 | m->alive = true; |
427 | m->sectors = le16_to_cpu(new_s->sectors); |
428 | m->nr_blocks = new_s->nr_blocks; |
429 | m->nr_redundant = new_s->nr_redundant; |
430 | |
431 | for (unsigned i = 0; i < new_s->nr_blocks; i++) |
432 | m->ptrs[i] = new_s->ptrs[i]; |
433 | |
434 | bch2_bkey_to_replicas(&m->r.e, new); |
435 | |
436 | /* |
437 | * gc recalculates this field from stripe ptr |
438 | * references: |
439 | */ |
440 | memset(m->block_sectors, 0, sizeof(m->block_sectors)); |
441 | |
442 | for (unsigned i = 0; i < new_s->nr_blocks; i++) { |
443 | int ret = mark_stripe_bucket(trans, k: new, ptr_idx: i, flags); |
444 | if (ret) |
445 | return ret; |
446 | } |
447 | |
448 | int ret = bch2_update_replicas(c, new, &m->r.e, |
449 | ((s64) m->sectors * m->nr_redundant), |
450 | 0, true); |
451 | if (ret) { |
452 | struct printbuf buf = PRINTBUF; |
453 | |
454 | bch2_bkey_val_to_text(&buf, c, new); |
455 | bch2_fs_fatal_error(c, ": no replicas entry for %s" , buf.buf); |
456 | printbuf_exit(&buf); |
457 | return ret; |
458 | } |
459 | } |
460 | |
461 | return 0; |
462 | } |
463 | |
464 | /* returns blocknr in stripe that we matched: */ |
465 | static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, |
466 | struct bkey_s_c k, unsigned *block) |
467 | { |
468 | struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); |
469 | unsigned i, nr_data = s->nr_blocks - s->nr_redundant; |
470 | |
471 | bkey_for_each_ptr(ptrs, ptr) |
472 | for (i = 0; i < nr_data; i++) |
473 | if (__bch2_ptr_matches_stripe(stripe_ptr: &s->ptrs[i], data_ptr: ptr, |
474 | le16_to_cpu(s->sectors))) { |
475 | *block = i; |
476 | return ptr; |
477 | } |
478 | |
479 | return NULL; |
480 | } |
481 | |
482 | static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) |
483 | { |
484 | switch (k.k->type) { |
485 | case KEY_TYPE_extent: { |
486 | struct bkey_s_c_extent e = bkey_s_c_to_extent(k); |
487 | const union bch_extent_entry *entry; |
488 | |
489 | extent_for_each_entry(e, entry) |
490 | if (extent_entry_type(e: entry) == |
491 | BCH_EXTENT_ENTRY_stripe_ptr && |
492 | entry->stripe_ptr.idx == idx) |
493 | return true; |
494 | |
495 | break; |
496 | } |
497 | } |
498 | |
499 | return false; |
500 | } |
501 | |
502 | /* Stripe bufs: */ |
503 | |
504 | static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) |
505 | { |
506 | if (buf->key.k.type == KEY_TYPE_stripe) { |
507 | struct bkey_i_stripe *s = bkey_i_to_stripe(k: &buf->key); |
508 | unsigned i; |
509 | |
510 | for (i = 0; i < s->v.nr_blocks; i++) { |
511 | kvfree(addr: buf->data[i]); |
512 | buf->data[i] = NULL; |
513 | } |
514 | } |
515 | } |
516 | |
517 | /* XXX: this is a non-mempoolified memory allocation: */ |
518 | static int ec_stripe_buf_init(struct ec_stripe_buf *buf, |
519 | unsigned offset, unsigned size) |
520 | { |
521 | struct bch_stripe *v = &bkey_i_to_stripe(k: &buf->key)->v; |
522 | unsigned csum_granularity = 1U << v->csum_granularity_bits; |
523 | unsigned end = offset + size; |
524 | unsigned i; |
525 | |
526 | BUG_ON(end > le16_to_cpu(v->sectors)); |
527 | |
528 | offset = round_down(offset, csum_granularity); |
529 | end = min_t(unsigned, le16_to_cpu(v->sectors), |
530 | round_up(end, csum_granularity)); |
531 | |
532 | buf->offset = offset; |
533 | buf->size = end - offset; |
534 | |
535 | memset(buf->valid, 0xFF, sizeof(buf->valid)); |
536 | |
537 | for (i = 0; i < v->nr_blocks; i++) { |
538 | buf->data[i] = kvmalloc(size: buf->size << 9, GFP_KERNEL); |
539 | if (!buf->data[i]) |
540 | goto err; |
541 | } |
542 | |
543 | return 0; |
544 | err: |
545 | ec_stripe_buf_exit(buf); |
546 | return -BCH_ERR_ENOMEM_stripe_buf; |
547 | } |
548 | |
549 | /* Checksumming: */ |
550 | |
551 | static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, |
552 | unsigned block, unsigned offset) |
553 | { |
554 | struct bch_stripe *v = &bkey_i_to_stripe(k: &buf->key)->v; |
555 | unsigned csum_granularity = 1 << v->csum_granularity_bits; |
556 | unsigned end = buf->offset + buf->size; |
557 | unsigned len = min(csum_granularity, end - offset); |
558 | |
559 | BUG_ON(offset >= end); |
560 | BUG_ON(offset < buf->offset); |
561 | BUG_ON(offset & (csum_granularity - 1)); |
562 | BUG_ON(offset + len != le16_to_cpu(v->sectors) && |
563 | (len & (csum_granularity - 1))); |
564 | |
565 | return bch2_checksum(NULL, v->csum_type, |
566 | null_nonce(), |
567 | buf->data[block] + ((offset - buf->offset) << 9), |
568 | len << 9); |
569 | } |
570 | |
571 | static void ec_generate_checksums(struct ec_stripe_buf *buf) |
572 | { |
573 | struct bch_stripe *v = &bkey_i_to_stripe(k: &buf->key)->v; |
574 | unsigned i, j, csums_per_device = stripe_csums_per_device(s: v); |
575 | |
576 | if (!v->csum_type) |
577 | return; |
578 | |
579 | BUG_ON(buf->offset); |
580 | BUG_ON(buf->size != le16_to_cpu(v->sectors)); |
581 | |
582 | for (i = 0; i < v->nr_blocks; i++) |
583 | for (j = 0; j < csums_per_device; j++) |
584 | stripe_csum_set(s: v, block: i, csum_idx: j, |
585 | csum: ec_block_checksum(buf, block: i, offset: j << v->csum_granularity_bits)); |
586 | } |
587 | |
588 | static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) |
589 | { |
590 | struct bch_stripe *v = &bkey_i_to_stripe(k: &buf->key)->v; |
591 | unsigned csum_granularity = 1 << v->csum_granularity_bits; |
592 | unsigned i; |
593 | |
594 | if (!v->csum_type) |
595 | return; |
596 | |
597 | for (i = 0; i < v->nr_blocks; i++) { |
598 | unsigned offset = buf->offset; |
599 | unsigned end = buf->offset + buf->size; |
600 | |
601 | if (!test_bit(i, buf->valid)) |
602 | continue; |
603 | |
604 | while (offset < end) { |
605 | unsigned j = offset >> v->csum_granularity_bits; |
606 | unsigned len = min(csum_granularity, end - offset); |
607 | struct bch_csum want = stripe_csum_get(s: v, block: i, csum_idx: j); |
608 | struct bch_csum got = ec_block_checksum(buf, block: i, offset); |
609 | |
610 | if (bch2_crc_cmp(l: want, r: got)) { |
611 | struct printbuf err = PRINTBUF; |
612 | struct bch_dev *ca = bch_dev_bkey_exists(c, idx: v->ptrs[i].dev); |
613 | |
614 | prt_str(out: &err, str: "stripe " ); |
615 | bch2_csum_err_msg(out: &err, type: v->csum_type, expected: want, got); |
616 | prt_printf(&err, " for %ps at %u of\n " , (void *) _RET_IP_, i); |
617 | bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(k: &buf->key)); |
618 | bch_err_ratelimited(ca, "%s" , err.buf); |
619 | printbuf_exit(&err); |
620 | |
621 | clear_bit(nr: i, addr: buf->valid); |
622 | |
623 | bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); |
624 | break; |
625 | } |
626 | |
627 | offset += len; |
628 | } |
629 | } |
630 | } |
631 | |
632 | /* Erasure coding: */ |
633 | |
634 | static void ec_generate_ec(struct ec_stripe_buf *buf) |
635 | { |
636 | struct bch_stripe *v = &bkey_i_to_stripe(k: &buf->key)->v; |
637 | unsigned nr_data = v->nr_blocks - v->nr_redundant; |
638 | unsigned bytes = le16_to_cpu(v->sectors) << 9; |
639 | |
640 | raid_gen(nd: nr_data, np: v->nr_redundant, size: bytes, v: buf->data); |
641 | } |
642 | |
643 | static unsigned ec_nr_failed(struct ec_stripe_buf *buf) |
644 | { |
645 | struct bch_stripe *v = &bkey_i_to_stripe(k: &buf->key)->v; |
646 | |
647 | return v->nr_blocks - bitmap_weight(src: buf->valid, nbits: v->nr_blocks); |
648 | } |
649 | |
650 | static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) |
651 | { |
652 | struct bch_stripe *v = &bkey_i_to_stripe(k: &buf->key)->v; |
653 | unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; |
654 | unsigned nr_data = v->nr_blocks - v->nr_redundant; |
655 | unsigned bytes = buf->size << 9; |
656 | |
657 | if (ec_nr_failed(buf) > v->nr_redundant) { |
658 | bch_err_ratelimited(c, |
659 | "error doing reconstruct read: unable to read enough blocks" ); |
660 | return -1; |
661 | } |
662 | |
663 | for (i = 0; i < nr_data; i++) |
664 | if (!test_bit(i, buf->valid)) |
665 | failed[nr_failed++] = i; |
666 | |
667 | raid_rec(nr: nr_failed, ir: failed, nd: nr_data, np: v->nr_redundant, size: bytes, v: buf->data); |
668 | return 0; |
669 | } |
670 | |
671 | /* IO: */ |
672 | |
673 | static void ec_block_endio(struct bio *bio) |
674 | { |
675 | struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); |
676 | struct bch_stripe *v = &bkey_i_to_stripe(k: &ec_bio->buf->key)->v; |
677 | struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; |
678 | struct bch_dev *ca = ec_bio->ca; |
679 | struct closure *cl = bio->bi_private; |
680 | |
681 | if (bch2_dev_io_err_on(bio->bi_status, ca, |
682 | bio_data_dir(bio) |
683 | ? BCH_MEMBER_ERROR_write |
684 | : BCH_MEMBER_ERROR_read, |
685 | "erasure coding %s error: %s" , |
686 | bio_data_dir(bio) ? "write" : "read" , |
687 | bch2_blk_status_to_str(bio->bi_status))) |
688 | clear_bit(nr: ec_bio->idx, addr: ec_bio->buf->valid); |
689 | |
690 | if (ptr_stale(ca, ptr)) { |
691 | bch_err_ratelimited(ca->fs, |
692 | "error %s stripe: stale pointer after io" , |
693 | bio_data_dir(bio) == READ ? "reading from" : "writing to" ); |
694 | clear_bit(nr: ec_bio->idx, addr: ec_bio->buf->valid); |
695 | } |
696 | |
697 | bio_put(&ec_bio->bio); |
698 | percpu_ref_put(ref: &ca->io_ref); |
699 | closure_put(cl); |
700 | } |
701 | |
702 | static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, |
703 | blk_opf_t opf, unsigned idx, struct closure *cl) |
704 | { |
705 | struct bch_stripe *v = &bkey_i_to_stripe(k: &buf->key)->v; |
706 | unsigned offset = 0, bytes = buf->size << 9; |
707 | struct bch_extent_ptr *ptr = &v->ptrs[idx]; |
708 | struct bch_dev *ca = bch_dev_bkey_exists(c, idx: ptr->dev); |
709 | enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant |
710 | ? BCH_DATA_user |
711 | : BCH_DATA_parity; |
712 | int rw = op_is_write(op: opf); |
713 | |
714 | if (ptr_stale(ca, ptr)) { |
715 | bch_err_ratelimited(c, |
716 | "error %s stripe: stale pointer" , |
717 | rw == READ ? "reading from" : "writing to" ); |
718 | clear_bit(nr: idx, addr: buf->valid); |
719 | return; |
720 | } |
721 | |
722 | if (!bch2_dev_get_ioref(ca, rw)) { |
723 | clear_bit(nr: idx, addr: buf->valid); |
724 | return; |
725 | } |
726 | |
727 | this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); |
728 | |
729 | while (offset < bytes) { |
730 | unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, |
731 | DIV_ROUND_UP(bytes, PAGE_SIZE)); |
732 | unsigned b = min_t(size_t, bytes - offset, |
733 | nr_iovecs << PAGE_SHIFT); |
734 | struct ec_bio *ec_bio; |
735 | |
736 | ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, |
737 | nr_iovecs, |
738 | opf, |
739 | GFP_KERNEL, |
740 | &c->ec_bioset), |
741 | struct ec_bio, bio); |
742 | |
743 | ec_bio->ca = ca; |
744 | ec_bio->buf = buf; |
745 | ec_bio->idx = idx; |
746 | |
747 | ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); |
748 | ec_bio->bio.bi_end_io = ec_block_endio; |
749 | ec_bio->bio.bi_private = cl; |
750 | |
751 | bch2_bio_map(bio: &ec_bio->bio, base: buf->data[idx] + offset, b); |
752 | |
753 | closure_get(cl); |
754 | percpu_ref_get(ref: &ca->io_ref); |
755 | |
756 | submit_bio(bio: &ec_bio->bio); |
757 | |
758 | offset += b; |
759 | } |
760 | |
761 | percpu_ref_put(ref: &ca->io_ref); |
762 | } |
763 | |
764 | static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, |
765 | struct ec_stripe_buf *stripe) |
766 | { |
767 | struct btree_iter iter; |
768 | struct bkey_s_c k; |
769 | int ret; |
770 | |
771 | k = bch2_bkey_get_iter(trans, iter: &iter, btree_id: BTREE_ID_stripes, |
772 | POS(0, idx), flags: BTREE_ITER_SLOTS); |
773 | ret = bkey_err(k); |
774 | if (ret) |
775 | goto err; |
776 | if (k.k->type != KEY_TYPE_stripe) { |
777 | ret = -ENOENT; |
778 | goto err; |
779 | } |
780 | bkey_reassemble(dst: &stripe->key, src: k); |
781 | err: |
782 | bch2_trans_iter_exit(trans, &iter); |
783 | return ret; |
784 | } |
785 | |
786 | /* recovery read path: */ |
787 | int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) |
788 | { |
789 | struct bch_fs *c = trans->c; |
790 | struct ec_stripe_buf *buf; |
791 | struct closure cl; |
792 | struct bch_stripe *v; |
793 | unsigned i, offset; |
794 | int ret = 0; |
795 | |
796 | closure_init_stack(cl: &cl); |
797 | |
798 | BUG_ON(!rbio->pick.has_ec); |
799 | |
800 | buf = kzalloc(size: sizeof(*buf), GFP_NOFS); |
801 | if (!buf) |
802 | return -BCH_ERR_ENOMEM_ec_read_extent; |
803 | |
804 | ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); |
805 | if (ret) { |
806 | bch_err_ratelimited(c, |
807 | "error doing reconstruct read: error %i looking up stripe" , ret); |
808 | kfree(objp: buf); |
809 | return -EIO; |
810 | } |
811 | |
812 | v = &bkey_i_to_stripe(k: &buf->key)->v; |
813 | |
814 | if (!bch2_ptr_matches_stripe(s: v, p: rbio->pick)) { |
815 | bch_err_ratelimited(c, |
816 | "error doing reconstruct read: pointer doesn't match stripe" ); |
817 | ret = -EIO; |
818 | goto err; |
819 | } |
820 | |
821 | offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; |
822 | if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { |
823 | bch_err_ratelimited(c, |
824 | "error doing reconstruct read: read is bigger than stripe" ); |
825 | ret = -EIO; |
826 | goto err; |
827 | } |
828 | |
829 | ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); |
830 | if (ret) |
831 | goto err; |
832 | |
833 | for (i = 0; i < v->nr_blocks; i++) |
834 | ec_block_io(c, buf, opf: REQ_OP_READ, idx: i, cl: &cl); |
835 | |
836 | closure_sync(cl: &cl); |
837 | |
838 | if (ec_nr_failed(buf) > v->nr_redundant) { |
839 | bch_err_ratelimited(c, |
840 | "error doing reconstruct read: unable to read enough blocks" ); |
841 | ret = -EIO; |
842 | goto err; |
843 | } |
844 | |
845 | ec_validate_checksums(c, buf); |
846 | |
847 | ret = ec_do_recov(c, buf); |
848 | if (ret) |
849 | goto err; |
850 | |
851 | memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, |
852 | buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); |
853 | err: |
854 | ec_stripe_buf_exit(buf); |
855 | kfree(objp: buf); |
856 | return ret; |
857 | } |
858 | |
859 | /* stripe bucket accounting: */ |
860 | |
861 | static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) |
862 | { |
863 | ec_stripes_heap n, *h = &c->ec_stripes_heap; |
864 | |
865 | if (idx >= h->size) { |
866 | if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) |
867 | return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; |
868 | |
869 | mutex_lock(&c->ec_stripes_heap_lock); |
870 | if (n.size > h->size) { |
871 | memcpy(n.data, h->data, h->used * sizeof(h->data[0])); |
872 | n.used = h->used; |
873 | swap(*h, n); |
874 | } |
875 | mutex_unlock(lock: &c->ec_stripes_heap_lock); |
876 | |
877 | free_heap(&n); |
878 | } |
879 | |
880 | if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) |
881 | return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; |
882 | |
883 | if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && |
884 | !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) |
885 | return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; |
886 | |
887 | return 0; |
888 | } |
889 | |
/*
 * Wrapper around __ec_stripe_mem_alloc() that may drop btree locks to
 * allocate with a more permissive gfp mask (via the
 * allocate_dropping_locks_errcode() helper, which supplies _gfp).
 */
static int ec_stripe_mem_alloc(struct btree_trans *trans,
			       struct btree_iter *iter)
{
	return allocate_dropping_locks_errcode(trans,
			__ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
}
896 | |
897 | /* |
898 | * Hash table of open stripes: |
899 | * Stripes that are being created or modified are kept in a hash table, so that |
900 | * stripe deletion can skip them. |
901 | */ |
902 | |
903 | static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) |
904 | { |
905 | unsigned hash = hash_64(val: idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); |
906 | struct ec_stripe_new *s; |
907 | |
908 | hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash) |
909 | if (s->idx == idx) |
910 | return true; |
911 | return false; |
912 | } |
913 | |
914 | static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx) |
915 | { |
916 | bool ret = false; |
917 | |
918 | spin_lock(lock: &c->ec_stripes_new_lock); |
919 | ret = __bch2_stripe_is_open(c, idx); |
920 | spin_unlock(lock: &c->ec_stripes_new_lock); |
921 | |
922 | return ret; |
923 | } |
924 | |
925 | static bool bch2_try_open_stripe(struct bch_fs *c, |
926 | struct ec_stripe_new *s, |
927 | u64 idx) |
928 | { |
929 | bool ret; |
930 | |
931 | spin_lock(lock: &c->ec_stripes_new_lock); |
932 | ret = !__bch2_stripe_is_open(c, idx); |
933 | if (ret) { |
934 | unsigned hash = hash_64(val: idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); |
935 | |
936 | s->idx = idx; |
937 | hlist_add_head(n: &s->hash, h: &c->ec_stripes_new[hash]); |
938 | } |
939 | spin_unlock(lock: &c->ec_stripes_new_lock); |
940 | |
941 | return ret; |
942 | } |
943 | |
944 | static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) |
945 | { |
946 | BUG_ON(!s->idx); |
947 | |
948 | spin_lock(lock: &c->ec_stripes_new_lock); |
949 | hlist_del_init(n: &s->hash); |
950 | spin_unlock(lock: &c->ec_stripes_new_lock); |
951 | |
952 | s->idx = 0; |
953 | } |
954 | |
955 | /* Heap of all existing stripes, ordered by blocks_nonempty */ |
956 | |
957 | static u64 stripe_idx_to_delete(struct bch_fs *c) |
958 | { |
959 | ec_stripes_heap *h = &c->ec_stripes_heap; |
960 | |
961 | lockdep_assert_held(&c->ec_stripes_heap_lock); |
962 | |
963 | if (h->used && |
964 | h->data[0].blocks_nonempty == 0 && |
965 | !bch2_stripe_is_open(c, idx: h->data[0].idx)) |
966 | return h->data[0].idx; |
967 | |
968 | return 0; |
969 | } |
970 | |
/* Heap comparator: orders entries by blocks_nonempty, ascending (-1/0/1). */
static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
				      struct ec_stripe_heap_entry l,
				      struct ec_stripe_heap_entry r)
{
	return ((l.blocks_nonempty > r.blocks_nonempty) -
		(l.blocks_nonempty < r.blocks_nonempty));
}
978 | |
/*
 * Heap callback: record entry i's heap position in the corresponding
 * struct stripe, so updates/deletes can find it without searching.
 */
static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
						   size_t i)
{
	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);

	genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
}
986 | |
/* Debug check: stripe @idx's heap_idx backpointer must be consistent. */
static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m = genradix_ptr(&c->stripes, idx);

	BUG_ON(m->heap_idx >= h->used);
	BUG_ON(h->data[m->heap_idx].idx != idx);
}
995 | |
/*
 * Remove stripe @idx from the stripes heap; the set_backpointer callback
 * keeps other stripes' heap_idx fields up to date as entries move.
 */
void bch2_stripes_heap_del(struct bch_fs *c,
			   struct stripe *m, size_t idx)
{
	mutex_lock(&c->ec_stripes_heap_lock);
	heap_verify_backpointer(c, idx);

	heap_del(&c->ec_stripes_heap, m->heap_idx,
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);
	mutex_unlock(lock: &c->ec_stripes_heap_lock);
}
1007 | |
/*
 * Insert stripe @idx into the stripes heap, keyed by its current
 * blocks_nonempty count.  The heap is sized for every possible stripe,
 * so running out of room indicates a bug (hence BUG_ON, not an error).
 */
void bch2_stripes_heap_insert(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	mutex_lock(&c->ec_stripes_heap_lock);
	BUG_ON(heap_full(&c->ec_stripes_heap));

	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
			.idx = idx,
			.blocks_nonempty = m->blocks_nonempty,
		}),
		 ec_stripes_heap_cmp,
		 ec_stripes_heap_set_backpointer);

	heap_verify_backpointer(c, idx);
	mutex_unlock(lock: &c->ec_stripes_heap_lock);
}
1024 | |
/*
 * The stripe's blocks_nonempty count changed: update its heap entry and
 * restore the heap invariant by sifting in both directions (only one of
 * the two sifts will actually move it).
 *
 * If the update made some stripe empty, kick off the deletion worker -
 * outside the heap lock, since bch2_do_stripe_deletes() queues work.
 */
void bch2_stripes_heap_update(struct bch_fs *c,
			      struct stripe *m, size_t idx)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	bool do_deletes;
	size_t i;

	mutex_lock(&c->ec_stripes_heap_lock);
	heap_verify_backpointer(c, idx);

	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;

	i = m->heap_idx;
	heap_sift_up(h,	  i, ec_stripes_heap_cmp,
		     ec_stripes_heap_set_backpointer);
	heap_sift_down(h, i, ec_stripes_heap_cmp,
		       ec_stripes_heap_set_backpointer);

	heap_verify_backpointer(c, idx);

	do_deletes = stripe_idx_to_delete(c) != 0;
	mutex_unlock(lock: &c->ec_stripes_heap_lock);

	if (do_deletes)
		bch2_do_stripe_deletes(c);
}
1051 | |
1052 | /* stripe deletion */ |
1053 | |
/*
 * Delete the stripe key at @idx from the stripes btree, within @trans.
 *
 * Sanity checks before deleting: the key must exist and be a stripe, and
 * every block must have a zero block count (a nonempty stripe still has
 * extents pointing into it and must not be deleted).
 *
 * Returns 0 on success, -EINVAL on inconsistency, or a btree error.
 */
static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_stripe s;
	int ret;

	k = bch2_bkey_get_iter(trans, iter: &iter, btree_id: BTREE_ID_stripes, POS(0, idx),
			       flags: BTREE_ITER_INTENT);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_stripe) {
		bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu" , idx);
		ret = -EINVAL;
		goto err;
	}

	s = bkey_s_c_to_stripe(k);
	for (unsigned i = 0; i < s.v->nr_blocks; i++)
		if (stripe_blockcount_get(s: s.v, idx: i)) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, k);
			bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s" , buf.buf);
			printbuf_exit(&buf);
			ret = -EINVAL;
			goto err;
		}

	ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1091 | |
/*
 * Workqueue entry point: repeatedly pick an empty, unopened stripe off
 * the top of the heap and delete its key, until none remain or an error
 * occurs.  The heap lock is only held while picking the index; the
 * deletion itself runs in its own transaction.
 *
 * Drops the write ref taken by bch2_do_stripe_deletes() when done.
 */
static void ec_stripe_delete_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, ec_stripe_delete_work);

	while (1) {
		mutex_lock(&c->ec_stripes_heap_lock);
		u64 idx = stripe_idx_to_delete(c);
		mutex_unlock(lock: &c->ec_stripes_heap_lock);

		if (!idx)
			break;

		int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
					ec_stripe_delete(trans, idx));
		bch_err_fn(c, ret);
		if (ret)
			break;
	}

	bch2_write_ref_put(c, ref: BCH_WRITE_REF_stripe_delete);
}
1114 | |
/*
 * Queue the stripe deletion worker, holding a write ref across it so the
 * filesystem can't go read-only underneath us.  If the work was already
 * queued (queue_work() returns false), drop the ref we just took.
 */
void bch2_do_stripe_deletes(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, ref: BCH_WRITE_REF_stripe_delete) &&
	    !queue_work(wq: c->write_ref_wq, work: &c->ec_stripe_delete_work))
		bch2_write_ref_put(c, ref: BCH_WRITE_REF_stripe_delete);
}
1121 | |
1122 | /* stripe creation: */ |
1123 | |
/*
 * Insert (@create == true) or overwrite (@create == false) the stripe key
 * @new in the stripes btree.
 *
 * When creating, the slot must currently be deleted; when updating, a
 * stripe must already exist there with the same nr_blocks, and its ptrs
 * for any block with a nonzero count must match (those blocks still hold
 * live data).  Existing per-block counts are carried over into @new.
 */
static int ec_stripe_key_update(struct btree_trans *trans,
				struct bkey_i_stripe *new,
				bool create)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	k = bch2_bkey_get_iter(trans, iter: &iter, btree_id: BTREE_ID_stripes,
			       pos: new->k.p, flags: BTREE_ITER_INTENT);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
		bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s" ,
				     create ? "creating" : "updating" ,
				     bch2_bkey_types[k.k->type]);
		ret = -EINVAL;
		goto err;
	}

	if (k.k->type == KEY_TYPE_stripe) {
		const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
		unsigned i;

		if (old->nr_blocks != new->v.nr_blocks) {
			bch_err(c, "error updating stripe: nr_blocks does not match" );
			ret = -EINVAL;
			goto err;
		}

		for (i = 0; i < new->v.nr_blocks; i++) {
			unsigned v = stripe_blockcount_get(s: old, idx: i);

			/* blocks with live data must keep their pointers: */
			BUG_ON(v &&
			       (old->ptrs[i].dev != new->v.ptrs[i].dev ||
				old->ptrs[i].gen != new->v.ptrs[i].gen ||
				old->ptrs[i].offset != new->v.ptrs[i].offset));

			stripe_blockcount_set(s: &new->v, idx: i, v);
		}
	}

	ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1174 | |
/*
 * Walk one backpointer of @bucket (starting at *bp_pos) and, if it points
 * to an extent using this stripe's bucket, rewrite the extent to carry a
 * stripe pointer referencing the new stripe.
 *
 * Advances *bp_pos; *bp_pos == SPOS_MAX means no more backpointers.
 * A btree-node backpointer in an EC bucket is a filesystem inconsistency
 * (we never erasure-code btree nodes) and returns -EIO.
 */
static int ec_stripe_update_extent(struct btree_trans *trans,
				   struct bpos bucket, u8 gen,
				   struct ec_stripe_buf *s,
				   struct bpos *bp_pos)
{
	struct bch_stripe *v = &bkey_i_to_stripe(k: &s->key)->v;
	struct bch_fs *c = trans->c;
	struct bch_backpointer bp;
	struct btree_iter iter;
	struct bkey_s_c k;
	const struct bch_extent_ptr *ptr_c;
	struct bch_extent_ptr *ptr, *ec_ptr = NULL;
	struct bch_extent_stripe_ptr stripe_ptr;
	struct bkey_i *n;
	int ret, dev, block;

	ret = bch2_get_next_backpointer(trans, bucket, gen,
				bp_pos, &bp, BTREE_ITER_CACHED);
	if (ret)
		return ret;
	if (bpos_eq(l: *bp_pos, SPOS_MAX))
		return 0;

	if (bp.level) {
		struct printbuf buf = PRINTBUF;
		struct btree_iter node_iter;
		struct btree *b;

		b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
		bch2_trans_iter_exit(trans, &node_iter);

		if (!b)
			return 0;

		prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n" , b);
		bch2_backpointer_to_text(&buf, &bp);

		bch2_fs_inconsistent(c, "%s" , buf.buf);
		printbuf_exit(&buf);
		return -EIO;
	}

	k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT);
	ret = bkey_err(k);
	if (ret)
		return ret;
	if (!k.k) {
		/*
		 * extent no longer exists - we could flush the btree
		 * write buffer and retry to verify, but no need:
		 */
		return 0;
	}

	/* already converted (e.g. by a previous, restarted pass): */
	if (extent_has_stripe_ptr(k, idx: s->key.k.p.offset))
		goto out;

	ptr_c = bkey_matches_stripe(s: v, k, block: &block);
	/*
	 * It doesn't generally make sense to erasure code cached ptrs:
	 * XXX: should we be incrementing a counter?
	 */
	if (!ptr_c || ptr_c->cached)
		goto out;

	dev = v->ptrs[block].dev;

	n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
	ret = PTR_ERR_OR_ZERO(ptr: n);
	if (ret)
		goto out;

	bkey_reassemble(dst: n, src: k);

	/* drop replicas on other devices; keep only the EC copy: */
	bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
	ec_ptr = bch2_bkey_has_device(k: bkey_i_to_s(k: n), dev);
	BUG_ON(!ec_ptr);

	stripe_ptr = (struct bch_extent_stripe_ptr) {
		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
		.block		= block,
		.redundancy	= v->nr_redundant,
		.idx		= s->key.k.p.offset,
	};

	__extent_entry_insert(k: n,
			dst: (union bch_extent_entry *) ec_ptr,
			new: (union bch_extent_entry *) &stripe_ptr);

	ret = bch2_trans_update(trans, &iter, n, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1269 | |
/*
 * Convert every extent in stripe block @block's bucket to reference the
 * new stripe, by iterating the bucket's backpointers; each extent update
 * is committed in its own transaction.
 */
static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
				   unsigned block)
{
	struct bch_fs *c = trans->c;
	struct bch_stripe *v = &bkey_i_to_stripe(k: &s->key)->v;
	struct bch_extent_ptr bucket = v->ptrs[block];
	struct bpos bucket_pos = PTR_BUCKET_POS(c, ptr: &bucket);
	struct bpos bp_pos = POS_MIN;
	int ret = 0;

	while (1) {
		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_check_rw|
				BCH_TRANS_COMMIT_no_enospc,
			ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
						s, &bp_pos));
		if (ret)
			break;
		if (bkey_eq(l: bp_pos, POS_MAX))
			break;

		bp_pos = bpos_nosnap_successor(p: bp_pos);
	}

	return ret;
}
1296 | |
/*
 * After a new stripe's key is written: rewrite the extents in all of its
 * data buckets to point into the stripe.  The btree write buffer is
 * flushed first so the backpointers we walk are up to date.
 */
static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bch_stripe *v = &bkey_i_to_stripe(k: &s->key)->v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
	int ret = 0;

	ret = bch2_btree_write_buffer_flush_sync(trans);
	if (ret)
		goto err;

	for (i = 0; i < nr_data; i++) {
		ret = ec_stripe_update_bucket(trans, s, block: i);
		if (ret)
			break;
	}
err:
	bch2_trans_put(trans);

	return ret;
}
1318 | |
/*
 * A data bucket of a new stripe wasn't completely filled: zero the unused
 * tail, both in the in-memory stripe buffer (so parity is computed over
 * zeroes) and on disk (so reads of the region are consistent with parity).
 * Errors are recorded in s->err rather than returned.
 */
static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
				       struct ec_stripe_new *s,
				       unsigned block,
				       struct open_bucket *ob)
{
	struct bch_dev *ca = bch_dev_bkey_exists(c, idx: ob->dev);
	unsigned offset = ca->mi.bucket_size - ob->sectors_free;
	int ret;

	if (!bch2_dev_get_ioref(ca, WRITE)) {
		s->err = -BCH_ERR_erofs_no_writes;
		return;
	}

	/* offset/sectors are in 512-byte sectors, buffer is in bytes: */
	memset(s->new_stripe.data[block] + (offset << 9),
	       0,
	       ob->sectors_free << 9);

	ret = blkdev_issue_zeroout(bdev: ca->disk_sb.bdev,
			sector: ob->bucket * ca->mi.bucket_size + offset,
			nr_sects: ob->sectors_free,
			GFP_KERNEL, flags: 0);

	percpu_ref_put(ref: &ca->io_ref);

	if (ret)
		s->err = ret;
}
1347 | |
/*
 * Final teardown of an ec_stripe_new: release its claimed stripe index,
 * if any, then free it.  Called when the last reference is dropped.
 */
void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
{
	if (s->idx)
		bch2_stripe_close(c, s);
	kfree(objp: s);
}
1354 | |
1355 | /* |
1356 | * data buckets of new stripe all written: create the stripe |
1357 | */ |
/*
 * All data buckets of the new stripe have been written: finish creating it.
 *
 * Steps: zero out unused bucket tails; if reusing an existing stripe,
 * verify/recover its blocks and splice the still-live ones into the new
 * stripe buffer; compute parity and checksums; write the parity blocks;
 * insert/update the stripe key; rewrite extents to reference the stripe.
 * On any failure we fall through to the common cleanup path, which
 * releases buckets, removes us from the pending list and drops our ref.
 */
static void ec_stripe_create(struct ec_stripe_new *s)
{
	struct bch_fs *c = s->c;
	struct open_bucket *ob;
	struct bch_stripe *v = &bkey_i_to_stripe(k: &s->new_stripe.key)->v;
	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
	int ret;

	/* must already be detached from its stripe head: */
	BUG_ON(s->h->s == s);

	closure_sync(cl: &s->iodone);

	if (!s->err) {
		for (i = 0; i < nr_data; i++)
			if (s->blocks[i]) {
				ob = c->open_buckets + s->blocks[i];

				if (ob->sectors_free)
					zero_out_rest_of_ec_bucket(c, s, block: i, ob);
			}
	}

	if (s->err) {
		if (!bch2_err_matches(s->err, EROFS))
			bch_err(c, "error creating stripe: error writing data buckets" );
		goto err;
	}

	if (s->have_existing_stripe) {
		ec_validate_checksums(c, buf: &s->existing_stripe);

		if (ec_do_recov(c, buf: &s->existing_stripe)) {
			bch_err(c, "error creating stripe: error reading existing stripe" );
			goto err;
		}

		/* keep blocks that still hold live data from the old stripe: */
		for (i = 0; i < nr_data; i++)
			if (stripe_blockcount_get(s: &bkey_i_to_stripe(k: &s->existing_stripe.key)->v, idx: i))
				swap(s->new_stripe.data[i],
				     s->existing_stripe.data[i]);

		ec_stripe_buf_exit(buf: &s->existing_stripe);
	}

	BUG_ON(!s->allocated);
	BUG_ON(!s->idx);

	ec_generate_ec(buf: &s->new_stripe);

	ec_generate_checksums(buf: &s->new_stripe);

	/* write p/q: */
	for (i = nr_data; i < v->nr_blocks; i++)
		ec_block_io(c, buf: &s->new_stripe, opf: REQ_OP_WRITE, idx: i, cl: &s->iodone);
	closure_sync(cl: &s->iodone);

	if (ec_nr_failed(buf: &s->new_stripe)) {
		bch_err(c, "error creating stripe: error writing redundancy buckets" );
		goto err;
	}

	ret = bch2_trans_do(c, &s->res, NULL,
			    BCH_TRANS_COMMIT_no_check_rw|
			    BCH_TRANS_COMMIT_no_enospc,
			    ec_stripe_key_update(trans,
					bkey_i_to_stripe(&s->new_stripe.key),
					!s->have_existing_stripe));
	bch_err_msg(c, ret, "creating stripe key" );
	if (ret) {
		goto err;
	}

	ret = ec_stripe_update_extents(c, s: &s->new_stripe);
	bch_err_msg(c, ret, "error updating extents" );
	if (ret)
		goto err;
err:
	bch2_disk_reservation_put(c, res: &s->res);

	for (i = 0; i < v->nr_blocks; i++)
		if (s->blocks[i]) {
			ob = c->open_buckets + s->blocks[i];

			if (i < nr_data) {
				/* data buckets: break the ob->ec link first */
				ob->ec = NULL;
				__bch2_open_bucket_put(c, ob);
			} else {
				bch2_open_bucket_put(c, ob);
			}
		}

	mutex_lock(&c->ec_stripe_new_lock);
	list_del(entry: &s->list);
	mutex_unlock(lock: &c->ec_stripe_new_lock);
	wake_up(&c->ec_stripe_new_wait);

	ec_stripe_buf_exit(buf: &s->existing_stripe);
	ec_stripe_buf_exit(buf: &s->new_stripe);
	closure_debug_destroy(cl: &s->iodone);

	ec_stripe_new_put(c, s, ref: STRIPE_REF_stripe);
}
1460 | |
/*
 * Find a pending stripe whose I/O has completed (io refcount hit zero),
 * or NULL if none is ready yet.
 */
static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
{
	struct ec_stripe_new *s;

	mutex_lock(&c->ec_stripe_new_lock);
	list_for_each_entry(s, &c->ec_stripe_new_list, list)
		if (!atomic_read(v: &s->ref[STRIPE_REF_io]))
			goto out;
	s = NULL;
out:
	mutex_unlock(lock: &c->ec_stripe_new_lock);

	return s;
}
1475 | |
/*
 * Workqueue entry point: create every pending stripe whose writes have
 * completed, then drop the write ref taken by bch2_ec_do_stripe_creates().
 */
static void ec_stripe_create_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work,
		struct bch_fs, ec_stripe_create_work);
	struct ec_stripe_new *s;

	while ((s = get_pending_stripe(c)))
		ec_stripe_create(s);

	bch2_write_ref_put(c, ref: BCH_WRITE_REF_stripe_create);
}
1487 | |
/*
 * Queue the stripe creation worker with a write ref held; if the work was
 * already queued, drop the ref we just took.
 */
void bch2_ec_do_stripe_creates(struct bch_fs *c)
{
	bch2_write_ref_get(c, ref: BCH_WRITE_REF_stripe_create);

	if (!queue_work(wq: system_long_wq, work: &c->ec_stripe_create_work))
		bch2_write_ref_put(c, ref: BCH_WRITE_REF_stripe_create);
}
1495 | |
/*
 * Detach the stripe head's current stripe and move it to the pending
 * list; it will be created once outstanding I/O completes.  Dropping the
 * io ref here may trigger creation immediately if no I/O is in flight.
 */
static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct ec_stripe_new *s = h->s;

	BUG_ON(!s->allocated && !s->err);

	h->s		= NULL;
	s->pending	= true;

	mutex_lock(&c->ec_stripe_new_lock);
	list_add(new: &s->list, head: &c->ec_stripe_new_list);
	mutex_unlock(lock: &c->ec_stripe_new_lock);

	ec_stripe_new_put(c, s, ref: STRIPE_REF_io);
}
1511 | |
/*
 * A write to an EC bucket failed: mark the whole in-progress stripe as
 * errored so creation is aborted.
 */
void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
	struct ec_stripe_new *s = ob->ec;

	s->err = -EIO;
}
1518 | |
/*
 * If the write point is writing to an erasure-coded bucket, return the
 * position in the stripe's in-memory buffer corresponding to the current
 * write offset (so data can be buffered for parity computation), else NULL.
 */
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
{
	struct open_bucket *ob = ec_open_bucket(c, obs: &wp->ptrs);
	struct bch_dev *ca;
	unsigned offset;

	if (!ob)
		return NULL;

	BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);

	ca	= bch_dev_bkey_exists(c, idx: ob->dev);
	offset	= ca->mi.bucket_size - ob->sectors_free;

	/* sectors -> bytes */
	return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}
1535 | |
/* sort() comparator for unsigned ints, ascending (-1/0/1 convention). */
static int unsigned_cmp(const void *_l, const void *_r)
{
	unsigned l = *((const unsigned *) _l);
	unsigned r = *((const unsigned *) _r);

	return (l > r) - (l < r);
}
1543 | |
1544 | /* pick most common bucket size: */ |
/* pick most common bucket size: */
static unsigned pick_blocksize(struct bch_fs *c,
			       struct bch_devs_mask *devs)
{
	/* sizes[] is bounded by BCH_SB_MEMBERS_MAX, the max device count */
	unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
	struct {
		unsigned nr, size;
	} cur = { 0, 0 }, best = { 0, 0 };

	for_each_member_device_rcu(c, ca, devs)
		sizes[nr++] = ca->mi.bucket_size;

	/* sort so equal sizes are adjacent, then scan for the longest run: */
	sort(base: sizes, num: nr, size: sizeof(unsigned), cmp_func: unsigned_cmp, NULL);

	for (unsigned i = 0; i < nr; i++) {
		if (sizes[i] != cur.size) {
			if (cur.nr > best.nr)
				best = cur;

			cur.nr		= 0;
			cur.size	= sizes[i];
		}

		cur.nr++;
	}

	/* the final run is not closed out by the loop: */
	if (cur.nr > best.nr)
		best = cur;

	return best.size;
}
1575 | |
/*
 * Whether to create a brand new stripe instead of reusing an existing one.
 * Currently hardwired to false: new stripes are only created when reuse
 * fails (see get_existing_stripe() / __bch2_ec_stripe_head_reuse()).
 */
static bool may_create_new_stripe(struct bch_fs *c)
{
	return false;
}
1580 | |
/*
 * Initialize a stripe bkey with the given geometry.  The checksum
 * granularity starts at encoded_extent_max and is coarsened (fewer,
 * larger checksummed chunks) until the value fits in a bkey.
 */
static void ec_stripe_key_init(struct bch_fs *c,
			       struct bkey_i *k,
			       unsigned nr_data,
			       unsigned nr_parity,
			       unsigned stripe_size)
{
	struct bkey_i_stripe *s = bkey_stripe_init(k: k);
	unsigned u64s;

	s->v.sectors			= cpu_to_le16(stripe_size);
	s->v.algorithm			= 0;
	s->v.nr_blocks			= nr_data + nr_parity;
	s->v.nr_redundant		= nr_parity;
	s->v.csum_granularity_bits	= ilog2(c->opts.encoded_extent_max >> 9);
	s->v.csum_type			= BCH_CSUM_crc32c;
	s->v.pad			= 0;

	while ((u64s = stripe_val_u64s(s: &s->v)) > BKEY_VAL_U64s_MAX) {
		/* granularity must stay below the stripe size: */
		BUG_ON(1 << s->v.csum_granularity_bits >=
		       le16_to_cpu(s->v.sectors) ||
		       s->v.csum_granularity_bits == U8_MAX);
		s->v.csum_granularity_bits++;
	}

	set_bkey_val_u64s(k: &s->k, val_u64s: u64s);
}
1607 | |
/*
 * Allocate and attach a new in-progress stripe to stripe head @h.
 * Starts with one stripe ref and one io ref held.
 * Returns 0 or -BCH_ERR_ENOMEM_ec_new_stripe_alloc.
 */
static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct ec_stripe_new *s;

	lockdep_assert_held(&h->lock);

	s = kzalloc(size: sizeof(*s), GFP_KERNEL);
	if (!s)
		return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;

	mutex_init(&s->lock);
	closure_init(cl: &s->iodone, NULL);
	atomic_set(v: &s->ref[STRIPE_REF_stripe], i: 1);
	atomic_set(v: &s->ref[STRIPE_REF_io], i: 1);
	s->c		= c;
	s->h		= h;
	/* nr_data is capped by how many pointers fit in a bkey: */
	s->nr_data	= min_t(unsigned, h->nr_active_devs,
				BCH_BKEY_PTRS_MAX) - h->redundancy;
	s->nr_parity	= h->redundancy;

	ec_stripe_key_init(c, k: &s->new_stripe.key,
			   nr_data: s->nr_data, nr_parity: s->nr_parity, stripe_size: h->blocksize);

	h->s = s;
	return 0;
}
1634 | |
/*
 * Allocate a stripe head for the given (target, algo, redundancy,
 * watermark) tuple and add it to the filesystem's list.  Returned with
 * h->lock held, matching the locked lookup path in
 * __bch2_ec_stripe_head_get().
 *
 * The device set is the target's rw devices minus zero-durability ones;
 * blocksize is the most common bucket size among them, and only devices
 * with that bucket size count as active.  A head with too few active
 * devices is still created (and listed) - callers reject it.
 */
static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
			 unsigned algo, unsigned redundancy,
			 enum bch_watermark watermark)
{
	struct ec_stripe_head *h;

	h = kzalloc(size: sizeof(*h), GFP_KERNEL);
	if (!h)
		return NULL;

	mutex_init(&h->lock);
	BUG_ON(!mutex_trylock(&h->lock));

	h->target	= target;
	h->algo		= algo;
	h->redundancy	= redundancy;
	h->watermark	= watermark;

	rcu_read_lock();
	h->devs = target_rw_devs(c, data_type: BCH_DATA_user, target);

	for_each_member_device_rcu(c, ca, &h->devs)
		if (!ca->mi.durability)
			__clear_bit(ca->dev_idx, h->devs.d);

	h->blocksize = pick_blocksize(c, devs: &h->devs);

	for_each_member_device_rcu(c, ca, &h->devs)
		if (ca->mi.bucket_size == h->blocksize)
			h->nr_active_devs++;

	rcu_read_unlock();

	/*
	 * If we only have redundancy + 1 devices, we're better off with just
	 * replication:
	 */
	if (h->nr_active_devs < h->redundancy + 2)
		bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?" ,
			h->nr_active_devs, h->redundancy + 2);

	list_add(new: &h->list, head: &c->ec_stripe_head_list);
	return h;
}
1680 | |
/*
 * Release a stripe head obtained from bch2_ec_stripe_head_get().  If its
 * current stripe is fully written (every data block allocated and used),
 * hand it off to the creation path before unlocking.
 */
void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
{
	if (h->s &&
	    h->s->allocated &&
	    bitmap_weight(src: h->s->blocks_allocated,
			  nbits: h->s->nr_data) == h->s->nr_data)
		ec_stripe_set_pending(c, h);

	mutex_unlock(lock: &h->lock);
}
1691 | |
/*
 * Look up (or create) the stripe head matching (target, algo, redundancy,
 * watermark), returned with h->lock held.
 *
 * Returns NULL when redundancy is 0 or the head doesn't have enough
 * active devices for erasure coding, an ERR_PTR on lock-restart or
 * read-only, otherwise the locked head.  Locks are taken via
 * bch2_trans_mutex_lock() so btree transaction restarts propagate.
 */
static struct ec_stripe_head *
__bch2_ec_stripe_head_get(struct btree_trans *trans,
			  unsigned target,
			  unsigned algo,
			  unsigned redundancy,
			  enum bch_watermark watermark)
{
	struct bch_fs *c = trans->c;
	struct ec_stripe_head *h;
	int ret;

	if (!redundancy)
		return NULL;

	ret = bch2_trans_mutex_lock(trans, lock: &c->ec_stripe_head_lock);
	if (ret)
		return ERR_PTR(error: ret);

	if (test_bit(BCH_FS_going_ro, &c->flags)) {
		h = ERR_PTR(error: -BCH_ERR_erofs_no_writes);
		goto found;
	}

	list_for_each_entry(h, &c->ec_stripe_head_list, list)
		if (h->target		== target &&
		    h->algo		== algo &&
		    h->redundancy	== redundancy &&
		    h->watermark	== watermark) {
			ret = bch2_trans_mutex_lock(trans, lock: &h->lock);
			if (ret)
				h = ERR_PTR(error: ret);
			goto found;
		}

	h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
found:
	if (!IS_ERR_OR_NULL(ptr: h) &&
	    h->nr_active_devs < h->redundancy + 2) {
		mutex_unlock(lock: &h->lock);
		h = NULL;
	}
	mutex_unlock(lock: &c->ec_stripe_head_lock);
	return h;
}
1736 | |
/*
 * Allocate buckets for the not-yet-populated blocks of h->s: parity
 * blocks first (BCH_DATA_parity), then data blocks (BCH_DATA_user),
 * skipping devices already used by blocks we have.
 *
 * Note the allocation result is applied to the stripe *before* checking
 * ret: a partial allocation is recorded so a later retry only has to fill
 * in the remainder.  @cl, if non-NULL, lets the allocator block waiting
 * for free buckets.
 */
static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
				    enum bch_watermark watermark, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct bch_devs_mask devs = h->devs;
	struct open_bucket *ob;
	struct open_buckets buckets;
	struct bch_stripe *v = &bkey_i_to_stripe(k: &h->s->new_stripe.key)->v;
	unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
	bool have_cache = true;
	int ret = 0;

	BUG_ON(v->nr_blocks	!= h->s->nr_data + h->s->nr_parity);
	BUG_ON(v->nr_redundant	!= h->s->nr_parity);

	/* exclude devices already backing a block; count what we have: */
	for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
		__clear_bit(v->ptrs[i].dev, devs.d);
		if (i < h->s->nr_data)
			nr_have_data++;
		else
			nr_have_parity++;
	}

	BUG_ON(nr_have_data	> h->s->nr_data);
	BUG_ON(nr_have_parity	> h->s->nr_parity);

	buckets.nr = 0;
	if (nr_have_parity < h->s->nr_parity) {
		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
					    &h->parity_stripe,
					    &devs,
					    h->s->nr_parity,
					    &nr_have_parity,
					    &have_cache, 0,
					    BCH_DATA_parity,
					    watermark,
					    cl);

		/* record whatever we did get, even on error: */
		open_bucket_for_each(c, &buckets, ob, i) {
			j = find_next_zero_bit(addr: h->s->blocks_gotten,
					       size: h->s->nr_data + h->s->nr_parity,
					       offset: h->s->nr_data);
			BUG_ON(j >= h->s->nr_data + h->s->nr_parity);

			h->s->blocks[j] = buckets.v[i];
			v->ptrs[j] = bch2_ob_ptr(c, ob);
			__set_bit(j, h->s->blocks_gotten);
		}

		if (ret)
			return ret;
	}

	buckets.nr = 0;
	if (nr_have_data < h->s->nr_data) {
		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
					    &h->block_stripe,
					    &devs,
					    h->s->nr_data,
					    &nr_have_data,
					    &have_cache, 0,
					    BCH_DATA_user,
					    watermark,
					    cl);

		open_bucket_for_each(c, &buckets, ob, i) {
			j = find_next_zero_bit(addr: h->s->blocks_gotten,
					       size: h->s->nr_data, offset: 0);
			BUG_ON(j >= h->s->nr_data);

			h->s->blocks[j] = buckets.v[i];
			v->ptrs[j] = bch2_ob_ptr(c, ob);
			__set_bit(j, h->s->blocks_gotten);
		}

		if (ret)
			return ret;
	}

	return 0;
}
1818 | |
1819 | /* XXX: doesn't obey target: */ |
/*
 * Scan the stripes heap for an existing stripe compatible with @head
 * (same algorithm, redundancy and blocksize) that has at least one empty
 * block to reuse, and atomically open it.  Returns the stripe index, or
 * -1 if none was found (or we'd rather create a new stripe).
 */
static s64 get_existing_stripe(struct bch_fs *c,
			       struct ec_stripe_head *head)
{
	ec_stripes_heap *h = &c->ec_stripes_heap;
	struct stripe *m;
	size_t heap_idx;
	u64 stripe_idx;
	s64 ret = -1;

	if (may_create_new_stripe(c))
		return -1;

	mutex_lock(&c->ec_stripes_heap_lock);
	for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
		/* No blocks worth reusing, stripe will just be deleted: */
		if (!h->data[heap_idx].blocks_nonempty)
			continue;

		stripe_idx = h->data[heap_idx].idx;

		m = genradix_ptr(&c->stripes, stripe_idx);

		if (m->algorithm	== head->algo &&
		    m->nr_redundant	== head->redundancy &&
		    m->sectors		== head->blocksize &&
		    m->blocks_nonempty	< m->nr_blocks - m->nr_redundant &&
		    bch2_try_open_stripe(c, s: head->s, idx: stripe_idx)) {
			ret = stripe_idx;
			break;
		}
	}
	mutex_unlock(lock: &c->ec_stripes_heap_lock);
	return ret;
}
1854 | |
/*
 * Set up h->s to reuse an existing stripe: find and open one, read its
 * key and data blocks, and adopt its geometry and pointers.  Blocks that
 * still hold data are marked gotten/allocated; buckets we had already
 * allocated are released since they may conflict with the reused layout.
 *
 * Returns -BCH_ERR_stripe_alloc_blocked when no reusable stripe exists
 * (caller then waits on copygc), or another error on failure.
 */
static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
{
	struct bch_fs *c = trans->c;
	struct bch_stripe *new_v = &bkey_i_to_stripe(k: &h->s->new_stripe.key)->v;
	struct bch_stripe *existing_v;
	unsigned i;
	s64 idx;
	int ret;

	/*
	 * If we can't allocate a new stripe, and there's no stripes with empty
	 * blocks for us to reuse, that means we have to wait on copygc:
	 */
	idx = get_existing_stripe(c, head: h);
	if (idx < 0)
		return -BCH_ERR_stripe_alloc_blocked;

	ret = get_stripe_key_trans(trans, idx, stripe: &h->s->existing_stripe);
	bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
			     "reading stripe key: %s" , bch2_err_str(ret));
	if (ret) {
		bch2_stripe_close(c, s: h->s);
		return ret;
	}

	existing_v = &bkey_i_to_stripe(k: &h->s->existing_stripe.key)->v;

	BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
	h->s->nr_data = existing_v->nr_blocks -
		existing_v->nr_redundant;

	ret = ec_stripe_buf_init(buf: &h->s->existing_stripe, offset: 0, size: h->blocksize);
	if (ret) {
		bch2_stripe_close(c, s: h->s);
		return ret;
	}

	BUG_ON(h->s->existing_stripe.size != h->blocksize);
	BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));

	/*
	 * Free buckets we initially allocated - they might conflict with
	 * blocks from the stripe we're reusing:
	 */
	for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
		bch2_open_bucket_put(c, ob: c->open_buckets + h->s->blocks[i]);
		h->s->blocks[i] = 0;
	}
	memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
	memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));

	for (i = 0; i < existing_v->nr_blocks; i++) {
		if (stripe_blockcount_get(s: existing_v, idx: i)) {
			__set_bit(i, h->s->blocks_gotten);
			__set_bit(i, h->s->blocks_allocated);
		}

		/* read every block so we can recompute parity later: */
		ec_block_io(c, buf: &h->s->existing_stripe, READ, idx: i, cl: &h->s->iodone);
	}

	bkey_copy(dst: &h->s->new_stripe.key, src: &h->s->existing_stripe.key);
	h->s->have_existing_stripe = true;

	return 0;
}
1920 | |
/*
 * Reserve disk space for the stripe's parity and claim a free slot in the
 * stripes btree for its key.
 *
 * Slot search starts at c->ec_stripe_hint (wrapping to the minimum once,
 * to avoid rescanning from the start every time) and looks for a deleted
 * key at or below U32_MAX that can be opened.  The hint is updated to
 * wherever the search ended.  On error the reservation is released.
 */
static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bpos min_pos = POS(0, 1);
	struct bpos start_pos = bpos_max(l: min_pos, POS(0, c->ec_stripe_hint));
	int ret;

	if (!h->s->res.sectors) {
		ret = bch2_disk_reservation_get(c, res: &h->s->res,
					sectors: h->blocksize,
					nr_replicas: h->s->nr_parity,
					BCH_DISK_RESERVATION_NOFAIL);
		if (ret)
			return ret;
	}

	for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
		if (bkey_gt(l: k.k->p, POS(0, U32_MAX))) {
			if (start_pos.offset) {
				/* wrap around to the low end, once: */
				start_pos = min_pos;
				bch2_btree_iter_set_pos(iter: &iter, new_pos: start_pos);
				continue;
			}

			ret = -BCH_ERR_ENOSPC_stripe_create;
			break;
		}

		if (bkey_deleted(k.k) &&
		    bch2_try_open_stripe(c, s: h->s, idx: k.k->p.offset))
			break;
	}

	c->ec_stripe_hint = iter.pos.offset;

	if (ret)
		goto err;

	ret = ec_stripe_mem_alloc(trans, iter: &iter);
	if (ret) {
		bch2_stripe_close(c, s: h->s);
		goto err;
	}

	h->s->new_stripe.key.k.p = iter.pos;
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
err:
	bch2_disk_reservation_put(c, res: &h->s->res);
	goto out;
}
1976 | |
/*
 * Get a stripe head with a usable, fully-allocated stripe attached,
 * ready for writes; returned locked (release via bch2_ec_stripe_head_put()).
 *
 * Strategy: first try a brand-new stripe at the stripe watermark; if
 * space is short, loop trying to reuse an existing stripe (waiting on
 * @cl / copygc when blocked - copygc itself must not wait, so it falls
 * back to allocating at its own watermark).  Finally, allocate any
 * remaining buckets at the caller's watermark and the stripe buffer.
 *
 * Returns NULL when erasure coding isn't possible for this target, an
 * ERR_PTR on error (head already released), else the locked head.
 */
struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
					       unsigned target,
					       unsigned algo,
					       unsigned redundancy,
					       enum bch_watermark watermark,
					       struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct ec_stripe_head *h;
	bool waiting = false;
	int ret;

	h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
	if (IS_ERR_OR_NULL(ptr: h))
		return h;

	if (!h->s) {
		ret = ec_new_stripe_alloc(c, h);
		if (ret) {
			bch_err(c, "failed to allocate new stripe" );
			goto err;
		}
	}

	if (h->s->allocated)
		goto allocated;

	if (h->s->have_existing_stripe)
		goto alloc_existing;

	/* First, try to allocate a full stripe: */
	ret =   new_stripe_alloc_buckets(trans, h, watermark: BCH_WATERMARK_stripe, NULL) ?:
		__bch2_ec_stripe_head_reserve(trans, h);
	if (!ret)
		goto allocate_buf;
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    bch2_err_matches(ret, ENOMEM))
		goto err;

	/*
	 * Not enough buckets available for a full stripe: we must reuse an
	 * existing stripe:
	 */
	while (1) {
		ret = __bch2_ec_stripe_head_reuse(trans, h);
		if (!ret)
			break;
		if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
			goto err;

		if (watermark == BCH_WATERMARK_copygc) {
			/* copygc can't wait on itself to free space: */
			ret =   new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
				__bch2_ec_stripe_head_reserve(trans, h);
			if (ret)
				goto err;
			goto allocate_buf;
		}

		/* XXX freelist_wait? */
		closure_wait(list: &c->freelist_wait, cl);
		waiting = true;
	}

	if (waiting)
		closure_wake_up(list: &c->freelist_wait);
alloc_existing:
	/*
	 * Retry allocating buckets, with the watermark for this
	 * particular write:
	 */
	ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
	if (ret)
		goto err;

allocate_buf:
	ret = ec_stripe_buf_init(buf: &h->s->new_stripe, offset: 0, size: h->blocksize);
	if (ret)
		goto err;

	h->s->allocated = true;
allocated:
	BUG_ON(!h->s->idx);
	BUG_ON(!h->s->new_stripe.data[0]);
	BUG_ON(trans->restarted);
	return h;
err:
	bch2_ec_stripe_head_put(c, h);
	return ERR_PTR(error: ret);
}
2066 | |
2067 | static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) |
2068 | { |
2069 | struct ec_stripe_head *h; |
2070 | struct open_bucket *ob; |
2071 | unsigned i; |
2072 | |
2073 | mutex_lock(&c->ec_stripe_head_lock); |
2074 | list_for_each_entry(h, &c->ec_stripe_head_list, list) { |
2075 | mutex_lock(&h->lock); |
2076 | if (!h->s) |
2077 | goto unlock; |
2078 | |
2079 | if (!ca) |
2080 | goto found; |
2081 | |
2082 | for (i = 0; i < bkey_i_to_stripe(k: &h->s->new_stripe.key)->v.nr_blocks; i++) { |
2083 | if (!h->s->blocks[i]) |
2084 | continue; |
2085 | |
2086 | ob = c->open_buckets + h->s->blocks[i]; |
2087 | if (ob->dev == ca->dev_idx) |
2088 | goto found; |
2089 | } |
2090 | goto unlock; |
2091 | found: |
2092 | h->s->err = -BCH_ERR_erofs_no_writes; |
2093 | ec_stripe_set_pending(c, h); |
2094 | unlock: |
2095 | mutex_unlock(lock: &h->lock); |
2096 | } |
2097 | mutex_unlock(lock: &c->ec_stripe_head_lock); |
2098 | } |
2099 | |
/* Cancel in-progress stripes that have an open bucket on device @ca. */
void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
	__bch2_ec_stop(c, ca);
}
2104 | |
/* Cancel all in-progress stripes (NULL device == match every stripe). */
void bch2_fs_ec_stop(struct bch_fs *c)
{
	__bch2_ec_stop(c, NULL);
}
2109 | |
2110 | static bool bch2_fs_ec_flush_done(struct bch_fs *c) |
2111 | { |
2112 | bool ret; |
2113 | |
2114 | mutex_lock(&c->ec_stripe_new_lock); |
2115 | ret = list_empty(head: &c->ec_stripe_new_list); |
2116 | mutex_unlock(lock: &c->ec_stripe_new_lock); |
2117 | |
2118 | return ret; |
2119 | } |
2120 | |
/* Block until all pending new stripes have been created (or torn down). */
void bch2_fs_ec_flush(struct bch_fs *c)
{
	wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
}
2125 | |
/*
 * At startup, walk the stripes btree and rebuild the in-memory stripe radix
 * tree and the stripes heap from the on-disk stripe keys.
 *
 * Returns 0 on success or a negative bch2 error code.
 */
int bch2_stripes_read(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
				   BTREE_ITER_PREFETCH, k, ({
			if (k.k->type != KEY_TYPE_stripe)
				continue;

			/* Ensure c->stripes has a slot for this index: */
			ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
			if (ret)
				break;

			const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;

			/* Mirror the on-disk stripe into the in-memory summary: */
			struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
			m->sectors	= le16_to_cpu(s->sectors);
			m->algorithm	= s->algorithm;
			m->nr_blocks	= s->nr_blocks;
			m->nr_redundant	= s->nr_redundant;
			m->blocks_nonempty = 0;

			for (unsigned i = 0; i < s->nr_blocks; i++)
				m->blocks_nonempty += !!stripe_blockcount_get(s, i);

			bch2_stripes_heap_insert(c, m, k.k->p.offset);
			/* statement-expression result: 0 == continue iterating */
			0;
		})));
	bch_err_fn(c, ret);
	return ret;
}
2156 | |
2157 | void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) |
2158 | { |
2159 | ec_stripes_heap *h = &c->ec_stripes_heap; |
2160 | struct stripe *m; |
2161 | size_t i; |
2162 | |
2163 | mutex_lock(&c->ec_stripes_heap_lock); |
2164 | for (i = 0; i < min_t(size_t, h->used, 50); i++) { |
2165 | m = genradix_ptr(&c->stripes, h->data[i].idx); |
2166 | |
2167 | prt_printf(out, "%zu %u/%u+%u" , h->data[i].idx, |
2168 | h->data[i].blocks_nonempty, |
2169 | m->nr_blocks - m->nr_redundant, |
2170 | m->nr_redundant); |
2171 | if (bch2_stripe_is_open(c, idx: h->data[i].idx)) |
2172 | prt_str(out, str: " open" ); |
2173 | prt_newline(out); |
2174 | } |
2175 | mutex_unlock(lock: &c->ec_stripes_heap_lock); |
2176 | } |
2177 | |
2178 | void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) |
2179 | { |
2180 | struct ec_stripe_head *h; |
2181 | struct ec_stripe_new *s; |
2182 | |
2183 | mutex_lock(&c->ec_stripe_head_lock); |
2184 | list_for_each_entry(h, &c->ec_stripe_head_list, list) { |
2185 | prt_printf(out, "target %u algo %u redundancy %u %s:\n" , |
2186 | h->target, h->algo, h->redundancy, |
2187 | bch2_watermarks[h->watermark]); |
2188 | |
2189 | if (h->s) |
2190 | prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n" , |
2191 | h->s->idx, h->s->nr_data, h->s->nr_parity, |
2192 | bitmap_weight(h->s->blocks_allocated, |
2193 | h->s->nr_data)); |
2194 | } |
2195 | mutex_unlock(lock: &c->ec_stripe_head_lock); |
2196 | |
2197 | prt_printf(out, "in flight:\n" ); |
2198 | |
2199 | mutex_lock(&c->ec_stripe_new_lock); |
2200 | list_for_each_entry(s, &c->ec_stripe_new_list, list) { |
2201 | prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n" , |
2202 | s->idx, s->nr_data, s->nr_parity, |
2203 | atomic_read(&s->ref[STRIPE_REF_io]), |
2204 | atomic_read(&s->ref[STRIPE_REF_stripe]), |
2205 | bch2_watermarks[s->h->watermark]); |
2206 | } |
2207 | mutex_unlock(lock: &c->ec_stripe_new_lock); |
2208 | } |
2209 | |
2210 | void bch2_fs_ec_exit(struct bch_fs *c) |
2211 | { |
2212 | struct ec_stripe_head *h; |
2213 | unsigned i; |
2214 | |
2215 | while (1) { |
2216 | mutex_lock(&c->ec_stripe_head_lock); |
2217 | h = list_first_entry_or_null(&c->ec_stripe_head_list, |
2218 | struct ec_stripe_head, list); |
2219 | if (h) |
2220 | list_del(entry: &h->list); |
2221 | mutex_unlock(lock: &c->ec_stripe_head_lock); |
2222 | if (!h) |
2223 | break; |
2224 | |
2225 | if (h->s) { |
2226 | for (i = 0; i < bkey_i_to_stripe(k: &h->s->new_stripe.key)->v.nr_blocks; i++) |
2227 | BUG_ON(h->s->blocks[i]); |
2228 | |
2229 | kfree(objp: h->s); |
2230 | } |
2231 | kfree(objp: h); |
2232 | } |
2233 | |
2234 | BUG_ON(!list_empty(&c->ec_stripe_new_list)); |
2235 | |
2236 | free_heap(&c->ec_stripes_heap); |
2237 | genradix_free(&c->stripes); |
2238 | bioset_exit(&c->ec_bioset); |
2239 | } |
2240 | |
2241 | void bch2_fs_ec_init_early(struct bch_fs *c) |
2242 | { |
2243 | spin_lock_init(&c->ec_stripes_new_lock); |
2244 | mutex_init(&c->ec_stripes_heap_lock); |
2245 | |
2246 | INIT_LIST_HEAD(list: &c->ec_stripe_head_list); |
2247 | mutex_init(&c->ec_stripe_head_lock); |
2248 | |
2249 | INIT_LIST_HEAD(list: &c->ec_stripe_new_list); |
2250 | mutex_init(&c->ec_stripe_new_lock); |
2251 | init_waitqueue_head(&c->ec_stripe_new_wait); |
2252 | |
2253 | INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); |
2254 | INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); |
2255 | } |
2256 | |
2257 | int bch2_fs_ec_init(struct bch_fs *c) |
2258 | { |
2259 | return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), |
2260 | flags: BIOSET_NEED_BVECS); |
2261 | } |
2262 | |