1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Copyright (c) 2017-2018 Christoph Hellwig. |
4 | */ |
5 | |
6 | #include <linux/backing-dev.h> |
7 | #include <linux/moduleparam.h> |
8 | #include <linux/vmalloc.h> |
9 | #include <trace/events/block.h> |
10 | #include "nvme.h" |
11 | |
12 | bool multipath = true; |
13 | module_param(multipath, bool, 0444); |
14 | MODULE_PARM_DESC(multipath, |
15 | "turn on native support for multiple controllers per subsystem" ); |
16 | |
17 | static const char *nvme_iopolicy_names[] = { |
18 | [NVME_IOPOLICY_NUMA] = "numa" , |
19 | [NVME_IOPOLICY_RR] = "round-robin" , |
20 | }; |
21 | |
22 | static int iopolicy = NVME_IOPOLICY_NUMA; |
23 | |
24 | static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp) |
25 | { |
26 | if (!val) |
27 | return -EINVAL; |
28 | if (!strncmp(val, "numa" , 4)) |
29 | iopolicy = NVME_IOPOLICY_NUMA; |
30 | else if (!strncmp(val, "round-robin" , 11)) |
31 | iopolicy = NVME_IOPOLICY_RR; |
32 | else |
33 | return -EINVAL; |
34 | |
35 | return 0; |
36 | } |
37 | |
38 | static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp) |
39 | { |
40 | return sprintf(buf, fmt: "%s\n" , nvme_iopolicy_names[iopolicy]); |
41 | } |
42 | |
43 | module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy, |
44 | &iopolicy, 0644); |
45 | MODULE_PARM_DESC(iopolicy, |
46 | "Default multipath I/O policy; 'numa' (default) or 'round-robin'" ); |
47 | |
48 | void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys) |
49 | { |
50 | subsys->iopolicy = iopolicy; |
51 | } |
52 | |
53 | void nvme_mpath_unfreeze(struct nvme_subsystem *subsys) |
54 | { |
55 | struct nvme_ns_head *h; |
56 | |
57 | lockdep_assert_held(&subsys->lock); |
58 | list_for_each_entry(h, &subsys->nsheads, entry) |
59 | if (h->disk) |
60 | blk_mq_unfreeze_queue(q: h->disk->queue); |
61 | } |
62 | |
63 | void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) |
64 | { |
65 | struct nvme_ns_head *h; |
66 | |
67 | lockdep_assert_held(&subsys->lock); |
68 | list_for_each_entry(h, &subsys->nsheads, entry) |
69 | if (h->disk) |
70 | blk_mq_freeze_queue_wait(q: h->disk->queue); |
71 | } |
72 | |
73 | void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) |
74 | { |
75 | struct nvme_ns_head *h; |
76 | |
77 | lockdep_assert_held(&subsys->lock); |
78 | list_for_each_entry(h, &subsys->nsheads, entry) |
79 | if (h->disk) |
80 | blk_freeze_queue_start(q: h->disk->queue); |
81 | } |
82 | |
83 | void nvme_failover_req(struct request *req) |
84 | { |
85 | struct nvme_ns *ns = req->q->queuedata; |
86 | u16 status = nvme_req(req)->status & 0x7ff; |
87 | unsigned long flags; |
88 | struct bio *bio; |
89 | |
90 | nvme_mpath_clear_current_path(ns); |
91 | |
92 | /* |
93 | * If we got back an ANA error, we know the controller is alive but not |
94 | * ready to serve this namespace. Kick of a re-read of the ANA |
95 | * information page, and just try any other available path for now. |
96 | */ |
97 | if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) { |
98 | set_bit(NVME_NS_ANA_PENDING, addr: &ns->flags); |
99 | queue_work(wq: nvme_wq, work: &ns->ctrl->ana_work); |
100 | } |
101 | |
102 | spin_lock_irqsave(&ns->head->requeue_lock, flags); |
103 | for (bio = req->bio; bio; bio = bio->bi_next) { |
104 | bio_set_dev(bio, bdev: ns->head->disk->part0); |
105 | if (bio->bi_opf & REQ_POLLED) { |
106 | bio->bi_opf &= ~REQ_POLLED; |
107 | bio->bi_cookie = BLK_QC_T_NONE; |
108 | } |
109 | /* |
110 | * The alternate request queue that we may end up submitting |
111 | * the bio to may be frozen temporarily, in this case REQ_NOWAIT |
112 | * will fail the I/O immediately with EAGAIN to the issuer. |
113 | * We are not in the issuer context which cannot block. Clear |
114 | * the flag to avoid spurious EAGAIN I/O failures. |
115 | */ |
116 | bio->bi_opf &= ~REQ_NOWAIT; |
117 | } |
118 | blk_steal_bios(list: &ns->head->requeue_list, rq: req); |
119 | spin_unlock_irqrestore(lock: &ns->head->requeue_lock, flags); |
120 | |
121 | blk_mq_end_request(rq: req, error: 0); |
122 | kblockd_schedule_work(work: &ns->head->requeue_work); |
123 | } |
124 | |
125 | void nvme_mpath_start_request(struct request *rq) |
126 | { |
127 | struct nvme_ns *ns = rq->q->queuedata; |
128 | struct gendisk *disk = ns->head->disk; |
129 | |
130 | if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq)) |
131 | return; |
132 | |
133 | nvme_req(req: rq)->flags |= NVME_MPATH_IO_STATS; |
134 | nvme_req(req: rq)->start_time = bdev_start_io_acct(bdev: disk->part0, op: req_op(req: rq), |
135 | start_time: jiffies); |
136 | } |
137 | EXPORT_SYMBOL_GPL(nvme_mpath_start_request); |
138 | |
139 | void nvme_mpath_end_request(struct request *rq) |
140 | { |
141 | struct nvme_ns *ns = rq->q->queuedata; |
142 | |
143 | if (!(nvme_req(req: rq)->flags & NVME_MPATH_IO_STATS)) |
144 | return; |
145 | bdev_end_io_acct(bdev: ns->head->disk->part0, op: req_op(req: rq), |
146 | sectors: blk_rq_bytes(rq) >> SECTOR_SHIFT, |
147 | start_time: nvme_req(req: rq)->start_time); |
148 | } |
149 | |
150 | void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) |
151 | { |
152 | struct nvme_ns *ns; |
153 | |
154 | down_read(sem: &ctrl->namespaces_rwsem); |
155 | list_for_each_entry(ns, &ctrl->namespaces, list) { |
156 | if (!ns->head->disk) |
157 | continue; |
158 | kblockd_schedule_work(work: &ns->head->requeue_work); |
159 | if (ctrl->state == NVME_CTRL_LIVE) |
160 | disk_uevent(disk: ns->head->disk, action: KOBJ_CHANGE); |
161 | } |
162 | up_read(sem: &ctrl->namespaces_rwsem); |
163 | } |
164 | |
165 | static const char *nvme_ana_state_names[] = { |
166 | [0] = "invalid state" , |
167 | [NVME_ANA_OPTIMIZED] = "optimized" , |
168 | [NVME_ANA_NONOPTIMIZED] = "non-optimized" , |
169 | [NVME_ANA_INACCESSIBLE] = "inaccessible" , |
170 | [NVME_ANA_PERSISTENT_LOSS] = "persistent-loss" , |
171 | [NVME_ANA_CHANGE] = "change" , |
172 | }; |
173 | |
174 | bool nvme_mpath_clear_current_path(struct nvme_ns *ns) |
175 | { |
176 | struct nvme_ns_head *head = ns->head; |
177 | bool changed = false; |
178 | int node; |
179 | |
180 | if (!head) |
181 | goto out; |
182 | |
183 | for_each_node(node) { |
184 | if (ns == rcu_access_pointer(head->current_path[node])) { |
185 | rcu_assign_pointer(head->current_path[node], NULL); |
186 | changed = true; |
187 | } |
188 | } |
189 | out: |
190 | return changed; |
191 | } |
192 | |
193 | void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) |
194 | { |
195 | struct nvme_ns *ns; |
196 | |
197 | down_read(sem: &ctrl->namespaces_rwsem); |
198 | list_for_each_entry(ns, &ctrl->namespaces, list) { |
199 | nvme_mpath_clear_current_path(ns); |
200 | kblockd_schedule_work(work: &ns->head->requeue_work); |
201 | } |
202 | up_read(sem: &ctrl->namespaces_rwsem); |
203 | } |
204 | |
205 | void nvme_mpath_revalidate_paths(struct nvme_ns *ns) |
206 | { |
207 | struct nvme_ns_head *head = ns->head; |
208 | sector_t capacity = get_capacity(disk: head->disk); |
209 | int node; |
210 | int srcu_idx; |
211 | |
212 | srcu_idx = srcu_read_lock(ssp: &head->srcu); |
213 | list_for_each_entry_rcu(ns, &head->list, siblings) { |
214 | if (capacity != get_capacity(disk: ns->disk)) |
215 | clear_bit(NVME_NS_READY, addr: &ns->flags); |
216 | } |
217 | srcu_read_unlock(ssp: &head->srcu, idx: srcu_idx); |
218 | |
219 | for_each_node(node) |
220 | rcu_assign_pointer(head->current_path[node], NULL); |
221 | kblockd_schedule_work(work: &head->requeue_work); |
222 | } |
223 | |
224 | static bool nvme_path_is_disabled(struct nvme_ns *ns) |
225 | { |
226 | /* |
227 | * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should |
228 | * still be able to complete assuming that the controller is connected. |
229 | * Otherwise it will fail immediately and return to the requeue list. |
230 | */ |
231 | if (ns->ctrl->state != NVME_CTRL_LIVE && |
232 | ns->ctrl->state != NVME_CTRL_DELETING) |
233 | return true; |
234 | if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) || |
235 | !test_bit(NVME_NS_READY, &ns->flags)) |
236 | return true; |
237 | return false; |
238 | } |
239 | |
240 | static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) |
241 | { |
242 | int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; |
243 | struct nvme_ns *found = NULL, *fallback = NULL, *ns; |
244 | |
245 | list_for_each_entry_rcu(ns, &head->list, siblings) { |
246 | if (nvme_path_is_disabled(ns)) |
247 | continue; |
248 | |
249 | if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA) |
250 | distance = node_distance(node, ns->ctrl->numa_node); |
251 | else |
252 | distance = LOCAL_DISTANCE; |
253 | |
254 | switch (ns->ana_state) { |
255 | case NVME_ANA_OPTIMIZED: |
256 | if (distance < found_distance) { |
257 | found_distance = distance; |
258 | found = ns; |
259 | } |
260 | break; |
261 | case NVME_ANA_NONOPTIMIZED: |
262 | if (distance < fallback_distance) { |
263 | fallback_distance = distance; |
264 | fallback = ns; |
265 | } |
266 | break; |
267 | default: |
268 | break; |
269 | } |
270 | } |
271 | |
272 | if (!found) |
273 | found = fallback; |
274 | if (found) |
275 | rcu_assign_pointer(head->current_path[node], found); |
276 | return found; |
277 | } |
278 | |
279 | static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head, |
280 | struct nvme_ns *ns) |
281 | { |
282 | ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns, |
283 | siblings); |
284 | if (ns) |
285 | return ns; |
286 | return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings); |
287 | } |
288 | |
289 | static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, |
290 | int node, struct nvme_ns *old) |
291 | { |
292 | struct nvme_ns *ns, *found = NULL; |
293 | |
294 | if (list_is_singular(head: &head->list)) { |
295 | if (nvme_path_is_disabled(ns: old)) |
296 | return NULL; |
297 | return old; |
298 | } |
299 | |
300 | for (ns = nvme_next_ns(head, ns: old); |
301 | ns && ns != old; |
302 | ns = nvme_next_ns(head, ns)) { |
303 | if (nvme_path_is_disabled(ns)) |
304 | continue; |
305 | |
306 | if (ns->ana_state == NVME_ANA_OPTIMIZED) { |
307 | found = ns; |
308 | goto out; |
309 | } |
310 | if (ns->ana_state == NVME_ANA_NONOPTIMIZED) |
311 | found = ns; |
312 | } |
313 | |
314 | /* |
315 | * The loop above skips the current path for round-robin semantics. |
316 | * Fall back to the current path if either: |
317 | * - no other optimized path found and current is optimized, |
318 | * - no other usable path found and current is usable. |
319 | */ |
320 | if (!nvme_path_is_disabled(ns: old) && |
321 | (old->ana_state == NVME_ANA_OPTIMIZED || |
322 | (!found && old->ana_state == NVME_ANA_NONOPTIMIZED))) |
323 | return old; |
324 | |
325 | if (!found) |
326 | return NULL; |
327 | out: |
328 | rcu_assign_pointer(head->current_path[node], found); |
329 | return found; |
330 | } |
331 | |
332 | static inline bool nvme_path_is_optimized(struct nvme_ns *ns) |
333 | { |
334 | return ns->ctrl->state == NVME_CTRL_LIVE && |
335 | ns->ana_state == NVME_ANA_OPTIMIZED; |
336 | } |
337 | |
338 | inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) |
339 | { |
340 | int node = numa_node_id(); |
341 | struct nvme_ns *ns; |
342 | |
343 | ns = srcu_dereference(head->current_path[node], &head->srcu); |
344 | if (unlikely(!ns)) |
345 | return __nvme_find_path(head, node); |
346 | |
347 | if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR) |
348 | return nvme_round_robin_path(head, node, old: ns); |
349 | if (unlikely(!nvme_path_is_optimized(ns))) |
350 | return __nvme_find_path(head, node); |
351 | return ns; |
352 | } |
353 | |
354 | static bool nvme_available_path(struct nvme_ns_head *head) |
355 | { |
356 | struct nvme_ns *ns; |
357 | |
358 | list_for_each_entry_rcu(ns, &head->list, siblings) { |
359 | if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) |
360 | continue; |
361 | switch (ns->ctrl->state) { |
362 | case NVME_CTRL_LIVE: |
363 | case NVME_CTRL_RESETTING: |
364 | case NVME_CTRL_CONNECTING: |
365 | /* fallthru */ |
366 | return true; |
367 | default: |
368 | break; |
369 | } |
370 | } |
371 | return false; |
372 | } |
373 | |
374 | static void nvme_ns_head_submit_bio(struct bio *bio) |
375 | { |
376 | struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; |
377 | struct device *dev = disk_to_dev(head->disk); |
378 | struct nvme_ns *ns; |
379 | int srcu_idx; |
380 | |
381 | /* |
382 | * The namespace might be going away and the bio might be moved to a |
383 | * different queue via blk_steal_bios(), so we need to use the bio_split |
384 | * pool from the original queue to allocate the bvecs from. |
385 | */ |
386 | bio = bio_split_to_limits(bio); |
387 | if (!bio) |
388 | return; |
389 | |
390 | srcu_idx = srcu_read_lock(ssp: &head->srcu); |
391 | ns = nvme_find_path(head); |
392 | if (likely(ns)) { |
393 | bio_set_dev(bio, bdev: ns->disk->part0); |
394 | bio->bi_opf |= REQ_NVME_MPATH; |
395 | trace_block_bio_remap(bio, dev: disk_devt(disk: ns->head->disk), |
396 | from: bio->bi_iter.bi_sector); |
397 | submit_bio_noacct(bio); |
398 | } else if (nvme_available_path(head)) { |
399 | dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n" ); |
400 | |
401 | spin_lock_irq(lock: &head->requeue_lock); |
402 | bio_list_add(bl: &head->requeue_list, bio); |
403 | spin_unlock_irq(lock: &head->requeue_lock); |
404 | } else { |
405 | dev_warn_ratelimited(dev, "no available path - failing I/O\n" ); |
406 | |
407 | bio_io_error(bio); |
408 | } |
409 | |
410 | srcu_read_unlock(ssp: &head->srcu, idx: srcu_idx); |
411 | } |
412 | |
413 | static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode) |
414 | { |
415 | if (!nvme_tryget_ns_head(head: disk->private_data)) |
416 | return -ENXIO; |
417 | return 0; |
418 | } |
419 | |
420 | static void nvme_ns_head_release(struct gendisk *disk) |
421 | { |
422 | nvme_put_ns_head(head: disk->private_data); |
423 | } |
424 | |
#ifdef CONFIG_BLK_DEV_ZONED
/*
 * report_zones handler of the multipath node: forward the request to the
 * currently selected path.  Returns -EWOULDBLOCK when no path is available.
 */
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns_head *head = disk->private_data;
	int ret = -EWOULDBLOCK;
	struct nvme_ns *path;
	int idx;

	idx = srcu_read_lock(&head->srcu);
	path = nvme_find_path(head);
	if (path)
		ret = nvme_ns_report_zones(path, sector, nr_zones, cb, data);
	srcu_read_unlock(&head->srcu, idx);

	return ret;
}
#else
/* No zoned block device support configured: no report_zones handler. */
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */
443 | |
444 | const struct block_device_operations nvme_ns_head_ops = { |
445 | .owner = THIS_MODULE, |
446 | .submit_bio = nvme_ns_head_submit_bio, |
447 | .open = nvme_ns_head_open, |
448 | .release = nvme_ns_head_release, |
449 | .ioctl = nvme_ns_head_ioctl, |
450 | .compat_ioctl = blkdev_compat_ptr_ioctl, |
451 | .getgeo = nvme_getgeo, |
452 | .report_zones = nvme_ns_head_report_zones, |
453 | .pr_ops = &nvme_pr_ops, |
454 | }; |
455 | |
456 | static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev) |
457 | { |
458 | return container_of(cdev, struct nvme_ns_head, cdev); |
459 | } |
460 | |
461 | static int nvme_ns_head_chr_open(struct inode *inode, struct file *file) |
462 | { |
463 | if (!nvme_tryget_ns_head(head: cdev_to_ns_head(cdev: inode->i_cdev))) |
464 | return -ENXIO; |
465 | return 0; |
466 | } |
467 | |
468 | static int nvme_ns_head_chr_release(struct inode *inode, struct file *file) |
469 | { |
470 | nvme_put_ns_head(head: cdev_to_ns_head(cdev: inode->i_cdev)); |
471 | return 0; |
472 | } |
473 | |
474 | static const struct file_operations nvme_ns_head_chr_fops = { |
475 | .owner = THIS_MODULE, |
476 | .open = nvme_ns_head_chr_open, |
477 | .release = nvme_ns_head_chr_release, |
478 | .unlocked_ioctl = nvme_ns_head_chr_ioctl, |
479 | .compat_ioctl = compat_ptr_ioctl, |
480 | .uring_cmd = nvme_ns_head_chr_uring_cmd, |
481 | .uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll, |
482 | }; |
483 | |
484 | static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) |
485 | { |
486 | int ret; |
487 | |
488 | head->cdev_device.parent = &head->subsys->dev; |
489 | ret = dev_set_name(dev: &head->cdev_device, name: "ng%dn%d" , |
490 | head->subsys->instance, head->instance); |
491 | if (ret) |
492 | return ret; |
493 | ret = nvme_cdev_add(cdev: &head->cdev, cdev_device: &head->cdev_device, |
494 | fops: &nvme_ns_head_chr_fops, THIS_MODULE); |
495 | return ret; |
496 | } |
497 | |
498 | static void nvme_requeue_work(struct work_struct *work) |
499 | { |
500 | struct nvme_ns_head *head = |
501 | container_of(work, struct nvme_ns_head, requeue_work); |
502 | struct bio *bio, *next; |
503 | |
504 | spin_lock_irq(lock: &head->requeue_lock); |
505 | next = bio_list_get(bl: &head->requeue_list); |
506 | spin_unlock_irq(lock: &head->requeue_lock); |
507 | |
508 | while ((bio = next) != NULL) { |
509 | next = bio->bi_next; |
510 | bio->bi_next = NULL; |
511 | |
512 | submit_bio_noacct(bio); |
513 | } |
514 | } |
515 | |
516 | int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) |
517 | { |
518 | bool vwc = false; |
519 | |
520 | mutex_init(&head->lock); |
521 | bio_list_init(bl: &head->requeue_list); |
522 | spin_lock_init(&head->requeue_lock); |
523 | INIT_WORK(&head->requeue_work, nvme_requeue_work); |
524 | |
525 | /* |
526 | * Add a multipath node if the subsystems supports multiple controllers. |
527 | * We also do this for private namespaces as the namespace sharing flag |
528 | * could change after a rescan. |
529 | */ |
530 | if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || |
531 | !nvme_is_unique_nsid(ctrl, head) || !multipath) |
532 | return 0; |
533 | |
534 | head->disk = blk_alloc_disk(ctrl->numa_node); |
535 | if (!head->disk) |
536 | return -ENOMEM; |
537 | head->disk->fops = &nvme_ns_head_ops; |
538 | head->disk->private_data = head; |
539 | sprintf(buf: head->disk->disk_name, fmt: "nvme%dn%d" , |
540 | ctrl->subsys->instance, head->instance); |
541 | |
542 | blk_queue_flag_set(QUEUE_FLAG_NONROT, q: head->disk->queue); |
543 | blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q: head->disk->queue); |
544 | blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q: head->disk->queue); |
545 | /* |
546 | * This assumes all controllers that refer to a namespace either |
547 | * support poll queues or not. That is not a strict guarantee, |
548 | * but if the assumption is wrong the effect is only suboptimal |
549 | * performance but not correctness problem. |
550 | */ |
551 | if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL && |
552 | ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues) |
553 | blk_queue_flag_set(QUEUE_FLAG_POLL, q: head->disk->queue); |
554 | |
555 | /* set to a default value of 512 until the disk is validated */ |
556 | blk_queue_logical_block_size(head->disk->queue, 512); |
557 | blk_set_stacking_limits(lim: &head->disk->queue->limits); |
558 | blk_queue_dma_alignment(head->disk->queue, 3); |
559 | |
560 | /* we need to propagate up the VMC settings */ |
561 | if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) |
562 | vwc = true; |
563 | blk_queue_write_cache(q: head->disk->queue, enabled: vwc, fua: vwc); |
564 | return 0; |
565 | } |
566 | |
567 | static void nvme_mpath_set_live(struct nvme_ns *ns) |
568 | { |
569 | struct nvme_ns_head *head = ns->head; |
570 | int rc; |
571 | |
572 | if (!head->disk) |
573 | return; |
574 | |
575 | /* |
576 | * test_and_set_bit() is used because it is protecting against two nvme |
577 | * paths simultaneously calling device_add_disk() on the same namespace |
578 | * head. |
579 | */ |
580 | if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, addr: &head->flags)) { |
581 | rc = device_add_disk(parent: &head->subsys->dev, disk: head->disk, |
582 | groups: nvme_ns_id_attr_groups); |
583 | if (rc) { |
584 | clear_bit(NVME_NSHEAD_DISK_LIVE, addr: &ns->flags); |
585 | return; |
586 | } |
587 | nvme_add_ns_head_cdev(head); |
588 | } |
589 | |
590 | mutex_lock(&head->lock); |
591 | if (nvme_path_is_optimized(ns)) { |
592 | int node, srcu_idx; |
593 | |
594 | srcu_idx = srcu_read_lock(ssp: &head->srcu); |
595 | for_each_node(node) |
596 | __nvme_find_path(head, node); |
597 | srcu_read_unlock(ssp: &head->srcu, idx: srcu_idx); |
598 | } |
599 | mutex_unlock(lock: &head->lock); |
600 | |
601 | synchronize_srcu(ssp: &head->srcu); |
602 | kblockd_schedule_work(work: &head->requeue_work); |
603 | } |
604 | |
605 | static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, |
606 | int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *, |
607 | void *)) |
608 | { |
609 | void *base = ctrl->ana_log_buf; |
610 | size_t offset = sizeof(struct nvme_ana_rsp_hdr); |
611 | int error, i; |
612 | |
613 | lockdep_assert_held(&ctrl->ana_lock); |
614 | |
615 | for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) { |
616 | struct nvme_ana_group_desc *desc = base + offset; |
617 | u32 nr_nsids; |
618 | size_t nsid_buf_size; |
619 | |
620 | if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) |
621 | return -EINVAL; |
622 | |
623 | nr_nsids = le32_to_cpu(desc->nnsids); |
624 | nsid_buf_size = flex_array_size(desc, nsids, nr_nsids); |
625 | |
626 | if (WARN_ON_ONCE(desc->grpid == 0)) |
627 | return -EINVAL; |
628 | if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax)) |
629 | return -EINVAL; |
630 | if (WARN_ON_ONCE(desc->state == 0)) |
631 | return -EINVAL; |
632 | if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE)) |
633 | return -EINVAL; |
634 | |
635 | offset += sizeof(*desc); |
636 | if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size)) |
637 | return -EINVAL; |
638 | |
639 | error = cb(ctrl, desc, data); |
640 | if (error) |
641 | return error; |
642 | |
643 | offset += nsid_buf_size; |
644 | } |
645 | |
646 | return 0; |
647 | } |
648 | |
649 | static inline bool nvme_state_is_live(enum nvme_ana_state state) |
650 | { |
651 | return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED; |
652 | } |
653 | |
654 | static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, |
655 | struct nvme_ns *ns) |
656 | { |
657 | ns->ana_grpid = le32_to_cpu(desc->grpid); |
658 | ns->ana_state = desc->state; |
659 | clear_bit(NVME_NS_ANA_PENDING, addr: &ns->flags); |
660 | /* |
661 | * nvme_mpath_set_live() will trigger I/O to the multipath path device |
662 | * and in turn to this path device. However we cannot accept this I/O |
663 | * if the controller is not live. This may deadlock if called from |
664 | * nvme_mpath_init_identify() and the ctrl will never complete |
665 | * initialization, preventing I/O from completing. For this case we |
666 | * will reprocess the ANA log page in nvme_mpath_update() once the |
667 | * controller is ready. |
668 | */ |
669 | if (nvme_state_is_live(state: ns->ana_state) && |
670 | ns->ctrl->state == NVME_CTRL_LIVE) |
671 | nvme_mpath_set_live(ns); |
672 | } |
673 | |
674 | static int nvme_update_ana_state(struct nvme_ctrl *ctrl, |
675 | struct nvme_ana_group_desc *desc, void *data) |
676 | { |
677 | u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; |
678 | unsigned *nr_change_groups = data; |
679 | struct nvme_ns *ns; |
680 | |
681 | dev_dbg(ctrl->device, "ANA group %d: %s.\n" , |
682 | le32_to_cpu(desc->grpid), |
683 | nvme_ana_state_names[desc->state]); |
684 | |
685 | if (desc->state == NVME_ANA_CHANGE) |
686 | (*nr_change_groups)++; |
687 | |
688 | if (!nr_nsids) |
689 | return 0; |
690 | |
691 | down_read(sem: &ctrl->namespaces_rwsem); |
692 | list_for_each_entry(ns, &ctrl->namespaces, list) { |
693 | unsigned nsid; |
694 | again: |
695 | nsid = le32_to_cpu(desc->nsids[n]); |
696 | if (ns->head->ns_id < nsid) |
697 | continue; |
698 | if (ns->head->ns_id == nsid) |
699 | nvme_update_ns_ana_state(desc, ns); |
700 | if (++n == nr_nsids) |
701 | break; |
702 | if (ns->head->ns_id > nsid) |
703 | goto again; |
704 | } |
705 | up_read(sem: &ctrl->namespaces_rwsem); |
706 | return 0; |
707 | } |
708 | |
709 | static int nvme_read_ana_log(struct nvme_ctrl *ctrl) |
710 | { |
711 | u32 nr_change_groups = 0; |
712 | int error; |
713 | |
714 | mutex_lock(&ctrl->ana_lock); |
715 | error = nvme_get_log(ctrl, NVME_NSID_ALL, log_page: NVME_LOG_ANA, lsp: 0, csi: NVME_CSI_NVM, |
716 | log: ctrl->ana_log_buf, size: ctrl->ana_log_size, offset: 0); |
717 | if (error) { |
718 | dev_warn(ctrl->device, "Failed to get ANA log: %d\n" , error); |
719 | goto out_unlock; |
720 | } |
721 | |
722 | error = nvme_parse_ana_log(ctrl, data: &nr_change_groups, |
723 | cb: nvme_update_ana_state); |
724 | if (error) |
725 | goto out_unlock; |
726 | |
727 | /* |
728 | * In theory we should have an ANATT timer per group as they might enter |
729 | * the change state at different times. But that is a lot of overhead |
730 | * just to protect against a target that keeps entering new changes |
731 | * states while never finishing previous ones. But we'll still |
732 | * eventually time out once all groups are in change state, so this |
733 | * isn't a big deal. |
734 | * |
735 | * We also double the ANATT value to provide some slack for transports |
736 | * or AEN processing overhead. |
737 | */ |
738 | if (nr_change_groups) |
739 | mod_timer(timer: &ctrl->anatt_timer, expires: ctrl->anatt * HZ * 2 + jiffies); |
740 | else |
741 | del_timer_sync(timer: &ctrl->anatt_timer); |
742 | out_unlock: |
743 | mutex_unlock(lock: &ctrl->ana_lock); |
744 | return error; |
745 | } |
746 | |
747 | static void nvme_ana_work(struct work_struct *work) |
748 | { |
749 | struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); |
750 | |
751 | if (ctrl->state != NVME_CTRL_LIVE) |
752 | return; |
753 | |
754 | nvme_read_ana_log(ctrl); |
755 | } |
756 | |
757 | void nvme_mpath_update(struct nvme_ctrl *ctrl) |
758 | { |
759 | u32 nr_change_groups = 0; |
760 | |
761 | if (!ctrl->ana_log_buf) |
762 | return; |
763 | |
764 | mutex_lock(&ctrl->ana_lock); |
765 | nvme_parse_ana_log(ctrl, data: &nr_change_groups, cb: nvme_update_ana_state); |
766 | mutex_unlock(lock: &ctrl->ana_lock); |
767 | } |
768 | |
769 | static void nvme_anatt_timeout(struct timer_list *t) |
770 | { |
771 | struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer); |
772 | |
773 | dev_info(ctrl->device, "ANATT timeout, resetting controller.\n" ); |
774 | nvme_reset_ctrl(ctrl); |
775 | } |
776 | |
777 | void nvme_mpath_stop(struct nvme_ctrl *ctrl) |
778 | { |
779 | if (!nvme_ctrl_use_ana(ctrl)) |
780 | return; |
781 | del_timer_sync(timer: &ctrl->anatt_timer); |
782 | cancel_work_sync(work: &ctrl->ana_work); |
783 | } |
784 | |
/*
 * Define a device attribute named subsys_attr_<_name> with the given mode,
 * show and store handlers.
 */
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)	\
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)
788 | |
789 | static ssize_t nvme_subsys_iopolicy_show(struct device *dev, |
790 | struct device_attribute *attr, char *buf) |
791 | { |
792 | struct nvme_subsystem *subsys = |
793 | container_of(dev, struct nvme_subsystem, dev); |
794 | |
795 | return sysfs_emit(buf, fmt: "%s\n" , |
796 | nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); |
797 | } |
798 | |
799 | static ssize_t nvme_subsys_iopolicy_store(struct device *dev, |
800 | struct device_attribute *attr, const char *buf, size_t count) |
801 | { |
802 | struct nvme_subsystem *subsys = |
803 | container_of(dev, struct nvme_subsystem, dev); |
804 | int i; |
805 | |
806 | for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) { |
807 | if (sysfs_streq(s1: buf, s2: nvme_iopolicy_names[i])) { |
808 | WRITE_ONCE(subsys->iopolicy, i); |
809 | return count; |
810 | } |
811 | } |
812 | |
813 | return -EINVAL; |
814 | } |
815 | SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR, |
816 | nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store); |
817 | |
818 | static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, |
819 | char *buf) |
820 | { |
821 | return sysfs_emit(buf, fmt: "%d\n" , nvme_get_ns_from_dev(dev)->ana_grpid); |
822 | } |
823 | DEVICE_ATTR_RO(ana_grpid); |
824 | |
825 | static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, |
826 | char *buf) |
827 | { |
828 | struct nvme_ns *ns = nvme_get_ns_from_dev(dev); |
829 | |
830 | return sysfs_emit(buf, fmt: "%s\n" , nvme_ana_state_names[ns->ana_state]); |
831 | } |
832 | DEVICE_ATTR_RO(ana_state); |
833 | |
834 | static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, |
835 | struct nvme_ana_group_desc *desc, void *data) |
836 | { |
837 | struct nvme_ana_group_desc *dst = data; |
838 | |
839 | if (desc->grpid != dst->grpid) |
840 | return 0; |
841 | |
842 | *dst = *desc; |
843 | return -ENXIO; /* just break out of the loop */ |
844 | } |
845 | |
846 | void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) |
847 | { |
848 | if (nvme_ctrl_use_ana(ctrl: ns->ctrl)) { |
849 | struct nvme_ana_group_desc desc = { |
850 | .grpid = anagrpid, |
851 | .state = 0, |
852 | }; |
853 | |
854 | mutex_lock(&ns->ctrl->ana_lock); |
855 | ns->ana_grpid = le32_to_cpu(anagrpid); |
856 | nvme_parse_ana_log(ctrl: ns->ctrl, data: &desc, cb: nvme_lookup_ana_group_desc); |
857 | mutex_unlock(lock: &ns->ctrl->ana_lock); |
858 | if (desc.state) { |
859 | /* found the group desc: update */ |
860 | nvme_update_ns_ana_state(desc: &desc, ns); |
861 | } else { |
862 | /* group desc not found: trigger a re-read */ |
863 | set_bit(NVME_NS_ANA_PENDING, addr: &ns->flags); |
864 | queue_work(wq: nvme_wq, work: &ns->ctrl->ana_work); |
865 | } |
866 | } else { |
867 | ns->ana_state = NVME_ANA_OPTIMIZED; |
868 | nvme_mpath_set_live(ns); |
869 | } |
870 | |
871 | if (blk_queue_stable_writes(ns->queue) && ns->head->disk) |
872 | blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, |
873 | q: ns->head->disk->queue); |
874 | #ifdef CONFIG_BLK_DEV_ZONED |
875 | if (blk_queue_is_zoned(q: ns->queue) && ns->head->disk) |
876 | ns->head->disk->nr_zones = ns->disk->nr_zones; |
877 | #endif |
878 | } |
879 | |
880 | void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) |
881 | { |
882 | if (!head->disk) |
883 | return; |
884 | kblockd_schedule_work(work: &head->requeue_work); |
885 | if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { |
886 | nvme_cdev_del(cdev: &head->cdev, cdev_device: &head->cdev_device); |
887 | del_gendisk(gp: head->disk); |
888 | } |
889 | } |
890 | |
891 | void nvme_mpath_remove_disk(struct nvme_ns_head *head) |
892 | { |
893 | if (!head->disk) |
894 | return; |
895 | /* make sure all pending bios are cleaned up */ |
896 | kblockd_schedule_work(work: &head->requeue_work); |
897 | flush_work(work: &head->requeue_work); |
898 | put_disk(disk: head->disk); |
899 | } |
900 | |
901 | void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl) |
902 | { |
903 | mutex_init(&ctrl->ana_lock); |
904 | timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); |
905 | INIT_WORK(&ctrl->ana_work, nvme_ana_work); |
906 | } |
907 | |
908 | int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) |
909 | { |
910 | size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT; |
911 | size_t ana_log_size; |
912 | int error = 0; |
913 | |
914 | /* check if multipath is enabled and we have the capability */ |
915 | if (!multipath || !ctrl->subsys || |
916 | !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA)) |
917 | return 0; |
918 | |
919 | if (!ctrl->max_namespaces || |
920 | ctrl->max_namespaces > le32_to_cpu(id->nn)) { |
921 | dev_err(ctrl->device, |
922 | "Invalid MNAN value %u\n" , ctrl->max_namespaces); |
923 | return -EINVAL; |
924 | } |
925 | |
926 | ctrl->anacap = id->anacap; |
927 | ctrl->anatt = id->anatt; |
928 | ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); |
929 | ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); |
930 | |
931 | ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + |
932 | ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) + |
933 | ctrl->max_namespaces * sizeof(__le32); |
934 | if (ana_log_size > max_transfer_size) { |
935 | dev_err(ctrl->device, |
936 | "ANA log page size (%zd) larger than MDTS (%zd).\n" , |
937 | ana_log_size, max_transfer_size); |
938 | dev_err(ctrl->device, "disabling ANA support.\n" ); |
939 | goto out_uninit; |
940 | } |
941 | if (ana_log_size > ctrl->ana_log_size) { |
942 | nvme_mpath_stop(ctrl); |
943 | nvme_mpath_uninit(ctrl); |
944 | ctrl->ana_log_buf = kvmalloc(size: ana_log_size, GFP_KERNEL); |
945 | if (!ctrl->ana_log_buf) |
946 | return -ENOMEM; |
947 | } |
948 | ctrl->ana_log_size = ana_log_size; |
949 | error = nvme_read_ana_log(ctrl); |
950 | if (error) |
951 | goto out_uninit; |
952 | return 0; |
953 | |
954 | out_uninit: |
955 | nvme_mpath_uninit(ctrl); |
956 | return error; |
957 | } |
958 | |
959 | void nvme_mpath_uninit(struct nvme_ctrl *ctrl) |
960 | { |
961 | kvfree(addr: ctrl->ana_log_buf); |
962 | ctrl->ana_log_buf = NULL; |
963 | ctrl->ana_log_size = 0; |
964 | } |
965 | |