#include <linux/moduleparam.h>
#include <trace/events/block.h>
#include "nvme.h"

static bool multipath = false;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

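/*
 * CMIC bit 3 in the Identify Controller data indicates that the subsystem
 * supports Asymmetric Namespace Access (ANA) reporting.
 */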
inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
{
	return ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
}

/*
 * If multipathing is enabled we need to always use the subsystem instance
 * number for numbering our devices to avoid conflicts between subsystems that
 * have multiple controllers and thus use the multipath-aware subsystem node
 * and those that have a single controller and use the controller node
 * directly.
 */
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
			struct nvme_ctrl *ctrl, int *flags)
{
	if (!multipath) {
		sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
	} else if (ns->head->disk) {
		sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
			ctrl->instance, ns->head->instance);
		*flags = GENHD_FL_HIDDEN;
	} else {
		sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
			ns->head->instance);
	}
}

static bool nvme_ana_error(u16 status)
{
	switch (status & 0x7ff) {
	case NVME_SC_ANA_TRANSITION:
	case NVME_SC_ANA_INACCESSIBLE:
	case NVME_SC_ANA_PERSISTENT_LOSS:
		return true;
	}
	return false;
}

static void __nvme_update_ana(struct nvme_ns *ns)
{
	if (!ns->ctrl->ana_log_buf)
		return;

	set_bit(NVME_NS_ANA_PENDING, &ns->flags);
	queue_work(nvme_wq, &ns->ctrl->ana_work);
}

void nvme_update_ana(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status;

	if (nvme_ana_error(status))
		__nvme_update_ana(ns);
}

void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status;
	unsigned long flags;

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
	blk_mq_end_request(req, 0);

	if (nvme_ana_error(status)) {
		/*
		 * An ANA error tells us the controller is alive but not ready
		 * to serve this namespace.  Updating the general controller
		 * state here would be racy because the admin and I/O queues
		 * are not serialized, so just clear the current path, mark it
		 * pending and kick off a re-read of the ANA log page ASAP.
		 */
		nvme_mpath_clear_current_path(ns);
		__nvme_update_ana(ns);
		goto kick_requeue;
	}

	switch (status & 0x7ff) {
	case NVME_SC_HOST_PATH_ERROR:
		/*
		 * Temporary transport disruption in talking to the
		 * controller.  Try to send on a new path.
		 */
		nvme_mpath_clear_current_path(ns);
		break;
	default:
		/*
		 * Reset the controller for any non-ANA error as we don't know
		 * what caused the error.
		 */
		nvme_reset_ctrl(ns->ctrl);
		break;
	}

kick_requeue:
	kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->head->disk)
			kblockd_schedule_work(&ns->head->requeue_work);
	}
	up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
	[0] = "invalid state",
	[NVME_ANA_OPTIMIZED] = "optimized",
	[NVME_ANA_NONOPTIMIZED] = "non-optimized",
	[NVME_ANA_INACCESSIBLE] = "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS] = "persistent-loss",
	[NVME_ANA_CHANGE] = "change",
};

void nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	int node;

	if (!head)
		return;

	for_each_node(node) {
		if (ns == rcu_access_pointer(head->current_path[node]))
			rcu_assign_pointer(head->current_path[node], NULL);
	}
}

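/*
 * Pick the best path for @node: prefer a live ANA-optimized path and fall
 * back to a non-optimized one.  With the NUMA I/O policy the path closest to
 * @node wins; otherwise all paths are treated as equal distance.  The result
 * is cached in head->current_path[node].  Callers must be inside an SRCU
 * read-side critical section on head->srcu.
 */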
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (ns->ctrl->state != NVME_CTRL_LIVE ||
		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
			continue;

		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
		struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
			siblings);
	if (ns)
		return ns;
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

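/*
 * Round-robin policy: starting from the namespace after the previously used
 * path, pick the first live ANA-optimized path, remembering a live
 * non-optimized path as fallback.  The selection is cached in
 * head->current_path[node].
 */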
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
		int node, struct nvme_ns *old)
{
	struct nvme_ns *ns, *found, *fallback = NULL;

	if (list_is_singular(&head->list))
		return old;

	for (ns = nvme_next_ns(head, old);
	     ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (ns->ctrl->state != NVME_CTRL_LIVE ||
		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			fallback = ns;
	}

	if (!fallback)
		return NULL;
	found = fallback;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}
233
234static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
235{
236 return ns->ctrl->state == NVME_CTRL_LIVE &&
237 ns->ana_state == NVME_ANA_OPTIMIZED;
238}
239
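/*
 * Return the path to use for this CPU's NUMA node.  The caller must hold an
 * SRCU read lock on head->srcu for the returned namespace to remain valid.
 */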
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
		ns = nvme_round_robin_path(head, node, ns);
	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
		ns = __nvme_find_path(head, node);
	return ns;
}

static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
		struct bio *bio)
{
	struct nvme_ns_head *head = q->queuedata;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	blk_qc_t ret = BLK_QC_T_NONE;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the
	 * bio_split pool from the original queue to allocate the bvecs and
	 * bios.
	 */
	blk_queue_split(q, &bio);

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio->bi_disk = ns->disk;
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio->bi_disk->queue, bio,
				      disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		ret = direct_make_request(bio);
	} else if (!list_empty_careful(&head->list)) {
		dev_warn_ratelimited(dev, "no path available - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no path - failing I/O\n");

		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}

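/*
 * Resubmit bios that were queued up while no path was available.  Scheduled
 * via head->requeue_work whenever a path may have become usable again.
 */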
static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		/*
		 * Reset the disk to the multipath node and resubmit so that a
		 * new path is selected.
		 */
		bio->bi_disk = head->disk;
		generic_make_request(bio);
	}
}

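/*
 * Allocate the shared multipath request queue and gendisk for a namespace
 * head.  Only done when the subsystem can have multiple controllers and
 * native multipathing is enabled.
 */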
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	struct request_queue *q;
	bool vwc = false;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);

	/*
	 * Only add a multipath node if the subsystem reports support for
	 * multiple controllers (CMIC bit 1) and native multipathing has not
	 * been disabled via the module parameter.
	 */
	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
		return 0;

	q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node);
	if (!q)
		goto out;
	q->queuedata = head;
	blk_queue_make_request(q, nvme_ns_head_make_request);
	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);

	/* use a default 512 byte block size until the disk is validated */
	blk_queue_logical_block_size(q, 512);
	blk_set_stacking_limits(&q->limits);

	/* propagate the volatile write cache setting up the stack */
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(q, vwc, vwc);

	head->disk = alloc_disk(0);
	if (!head->disk)
		goto out_cleanup_queue;
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;
	head->disk->queue = q;
	head->disk->flags = GENHD_FL_EXT_DEVT;
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);
	return 0;

out_cleanup_queue:
	blk_cleanup_queue(q);
out:
	return -ENOMEM;
}

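/*
 * Called when a path becomes live: register the multipath gendisk on first
 * use, pre-populate the per-node current_path cache for optimized paths, and
 * kick the requeue work to flush any bios waiting for a path.
 */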
static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;

	lockdep_assert_held(&ns->head->lock);

	if (!head->disk)
		return;

	if (!(head->disk->flags & GENHD_FL_UP))
		device_add_disk(&head->subsys->dev, head->disk,
				nvme_ns_id_attr_groups);

	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}

	kblockd_schedule_work(&ns->head->requeue_work);
}

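/*
 * Walk all group descriptors in the ANA log buffer and call @cb for each one,
 * passing @data through.  Bails out with -EINVAL if a descriptor has an
 * invalid group ID or state, or if it would run past the end of the buffer.
 */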
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
				void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids = le32_to_cpu(desc->nnsids);
		size_t nsid_buf_size = nr_nsids * sizeof(__le32);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;
	}

	return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	mutex_lock(&ns->head->lock);
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);

	if (nvme_state_is_live(ns->ana_state))
		nvme_mpath_set_live(ns);
	mutex_unlock(&ns->head->lock);
}

static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	down_write(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
			continue;
		nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
	}
	up_write(&ctrl->namespaces_rwsem);
	WARN_ON_ONCE(n < nr_nsids);
	return 0;
}

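/*
 * Fetch the ANA log page and update the namespace ANA states via
 * nvme_update_ana_state().  When @groups_only is set the log is requested
 * with the RGO flag, so only group descriptors (no namespace ID lists) are
 * returned.  If any group is still transitioning, the ANATT timer is armed.
 */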
static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA,
			groups_only ? NVME_ANA_LOG_RGO : 0,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
			nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might
	 * enter the change state at different times.  But that is a lot of
	 * overhead just to protect against a target that keeps entering new
	 * change states while never finishing previous ones.  As we will
	 * still eventually time out once all groups are in the change state,
	 * this isn't a big deal.
	 *
	 * We also double the ANATT value to provide some slack for
	 * transports or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		del_timer_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	nvme_read_ana_log(ctrl, false);
}

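/*
 * The ANATT timer fires if a group stays in the change state for longer than
 * twice the controller's advertised ANA transition time; reset the controller
 * to recover.
 */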
static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	del_timer_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

#define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \
	struct device_attribute subsys_attr_##_name = \
		__ATTR(_name, _mode, _show, _store)

static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA] = "numa",
	[NVME_IOPOLICY_RR] = "round-robin",
};

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sprintf(buf, "%s\n",
			nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);
	int i;

	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
			WRITE_ONCE(subsys->iopolicy, i);
			return count;
		}
	}

	return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
	       nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ns *ns = data;

	if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
		nvme_update_ns_ana_state(desc, ns);
		return -ENXIO; /* just break out of the loop */
	}

	return 0;
}

void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(id->anagrpid);
		nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
		mutex_unlock(&ns->ctrl->ana_lock);
	} else {
		mutex_lock(&ns->head->lock);
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
		mutex_unlock(&ns->head->lock);
	}
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	if (head->disk->flags & GENHD_FL_UP)
		del_gendisk(head->disk);
	blk_set_queue_dying(head->disk->queue);
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	blk_cleanup_queue(head->disk->queue);
	put_disk(head->disk);
}

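/*
 * Set up ANA support for a controller: record the ANA fields from Identify
 * Controller, size and allocate the ANA log buffer, and do an initial
 * groups-only read to validate it.  ANA is silently disabled if the log page
 * would not fit within MDTS.
 */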
int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	int error;

	if (!nvme_ctrl_use_ana(ctrl))
		return 0;

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
	ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);

	if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%d).\n",
			ctrl->ana_log_size,
			ctrl->max_hw_sectors << SECTOR_SHIFT);
		dev_err(ctrl->device, "disabling ANA support.\n");
		return 0;
	}

	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
	ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
	if (!ctrl->ana_log_buf) {
		error = -ENOMEM;
		goto out;
	}

	error = nvme_read_ana_log(ctrl, true);
	if (error)
		goto out_free_ana_log_buf;
	return 0;
out_free_ana_log_buf:
	kfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
out:
	return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
}