linux/drivers/nvme/host/multipath.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */

#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <trace/events/block.h>
#include "nvme.h"

static bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

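/*
 * Queue freeze/unfreeze helpers for the multipath nodes of a subsystem,
 * used to quiesce I/O across all paths.  All of them require subsys->lock
 * to be held, as asserted below.
 */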
void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
        struct nvme_ns_head *h;

        lockdep_assert_held(&subsys->lock);
        list_for_each_entry(h, &subsys->nsheads, entry)
                if (h->disk)
                        blk_mq_unfreeze_queue(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
        struct nvme_ns_head *h;

        lockdep_assert_held(&subsys->lock);
        list_for_each_entry(h, &subsys->nsheads, entry)
                if (h->disk)
                        blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
        struct nvme_ns_head *h;

        lockdep_assert_held(&subsys->lock);
        list_for_each_entry(h, &subsys->nsheads, entry)
                if (h->disk)
                        blk_freeze_queue_start(h->disk->queue);
}

/*
 * If multipathing is enabled we need to always use the subsystem instance
 * number for numbering our devices to avoid conflicts between subsystems that
 * have multiple controllers and thus use the multipath-aware subsystem node
 * and those that have a single controller and use the controller node
 * directly.
 */
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
                        struct nvme_ctrl *ctrl, int *flags)
{
        if (!multipath) {
                sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
        } else if (ns->head->disk) {
                sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
                                ctrl->instance, ns->head->instance);
                *flags = GENHD_FL_HIDDEN;
        } else {
                sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
                                ns->head->instance);
        }
}

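/*
 * Fail over a request: clear the cached current path for the namespace,
 * kick off an ANA log re-read for ANA errors, move the bios over to the
 * ns_head requeue list and complete the original request, then schedule
 * the requeue work to resubmit the bios on another path.
 */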
void nvme_failover_req(struct request *req)
{
        struct nvme_ns *ns = req->q->queuedata;
        u16 status = nvme_req(req)->status & 0x7ff;
        unsigned long flags;

        nvme_mpath_clear_current_path(ns);

        /*
         * If we got back an ANA error, we know the controller is alive but not
         * ready to serve this namespace.  Kick off a re-read of the ANA
         * information page, and just try any other available path for now.
         */
        if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
                set_bit(NVME_NS_ANA_PENDING, &ns->flags);
                queue_work(nvme_wq, &ns->ctrl->ana_work);
        }

        spin_lock_irqsave(&ns->head->requeue_lock, flags);
        blk_steal_bios(&ns->head->requeue_list, req);
        spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

        blk_mq_end_request(req, 0);
        kblockd_schedule_work(&ns->head->requeue_work);
}

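/*
 * Schedule the requeue work for every multipath node of this controller,
 * e.g. once a reconnect made the paths usable again.
 */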
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
        struct nvme_ns *ns;

        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list) {
                if (ns->head->disk)
                        kblockd_schedule_work(&ns->head->requeue_work);
        }
        up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
        [0]                             = "invalid state",
        [NVME_ANA_OPTIMIZED]            = "optimized",
        [NVME_ANA_NONOPTIMIZED]         = "non-optimized",
        [NVME_ANA_INACCESSIBLE]         = "inaccessible",
        [NVME_ANA_PERSISTENT_LOSS]      = "persistent-loss",
        [NVME_ANA_CHANGE]               = "change",
};

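/*
 * Drop any cached per-node path that points to @ns.  Returns true if at
 * least one cached path was cleared and the requeue work should be kicked.
 */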
bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
        struct nvme_ns_head *head = ns->head;
        bool changed = false;
        int node;

        if (!head)
                goto out;

        for_each_node(node) {
                if (ns == rcu_access_pointer(head->current_path[node])) {
                        rcu_assign_pointer(head->current_path[node], NULL);
                        changed = true;
                }
        }
out:
        return changed;
}

void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
        struct nvme_ns *ns;

        mutex_lock(&ctrl->scan_lock);
        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list)
                if (nvme_mpath_clear_current_path(ns))
                        kblockd_schedule_work(&ns->head->requeue_work);
        up_read(&ctrl->namespaces_rwsem);
        mutex_unlock(&ctrl->scan_lock);
}

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
        /*
         * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
         * still be able to complete assuming that the controller is connected.
         * Otherwise it will fail immediately and return to the requeue list.
         */
        if (ns->ctrl->state != NVME_CTRL_LIVE &&
            ns->ctrl->state != NVME_CTRL_DELETING)
                return true;
        if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
            test_bit(NVME_NS_REMOVING, &ns->flags))
                return true;
        return false;
}

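/*
 * (Re)calculate and cache the path for @node: prefer the ANA-optimized
 * path with the smallest NUMA distance (all distances are equal unless
 * the "numa" iopolicy is selected), and fall back to the closest
 * non-optimized path if no optimized one is usable.
 */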
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
        int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
        struct nvme_ns *found = NULL, *fallback = NULL, *ns;

        list_for_each_entry_rcu(ns, &head->list, siblings) {
                if (nvme_path_is_disabled(ns))
                        continue;

                if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
                        distance = node_distance(node, ns->ctrl->numa_node);
                else
                        distance = LOCAL_DISTANCE;

                switch (ns->ana_state) {
                case NVME_ANA_OPTIMIZED:
                        if (distance < found_distance) {
                                found_distance = distance;
                                found = ns;
                        }
                        break;
                case NVME_ANA_NONOPTIMIZED:
                        if (distance < fallback_distance) {
                                fallback_distance = distance;
                                fallback = ns;
                        }
                        break;
                default:
                        break;
                }
        }

        if (!found)
                found = fallback;
        if (found)
                rcu_assign_pointer(head->current_path[node], found);
        return found;
}

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
                struct nvme_ns *ns)
{
        ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
                        siblings);
        if (ns)
                return ns;
        return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

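/*
 * Round-robin selection: walk the sibling list starting after @old and
 * return the first usable optimized path.  Failing that, fall back to a
 * usable non-optimized path or, per the comment below, to @old itself.
 */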
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
                int node, struct nvme_ns *old)
{
        struct nvme_ns *ns, *found = NULL;

        if (list_is_singular(&head->list)) {
                if (nvme_path_is_disabled(old))
                        return NULL;
                return old;
        }

        for (ns = nvme_next_ns(head, old);
             ns && ns != old;
             ns = nvme_next_ns(head, ns)) {
                if (nvme_path_is_disabled(ns))
                        continue;

                if (ns->ana_state == NVME_ANA_OPTIMIZED) {
                        found = ns;
                        goto out;
                }
                if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
                        found = ns;
        }

        /*
         * The loop above skips the current path for round-robin semantics.
         * Fall back to the current path if either:
         *  - no other optimized path found and current is optimized,
         *  - no other usable path found and current is usable.
         */
        if (!nvme_path_is_disabled(old) &&
            (old->ana_state == NVME_ANA_OPTIMIZED ||
             (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
                return old;

        if (!found)
                return NULL;
out:
        rcu_assign_pointer(head->current_path[node], found);
        return found;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
        return ns->ctrl->state == NVME_CTRL_LIVE &&
                ns->ana_state == NVME_ANA_OPTIMIZED;
}

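/*
 * Look up the path for a bio: use the cached path for this node as long
 * as it is still optimized, otherwise do a full (or round-robin) search.
 * The caller must hold head->srcu.
 */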
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
        int node = numa_node_id();
        struct nvme_ns *ns;

        ns = srcu_dereference(head->current_path[node], &head->srcu);
        if (unlikely(!ns))
                return __nvme_find_path(head, node);

        if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
                return nvme_round_robin_path(head, node, ns);
        if (unlikely(!nvme_path_is_optimized(ns)))
                return __nvme_find_path(head, node);
        return ns;
}

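/*
 * Check whether any path could eventually become usable again: as long as
 * one controller is live, resetting or reconnecting (and has not exceeded
 * its failfast limit), requeueing bios is preferable to failing them.
 */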
static bool nvme_available_path(struct nvme_ns_head *head)
{
        struct nvme_ns *ns;

        list_for_each_entry_rcu(ns, &head->list, siblings) {
                if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
                        continue;
                switch (ns->ctrl->state) {
                case NVME_CTRL_LIVE:
                case NVME_CTRL_RESETTING:
                case NVME_CTRL_CONNECTING:
                        return true;
                default:
                        break;
                }
        }
        return false;
}

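/*
 * submit_bio entry point for the multipath node: remap the bio to the
 * selected path, requeue it if the paths are only temporarily unavailable,
 * or fail it if no path can serve it at all.
 */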
blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
{
        struct nvme_ns_head *head = bio->bi_disk->private_data;
        struct device *dev = disk_to_dev(head->disk);
        struct nvme_ns *ns;
        blk_qc_t ret = BLK_QC_T_NONE;
        int srcu_idx;

        /*
         * The namespace might be going away and the bio might be moved to a
         * different queue via blk_steal_bios(), so we need to use the bio_split
         * pool from the original queue to allocate the bvecs from.
         */
        blk_queue_split(&bio);

        srcu_idx = srcu_read_lock(&head->srcu);
        ns = nvme_find_path(head);
        if (likely(ns)) {
                bio->bi_disk = ns->disk;
                bio->bi_opf |= REQ_NVME_MPATH;
                trace_block_bio_remap(bio, disk_devt(ns->head->disk),
                                      bio->bi_iter.bi_sector);
                ret = submit_bio_noacct(bio);
        } else if (nvme_available_path(head)) {
                dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

                spin_lock_irq(&head->requeue_lock);
                bio_list_add(&head->requeue_list, bio);
                spin_unlock_irq(&head->requeue_lock);
        } else {
                dev_warn_ratelimited(dev, "no available path - failing I/O\n");

                bio->bi_status = BLK_STS_IOERR;
                bio_endio(bio);
        }

        srcu_read_unlock(&head->srcu, srcu_idx);
        return ret;
}

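/*
 * Resubmit all previously requeued bios through the multipath node so
 * that path selection runs again for each of them.
 */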
static void nvme_requeue_work(struct work_struct *work)
{
        struct nvme_ns_head *head =
                container_of(work, struct nvme_ns_head, requeue_work);
        struct bio *bio, *next;

        spin_lock_irq(&head->requeue_lock);
        next = bio_list_get(&head->requeue_list);
        spin_unlock_irq(&head->requeue_lock);

        while ((bio = next) != NULL) {
                next = bio->bi_next;
                bio->bi_next = NULL;

                /*
                 * Reset disk to the mpath node and resubmit to select a new
                 * path.
                 */
                bio->bi_disk = head->disk;
                submit_bio_noacct(bio);
        }
}

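/*
 * Set up the requeue infrastructure and, if the subsystem supports
 * multiple controllers, allocate the request queue and gendisk for the
 * subsystem-level multipath node.
 */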
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
        struct request_queue *q;
        bool vwc = false;

        mutex_init(&head->lock);
        bio_list_init(&head->requeue_list);
        spin_lock_init(&head->requeue_lock);
        INIT_WORK(&head->requeue_work, nvme_requeue_work);

        /*
         * Add a multipath node if the subsystem supports multiple controllers.
         * We also do this for private namespaces as the namespace sharing data
         * could change after a rescan.
         */
        if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath)
                return 0;

        q = blk_alloc_queue(ctrl->numa_node);
        if (!q)
                goto out;
        blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
        /* set to a default value of 512 until the disk is validated */
        blk_queue_logical_block_size(q, 512);
        blk_set_stacking_limits(&q->limits);

        /* we need to propagate up the VWC settings */
        if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
                vwc = true;
        blk_queue_write_cache(q, vwc, vwc);

        head->disk = alloc_disk(0);
        if (!head->disk)
                goto out_cleanup_queue;
        head->disk->fops = &nvme_ns_head_ops;
        head->disk->private_data = head;
        head->disk->queue = q;
        head->disk->flags = GENHD_FL_EXT_DEVT;
        sprintf(head->disk->disk_name, "nvme%dn%d",
                        ctrl->subsys->instance, head->instance);
        return 0;

out_cleanup_queue:
        blk_cleanup_queue(q);
out:
        return -ENOMEM;
}

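/*
 * A live path to the namespace showed up: register the multipath gendisk
 * on first use, pre-populate the per-node path cache if this path is
 * optimized, and restart any requeued I/O.
 */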
static void nvme_mpath_set_live(struct nvme_ns *ns)
{
        struct nvme_ns_head *head = ns->head;

        if (!head->disk)
                return;

        if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
                device_add_disk(&head->subsys->dev, head->disk,
                                nvme_ns_id_attr_groups);

        mutex_lock(&head->lock);
        if (nvme_path_is_optimized(ns)) {
                int node, srcu_idx;

                srcu_idx = srcu_read_lock(&head->srcu);
                for_each_node(node)
                        __nvme_find_path(head, node);
                srcu_read_unlock(&head->srcu, srcu_idx);
        }
        mutex_unlock(&head->lock);

        synchronize_srcu(&head->srcu);
        kblockd_schedule_work(&head->requeue_work);
}

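/*
 * Walk all ANA group descriptors in the log buffer, validating offsets and
 * field values as we go, and call @cb on each descriptor.  A non-zero
 * return value from @cb terminates the walk and is passed back up.
 */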
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
                int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
                        void *))
{
        void *base = ctrl->ana_log_buf;
        size_t offset = sizeof(struct nvme_ana_rsp_hdr);
        int error, i;

        lockdep_assert_held(&ctrl->ana_lock);

        for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
                struct nvme_ana_group_desc *desc = base + offset;
                u32 nr_nsids;
                size_t nsid_buf_size;

                if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
                        return -EINVAL;

                nr_nsids = le32_to_cpu(desc->nnsids);
                nsid_buf_size = nr_nsids * sizeof(__le32);

                if (WARN_ON_ONCE(desc->grpid == 0))
                        return -EINVAL;
                if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
                        return -EINVAL;
                if (WARN_ON_ONCE(desc->state == 0))
                        return -EINVAL;
                if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
                        return -EINVAL;

                offset += sizeof(*desc);
                if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
                        return -EINVAL;

                error = cb(ctrl, desc, data);
                if (error)
                        return error;

                offset += nsid_buf_size;
        }

        return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
        return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
                struct nvme_ns *ns)
{
        ns->ana_grpid = le32_to_cpu(desc->grpid);
        ns->ana_state = desc->state;
        clear_bit(NVME_NS_ANA_PENDING, &ns->flags);

        if (nvme_state_is_live(ns->ana_state))
                nvme_mpath_set_live(ns);
}

static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
                struct nvme_ana_group_desc *desc, void *data)
{
        u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
        unsigned *nr_change_groups = data;
        struct nvme_ns *ns;

        dev_dbg(ctrl->device, "ANA group %d: %s.\n",
                        le32_to_cpu(desc->grpid),
                        nvme_ana_state_names[desc->state]);

        if (desc->state == NVME_ANA_CHANGE)
                (*nr_change_groups)++;

        if (!nr_nsids)
                return 0;

        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list) {
                unsigned nsid = le32_to_cpu(desc->nsids[n]);

                if (ns->head->ns_id < nsid)
                        continue;
                if (ns->head->ns_id == nsid)
                        nvme_update_ns_ana_state(desc, ns);
                if (++n == nr_nsids)
                        break;
        }
        up_read(&ctrl->namespaces_rwsem);
        return 0;
}

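/*
 * Read the ANA log page from the controller and apply it to all
 * namespaces, (re)arming the ANATT timer while any group remains in the
 * change state.
 */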
static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
        u32 nr_change_groups = 0;
        int error;

        mutex_lock(&ctrl->ana_lock);
        error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
                        ctrl->ana_log_buf, ctrl->ana_log_size, 0);
        if (error) {
                dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
                goto out_unlock;
        }

        error = nvme_parse_ana_log(ctrl, &nr_change_groups,
                        nvme_update_ana_state);
        if (error)
                goto out_unlock;

        /*
         * In theory we should have an ANATT timer per group as they might enter
         * the change state at different times.  But that is a lot of overhead
         * just to protect against a target that keeps entering new change
         * states while never finishing previous ones.  We'll still eventually
         * time out once all groups are in change state, so this isn't a big
         * deal.
         *
         * We also double the ANATT value to provide some slack for transports
         * or AEN processing overhead.
         */
        if (nr_change_groups)
                mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
        else
                del_timer_sync(&ctrl->anatt_timer);
out_unlock:
        mutex_unlock(&ctrl->ana_lock);
        return error;
}

static void nvme_ana_work(struct work_struct *work)
{
        struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

        if (ctrl->state != NVME_CTRL_LIVE)
                return;

        nvme_read_ana_log(ctrl);
}

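/*
 * The ANA log kept reporting groups in the change state past the
 * controller's advertised transition time (ANATT), so recover by
 * resetting the controller.
 */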
static void nvme_anatt_timeout(struct timer_list *t)
{
        struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

        dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
        nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
        if (!nvme_ctrl_use_ana(ctrl))
                return;
        del_timer_sync(&ctrl->anatt_timer);
        cancel_work_sync(&ctrl->ana_work);
}

#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
        struct device_attribute subsys_attr_##_name =   \
                __ATTR(_name, _mode, _show, _store)

static const char *nvme_iopolicy_names[] = {
        [NVME_IOPOLICY_NUMA]    = "numa",
        [NVME_IOPOLICY_RR]      = "round-robin",
};

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct nvme_subsystem *subsys =
                container_of(dev, struct nvme_subsystem, dev);

        return sprintf(buf, "%s\n",
                        nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct nvme_subsystem *subsys =
                container_of(dev, struct nvme_subsystem, dev);
        int i;

        for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
                if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
                        WRITE_ONCE(subsys->iopolicy, i);
                        return count;
                }
        }

        return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
                      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
                char *buf)
{
        return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
                char *buf)
{
        struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

        return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
                struct nvme_ana_group_desc *desc, void *data)
{
        struct nvme_ana_group_desc *dst = data;

        if (desc->grpid != dst->grpid)
                return 0;

        *dst = *desc;
        return -ENXIO; /* just break out of the loop */
}

void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
{
        if (nvme_ctrl_use_ana(ns->ctrl)) {
                struct nvme_ana_group_desc desc = {
                        .grpid = id->anagrpid,
                        .state = 0,
                };

                mutex_lock(&ns->ctrl->ana_lock);
                ns->ana_grpid = le32_to_cpu(id->anagrpid);
                nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
                mutex_unlock(&ns->ctrl->ana_lock);
                if (desc.state) {
                        /* found the group desc: update */
                        nvme_update_ns_ana_state(&desc, ns);
                }
        } else {
                ns->ana_state = NVME_ANA_OPTIMIZED;
                nvme_mpath_set_live(ns);
        }

        if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
                blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
                                   ns->head->disk->queue);
}

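/*
 * Tear down the multipath node: unregister the gendisk, flush any still
 * pending requeued bios against the now dying queue, and release the
 * queue and disk.
 */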
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
        if (!head->disk)
                return;
        if (head->disk->flags & GENHD_FL_UP)
                del_gendisk(head->disk);
        blk_set_queue_dying(head->disk->queue);
        /* make sure all pending bios are cleaned up */
        kblockd_schedule_work(&head->requeue_work);
        flush_work(&head->requeue_work);
        blk_cleanup_queue(head->disk->queue);
        if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
                /*
                 * if device_add_disk wasn't called, prevent
                 * disk release from putting a bogus reference on the
                 * request queue
                 */
                head->disk->queue = NULL;
        }
        put_disk(head->disk);
}

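/*
 * Initialize ANA support from the Identify Controller data: record the
 * ANA capabilities, size and allocate the ANA log buffer, and do the
 * initial log page read.
 */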
int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
        int error;

        /* check if multipath is enabled and we have the capability */
        if (!multipath || !ctrl->subsys ||
            !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
                return 0;

        ctrl->anacap = id->anacap;
        ctrl->anatt = id->anatt;
        ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
        ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

        mutex_init(&ctrl->ana_lock);
        timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
        ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
                ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
        ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);

        if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
                dev_err(ctrl->device,
                        "ANA log page size (%zd) larger than MDTS (%d).\n",
                        ctrl->ana_log_size,
                        ctrl->max_hw_sectors << SECTOR_SHIFT);
                dev_err(ctrl->device, "disabling ANA support.\n");
                return 0;
        }

        INIT_WORK(&ctrl->ana_work, nvme_ana_work);
        kfree(ctrl->ana_log_buf);
        ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
        if (!ctrl->ana_log_buf) {
                error = -ENOMEM;
                goto out;
        }

        error = nvme_read_ana_log(ctrl);
        if (error)
                goto out_free_ana_log_buf;
        return 0;
out_free_ana_log_buf:
        kfree(ctrl->ana_log_buf);
        ctrl->ana_log_buf = NULL;
out:
        return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
        kfree(ctrl->ana_log_buf);
        ctrl->ana_log_buf = NULL;
}