linux/drivers/nvme/host/multipath.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2017-2018 Christoph Hellwig.
   4 */
   5
   6#include <linux/backing-dev.h>
   7#include <linux/moduleparam.h>
   8#include <trace/events/block.h>
   9#include "nvme.h"
  10
  11static bool multipath = true;
  12module_param(multipath, bool, 0444);
  13MODULE_PARM_DESC(multipath,
  14        "turn on native support for multiple controllers per subsystem");
  15
  16void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
  17{
  18        struct nvme_ns_head *h;
  19
  20        lockdep_assert_held(&subsys->lock);
  21        list_for_each_entry(h, &subsys->nsheads, entry)
  22                if (h->disk)
  23                        blk_mq_unfreeze_queue(h->disk->queue);
  24}
  25
  26void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
  27{
  28        struct nvme_ns_head *h;
  29
  30        lockdep_assert_held(&subsys->lock);
  31        list_for_each_entry(h, &subsys->nsheads, entry)
  32                if (h->disk)
  33                        blk_mq_freeze_queue_wait(h->disk->queue);
  34}
  35
  36void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
  37{
  38        struct nvme_ns_head *h;
  39
  40        lockdep_assert_held(&subsys->lock);
  41        list_for_each_entry(h, &subsys->nsheads, entry)
  42                if (h->disk)
  43                        blk_freeze_queue_start(h->disk->queue);
  44}
  45
  46/*
  47 * If multipathing is enabled we need to always use the subsystem instance
  48 * number for numbering our devices to avoid conflicts between subsystems that
  49 * have multiple controllers and thus use the multipath-aware subsystem node
  50 * and those that have a single controller and use the controller node
  51 * directly.
  52 */
  53void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
  54                        struct nvme_ctrl *ctrl, int *flags)
  55{
  56        if (!multipath) {
  57                sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
  58        } else if (ns->head->disk) {
  59                sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
  60                                ctrl->instance, ns->head->instance);
  61                *flags = GENHD_FL_HIDDEN;
  62        } else {
  63                sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
  64                                ns->head->instance);
  65        }
  66}
  67
  68void nvme_failover_req(struct request *req)
  69{
  70        struct nvme_ns *ns = req->q->queuedata;
  71        u16 status = nvme_req(req)->status & 0x7ff;
  72        unsigned long flags;
  73
  74        nvme_mpath_clear_current_path(ns);
  75
  76        /*
  77         * If we got back an ANA error, we know the controller is alive but not
  78         * ready to serve this namespace.  Kick of a re-read of the ANA
  79         * information page, and just try any other available path for now.
  80         */
  81        if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
  82                set_bit(NVME_NS_ANA_PENDING, &ns->flags);
  83                queue_work(nvme_wq, &ns->ctrl->ana_work);
  84        }
  85
  86        spin_lock_irqsave(&ns->head->requeue_lock, flags);
  87        blk_steal_bios(&ns->head->requeue_list, req);
  88        spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
  89
  90        blk_mq_end_request(req, 0);
  91        kblockd_schedule_work(&ns->head->requeue_work);
  92}
  93
  94void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
  95{
  96        struct nvme_ns *ns;
  97
  98        down_read(&ctrl->namespaces_rwsem);
  99        list_for_each_entry(ns, &ctrl->namespaces, list) {
 100                if (ns->head->disk)
 101                        kblockd_schedule_work(&ns->head->requeue_work);
 102        }
 103        up_read(&ctrl->namespaces_rwsem);
 104}
 105
 106static const char *nvme_ana_state_names[] = {
 107        [0]                             = "invalid state",
 108        [NVME_ANA_OPTIMIZED]            = "optimized",
 109        [NVME_ANA_NONOPTIMIZED]         = "non-optimized",
 110        [NVME_ANA_INACCESSIBLE]         = "inaccessible",
 111        [NVME_ANA_PERSISTENT_LOSS]      = "persistent-loss",
 112        [NVME_ANA_CHANGE]               = "change",
 113};
 114
 115bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 116{
 117        struct nvme_ns_head *head = ns->head;
 118        bool changed = false;
 119        int node;
 120
 121        if (!head)
 122                goto out;
 123
 124        for_each_node(node) {
 125                if (ns == rcu_access_pointer(head->current_path[node])) {
 126                        rcu_assign_pointer(head->current_path[node], NULL);
 127                        changed = true;
 128                }
 129        }
 130out:
 131        return changed;
 132}
 133
 134void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 135{
 136        struct nvme_ns *ns;
 137
 138        mutex_lock(&ctrl->scan_lock);
 139        down_read(&ctrl->namespaces_rwsem);
 140        list_for_each_entry(ns, &ctrl->namespaces, list)
 141                if (nvme_mpath_clear_current_path(ns))
 142                        kblockd_schedule_work(&ns->head->requeue_work);
 143        up_read(&ctrl->namespaces_rwsem);
 144        mutex_unlock(&ctrl->scan_lock);
 145}
 146
 147static bool nvme_path_is_disabled(struct nvme_ns *ns)
 148{
 149        /*
 150         * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
 151         * still be able to complete assuming that the controller is connected.
 152         * Otherwise it will fail immediately and return to the requeue list.
 153         */
 154        if (ns->ctrl->state != NVME_CTRL_LIVE &&
 155            ns->ctrl->state != NVME_CTRL_DELETING)
 156                return true;
 157        if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
 158            test_bit(NVME_NS_REMOVING, &ns->flags))
 159                return true;
 160        return false;
 161}
 162
 163static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 164{
 165        int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
 166        struct nvme_ns *found = NULL, *fallback = NULL, *ns;
 167
 168        list_for_each_entry_rcu(ns, &head->list, siblings) {
 169                if (nvme_path_is_disabled(ns))
 170                        continue;
 171
 172                if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
 173                        distance = node_distance(node, ns->ctrl->numa_node);
 174                else
 175                        distance = LOCAL_DISTANCE;
 176
 177                switch (ns->ana_state) {
 178                case NVME_ANA_OPTIMIZED:
 179                        if (distance < found_distance) {
 180                                found_distance = distance;
 181                                found = ns;
 182                        }
 183                        break;
 184                case NVME_ANA_NONOPTIMIZED:
 185                        if (distance < fallback_distance) {
 186                                fallback_distance = distance;
 187                                fallback = ns;
 188                        }
 189                        break;
 190                default:
 191                        break;
 192                }
 193        }
 194
 195        if (!found)
 196                found = fallback;
 197        if (found)
 198                rcu_assign_pointer(head->current_path[node], found);
 199        return found;
 200}
 201
 202static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
 203                struct nvme_ns *ns)
 204{
 205        ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
 206                        siblings);
 207        if (ns)
 208                return ns;
 209        return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
 210}
 211
 212static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
 213                int node, struct nvme_ns *old)
 214{
 215        struct nvme_ns *ns, *found = NULL;
 216
 217        if (list_is_singular(&head->list)) {
 218                if (nvme_path_is_disabled(old))
 219                        return NULL;
 220                return old;
 221        }
 222
 223        for (ns = nvme_next_ns(head, old);
 224             ns != old;
 225             ns = nvme_next_ns(head, ns)) {
 226                if (nvme_path_is_disabled(ns))
 227                        continue;
 228
 229                if (ns->ana_state == NVME_ANA_OPTIMIZED) {
 230                        found = ns;
 231                        goto out;
 232                }
 233                if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
 234                        found = ns;
 235        }
 236
 237        /*
 238         * The loop above skips the current path for round-robin semantics.
 239         * Fall back to the current path if either:
 240         *  - no other optimized path found and current is optimized,
 241         *  - no other usable path found and current is usable.
 242         */
 243        if (!nvme_path_is_disabled(old) &&
 244            (old->ana_state == NVME_ANA_OPTIMIZED ||
 245             (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
 246                return old;
 247
 248        if (!found)
 249                return NULL;
 250out:
 251        rcu_assign_pointer(head->current_path[node], found);
 252        return found;
 253}
 254
 255static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 256{
 257        return ns->ctrl->state == NVME_CTRL_LIVE &&
 258                ns->ana_state == NVME_ANA_OPTIMIZED;
 259}
 260
 261inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 262{
 263        int node = numa_node_id();
 264        struct nvme_ns *ns;
 265
 266        ns = srcu_dereference(head->current_path[node], &head->srcu);
 267        if (unlikely(!ns))
 268                return __nvme_find_path(head, node);
 269
 270        if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
 271                return nvme_round_robin_path(head, node, ns);
 272        if (unlikely(!nvme_path_is_optimized(ns)))
 273                return __nvme_find_path(head, node);
 274        return ns;
 275}
 276
 277static bool nvme_available_path(struct nvme_ns_head *head)
 278{
 279        struct nvme_ns *ns;
 280
 281        list_for_each_entry_rcu(ns, &head->list, siblings) {
 282                switch (ns->ctrl->state) {
 283                case NVME_CTRL_LIVE:
 284                case NVME_CTRL_RESETTING:
 285                case NVME_CTRL_CONNECTING:
 286                        /* fallthru */
 287                        return true;
 288                default:
 289                        break;
 290                }
 291        }
 292        return false;
 293}
 294
 295blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
 296{
 297        struct nvme_ns_head *head = bio->bi_disk->private_data;
 298        struct device *dev = disk_to_dev(head->disk);
 299        struct nvme_ns *ns;
 300        blk_qc_t ret = BLK_QC_T_NONE;
 301        int srcu_idx;
 302
 303        /*
 304         * The namespace might be going away and the bio might be moved to a
 305         * different queue via blk_steal_bios(), so we need to use the bio_split
 306         * pool from the original queue to allocate the bvecs from.
 307         */
 308        blk_queue_split(&bio);
 309
 310        srcu_idx = srcu_read_lock(&head->srcu);
 311        ns = nvme_find_path(head);
 312        if (likely(ns)) {
 313                bio->bi_disk = ns->disk;
 314                bio->bi_opf |= REQ_NVME_MPATH;
 315                trace_block_bio_remap(bio->bi_disk->queue, bio,
 316                                      disk_devt(ns->head->disk),
 317                                      bio->bi_iter.bi_sector);
 318                ret = submit_bio_noacct(bio);
 319        } else if (nvme_available_path(head)) {
 320                dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
 321
 322                spin_lock_irq(&head->requeue_lock);
 323                bio_list_add(&head->requeue_list, bio);
 324                spin_unlock_irq(&head->requeue_lock);
 325        } else {
 326                dev_warn_ratelimited(dev, "no available path - failing I/O\n");
 327
 328                bio->bi_status = BLK_STS_IOERR;
 329                bio_endio(bio);
 330        }
 331
 332        srcu_read_unlock(&head->srcu, srcu_idx);
 333        return ret;
 334}
 335
 336static void nvme_requeue_work(struct work_struct *work)
 337{
 338        struct nvme_ns_head *head =
 339                container_of(work, struct nvme_ns_head, requeue_work);
 340        struct bio *bio, *next;
 341
 342        spin_lock_irq(&head->requeue_lock);
 343        next = bio_list_get(&head->requeue_list);
 344        spin_unlock_irq(&head->requeue_lock);
 345
 346        while ((bio = next) != NULL) {
 347                next = bio->bi_next;
 348                bio->bi_next = NULL;
 349
 350                /*
 351                 * Reset disk to the mpath node and resubmit to select a new
 352                 * path.
 353                 */
 354                bio->bi_disk = head->disk;
 355                submit_bio_noacct(bio);
 356        }
 357}
 358
 359int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 360{
 361        struct request_queue *q;
 362        bool vwc = false;
 363
 364        mutex_init(&head->lock);
 365        bio_list_init(&head->requeue_list);
 366        spin_lock_init(&head->requeue_lock);
 367        INIT_WORK(&head->requeue_work, nvme_requeue_work);
 368
 369        /*
 370         * Add a multipath node if the subsystems supports multiple controllers.
 371         * We also do this for private namespaces as the namespace sharing data could
 372         * change after a rescan.
 373         */
 374        if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath)
 375                return 0;
 376
 377        q = blk_alloc_queue(ctrl->numa_node);
 378        if (!q)
 379                goto out;
 380        blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
 381        /* set to a default value for 512 until disk is validated */
 382        blk_queue_logical_block_size(q, 512);
 383        blk_set_stacking_limits(&q->limits);
 384
 385        /* we need to propagate up the VMC settings */
 386        if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
 387                vwc = true;
 388        blk_queue_write_cache(q, vwc, vwc);
 389
 390        head->disk = alloc_disk(0);
 391        if (!head->disk)
 392                goto out_cleanup_queue;
 393        head->disk->fops = &nvme_ns_head_ops;
 394        head->disk->private_data = head;
 395        head->disk->queue = q;
 396        head->disk->flags = GENHD_FL_EXT_DEVT;
 397        sprintf(head->disk->disk_name, "nvme%dn%d",
 398                        ctrl->subsys->instance, head->instance);
 399        return 0;
 400
 401out_cleanup_queue:
 402        blk_cleanup_queue(q);
 403out:
 404        return -ENOMEM;
 405}
 406
 407static void nvme_mpath_set_live(struct nvme_ns *ns)
 408{
 409        struct nvme_ns_head *head = ns->head;
 410
 411        if (!head->disk)
 412                return;
 413
 414        if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
 415                device_add_disk(&head->subsys->dev, head->disk,
 416                                nvme_ns_id_attr_groups);
 417
 418        mutex_lock(&head->lock);
 419        if (nvme_path_is_optimized(ns)) {
 420                int node, srcu_idx;
 421
 422                srcu_idx = srcu_read_lock(&head->srcu);
 423                for_each_node(node)
 424                        __nvme_find_path(head, node);
 425                srcu_read_unlock(&head->srcu, srcu_idx);
 426        }
 427        mutex_unlock(&head->lock);
 428
 429        synchronize_srcu(&head->srcu);
 430        kblockd_schedule_work(&head->requeue_work);
 431}
 432
 433static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
 434                int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
 435                        void *))
 436{
 437        void *base = ctrl->ana_log_buf;
 438        size_t offset = sizeof(struct nvme_ana_rsp_hdr);
 439        int error, i;
 440
 441        lockdep_assert_held(&ctrl->ana_lock);
 442
 443        for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
 444                struct nvme_ana_group_desc *desc = base + offset;
 445                u32 nr_nsids;
 446                size_t nsid_buf_size;
 447
 448                if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
 449                        return -EINVAL;
 450
 451                nr_nsids = le32_to_cpu(desc->nnsids);
 452                nsid_buf_size = nr_nsids * sizeof(__le32);
 453
 454                if (WARN_ON_ONCE(desc->grpid == 0))
 455                        return -EINVAL;
 456                if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
 457                        return -EINVAL;
 458                if (WARN_ON_ONCE(desc->state == 0))
 459                        return -EINVAL;
 460                if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
 461                        return -EINVAL;
 462
 463                offset += sizeof(*desc);
 464                if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
 465                        return -EINVAL;
 466
 467                error = cb(ctrl, desc, data);
 468                if (error)
 469                        return error;
 470
 471                offset += nsid_buf_size;
 472        }
 473
 474        return 0;
 475}
 476
 477static inline bool nvme_state_is_live(enum nvme_ana_state state)
 478{
 479        return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
 480}
 481
 482static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
 483                struct nvme_ns *ns)
 484{
 485        ns->ana_grpid = le32_to_cpu(desc->grpid);
 486        ns->ana_state = desc->state;
 487        clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
 488
 489        if (nvme_state_is_live(ns->ana_state))
 490                nvme_mpath_set_live(ns);
 491}
 492
 493static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 494                struct nvme_ana_group_desc *desc, void *data)
 495{
 496        u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
 497        unsigned *nr_change_groups = data;
 498        struct nvme_ns *ns;
 499
 500        dev_dbg(ctrl->device, "ANA group %d: %s.\n",
 501                        le32_to_cpu(desc->grpid),
 502                        nvme_ana_state_names[desc->state]);
 503
 504        if (desc->state == NVME_ANA_CHANGE)
 505                (*nr_change_groups)++;
 506
 507        if (!nr_nsids)
 508                return 0;
 509
 510        down_read(&ctrl->namespaces_rwsem);
 511        list_for_each_entry(ns, &ctrl->namespaces, list) {
 512                unsigned nsid = le32_to_cpu(desc->nsids[n]);
 513
 514                if (ns->head->ns_id < nsid)
 515                        continue;
 516                if (ns->head->ns_id == nsid)
 517                        nvme_update_ns_ana_state(desc, ns);
 518                if (++n == nr_nsids)
 519                        break;
 520        }
 521        up_read(&ctrl->namespaces_rwsem);
 522        return 0;
 523}
 524
 525static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
 526{
 527        u32 nr_change_groups = 0;
 528        int error;
 529
 530        mutex_lock(&ctrl->ana_lock);
 531        error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
 532                        ctrl->ana_log_buf, ctrl->ana_log_size, 0);
 533        if (error) {
 534                dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
 535                goto out_unlock;
 536        }
 537
 538        error = nvme_parse_ana_log(ctrl, &nr_change_groups,
 539                        nvme_update_ana_state);
 540        if (error)
 541                goto out_unlock;
 542
 543        /*
 544         * In theory we should have an ANATT timer per group as they might enter
 545         * the change state at different times.  But that is a lot of overhead
 546         * just to protect against a target that keeps entering new changes
 547         * states while never finishing previous ones.  But we'll still
 548         * eventually time out once all groups are in change state, so this
 549         * isn't a big deal.
 550         *
 551         * We also double the ANATT value to provide some slack for transports
 552         * or AEN processing overhead.
 553         */
 554        if (nr_change_groups)
 555                mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
 556        else
 557                del_timer_sync(&ctrl->anatt_timer);
 558out_unlock:
 559        mutex_unlock(&ctrl->ana_lock);
 560        return error;
 561}
 562
 563static void nvme_ana_work(struct work_struct *work)
 564{
 565        struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
 566
 567        if (ctrl->state != NVME_CTRL_LIVE)
 568                return;
 569
 570        nvme_read_ana_log(ctrl);
 571}
 572
 573static void nvme_anatt_timeout(struct timer_list *t)
 574{
 575        struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
 576
 577        dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
 578        nvme_reset_ctrl(ctrl);
 579}
 580
 581void nvme_mpath_stop(struct nvme_ctrl *ctrl)
 582{
 583        if (!nvme_ctrl_use_ana(ctrl))
 584                return;
 585        del_timer_sync(&ctrl->anatt_timer);
 586        cancel_work_sync(&ctrl->ana_work);
 587}
 588
 589#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
 590        struct device_attribute subsys_attr_##_name =   \
 591                __ATTR(_name, _mode, _show, _store)
 592
 593static const char *nvme_iopolicy_names[] = {
 594        [NVME_IOPOLICY_NUMA]    = "numa",
 595        [NVME_IOPOLICY_RR]      = "round-robin",
 596};
 597
 598static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
 599                struct device_attribute *attr, char *buf)
 600{
 601        struct nvme_subsystem *subsys =
 602                container_of(dev, struct nvme_subsystem, dev);
 603
 604        return sprintf(buf, "%s\n",
 605                        nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
 606}
 607
 608static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
 609                struct device_attribute *attr, const char *buf, size_t count)
 610{
 611        struct nvme_subsystem *subsys =
 612                container_of(dev, struct nvme_subsystem, dev);
 613        int i;
 614
 615        for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
 616                if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
 617                        WRITE_ONCE(subsys->iopolicy, i);
 618                        return count;
 619                }
 620        }
 621
 622        return -EINVAL;
 623}
 624SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
 625                      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
 626
 627static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
 628                char *buf)
 629{
 630        return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
 631}
 632DEVICE_ATTR_RO(ana_grpid);
 633
 634static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
 635                char *buf)
 636{
 637        struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 638
 639        return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
 640}
 641DEVICE_ATTR_RO(ana_state);
 642
 643static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
 644                struct nvme_ana_group_desc *desc, void *data)
 645{
 646        struct nvme_ana_group_desc *dst = data;
 647
 648        if (desc->grpid != dst->grpid)
 649                return 0;
 650
 651        *dst = *desc;
 652        return -ENXIO; /* just break out of the loop */
 653}
 654
 655void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
 656{
 657        if (nvme_ctrl_use_ana(ns->ctrl)) {
 658                struct nvme_ana_group_desc desc = {
 659                        .grpid = id->anagrpid,
 660                        .state = 0,
 661                };
 662
 663                mutex_lock(&ns->ctrl->ana_lock);
 664                ns->ana_grpid = le32_to_cpu(id->anagrpid);
 665                nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
 666                mutex_unlock(&ns->ctrl->ana_lock);
 667                if (desc.state) {
 668                        /* found the group desc: update */
 669                        nvme_update_ns_ana_state(&desc, ns);
 670                }
 671        } else {
 672                ns->ana_state = NVME_ANA_OPTIMIZED; 
 673                nvme_mpath_set_live(ns);
 674        }
 675
 676        if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
 677                struct gendisk *disk = ns->head->disk;
 678
 679                if (disk)
 680                        disk->queue->backing_dev_info->capabilities |=
 681                                        BDI_CAP_STABLE_WRITES;
 682        }
 683}
 684
 685void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 686{
 687        if (!head->disk)
 688                return;
 689        if (head->disk->flags & GENHD_FL_UP)
 690                del_gendisk(head->disk);
 691        blk_set_queue_dying(head->disk->queue);
 692        /* make sure all pending bios are cleaned up */
 693        kblockd_schedule_work(&head->requeue_work);
 694        flush_work(&head->requeue_work);
 695        blk_cleanup_queue(head->disk->queue);
 696        if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
 697                /*
 698                 * if device_add_disk wasn't called, prevent
 699                 * disk release to put a bogus reference on the
 700                 * request queue
 701                 */
 702                head->disk->queue = NULL;
 703        }
 704        put_disk(head->disk);
 705}
 706
 707int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 708{
 709        int error;
 710
 711        /* check if multipath is enabled and we have the capability */
 712        if (!multipath || !ctrl->subsys ||
 713            !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
 714                return 0;
 715
 716        ctrl->anacap = id->anacap;
 717        ctrl->anatt = id->anatt;
 718        ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
 719        ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
 720
 721        mutex_init(&ctrl->ana_lock);
 722        timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
 723        ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
 724                ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
 725        ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
 726
 727        if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
 728                dev_err(ctrl->device,
 729                        "ANA log page size (%zd) larger than MDTS (%d).\n",
 730                        ctrl->ana_log_size,
 731                        ctrl->max_hw_sectors << SECTOR_SHIFT);
 732                dev_err(ctrl->device, "disabling ANA support.\n");
 733                return 0;
 734        }
 735
 736        INIT_WORK(&ctrl->ana_work, nvme_ana_work);
 737        kfree(ctrl->ana_log_buf);
 738        ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
 739        if (!ctrl->ana_log_buf) {
 740                error = -ENOMEM;
 741                goto out;
 742        }
 743
 744        error = nvme_read_ana_log(ctrl);
 745        if (error)
 746                goto out_free_ana_log_buf;
 747        return 0;
 748out_free_ana_log_buf:
 749        kfree(ctrl->ana_log_buf);
 750        ctrl->ana_log_buf = NULL;
 751out:
 752        return error;
 753}
 754
 755void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
 756{
 757        kfree(ctrl->ana_log_buf);
 758        ctrl->ana_log_buf = NULL;
 759}
 760
 761