linux/drivers/md/dm-mpath.c
   1/*
   2 * Copyright (C) 2003 Sistina Software Limited.
   3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include <linux/device-mapper.h>
   9
  10#include "dm-rq.h"
  11#include "dm-bio-record.h"
  12#include "dm-path-selector.h"
  13#include "dm-uevent.h"
  14
  15#include <linux/blkdev.h>
  16#include <linux/ctype.h>
  17#include <linux/init.h>
  18#include <linux/mempool.h>
  19#include <linux/module.h>
  20#include <linux/pagemap.h>
  21#include <linux/slab.h>
  22#include <linux/time.h>
  23#include <linux/timer.h>
  24#include <linux/workqueue.h>
  25#include <linux/delay.h>
  26#include <scsi/scsi_dh.h>
  27#include <linux/atomic.h>
  28#include <linux/blk-mq.h>
  29
  30#define DM_MSG_PREFIX "multipath"
  31#define DM_PG_INIT_DELAY_MSECS 2000
  32#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
  33#define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0
  34
  35static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
  36
  37/* Path properties */
  38struct pgpath {
  39        struct list_head list;
  40
  41        struct priority_group *pg;      /* Owning PG */
  42        unsigned fail_count;            /* Cumulative failure count */
  43
  44        struct dm_path path;
  45        struct delayed_work activate_path;
  46
  47        bool is_active:1;               /* Path status */
  48};
  49
  50#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
  51
  52/*
  53 * Paths are grouped into Priority Groups and numbered from 1 upwards.
  54 * Each has a path selector which controls which path gets used.
  55 */
  56struct priority_group {
  57        struct list_head list;
  58
  59        struct multipath *m;            /* Owning multipath instance */
  60        struct path_selector ps;
  61
  62        unsigned pg_num;                /* Reference number */
  63        unsigned nr_pgpaths;            /* Number of paths in PG */
  64        struct list_head pgpaths;
  65
  66        bool bypassed:1;                /* Temporarily bypass this PG? */
  67};
  68
  69/* Multipath context */
  70struct multipath {
  71        unsigned long flags;            /* Multipath state flags */
  72
  73        spinlock_t lock;
  74        enum dm_queue_mode queue_mode;
  75
  76        struct pgpath *current_pgpath;
  77        struct priority_group *current_pg;
  78        struct priority_group *next_pg; /* Switch to this PG if set */
  79
  80        atomic_t nr_valid_paths;        /* Total number of usable paths */
  81        unsigned nr_priority_groups;
  82        struct list_head priority_groups;
  83
  84        const char *hw_handler_name;
  85        char *hw_handler_params;
  86        wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
  87        unsigned pg_init_retries;       /* Number of times to retry pg_init */
  88        unsigned pg_init_delay_msecs;   /* Number of msecs before pg_init retry */
  89        atomic_t pg_init_in_progress;   /* Only one pg_init allowed at once */
  90        atomic_t pg_init_count;         /* Number of times pg_init called */
  91
  92        struct mutex work_mutex;
  93        struct work_struct trigger_event;
  94        struct dm_target *ti;
  95
  96        struct work_struct process_queued_bios;
  97        struct bio_list queued_bios;
  98
  99        struct timer_list nopath_timer; /* Timeout for queue_if_no_path */
 100};
 101
 102/*
 103 * Context information attached to each io we process.
 104 */
 105struct dm_mpath_io {
 106        struct pgpath *pgpath;
 107        size_t nr_bytes;
 108};
 109
 110typedef int (*action_fn) (struct pgpath *pgpath);
 111
 112static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
 113static void trigger_event(struct work_struct *work);
 114static void activate_or_offline_path(struct pgpath *pgpath);
 115static void activate_path_work(struct work_struct *work);
 116static void process_queued_bios(struct work_struct *work);
 117static void queue_if_no_path_timeout_work(struct timer_list *t);
 118
 119/*-----------------------------------------------
 120 * Multipath state flags.
 121 *-----------------------------------------------*/
 122
 123#define MPATHF_QUEUE_IO 0                       /* Must we queue all I/O? */
 124#define MPATHF_QUEUE_IF_NO_PATH 1               /* Queue I/O if last path fails? */
 125#define MPATHF_SAVED_QUEUE_IF_NO_PATH 2         /* Saved state during suspension */
 126#define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3     /* If there's already a hw_handler present, don't change it. */
 127#define MPATHF_PG_INIT_DISABLED 4               /* pg_init is not currently allowed */
 128#define MPATHF_PG_INIT_REQUIRED 5               /* pg_init needs calling? */
 129#define MPATHF_PG_INIT_DELAY_RETRY 6            /* Delay pg_init retry? */
 130
 131/*-----------------------------------------------
 132 * Allocation routines
 133 *-----------------------------------------------*/
 134
 135static struct pgpath *alloc_pgpath(void)
 136{
 137        struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
 138
 139        if (!pgpath)
 140                return NULL;
 141
 142        pgpath->is_active = true;
 143
 144        return pgpath;
 145}
 146
 147static void free_pgpath(struct pgpath *pgpath)
 148{
 149        kfree(pgpath);
 150}
 151
 152static struct priority_group *alloc_priority_group(void)
 153{
 154        struct priority_group *pg;
 155
 156        pg = kzalloc(sizeof(*pg), GFP_KERNEL);
 157
 158        if (pg)
 159                INIT_LIST_HEAD(&pg->pgpaths);
 160
 161        return pg;
 162}
 163
 164static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
 165{
 166        struct pgpath *pgpath, *tmp;
 167
 168        list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
 169                list_del(&pgpath->list);
 170                dm_put_device(ti, pgpath->path.dev);
 171                free_pgpath(pgpath);
 172        }
 173}
 174
 175static void free_priority_group(struct priority_group *pg,
 176                                struct dm_target *ti)
 177{
 178        struct path_selector *ps = &pg->ps;
 179
 180        if (ps->type) {
 181                ps->type->destroy(ps);
 182                dm_put_path_selector(ps->type);
 183        }
 184
 185        free_pgpaths(&pg->pgpaths, ti);
 186        kfree(pg);
 187}
 188
 189static struct multipath *alloc_multipath(struct dm_target *ti)
 190{
 191        struct multipath *m;
 192
 193        m = kzalloc(sizeof(*m), GFP_KERNEL);
 194        if (m) {
 195                INIT_LIST_HEAD(&m->priority_groups);
 196                spin_lock_init(&m->lock);
 197                atomic_set(&m->nr_valid_paths, 0);
 198                INIT_WORK(&m->trigger_event, trigger_event);
 199                mutex_init(&m->work_mutex);
 200
 201                m->queue_mode = DM_TYPE_NONE;
 202
 203                m->ti = ti;
 204                ti->private = m;
 205
 206                timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0);
 207        }
 208
 209        return m;
 210}
 211
 212static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
 213{
 214        if (m->queue_mode == DM_TYPE_NONE) {
 215                m->queue_mode = DM_TYPE_REQUEST_BASED;
 216        } else if (m->queue_mode == DM_TYPE_BIO_BASED) {
 217                INIT_WORK(&m->process_queued_bios, process_queued_bios);
 218                /*
 219                 * bio-based doesn't support any direct scsi_dh management;
 220                 * it just discovers if a scsi_dh is attached.
 221                 */
 222                set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
 223        }
 224
 225        dm_table_set_type(ti->table, m->queue_mode);
 226
 227        /*
 228         * Init fields that are only used when a scsi_dh is attached
 229         * - must do this unconditionally (really doesn't hurt non-SCSI uses)
 230         */
 231        set_bit(MPATHF_QUEUE_IO, &m->flags);
 232        atomic_set(&m->pg_init_in_progress, 0);
 233        atomic_set(&m->pg_init_count, 0);
 234        m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
 235        init_waitqueue_head(&m->pg_init_wait);
 236
 237        return 0;
 238}
 239
 240static void free_multipath(struct multipath *m)
 241{
 242        struct priority_group *pg, *tmp;
 243
 244        list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
 245                list_del(&pg->list);
 246                free_priority_group(pg, m->ti);
 247        }
 248
 249        kfree(m->hw_handler_name);
 250        kfree(m->hw_handler_params);
 251        mutex_destroy(&m->work_mutex);
 252        kfree(m);
 253}
 254
 255static struct dm_mpath_io *get_mpio(union map_info *info)
 256{
 257        return info->ptr;
 258}
 259
 260static size_t multipath_per_bio_data_size(void)
 261{
 262        return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details);
 263}
 264
 265static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio)
 266{
 267        return dm_per_bio_data(bio, multipath_per_bio_data_size());
 268}
 269
 270static struct dm_bio_details *get_bio_details_from_mpio(struct dm_mpath_io *mpio)
 271{
 272        /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */
 273        void *bio_details = mpio + 1;
 274        return bio_details;
 275}
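/*
 * Layout sketch of the per-bio-data reserved via ti->per_io_data_size for
 * bio-based multipath (derived from the two helpers above):
 *
 *   dm_per_bio_data(bio, ...)
 *   |
 *   v
 *   +--------------------+-----------------------+
 *   | struct dm_mpath_io | struct dm_bio_details |
 *   +--------------------+-----------------------+
 *                        ^
 *                        mpio + 1 (get_bio_details_from_mpio)
 */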
 276
 277static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p)
 278{
 279        struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
 280        struct dm_bio_details *bio_details = get_bio_details_from_mpio(mpio);
 281
 282        mpio->nr_bytes = bio->bi_iter.bi_size;
 283        mpio->pgpath = NULL;
 284        *mpio_p = mpio;
 285
 286        dm_bio_record(bio_details, bio);
 287}
 288
 289/*-----------------------------------------------
 290 * Path selection
 291 *-----------------------------------------------*/
 292
 293static int __pg_init_all_paths(struct multipath *m)
 294{
 295        struct pgpath *pgpath;
 296        unsigned long pg_init_delay = 0;
 297
 298        lockdep_assert_held(&m->lock);
 299
 300        if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
 301                return 0;
 302
 303        atomic_inc(&m->pg_init_count);
 304        clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
 305
 306        /* Check here to reset pg_init_required */
 307        if (!m->current_pg)
 308                return 0;
 309
 310        if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags))
 311                pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
 312                                                 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
 313        list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
 314                /* Skip failed paths */
 315                if (!pgpath->is_active)
 316                        continue;
 317                if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
 318                                       pg_init_delay))
 319                        atomic_inc(&m->pg_init_in_progress);
 320        }
 321        return atomic_read(&m->pg_init_in_progress);
 322}
 323
 324static int pg_init_all_paths(struct multipath *m)
 325{
 326        int ret;
 327        unsigned long flags;
 328
 329        spin_lock_irqsave(&m->lock, flags);
 330        ret = __pg_init_all_paths(m);
 331        spin_unlock_irqrestore(&m->lock, flags);
 332
 333        return ret;
 334}
 335
 336static void __switch_pg(struct multipath *m, struct priority_group *pg)
 337{
 338        m->current_pg = pg;
 339
 340        /* Must we initialise the PG first, and queue I/O till it's ready? */
 341        if (m->hw_handler_name) {
 342                set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
 343                set_bit(MPATHF_QUEUE_IO, &m->flags);
 344        } else {
 345                clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
 346                clear_bit(MPATHF_QUEUE_IO, &m->flags);
 347        }
 348
 349        atomic_set(&m->pg_init_count, 0);
 350}
 351
 352static struct pgpath *choose_path_in_pg(struct multipath *m,
 353                                        struct priority_group *pg,
 354                                        size_t nr_bytes)
 355{
 356        unsigned long flags;
 357        struct dm_path *path;
 358        struct pgpath *pgpath;
 359
 360        path = pg->ps.type->select_path(&pg->ps, nr_bytes);
 361        if (!path)
 362                return ERR_PTR(-ENXIO);
 363
 364        pgpath = path_to_pgpath(path);
 365
 366        if (unlikely(READ_ONCE(m->current_pg) != pg)) {
 367                /* Only update current_pgpath if pg changed */
 368                spin_lock_irqsave(&m->lock, flags);
 369                m->current_pgpath = pgpath;
 370                __switch_pg(m, pg);
 371                spin_unlock_irqrestore(&m->lock, flags);
 372        }
 373
 374        return pgpath;
 375}
 376
 377static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
 378{
 379        unsigned long flags;
 380        struct priority_group *pg;
 381        struct pgpath *pgpath;
 382        unsigned bypassed = 1;
 383
 384        if (!atomic_read(&m->nr_valid_paths)) {
 385                clear_bit(MPATHF_QUEUE_IO, &m->flags);
 386                goto failed;
 387        }
 388
 389        /* Were we instructed to switch PG? */
 390        if (READ_ONCE(m->next_pg)) {
 391                spin_lock_irqsave(&m->lock, flags);
 392                pg = m->next_pg;
 393                if (!pg) {
 394                        spin_unlock_irqrestore(&m->lock, flags);
 395                        goto check_current_pg;
 396                }
 397                m->next_pg = NULL;
 398                spin_unlock_irqrestore(&m->lock, flags);
 399                pgpath = choose_path_in_pg(m, pg, nr_bytes);
 400                if (!IS_ERR_OR_NULL(pgpath))
 401                        return pgpath;
 402        }
 403
 404        /* Don't change PG until it has no remaining paths */
 405check_current_pg:
 406        pg = READ_ONCE(m->current_pg);
 407        if (pg) {
 408                pgpath = choose_path_in_pg(m, pg, nr_bytes);
 409                if (!IS_ERR_OR_NULL(pgpath))
 410                        return pgpath;
 411        }
 412
 413        /*
 414         * Loop through priority groups until we find a valid path.
 415         * First time we skip PGs marked 'bypassed'.
 416         * Second time we only try the ones we skipped, but set
 417         * pg_init_delay_retry so we do not hammer controllers.
 418         */
 419        do {
 420                list_for_each_entry(pg, &m->priority_groups, list) {
 421                        if (pg->bypassed == !!bypassed)
 422                                continue;
 423                        pgpath = choose_path_in_pg(m, pg, nr_bytes);
 424                        if (!IS_ERR_OR_NULL(pgpath)) {
 425                                if (!bypassed)
 426                                        set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
 427                                return pgpath;
 428                        }
 429                }
 430        } while (bypassed--);
 431
 432failed:
 433        spin_lock_irqsave(&m->lock, flags);
 434        m->current_pgpath = NULL;
 435        m->current_pg = NULL;
 436        spin_unlock_irqrestore(&m->lock, flags);
 437
 438        return NULL;
 439}
 440
 441/*
 442 * dm_report_EIO() is a macro instead of a function to make pr_debug_ratelimited()
 443 * report the function name and line number of the function from which
 444 * it has been invoked.
 445 */
 446#define dm_report_EIO(m)                                                \
 447do {                                                                    \
 448        struct mapped_device *md = dm_table_get_md((m)->ti->table);     \
 449                                                                        \
 450        DMDEBUG_LIMIT("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d", \
 451                      dm_device_name(md),                               \
 452                      test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags),   \
 453                      test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \
 454                      dm_noflush_suspending((m)->ti));                  \
 455} while (0)
 456
 457/*
 458 * Check whether bios must be queued in the device-mapper core rather
 459 * than here in the target.
 460 */
 461static bool __must_push_back(struct multipath *m)
 462{
 463        return dm_noflush_suspending(m->ti);
 464}
 465
 466static bool must_push_back_rq(struct multipath *m)
 467{
 468        return test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m);
 469}
 470
 471/*
 472 * Map cloned requests (request-based multipath)
 473 */
 474static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
 475                                   union map_info *map_context,
 476                                   struct request **__clone)
 477{
 478        struct multipath *m = ti->private;
 479        size_t nr_bytes = blk_rq_bytes(rq);
 480        struct pgpath *pgpath;
 481        struct block_device *bdev;
 482        struct dm_mpath_io *mpio = get_mpio(map_context);
 483        struct request_queue *q;
 484        struct request *clone;
 485
 486        /* Do we need to select a new pgpath? */
 487        pgpath = READ_ONCE(m->current_pgpath);
 488        if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
 489                pgpath = choose_pgpath(m, nr_bytes);
 490
 491        if (!pgpath) {
 492                if (must_push_back_rq(m))
 493                        return DM_MAPIO_DELAY_REQUEUE;
 494                dm_report_EIO(m);       /* Failed */
 495                return DM_MAPIO_KILL;
 496        } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
 497                   test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
 498                pg_init_all_paths(m);
 499                return DM_MAPIO_DELAY_REQUEUE;
 500        }
 501
 502        mpio->pgpath = pgpath;
 503        mpio->nr_bytes = nr_bytes;
 504
 505        bdev = pgpath->path.dev->bdev;
 506        q = bdev_get_queue(bdev);
 507        clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE,
 508                        BLK_MQ_REQ_NOWAIT);
 509        if (IS_ERR(clone)) {
 510                /* EBUSY, ENODEV or EWOULDBLOCK: requeue */
 511                if (blk_queue_dying(q)) {
 512                        atomic_inc(&m->pg_init_in_progress);
 513                        activate_or_offline_path(pgpath);
 514                        return DM_MAPIO_DELAY_REQUEUE;
 515                }
 516
 517                /*
 518                 * blk-mq's SCHED_RESTART can cover this requeue, so we
 519                 * needn't deal with it by DELAY_REQUEUE. More importantly,
 520                 * we have to return DM_MAPIO_REQUEUE so that blk-mq can
 521                 * get the queue busy feedback (via BLK_STS_RESOURCE),
 522                 * otherwise I/O merging can suffer.
 523                 */
 524                return DM_MAPIO_REQUEUE;
 525        }
 526        clone->bio = clone->biotail = NULL;
 527        clone->rq_disk = bdev->bd_disk;
 528        clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
 529        *__clone = clone;
 530
 531        if (pgpath->pg->ps.type->start_io)
 532                pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
 533                                              &pgpath->path,
 534                                              nr_bytes);
 535        return DM_MAPIO_REMAPPED;
 536}
 537
 538static void multipath_release_clone(struct request *clone,
 539                                    union map_info *map_context)
 540{
 541        if (unlikely(map_context)) {
 542                /*
 543                 * non-NULL map_context means caller is still map
 544                 * method; must undo multipath_clone_and_map()
 545                 */
 546                struct dm_mpath_io *mpio = get_mpio(map_context);
 547                struct pgpath *pgpath = mpio->pgpath;
 548
 549                if (pgpath && pgpath->pg->ps.type->end_io)
 550                        pgpath->pg->ps.type->end_io(&pgpath->pg->ps,
 551                                                    &pgpath->path,
 552                                                    mpio->nr_bytes,
 553                                                    clone->io_start_time_ns);
 554        }
 555
 556        blk_put_request(clone);
 557}
 558
 559/*
 560 * Map cloned bios (bio-based multipath)
 561 */
 562
 563static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
 564{
 565        struct pgpath *pgpath;
 566        unsigned long flags;
 567        bool queue_io;
 568
 569        /* Do we need to select a new pgpath? */
 570        pgpath = READ_ONCE(m->current_pgpath);
 571        if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
 572                pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
 573
 574        /* MPATHF_QUEUE_IO might have been cleared by choose_pgpath. */
 575        queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
 576
 577        if ((pgpath && queue_io) ||
 578            (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
 579                /* Queue for the daemon to resubmit */
 580                spin_lock_irqsave(&m->lock, flags);
 581                bio_list_add(&m->queued_bios, bio);
 582                spin_unlock_irqrestore(&m->lock, flags);
 583
 584                /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */
 585                if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
 586                        pg_init_all_paths(m);
 587                else if (!queue_io)
 588                        queue_work(kmultipathd, &m->process_queued_bios);
 589
 590                return ERR_PTR(-EAGAIN);
 591        }
 592
 593        return pgpath;
 594}
 595
 596static int __multipath_map_bio(struct multipath *m, struct bio *bio,
 597                               struct dm_mpath_io *mpio)
 598{
 599        struct pgpath *pgpath = __map_bio(m, bio);
 600
 601        if (IS_ERR(pgpath))
 602                return DM_MAPIO_SUBMITTED;
 603
 604        if (!pgpath) {
 605                if (__must_push_back(m))
 606                        return DM_MAPIO_REQUEUE;
 607                dm_report_EIO(m);
 608                return DM_MAPIO_KILL;
 609        }
 610
 611        mpio->pgpath = pgpath;
 612
 613        bio->bi_status = 0;
 614        bio_set_dev(bio, pgpath->path.dev->bdev);
 615        bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
 616
 617        if (pgpath->pg->ps.type->start_io)
 618                pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
 619                                              &pgpath->path,
 620                                              mpio->nr_bytes);
 621        return DM_MAPIO_REMAPPED;
 622}
 623
 624static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
 625{
 626        struct multipath *m = ti->private;
 627        struct dm_mpath_io *mpio = NULL;
 628
 629        multipath_init_per_bio_data(bio, &mpio);
 630        return __multipath_map_bio(m, bio, mpio);
 631}
 632
 633static void process_queued_io_list(struct multipath *m)
 634{
 635        if (m->queue_mode == DM_TYPE_REQUEST_BASED)
 636                dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table));
 637        else if (m->queue_mode == DM_TYPE_BIO_BASED)
 638                queue_work(kmultipathd, &m->process_queued_bios);
 639}
 640
 641static void process_queued_bios(struct work_struct *work)
 642{
 643        int r;
 644        unsigned long flags;
 645        struct bio *bio;
 646        struct bio_list bios;
 647        struct blk_plug plug;
 648        struct multipath *m =
 649                container_of(work, struct multipath, process_queued_bios);
 650
 651        bio_list_init(&bios);
 652
 653        spin_lock_irqsave(&m->lock, flags);
 654
 655        if (bio_list_empty(&m->queued_bios)) {
 656                spin_unlock_irqrestore(&m->lock, flags);
 657                return;
 658        }
 659
 660        bio_list_merge(&bios, &m->queued_bios);
 661        bio_list_init(&m->queued_bios);
 662
 663        spin_unlock_irqrestore(&m->lock, flags);
 664
 665        blk_start_plug(&plug);
 666        while ((bio = bio_list_pop(&bios))) {
 667                struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
 668                dm_bio_restore(get_bio_details_from_mpio(mpio), bio);
 669                r = __multipath_map_bio(m, bio, mpio);
 670                switch (r) {
 671                case DM_MAPIO_KILL:
 672                        bio->bi_status = BLK_STS_IOERR;
 673                        bio_endio(bio);
 674                        break;
 675                case DM_MAPIO_REQUEUE:
 676                        bio->bi_status = BLK_STS_DM_REQUEUE;
 677                        bio_endio(bio);
 678                        break;
 679                case DM_MAPIO_REMAPPED:
 680                        generic_make_request(bio);
 681                        break;
 682                case DM_MAPIO_SUBMITTED:
 683                        break;
 684                default:
 685                        WARN_ONCE(true, "__multipath_map_bio() returned %d\n", r);
 686                }
 687        }
 688        blk_finish_plug(&plug);
 689}
 690
 691/*
 692 * If we run out of usable paths, should we queue I/O or error it?
 693 */
 694static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
 695                            bool save_old_value, const char *caller)
 696{
 697        unsigned long flags;
 698        bool queue_if_no_path_bit, saved_queue_if_no_path_bit;
 699        const char *dm_dev_name = dm_device_name(dm_table_get_md(m->ti->table));
 700
 701        DMDEBUG("%s: %s caller=%s queue_if_no_path=%d save_old_value=%d",
 702                dm_dev_name, __func__, caller, queue_if_no_path, save_old_value);
 703
 704        spin_lock_irqsave(&m->lock, flags);
 705
 706        queue_if_no_path_bit = test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
 707        saved_queue_if_no_path_bit = test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
 708
 709        if (save_old_value) {
 710                if (unlikely(!queue_if_no_path_bit && saved_queue_if_no_path_bit)) {
 711                        DMERR("%s: QIFNP disabled but saved as enabled, saving again loses state, not saving!",
 712                              dm_dev_name);
 713                } else
 714                        assign_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path_bit);
 715        } else if (!queue_if_no_path && saved_queue_if_no_path_bit) {
 716                /* due to "fail_if_no_path" message, need to honor it. */
 717                clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
 718        }
 719        assign_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags, queue_if_no_path);
 720
 721        DMDEBUG("%s: after %s changes; QIFNP = %d; SQIFNP = %d; DNFS = %d",
 722                dm_dev_name, __func__,
 723                test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags),
 724                test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags),
 725                dm_noflush_suspending(m->ti));
 726
 727        spin_unlock_irqrestore(&m->lock, flags);
 728
 729        if (!queue_if_no_path) {
 730                dm_table_run_md_queue_async(m->ti->table);
 731                process_queued_io_list(m);
 732        }
 733
 734        return 0;
 735}
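/*
 * For context, a sketch of how this is normally driven from user space,
 * assuming the usual dm-mpath target messages (handled by the target's
 * message hook further down this file):
 *
 *   dmsetup message <mpath-dev> 0 queue_if_no_path   # keep queueing I/O
 *   dmsetup message <mpath-dev> 0 fail_if_no_path    # error queued I/O
 *
 * The second form is why the saved bit is cleared above when
 * queue_if_no_path is turned off without save_old_value.
 */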
 736
 737/*
 738 * If the queue_if_no_path timeout fires, turn off queue_if_no_path and
 739 * process any queued I/O.
 740 */
 741static void queue_if_no_path_timeout_work(struct timer_list *t)
 742{
 743        struct multipath *m = from_timer(m, t, nopath_timer);
 744        struct mapped_device *md = dm_table_get_md(m->ti->table);
 745
 746        DMWARN("queue_if_no_path timeout on %s, failing queued IO", dm_device_name(md));
 747        queue_if_no_path(m, false, false, __func__);
 748}
 749
 750/*
 751 * Enable the queue_if_no_path timeout if necessary.
 752 * Called with m->lock held.
 753 */
 754static void enable_nopath_timeout(struct multipath *m)
 755{
 756        unsigned long queue_if_no_path_timeout =
 757                READ_ONCE(queue_if_no_path_timeout_secs) * HZ;
 758
 759        lockdep_assert_held(&m->lock);
 760
 761        if (queue_if_no_path_timeout > 0 &&
 762            atomic_read(&m->nr_valid_paths) == 0 &&
 763            test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
 764                mod_timer(&m->nopath_timer,
 765                          jiffies + queue_if_no_path_timeout);
 766        }
 767}
 768
 769static void disable_nopath_timeout(struct multipath *m)
 770{
 771        del_timer_sync(&m->nopath_timer);
 772}
 773
 774/*
 775 * An event is triggered whenever a path is taken out of use.
 776 * Includes path failure and PG bypass.
 777 */
 778static void trigger_event(struct work_struct *work)
 779{
 780        struct multipath *m =
 781                container_of(work, struct multipath, trigger_event);
 782
 783        dm_table_event(m->ti->table);
 784}
 785
 786/*-----------------------------------------------------------------
 787 * Constructor/argument parsing:
 788 * <#multipath feature args> [<arg>]*
 789 * <#hw_handler args> [hw_handler [<arg>]*]
 790 * <#priority groups>
 791 * <initial priority group>
 792 *     [<selector> <#selector args> [<arg>]*
 793 *      <#paths> <#per-path selector args>
 794 *         [<path> [<arg>]* ]+ ]+
 795 *---------------------------------------------------------------*/
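/*
 * Illustrative ctr argument string only (not taken from this file), using
 * the format above and assuming the round-robin path selector with its
 * optional per-path repeat count:
 *
 *   0 0 2 1 round-robin 0 1 1 8:16 100 round-robin 0 1 1 8:32 100
 *
 * i.e. no feature args, no hw_handler args, two priority groups starting
 * with PG 1, each PG holding a single path (8:16 / 8:32) with one
 * per-path selector arg.
 */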
 796static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
 797                               struct dm_target *ti)
 798{
 799        int r;
 800        struct path_selector_type *pst;
 801        unsigned ps_argc;
 802
 803        static const struct dm_arg _args[] = {
 804                {0, 1024, "invalid number of path selector args"},
 805        };
 806
 807        pst = dm_get_path_selector(dm_shift_arg(as));
 808        if (!pst) {
 809                ti->error = "unknown path selector type";
 810                return -EINVAL;
 811        }
 812
 813        r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
 814        if (r) {
 815                dm_put_path_selector(pst);
 816                return -EINVAL;
 817        }
 818
 819        r = pst->create(&pg->ps, ps_argc, as->argv);
 820        if (r) {
 821                dm_put_path_selector(pst);
 822                ti->error = "path selector constructor failed";
 823                return r;
 824        }
 825
 826        pg->ps.type = pst;
 827        dm_consume_args(as, ps_argc);
 828
 829        return 0;
 830}
 831
 832static int setup_scsi_dh(struct block_device *bdev, struct multipath *m,
 833                         const char **attached_handler_name, char **error)
 834{
 835        struct request_queue *q = bdev_get_queue(bdev);
 836        int r;
 837
 838        if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) {
 839retain:
 840                if (*attached_handler_name) {
 841                        /*
 842                         * Clear any hw_handler_params associated with a
 843                         * handler that isn't already attached.
 844                         */
 845                        if (m->hw_handler_name && strcmp(*attached_handler_name, m->hw_handler_name)) {
 846                                kfree(m->hw_handler_params);
 847                                m->hw_handler_params = NULL;
 848                        }
 849
 850                        /*
 851                         * Reset hw_handler_name to match the attached handler
 852                         *
 853                         * NB. This modifies the table line to show the actual
 854                         * handler instead of the original table passed in.
 855                         */
 856                        kfree(m->hw_handler_name);
 857                        m->hw_handler_name = *attached_handler_name;
 858                        *attached_handler_name = NULL;
 859                }
 860        }
 861
 862        if (m->hw_handler_name) {
 863                r = scsi_dh_attach(q, m->hw_handler_name);
 864                if (r == -EBUSY) {
 865                        char b[BDEVNAME_SIZE];
 866
 867                        printk(KERN_INFO "dm-mpath: retaining handler on device %s\n",
 868                               bdevname(bdev, b));
 869                        goto retain;
 870                }
 871                if (r < 0) {
 872                        *error = "error attaching hardware handler";
 873                        return r;
 874                }
 875
 876                if (m->hw_handler_params) {
 877                        r = scsi_dh_set_params(q, m->hw_handler_params);
 878                        if (r < 0) {
 879                                *error = "unable to set hardware handler parameters";
 880                                return r;
 881                        }
 882                }
 883        }
 884
 885        return 0;
 886}
 887
 888static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
 889                                 struct dm_target *ti)
 890{
 891        int r;
 892        struct pgpath *p;
 893        struct multipath *m = ti->private;
 894        struct request_queue *q;
 895        const char *attached_handler_name = NULL;
 896
 897        /* we need at least a path arg */
 898        if (as->argc < 1) {
 899                ti->error = "no device given";
 900                return ERR_PTR(-EINVAL);
 901        }
 902
 903        p = alloc_pgpath();
 904        if (!p)
 905                return ERR_PTR(-ENOMEM);
 906
 907        r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
 908                          &p->path.dev);
 909        if (r) {
 910                ti->error = "error getting device";
 911                goto bad;
 912        }
 913
 914        q = bdev_get_queue(p->path.dev->bdev);
 915        attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
 916        if (attached_handler_name || m->hw_handler_name) {
 917                INIT_DELAYED_WORK(&p->activate_path, activate_path_work);
 918                r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error);
 919                kfree(attached_handler_name);
 920                if (r) {
 921                        dm_put_device(ti, p->path.dev);
 922                        goto bad;
 923                }
 924        }
 925
 926        r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
 927        if (r) {
 928                dm_put_device(ti, p->path.dev);
 929                goto bad;
 930        }
 931
 932        return p;
 933 bad:
 934        free_pgpath(p);
 935        return ERR_PTR(r);
 936}
 937
 938static struct priority_group *parse_priority_group(struct dm_arg_set *as,
 939                                                   struct multipath *m)
 940{
 941        static const struct dm_arg _args[] = {
 942                {1, 1024, "invalid number of paths"},
 943                {0, 1024, "invalid number of selector args"}
 944        };
 945
 946        int r;
 947        unsigned i, nr_selector_args, nr_args;
 948        struct priority_group *pg;
 949        struct dm_target *ti = m->ti;
 950
 951        if (as->argc < 2) {
 952                as->argc = 0;
 953                ti->error = "not enough priority group arguments";
 954                return ERR_PTR(-EINVAL);
 955        }
 956
 957        pg = alloc_priority_group();
 958        if (!pg) {
 959                ti->error = "couldn't allocate priority group";
 960                return ERR_PTR(-ENOMEM);
 961        }
 962        pg->m = m;
 963
 964        r = parse_path_selector(as, pg, ti);
 965        if (r)
 966                goto bad;
 967
 968        /*
 969         * read the paths
 970         */
 971        r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
 972        if (r)
 973                goto bad;
 974
 975        r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
 976        if (r)
 977                goto bad;
 978
 979        nr_args = 1 + nr_selector_args;
 980        for (i = 0; i < pg->nr_pgpaths; i++) {
 981                struct pgpath *pgpath;
 982                struct dm_arg_set path_args;
 983
 984                if (as->argc < nr_args) {
 985                        ti->error = "not enough path parameters";
 986                        r = -EINVAL;
 987                        goto bad;
 988                }
 989
 990                path_args.argc = nr_args;
 991                path_args.argv = as->argv;
 992
 993                pgpath = parse_path(&path_args, &pg->ps, ti);
 994                if (IS_ERR(pgpath)) {
 995                        r = PTR_ERR(pgpath);
 996                        goto bad;
 997                }
 998
 999                pgpath->pg = pg;
1000                list_add_tail(&pgpath->list, &pg->pgpaths);
1001                dm_consume_args(as, nr_args);
1002        }
1003
1004        return pg;
1005
1006 bad:
1007        free_priority_group(pg, ti);
1008        return ERR_PTR(r);
1009}
1010
1011static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
1012{
1013        unsigned hw_argc;
1014        int ret;
1015        struct dm_target *ti = m->ti;
1016
1017        static const struct dm_arg _args[] = {
1018                {0, 1024, "invalid number of hardware handler args"},
1019        };
1020
1021        if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
1022                return -EINVAL;
1023
1024        if (!hw_argc)
1025                return 0;
1026
1027        if (m->queue_mode == DM_TYPE_BIO_BASED) {
1028                dm_consume_args(as, hw_argc);
1029                DMERR("bio-based multipath doesn't allow hardware handler args");
1030                return 0;
1031        }
1032
1033        m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
1034        if (!m->hw_handler_name)
1035                return -EINVAL;
1036
1037        if (hw_argc > 1) {
1038                char *p;
1039                int i, j, len = 4;
1040
1041                for (i = 0; i <= hw_argc - 2; i++)
1042                        len += strlen(as->argv[i]) + 1;
1043                p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
1044                if (!p) {
1045                        ti->error = "memory allocation failed";
1046                        ret = -ENOMEM;
1047                        goto fail;
1048                }
1049                j = sprintf(p, "%d", hw_argc - 1);
1050                for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
1051                        j = sprintf(p, "%s", as->argv[i]);
1052        }
1053        dm_consume_args(as, hw_argc - 1);
1054
1055        return 0;
1056fail:
1057        kfree(m->hw_handler_name);
1058        m->hw_handler_name = NULL;
1059        return ret;
1060}
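/*
 * Illustrative hw_handler argument groups as consumed above, assuming the
 * scsi_dh_alua device handler is available on the system:
 *
 *   0            no hardware handler requested
 *   1 alua       attach scsi_dh_alua, no parameters
 *
 * Any additional parameters are packed into hw_handler_params as a
 * NUL-separated string whose first token is the parameter count, which is
 * later handed to scsi_dh_set_params() by setup_scsi_dh().
 */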
1061
1062static int parse_features(struct dm_arg_set *as, struct multipath *m)
1063{
1064        int r;
1065        unsigned argc;
1066        struct dm_target *ti = m->ti;
1067        const char *arg_name;
1068
1069        static const struct dm_arg _args[] = {
1070                {0, 8, "invalid number of feature args"},
1071                {1, 50, "pg_init_retries must be between 1 and 50"},
1072                {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
1073        };
1074
1075        r = dm_read_arg_group(_args, as, &argc, &ti->error);
1076        if (r)
1077                return -EINVAL;
1078
1079        if (!argc)
1080                return 0;
1081
1082        do {
1083                arg_name = dm_shift_arg(as);
1084                argc--;
1085
1086                if (!strcasecmp(arg_name, "queue_if_no_path")) {
1087                        r = queue_if_no_path(m, true, false, __func__);
1088                        continue;
1089                }
1090
1091                if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
1092                        set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
1093                        continue;
1094                }
1095
1096                if (!strcasecmp(arg_name, "pg_init_retries") &&
1097                    (argc >= 1)) {
1098                        r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
1099                        argc--;
1100                        continue;
1101                }
1102
1103                if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
1104                    (argc >= 1)) {
1105                        r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
1106                        argc--;
1107                        continue;
1108                }
1109
1110                if (!strcasecmp(arg_name, "queue_mode") &&
1111                    (argc >= 1)) {
1112                        const char *queue_mode_name = dm_shift_arg(as);
1113
1114                        if (!strcasecmp(queue_mode_name, "bio"))
1115                                m->queue_mode = DM_TYPE_BIO_BASED;
1116                        else if (!strcasecmp(queue_mode_name, "rq") ||
1117                                 !strcasecmp(queue_mode_name, "mq"))
1118                                m->queue_mode = DM_TYPE_REQUEST_BASED;
1119                        else {
1120                                ti->error = "Unknown 'queue_mode' requested";
1121                                r = -EINVAL;
1122                        }
1123                        argc--;
1124                        continue;
1125                }
1126
1127                ti->error = "Unrecognised multipath feature request";
1128                r = -EINVAL;
1129        } while (argc && !r);
1130
1131        return r;
1132}
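/*
 * Illustrative feature argument groups accepted by parse_features(), the
 * leading count being the number of words that follow:
 *
 *   0
 *   1 queue_if_no_path
 *   3 queue_if_no_path pg_init_retries 5
 *   2 queue_mode bio
 */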
1133
1134static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
1135{
1136        /* target arguments */
1137        static const struct dm_arg _args[] = {
1138                {0, 1024, "invalid number of priority groups"},
1139                {0, 1024, "invalid initial priority group number"},
1140        };
1141
1142        int r;
1143        struct multipath *m;
1144        struct dm_arg_set as;
1145        unsigned pg_count = 0;
1146        unsigned next_pg_num;
1147        unsigned long flags;
1148
1149        as.argc = argc;
1150        as.argv = argv;
1151
1152        m = alloc_multipath(ti);
1153        if (!m) {
1154                ti->error = "can't allocate multipath";
1155                return -EINVAL;
1156        }
1157
1158        r = parse_features(&as, m);
1159        if (r)
1160                goto bad;
1161
1162        r = alloc_multipath_stage2(ti, m);
1163        if (r)
1164                goto bad;
1165
1166        r = parse_hw_handler(&as, m);
1167        if (r)
1168                goto bad;
1169
1170        r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
1171        if (r)
1172                goto bad;
1173
1174        r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
1175        if (r)
1176                goto bad;
1177
1178        if ((!m->nr_priority_groups && next_pg_num) ||
1179            (m->nr_priority_groups && !next_pg_num)) {
1180                ti->error = "invalid initial priority group";
1181                r = -EINVAL;
1182                goto bad;
1183        }
1184
1185        /* parse the priority groups */
1186        while (as.argc) {
1187                struct priority_group *pg;
1188                unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths);
1189
1190                pg = parse_priority_group(&as, m);
1191                if (IS_ERR(pg)) {
1192                        r = PTR_ERR(pg);
1193                        goto bad;
1194                }
1195
1196                nr_valid_paths += pg->nr_pgpaths;
1197                atomic_set(&m->nr_valid_paths, nr_valid_paths);
1198
1199                list_add_tail(&pg->list, &m->priority_groups);
1200                pg_count++;
1201                pg->pg_num = pg_count;
1202                if (!--next_pg_num)
1203                        m->next_pg = pg;
1204        }
1205
1206        if (pg_count != m->nr_priority_groups) {
1207                ti->error = "priority group count mismatch";
1208                r = -EINVAL;
1209                goto bad;
1210        }
1211
1212        spin_lock_irqsave(&m->lock, flags);
1213        enable_nopath_timeout(m);
1214        spin_unlock_irqrestore(&m->lock, flags);
1215
1216        ti->num_flush_bios = 1;
1217        ti->num_discard_bios = 1;
1218        ti->num_write_same_bios = 1;
1219        ti->num_write_zeroes_bios = 1;
1220        if (m->queue_mode == DM_TYPE_BIO_BASED)
1221                ti->per_io_data_size = multipath_per_bio_data_size();
1222        else
1223                ti->per_io_data_size = sizeof(struct dm_mpath_io);
1224
1225        return 0;
1226
1227 bad:
1228        free_multipath(m);
1229        return r;
1230}
1231
1232static void multipath_wait_for_pg_init_completion(struct multipath *m)
1233{
1234        DEFINE_WAIT(wait);
1235
1236        while (1) {
1237                prepare_to_wait(&m->pg_init_wait, &wait, TASK_UNINTERRUPTIBLE);
1238
1239                if (!atomic_read(&m->pg_init_in_progress))
1240                        break;
1241
1242                io_schedule();
1243        }
1244        finish_wait(&m->pg_init_wait, &wait);
1245}
1246
1247static void flush_multipath_work(struct multipath *m)
1248{
1249        if (m->hw_handler_name) {
1250                set_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
1251                smp_mb__after_atomic();
1252
1253                if (atomic_read(&m->pg_init_in_progress))
1254                        flush_workqueue(kmpath_handlerd);
1255                multipath_wait_for_pg_init_completion(m);
1256
1257                clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
1258                smp_mb__after_atomic();
1259        }
1260
1261        if (m->queue_mode == DM_TYPE_BIO_BASED)
1262                flush_work(&m->process_queued_bios);
1263        flush_work(&m->trigger_event);
1264}
1265
1266static void multipath_dtr(struct dm_target *ti)
1267{
1268        struct multipath *m = ti->private;
1269
1270        disable_nopath_timeout(m);
1271        flush_multipath_work(m);
1272        free_multipath(m);
1273}
1274
1275/*
1276 * Take a path out of use.
1277 */
1278static int fail_path(struct pgpath *pgpath)
1279{
1280        unsigned long flags;
1281        struct multipath *m = pgpath->pg->m;
1282
1283        spin_lock_irqsave(&m->lock, flags);
1284
1285        if (!pgpath->is_active)
1286                goto out;
1287
1288        DMWARN("%s: Failing path %s.",
1289               dm_device_name(dm_table_get_md(m->ti->table)),
1290               pgpath->path.dev->name);
1291
1292        pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
1293        pgpath->is_active = false;
1294        pgpath->fail_count++;
1295
1296        atomic_dec(&m->nr_valid_paths);
1297
1298        if (pgpath == m->current_pgpath)
1299                m->current_pgpath = NULL;
1300
1301        dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
1302                       pgpath->path.dev->name, atomic_read(&m->nr_valid_paths));
1303
1304        schedule_work(&m->trigger_event);
1305
1306        enable_nopath_timeout(m);
1307
1308out:
1309        spin_unlock_irqrestore(&m->lock, flags);
1310
1311        return 0;
1312}
1313
1314/*
1315 * Reinstate a previously-failed path
1316 */
1317static int reinstate_path(struct pgpath *pgpath)
1318{
1319        int r = 0, run_queue = 0;
1320        unsigned long flags;
1321        struct multipath *m = pgpath->pg->m;
1322        unsigned nr_valid_paths;
1323
1324        spin_lock_irqsave(&m->lock, flags);
1325
1326        if (pgpath->is_active)
1327                goto out;
1328
1329        DMWARN("%s: Reinstating path %s.",
1330               dm_device_name(dm_table_get_md(m->ti->table)),
1331               pgpath->path.dev->name);
1332
1333        r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
1334        if (r)
1335                goto out;
1336
1337        pgpath->is_active = true;
1338
1339        nr_valid_paths = atomic_inc_return(&m->nr_valid_paths);
1340        if (nr_valid_paths == 1) {
1341                m->current_pgpath = NULL;
1342                run_queue = 1;
1343        } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1344                if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1345                        atomic_inc(&m->pg_init_in_progress);
1346        }
1347
1348        dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
1349                       pgpath->path.dev->name, nr_valid_paths);
1350
1351        schedule_work(&m->trigger_event);
1352
1353out:
1354        spin_unlock_irqrestore(&m->lock, flags);
1355        if (run_queue) {
1356                dm_table_run_md_queue_async(m->ti->table);
1357                process_queued_io_list(m);
1358        }
1359
1360        if (pgpath->is_active)
1361                disable_nopath_timeout(m);
1362
1363        return r;
1364}
1365
1366/*
1367 * Fail or reinstate all paths that match the provided struct dm_dev.
1368 */
1369static int action_dev(struct multipath *m, struct dm_dev *dev,
1370                      action_fn action)
1371{
1372        int r = -EINVAL;
1373        struct pgpath *pgpath;
1374        struct priority_group *pg;
1375
1376        list_for_each_entry(pg, &m->priority_groups, list) {
1377                list_for_each_entry(pgpath, &pg->pgpaths, list) {
1378                        if (pgpath->path.dev == dev)
1379                                r = action(pgpath);
1380                }
1381        }
1382
1383        return r;
1384}
1385
1386/*
1387 * Temporarily try to avoid having to use the specified PG
1388 */
1389static void bypass_pg(struct multipath *m, struct priority_group *pg,
1390                      bool bypassed)
1391{
1392        unsigned long flags;
1393
1394        spin_lock_irqsave(&m->lock, flags);
1395
1396        pg->bypassed = bypassed;
1397        m->current_pgpath = NULL;
1398        m->current_pg = NULL;
1399
1400        spin_unlock_irqrestore(&m->lock, flags);
1401
1402        schedule_work(&m->trigger_event);
1403}
1404
1405/*
1406 * Switch to using the specified PG from the next I/O that gets mapped
1407 */
1408static int switch_pg_num(struct multipath *m, const char *pgstr)
1409{
1410        struct priority_group *pg;
1411        unsigned pgnum;
1412        unsigned long flags;
1413        char dummy;
1414
1415        if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1416            !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
1417                DMWARN("invalid PG number supplied to switch_pg_num");
1418                return -EINVAL;
1419        }
1420
1421        spin_lock_irqsave(&m->lock, flags);
1422        list_for_each_entry(pg, &m->priority_groups, list) {
1423                pg->bypassed = false;
1424                if (--pgnum)
1425                        continue;
1426
1427                m->current_pgpath = NULL;
1428                m->current_pg = NULL;
1429                m->next_pg = pg;
1430        }
1431        spin_unlock_irqrestore(&m->lock, flags);
1432
1433        schedule_work(&m->trigger_event);
1434        return 0;
1435}
1436
1437/*
1438 * Set/clear bypassed status of a PG.
1439 * PGs are numbered upwards from 1 in the order they were declared.
1440 */
1441static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
1442{
1443        struct priority_group *pg;
1444        unsigned pgnum;
1445        char dummy;
1446
1447        if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1448            !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
1449                DMWARN("invalid PG number supplied to bypass_pg");
1450                return -EINVAL;
1451        }
1452
1453        list_for_each_entry(pg, &m->priority_groups, list) {
1454                if (!--pgnum)
1455                        break;
1456        }
1457
1458        bypass_pg(m, pg, bypassed);
1459        return 0;
1460}
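/*
 * switch_pg_num() and bypass_pg_num() back the PG-related target messages
 * (handled elsewhere in this file); an illustrative user-space sequence,
 * assuming the usual message names, would be:
 *
 *   dmsetup message <mpath-dev> 0 disable_group 2   # bypass_pg_num(.., true)
 *   dmsetup message <mpath-dev> 0 enable_group 2    # bypass_pg_num(.., false)
 *   dmsetup message <mpath-dev> 0 switch_group 1    # switch_pg_num()
 */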
1461
1462/*
1463 * Should we retry pg_init immediately?
1464 */
1465static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1466{
1467        unsigned long flags;
1468        bool limit_reached = false;
1469
1470        spin_lock_irqsave(&m->lock, flags);
1471
1472        if (atomic_read(&m->pg_init_count) <= m->pg_init_retries &&
1473            !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
1474                set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
1475        else
1476                limit_reached = true;
1477
1478        spin_unlock_irqrestore(&m->lock, flags);
1479
1480        return limit_reached;
1481}
1482
1483static void pg_init_done(void *data, int errors)
1484{
1485        struct pgpath *pgpath = data;
1486        struct priority_group *pg = pgpath->pg;
1487        struct multipath *m = pg->m;
1488        unsigned long flags;
1489        bool delay_retry = false;
1490
1491        /* device or driver problems */
1492        switch (errors) {
1493        case SCSI_DH_OK:
1494                break;
1495        case SCSI_DH_NOSYS:
1496                if (!m->hw_handler_name) {
1497                        errors = 0;
1498                        break;
1499                }
1500                DMERR("Could not failover the device: Handler scsi_dh_%s "
1501                      "Error %d.", m->hw_handler_name, errors);
1502                /*
1503                 * Fail path for now, so we do not ping pong
1504                 */
1505                fail_path(pgpath);
1506                break;
1507        case SCSI_DH_DEV_TEMP_BUSY:
1508                /*
1509                 * Probably doing something like FW upgrade on the
1510                 * controller so try the other pg.
1511                 */
1512                bypass_pg(m, pg, true);
1513                break;
1514        case SCSI_DH_RETRY:
1515                /* Wait before retrying. */
1516                delay_retry = true;
1517                /* fall through */
1518        case SCSI_DH_IMM_RETRY:
1519        case SCSI_DH_RES_TEMP_UNAVAIL:
1520                if (pg_init_limit_reached(m, pgpath))
1521                        fail_path(pgpath);
1522                errors = 0;
1523                break;
1524        case SCSI_DH_DEV_OFFLINED:
1525        default:
1526                /*
1527                 * We probably do not want to fail the path for a device
1528                 * error, but this is what the old dm did. In future
1529                 * patches we can do more advanced handling.
1530                 */
1531                fail_path(pgpath);
1532        }
1533
1534        spin_lock_irqsave(&m->lock, flags);
1535        if (errors) {
1536                if (pgpath == m->current_pgpath) {
1537                        DMERR("Could not failover device. Error %d.", errors);
1538                        m->current_pgpath = NULL;
1539                        m->current_pg = NULL;
1540                }
1541        } else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
1542                pg->bypassed = false;
1543
1544        if (atomic_dec_return(&m->pg_init_in_progress) > 0)
1545                /* Activations of other paths are still on going */
1546                goto out;
1547
1548        if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
1549                if (delay_retry)
1550                        set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
1551                else
1552                        clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
1553
1554                if (__pg_init_all_paths(m))
1555                        goto out;
1556        }
1557        clear_bit(MPATHF_QUEUE_IO, &m->flags);
1558
1559        process_queued_io_list(m);
1560
1561        /*
1562         * Wake up any thread waiting to suspend.
1563         */
1564        wake_up(&m->pg_init_wait);
1565
1566out:
1567        spin_unlock_irqrestore(&m->lock, flags);
1568}
1569
1570static void activate_or_offline_path(struct pgpath *pgpath)
1571{
1572        struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1573
1574        if (pgpath->is_active && !blk_queue_dying(q))
1575                scsi_dh_activate(q, pg_init_done, pgpath);
1576        else
1577                pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
1578}
1579
1580static void activate_path_work(struct work_struct *work)
1581{
1582        struct pgpath *pgpath =
1583                container_of(work, struct pgpath, activate_path.work);
1584
1585        activate_or_offline_path(pgpath);
1586}
1587
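/*
 * Request-based completion hook. On a path error the current path is
 * failed and dm core is asked to requeue the original request
 * (DM_ENDIO_DELAY_REQUEUE for BLK_STS_RESOURCE, DM_ENDIO_REQUEUE
 * otherwise); if no valid path remains and we must not push back, the
 * original error is returned to the caller instead.
 */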
1588static int multipath_end_io(struct dm_target *ti, struct request *clone,
1589                            blk_status_t error, union map_info *map_context)
1590{
1591        struct dm_mpath_io *mpio = get_mpio(map_context);
1592        struct pgpath *pgpath = mpio->pgpath;
1593        int r = DM_ENDIO_DONE;
1594
1595        /*
1596         * We don't queue any clone request inside the multipath target
1597         * during end I/O handling, since those clone requests don't have
1598         * bio clones.  If we queued them inside the multipath target,
1599         * we would need to make bio clones, which requires memory allocation.
1600         * (See drivers/md/dm-rq.c:end_clone_bio() for why the clone requests
1601         *  don't have bio clones.)
1602         * Instead of queueing the clone request here, we queue the original
1603         * request into dm core, which will remake a clone request with
1604         * bio clones of its own and resubmit it later.
1605         */
1606        if (error && blk_path_error(error)) {
1607                struct multipath *m = ti->private;
1608
1609                if (error == BLK_STS_RESOURCE)
1610                        r = DM_ENDIO_DELAY_REQUEUE;
1611                else
1612                        r = DM_ENDIO_REQUEUE;
1613
1614                if (pgpath)
1615                        fail_path(pgpath);
1616
1617                if (atomic_read(&m->nr_valid_paths) == 0 &&
1618                    !must_push_back_rq(m)) {
1619                        if (error == BLK_STS_IOERR)
1620                                dm_report_EIO(m);
1621                        /* complete with the original error */
1622                        r = DM_ENDIO_DONE;
1623                }
1624        }
1625
1626        if (pgpath) {
1627                struct path_selector *ps = &pgpath->pg->ps;
1628
1629                if (ps->type->end_io)
1630                        ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
1631                                         clone->io_start_time_ns);
1632        }
1633
1634        return r;
1635}
1636
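/*
 * Bio-based completion hook. On a path error the failed path is marked
 * down and, while usable paths (or queue_if_no_path) remain, the bio clone
 * is added to m->queued_bios and the process_queued_bios work is kicked so
 * it can be retried on another path; DM_ENDIO_INCOMPLETE tells dm core
 * that the target still owns the bio.
 */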
1637static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
1638                                blk_status_t *error)
1639{
1640        struct multipath *m = ti->private;
1641        struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
1642        struct pgpath *pgpath = mpio->pgpath;
1643        unsigned long flags;
1644        int r = DM_ENDIO_DONE;
1645
1646        if (!*error || !blk_path_error(*error))
1647                goto done;
1648
1649        if (pgpath)
1650                fail_path(pgpath);
1651
1652        if (atomic_read(&m->nr_valid_paths) == 0 &&
1653            !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
1654                if (__must_push_back(m)) {
1655                        r = DM_ENDIO_REQUEUE;
1656                } else {
1657                        dm_report_EIO(m);
1658                        *error = BLK_STS_IOERR;
1659                }
1660                goto done;
1661        }
1662
1663        spin_lock_irqsave(&m->lock, flags);
1664        bio_list_add(&m->queued_bios, clone);
1665        spin_unlock_irqrestore(&m->lock, flags);
1666        if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
1667                queue_work(kmultipathd, &m->process_queued_bios);
1668
1669        r = DM_ENDIO_INCOMPLETE;
1670done:
1671        if (pgpath) {
1672                struct path_selector *ps = &pgpath->pg->ps;
1673
1674                if (ps->type->end_io)
1675                        ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
1676                                         dm_start_time_ns_from_clone(clone));
1677        }
1678
1679        return r;
1680}
1681
1682/*
1683 * A suspend with flush can't complete until all the I/O is processed,
1684 * so if the last path fails we must error any remaining I/O.
1685 * - Note that if freeze_bdev fails while suspending, the queue_if_no_path
1686 *   state is lost - userspace should reset it (example below).
1687 * Otherwise, during a noflush suspend, queue_if_no_path does not change.
1688 */
1689static void multipath_presuspend(struct dm_target *ti)
1690{
1691        struct multipath *m = ti->private;
1692
1693        /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */
1694        if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti))
1695                queue_if_no_path(m, false, true, __func__);
1696}
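/*
 * Example (illustrative, device name assumed): if queue_if_no_path was lost
 * across a flush suspend, userspace can restore it with a target message,
 * handled by multipath_message() below:
 *
 *	dmsetup message mpatha 0 queue_if_no_path
 */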
1697
1698static void multipath_postsuspend(struct dm_target *ti)
1699{
1700        struct multipath *m = ti->private;
1701
1702        mutex_lock(&m->work_mutex);
1703        flush_multipath_work(m);
1704        mutex_unlock(&m->work_mutex);
1705}
1706
1707/*
1708 * Restore the queue_if_no_path setting.
1709 */
1710static void multipath_resume(struct dm_target *ti)
1711{
1712        struct multipath *m = ti->private;
1713        unsigned long flags;
1714
1715        spin_lock_irqsave(&m->lock, flags);
1716        if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) {
1717                set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
1718                clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
1719        }
1720
1721        DMDEBUG("%s: %s finished; QIFNP = %d; SQIFNP = %d",
1722                dm_device_name(dm_table_get_md(m->ti->table)), __func__,
1723                test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags),
1724                test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags));
1725
1726        spin_unlock_irqrestore(&m->lock, flags);
1727}
1728
1729/*
1730 * Info output has the following format:
1731 * num_multipath_feature_args [multipath_feature_args]*
1732 * num_handler_status_args [handler_status_args]*
1733 * num_groups init_group_number
1734 *            [A|D|E num_ps_status_args [ps_status_args]*
1735 *             num_paths num_selector_args
1736 *             [path_dev A|F fail_count [selector_args]* ]+ ]+
1737 *
1738 * Table output has the following format (identical to the constructor string):
1739 * num_feature_args [features_args]*
1740 * num_handler_args hw_handler [hw_handler_args]*
1741 * num_groups init_group_number
1742 *     [priority selector-name num_ps_args [ps_args]*
1743 *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1744 */
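/*
 * For illustration only (the device size, path numbers and selector below
 * are assumed, not taken from this file): a two-path map with no features
 * and no hardware handler, using the round-robin selector, might appear in
 * "dmsetup table" output as
 *
 *	0 204800 multipath 0 0 1 1 round-robin 0 2 1 8:16 1 8:32 1
 *
 * i.e. 0 feature args, 0 handler args, 1 group, initial group 1, selector
 * "round-robin" with 0 args, and 2 paths with 1 selector arg (the repeat
 * count) each.
 */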
1745static void multipath_status(struct dm_target *ti, status_type_t type,
1746                             unsigned status_flags, char *result, unsigned maxlen)
1747{
1748        int sz = 0;
1749        unsigned long flags;
1750        struct multipath *m = ti->private;
1751        struct priority_group *pg;
1752        struct pgpath *p;
1753        unsigned pg_num;
1754        char state;
1755
1756        spin_lock_irqsave(&m->lock, flags);
1757
1758        /* Features */
1759        if (type == STATUSTYPE_INFO)
1760                DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags),
1761                       atomic_read(&m->pg_init_count));
1762        else {
1763                DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
1764                              (m->pg_init_retries > 0) * 2 +
1765                              (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
1766                              test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) +
1767                              (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2);
1768
1769                if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1770                        DMEMIT("queue_if_no_path ");
1771                if (m->pg_init_retries)
1772                        DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1773                if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1774                        DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1775                if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
1776                        DMEMIT("retain_attached_hw_handler ");
1777                if (m->queue_mode != DM_TYPE_REQUEST_BASED) {
1778                        switch (m->queue_mode) {
1779                        case DM_TYPE_BIO_BASED:
1780                                DMEMIT("queue_mode bio ");
1781                                break;
1782                        default:
1783                                WARN_ON_ONCE(true);
1784                                break;
1785                        }
1786                }
1787        }
1788
1789        if (!m->hw_handler_name || type == STATUSTYPE_INFO)
1790                DMEMIT("0 ");
1791        else
1792                DMEMIT("1 %s ", m->hw_handler_name);
1793
1794        DMEMIT("%u ", m->nr_priority_groups);
1795
1796        if (m->next_pg)
1797                pg_num = m->next_pg->pg_num;
1798        else if (m->current_pg)
1799                pg_num = m->current_pg->pg_num;
1800        else
1801                pg_num = (m->nr_priority_groups ? 1 : 0);
1802
1803        DMEMIT("%u ", pg_num);
1804
1805        switch (type) {
1806        case STATUSTYPE_INFO:
1807                list_for_each_entry(pg, &m->priority_groups, list) {
1808                        if (pg->bypassed)
1809                                state = 'D';    /* Disabled */
1810                        else if (pg == m->current_pg)
1811                                state = 'A';    /* Currently Active */
1812                        else
1813                                state = 'E';    /* Enabled */
1814
1815                        DMEMIT("%c ", state);
1816
1817                        if (pg->ps.type->status)
1818                                sz += pg->ps.type->status(&pg->ps, NULL, type,
1819                                                          result + sz,
1820                                                          maxlen - sz);
1821                        else
1822                                DMEMIT("0 ");
1823
1824                        DMEMIT("%u %u ", pg->nr_pgpaths,
1825                               pg->ps.type->info_args);
1826
1827                        list_for_each_entry(p, &pg->pgpaths, list) {
1828                                DMEMIT("%s %s %u ", p->path.dev->name,
1829                                       p->is_active ? "A" : "F",
1830                                       p->fail_count);
1831                                if (pg->ps.type->status)
1832                                        sz += pg->ps.type->status(&pg->ps,
1833                                              &p->path, type, result + sz,
1834                                              maxlen - sz);
1835                        }
1836                }
1837                break;
1838
1839        case STATUSTYPE_TABLE:
1840                list_for_each_entry(pg, &m->priority_groups, list) {
1841                        DMEMIT("%s ", pg->ps.type->name);
1842
1843                        if (pg->ps.type->status)
1844                                sz += pg->ps.type->status(&pg->ps, NULL, type,
1845                                                          result + sz,
1846                                                          maxlen - sz);
1847                        else
1848                                DMEMIT("0 ");
1849
1850                        DMEMIT("%u %u ", pg->nr_pgpaths,
1851                               pg->ps.type->table_args);
1852
1853                        list_for_each_entry(p, &pg->pgpaths, list) {
1854                                DMEMIT("%s ", p->path.dev->name);
1855                                if (pg->ps.type->status)
1856                                        sz += pg->ps.type->status(&pg->ps,
1857                                              &p->path, type, result + sz,
1858                                              maxlen - sz);
1859                        }
1860                }
1861                break;
1862        }
1863
1864        spin_unlock_irqrestore(&m->lock, flags);
1865}
1866
1867static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
1868                             char *result, unsigned maxlen)
1869{
1870        int r = -EINVAL;
1871        struct dm_dev *dev;
1872        struct multipath *m = ti->private;
1873        action_fn action;
1874        unsigned long flags;
1875
1876        mutex_lock(&m->work_mutex);
1877
1878        if (dm_suspended(ti)) {
1879                r = -EBUSY;
1880                goto out;
1881        }
1882
1883        if (argc == 1) {
1884                if (!strcasecmp(argv[0], "queue_if_no_path")) {
1885                        r = queue_if_no_path(m, true, false, __func__);
1886                        spin_lock_irqsave(&m->lock, flags);
1887                        enable_nopath_timeout(m);
1888                        spin_unlock_irqrestore(&m->lock, flags);
1889                        goto out;
1890                } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
1891                        r = queue_if_no_path(m, false, false, __func__);
1892                        disable_nopath_timeout(m);
1893                        goto out;
1894                }
1895        }
1896
1897        if (argc != 2) {
1898                DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
1899                goto out;
1900        }
1901
1902        if (!strcasecmp(argv[0], "disable_group")) {
1903                r = bypass_pg_num(m, argv[1], true);
1904                goto out;
1905        } else if (!strcasecmp(argv[0], "enable_group")) {
1906                r = bypass_pg_num(m, argv[1], false);
1907                goto out;
1908        } else if (!strcasecmp(argv[0], "switch_group")) {
1909                r = switch_pg_num(m, argv[1]);
1910                goto out;
1911        } else if (!strcasecmp(argv[0], "reinstate_path"))
1912                action = reinstate_path;
1913        else if (!strcasecmp(argv[0], "fail_path"))
1914                action = fail_path;
1915        else {
1916                DMWARN("Unrecognised multipath message received: %s", argv[0]);
1917                goto out;
1918        }
1919
1920        r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
1921        if (r) {
1922                DMWARN("message: error getting device %s",
1923                       argv[1]);
1924                goto out;
1925        }
1926
1927        r = action_dev(m, dev, action);
1928
1929        dm_put_device(ti, dev);
1930
1931out:
1932        mutex_unlock(&m->work_mutex);
1933        return r;
1934}
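/*
 * Example messages (illustrative; device and path names are assumed):
 *
 *	dmsetup message mpatha 0 fail_path /dev/sdb
 *	dmsetup message mpatha 0 reinstate_path /dev/sdb
 *	dmsetup message mpatha 0 switch_group 2
 *	dmsetup message mpatha 0 queue_if_no_path
 */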
1935
1936static int multipath_prepare_ioctl(struct dm_target *ti,
1937                                   struct block_device **bdev)
1938{
1939        struct multipath *m = ti->private;
1940        struct pgpath *current_pgpath;
1941        int r;
1942
1943        current_pgpath = READ_ONCE(m->current_pgpath);
1944        if (!current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
1945                current_pgpath = choose_pgpath(m, 0);
1946
1947        if (current_pgpath) {
1948                if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) {
1949                        *bdev = current_pgpath->path.dev->bdev;
1950                        r = 0;
1951                } else {
1952                        /* pg_init has not started or completed */
1953                        r = -ENOTCONN;
1954                }
1955        } else {
1956                /* No path is available */
1957                if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
1958                        r = -ENOTCONN;
1959                else
1960                        r = -EIO;
1961        }
1962
1963        if (r == -ENOTCONN) {
1964                if (!READ_ONCE(m->current_pg)) {
1965                        /* Path status changed, redo selection */
1966                        (void) choose_pgpath(m, 0);
1967                }
1968                if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
1969                        pg_init_all_paths(m);
1970                dm_table_run_md_queue_async(m->ti->table);
1971                process_queued_io_list(m);
1972        }
1973
1974        /*
1975         * Only pass ioctls through if the device sizes match exactly.
1976         */
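        /*
         * (A positive return value tells dm core that this target does not
         * span the whole underlying device, so dm core restricts pass-through
         * of the ioctl - see dm_blk_ioctl().)
         */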
1977        if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
1978                return 1;
1979        return r;
1980}
1981
1982static int multipath_iterate_devices(struct dm_target *ti,
1983                                     iterate_devices_callout_fn fn, void *data)
1984{
1985        struct multipath *m = ti->private;
1986        struct priority_group *pg;
1987        struct pgpath *p;
1988        int ret = 0;
1989
1990        list_for_each_entry(pg, &m->priority_groups, list) {
1991                list_for_each_entry(p, &pg->pgpaths, list) {
1992                        ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
1993                        if (ret)
1994                                goto out;
1995                }
1996        }
1997
1998out:
1999        return ret;
2000}
2001
2002static int pgpath_busy(struct pgpath *pgpath)
2003{
2004        struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
2005
2006        return blk_lld_busy(q);
2007}
2008
2009/*
2010 * We return "busy" only when we can map I/Os but the underlying devices
2011 * are busy (so even if we mapped I/Os now, the I/Os would wait on
2012 * the underlying queue).
2013 * In other words, if we want to kill I/Os or queue them inside the
2014 * target because no path can be mapped, we don't return "busy".
2015 * Otherwise, dm core won't give us the I/Os and we can't do what we want.
2016 */
2017static int multipath_busy(struct dm_target *ti)
2018{
2019        bool busy = false, has_active = false;
2020        struct multipath *m = ti->private;
2021        struct priority_group *pg, *next_pg;
2022        struct pgpath *pgpath;
2023
2024        /* pg_init in progress */
2025        if (atomic_read(&m->pg_init_in_progress))
2026                return true;
2027
2028        /* No paths available; for blk-mq, rely on I/O mapping to delay the requeue */
2029        if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
2030                return (m->queue_mode != DM_TYPE_REQUEST_BASED);
2031
2032        /* Guess which priority_group will be used at next mapping time */
2033        pg = READ_ONCE(m->current_pg);
2034        next_pg = READ_ONCE(m->next_pg);
2035        if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg))
2036                pg = next_pg;
2037
2038        if (!pg) {
2039                /*
2040                 * We don't know which pg will be used at the next mapping time.
2041                 * We don't call choose_pgpath() here to avoid triggering
2042                 * pg_init just for a busy check, so we don't know whether the
2043                 * underlying devices we will be using at the next mapping time
2044                 * are busy or not. Just try mapping.
2045                 */
2046                return busy;
2047        }
2048
2049        /*
2050         * If there is at least one non-busy active path, the path selector
2051         * will be able to select it, so we consider such a pg not busy.
2052         */
2053        busy = true;
2054        list_for_each_entry(pgpath, &pg->pgpaths, list) {
2055                if (pgpath->is_active) {
2056                        has_active = true;
2057                        if (!pgpath_busy(pgpath)) {
2058                                busy = false;
2059                                break;
2060                        }
2061                }
2062        }
2063
2064        if (!has_active) {
2065                /*
2066                 * There is no active path in this pg, so it won't be used
2067                 * and current_pg will be changed at the next mapping time.
2068                 * We need to try mapping to determine that.
2069                 */
2070                busy = false;
2071        }
2072
2073        return busy;
2074}
2075
2076/*-----------------------------------------------------------------
2077 * Module setup
2078 *---------------------------------------------------------------*/
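/*
 * Both the request-based hooks (clone_and_map_rq/rq_end_io) and the
 * bio-based hooks (map/end_io) are wired up here; which set dm core
 * actually uses depends on the queue_mode chosen when the table was
 * constructed.
 */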
2079static struct target_type multipath_target = {
2080        .name = "multipath",
2081        .version = {1, 14, 0},
2082        .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE |
2083                    DM_TARGET_PASSES_INTEGRITY,
2084        .module = THIS_MODULE,
2085        .ctr = multipath_ctr,
2086        .dtr = multipath_dtr,
2087        .clone_and_map_rq = multipath_clone_and_map,
2088        .release_clone_rq = multipath_release_clone,
2089        .rq_end_io = multipath_end_io,
2090        .map = multipath_map_bio,
2091        .end_io = multipath_end_io_bio,
2092        .presuspend = multipath_presuspend,
2093        .postsuspend = multipath_postsuspend,
2094        .resume = multipath_resume,
2095        .status = multipath_status,
2096        .message = multipath_message,
2097        .prepare_ioctl = multipath_prepare_ioctl,
2098        .iterate_devices = multipath_iterate_devices,
2099        .busy = multipath_busy,
2100};
2101
2102static int __init dm_multipath_init(void)
2103{
2104        int r;
2105
2106        kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
2107        if (!kmultipathd) {
2108                DMERR("failed to create workqueue kmpathd");
2109                r = -ENOMEM;
2110                goto bad_alloc_kmultipathd;
2111        }
2112
2113        /*
2114         * A separate workqueue is used for the device handlers to avoid
2115         * overloading the existing workqueue. Overloading the existing
2116         * workqueue would also create a bottleneck in the storage
2117         * hardware's device-activation path.
2118         */
2119        kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
2120                                                  WQ_MEM_RECLAIM);
2121        if (!kmpath_handlerd) {
2122                DMERR("failed to create workqueue kmpath_handlerd");
2123                r = -ENOMEM;
2124                goto bad_alloc_kmpath_handlerd;
2125        }
2126
2127        r = dm_register_target(&multipath_target);
2128        if (r < 0) {
2129                DMERR("request-based register failed %d", r);
2130                r = -EINVAL;
2131                goto bad_register_target;
2132        }
2133
2134        return 0;
2135
2136bad_register_target:
2137        destroy_workqueue(kmpath_handlerd);
2138bad_alloc_kmpath_handlerd:
2139        destroy_workqueue(kmultipathd);
2140bad_alloc_kmultipathd:
2141        return r;
2142}
2143
2144static void __exit dm_multipath_exit(void)
2145{
2146        destroy_workqueue(kmpath_handlerd);
2147        destroy_workqueue(kmultipathd);
2148
2149        dm_unregister_target(&multipath_target);
2150}
2151
2152module_init(dm_multipath_init);
2153module_exit(dm_multipath_exit);
2154
2155module_param_named(queue_if_no_path_timeout_secs,
2156                   queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR);
2157MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds");
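/*
 * Illustrative usage (module name assumed to be dm_multipath): the timeout
 * can be changed at runtime via sysfs, e.g.
 *
 *	echo 120 > /sys/module/dm_multipath/parameters/queue_if_no_path_timeout_secs
 */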
2158
2159MODULE_DESCRIPTION(DM_NAME " multipath target");
2160MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
2161MODULE_LICENSE("GPL");
2162