linux/drivers/md/md-multipath.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * multipath.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * MULTIPATH management functions.
 *
 * derived from raid1.c.
 */

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/raid/md_u.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include "md.h"
#include "md-multipath.h"

#define MAX_WORK_PER_DISK 128

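/*
 * Number of multipath_bh buffers preallocated in conf->pool, so that
 * request mapping and retry can make progress under memory pressure.
 */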
#define NR_RESERVED_BUFS        32

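/*
 * Pick the first operational path: an rdev that is In_sync and not
 * Faulty.  On success the path's index is returned with a reference
 * held via nr_pending; -1 means no usable path is left.
 */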
static int multipath_map (struct mpconf *conf)
{
        int i, disks = conf->raid_disks;

        /*
         * Later we will do read balancing on the read side;
         * for now we use the first available disk.
         */

        rcu_read_lock();
        for (i = 0; i < disks; i++) {
                struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
                if (rdev && test_bit(In_sync, &rdev->flags) &&
                    !test_bit(Faulty, &rdev->flags)) {
                        atomic_inc(&rdev->nr_pending);
                        rcu_read_unlock();
                        return i;
                }
        }
        rcu_read_unlock();

        pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
        return (-1);
}

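/*
 * Queue a failed request on conf->retry_list and wake the multipathd
 * thread so the request can be resubmitted on another path.
 */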
static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
{
        unsigned long flags;
        struct mddev *mddev = mp_bh->mddev;
        struct mpconf *conf = mddev->private;

        spin_lock_irqsave(&conf->device_lock, flags);
        list_add(&mp_bh->retry_list, &conf->retry_list);
        spin_unlock_irqrestore(&conf->device_lock, flags);
        md_wakeup_thread(mddev->thread);
}

/*
 * multipath_end_bh_io() is called when we have finished servicing a multipathed
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status)
{
        struct bio *bio = mp_bh->master_bio;
        struct mpconf *conf = mp_bh->mddev->private;

        bio->bi_status = status;
        bio_endio(bio);
        mempool_free(mp_bh, &conf->pool);
}

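/*
 * Completion handler for the per-path clone bio.  On success the
 * master bio is completed; on a genuine error (readahead failures
 * are passed straight through) the path is failed via md_error()
 * and the request is queued for retry on another path.
 */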
static void multipath_end_request(struct bio *bio)
{
        struct multipath_bh *mp_bh = bio->bi_private;
        struct mpconf *conf = mp_bh->mddev->private;
        struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev;

        if (!bio->bi_status)
                multipath_end_bh_io(mp_bh, 0);
        else if (!(bio->bi_opf & REQ_RAHEAD)) {
                /*
                 * oops, IO error:
                 */
                md_error (mp_bh->mddev, rdev);
                pr_info("multipath: %pg: rescheduling sector %llu\n",
                        rdev->bdev,
                        (unsigned long long)bio->bi_iter.bi_sector);
                multipath_reschedule_retry(mp_bh);
        } else
                multipath_end_bh_io(mp_bh, bio->bi_status);
        rdev_dec_pending(rdev, conf->mddev);
}

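/*
 * Map an incoming bio onto the first operational path: clone it,
 * shift the start sector by the path's data_offset, and submit the
 * clone with REQ_FAILFAST_TRANSPORT so that transport errors come
 * back quickly and fail over instead of being retried below us.
 */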
static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
{
        struct mpconf *conf = mddev->private;
        struct multipath_bh * mp_bh;
        struct multipath_info *multipath;

        if (unlikely(bio->bi_opf & REQ_PREFLUSH)
            && md_flush_request(mddev, bio))
                return true;

        mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);

        mp_bh->master_bio = bio;
        mp_bh->mddev = mddev;

        mp_bh->path = multipath_map(conf);
        if (mp_bh->path < 0) {
                bio_io_error(bio);
                mempool_free(mp_bh, &conf->pool);
                return true;
        }
        multipath = conf->multipaths + mp_bh->path;

        bio_init_clone(multipath->rdev->bdev, &mp_bh->bio, bio, GFP_NOIO);

        mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset;
        mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT;
        mp_bh->bio.bi_end_io = multipath_end_request;
        mp_bh->bio.bi_private = mp_bh;
        mddev_check_write_zeroes(mddev, &mp_bh->bio);
        submit_bio_noacct(&mp_bh->bio);
        return true;
}

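/*
 * Report array health for /proc/mdstat: total vs. working path
 * counts, plus one "U" (up) or "_" (failed) flag per configured path.
 */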
static void multipath_status(struct seq_file *seq, struct mddev *mddev)
{
        struct mpconf *conf = mddev->private;
        int i;

        seq_printf (seq, " [%d/%d] [", conf->raid_disks,
                    conf->raid_disks - mddev->degraded);
        rcu_read_lock();
        for (i = 0; i < conf->raid_disks; i++) {
                struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
                seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
        }
        rcu_read_unlock();
        seq_putc(seq, ']');
}

/*
 * Careful, this can execute in IRQ contexts as well!
 */
static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
{
        struct mpconf *conf = mddev->private;

        if (conf->raid_disks - mddev->degraded <= 1) {
                /*
                 * Uh oh, we can do nothing if this is our last path, but
                 * first check if this is a queued request for a device
                 * which has just failed.
                 */
                pr_warn("multipath: only one IO path left and IO error.\n");
                /* leave it active... it's all we have */
                return;
        }
        /*
         * Mark disk as unusable
         */
        if (test_and_clear_bit(In_sync, &rdev->flags)) {
                unsigned long flags;
                spin_lock_irqsave(&conf->device_lock, flags);
                mddev->degraded++;
                spin_unlock_irqrestore(&conf->device_lock, flags);
        }
        set_bit(Faulty, &rdev->flags);
        set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
        pr_err("multipath: IO failure on %pg, disabling IO path.\n"
               "multipath: Operation continuing on %d IO paths.\n",
               rdev->bdev,
               conf->raid_disks - mddev->degraded);
}

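/*
 * Dump the working/total path counts and per-path state at
 * pr_debug level.
 */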
static void print_multipath_conf (struct mpconf *conf)
{
        int i;
        struct multipath_info *tmp;

        pr_debug("MULTIPATH conf printout:\n");
        if (!conf) {
                pr_debug("(conf==NULL)\n");
                return;
        }
        pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
                 conf->raid_disks);

        for (i = 0; i < conf->raid_disks; i++) {
                tmp = conf->multipaths + i;
                if (tmp->rdev)
                        pr_debug(" disk%d, o:%d, dev:%pg\n",
                                 i, !test_bit(Faulty, &tmp->rdev->flags),
                                 tmp->rdev->bdev);
        }
}

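/*
 * Hot-add @rdev to the first free slot (or to rdev->raid_disk if one
 * is already assigned), stack its queue limits, and mark the new
 * path In_sync.  Returns 0 on success, -EEXIST if no slot was free,
 * or the error from md_integrity_add_rdev().
 */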
static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
        struct mpconf *conf = mddev->private;
        int err = -EEXIST;
        int path;
        struct multipath_info *p;
        int first = 0;
        int last = mddev->raid_disks - 1;

        if (rdev->raid_disk >= 0)
                first = last = rdev->raid_disk;

        print_multipath_conf(conf);

        for (path = first; path <= last; path++)
                if ((p = conf->multipaths + path)->rdev == NULL) {
                        disk_stack_limits(mddev->gendisk, rdev->bdev,
                                          rdev->data_offset << 9);

                        err = md_integrity_add_rdev(rdev, mddev);
                        if (err)
                                break;
                        spin_lock_irq(&conf->device_lock);
                        mddev->degraded--;
                        rdev->raid_disk = path;
                        set_bit(In_sync, &rdev->flags);
                        spin_unlock_irq(&conf->device_lock);
                        rcu_assign_pointer(p->rdev, rdev);
                        err = 0;
                        break;
                }

        print_multipath_conf(conf);

        return err;
}

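/*
 * Hot-remove @rdev.  The path must already be failed and idle:
 * removal is refused with -EBUSY while it is In_sync or has I/O
 * pending, and the pointer is restored if a request sneaks in
 * before the RCU grace period has passed.
 */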
static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
        struct mpconf *conf = mddev->private;
        int err = 0;
        int number = rdev->raid_disk;
        struct multipath_info *p = conf->multipaths + number;

        print_multipath_conf(conf);

        if (rdev == p->rdev) {
                if (test_bit(In_sync, &rdev->flags) ||
                    atomic_read(&rdev->nr_pending)) {
                        pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number);
                        err = -EBUSY;
                        goto abort;
                }
                p->rdev = NULL;
                if (!test_bit(RemoveSynchronized, &rdev->flags)) {
                        synchronize_rcu();
                        if (atomic_read(&rdev->nr_pending)) {
                                /* lost the race, try later */
                                err = -EBUSY;
                                p->rdev = rdev;
                                goto abort;
                        }
                }
                err = md_integrity_register(mddev);
        }
abort:

        print_multipath_conf(conf);
        return err;
}

/*
 * This is a kernel thread which:
 *
 *      1.      Retries failed read operations on working multipaths.
 *      2.      Updates the raid superblock when problems are encountered.
 *      3.      Performs writes following reads for array synchronising.
 */

static void multipathd(struct md_thread *thread)
{
        struct mddev *mddev = thread->mddev;
        struct multipath_bh *mp_bh;
        struct bio *bio;
        unsigned long flags;
        struct mpconf *conf = mddev->private;
        struct list_head *head = &conf->retry_list;

        md_check_recovery(mddev);
        for (;;) {
                spin_lock_irqsave(&conf->device_lock, flags);
                if (list_empty(head))
                        break;
                mp_bh = list_entry(head->prev, struct multipath_bh, retry_list);
                list_del(head->prev);
                spin_unlock_irqrestore(&conf->device_lock, flags);

                bio = &mp_bh->bio;
                bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;

                if ((mp_bh->path = multipath_map (conf)) < 0) {
                        pr_err("multipath: %pg: unrecoverable IO read error for block %llu\n",
                               bio->bi_bdev,
                               (unsigned long long)bio->bi_iter.bi_sector);
                        multipath_end_bh_io(mp_bh, BLK_STS_IOERR);
                } else {
                        pr_err("multipath: %pg: redirecting sector %llu to another IO path\n",
                               bio->bi_bdev,
                               (unsigned long long)bio->bi_iter.bi_sector);
                        *bio = *(mp_bh->master_bio);
                        bio->bi_iter.bi_sector +=
                                conf->multipaths[mp_bh->path].rdev->data_offset;
                        bio_set_dev(bio, conf->multipaths[mp_bh->path].rdev->bdev);
                        bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
                        bio->bi_end_io = multipath_end_request;
                        bio->bi_private = mp_bh;
                        submit_bio_noacct(bio);
                }
        }
        spin_unlock_irqrestore(&conf->device_lock, flags);
}

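/*
 * The array size is simply that of a single member device; multipath
 * does not support reshaping, hence the WARN_ONCE on any other
 * request.
 */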
static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
        WARN_ONCE(sectors || raid_disks,
                  "%s does not support generic reshape\n", __func__);

        return mddev->dev_sectors;
}

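/*
 * Assemble the personality: allocate the per-array mpconf, record
 * the already verified member devices, set up the multipath_bh
 * mempool, and start the multipathd retry thread.  At least one
 * operational path is required.
 */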
static int multipath_run (struct mddev *mddev)
{
        struct mpconf *conf;
        int disk_idx;
        struct multipath_info *disk;
        struct md_rdev *rdev;
        int working_disks;
        int ret;

        if (md_check_no_bitmap(mddev))
                return -EINVAL;

        if (mddev->level != LEVEL_MULTIPATH) {
                pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n",
                        mdname(mddev), mddev->level);
                goto out;
        }
        /*
         * copy the already verified devices into our private MULTIPATH
         * bookkeeping area. [whatever we allocate in multipath_run(),
         * should be freed in multipath_free()]
         */

        conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
        mddev->private = conf;
        if (!conf)
                goto out;

        conf->multipaths = kcalloc(mddev->raid_disks,
                                   sizeof(struct multipath_info),
                                   GFP_KERNEL);
        if (!conf->multipaths)
                goto out_free_conf;

        working_disks = 0;
        rdev_for_each(rdev, mddev) {
                disk_idx = rdev->raid_disk;
                if (disk_idx < 0 ||
                    disk_idx >= mddev->raid_disks)
                        continue;

                disk = conf->multipaths + disk_idx;
                disk->rdev = rdev;
                disk_stack_limits(mddev->gendisk, rdev->bdev,
                                  rdev->data_offset << 9);

                if (!test_bit(Faulty, &rdev->flags))
                        working_disks++;
        }

        conf->raid_disks = mddev->raid_disks;
        conf->mddev = mddev;
        spin_lock_init(&conf->device_lock);
        INIT_LIST_HEAD(&conf->retry_list);

        if (!working_disks) {
                pr_warn("multipath: no operational IO paths for %s\n",
                        mdname(mddev));
                goto out_free_conf;
        }
        mddev->degraded = conf->raid_disks - working_disks;

        ret = mempool_init_kmalloc_pool(&conf->pool, NR_RESERVED_BUFS,
                                        sizeof(struct multipath_bh));
        if (ret)
                goto out_free_conf;

        mddev->thread = md_register_thread(multipathd, mddev,
                                           "multipath");
        if (!mddev->thread)
                goto out_free_conf;

        pr_info("multipath: array %s active with %d out of %d IO paths\n",
                mdname(mddev), conf->raid_disks - mddev->degraded,
                mddev->raid_disks);
        /*
         * Ok, everything is just fine now
         */
        md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));

        if (md_integrity_register(mddev))
                goto out_free_conf;

        return 0;

out_free_conf:
        mempool_exit(&conf->pool);
        kfree(conf->multipaths);
        kfree(conf);
        mddev->private = NULL;
out:
        return -EIO;
}

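/*
 * Tear down everything multipath_run() allocated: the mempool, the
 * path table, and the mpconf itself.
 */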
static void multipath_free(struct mddev *mddev, void *priv)
{
        struct mpconf *conf = priv;

        mempool_exit(&conf->pool);
        kfree(conf->multipaths);
        kfree(conf);
}

static struct md_personality multipath_personality =
{
        .name           = "multipath",
        .level          = LEVEL_MULTIPATH,
        .owner          = THIS_MODULE,
        .make_request   = multipath_make_request,
        .run            = multipath_run,
        .free           = multipath_free,
        .status         = multipath_status,
        .error_handler  = multipath_error,
        .hot_add_disk   = multipath_add_disk,
        .hot_remove_disk= multipath_remove_disk,
        .size           = multipath_size,
};

static int __init multipath_init (void)
{
        return register_md_personality (&multipath_personality);
}

static void __exit multipath_exit (void)
{
        unregister_md_personality (&multipath_personality);
}

module_init(multipath_init);
module_exit(multipath_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("simple multi-path personality for MD (deprecated)");
MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
MODULE_ALIAS("md-multipath");
MODULE_ALIAS("md-level--4");