linux/drivers/md/dm-table.c
   1/*
   2 * Copyright (C) 2001 Sistina Software (UK) Limited.
   3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include "dm-core.h"
   9
  10#include <linux/module.h>
  11#include <linux/vmalloc.h>
  12#include <linux/blkdev.h>
  13#include <linux/namei.h>
  14#include <linux/ctype.h>
  15#include <linux/string.h>
  16#include <linux/overflow.h>  /* until safe to add to slab.h */
  17#include <linux/slab.h>
  18#include <linux/interrupt.h>
  19#include <linux/mutex.h>
  20#include <linux/delay.h>
  21#include <linux/atomic.h>
  22#include <linux/blk-mq.h>
  23#include <linux/mount.h>
  24#include <linux/dax.h>
  25
  26#define DM_MSG_PREFIX "table"
  27
  28#define MAX_DEPTH 16
  29#define NODE_SIZE L1_CACHE_BYTES
  30#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
  31#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
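/*
 * Worked example (assuming a 64-byte L1 cache line and an 8-byte
 * sector_t): NODE_SIZE = 64, so each btree node holds
 * KEYS_PER_NODE = 64 / 8 = 8 keys and CHILDREN_PER_NODE = 9 children.
 */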
  32
  33struct dm_table {
  34        struct mapped_device *md;
  35        enum dm_queue_mode type;
  36
  37        /* btree table */
  38        unsigned int depth;
  39        unsigned int counts[MAX_DEPTH]; /* in nodes */
  40        sector_t *index[MAX_DEPTH];
  41
  42        unsigned int num_targets;
  43        unsigned int num_allocated;
  44        sector_t *highs;
  45        struct dm_target *targets;
  46
  47        struct target_type *immutable_target_type;
  48
  49        bool integrity_supported:1;
  50        bool singleton:1;
  51        bool all_blk_mq:1;
  52
  53        /*
  54         * Indicates the rw permissions for the new logical
  55         * device.  This should be a combination of FMODE_READ
  56         * and FMODE_WRITE.
  57         */
  58        fmode_t mode;
  59
  60        /* a list of devices used by this table */
  61        struct list_head devices;
  62
  63        /* events get handed up using this callback */
  64        void (*event_fn)(void *);
  65        void *event_context;
  66
  67        struct dm_md_mempools *mempools;
  68
  69        struct list_head target_callbacks;
  70};
  71
  72/*
   73 * Similar to ceiling(log_base(n))
  74 */
  75static unsigned int int_log(unsigned int n, unsigned int base)
  76{
  77        int result = 0;
  78
  79        while (n > 1) {
  80                n = dm_div_up(n, base);
  81                result++;
  82        }
  83
  84        return result;
  85}
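/*
 * Example: int_log(100, 9) == 3, since dm_div_up() takes n from
 * 100 -> 12 -> 2 -> 1 in three steps, matching ceil(log9(100)).
 */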
  86
  87/*
   88 * Calculate the index of the child node for the k'th key of the n'th node.
  89 */
  90static inline unsigned int get_child(unsigned int n, unsigned int k)
  91{
  92        return (n * CHILDREN_PER_NODE) + k;
  93}
  94
  95/*
  96 * Return the n'th node of level l from table t.
  97 */
  98static inline sector_t *get_node(struct dm_table *t,
  99                                 unsigned int l, unsigned int n)
 100{
 101        return t->index[l] + (n * KEYS_PER_NODE);
 102}
 103
 104/*
  105 * Return the highest key that you could look up from the n'th
 106 * node on level l of the btree.
 107 */
 108static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
 109{
 110        for (; l < t->depth - 1; l++)
 111                n = get_child(n, CHILDREN_PER_NODE - 1);
 112
 113        if (n >= t->counts[l])
 114                return (sector_t) - 1;
 115
 116        return get_node(t, l, n)[KEYS_PER_NODE - 1];
 117}
 118
 119/*
 120 * Fills in a level of the btree based on the highs of the level
 121 * below it.
 122 */
 123static int setup_btree_index(unsigned int l, struct dm_table *t)
 124{
 125        unsigned int n, k;
 126        sector_t *node;
 127
 128        for (n = 0U; n < t->counts[l]; n++) {
 129                node = get_node(t, l, n);
 130
 131                for (k = 0U; k < KEYS_PER_NODE; k++)
 132                        node[k] = high(t, l + 1, get_child(n, k));
 133        }
 134
 135        return 0;
 136}
 137
 138void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
 139{
 140        unsigned long size;
 141        void *addr;
 142
 143        /*
 144         * Check that we're not going to overflow.
 145         */
 146        if (nmemb > (ULONG_MAX / elem_size))
 147                return NULL;
 148
 149        size = nmemb * elem_size;
 150        addr = vzalloc(size);
 151
 152        return addr;
 153}
 154EXPORT_SYMBOL(dm_vcalloc);
 155
 156/*
  157 * highs and targets are managed as dynamic arrays during a
 158 * table load.
 159 */
 160static int alloc_targets(struct dm_table *t, unsigned int num)
 161{
 162        sector_t *n_highs;
 163        struct dm_target *n_targets;
 164
 165        /*
 166         * Allocate both the target array and offset array at once.
 167         * Append an empty entry to catch sectors beyond the end of
 168         * the device.
 169         */
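        /*
         * Layout of the single allocation: the first 'num' sector_t
         * slots hold the high keys, and the dm_target array begins
         * immediately after them.  Sizing num + 1 of each in a zeroed
         * buffer leaves one empty sentinel target past the last real
         * entry; that sentinel is what catches out-of-range lookups.
         */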
 170        n_highs = (sector_t *) dm_vcalloc(num + 1, sizeof(struct dm_target) +
 171                                          sizeof(sector_t));
 172        if (!n_highs)
 173                return -ENOMEM;
 174
 175        n_targets = (struct dm_target *) (n_highs + num);
 176
 177        memset(n_highs, -1, sizeof(*n_highs) * num);
 178        vfree(t->highs);
 179
 180        t->num_allocated = num;
 181        t->highs = n_highs;
 182        t->targets = n_targets;
 183
 184        return 0;
 185}
 186
 187int dm_table_create(struct dm_table **result, fmode_t mode,
 188                    unsigned num_targets, struct mapped_device *md)
 189{
 190        struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL);
 191
 192        if (!t)
 193                return -ENOMEM;
 194
 195        INIT_LIST_HEAD(&t->devices);
 196        INIT_LIST_HEAD(&t->target_callbacks);
 197
 198        if (!num_targets)
 199                num_targets = KEYS_PER_NODE;
 200
 201        num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
 202
 203        if (!num_targets) {
 204                kfree(t);
 205                return -ENOMEM;
 206        }
 207
 208        if (alloc_targets(t, num_targets)) {
 209                kfree(t);
 210                return -ENOMEM;
 211        }
 212
 213        t->type = DM_TYPE_NONE;
 214        t->mode = mode;
 215        t->md = md;
 216        *result = t;
 217        return 0;
 218}
 219
 220static void free_devices(struct list_head *devices, struct mapped_device *md)
 221{
 222        struct list_head *tmp, *next;
 223
 224        list_for_each_safe(tmp, next, devices) {
 225                struct dm_dev_internal *dd =
 226                    list_entry(tmp, struct dm_dev_internal, list);
 227                DMWARN("%s: dm_table_destroy: dm_put_device call missing for %s",
 228                       dm_device_name(md), dd->dm_dev->name);
 229                dm_put_table_device(md, dd->dm_dev);
 230                kfree(dd);
 231        }
 232}
 233
 234void dm_table_destroy(struct dm_table *t)
 235{
 236        unsigned int i;
 237
 238        if (!t)
 239                return;
 240
 241        /* free the indexes */
 242        if (t->depth >= 2)
 243                vfree(t->index[t->depth - 2]);
 244
 245        /* free the targets */
 246        for (i = 0; i < t->num_targets; i++) {
 247                struct dm_target *tgt = t->targets + i;
 248
 249                if (tgt->type->dtr)
 250                        tgt->type->dtr(tgt);
 251
 252                dm_put_target_type(tgt->type);
 253        }
 254
 255        vfree(t->highs);
 256
 257        /* free the device list */
 258        free_devices(&t->devices, t->md);
 259
 260        dm_free_md_mempools(t->mempools);
 261
 262        kfree(t);
 263}
 264
 265/*
 266 * See if we've already got a device in the list.
 267 */
 268static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
 269{
 270        struct dm_dev_internal *dd;
 271
 272        list_for_each_entry (dd, l, list)
 273                if (dd->dm_dev->bdev->bd_dev == dev)
 274                        return dd;
 275
 276        return NULL;
 277}
 278
 279/*
  280 * If possible, this checks whether an area of a destination device is invalid.
 281 */
 282static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
 283                                  sector_t start, sector_t len, void *data)
 284{
 285        struct request_queue *q;
 286        struct queue_limits *limits = data;
 287        struct block_device *bdev = dev->bdev;
 288        sector_t dev_size =
 289                i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
 290        unsigned short logical_block_size_sectors =
 291                limits->logical_block_size >> SECTOR_SHIFT;
 292        char b[BDEVNAME_SIZE];
 293
 294        /*
 295         * Some devices exist without request functions,
 296         * such as loop devices not yet bound to backing files.
 297         * Forbid the use of such devices.
 298         */
 299        q = bdev_get_queue(bdev);
 300        if (!q || !q->make_request_fn) {
 301                DMWARN("%s: %s is not yet initialised: "
 302                       "start=%llu, len=%llu, dev_size=%llu",
 303                       dm_device_name(ti->table->md), bdevname(bdev, b),
 304                       (unsigned long long)start,
 305                       (unsigned long long)len,
 306                       (unsigned long long)dev_size);
 307                return 1;
 308        }
 309
 310        if (!dev_size)
 311                return 0;
 312
 313        if ((start >= dev_size) || (start + len > dev_size)) {
 314                DMWARN("%s: %s too small for target: "
 315                       "start=%llu, len=%llu, dev_size=%llu",
 316                       dm_device_name(ti->table->md), bdevname(bdev, b),
 317                       (unsigned long long)start,
 318                       (unsigned long long)len,
 319                       (unsigned long long)dev_size);
 320                return 1;
 321        }
 322
 323        if (logical_block_size_sectors <= 1)
 324                return 0;
 325
 326        if (start & (logical_block_size_sectors - 1)) {
 327                DMWARN("%s: start=%llu not aligned to h/w "
 328                       "logical block size %u of %s",
 329                       dm_device_name(ti->table->md),
 330                       (unsigned long long)start,
 331                       limits->logical_block_size, bdevname(bdev, b));
 332                return 1;
 333        }
 334
 335        if (len & (logical_block_size_sectors - 1)) {
 336                DMWARN("%s: len=%llu not aligned to h/w "
 337                       "logical block size %u of %s",
 338                       dm_device_name(ti->table->md),
 339                       (unsigned long long)len,
 340                       limits->logical_block_size, bdevname(bdev, b));
 341                return 1;
 342        }
 343
 344        return 0;
 345}
 346
 347/*
 348 * This upgrades the mode on an already open dm_dev, being
 349 * careful to leave things as they were if we fail to reopen the
 350 * device and not to touch the existing bdev field in case
 351 * it is accessed concurrently inside dm_table_any_congested().
 352 */
 353static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
 354                        struct mapped_device *md)
 355{
 356        int r;
 357        struct dm_dev *old_dev, *new_dev;
 358
 359        old_dev = dd->dm_dev;
 360
 361        r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev,
 362                                dd->dm_dev->mode | new_mode, &new_dev);
 363        if (r)
 364                return r;
 365
 366        dd->dm_dev = new_dev;
 367        dm_put_table_device(md, old_dev);
 368
 369        return 0;
 370}
 371
 372/*
 373 * Convert the path to a device
 374 */
 375dev_t dm_get_dev_t(const char *path)
 376{
 377        dev_t dev;
 378        struct block_device *bdev;
 379
 380        bdev = lookup_bdev(path);
 381        if (IS_ERR(bdev))
 382                dev = name_to_dev_t(path);
 383        else {
 384                dev = bdev->bd_dev;
 385                bdput(bdev);
 386        }
 387
 388        return dev;
 389}
 390EXPORT_SYMBOL_GPL(dm_get_dev_t);
 391
 392/*
 393 * Add a device to the list, or just increment the usage count if
 394 * it's already present.
 395 */
 396int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
 397                  struct dm_dev **result)
 398{
 399        int r;
 400        dev_t dev;
 401        struct dm_dev_internal *dd;
 402        struct dm_table *t = ti->table;
 403
 404        BUG_ON(!t);
 405
 406        dev = dm_get_dev_t(path);
 407        if (!dev)
 408                return -ENODEV;
 409
 410        dd = find_device(&t->devices, dev);
 411        if (!dd) {
 412                dd = kmalloc(sizeof(*dd), GFP_KERNEL);
 413                if (!dd)
 414                        return -ENOMEM;
 415
 416                if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) {
 417                        kfree(dd);
 418                        return r;
 419                }
 420
 421                atomic_set(&dd->count, 0);
 422                list_add(&dd->list, &t->devices);
 423
 424        } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) {
 425                r = upgrade_mode(dd, mode, t->md);
 426                if (r)
 427                        return r;
 428        }
 429        atomic_inc(&dd->count);
 430
 431        *result = dd->dm_dev;
 432        return 0;
 433}
 434EXPORT_SYMBOL(dm_get_device);
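/*
 * Typical use from a target constructor (sketch only; the argv layout
 * and error string are illustrative, not taken from this file):
 *
 *	struct dm_dev *dev;
 *	int r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev);
 *	if (r) {
 *		ti->error = "Device lookup failed";
 *		return r;
 *	}
 *	...
 *	dm_put_device(ti, dev);	(from the dtr, or on a later ctr error)
 */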
 435
 436static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
 437                                sector_t start, sector_t len, void *data)
 438{
 439        struct queue_limits *limits = data;
 440        struct block_device *bdev = dev->bdev;
 441        struct request_queue *q = bdev_get_queue(bdev);
 442        char b[BDEVNAME_SIZE];
 443
 444        if (unlikely(!q)) {
 445                DMWARN("%s: Cannot set limits for nonexistent device %s",
 446                       dm_device_name(ti->table->md), bdevname(bdev, b));
 447                return 0;
 448        }
 449
 450        if (bdev_stack_limits(limits, bdev, start) < 0)
 451                DMWARN("%s: adding target device %s caused an alignment inconsistency: "
 452                       "physical_block_size=%u, logical_block_size=%u, "
 453                       "alignment_offset=%u, start=%llu",
 454                       dm_device_name(ti->table->md), bdevname(bdev, b),
 455                       q->limits.physical_block_size,
 456                       q->limits.logical_block_size,
 457                       q->limits.alignment_offset,
 458                       (unsigned long long) start << SECTOR_SHIFT);
 459
 460        /*
 461         * Check if merge fn is supported.
 462         * If not we'll force DM to use PAGE_SIZE or
 463         * smaller I/O, just to be safe.
 464         */
 465        if (dm_queue_merge_is_compulsory(q) && !ti->type->merge)
 466                blk_limits_max_hw_sectors(limits,
 467                                          (unsigned int) (PAGE_SIZE >> 9));
 468        return 0;
 469}
 470
 471/*
 472 * Decrement a device's use count and remove it if necessary.
 473 */
 474void dm_put_device(struct dm_target *ti, struct dm_dev *d)
 475{
 476        int found = 0;
 477        struct list_head *devices = &ti->table->devices;
 478        struct dm_dev_internal *dd;
 479
 480        list_for_each_entry(dd, devices, list) {
 481                if (dd->dm_dev == d) {
 482                        found = 1;
 483                        break;
 484                }
 485        }
 486        if (!found) {
 487                DMWARN("%s: device %s not in table devices list",
 488                       dm_device_name(ti->table->md), d->name);
 489                return;
 490        }
 491        if (atomic_dec_and_test(&dd->count)) {
 492                dm_put_table_device(ti->table->md, d);
 493                list_del(&dd->list);
 494                kfree(dd);
 495        }
 496}
 497EXPORT_SYMBOL(dm_put_device);
 498
 499/*
 500 * Checks to see if the target joins onto the end of the table.
 501 */
 502static int adjoin(struct dm_table *table, struct dm_target *ti)
 503{
 504        struct dm_target *prev;
 505
 506        if (!table->num_targets)
 507                return !ti->begin;
 508
 509        prev = &table->targets[table->num_targets - 1];
 510        return (ti->begin == (prev->begin + prev->len));
 511}
 512
 513/*
 514 * Used to dynamically allocate the arg array.
 515 *
 516 * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must
 517 * process messages even if some device is suspended. These messages have a
 518 * small fixed number of arguments.
 519 *
 520 * On the other hand, dm-switch needs to process bulk data using messages and
 521 * excessive use of GFP_NOIO could cause trouble.
 522 */
 523static char **realloc_argv(unsigned *size, char **old_argv)
 524{
 525        char **argv;
 526        unsigned new_size;
 527        gfp_t gfp;
 528
 529        if (*size) {
 530                new_size = *size * 2;
 531                gfp = GFP_KERNEL;
 532        } else {
 533                new_size = 8;
 534                gfp = GFP_NOIO;
 535        }
 536        argv = kmalloc(new_size * sizeof(*argv), gfp);
 537        if (argv) {
 538                memcpy(argv, old_argv, *size * sizeof(*argv));
 539                *size = new_size;
 540        }
 541
 542        kfree(old_argv);
 543        return argv;
 544}
 545
 546/*
 547 * Destructively splits up the argument list to pass to ctr.
 548 */
 549int dm_split_args(int *argc, char ***argvp, char *input)
 550{
 551        char *start, *end = input, *out, **argv = NULL;
 552        unsigned array_size = 0;
 553
 554        *argc = 0;
 555
 556        if (!input) {
 557                *argvp = NULL;
 558                return 0;
 559        }
 560
 561        argv = realloc_argv(&array_size, argv);
 562        if (!argv)
 563                return -ENOMEM;
 564
 565        while (1) {
 566                /* Skip whitespace */
 567                start = skip_spaces(end);
 568
 569                if (!*start)
 570                        break;  /* success, we hit the end */
 571
 572                /* 'out' is used to remove any back-quotes */
 573                end = out = start;
 574                while (*end) {
 575                        /* Everything apart from '\0' can be quoted */
 576                        if (*end == '\\' && *(end + 1)) {
 577                                *out++ = *(end + 1);
 578                                end += 2;
 579                                continue;
 580                        }
 581
 582                        if (isspace(*end))
 583                                break;  /* end of token */
 584
 585                        *out++ = *end++;
 586                }
 587
 588                /* have we already filled the array ? */
 589                if ((*argc + 1) > array_size) {
 590                        argv = realloc_argv(&array_size, argv);
 591                        if (!argv)
 592                                return -ENOMEM;
 593                }
 594
 595                /* we know this is whitespace */
 596                if (*end)
 597                        end++;
 598
 599                /* terminate the string and put it in the array */
 600                *out = '\0';
 601                argv[*argc] = start;
 602                (*argc)++;
 603        }
 604
 605        *argvp = argv;
 606        return 0;
 607}
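/*
 * Example: the params string "0 /dev/sdb1 opt\ with\ spaces" is split
 * in place into { "0", "/dev/sdb1", "opt with spaces" }; a backslash
 * escapes the following character, so escaped spaces remain part of a
 * single token.
 */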
 608
 609/*
  610 * Impose necessary and sufficient conditions on a device's table such
 611 * that any incoming bio which respects its logical_block_size can be
 612 * processed successfully.  If it falls across the boundary between
 613 * two or more targets, the size of each piece it gets split into must
 614 * be compatible with the logical_block_size of the target processing it.
 615 */
 616static int validate_hardware_logical_block_alignment(struct dm_table *table,
 617                                                 struct queue_limits *limits)
 618{
 619        /*
 620         * This function uses arithmetic modulo the logical_block_size
 621         * (in units of 512-byte sectors).
 622         */
 623        unsigned short device_logical_block_size_sects =
 624                limits->logical_block_size >> SECTOR_SHIFT;
 625
 626        /*
 627         * Offset of the start of the next table entry, mod logical_block_size.
 628         */
 629        unsigned short next_target_start = 0;
 630
 631        /*
 632         * Given an aligned bio that extends beyond the end of a
 633         * target, how many sectors must the next target handle?
 634         */
 635        unsigned short remaining = 0;
 636
 637        struct dm_target *uninitialized_var(ti);
 638        struct queue_limits ti_limits;
 639        struct queue_limits_aux ti_limits_aux;
 640        unsigned i;
 641
 642        /* 
  643         * Initialize the limits_aux pointer to the on-stack
  644         * queue_limits_aux structure.
 645         */
 646        ti_limits.limits_aux = &ti_limits_aux;
 647
 648        /*
 649         * Check each entry in the table in turn.
 650         */
 651        for (i = 0; i < dm_table_get_num_targets(table); i++) {
 652                ti = dm_table_get_target(table, i);
 653
 654                blk_set_stacking_limits(&ti_limits);
 655
 656                /* combine all target devices' limits */
 657                if (ti->type->iterate_devices)
 658                        ti->type->iterate_devices(ti, dm_set_device_limits,
 659                                                  &ti_limits);
 660
 661                /*
 662                 * If the remaining sectors fall entirely within this
  663                 * table entry, are they compatible with its logical_block_size?
 664                 */
 665                if (remaining < ti->len &&
 666                    remaining & ((ti_limits.logical_block_size >>
 667                                  SECTOR_SHIFT) - 1))
 668                        break;  /* Error */
 669
 670                next_target_start =
 671                    (unsigned short) ((next_target_start + ti->len) &
 672                                      (device_logical_block_size_sects - 1));
 673                remaining = next_target_start ?
 674                    device_logical_block_size_sects - next_target_start : 0;
 675        }
 676
 677        if (remaining) {
 678                DMWARN("%s: table line %u (start sect %llu len %llu) "
 679                       "not aligned to h/w logical block size %u",
 680                       dm_device_name(table->md), i,
 681                       (unsigned long long) ti->begin,
 682                       (unsigned long long) ti->len,
 683                       limits->logical_block_size);
 684                return -EINVAL;
 685        }
 686
 687        return 0;
 688}
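/*
 * Worked example (hypothetical layout): with a 4096-byte logical block
 * size (8 sectors), a first target of 1001 sectors leaves
 * next_target_start = 1001 & 7 = 1 and remaining = 7.  If the next
 * target also requires 4096-byte alignment, 7 & 7 != 0 and the table
 * is rejected; a boundary at a multiple of 8 sectors passes.
 */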
 689
 690int dm_table_add_target(struct dm_table *t, const char *type,
 691                        sector_t start, sector_t len, char *params)
 692{
 693        int r = -EINVAL, argc;
 694        char **argv;
 695        struct dm_target *tgt;
 696
 697        if (t->singleton) {
 698                DMERR("%s: target type %s must appear alone in table",
 699                      dm_device_name(t->md), t->targets->type->name);
 700                return -EINVAL;
 701        }
 702
 703        BUG_ON(t->num_targets >= t->num_allocated);
 704
 705        tgt = t->targets + t->num_targets;
 706        memset(tgt, 0, sizeof(*tgt));
 707
 708        if (!len) {
 709                DMERR("%s: zero-length target", dm_device_name(t->md));
 710                return -EINVAL;
 711        }
 712
 713        tgt->type = dm_get_target_type(type);
 714        if (!tgt->type) {
 715                DMERR("%s: %s: unknown target type", dm_device_name(t->md), type);
 716                return -EINVAL;
 717        }
 718
 719        if (dm_target_needs_singleton(tgt->type)) {
 720                if (t->num_targets) {
 721                        tgt->error = "singleton target type must appear alone in table";
 722                        goto bad;
 723                }
 724                t->singleton = true;
 725        }
 726
 727        if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) {
 728                tgt->error = "target type may not be included in a read-only table";
 729                goto bad;
 730        }
 731
 732        if (t->immutable_target_type) {
 733                if (t->immutable_target_type != tgt->type) {
 734                        tgt->error = "immutable target type cannot be mixed with other target types";
 735                        goto bad;
 736                }
 737        } else if (dm_target_is_immutable(tgt->type)) {
 738                if (t->num_targets) {
 739                        tgt->error = "immutable target type cannot be mixed with other target types";
 740                        goto bad;
 741                }
 742                t->immutable_target_type = tgt->type;
 743        }
 744
 745        tgt->table = t;
 746        tgt->begin = start;
 747        tgt->len = len;
 748        tgt->error = "Unknown error";
 749
 750        /*
 751         * Does this target adjoin the previous one ?
 752         */
 753        if (!adjoin(t, tgt)) {
 754                tgt->error = "Gap in table";
 755                goto bad;
 756        }
 757
 758        r = dm_split_args(&argc, &argv, params);
 759        if (r) {
 760                tgt->error = "couldn't split parameters (insufficient memory)";
 761                goto bad;
 762        }
 763
 764        r = tgt->type->ctr(tgt, argc, argv);
 765        kfree(argv);
 766        if (r)
 767                goto bad;
 768
 769        t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
 770
 771        if (!tgt->num_discard_bios && tgt->discards_supported)
 772                DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.",
 773                       dm_device_name(t->md), type);
 774
 775        return 0;
 776
 777 bad:
 778        DMERR("%s: %s: %s", dm_device_name(t->md), type, tgt->error);
 779        dm_put_target_type(tgt->type);
 780        return r;
 781}
 782
 783/*
 784 * Target argument parsing helpers.
 785 */
 786static int validate_next_arg(const struct dm_arg *arg,
 787                             struct dm_arg_set *arg_set,
 788                             unsigned *value, char **error, unsigned grouped)
 789{
 790        const char *arg_str = dm_shift_arg(arg_set);
 791        char dummy;
 792
 793        if (!arg_str ||
 794            (sscanf(arg_str, "%u%c", value, &dummy) != 1) ||
 795            (*value < arg->min) ||
 796            (*value > arg->max) ||
 797            (grouped && arg_set->argc < *value)) {
 798                *error = arg->error;
 799                return -EINVAL;
 800        }
 801
 802        return 0;
 803}
 804
 805int dm_read_arg(const struct dm_arg *arg, struct dm_arg_set *arg_set,
 806                unsigned *value, char **error)
 807{
 808        return validate_next_arg(arg, arg_set, value, error, 0);
 809}
 810EXPORT_SYMBOL(dm_read_arg);
 811
 812int dm_read_arg_group(const struct dm_arg *arg, struct dm_arg_set *arg_set,
 813                      unsigned *value, char **error)
 814{
 815        return validate_next_arg(arg, arg_set, value, error, 1);
 816}
 817EXPORT_SYMBOL(dm_read_arg_group);
 818
 819const char *dm_shift_arg(struct dm_arg_set *as)
 820{
 821        char *r;
 822
 823        if (as->argc) {
 824                as->argc--;
 825                r = *as->argv;
 826                as->argv++;
 827                return r;
 828        }
 829
 830        return NULL;
 831}
 832EXPORT_SYMBOL(dm_shift_arg);
 833
 834void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
 835{
 836        BUG_ON(as->argc < num_args);
 837        as->argc -= num_args;
 838        as->argv += num_args;
 839}
 840EXPORT_SYMBOL(dm_consume_args);
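/*
 * Sketch of how a target ctr typically consumes arguments with these
 * helpers (the bounds and strings below are illustrative only):
 *
 *	static const struct dm_arg _args[] = {
 *		{0, 4, "Invalid number of feature arguments"},
 *	};
 *	struct dm_arg_set as = { .argc = argc, .argv = argv };
 *	unsigned num_features;
 *
 *	if (dm_read_arg_group(&_args[0], &as, &num_features, &ti->error))
 *		return -EINVAL;
 *	while (num_features--) {
 *		const char *feature = dm_shift_arg(&as);
 *		...
 *	}
 */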
 841
 842static bool __table_type_bio_based(enum dm_queue_mode table_type)
 843{
 844        return (table_type == DM_TYPE_BIO_BASED ||
 845                table_type == DM_TYPE_DAX_BIO_BASED);
 846}
 847
 848static bool __table_type_request_based(enum dm_queue_mode table_type)
 849{
 850        return (table_type == DM_TYPE_REQUEST_BASED ||
 851                table_type == DM_TYPE_MQ_REQUEST_BASED);
 852}
 853
 854void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
 855{
 856        t->type = type;
 857}
 858EXPORT_SYMBOL_GPL(dm_table_set_type);
 859
 860static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
 861                               sector_t start, sector_t len, void *data)
 862{
 863        return bdev_dax_supported(dev->bdev, PAGE_SIZE);
 864}
 865
 866static bool dm_table_supports_dax(struct dm_table *t)
 867{
 868        struct dm_target *ti;
 869        unsigned i;
 870
 871        /* Ensure that all targets support DAX. */
 872        for (i = 0; i < dm_table_get_num_targets(t); i++) {
 873                ti = dm_table_get_target(t, i);
 874
 875                if (!ti->type->direct_access)
 876                        return false;
 877
 878                if (!ti->type->iterate_devices ||
 879                    !ti->type->iterate_devices(ti, device_supports_dax, NULL))
 880                        return false;
 881        }
 882
 883        return true;
 884}
 885
 886static int dm_table_determine_type(struct dm_table *t)
 887{
 888        unsigned i;
 889        unsigned bio_based = 0, request_based = 0, hybrid = 0;
 890        unsigned sq_count = 0, mq_count = 0;
 891        struct dm_target *tgt;
 892        struct dm_dev_internal *dd;
 893        struct list_head *devices = dm_table_get_devices(t);
 894        enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
 895
 896        if (t->type != DM_TYPE_NONE) {
 897                /* target already set the table's type */
 898                if (t->type == DM_TYPE_BIO_BASED)
 899                        return 0;
 900                BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED);
 901                goto verify_rq_based;
 902        }
 903
 904        for (i = 0; i < t->num_targets; i++) {
 905                tgt = t->targets + i;
 906                if (dm_target_hybrid(tgt))
 907                        hybrid = 1;
 908                else if (dm_target_request_based(tgt))
 909                        request_based = 1;
 910                else
 911                        bio_based = 1;
 912
 913                if (bio_based && request_based) {
 914                        DMWARN("Inconsistent table: different target types"
 915                               " can't be mixed up");
 916                        return -EINVAL;
 917                }
 918        }
 919
 920        if (hybrid && !bio_based && !request_based) {
 921                /*
 922                 * The targets can work either way.
 923                 * Determine the type from the live device.
 924                 * Default to bio-based if device is new.
 925                 */
 926                if (__table_type_request_based(live_md_type))
 927                        request_based = 1;
 928                else
 929                        bio_based = 1;
 930        }
 931
 932        if (bio_based) {
 933                /* We must use this table as bio-based */
 934                t->type = DM_TYPE_BIO_BASED;
 935                if (dm_table_supports_dax(t) ||
 936                    (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED))
 937                        t->type = DM_TYPE_DAX_BIO_BASED;
 938                return 0;
 939        }
 940
 941        BUG_ON(!request_based); /* No targets in this table */
 942
 943        /*
 944         * The only way to establish DM_TYPE_MQ_REQUEST_BASED is by
 945         * having a compatible target use dm_table_set_type.
 946         */
 947        t->type = DM_TYPE_REQUEST_BASED;
 948
 949verify_rq_based:
 950        /*
 951         * Request-based dm supports only tables that have a single target now.
 952         * To support multiple targets, request splitting support is needed,
 953         * and that needs lots of changes in the block-layer.
 954         * (e.g. request completion process for partial completion.)
 955         */
 956        if (t->num_targets > 1) {
 957                DMWARN("Request-based dm doesn't support multiple targets yet");
 958                return -EINVAL;
 959        }
 960
 961        if (list_empty(devices)) {
 962                int srcu_idx;
 963                struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx);
 964
 965                /* inherit live table's type and all_blk_mq */
 966                if (live_table) {
 967                        t->type = live_table->type;
 968                        t->all_blk_mq = live_table->all_blk_mq;
 969                }
 970                dm_put_live_table(t->md, srcu_idx);
 971                return 0;
 972        }
 973
 974        /* Non-request-stackable devices can't be used for request-based dm */
 975        list_for_each_entry(dd, devices, list) {
 976                struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
 977
 978                if (!blk_queue_stackable(q)) {
 979                        DMERR("table load rejected: including"
 980                              " non-request-stackable devices");
 981                        return -EINVAL;
 982                }
 983
 984                if (q->mq_ops)
 985                        mq_count++;
 986                else
 987                        sq_count++;
 988        }
 989        if (sq_count && mq_count) {
 990                DMERR("table load rejected: not all devices are blk-mq request-stackable");
 991                return -EINVAL;
 992        }
 993        t->all_blk_mq = mq_count > 0;
 994
 995        if (t->type == DM_TYPE_MQ_REQUEST_BASED && !t->all_blk_mq) {
 996                DMERR("table load rejected: all devices are not blk-mq request-stackable");
 997                return -EINVAL;
 998        }
 999
1000        return 0;
1001}
1002
1003enum dm_queue_mode dm_table_get_type(struct dm_table *t)
1004{
1005        return t->type;
1006}
1007
1008struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
1009{
1010        return t->immutable_target_type;
1011}
1012
1013struct dm_target *dm_table_get_immutable_target(struct dm_table *t)
1014{
1015        /* Immutable target is implicitly a singleton */
1016        if (t->num_targets > 1 ||
1017            !dm_target_is_immutable(t->targets[0].type))
1018                return NULL;
1019
1020        return t->targets;
1021}
1022
1023struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
1024{
1025        struct dm_target *ti;
1026        unsigned i;
1027
1028        for (i = 0; i < dm_table_get_num_targets(t); i++) {
1029                ti = dm_table_get_target(t, i);
1030                if (dm_target_is_wildcard(ti->type))
1031                        return ti;
1032        }
1033
1034        return NULL;
1035}
1036
1037bool dm_table_bio_based(struct dm_table *t)
1038{
1039        return __table_type_bio_based(dm_table_get_type(t));
1040}
1041
1042bool dm_table_request_based(struct dm_table *t)
1043{
1044        return __table_type_request_based(dm_table_get_type(t));
1045}
1046
1047bool dm_table_all_blk_mq_devices(struct dm_table *t)
1048{
1049        return t->all_blk_mq;
1050}
1051
1052static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
1053{
1054        enum dm_queue_mode type = dm_table_get_type(t);
1055        unsigned per_io_data_size = 0;
1056        struct dm_target *tgt;
1057        unsigned i;
1058
1059        if (unlikely(type == DM_TYPE_NONE)) {
1060                DMWARN("no table type is set, can't allocate mempools");
1061                return -EINVAL;
1062        }
1063
1064        if (__table_type_bio_based(type))
1065                for (i = 0; i < t->num_targets; i++) {
1066                        tgt = t->targets + i;
1067                        per_io_data_size = max(per_io_data_size, tgt->per_io_data_size);
1068                }
1069
1070        t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_io_data_size);
1071        if (!t->mempools)
1072                return -ENOMEM;
1073
1074        return 0;
1075}
1076
1077void dm_table_free_md_mempools(struct dm_table *t)
1078{
1079        dm_free_md_mempools(t->mempools);
1080        t->mempools = NULL;
1081}
1082
1083struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t)
1084{
1085        return t->mempools;
1086}
1087
1088static int setup_indexes(struct dm_table *t)
1089{
1090        int i;
1091        unsigned int total = 0;
1092        sector_t *indexes;
1093
1094        /* allocate the space for *all* the indexes */
1095        for (i = t->depth - 2; i >= 0; i--) {
1096                t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
1097                total += t->counts[i];
1098        }
1099
1100        indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE);
1101        if (!indexes)
1102                return -ENOMEM;
1103
1104        /* set up internal nodes, bottom-up */
1105        for (i = t->depth - 2; i >= 0; i--) {
1106                t->index[i] = indexes;
1107                indexes += (KEYS_PER_NODE * t->counts[i]);
1108                setup_btree_index(i, t);
1109        }
1110
1111        return 0;
1112}
1113
1114/*
1115 * Builds the btree to index the map.
1116 */
1117static int dm_table_build_index(struct dm_table *t)
1118{
1119        int r = 0;
1120        unsigned int leaf_nodes;
1121
1122        /* how many indexes will the btree have ? */
1123        leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
1124        t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
1125
1126        /* leaf layer has already been set up */
1127        t->counts[t->depth - 1] = leaf_nodes;
1128        t->index[t->depth - 1] = t->highs;
1129
1130        if (t->depth >= 2)
1131                r = setup_indexes(t);
1132
1133        return r;
1134}
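/*
 * Worked example (assuming KEYS_PER_NODE == 8, CHILDREN_PER_NODE == 9):
 * a table with 100 targets has leaf_nodes = dm_div_up(100, 8) = 13 and
 * depth = 1 + int_log(13, 9) = 3; setup_indexes() then sets up
 * counts[1] = 2 and counts[0] = 1 internal nodes above the leaf level.
 */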
1135
1136/*
1137 * Get a disk whose integrity profile reflects the table's profile.
1138 * If %match_all is true, all devices' profiles must match.
1139 * If %match_all is false, all devices must at least have an
1140 * allocated integrity profile; but uninitialized is ok.
1141 * Returns NULL if integrity support was inconsistent or unavailable.
1142 */
1143static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
1144                                                    bool match_all)
1145{
1146        struct list_head *devices = dm_table_get_devices(t);
1147        struct dm_dev_internal *dd = NULL;
1148        struct gendisk *prev_disk = NULL, *template_disk = NULL;
1149
1150        list_for_each_entry(dd, devices, list) {
1151                template_disk = dd->dm_dev->bdev->bd_disk;
1152                if (!blk_get_integrity(template_disk))
1153                        goto no_integrity;
1154                if (!match_all && !blk_integrity_is_initialized(template_disk))
1155                        continue; /* skip uninitialized profiles */
1156                else if (prev_disk &&
1157                         blk_integrity_compare(prev_disk, template_disk) < 0)
1158                        goto no_integrity;
1159                prev_disk = template_disk;
1160        }
1161
1162        return template_disk;
1163
1164no_integrity:
1165        if (prev_disk)
1166                DMWARN("%s: integrity not set: %s and %s profile mismatch",
1167                       dm_device_name(t->md),
1168                       prev_disk->disk_name,
1169                       template_disk->disk_name);
1170        return NULL;
1171}
1172
1173/*
1174 * Register the mapped device for blk_integrity support if
1175 * the underlying devices have an integrity profile.  But all devices
1176 * may not have matching profiles (checking all devices isn't reliable
1177 * during table load because this table may use other DM device(s) which
 1178 * must be resumed before they will have an initialized integrity profile).
1179 * Stacked DM devices force a 2 stage integrity profile validation:
1180 * 1 - during load, validate all initialized integrity profiles match
1181 * 2 - during resume, validate all integrity profiles match
1182 */
1183static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md)
1184{
1185        struct gendisk *template_disk = NULL;
1186
1187        template_disk = dm_table_get_integrity_disk(t, false);
1188        if (!template_disk)
1189                return 0;
1190
1191        if (!blk_integrity_is_initialized(dm_disk(md))) {
1192                t->integrity_supported = true;
1193                return blk_integrity_register(dm_disk(md), NULL);
1194        }
1195
1196        /*
 1197         * If the DM device already has an initialized integrity
1198         * profile the new profile should not conflict.
1199         */
1200        if (blk_integrity_is_initialized(template_disk) &&
1201            blk_integrity_compare(dm_disk(md), template_disk) < 0) {
1202                DMWARN("%s: conflict with existing integrity profile: "
1203                       "%s profile mismatch",
1204                       dm_device_name(t->md),
1205                       template_disk->disk_name);
1206                return 1;
1207        }
1208
1209        /* Preserve existing initialized integrity profile */
1210        t->integrity_supported = true;
1211        return 0;
1212}
1213
1214/*
 1215 * Prepares the table for use by determining its type, building the btree
 1216 * indices, registering the integrity profile and allocating mempools.
1217 */
1218int dm_table_complete(struct dm_table *t)
1219{
1220        int r;
1221
1222        r = dm_table_determine_type(t);
1223        if (r) {
1224                DMERR("unable to determine table type");
1225                return r;
1226        }
1227
1228        r = dm_table_build_index(t);
1229        if (r) {
1230                DMERR("unable to build btrees");
1231                return r;
1232        }
1233
1234        r = dm_table_prealloc_integrity(t, t->md);
1235        if (r) {
1236                DMERR("could not register integrity profile.");
1237                return r;
1238        }
1239
1240        r = dm_table_alloc_md_mempools(t, t->md);
1241        if (r)
1242                DMERR("unable to allocate mempools");
1243
1244        return r;
1245}
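/*
 * For reference, the ioctl layer drives a table load roughly as follows
 * (simplified sketch, not verbatim from dm-ioctl):
 *
 *	dm_table_create(&t, mode, num_targets, md);
 *	for (each target line)
 *		dm_table_add_target(t, type, start, len, params);
 *	dm_table_complete(t);
 */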
1246
1247static DEFINE_MUTEX(_event_lock);
1248void dm_table_event_callback(struct dm_table *t,
1249                             void (*fn)(void *), void *context)
1250{
1251        mutex_lock(&_event_lock);
1252        t->event_fn = fn;
1253        t->event_context = context;
1254        mutex_unlock(&_event_lock);
1255}
1256
1257void dm_table_event(struct dm_table *t)
1258{
1259        /*
1260         * You can no longer call dm_table_event() from interrupt
1261         * context, use a bottom half instead.
1262         */
1263        BUG_ON(in_interrupt());
1264
1265        mutex_lock(&_event_lock);
1266        if (t->event_fn)
1267                t->event_fn(t->event_context);
1268        mutex_unlock(&_event_lock);
1269}
1270EXPORT_SYMBOL(dm_table_event);
1271
1272sector_t dm_table_get_size(struct dm_table *t)
1273{
1274        return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
1275}
1276EXPORT_SYMBOL(dm_table_get_size);
1277
1278struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
1279{
1280        if (index >= t->num_targets)
1281                return NULL;
1282
1283        return t->targets + index;
1284}
1285
1286/*
1287 * Search the btree for the correct target.
1288 *
1289 * Caller should check returned pointer with dm_target_is_valid()
1290 * to trap I/O beyond end of device.
1291 */
1292struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
1293{
1294        unsigned int l, n = 0, k = 0;
1295        sector_t *node;
1296
1297        for (l = 0; l < t->depth; l++) {
1298                n = get_child(n, k);
1299                node = get_node(t, l, n);
1300
1301                for (k = 0; k < KEYS_PER_NODE; k++)
1302                        if (node[k] >= sector)
1303                                break;
1304        }
1305
1306        return &t->targets[(KEYS_PER_NODE * n) + k];
1307}
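/*
 * Typical caller pattern (sketch):
 *
 *	ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
 *	if (!dm_target_is_valid(ti))
 *		return -EIO;
 *
 * where an invalid target means the sector is past the end of the device.
 */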
1308
1309static int count_device(struct dm_target *ti, struct dm_dev *dev,
1310                        sector_t start, sector_t len, void *data)
1311{
1312        unsigned *num_devices = data;
1313
1314        (*num_devices)++;
1315
1316        return 0;
1317}
1318
1319/*
1320 * Check whether a table has no data devices attached using each
1321 * target's iterate_devices method.
1322 * Returns false if the result is unknown because a target doesn't
1323 * support iterate_devices.
1324 */
1325bool dm_table_has_no_data_devices(struct dm_table *table)
1326{
1327        struct dm_target *ti;
1328        unsigned i, num_devices;
1329
1330        for (i = 0; i < dm_table_get_num_targets(table); i++) {
1331                ti = dm_table_get_target(table, i);
1332
1333                if (!ti->type->iterate_devices)
1334                        return false;
1335
1336                num_devices = 0;
1337                ti->type->iterate_devices(ti, count_device, &num_devices);
1338                if (num_devices)
1339                        return false;
1340        }
1341
1342        return true;
1343}
1344
1345/*
1346 * Establish the new table's queue_limits and validate them.
1347 */
1348int dm_calculate_queue_limits(struct dm_table *table,
1349                              struct queue_limits *limits)
1350{
1351        struct dm_target *uninitialized_var(ti);
1352        struct queue_limits ti_limits;
1353        struct queue_limits_aux ti_limits_aux;
1354        unsigned i;
1355
1356        blk_set_stacking_limits(limits);
1357
1358        /* 
 1359         * Initialize the limits_aux pointer to the on-stack
 1360         * queue_limits_aux structure.
1361         */
1362        ti_limits.limits_aux = &ti_limits_aux;
1363
1364        for (i = 0; i < dm_table_get_num_targets(table); i++) {
1365                blk_set_stacking_limits(&ti_limits);
1366
1367                ti = dm_table_get_target(table, i);
1368
1369                if (!ti->type->iterate_devices)
1370                        goto combine_limits;
1371
1372                /*
1373                 * Combine queue limits of all the devices this target uses.
1374                 */
1375                ti->type->iterate_devices(ti, dm_set_device_limits,
1376                                          &ti_limits);
1377
1378                /* Set I/O hints portion of queue limits */
1379                if (ti->type->io_hints)
1380                        ti->type->io_hints(ti, &ti_limits);
1381
1382                /*
1383                 * Check each device area is consistent with the target's
1384                 * overall queue limits.
1385                 */
1386                if (ti->type->iterate_devices(ti, device_area_is_invalid,
1387                                              &ti_limits))
1388                        return -EINVAL;
1389
1390combine_limits:
1391                /*
1392                 * Merge this target's queue limits into the overall limits
1393                 * for the table.
1394                 */
1395                if (blk_stack_limits(limits, &ti_limits, 0) < 0)
1396                        DMWARN("%s: adding target device "
1397                               "(start sect %llu len %llu) "
1398                               "caused an alignment inconsistency",
1399                               dm_device_name(table->md),
1400                               (unsigned long long) ti->begin,
1401                               (unsigned long long) ti->len);
1402        }
1403
1404        return validate_hardware_logical_block_alignment(table, limits);
1405}
1406
1407/*
1408 * Set the integrity profile for this device if all devices used have
1409 * matching profiles.  We're quite deep in the resume path but still
1410 * don't know if all devices (particularly DM devices this device
1411 * may be stacked on) have matching profiles.  Even if the profiles
1412 * don't match we have no way to fail (to resume) at this point.
1413 */
1414static void dm_table_set_integrity(struct dm_table *t)
1415{
1416        struct gendisk *template_disk = NULL;
1417
1418        if (!blk_get_integrity(dm_disk(t->md)))
1419                return;
1420
1421        template_disk = dm_table_get_integrity_disk(t, true);
1422        if (template_disk)
1423                blk_integrity_register(dm_disk(t->md),
1424                                       blk_get_integrity(template_disk));
1425        else if (blk_integrity_is_initialized(dm_disk(t->md)))
1426                DMWARN("%s: device no longer has a valid integrity profile",
1427                       dm_device_name(t->md));
1428        else
1429                DMWARN("%s: unable to establish an integrity profile",
1430                       dm_device_name(t->md));
1431}
1432
1433static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
1434                                sector_t start, sector_t len, void *data)
1435{
1436        unsigned flush = (*(unsigned *)data);
1437        struct request_queue *q = bdev_get_queue(dev->bdev);
1438
1439        return q && (q->flush_flags & flush);
1440}
1441
1442static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
1443{
1444        struct dm_target *ti;
1445        unsigned i;
1446
1447        /*
1448         * Require at least one underlying device to support flushes.
1449         * t->devices includes internal dm devices such as mirror logs
1450         * so we need to use iterate_devices here, which targets
1451         * supporting flushes must provide.
1452         */
1453        for (i = 0; i < dm_table_get_num_targets(t); i++) {
1454                ti = dm_table_get_target(t, i);
1455
1456                if (!ti->num_flush_bios)
1457                        continue;
1458
1459                if (ti->flush_supported)
1460                        return true;
1461
1462                if (ti->type->iterate_devices &&
1463                    ti->type->iterate_devices(ti, device_flush_capable, &flush))
1464                        return true;
1465        }
1466
1467        return false;
1468}
1469
1470static bool dm_table_discard_zeroes_data(struct dm_table *t)
1471{
1472        struct dm_target *ti;
1473        unsigned i = 0;
1474
 1475        /* Ensure that all targets support discard_zeroes_data. */
1476        while (i < dm_table_get_num_targets(t)) {
1477                ti = dm_table_get_target(t, i++);
1478
1479                if (ti->discard_zeroes_data_unsupported)
1480                        return false;
1481        }
1482
1483        return true;
1484}
1485
1486static int device_dax_write_cache_enabled(struct dm_target *ti,
1487                                          struct dm_dev *dev, sector_t start,
1488                                          sector_t len, void *data)
1489{
1490        struct dax_device *dax_dev = dev->dax_dev;
1491
1492        if (!dax_dev)
1493                return false;
1494
1495        if (dax_write_cache_enabled(dax_dev))
1496                return true;
1497        return false;
1498}
1499
1500static int dm_table_supports_dax_write_cache(struct dm_table *t)
1501{
1502        struct dm_target *ti;
1503        unsigned i;
1504
1505        for (i = 0; i < dm_table_get_num_targets(t); i++) {
1506                ti = dm_table_get_target(t, i);
1507
1508                if (ti->type->iterate_devices &&
1509                    ti->type->iterate_devices(ti,
1510                                device_dax_write_cache_enabled, NULL))
1511                        return true;
1512        }
1513
1514        return false;
1515}
1516
1517static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
1518                            sector_t start, sector_t len, void *data)
1519{
1520        struct request_queue *q = bdev_get_queue(dev->bdev);
1521
1522        return q && blk_queue_nonrot(q);
1523}
1524
1525static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
1526                             sector_t start, sector_t len, void *data)
1527{
1528        struct request_queue *q = bdev_get_queue(dev->bdev);
1529
1530        return q && !blk_queue_add_random(q);
1531}
1532
1533static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
1534                                   sector_t start, sector_t len, void *data)
1535{
1536        struct request_queue *q = bdev_get_queue(dev->bdev);
1537
1538        return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
1539}
1540
1541static bool dm_table_all_devices_attribute(struct dm_table *t,
1542                                           iterate_devices_callout_fn func)
1543{
1544        struct dm_target *ti;
1545        unsigned i;
1546
1547        for (i = 0; i < dm_table_get_num_targets(t); i++) {
1548                ti = dm_table_get_target(t, i);
1549
1550                if (!ti->type->iterate_devices ||
1551                    !ti->type->iterate_devices(ti, func, NULL))
1552                        return false;
1553        }
1554
1555        return true;
1556}
1557
1558static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
1559                                         sector_t start, sector_t len, void *data)
1560{
1561        struct request_queue *q = bdev_get_queue(dev->bdev);
1562
1563        return q && !q->limits.max_write_same_sectors;
1564}
1565
1566static bool dm_table_supports_write_same(struct dm_table *t)
1567{
1568        struct dm_target *ti;
1569        unsigned i;
1570
1571        for (i = 0; i < dm_table_get_num_targets(t); i++) {
1572                ti = dm_table_get_target(t, i);
1573
1574                if (!ti->num_write_same_bios)
1575                        return false;
1576
1577                if (!ti->type->iterate_devices ||
1578                    ti->type->iterate_devices(ti, device_not_write_same_capable, NULL))
1579                        return false;
1580        }
1581
1582        return true;
1583}
1584
1585static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev,
1586                                      sector_t start, sector_t len, void *data)
1587{
1588        struct request_queue *q = bdev_get_queue(dev->bdev);
1589
1590        return q && !blk_queue_discard(q);
1591}
1592
1593static bool dm_table_supports_discards(struct dm_table *t)
1594{
1595        struct dm_target *ti;
1596        unsigned i;
1597
1598        for (i = 0; i < dm_table_get_num_targets(t); i++) {
1599                ti = dm_table_get_target(t, i);
1600
1601                if (!ti->num_discard_bios)
1602                        return false;
1603
1604                /*
1605                 * Either the target provides discard support (as implied by setting
1606                 * 'discards_supported') or it relies on _all_ data devices having
1607                 * discard support.
1608                 */
1609                if (!ti->discards_supported &&
1610                    (!ti->type->iterate_devices ||
1611                     ti->type->iterate_devices(ti, device_not_discard_capable, NULL)))
1612                        return false;
1613        }
1614
1615        return true;
1616}
1617
1618void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1619                               struct queue_limits *limits)
1620{
1621        unsigned flush = 0;
1622        struct queue_limits_aux *limits_aux = q->limits.limits_aux;
1623
1624        /*
1625         * Copy table's limits to the DM device's request_queue
1626         */
1627        q->limits = *limits;
1628        memcpy(limits_aux, limits->limits_aux, sizeof(struct queue_limits_aux));
1629        q->limits.limits_aux = limits_aux;
1630
1631        if (!dm_table_supports_discards(t)) {
1632                queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
1633                /* Must also clear discard limits... */
1634                q->limits.max_discard_sectors = 0;
1635                q->limits.discard_granularity = 0;
1636                q->limits.discard_alignment = 0;
1637                q->limits.discard_misaligned = 0;
1638        } else
1639                queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
1640
1641        if (dm_table_supports_flush(t, REQ_FLUSH)) {
1642                flush |= REQ_FLUSH;
1643                if (dm_table_supports_flush(t, REQ_FUA))
1644                        flush |= REQ_FUA;
1645        }
1646        blk_queue_flush(q, flush);
1647
1648        if (dm_table_supports_dax(t))
1649                queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
1650        else
1651                queue_flag_clear_unlocked(QUEUE_FLAG_DAX, q);
1652
1653        if (dm_table_supports_dax_write_cache(t))
1654                dax_write_cache(t->md->dax_dev, true);
1655
1656        if (!dm_table_discard_zeroes_data(t))
1657                q->limits.discard_zeroes_data = 0;
1658
1659        /* Ensure that all underlying devices are non-rotational. */
1660        if (dm_table_all_devices_attribute(t, device_is_nonrot))
1661                queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
1662        else
1663                queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q);
1664
1665        if (!dm_table_supports_write_same(t))
1666                q->limits.max_write_same_sectors = 0;
1667
1668        if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
1669                queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
1670        else
1671                queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
1672
1673        dm_table_set_integrity(t);
1674
1675        /*
1676         * Determine whether or not this queue's I/O timings contribute
1677         * to the entropy pool. Only request-based targets use this.
1678         * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
1679         * have it set.
1680         */
1681        if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random))
1682                queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
1683
1684        /*
1685         * QUEUE_FLAG_STACKABLE must be set after all queue settings are
1686         * visible to other CPUs because, once the flag is set, incoming bios
1687         * are processed by request-based dm, which refers to the queue
1688         * settings.
1689         * Until the flag is set, bios are passed to bio-based dm and queued to
1690         * md->deferred, where queue settings are not needed yet.
1691         * Those bios are passed to request-based dm at resume time.
1692         */
1693        smp_mb();
1694        if (dm_table_request_based(t))
1695                queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
1696}
1697
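/*
 * Illustrative sketch only: the typical calling sequence, hedged -- the
 * real table-binding code lives in dm.c and is more involved (error
 * handling, request-based setup, initialisation of limits_aux, etc.).
 * "example_bind_table" is a made-up name; dm_calculate_queue_limits() and
 * dm_get_md_queue() are the existing helpers declared in the local dm
 * headers.
 */
static int example_bind_table(struct mapped_device *md, struct dm_table *t)
{
        struct queue_limits limits;
        struct request_queue *q = dm_get_md_queue(md);
        int r;

        /* Validate and stack the limits of every target in the table first. */
        r = dm_calculate_queue_limits(t, &limits);
        if (r)
                return r;

        /* Only then publish them to the live request_queue. */
        dm_table_set_restrictions(t, q, &limits);
        return 0;
}
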
1698unsigned int dm_table_get_num_targets(struct dm_table *t)
1699{
1700        return t->num_targets;
1701}
1702
1703struct list_head *dm_table_get_devices(struct dm_table *t)
1704{
1705        return &t->devices;
1706}
1707
1708fmode_t dm_table_get_mode(struct dm_table *t)
1709{
1710        return t->mode;
1711}
1712EXPORT_SYMBOL(dm_table_get_mode);
1713
1714enum suspend_mode {
1715        PRESUSPEND,
1716        PRESUSPEND_UNDO,
1717        POSTSUSPEND,
1718};
1719
1720static void suspend_targets(struct dm_table *t, enum suspend_mode mode)
1721{
1722        int i = t->num_targets;
1723        struct dm_target *ti = t->targets;
1724
1725        lockdep_assert_held(&t->md->suspend_lock);
1726
1727        while (i--) {
1728                switch (mode) {
1729                case PRESUSPEND:
1730                        if (ti->type->presuspend)
1731                                ti->type->presuspend(ti);
1732                        break;
1733                case PRESUSPEND_UNDO:
1734                        if (ti->type->presuspend_undo)
1735                                ti->type->presuspend_undo(ti);
1736                        break;
1737                case POSTSUSPEND:
1738                        if (ti->type->postsuspend)
1739                                ti->type->postsuspend(ti);
1740                        break;
1741                }
1742                ti++;
1743        }
1744}
1745
1746void dm_table_presuspend_targets(struct dm_table *t)
1747{
1748        if (!t)
1749                return;
1750
1751        suspend_targets(t, PRESUSPEND);
1752}
1753
1754void dm_table_presuspend_undo_targets(struct dm_table *t)
1755{
1756        if (!t)
1757                return;
1758
1759        suspend_targets(t, PRESUSPEND_UNDO);
1760}
1761
1762void dm_table_postsuspend_targets(struct dm_table *t)
1763{
1764        if (!t)
1765                return;
1766
1767        suspend_targets(t, POSTSUSPEND);
1768}
1769
1770int dm_table_resume_targets(struct dm_table *t)
1771{
1772        int i, r = 0;
1773
1774        lockdep_assert_held(&t->md->suspend_lock);
1775
1776        for (i = 0; i < t->num_targets; i++) {
1777                struct dm_target *ti = t->targets + i;
1778
1779                if (!ti->type->preresume)
1780                        continue;
1781
1782                r = ti->type->preresume(ti);
1783                if (r) {
1784                        DMERR("%s: %s: preresume failed, error = %d",
1785                              dm_device_name(t->md), ti->type->name, r);
1786                        return r;
1787                }
1788        }
1789
1790        for (i = 0; i < t->num_targets; i++) {
1791                struct dm_target *ti = t->targets + i;
1792
1793                if (ti->type->resume)
1794                        ti->type->resume(ti);
1795        }
1796
1797        return 0;
1798}
1799
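/*
 * Illustrative sketch only: a hypothetical target type wiring up the hooks
 * invoked by suspend_targets() and dm_table_resume_targets() above.  The
 * "example_*" names do not exist in the tree; .ctr, .dtr and .map are
 * omitted for brevity although a real target must provide them.
 */
static void example_postsuspend(struct dm_target *ti)
{
        /* Quiesce target-private state; the device is already suspended. */
}

static int example_preresume(struct dm_target *ti)
{
        /*
         * Returning non-zero here makes dm_table_resume_targets() fail the
         * resume before any ->resume hook runs.
         */
        return 0;
}

static struct target_type example_target = {
        .name        = "example",
        .version     = {1, 0, 0},
        .module      = THIS_MODULE,
        .postsuspend = example_postsuspend,
        .preresume   = example_preresume,
        /* .presuspend, .presuspend_undo and .resume follow the same shape. */
};
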
1800void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb)
1801{
1802        list_add(&cb->list, &t->target_callbacks);
1803}
1804EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks);
1805
1806int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1807{
1808        struct dm_dev_internal *dd;
1809        struct list_head *devices = dm_table_get_devices(t);
1810        struct dm_target_callbacks *cb;
1811        int r = 0;
1812
1813        list_for_each_entry(dd, devices, list) {
1814                struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
1815                char b[BDEVNAME_SIZE];
1816
1817                if (likely(q))
1818                        r |= bdi_congested(&q->backing_dev_info, bdi_bits);
1819                else
1820                        DMWARN_LIMIT("%s: any_congested: nonexistent device %s",
1821                                     dm_device_name(t->md),
1822                                     bdevname(dd->dm_dev->bdev, b));
1823        }
1824
1825        list_for_each_entry(cb, &t->target_callbacks, list)
1826                if (cb->congested_fn)
1827                        r |= cb->congested_fn(cb, bdi_bits);
1828
1829        return r;
1830}
1831
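/*
 * Illustrative sketch only: how a target with internal queueing (dm-raid
 * and dm-thin do something similar) could feed extra state into
 * dm_table_any_congested() via the callback list above.  The "example_*"
 * names and the internal_queue_busy field are hypothetical.
 */
struct example_ctx {
        struct dm_target_callbacks callbacks;
        bool internal_queue_busy;       /* hypothetical private state */
};

static int example_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
        struct example_ctx *ctx = container_of(cb, struct example_ctx, callbacks);

        /*
         * Report the requested bits as congested while the target's own
         * queue is backed up; dm_table_any_congested() ORs this into the
         * result gathered from the underlying devices.
         */
        return ctx->internal_queue_busy ? bdi_bits : 0;
}

/* Typically wired up from the target's constructor: */
static void example_register_callbacks(struct dm_target *ti, struct example_ctx *ctx)
{
        ctx->callbacks.congested_fn = example_congested;
        dm_table_add_target_callbacks(ti->table, &ctx->callbacks);
}
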
1832struct mapped_device *dm_table_get_md(struct dm_table *t)
1833{
1834        return t->md;
1835}
1836EXPORT_SYMBOL(dm_table_get_md);
1837
1838void dm_table_run_md_queue_async(struct dm_table *t)
1839{
1840        struct mapped_device *md;
1841        struct request_queue *queue;
1842        unsigned long flags;
1843
1844        if (!dm_table_request_based(t))
1845                return;
1846
1847        md = dm_table_get_md(t);
1848        queue = dm_get_md_queue(md);
1849        if (queue) {
1850                if (queue->mq_ops)
1851                        blk_mq_run_hw_queues(queue, true);
1852                else {
1853                        spin_lock_irqsave(queue->queue_lock, flags);
1854                        blk_run_queue_async(queue);
1855                        spin_unlock_irqrestore(queue->queue_lock, flags);
1856                }
1857        }
1858}
1859EXPORT_SYMBOL(dm_table_run_md_queue_async);
1860
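/*
 * Illustrative sketch only: a request-based target (multipath is the kind
 * of user this export is meant for) kicks the mapped device's queue once
 * requests can make progress again, e.g. after a path becomes usable.
 * "example_kick_queue" is a made-up helper; ti->table is the table the
 * target belongs to.
 */
static void example_kick_queue(struct dm_target *ti)
{
        dm_table_run_md_queue_async(ti->table);
}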
1861