linux/drivers/md/dm-switch.c
<<
>>
Prefs
/*
 * Copyright (C) 2010-2012 by Dell Inc.  All rights reserved.
 * Copyright (C) 2011-2013 Red Hat, Inc.
 *
 * This file is released under the GPL.
 *
 * dm-switch is a device-mapper target that maps IO to underlying block
 * devices efficiently when there are a large number of fixed-sized
 * address regions but there is no simple pattern to allow for a compact
 * mapping representation such as dm-stripe.
 */
  12
  13#include <linux/device-mapper.h>
  14
  15#include <linux/module.h>
  16#include <linux/init.h>
  17#include <linux/vmalloc.h>
  18
  19#define DM_MSG_PREFIX "switch"
  20
  21/*
  22 * One region_table_slot_t holds <region_entries_per_slot> region table
  23 * entries each of which is <region_table_entry_bits> in size.
  24 */
  25typedef unsigned long region_table_slot_t;
  26
  27/*
  28 * A device with the offset to its start sector.
  29 */
  30struct switch_path {
  31        struct dm_dev *dmdev;
  32        sector_t start;
  33};
  34
  35/*
  36 * Context block for a dm switch device.
  37 */
  38struct switch_ctx {
  39        struct dm_target *ti;
  40
  41        unsigned nr_paths;              /* Number of paths in path_list. */
  42
  43        unsigned region_size;           /* Region size in 512-byte sectors */
  44        unsigned long nr_regions;       /* Number of regions making up the device */
  45        signed char region_size_bits;   /* log2 of region_size or -1 */
  46
  47        unsigned char region_table_entry_bits;  /* Number of bits in one region table entry */
  48        unsigned char region_entries_per_slot;  /* Number of entries in one region table slot */
  49        signed char region_entries_per_slot_bits;       /* log2 of region_entries_per_slot or -1 */
  50
  51        region_table_slot_t *region_table;      /* Region table */
  52
  53        /*
  54         * Array of dm devices to switch between.
  55         */
  56        struct switch_path path_list[0];
  57};
  58
  59static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
  60                                           unsigned region_size)
  61{
  62        struct switch_ctx *sctx;
  63
  64        sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path),
  65                       GFP_KERNEL);
  66        if (!sctx)
  67                return NULL;
  68
  69        sctx->ti = ti;
  70        sctx->region_size = region_size;
  71
  72        ti->private = sctx;
  73
  74        return sctx;
  75}
  76
  77static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
  78{
  79        struct switch_ctx *sctx = ti->private;
  80        sector_t nr_regions = ti->len;
  81        sector_t nr_slots;
  82
  83        if (!(sctx->region_size & (sctx->region_size - 1)))
  84                sctx->region_size_bits = __ffs(sctx->region_size);
  85        else
  86                sctx->region_size_bits = -1;
  87
  88        sctx->region_table_entry_bits = 1;
  89        while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
  90               (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
  91                sctx->region_table_entry_bits++;
  92
  93        sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
  94        if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
  95                sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
  96        else
  97                sctx->region_entries_per_slot_bits = -1;
  98
  99        if (sector_div(nr_regions, sctx->region_size))
 100                nr_regions++;
 101
 102        if (nr_regions >= ULONG_MAX) {
 103                ti->error = "Region table too large";
 104                return -EINVAL;
 105        }
 106        sctx->nr_regions = nr_regions;
 107
 108        nr_slots = nr_regions;
 109        if (sector_div(nr_slots, sctx->region_entries_per_slot))
 110                nr_slots++;
 111
 112        if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
 113                ti->error = "Region table too large";
 114                return -EINVAL;
 115        }
 116
 117        sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t));
 118        if (!sctx->region_table) {
 119                ti->error = "Cannot allocate region table";
 120                return -ENOMEM;
 121        }
 122
 123        return 0;
 124}
 125
 126static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
 127                                unsigned long *region_index, unsigned *bit)
 128{
 129        if (sctx->region_entries_per_slot_bits >= 0) {
 130                *region_index = region_nr >> sctx->region_entries_per_slot_bits;
 131                *bit = region_nr & (sctx->region_entries_per_slot - 1);
 132        } else {
 133                *region_index = region_nr / sctx->region_entries_per_slot;
 134                *bit = region_nr % sctx->region_entries_per_slot;
 135        }
 136
 137        *bit *= sctx->region_table_entry_bits;
 138}
 139
 140static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
 141{
 142        unsigned long region_index;
 143        unsigned bit;
 144
 145        switch_get_position(sctx, region_nr, &region_index, &bit);
 146
 147        return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
 148                ((1 << sctx->region_table_entry_bits) - 1);
 149}
 150
 151/*
 152 * Find which path to use at given offset.
 153 */
 154static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
 155{
 156        unsigned path_nr;
 157        sector_t p;
 158
 159        p = offset;
 160        if (sctx->region_size_bits >= 0)
 161                p >>= sctx->region_size_bits;
 162        else
 163                sector_div(p, sctx->region_size);
 164
 165        path_nr = switch_region_table_read(sctx, p);
 166
 167        /* This can only happen if the processor uses non-atomic stores. */
 168        if (unlikely(path_nr >= sctx->nr_paths))
 169                path_nr = 0;
 170
 171        return path_nr;
 172}
 173
 174static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
 175                                      unsigned value)
 176{
 177        unsigned long region_index;
 178        unsigned bit;
 179        region_table_slot_t pte;
 180
 181        switch_get_position(sctx, region_nr, &region_index, &bit);
 182
 183        pte = sctx->region_table[region_index];
 184        pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
 185        pte |= (region_table_slot_t)value << bit;
 186        sctx->region_table[region_index] = pte;
 187}
 188
 189/*
 190 * Fill the region table with an initial round robin pattern.
 191 */
 192static void initialise_region_table(struct switch_ctx *sctx)
 193{
 194        unsigned path_nr = 0;
 195        unsigned long region_nr;
 196
 197        for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
 198                switch_region_table_write(sctx, region_nr, path_nr);
 199                if (++path_nr >= sctx->nr_paths)
 200                        path_nr = 0;
 201        }
 202}
 203
 204static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
 205{
 206        struct switch_ctx *sctx = ti->private;
 207        unsigned long long start;
 208        int r;
 209
 210        r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
 211                          &sctx->path_list[sctx->nr_paths].dmdev);
 212        if (r) {
 213                ti->error = "Device lookup failed";
 214                return r;
 215        }
 216
 217        if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
 218                ti->error = "Invalid device starting offset";
 219                dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
 220                return -EINVAL;
 221        }
 222
 223        sctx->path_list[sctx->nr_paths].start = start;
 224
 225        sctx->nr_paths++;
 226
 227        return 0;
 228}
 229
 230/*
 231 * Destructor: Don't free the dm_target, just the ti->private data (if any).
 232 */
 233static void switch_dtr(struct dm_target *ti)
 234{
 235        struct switch_ctx *sctx = ti->private;
 236
 237        while (sctx->nr_paths--)
 238                dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
 239
 240        vfree(sctx->region_table);
 241        kfree(sctx);
 242}
 243
 244/*
 245 * Constructor arguments:
 246 *   <num_paths> <region_size> <num_optional_args> [<optional_args>...]
 247 *   [<dev_path> <offset>]+
 248 *
 249 * Optional args are to allow for future extension: currently this
 250 * parameter must be 0.
 251 */
 252static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
 253{
 254        static const struct dm_arg _args[] = {
 255                {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
 256                {1, UINT_MAX, "Invalid region size"},
 257                {0, 0, "Invalid number of optional args"},
 258        };
 259
 260        struct switch_ctx *sctx;
 261        struct dm_arg_set as;
 262        unsigned nr_paths, region_size, nr_optional_args;
 263        int r;
 264
 265        as.argc = argc;
 266        as.argv = argv;
 267
 268        r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
 269        if (r)
 270                return -EINVAL;
 271
 272        r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
 273        if (r)
 274                return r;
 275
 276        r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
 277        if (r)
 278                return r;
 279        /* parse optional arguments here, if we add any */
 280
 281        if (as.argc != nr_paths * 2) {
 282                ti->error = "Incorrect number of path arguments";
 283                return -EINVAL;
 284        }
 285
 286        sctx = alloc_switch_ctx(ti, nr_paths, region_size);
 287        if (!sctx) {
 288                ti->error = "Cannot allocate redirection context";
 289                return -ENOMEM;
 290        }
 291
 292        r = dm_set_target_max_io_len(ti, region_size);
 293        if (r)
 294                goto error;
 295
 296        while (as.argc) {
 297                r = parse_path(&as, ti);
 298                if (r)
 299                        goto error;
 300        }
 301
 302        r = alloc_region_table(ti, nr_paths);
 303        if (r)
 304                goto error;
 305
 306        initialise_region_table(sctx);
 307
 308        /* For UNMAP, sending the request down any path is sufficient */
 309        ti->num_discard_bios = 1;
 310
 311        return 0;
 312
 313error:
 314        switch_dtr(ti);
 315
 316        return r;
 317}
 318
 319static int switch_map(struct dm_target *ti, struct bio *bio)
 320{
 321        struct switch_ctx *sctx = ti->private;
 322        sector_t offset = dm_target_offset(ti, bio->bi_sector);
 323        unsigned path_nr = switch_get_path_nr(sctx, offset);
 324
 325        bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
 326        bio->bi_sector = sctx->path_list[path_nr].start + offset;
 327
 328        return DM_MAPIO_REMAPPED;
 329}
 330
 331/*
 332 * We need to parse hex numbers in the message as quickly as possible.
 333 *
 334 * This table-based hex parser improves performance.
 335 * It improves a time to load 1000000 entries compared to the condition-based
 336 * parser.
 337 *              table-based parser      condition-based parser
 338 * PA-RISC      0.29s                   0.31s
 339 * Opteron      0.0495s                 0.0498s
 340 */
 341static const unsigned char hex_table[256] = {
 342255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 343255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 344255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 3450, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
 346255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 347255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 348255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 349255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 350255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 351255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 352255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 353255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 354255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 355255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 356255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 357255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
 358};
 359
 360static __always_inline unsigned long parse_hex(const char **string)
 361{
 362        unsigned char d;
 363        unsigned long r = 0;
 364
 365        while ((d = hex_table[(unsigned char)**string]) < 16) {
 366                r = (r << 4) | d;
 367                (*string)++;
 368        }
 369
 370        return r;
 371}
 372
 373static int process_set_region_mappings(struct switch_ctx *sctx,
 374                                       unsigned argc, char **argv)
 375{
 376        unsigned i;
 377        unsigned long region_index = 0;
 378
 379        for (i = 1; i < argc; i++) {
 380                unsigned long path_nr;
 381                const char *string = argv[i];
 382
 383                if ((*string & 0xdf) == 'R') {
 384                        unsigned long cycle_length, num_write;
 385
 386                        string++;
 387                        if (unlikely(*string == ',')) {
 388                                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
 389                                return -EINVAL;
 390                        }
 391                        cycle_length = parse_hex(&string);
 392                        if (unlikely(*string != ',')) {
 393                                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
 394                                return -EINVAL;
 395                        }
 396                        string++;
 397                        if (unlikely(!*string)) {
 398                                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
 399                                return -EINVAL;
 400                        }
 401                        num_write = parse_hex(&string);
 402                        if (unlikely(*string)) {
 403                                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
 404                                return -EINVAL;
 405                        }
 406
 407                        if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
 408                                DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
 409                                       cycle_length - 1, region_index);
 410                                return -EINVAL;
 411                        }
 412                        if (unlikely(region_index + num_write < region_index) ||
 413                            unlikely(region_index + num_write >= sctx->nr_regions)) {
 414                                DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
 415                                       region_index, num_write, sctx->nr_regions);
 416                                return -EINVAL;
 417                        }
 418
 419                        while (num_write--) {
 420                                region_index++;
 421                                path_nr = switch_region_table_read(sctx, region_index - cycle_length);
 422                                switch_region_table_write(sctx, region_index, path_nr);
 423                        }
 424
 425                        continue;
 426                }
 427
 428                if (*string == ':')
 429                        region_index++;
 430                else {
 431                        region_index = parse_hex(&string);
 432                        if (unlikely(*string != ':')) {
 433                                DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
 434                                return -EINVAL;
 435                        }
 436                }
 437
 438                string++;
 439                if (unlikely(!*string)) {
 440                        DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
 441                        return -EINVAL;
 442                }
 443
 444                path_nr = parse_hex(&string);
 445                if (unlikely(*string)) {
 446                        DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
 447                        return -EINVAL;
 448                }
 449                if (unlikely(region_index >= sctx->nr_regions)) {
 450                        DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
 451                        return -EINVAL;
 452                }
 453                if (unlikely(path_nr >= sctx->nr_paths)) {
 454                        DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
 455                        return -EINVAL;
 456                }
 457
 458                switch_region_table_write(sctx, region_index, path_nr);
 459        }
 460
 461        return 0;
 462}
 463
 464/*
 465 * Messages are processed one-at-a-time.
 466 *
 467 * Only set_region_mappings is supported.
 468 */
 469static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
 470{
 471        static DEFINE_MUTEX(message_mutex);
 472
 473        struct switch_ctx *sctx = ti->private;
 474        int r = -EINVAL;
 475
 476        mutex_lock(&message_mutex);
 477
 478        if (!strcasecmp(argv[0], "set_region_mappings"))
 479                r = process_set_region_mappings(sctx, argc, argv);
 480        else
 481                DMWARN("Unrecognised message received.");
 482
 483        mutex_unlock(&message_mutex);
 484
 485        return r;
 486}
 487
 488static void switch_status(struct dm_target *ti, status_type_t type,
 489                          unsigned status_flags, char *result, unsigned maxlen)
 490{
 491        struct switch_ctx *sctx = ti->private;
 492        unsigned sz = 0;
 493        int path_nr;
 494
 495        switch (type) {
 496        case STATUSTYPE_INFO:
 497                result[0] = '\0';
 498                break;
 499
 500        case STATUSTYPE_TABLE:
 501                DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
 502                for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
 503                        DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
 504                               (unsigned long long)sctx->path_list[path_nr].start);
 505                break;
 506        }
 507}
 508
 509/*
 510 * Switch ioctl:
 511 *
 512 * Passthrough all ioctls to the path for sector 0
 513 */
 514static int switch_prepare_ioctl(struct dm_target *ti,
 515                struct block_device **bdev, fmode_t *mode)
 516{
 517        struct switch_ctx *sctx = ti->private;
 518        unsigned path_nr;
 519
 520        path_nr = switch_get_path_nr(sctx, 0);
 521
 522        *bdev = sctx->path_list[path_nr].dmdev->bdev;
 523        *mode = sctx->path_list[path_nr].dmdev->mode;
 524
 525        /*
 526         * Only pass ioctls through if the device sizes match exactly.
 527         */
 528        if (ti->len + sctx->path_list[path_nr].start !=
 529            i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
 530                return 1;
 531        return 0;
 532}
 533
 534static int switch_iterate_devices(struct dm_target *ti,
 535                                  iterate_devices_callout_fn fn, void *data)
 536{
 537        struct switch_ctx *sctx = ti->private;
 538        int path_nr;
 539        int r;
 540
 541        for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
 542                r = fn(ti, sctx->path_list[path_nr].dmdev,
 543                         sctx->path_list[path_nr].start, ti->len, data);
 544                if (r)
 545                        return r;
 546        }
 547
 548        return 0;
 549}
 550
 551static struct target_type switch_target = {
 552        .name = "switch",
 553        .version = {1, 1, 0},
 554        .module = THIS_MODULE,
 555        .ctr = switch_ctr,
 556        .dtr = switch_dtr,
 557        .map = switch_map,
 558        .message = switch_message,
 559        .status = switch_status,
 560        .prepare_ioctl = switch_prepare_ioctl,
 561        .iterate_devices = switch_iterate_devices,
 562};
 563
 564static int __init dm_switch_init(void)
 565{
 566        int r;
 567
 568        r = dm_register_target(&switch_target);
 569        if (r < 0)
 570                DMERR("dm_register_target() failed %d", r);
 571
 572        return r;
 573}
 574
 575static void __exit dm_switch_exit(void)
 576{
 577        dm_unregister_target(&switch_target);
 578}
 579
 580module_init(dm_switch_init);
 581module_exit(dm_switch_exit);
 582
 583MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
 584MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
 585MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
 586MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
 587MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
 588MODULE_LICENSE("GPL");
 589