linux/drivers/md/dm-io.c
/*
 * Copyright (C) 2003 Sistina Software
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include "dm-core.h"

#include <linux/device-mapper.h>

#include <linux/bio.h>
#include <linux/completion.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/dm-io.h>

#define DM_MSG_PREFIX "io"

#define DM_IO_MAX_REGIONS       BITS_PER_LONG

struct dm_io_client {
        mempool_t *pool;
        struct bio_set *bios;
};

/*
 * Aligning 'struct io' reduces the number of bits required to store
 * its address.  Refer to store_io_and_region_in_bio() below.
 */
struct io {
        unsigned long error_bits;
        atomic_t count;
        struct dm_io_client *client;
        io_notify_fn callback;
        void *context;
        void *vma_invalidate_address;
        unsigned long vma_invalidate_size;
} __attribute__((aligned(DM_IO_MAX_REGIONS)));

static struct kmem_cache *_dm_io_cache;

/*
 * Create a client with mempool and bioset.
 */
struct dm_io_client *dm_io_client_create(void)
{
        struct dm_io_client *client;
        unsigned min_ios = dm_get_reserved_bio_based_ios();

        client = kmalloc(sizeof(*client), GFP_KERNEL);
        if (!client)
                return ERR_PTR(-ENOMEM);

        client->pool = mempool_create_slab_pool(min_ios, _dm_io_cache);
        if (!client->pool)
                goto bad;

        client->bios = bioset_create(min_ios, 0);
        if (!client->bios)
                goto bad;

        return client;

   bad:
        mempool_destroy(client->pool);
        kfree(client);
        return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(dm_io_client_create);

void dm_io_client_destroy(struct dm_io_client *client)
{
        mempool_destroy(client->pool);
        bioset_free(client->bios);
        kfree(client);
}
EXPORT_SYMBOL(dm_io_client_destroy);
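
/*
 * Illustrative sketch (editorial addition, not part of the upstream file):
 * a hypothetical dm target would typically create one io client per target
 * instance in its constructor and release it in its destructor.  Note that
 * dm_io_client_create() returns -ENOMEM via ERR_PTR() on failure.  The
 * names my_target, my_ctr and my_dtr are assumptions made for this sketch.
 *
 *      struct my_target {
 *              struct dm_io_client *io_client;
 *      };
 *
 *      static int my_ctr(struct my_target *mt)
 *      {
 *              mt->io_client = dm_io_client_create();
 *              if (IS_ERR(mt->io_client))
 *                      return PTR_ERR(mt->io_client);
 *              return 0;
 *      }
 *
 *      static void my_dtr(struct my_target *mt)
 *      {
 *              dm_io_client_destroy(mt->io_client);
 *      }
 */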

/*-----------------------------------------------------------------
 * We need to keep track of which region a bio is doing io for.
 * To avoid a memory allocation to store just 5 or 6 bits, we
 * ensure the 'struct io' pointer is aligned so enough low bits are
 * always zero and then combine it with the region number directly in
 * bi_private.
 *---------------------------------------------------------------*/
static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
                                       unsigned region)
{
        if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
                DMCRIT("Unaligned struct io pointer %p", io);
                BUG();
        }

        bio->bi_private = (void *)((unsigned long)io | region);
}

static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
                                       unsigned *region)
{
        unsigned long val = (unsigned long)bio->bi_private;

        *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
        *region = val & (DM_IO_MAX_REGIONS - 1);
}
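
/*
 * Worked example (editorial note): on a 64-bit build BITS_PER_LONG is 64,
 * so 'struct io' is aligned to 64 bytes and the low six bits of its address
 * are always zero.  For an io object at 0xffff888012349a40 and region 5,
 * store_io_and_region_in_bio() sets bi_private to 0xffff888012349a45;
 * masking with -64UL (i.e. ~0x3f) recovers the pointer and masking with 63
 * recovers the region.  This packing is why dispatch_io() caps num_regions
 * at DM_IO_MAX_REGIONS.
 */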

/*-----------------------------------------------------------------
 * We need an io object to keep track of the number of bios that
 * have been dispatched for a particular io.
 *---------------------------------------------------------------*/
static void complete_io(struct io *io)
{
        unsigned long error_bits = io->error_bits;
        io_notify_fn fn = io->callback;
        void *context = io->context;

        if (io->vma_invalidate_size)
                invalidate_kernel_vmap_range(io->vma_invalidate_address,
                                             io->vma_invalidate_size);

        mempool_free(io, io->client->pool);
        fn(error_bits, context);
}

static void dec_count(struct io *io, unsigned int region, int error)
{
        if (error)
                set_bit(region, &io->error_bits);

        if (atomic_dec_and_test(&io->count))
                complete_io(io);
}

static void endio(struct bio *bio)
{
        struct io *io;
        unsigned region;
        int error;

        if (bio->bi_error && bio_data_dir(bio) == READ)
                zero_fill_bio(bio);

        /*
         * The bio destructor in bio_put() may use the io object.
         */
        retrieve_io_and_region_from_bio(bio, &io, &region);

        error = bio->bi_error;
        bio_put(bio);

        dec_count(io, region, error);
}

/*-----------------------------------------------------------------
 * These little objects provide an abstraction for getting a new
 * destination page for io.
 *---------------------------------------------------------------*/
struct dpages {
        void (*get_page)(struct dpages *dp,
                         struct page **p, unsigned long *len, unsigned *offset);
        void (*next_page)(struct dpages *dp);

        unsigned context_u;
        void *context_ptr;

        void *vma_invalidate_address;
        unsigned long vma_invalidate_size;
};
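
/*
 * Editorial note on the iterator contract (do_region() below is the real
 * consumer): get_page() reports the current page, the number of bytes
 * available in it from the current offset, and that offset; next_page()
 * advances to the following page and resets the intra-page offset.  If
 * bio_add_page() fails the bio is full, and the caller submits it and
 * allocates another before retrying the same page.  A minimal consumer,
 * assuming 'remaining' counts bytes left to transfer, looks roughly like:
 *
 *      while (remaining) {
 *              dp->get_page(dp, &page, &len, &offset);
 *              len = min(len, remaining);
 *              if (!bio_add_page(bio, page, len, offset))
 *                      break;
 *              remaining -= len;
 *              dp->next_page(dp);
 *      }
 */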

/*
 * Functions for getting the pages from a list.
 */
static void list_get_page(struct dpages *dp,
                  struct page **p, unsigned long *len, unsigned *offset)
{
        unsigned o = dp->context_u;
        struct page_list *pl = (struct page_list *) dp->context_ptr;

        *p = pl->page;
        *len = PAGE_SIZE - o;
        *offset = o;
}

static void list_next_page(struct dpages *dp)
{
        struct page_list *pl = (struct page_list *) dp->context_ptr;
        dp->context_ptr = pl->next;
        dp->context_u = 0;
}

static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset)
{
        dp->get_page = list_get_page;
        dp->next_page = list_next_page;
        dp->context_u = offset;
        dp->context_ptr = pl;
}

/*
 * Functions for getting the pages from a bvec.
 */
static void bio_get_page(struct dpages *dp, struct page **p,
                         unsigned long *len, unsigned *offset)
{
        struct bio_vec *bvec = dp->context_ptr;
        *p = bvec->bv_page;
        *len = bvec->bv_len - dp->context_u;
        *offset = bvec->bv_offset + dp->context_u;
}

static void bio_next_page(struct dpages *dp)
{
        struct bio_vec *bvec = dp->context_ptr;
        dp->context_ptr = bvec + 1;
        dp->context_u = 0;
}

static void bio_dp_init(struct dpages *dp, struct bio *bio)
{
        dp->get_page = bio_get_page;
        dp->next_page = bio_next_page;
        dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
        dp->context_u = bio->bi_iter.bi_bvec_done;
}

/*
 * Functions for getting the pages from a VMA.
 */
static void vm_get_page(struct dpages *dp,
                 struct page **p, unsigned long *len, unsigned *offset)
{
        *p = vmalloc_to_page(dp->context_ptr);
        *offset = dp->context_u;
        *len = PAGE_SIZE - dp->context_u;
}

static void vm_next_page(struct dpages *dp)
{
        dp->context_ptr += PAGE_SIZE - dp->context_u;
        dp->context_u = 0;
}

static void vm_dp_init(struct dpages *dp, void *data)
{
        dp->get_page = vm_get_page;
        dp->next_page = vm_next_page;
        dp->context_u = offset_in_page(data);
        dp->context_ptr = data;
}

/*
 * Functions for getting the pages from kernel memory.
 */
static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len,
                        unsigned *offset)
{
        *p = virt_to_page(dp->context_ptr);
        *offset = dp->context_u;
        *len = PAGE_SIZE - dp->context_u;
}

static void km_next_page(struct dpages *dp)
{
        dp->context_ptr += PAGE_SIZE - dp->context_u;
        dp->context_u = 0;
}

static void km_dp_init(struct dpages *dp, void *data)
{
        dp->get_page = km_get_page;
        dp->next_page = km_next_page;
        dp->context_u = offset_in_page(data);
        dp->context_ptr = data;
}

/*-----------------------------------------------------------------
 * IO routines that accept a list of pages.
 *---------------------------------------------------------------*/
static void do_region(int op, int op_flags, unsigned region,
                      struct dm_io_region *where, struct dpages *dp,
                      struct io *io)
{
        struct bio *bio;
        struct page *page;
        unsigned long len;
        unsigned offset;
        unsigned num_bvecs;
        sector_t remaining = where->count;
        struct request_queue *q = bdev_get_queue(where->bdev);
        unsigned short logical_block_size = queue_logical_block_size(q);
        sector_t num_sectors;
        unsigned int uninitialized_var(special_cmd_max_sectors);

        /*
         * Reject unsupported discard and write same requests.
         */
        if (op == REQ_OP_DISCARD)
                special_cmd_max_sectors = q->limits.max_discard_sectors;
        else if (op == REQ_OP_WRITE_SAME)
                special_cmd_max_sectors = q->limits.max_write_same_sectors;
        if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_SAME) &&
            special_cmd_max_sectors == 0) {
                dec_count(io, region, -EOPNOTSUPP);
                return;
        }

        /*
         * where->count may be zero if op holds a flush and we need to
         * send a zero-sized flush.
         */
        do {
                /*
                 * Allocate a suitably sized bio.
                 */
                if ((op == REQ_OP_DISCARD) || (op == REQ_OP_WRITE_SAME))
                        num_bvecs = 1;
                else
                        num_bvecs = min_t(int, BIO_MAX_PAGES,
                                          dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));

                bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
                bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
                bio->bi_bdev = where->bdev;
                bio->bi_end_io = endio;
                bio_set_op_attrs(bio, op, op_flags);
                store_io_and_region_in_bio(bio, io, region);

                if (op == REQ_OP_DISCARD) {
                        num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
                        bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
                        remaining -= num_sectors;
                } else if (op == REQ_OP_WRITE_SAME) {
                        /*
                         * WRITE SAME only uses a single page.
                         */
                        dp->get_page(dp, &page, &len, &offset);
                        bio_add_page(bio, page, logical_block_size, offset);
                        num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
                        bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;

                        offset = 0;
                        remaining -= num_sectors;
                        dp->next_page(dp);
                } else while (remaining) {
                        /*
                         * Try and add as many pages as possible.
                         */
                        dp->get_page(dp, &page, &len, &offset);
                        len = min(len, to_bytes(remaining));
                        if (!bio_add_page(bio, page, len, offset))
                                break;

                        offset = 0;
                        remaining -= to_sector(len);
                        dp->next_page(dp);
                }

                atomic_inc(&io->count);
                submit_bio(bio);
        } while (remaining);
}

static void dispatch_io(int op, int op_flags, unsigned int num_regions,
                        struct dm_io_region *where, struct dpages *dp,
                        struct io *io, int sync)
{
        int i;
        struct dpages old_pages = *dp;

        BUG_ON(num_regions > DM_IO_MAX_REGIONS);

        if (sync)
                op_flags |= REQ_SYNC;

        /*
         * For multiple regions we need to be careful to rewind
         * the dp object for each call to do_region.
         */
        for (i = 0; i < num_regions; i++) {
                *dp = old_pages;
                if (where[i].count || (op_flags & REQ_PREFLUSH))
                        do_region(op, op_flags, i, where + i, dp, io);
        }

        /*
         * Drop the extra reference that we were holding to avoid
         * the io being completed too early.
         */
        dec_count(io, 0, 0);
}

struct sync_io {
        unsigned long error_bits;
        struct completion wait;
};

static void sync_io_complete(unsigned long error, void *context)
{
        struct sync_io *sio = context;

        sio->error_bits = error;
        complete(&sio->wait);
}

static int sync_io(struct dm_io_client *client, unsigned int num_regions,
                   struct dm_io_region *where, int op, int op_flags,
                   struct dpages *dp, unsigned long *error_bits)
{
        struct io *io;
        struct sync_io sio;

        if (num_regions > 1 && !op_is_write(op)) {
                WARN_ON(1);
                return -EIO;
        }

        init_completion(&sio.wait);

        io = mempool_alloc(client->pool, GFP_NOIO);
        io->error_bits = 0;
        atomic_set(&io->count, 1); /* see dispatch_io() */
        io->client = client;
        io->callback = sync_io_complete;
        io->context = &sio;

        io->vma_invalidate_address = dp->vma_invalidate_address;
        io->vma_invalidate_size = dp->vma_invalidate_size;

        dispatch_io(op, op_flags, num_regions, where, dp, io, 1);

        wait_for_completion_io(&sio.wait);

        if (error_bits)
                *error_bits = sio.error_bits;

        return sio.error_bits ? -EIO : 0;
}

static int async_io(struct dm_io_client *client, unsigned int num_regions,
                    struct dm_io_region *where, int op, int op_flags,
                    struct dpages *dp, io_notify_fn fn, void *context)
{
        struct io *io;

        if (num_regions > 1 && !op_is_write(op)) {
                WARN_ON(1);
                fn(1, context);
                return -EIO;
        }

        io = mempool_alloc(client->pool, GFP_NOIO);
        io->error_bits = 0;
        atomic_set(&io->count, 1); /* see dispatch_io() */
        io->client = client;
        io->callback = fn;
        io->context = context;

        io->vma_invalidate_address = dp->vma_invalidate_address;
        io->vma_invalidate_size = dp->vma_invalidate_size;

        dispatch_io(op, op_flags, num_regions, where, dp, io, 0);
        return 0;
}

static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
                   unsigned long size)
{
        /* Set up dpages based on memory type */

        dp->vma_invalidate_address = NULL;
        dp->vma_invalidate_size = 0;

        switch (io_req->mem.type) {
        case DM_IO_PAGE_LIST:
                list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
                break;

        case DM_IO_BIO:
                bio_dp_init(dp, io_req->mem.ptr.bio);
                break;

        case DM_IO_VMA:
                flush_kernel_vmap_range(io_req->mem.ptr.vma, size);
                if (io_req->bi_op == REQ_OP_READ) {
                        dp->vma_invalidate_address = io_req->mem.ptr.vma;
                        dp->vma_invalidate_size = size;
                }
                vm_dp_init(dp, io_req->mem.ptr.vma);
                break;

        case DM_IO_KMEM:
                km_dp_init(dp, io_req->mem.ptr.addr);
                break;

        default:
                return -EINVAL;
        }

        return 0;
}

/*
 * New collapsed (a)synchronous interface.
 *
 * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
 * the queue with blk_unplug() some time later or set REQ_SYNC in
 * io_req->bi_opf. If you fail to do one of these, the IO will be submitted to
 * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c.
 */
int dm_io(struct dm_io_request *io_req, unsigned num_regions,
          struct dm_io_region *where, unsigned long *sync_error_bits)
{
        int r;
        struct dpages dp;

        r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT);
        if (r)
                return r;

        if (!io_req->notify.fn)
                return sync_io(io_req->client, num_regions, where,
                               io_req->bi_op, io_req->bi_op_flags, &dp,
                               sync_error_bits);

        return async_io(io_req->client, num_regions, where, io_req->bi_op,
                        io_req->bi_op_flags, &dp, io_req->notify.fn,
                        io_req->notify.context);
}
EXPORT_SYMBOL(dm_io);
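
/*
 * Illustrative usage sketch (editorial addition, not part of the upstream
 * file): a synchronous, single-region 4KiB read into a kmalloc'd buffer.
 * dm_io_region.count is in 512-byte sectors, so 8 sectors covers 4KiB, and
 * leaving notify.fn NULL selects the sync_io() path.  The names my_bdev,
 * my_buffer, my_io_client, my_complete and my_context are assumptions made
 * for this sketch only.
 *
 *      struct dm_io_region region = {
 *              .bdev   = my_bdev,
 *              .sector = 0,
 *              .count  = 8,
 *      };
 *      struct dm_io_request req = {
 *              .bi_op          = REQ_OP_READ,
 *              .bi_op_flags    = 0,
 *              .mem.type       = DM_IO_KMEM,
 *              .mem.ptr.addr   = my_buffer,
 *              .client         = my_io_client,
 *              .notify.fn      = NULL,
 *      };
 *      unsigned long error_bits;
 *      int r = dm_io(&req, 1, &region, &error_bits);
 *
 * For asynchronous IO, set notify.fn (an io_notify_fn) and notify.context;
 * dm_io() then returns after dispatch and the callback receives the
 * per-region error bitmap:
 *
 *      req.notify.fn      = my_complete;
 *      req.notify.context = my_context;
 *      r = dm_io(&req, 1, &region, NULL);
 */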

int __init dm_io_init(void)
{
        _dm_io_cache = KMEM_CACHE(io, 0);
        if (!_dm_io_cache)
                return -ENOMEM;

        return 0;
}

void dm_io_exit(void)
{
        kmem_cache_destroy(_dm_io_cache);
        _dm_io_cache = NULL;
}