linux/drivers/md/dm-io.c
/*
 * Copyright (C) 2003 Sistina Software
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>

#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/dm-io.h>

struct dm_io_client {
        mempool_t *pool;
        struct bio_set *bios;
};

/* FIXME: can we shrink this ? */
struct io {
        unsigned long error_bits;
        unsigned long eopnotsupp_bits;
        atomic_t count;
        struct task_struct *sleeper;
        struct dm_io_client *client;
        io_notify_fn callback;
        void *context;
};

/*
 * io contexts are only dynamically allocated for asynchronous
 * io.  Since async io is likely to be the majority of io we'll
 * have the same number of io contexts as bios! (FIXME: must reduce this).
 */

static unsigned int pages_to_ios(unsigned int pages)
{
        return 4 * pages;       /* too many ? */
}

/*
 * Create a client with mempool and bioset.
 */
struct dm_io_client *dm_io_client_create(unsigned num_pages)
{
        unsigned ios = pages_to_ios(num_pages);
        struct dm_io_client *client;

        client = kmalloc(sizeof(*client), GFP_KERNEL);
        if (!client)
                return ERR_PTR(-ENOMEM);

        client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io));
        if (!client->pool)
                goto bad;

        client->bios = bioset_create(16, 0);
        if (!client->bios)
                goto bad;

        return client;

   bad:
        if (client->pool)
                mempool_destroy(client->pool);
        kfree(client);
        return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(dm_io_client_create);

int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client)
{
        return mempool_resize(client->pool, pages_to_ios(num_pages),
                              GFP_KERNEL);
}
EXPORT_SYMBOL(dm_io_client_resize);

void dm_io_client_destroy(struct dm_io_client *client)
{
        mempool_destroy(client->pool);
        bioset_free(client->bios);
        kfree(client);
}
EXPORT_SYMBOL(dm_io_client_destroy);
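
/*
 * Example (illustrative sketch, not part of this driver): how a caller
 * would manage a dm_io_client.  The demo_* names are placeholders.  Note
 * that dm_io_client_create() reports failure with ERR_PTR(), not NULL.
 */
#if 0   /* illustrative only, not compiled */
static struct dm_io_client *demo_client;

static int demo_client_setup(unsigned num_pages)
{
        demo_client = dm_io_client_create(num_pages);
        if (IS_ERR(demo_client))
                return PTR_ERR(demo_client);

        return 0;
}

static void demo_client_teardown(void)
{
        dm_io_client_destroy(demo_client);
}
#endif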

/*-----------------------------------------------------------------
 * We need to keep track of which region a bio is doing io for.
 * In order to save a memory allocation we store this in the last
 * bvec which we know is unused (blech).
 * XXX This is ugly and can OOPS with some configs... find another way.
 *---------------------------------------------------------------*/
static inline void bio_set_region(struct bio *bio, unsigned region)
{
        bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
}

static inline unsigned bio_get_region(struct bio *bio)
{
        return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
}
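
/*
 * The spare bvec sits at index bi_max_vecs: do_region() below asks for one
 * bvec more than it needs and then decrements bi_max_vecs, so bio_add_page()
 * never fills that slot.  bio_set_region() stores the region number there,
 * endio() reads it back and restores bi_max_vecs before putting the bio.
 */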

/*-----------------------------------------------------------------
 * We need an io object to keep track of the number of bios that
 * have been dispatched for a particular io.
 *---------------------------------------------------------------*/
static void dec_count(struct io *io, unsigned int region, int error)
{
        if (error) {
                set_bit(region, &io->error_bits);
                if (error == -EOPNOTSUPP)
                        set_bit(region, &io->eopnotsupp_bits);
        }

        if (atomic_dec_and_test(&io->count)) {
                if (io->sleeper)
                        wake_up_process(io->sleeper);

                else {
                        unsigned long r = io->error_bits;
                        io_notify_fn fn = io->callback;
                        void *context = io->context;

                        mempool_free(io, io->client->pool);
                        fn(r, context);
                }
        }
}

static void endio(struct bio *bio, int error)
{
        struct io *io;
        unsigned region;

        if (error && bio_data_dir(bio) == READ)
                zero_fill_bio(bio);

        /*
         * The bio destructor in bio_put() may use the io object.
         */
        io = bio->bi_private;
        region = bio_get_region(bio);

        bio->bi_max_vecs++;
        bio_put(bio);

        dec_count(io, region, error);
}

/*-----------------------------------------------------------------
 * These little objects provide an abstraction for getting a new
 * destination page for io.
 *---------------------------------------------------------------*/
struct dpages {
        void (*get_page)(struct dpages *dp,
                         struct page **p, unsigned long *len, unsigned *offset);
        void (*next_page)(struct dpages *dp);

        unsigned context_u;
        void *context_ptr;
};
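
/*
 * Example (illustrative sketch, not part of this driver): the iteration
 * pattern do_region() uses to drive a dpages object.  Each get_page() call
 * reports the current page, the bytes left in it and the starting offset;
 * next_page() steps to the following page.  demo_walk_dpages is a
 * placeholder name.
 */
#if 0   /* illustrative only, not compiled */
static void demo_walk_dpages(struct dpages *dp, unsigned long total)
{
        struct page *page;
        unsigned long len;
        unsigned offset;

        while (total) {
                dp->get_page(dp, &page, &len, &offset);
                len = min(len, total);
                /* ... hand (page, offset, len) to the consumer here ... */
                total -= len;
                dp->next_page(dp);
        }
}
#endif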

/*
 * Functions for getting the pages from a list.
 */
static void list_get_page(struct dpages *dp,
                  struct page **p, unsigned long *len, unsigned *offset)
{
        unsigned o = dp->context_u;
        struct page_list *pl = (struct page_list *) dp->context_ptr;

        *p = pl->page;
        *len = PAGE_SIZE - o;
        *offset = o;
}

static void list_next_page(struct dpages *dp)
{
        struct page_list *pl = (struct page_list *) dp->context_ptr;
        dp->context_ptr = pl->next;
        dp->context_u = 0;
}

static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset)
{
        dp->get_page = list_get_page;
        dp->next_page = list_next_page;
        dp->context_u = offset;
        dp->context_ptr = pl;
}

/*
 * Functions for getting the pages from a bvec.
 */
static void bvec_get_page(struct dpages *dp,
                  struct page **p, unsigned long *len, unsigned *offset)
{
        struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
        *p = bvec->bv_page;
        *len = bvec->bv_len;
        *offset = bvec->bv_offset;
}

static void bvec_next_page(struct dpages *dp)
{
        struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
        dp->context_ptr = bvec + 1;
}

static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
{
        dp->get_page = bvec_get_page;
        dp->next_page = bvec_next_page;
        dp->context_ptr = bvec;
}

/*
 * Functions for getting the pages from a VMA.
 */
static void vm_get_page(struct dpages *dp,
                 struct page **p, unsigned long *len, unsigned *offset)
{
        *p = vmalloc_to_page(dp->context_ptr);
        *offset = dp->context_u;
        *len = PAGE_SIZE - dp->context_u;
}

static void vm_next_page(struct dpages *dp)
{
        dp->context_ptr += PAGE_SIZE - dp->context_u;
        dp->context_u = 0;
}

static void vm_dp_init(struct dpages *dp, void *data)
{
        dp->get_page = vm_get_page;
        dp->next_page = vm_next_page;
        dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
        dp->context_ptr = data;
}

static void dm_bio_destructor(struct bio *bio)
{
        struct io *io = bio->bi_private;

        bio_free(bio, io->client->bios);
}

/*
 * Functions for getting the pages from kernel memory.
 */
static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len,
                        unsigned *offset)
{
        *p = virt_to_page(dp->context_ptr);
        *offset = dp->context_u;
        *len = PAGE_SIZE - dp->context_u;
}

static void km_next_page(struct dpages *dp)
{
        dp->context_ptr += PAGE_SIZE - dp->context_u;
        dp->context_u = 0;
}

static void km_dp_init(struct dpages *dp, void *data)
{
        dp->get_page = km_get_page;
        dp->next_page = km_next_page;
        dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
        dp->context_ptr = data;
}
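
/*
 * The split between the vm_* and km_* helpers above mirrors the DM_IO_VMA
 * and DM_IO_KMEM memory types handled in dp_init(): vmalloc'ed buffers are
 * not physically contiguous, so vm_get_page() must translate each address
 * with vmalloc_to_page(), while DM_IO_KMEM buffers live in the kernel's
 * direct mapping and virt_to_page() suffices.
 */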

/*-----------------------------------------------------------------
 * IO routines that accept a list of pages.
 *---------------------------------------------------------------*/
static void do_region(int rw, unsigned region, struct dm_io_region *where,
                      struct dpages *dp, struct io *io)
{
        struct bio *bio;
        struct page *page;
        unsigned long len;
        unsigned offset;
        unsigned num_bvecs;
        sector_t remaining = where->count;

        while (remaining) {
                /*
                 * Allocate a suitably sized bio: we add an extra
                 * bvec for bio_get/set_region() and decrement bi_max_vecs
                 * to hide it from bio_add_page().
                 */
                num_bvecs = dm_sector_div_up(remaining,
                                             (PAGE_SIZE >> SECTOR_SHIFT));
                num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev),
                                      num_bvecs);
                if (unlikely(num_bvecs > BIO_MAX_PAGES))
                        num_bvecs = BIO_MAX_PAGES;
                bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
                bio->bi_sector = where->sector + (where->count - remaining);
                bio->bi_bdev = where->bdev;
                bio->bi_end_io = endio;
                bio->bi_private = io;
                bio->bi_destructor = dm_bio_destructor;
                bio->bi_max_vecs--;
                bio_set_region(bio, region);

                /*
                 * Try and add as many pages as possible.
                 */
                while (remaining) {
                        dp->get_page(dp, &page, &len, &offset);
                        len = min(len, to_bytes(remaining));
                        if (!bio_add_page(bio, page, len, offset))
                                break;

                        offset = 0;
                        remaining -= to_sector(len);
                        dp->next_page(dp);
                }

                atomic_inc(&io->count);
                submit_bio(rw, bio);
        }
}

static void dispatch_io(int rw, unsigned int num_regions,
                        struct dm_io_region *where, struct dpages *dp,
                        struct io *io, int sync)
{
        int i;
        struct dpages old_pages = *dp;

        if (sync)
                rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);

        /*
         * For multiple regions we need to be careful to rewind
         * the dp object for each call to do_region.
         */
        for (i = 0; i < num_regions; i++) {
                *dp = old_pages;
                if (where[i].count)
                        do_region(rw, i, where + i, dp, io);
        }

        /*
         * Drop the extra reference that we were holding to avoid
         * the io being completed too early.
         */
        dec_count(io, 0, 0);
}

static int sync_io(struct dm_io_client *client, unsigned int num_regions,
                   struct dm_io_region *where, int rw, struct dpages *dp,
                   unsigned long *error_bits)
{
        struct io io;

        if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
                WARN_ON(1);
                return -EIO;
        }

retry:
        io.error_bits = 0;
        io.eopnotsupp_bits = 0;
        atomic_set(&io.count, 1); /* see dispatch_io() */
        io.sleeper = current;
        io.client = client;

        dispatch_io(rw, num_regions, where, dp, &io, 1);

        while (1) {
                set_current_state(TASK_UNINTERRUPTIBLE);

                if (!atomic_read(&io.count))
                        break;

                io_schedule();
        }
        set_current_state(TASK_RUNNING);

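        /*
         * If a barrier request failed with -EOPNOTSUPP on any region, retry
         * the whole io with the barrier flag cleared; jumping back to
         * "retry" reinitialises the io structure above, so this happens at
         * most once.
         */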
        if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
                rw &= ~(1 << BIO_RW_BARRIER);
                goto retry;
        }

        if (error_bits)
                *error_bits = io.error_bits;

        return io.error_bits ? -EIO : 0;
}

static int async_io(struct dm_io_client *client, unsigned int num_regions,
                    struct dm_io_region *where, int rw, struct dpages *dp,
                    io_notify_fn fn, void *context)
{
        struct io *io;

        if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
                WARN_ON(1);
                fn(1, context);
                return -EIO;
        }

        io = mempool_alloc(client->pool, GFP_NOIO);
        io->error_bits = 0;
        io->eopnotsupp_bits = 0;
        atomic_set(&io->count, 1); /* see dispatch_io() */
        io->sleeper = NULL;
        io->client = client;
        io->callback = fn;
        io->context = context;

        dispatch_io(rw, num_regions, where, dp, io, 0);
        return 0;
}

static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
{
        /* Set up dpages based on memory type */
        switch (io_req->mem.type) {
        case DM_IO_PAGE_LIST:
                list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
                break;

        case DM_IO_BVEC:
                bvec_dp_init(dp, io_req->mem.ptr.bvec);
                break;

        case DM_IO_VMA:
                vm_dp_init(dp, io_req->mem.ptr.vma);
                break;

        case DM_IO_KMEM:
                km_dp_init(dp, io_req->mem.ptr.addr);
                break;

        default:
                return -EINVAL;
        }

        return 0;
}

/*
 * New collapsed (a)synchronous interface.
 *
 * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
 * the queue with blk_unplug() some time later or set the BIO_RW_SYNCIO and
 * BIO_RW_UNPLUG bits in io_req->bi_rw. If you fail to do one of these, the
 * IO will be submitted to the disk after q->unplug_delay, which defaults to
 * 3ms in blk-settings.c.
 */
int dm_io(struct dm_io_request *io_req, unsigned num_regions,
          struct dm_io_region *where, unsigned long *sync_error_bits)
{
        int r;
        struct dpages dp;

        r = dp_init(io_req, &dp);
        if (r)
                return r;

        if (!io_req->notify.fn)
                return sync_io(io_req->client, num_regions, where,
                               io_req->bi_rw, &dp, sync_error_bits);

        return async_io(io_req->client, num_regions, where, io_req->bi_rw,
                        &dp, io_req->notify.fn, io_req->notify.context);
}
EXPORT_SYMBOL(dm_io);

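/*
 * Example (illustrative sketch, not part of this driver): two hypothetical
 * dm_io() callers.  The demo_* names are placeholders; the request and
 * region structures come from <linux/dm-io.h>.
 */
#if 0   /* illustrative only, not compiled */
/* Synchronous read of 'count' sectors into a kmalloc'ed buffer. */
static int demo_read_kmem(struct dm_io_client *client,
                          struct block_device *bdev,
                          sector_t start, sector_t count, void *data)
{
        unsigned long error_bits = 0;
        struct dm_io_region where = {
                .bdev   = bdev,
                .sector = start,
                .count  = count,
        };
        struct dm_io_request req = {
                .bi_rw        = READ,
                .mem.type     = DM_IO_KMEM,
                .mem.ptr.addr = data,
                .notify.fn    = NULL,   /* NULL notify.fn selects sync_io() */
                .client       = client,
        };

        return dm_io(&req, 1, &where, &error_bits);
}

/* Asynchronous write: completion is reported through an io_notify_fn. */
static void demo_write_done(unsigned long error_bits, void *context)
{
        /* Runs from the endio path once every region has completed. */
}

static int demo_write_async(struct dm_io_client *client,
                            struct dm_io_region *where, unsigned num_regions,
                            struct page_list *pl)
{
        struct dm_io_request req = {
                .bi_rw          = WRITE,
                .mem.type       = DM_IO_PAGE_LIST,
                .mem.ptr.pl     = pl,
                .mem.offset     = 0,
                .notify.fn      = demo_write_done,
                .notify.context = NULL,
                .client         = client,
        };

        return dm_io(&req, num_regions, where, NULL);
}
#endif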