/* linux/drivers/staging/lustre/lustre/llite/lloop.c */
   1/*
   2 * GPL HEADER START
   3 *
   4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 only,
   8 * as published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 * General Public License version 2 for more details (a copy is included
  14 * in the LICENSE file that accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * version 2 along with this program; If not, see
  18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  19 *
  20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  21 * CA 95054 USA or visit www.sun.com if you need additional information or
  22 * have any questions.
  23 *
  24 * GPL HEADER END
  25 */
  26/*
  27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  28 * Use is subject to license terms.
  29 *
  30 * Copyright (c) 2011, 2012, Intel Corporation.
  31 */
  32/*
  33 * This file is part of Lustre, http://www.lustre.org/
  34 * Lustre is a trademark of Sun Microsystems, Inc.
  35 */
  36
  37/*
  38 *  linux/drivers/block/loop.c
  39 *
  40 *  Written by Theodore Ts'o, 3/29/93
  41 *
  42 * Copyright 1993 by Theodore Ts'o.  Redistribution of this file is
  43 * permitted under the GNU General Public License.
  44 *
  45 * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
  46 * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
  47 *
  48 * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997
  49 *
  50 * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998
  51 *
  52 * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
  53 *
  54 * Loadable modules and other fixes by AK, 1998
  55 *
  56 * Maximum number of loop devices now dynamic via max_loop module parameter.
  57 * Russell Kroll <rkroll@exploits.org> 19990701
  58 *
  59 * Maximum number of loop devices when compiled-in now selectable by passing
  60 * max_loop=<1-255> to the kernel on boot.
   61 * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999
  62 *
  63 * Completely rewrite request handling to be make_request_fn style and
  64 * non blocking, pushing work to a helper thread. Lots of fixes from
  65 * Al Viro too.
  66 * Jens Axboe <axboe@suse.de>, Nov 2000
  67 *
  68 * Support up to 256 loop devices
  69 * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
  70 *
  71 * Support for falling back on the write file operation when the address space
  72 * operations prepare_write and/or commit_write are not available on the
  73 * backing filesystem.
  74 * Anton Altaparmakov, 16 Feb 2005
  75 *
  76 * Still To Fix:
  77 * - Advisory locking is ignored here.
  78 * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
  79 *
  80 */
  81
  82#include <linux/module.h>
  83
  84#include <linux/sched.h>
  85#include <linux/fs.h>
  86#include <linux/file.h>
  87#include <linux/stat.h>
  88#include <linux/errno.h>
  89#include <linux/major.h>
  90#include <linux/wait.h>
  91#include <linux/blkdev.h>
  92#include <linux/blkpg.h>
  93#include <linux/init.h>
  94#include <linux/swap.h>
  95#include <linux/slab.h>
  96#include <linux/suspend.h>
  97#include <linux/writeback.h>
  98#include <linux/buffer_head.h>          /* for invalidate_bdev() */
  99#include <linux/completion.h>
 100#include <linux/highmem.h>
 101#include <linux/gfp.h>
 102#include <linux/swap.h>
 103#include <linux/pagevec.h>
 104
 105#include <asm/uaccess.h>
 106
 107#include <lustre_lib.h>
 108#include <lustre_lite.h>
 109#include "llite_internal.h"
 110
/* Upper bound on pages carried by one request batch (one page per segment). */
#define LLOOP_MAX_SEGMENTS      LNET_MAX_IOV

/* Possible states of device */
enum {
        LLOOP_UNBOUND,          /* no backing file; slot free for ATTACH */
        LLOOP_BOUND,            /* backing file attached, worker running */
        LLOOP_RUNDOWN,          /* detach requested; worker draining/exiting */
};

struct lloop_device {
        int               lo_number;    /* minor number of this device */
        int               lo_refcnt;    /* open count, under lo_ctl_mutex */
        loff_t         lo_offset;       /* byte offset into backing file */
        loff_t         lo_sizelimit;    /* max bytes exposed; 0 = no limit */
        int               lo_flags;     /* LO_FLAGS_* bits, see below */
        /* optional per-device ioctl hook; always NULL in this driver */
        int             (*ioctl)(struct lloop_device *, int cmd,
                                    unsigned long arg);

        struct file      *lo_backing_file;      /* Lustre file backing us */
        struct block_device *lo_device;         /* our block device */
        unsigned             lo_blocksize;      /* set to PAGE_CACHE_SIZE */

        int               old_gfp_mask; /* mapping gfp mask saved at bind */

        spinlock_t              lo_lock;        /* guards bio list + lo_state */
        struct bio              *lo_bio;        /* head of pending bio list */
        struct bio              *lo_biotail;    /* tail of pending bio list */
        int                     lo_state;       /* LLOOP_* state above */
        struct semaphore        lo_sem;         /* start/stop handshake with
                                                 * the worker thread */
        struct mutex            lo_ctl_mutex;   /* serializes refcnt updates */
        atomic_t         lo_pending;    /* bios queued, not yet completed */
        wait_queue_head_t         lo_bh_wait;   /* worker sleeps here */

        struct request_queue *lo_queue;

        /* cl environment/IO state owned by the worker thread */
        const struct lu_env *lo_env;
        struct cl_io     lo_io;
        struct ll_dio_pages  lo_pvec;   /* points at lo_requests[0] arrays */

        /* data to handle bio for lustre. */
        struct lo_request_data {
                struct page *lrd_pages[LLOOP_MAX_SEGMENTS];
                loff_t       lrd_offsets[LLOOP_MAX_SEGMENTS];
        } lo_requests[1];
};

/*
 * Loop flags
 */
enum {
        LO_FLAGS_READ_ONLY       = 1,   /* backing file lacks FMODE_WRITE */
};

static int lloop_major;                 /* dynamically allocated block major */
#define MAX_LOOP_DEFAULT  16
static int max_loop = MAX_LOOP_DEFAULT; /* number of minors; module param */
static struct lloop_device *loop_dev;   /* array of max_loop devices */
static struct gendisk **disks;          /* one gendisk per device */
static struct mutex lloop_mutex;        /* serializes attach/detach ioctls */
static void *ll_iocontrol_magic = NULL; /* cookie from ll_iocontrol_register */
 171
 172static loff_t get_loop_size(struct lloop_device *lo, struct file *file)
 173{
 174        loff_t size, offset, loopsize;
 175
 176        /* Compute loopsize in bytes */
 177        size = i_size_read(file->f_mapping->host);
 178        offset = lo->lo_offset;
 179        loopsize = size - offset;
 180        if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize)
 181                loopsize = lo->lo_sizelimit;
 182
 183        /*
 184         * Unfortunately, if we want to do I/O on the device,
 185         * the number of 512-byte sectors has to fit into a sector_t.
 186         */
 187        return loopsize >> 9;
 188}
 189
/*
 * Service one batch of bios (a chain linked via bi_next, all in the same
 * direction) by translating it into a single Lustre direct-IO request on
 * the backing file.  Returns 0 on a full transfer, otherwise the (short or
 * negative) byte count from ll_direct_rw_pages() as an error indicator.
 */
static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head)
{
        const struct lu_env  *env   = lo->lo_env;
        struct cl_io     *io    = &lo->lo_io;
        struct inode     *inode = lo->lo_backing_file->f_dentry->d_inode;
        struct cl_object     *obj = ll_i2info(inode)->lli_clob;
        pgoff_t        offset;
        int                ret;
        int                i;
        int                rw;
        obd_count            page_count = 0;
        struct bio_vec       *bvec;
        struct bio         *bio;
        ssize_t        bytes;

        struct ll_dio_pages  *pvec = &lo->lo_pvec;
        struct page      **pages = pvec->ldp_pages;
        loff_t         *offsets = pvec->ldp_offsets;

        /* drop cached pages so the direct IO below is coherent */
        truncate_inode_pages(inode->i_mapping, 0);

        /* initialize the IO */
        memset(io, 0, sizeof(*io));
        io->ci_obj = obj;
        ret = cl_io_init(env, io, CIT_MISC, obj);
        if (ret)
                return io->ci_result;
        /* no DLM locking: this device owns the whole backing file */
        io->ci_lockreq = CILR_NEVER;

        LASSERT(head != NULL);
        rw = head->bi_rw;
        /* Flatten the chain into the preallocated page/offset vectors.
         * loop_get_bio() guarantees a uniform direction and a total
         * segment count within LLOOP_MAX_SEGMENTS. */
        for (bio = head; bio != NULL; bio = bio->bi_next) {
                LASSERT(rw == bio->bi_rw);

                /* bi_sector is in 512-byte units; segments are whole,
                 * page-aligned pages (BUG_ONs below enforce this) */
                offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset;
                bio_for_each_segment(bvec, bio, i) {
                        BUG_ON(bvec->bv_offset != 0);
                        BUG_ON(bvec->bv_len != PAGE_CACHE_SIZE);

                        pages[page_count] = bvec->bv_page;
                        offsets[page_count] = offset;
                        page_count++;
                        offset += bvec->bv_len;
                }
                LASSERT(page_count <= LLOOP_MAX_SEGMENTS);
        }

        ll_stats_ops_tally(ll_i2sbi(inode),
                        (rw == WRITE) ? LPROC_LL_BRW_WRITE : LPROC_LL_BRW_READ,
                        page_count);

        pvec->ldp_size = page_count << PAGE_CACHE_SHIFT;
        pvec->ldp_nr = page_count;

        /* FIXME: in ll_direct_rw_pages, it has to allocate many cl_page{}s to
         * write those pages into OST. Even worse case is that more pages
         * would be asked to write out to swap space, and then finally get here
         * again.
         * Unfortunately this is NOT easy to fix.
         * Thoughts on solution:
         * 0. Define a reserved pool for cl_pages, which could be a list of
         *    pre-allocated cl_pages;
         * 1. Define a new operation in cl_object_operations{}, says clo_depth,
         *    which measures how many layers for this lustre object. Generally
         *    speaking, the depth would be 2, one for llite, and one for lovsub.
         *    However, for SNS, there will be more since we need additional page
         *    to store parity;
         * 2. Reserve the # of (page_count * depth) cl_pages from the reserved
         *    pool. Afterwards, the clio would allocate the pages from reserved
         *    pool, this guarantees we needn't allocate the cl_pages from
         *    generic cl_page slab cache.
         *    Of course, if there is NOT enough pages in the pool, we might
         *    be asked to write less pages once, this purely depends on
         *    implementation. Anyway, we should be careful to avoid deadlocking.
         */
        mutex_lock(&inode->i_mutex);
        bytes = ll_direct_rw_pages(env, io, rw, inode, pvec);
        mutex_unlock(&inode->i_mutex);
        cl_io_fini(env, io);
        /* a short transfer is propagated so the bios complete with error */
        return (bytes == pvec->ldp_size) ? 0 : (int)bytes;
}
 271
 272/*
 273 * Add bio to back of pending list
 274 */
/*
 * Add bio to back of pending list
 *
 * Called from loop_make_request(); appends under lo_lock with IRQs saved,
 * then raises lo_pending *before* waking the worker so loop_active()
 * observes the new work.
 */
static void loop_add_bio(struct lloop_device *lo, struct bio *bio)
{
        unsigned long flags;

        spin_lock_irqsave(&lo->lo_lock, flags);
        if (lo->lo_biotail) {
                lo->lo_biotail->bi_next = bio;
                lo->lo_biotail = bio;
        } else
                lo->lo_bio = lo->lo_biotail = bio;
        spin_unlock_irqrestore(&lo->lo_lock, flags);

        atomic_inc(&lo->lo_pending);
        /* skip the wake_up when the worker is already awake */
        if (waitqueue_active(&lo->lo_bh_wait))
                wake_up(&lo->lo_bh_wait);
}
 291
 292/*
 293 * Grab first pending buffer
 294 */
/*
 * Grab first pending buffer
 *
 * Detach from the head of the queue the longest run of bios that share one
 * direction and fit within LLOOP_MAX_SEGMENTS total pages, so that a single
 * do_bio_lustrebacked() call can service them.  Returns the number of bios
 * taken (0 if the queue was empty) and the chain head through @req.
 */
static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req)
{
        struct bio *first;
        struct bio **bio;
        unsigned int count = 0;
        unsigned int page_count = 0;
        int rw;

        spin_lock_irq(&lo->lo_lock);
        first = lo->lo_bio;
        if (unlikely(first == NULL)) {
                spin_unlock_irq(&lo->lo_lock);
                return 0;
        }

        /* TODO: need to split the bio, too bad. */
        LASSERT(first->bi_vcnt <= LLOOP_MAX_SEGMENTS);

        rw = first->bi_rw;
        bio = &lo->lo_bio;
        /* walk forward while the direction matches and the page budget
         * holds; *bio ends up pointing at the first bio NOT taken */
        while (*bio && (*bio)->bi_rw == rw) {
                CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n",
                       (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size,
                       page_count, (*bio)->bi_vcnt);
                if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS)
                        break;


                page_count += (*bio)->bi_vcnt;
                count++;
                bio = &(*bio)->bi_next;
        }
        if (*bio) {
                /* Some of bios can't be mergable. */
                lo->lo_bio = *bio;
                *bio = NULL;    /* terminate the detached chain */
        } else {
                /* Hit the end of queue */
                lo->lo_biotail = NULL;
                lo->lo_bio = NULL;
        }
        *req = first;
        spin_unlock_irq(&lo->lo_lock);
        return count;
}
 340
/*
 * make_request_fn for the lloop queue: never blocks, just validates the
 * bio and hands it to the worker thread via loop_add_bio().  Any rejected
 * bio is completed immediately with an IO error.
 */
static ll_mrf_ret
loop_make_request(struct request_queue *q, struct bio *old_bio)
{
        struct lloop_device *lo = q->queuedata;
        int rw = bio_rw(old_bio);
        int inactive;

        if (!lo)
                goto err;

        CDEBUG(D_INFO, "submit bio sector %llu size %u\n",
               (unsigned long long)old_bio->bi_sector, old_bio->bi_size);

        /* reject IO unless the device is fully bound */
        spin_lock_irq(&lo->lo_lock);
        inactive = (lo->lo_state != LLOOP_BOUND);
        spin_unlock_irq(&lo->lo_lock);
        if (inactive)
                goto err;

        if (rw == WRITE) {
                if (lo->lo_flags & LO_FLAGS_READ_ONLY)
                        goto err;
        } else if (rw == READA) {
                /* readahead is served as a plain read */
                rw = READ;
        } else if (rw != READ) {
                CERROR("lloop: unknown command (%x)\n", rw);
                goto err;
        }
        loop_add_bio(lo, old_bio);
        LL_MRF_RETURN(0);
err:
        cfs_bio_io_error(old_bio, old_bio->bi_size);
        LL_MRF_RETURN(0);
}
 375
 376
 377static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio)
 378{
 379        int ret;
 380        ret = do_bio_lustrebacked(lo, bio);
 381        while (bio) {
 382                struct bio *tmp = bio->bi_next;
 383                bio->bi_next = NULL;
 384                cfs_bio_endio(bio, bio->bi_size, ret);
 385                bio = tmp;
 386        }
 387}
 388
 389static inline int loop_active(struct lloop_device *lo)
 390{
 391        return atomic_read(&lo->lo_pending) ||
 392                (lo->lo_state == LLOOP_RUNDOWN);
 393}
 394
 395/*
 396 * worker thread that handles reads/writes to file backed loop devices,
 397 * to avoid blocking in our make_request_fn.
 398 */
/*
 * worker thread that handles reads/writes to file backed loop devices,
 * to avoid blocking in our make_request_fn.
 *
 * Lifecycle: started by loop_set_fd(), which waits on lo_sem until we are
 * serving; stopped by loop_clr_fd() setting LLOOP_RUNDOWN and waiting on
 * lo_sem again.  We up() lo_sem on both entry (ready) and exit (done).
 */
static int loop_thread(void *data)
{
        struct lloop_device *lo = data;
        struct bio *bio;
        unsigned int count;
        unsigned long times = 0;        /* batches handled, for statistics */
        unsigned long total_count = 0;  /* bios handled, for statistics */

        struct lu_env *env;
        int refcheck;
        int ret = 0;

        set_user_nice(current, -20);

        lo->lo_state = LLOOP_BOUND;

        /* private cl environment for all IO issued by this thread */
        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                GOTO(out, ret = PTR_ERR(env));

        lo->lo_env = env;
        memset(&lo->lo_pvec, 0, sizeof(lo->lo_pvec));
        lo->lo_pvec.ldp_pages   = lo->lo_requests[0].lrd_pages;
        lo->lo_pvec.ldp_offsets = lo->lo_requests[0].lrd_offsets;

        /*
         * up sem, we are running
         */
        up(&lo->lo_sem);

        for (;;) {
                wait_event(lo->lo_bh_wait, loop_active(lo));
                if (!atomic_read(&lo->lo_pending)) {
                        int exiting = 0;
                        /* no work queued: exit only if rundown requested */
                        spin_lock_irq(&lo->lo_lock);
                        exiting = (lo->lo_state == LLOOP_RUNDOWN);
                        spin_unlock_irq(&lo->lo_lock);
                        if (exiting)
                                break;
                }

                bio = NULL;
                count = loop_get_bio(lo, &bio);
                if (!count) {
                        CWARN("lloop(minor: %d): missing bio\n", lo->lo_number);
                        continue;
                }

                /* track a running average of bios per batch, logged every
                 * 128 batches; reset on counter overflow */
                total_count += count;
                if (total_count < count) {     /* overflow */
                        total_count = count;
                        times = 1;
                } else {
                        times++;
                }
                if ((times & 127) == 0) {
                        CDEBUG(D_INFO, "total: %lu, count: %lu, avg: %lu\n",
                               total_count, times, total_count / times);
                }

                LASSERT(bio != NULL);
                LASSERT(count <= atomic_read(&lo->lo_pending));
                loop_handle_bio(lo, bio);
                /* decrement only after completion so loop_clr_fd() cannot
                 * see the device as idle while bios are in flight */
                atomic_sub(count, &lo->lo_pending);
        }
        cl_env_put(env, &refcheck);

out:
        /* wake loop_set_fd() (startup failure) or loop_clr_fd() (rundown) */
        up(&lo->lo_sem);
        return ret;
}
 470
 471static int loop_set_fd(struct lloop_device *lo, struct file *unused,
 472                       struct block_device *bdev, struct file *file)
 473{
 474        struct inode     *inode;
 475        struct address_space *mapping;
 476        int                lo_flags = 0;
 477        int                error;
 478        loff_t          size;
 479
 480        if (!try_module_get(THIS_MODULE))
 481                return -ENODEV;
 482
 483        error = -EBUSY;
 484        if (lo->lo_state != LLOOP_UNBOUND)
 485                goto out;
 486
 487        mapping = file->f_mapping;
 488        inode = mapping->host;
 489
 490        error = -EINVAL;
 491        if (!S_ISREG(inode->i_mode) || inode->i_sb->s_magic != LL_SUPER_MAGIC)
 492                goto out;
 493
 494        if (!(file->f_mode & FMODE_WRITE))
 495                lo_flags |= LO_FLAGS_READ_ONLY;
 496
 497        size = get_loop_size(lo, file);
 498
 499        if ((loff_t)(sector_t)size != size) {
 500                error = -EFBIG;
 501                goto out;
 502        }
 503
 504        /* remove all pages in cache so as dirty pages not to be existent. */
 505        truncate_inode_pages(mapping, 0);
 506
 507        set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
 508
 509        lo->lo_blocksize = PAGE_CACHE_SIZE;
 510        lo->lo_device = bdev;
 511        lo->lo_flags = lo_flags;
 512        lo->lo_backing_file = file;
 513        lo->ioctl = NULL;
 514        lo->lo_sizelimit = 0;
 515        lo->old_gfp_mask = mapping_gfp_mask(mapping);
 516        mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
 517
 518        lo->lo_bio = lo->lo_biotail = NULL;
 519
 520        /*
 521         * set queue make_request_fn, and add limits based on lower level
 522         * device
 523         */
 524        blk_queue_make_request(lo->lo_queue, loop_make_request);
 525        lo->lo_queue->queuedata = lo;
 526
 527        /* queue parameters */
 528        CLASSERT(PAGE_CACHE_SIZE < (1 << (sizeof(unsigned short) * 8)));
 529        blk_queue_logical_block_size(lo->lo_queue,
 530                                     (unsigned short)PAGE_CACHE_SIZE);
 531        blk_queue_max_hw_sectors(lo->lo_queue,
 532                                 LLOOP_MAX_SEGMENTS << (PAGE_CACHE_SHIFT - 9));
 533        blk_queue_max_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS);
 534
 535        set_capacity(disks[lo->lo_number], size);
 536        bd_set_size(bdev, size << 9);
 537
 538        set_blocksize(bdev, lo->lo_blocksize);
 539
 540        kthread_run(loop_thread, lo, "lloop%d", lo->lo_number);
 541        down(&lo->lo_sem);
 542        return 0;
 543
 544out:
 545        /* This is safe: open() is still holding a reference. */
 546        module_put(THIS_MODULE);
 547        return error;
 548}
 549
/*
 * Unbind the backing file from @lo.  @count is the number of references
 * the caller legitimately holds (1 for DETACH_BYDEV, 2 for the device-node
 * ioctl which also has the device open).  Fails with -EBUSY while anyone
 * else still has the device open.
 */
static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev,
                       int count)
{
        struct file *filp = lo->lo_backing_file;
        int gfp = lo->old_gfp_mask;

        if (lo->lo_state != LLOOP_BOUND)
                return -ENXIO;

        if (lo->lo_refcnt > count)      /* we needed one fd for the ioctl */
                return -EBUSY;

        if (filp == NULL)
                return -EINVAL;

        /* ask the worker thread to drain and exit ... */
        spin_lock_irq(&lo->lo_lock);
        lo->lo_state = LLOOP_RUNDOWN;
        spin_unlock_irq(&lo->lo_lock);
        wake_up(&lo->lo_bh_wait);

        /* ... and wait for it: loop_thread() ups lo_sem on exit */
        down(&lo->lo_sem);
        lo->lo_backing_file = NULL;
        lo->ioctl = NULL;
        lo->lo_device = NULL;
        lo->lo_offset = 0;
        lo->lo_sizelimit = 0;
        lo->lo_flags = 0;
        ll_invalidate_bdev(bdev, 0);
        set_capacity(disks[lo->lo_number], 0);
        bd_set_size(bdev, 0);
        /* restore the gfp mask saved in loop_set_fd() */
        mapping_set_gfp_mask(filp->f_mapping, gfp);
        lo->lo_state = LLOOP_UNBOUND;
        fput(filp);
        /* This is safe: open() is still holding a reference. */
        module_put(THIS_MODULE);
        return 0;
}
 587
 588static int lo_open(struct block_device *bdev, fmode_t mode)
 589{
 590        struct lloop_device *lo = bdev->bd_disk->private_data;
 591
 592        mutex_lock(&lo->lo_ctl_mutex);
 593        lo->lo_refcnt++;
 594        mutex_unlock(&lo->lo_ctl_mutex);
 595
 596        return 0;
 597}
 598
 599static void lo_release(struct gendisk *disk, fmode_t mode)
 600{
 601        struct lloop_device *lo = disk->private_data;
 602
 603        mutex_lock(&lo->lo_ctl_mutex);
 604        --lo->lo_refcnt;
 605        mutex_unlock(&lo->lo_ctl_mutex);
 606}
 607
 608/* lloop device node's ioctl function. */
 609static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 610                    unsigned int cmd, unsigned long arg)
 611{
 612        struct lloop_device *lo = bdev->bd_disk->private_data;
 613        struct inode *inode = NULL;
 614        int err = 0;
 615
 616        mutex_lock(&lloop_mutex);
 617        switch (cmd) {
 618        case LL_IOC_LLOOP_DETACH: {
 619                err = loop_clr_fd(lo, bdev, 2);
 620                if (err == 0)
 621                        ll_blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */
 622                break;
 623        }
 624
 625        case LL_IOC_LLOOP_INFO: {
 626                struct lu_fid fid;
 627
 628                LASSERT(lo->lo_backing_file != NULL);
 629                if (inode == NULL)
 630                        inode = lo->lo_backing_file->f_dentry->d_inode;
 631                if (lo->lo_state == LLOOP_BOUND)
 632                        fid = ll_i2info(inode)->lli_fid;
 633                else
 634                        fid_zero(&fid);
 635
 636                if (copy_to_user((struct lu_fid *)arg, &fid, sizeof(fid)))
 637                        err = -EFAULT;
 638                break;
 639        }
 640
 641        default:
 642                err = -EINVAL;
 643                break;
 644        }
 645        mutex_unlock(&lloop_mutex);
 646
 647        return err;
 648}
 649
/* Block-device methods for lloop devices; attach happens via the llite
 * file ioctl hook (lloop_ioctl), not through these ops. */
static struct block_device_operations lo_fops = {
        .owner =        THIS_MODULE,
        .open =  lo_open,
        .release =      lo_release,
        .ioctl =        lo_ioctl,
};
 656
 657/* dynamic iocontrol callback.
 658 * This callback is registered in lloop_init and will be called by
 659 * ll_iocontrol_call.
 660 *
 661 * This is a llite regular file ioctl function. It takes the responsibility
 662 * of attaching or detaching a file by a lloop's device numner.
 663 */
/* dynamic iocontrol callback.
 * This callback is registered in lloop_init and will be called by
 * ll_iocontrol_call.
 *
 * This is a llite regular file ioctl function. It takes the responsibility
 * of attaching or detaching a file by a lloop's device number.
 */
static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file,
                                   unsigned int cmd, unsigned long arg,
                                   void *magic, int *rcp)
{
        struct lloop_device *lo = NULL;
        struct block_device *bdev = NULL;
        int err = 0;
        dev_t dev;

        /* not our registration cookie: let other handlers try */
        if (magic != ll_iocontrol_magic)
                return LLIOC_CONT;

        if (disks == NULL)
                GOTO(out1, err = -ENODEV);

        /* NOTE(review): "llop" typo kept as-is (runtime log string) */
        CWARN("Enter llop_ioctl\n");

        mutex_lock(&lloop_mutex);
        switch (cmd) {
        case LL_IOC_LLOOP_ATTACH: {
                struct lloop_device *lo_free = NULL;
                int i;

                /* find a free slot; the loop leaves lo != NULL only when
                 * this file is already attached to some device */
                for (i = 0; i < max_loop; i++, lo = NULL) {
                        lo = &loop_dev[i];
                        if (lo->lo_state == LLOOP_UNBOUND) {
                                if (!lo_free)
                                        lo_free = lo;
                                continue;
                        }
                        if (lo->lo_backing_file->f_dentry->d_inode ==
                            file->f_dentry->d_inode)
                                break;
                }
                if (lo || !lo_free)
                        GOTO(out, err = -EBUSY);

                lo = lo_free;
                dev = MKDEV(lloop_major, lo->lo_number);

                /* quit if the used pointer is writable */
                if (put_user((long)old_encode_dev(dev), (long*)arg))
                        GOTO(out, err = -EFAULT);

                /* hold the bdev until detach, which releases it */
                bdev = blkdev_get_by_dev(dev, file->f_mode, NULL);
                if (IS_ERR(bdev))
                        GOTO(out, err = PTR_ERR(bdev));

                get_file(file);
                err = loop_set_fd(lo, NULL, bdev, file);
                if (err) {
                        /* undo the two references taken just above */
                        fput(file);
                        ll_blkdev_put(bdev, 0);
                }

                break;
        }

        case LL_IOC_LLOOP_DETACH_BYDEV: {
                int minor;

                dev = old_decode_dev(arg);
                if (MAJOR(dev) != lloop_major)
                        GOTO(out, err = -EINVAL);

                minor = MINOR(dev);
                if (minor > max_loop - 1)
                        GOTO(out, err = -EINVAL);

                lo = &loop_dev[minor];
                if (lo->lo_state != LLOOP_BOUND)
                        GOTO(out, err = -EINVAL);

                /* count == 1: caller holds only the regular-file fd */
                bdev = lo->lo_device;
                err = loop_clr_fd(lo, bdev, 1);
                if (err == 0)
                        ll_blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */

                break;
        }

        default:
                err = -EINVAL;
                break;
        }

out:
        mutex_unlock(&lloop_mutex);
out1:
        if (rcp)
                *rcp = err;
        return LLIOC_STOP;
}
 757
/*
 * Module init: register the block major and the llite ioctl hook, allocate
 * the device array plus one gendisk/request queue per minor, then publish
 * all disks in a final pass (add_disk cannot be unwound here, hence the
 * separate loop).  Returns 0, -EIO or -ENOMEM.
 */
static int __init lloop_init(void)
{
        int     i;
        unsigned int cmdlist[] = {
                LL_IOC_LLOOP_ATTACH,
                LL_IOC_LLOOP_DETACH_BYDEV,
        };

        /* clamp a bogus module parameter back to the default */
        if (max_loop < 1 || max_loop > 256) {
                max_loop = MAX_LOOP_DEFAULT;
                CWARN("lloop: invalid max_loop (must be between"
                      " 1 and 256), using default (%u)\n", max_loop);
        }

        lloop_major = register_blkdev(0, "lloop");
        if (lloop_major < 0)
                return -EIO;

        CDEBUG(D_CONFIG, "registered lloop major %d with %u minors\n",
               lloop_major, max_loop);

        /* route LL_IOC_LLOOP_* on llite files to lloop_ioctl() */
        ll_iocontrol_magic = ll_iocontrol_register(lloop_ioctl, 2, cmdlist);
        if (ll_iocontrol_magic == NULL)
                goto out_mem1;

        OBD_ALLOC_WAIT(loop_dev, max_loop * sizeof(*loop_dev));
        if (!loop_dev)
                goto out_mem1;

        OBD_ALLOC_WAIT(disks, max_loop * sizeof(*disks));
        if (!disks)
                goto out_mem2;

        for (i = 0; i < max_loop; i++) {
                disks[i] = alloc_disk(1);
                if (!disks[i])
                        goto out_mem3;
        }

        mutex_init(&lloop_mutex);

        for (i = 0; i < max_loop; i++) {
                struct lloop_device *lo = &loop_dev[i];
                struct gendisk *disk = disks[i];

                lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
                if (!lo->lo_queue)
                        goto out_mem4;

                mutex_init(&lo->lo_ctl_mutex);
                sema_init(&lo->lo_sem, 0);
                init_waitqueue_head(&lo->lo_bh_wait);
                lo->lo_number = i;
                spin_lock_init(&lo->lo_lock);
                disk->major = lloop_major;
                disk->first_minor = i;
                disk->fops = &lo_fops;
                sprintf(disk->disk_name, "lloop%d", i);
                disk->private_data = lo;
                disk->queue = lo->lo_queue;
        }

        /* We cannot fail after we call this, so another loop!*/
        for (i = 0; i < max_loop; i++)
                add_disk(disks[i]);
        return 0;

out_mem4:
        while (i--)
                blk_cleanup_queue(loop_dev[i].lo_queue);
        i = max_loop;           /* fall through to release every disk */
out_mem3:
        while (i--)
                put_disk(disks[i]);
        OBD_FREE(disks, max_loop * sizeof(*disks));
out_mem2:
        OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
out_mem1:
        /* NOTE(review): reached with ll_iocontrol_magic == NULL when
         * registration itself failed -- presumably ll_iocontrol_unregister
         * tolerates a NULL magic; confirm against its implementation */
        unregister_blkdev(lloop_major, "lloop");
        ll_iocontrol_unregister(ll_iocontrol_magic);
        CERROR("lloop: ran out of memory\n");
        return -ENOMEM;
}
 841
/*
 * Module exit: mirror of lloop_init().  The ioctl hook is removed first so
 * no new attach can race with teardown, then each disk/queue pair is torn
 * down and finally the major and device arrays are released.
 */
static void lloop_exit(void)
{
        int i;

        ll_iocontrol_unregister(ll_iocontrol_magic);
        for (i = 0; i < max_loop; i++) {
                del_gendisk(disks[i]);
                blk_cleanup_queue(loop_dev[i].lo_queue);
                put_disk(disks[i]);
        }
        if (ll_unregister_blkdev(lloop_major, "lloop"))
                CWARN("lloop: cannot unregister blkdev\n");
        else
                CDEBUG(D_CONFIG, "unregistered lloop major %d\n", lloop_major);

        OBD_FREE(disks, max_loop * sizeof(*disks));
        OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev));
}
 860
 861module_init(lloop_init);
 862module_exit(lloop_exit);
 863
 864CFS_MODULE_PARM(max_loop, "i", int, 0444, "maximum of lloop_device");
 865MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
 866MODULE_DESCRIPTION("Lustre virtual block device");
 867MODULE_LICENSE("GPL");
 868