/* linux/block/genhd.c */
   1/*
   2 *  gendisk handling
   3 */
   4
   5#include <linux/module.h>
   6#include <linux/fs.h>
   7#include <linux/genhd.h>
   8#include <linux/kdev_t.h>
   9#include <linux/kernel.h>
  10#include <linux/blkdev.h>
  11#include <linux/init.h>
  12#include <linux/spinlock.h>
  13#include <linux/proc_fs.h>
  14#include <linux/seq_file.h>
  15#include <linux/slab.h>
  16#include <linux/kmod.h>
  17#include <linux/kobj_map.h>
  18#include <linux/mutex.h>
  19#include <linux/idr.h>
  20#include <linux/log2.h>
  21#include <linux/pm_runtime.h>
  22#include <linux/badblocks.h>
  23
  24#include "blk.h"
  25
/*
 * block_class_lock guards the major_names[] hash table and is also the
 * lock handed to kobj_map_init() when bdev_map is created.
 */
static DEFINE_MUTEX(block_class_lock);
/* Top-level "block" kobject; only created when !sysfs_deprecated. */
struct kobject *block_depr;

/* for extended dynamic devt allocation, currently only one major is used */
#define NR_EXT_DEVT             (1 << MINORBITS)

/*
 * State for extended devt allocation.  ext_devt_idr maps an allocated
 * index to its partition; ext_devt_lock prevents lookup results from
 * going away underneath their user.
 */
static DEFINE_SPINLOCK(ext_devt_lock);
static DEFINE_IDR(ext_devt_idr);

/* Device type used to select whole-disk devices when iterating block_class. */
static struct device_type disk_type;

/* Disk event helpers; declared here, defined outside the visible part of this file. */
static void disk_check_events(struct disk_events *ev,
                              unsigned int *clearing_ptr);
static void disk_alloc_events(struct gendisk *disk);
static void disk_add_events(struct gendisk *disk);
static void disk_del_events(struct gendisk *disk);
static void disk_release_events(struct gendisk *disk);
  46
  47/**
  48 * disk_get_part - get partition
  49 * @disk: disk to look partition from
  50 * @partno: partition number
  51 *
  52 * Look for partition @partno from @disk.  If found, increment
  53 * reference count and return it.
  54 *
  55 * CONTEXT:
  56 * Don't care.
  57 *
  58 * RETURNS:
  59 * Pointer to the found partition on success, NULL if not found.
  60 */
  61struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
  62{
  63        struct hd_struct *part = NULL;
  64        struct disk_part_tbl *ptbl;
  65
  66        if (unlikely(partno < 0))
  67                return NULL;
  68
  69        rcu_read_lock();
  70
  71        ptbl = rcu_dereference(disk->part_tbl);
  72        if (likely(partno < ptbl->len)) {
  73                part = rcu_dereference(ptbl->part[partno]);
  74                if (part)
  75                        get_device(part_to_dev(part));
  76        }
  77
  78        rcu_read_unlock();
  79
  80        return part;
  81}
  82EXPORT_SYMBOL_GPL(disk_get_part);
  83
  84/**
  85 * disk_part_iter_init - initialize partition iterator
  86 * @piter: iterator to initialize
  87 * @disk: disk to iterate over
  88 * @flags: DISK_PITER_* flags
  89 *
  90 * Initialize @piter so that it iterates over partitions of @disk.
  91 *
  92 * CONTEXT:
  93 * Don't care.
  94 */
  95void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
  96                          unsigned int flags)
  97{
  98        struct disk_part_tbl *ptbl;
  99
 100        rcu_read_lock();
 101        ptbl = rcu_dereference(disk->part_tbl);
 102
 103        piter->disk = disk;
 104        piter->part = NULL;
 105
 106        if (flags & DISK_PITER_REVERSE)
 107                piter->idx = ptbl->len - 1;
 108        else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
 109                piter->idx = 0;
 110        else
 111                piter->idx = 1;
 112
 113        piter->flags = flags;
 114
 115        rcu_read_unlock();
 116}
 117EXPORT_SYMBOL_GPL(disk_part_iter_init);
 118
 119/**
 120 * disk_part_iter_next - proceed iterator to the next partition and return it
 121 * @piter: iterator of interest
 122 *
 123 * Proceed @piter to the next partition and return it.
 124 *
 125 * CONTEXT:
 126 * Don't care.
 127 */
 128struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
 129{
 130        struct disk_part_tbl *ptbl;
 131        int inc, end;
 132
 133        /* put the last partition */
 134        disk_put_part(piter->part);
 135        piter->part = NULL;
 136
 137        /* get part_tbl */
 138        rcu_read_lock();
 139        ptbl = rcu_dereference(piter->disk->part_tbl);
 140
 141        /* determine iteration parameters */
 142        if (piter->flags & DISK_PITER_REVERSE) {
 143                inc = -1;
 144                if (piter->flags & (DISK_PITER_INCL_PART0 |
 145                                    DISK_PITER_INCL_EMPTY_PART0))
 146                        end = -1;
 147                else
 148                        end = 0;
 149        } else {
 150                inc = 1;
 151                end = ptbl->len;
 152        }
 153
 154        /* iterate to the next partition */
 155        for (; piter->idx != end; piter->idx += inc) {
 156                struct hd_struct *part;
 157
 158                part = rcu_dereference(ptbl->part[piter->idx]);
 159                if (!part)
 160                        continue;
 161                if (!part_nr_sects_read(part) &&
 162                    !(piter->flags & DISK_PITER_INCL_EMPTY) &&
 163                    !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
 164                      piter->idx == 0))
 165                        continue;
 166
 167                get_device(part_to_dev(part));
 168                piter->part = part;
 169                piter->idx += inc;
 170                break;
 171        }
 172
 173        rcu_read_unlock();
 174
 175        return piter->part;
 176}
 177EXPORT_SYMBOL_GPL(disk_part_iter_next);
 178
 179/**
 180 * disk_part_iter_exit - finish up partition iteration
 181 * @piter: iter of interest
 182 *
 183 * Called when iteration is over.  Cleans up @piter.
 184 *
 185 * CONTEXT:
 186 * Don't care.
 187 */
 188void disk_part_iter_exit(struct disk_part_iter *piter)
 189{
 190        disk_put_part(piter->part);
 191        piter->part = NULL;
 192}
 193EXPORT_SYMBOL_GPL(disk_part_iter_exit);
 194
 195static inline int sector_in_part(struct hd_struct *part, sector_t sector)
 196{
 197        return part->start_sect <= sector &&
 198                sector < part->start_sect + part_nr_sects_read(part);
 199}
 200
 201/**
 202 * disk_map_sector_rcu - map sector to partition
 203 * @disk: gendisk of interest
 204 * @sector: sector to map
 205 *
 206 * Find out which partition @sector maps to on @disk.  This is
 207 * primarily used for stats accounting.
 208 *
 209 * CONTEXT:
 210 * RCU read locked.  The returned partition pointer is valid only
 211 * while preemption is disabled.
 212 *
 213 * RETURNS:
 214 * Found partition on success, part0 is returned if no partition matches
 215 */
 216struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
 217{
 218        struct disk_part_tbl *ptbl;
 219        struct hd_struct *part;
 220        int i;
 221
 222        ptbl = rcu_dereference(disk->part_tbl);
 223
 224        part = rcu_dereference(ptbl->last_lookup);
 225        if (part && sector_in_part(part, sector))
 226                return part;
 227
 228        for (i = 1; i < ptbl->len; i++) {
 229                part = rcu_dereference(ptbl->part[i]);
 230
 231                if (part && sector_in_part(part, sector)) {
 232                        rcu_assign_pointer(ptbl->last_lookup, part);
 233                        return part;
 234                }
 235        }
 236        return &disk->part0;
 237}
 238EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
 239
/*
 * Table of registered block majors.  Each major_names[] bucket is a
 * singly linked chain of entries (via ->next); the bucket is chosen by
 * major_to_index().
 *
 * Can be deleted altogether. Later.
 */
static struct blk_major_name {
        struct blk_major_name *next;    /* next entry in the same bucket */
        int major;                      /* registered major number */
        char name[16];                  /* driver name, copied with strlcpy() */
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
 249
/*
 * Bucket index into major_names[] for @major (major modulo the table
 * size) - for now: assume no multimajor ranges.
 */
static inline int major_to_index(unsigned major)
{
        return major % BLKDEV_MAJOR_HASH_SIZE;
}
 255
 256#ifdef CONFIG_PROC_FS
 257void blkdev_show(struct seq_file *seqf, off_t offset)
 258{
 259        struct blk_major_name *dp;
 260
 261        if (offset < BLKDEV_MAJOR_HASH_SIZE) {
 262                mutex_lock(&block_class_lock);
 263                for (dp = major_names[offset]; dp; dp = dp->next)
 264                        seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
 265                mutex_unlock(&block_class_lock);
 266        }
 267}
 268#endif /* CONFIG_PROC_FS */
 269
 270/**
 271 * register_blkdev - register a new block device
 272 *
 273 * @major: the requested major device number [1..255]. If @major=0, try to
 274 *         allocate any unused major number.
 275 * @name: the name of the new block device as a zero terminated string
 276 *
 277 * The @name must be unique within the system.
 278 *
 279 * The return value depends on the @major input parameter.
 280 *  - if a major device number was requested in range [1..255] then the
 281 *    function returns zero on success, or a negative error code
 282 *  - if any unused major number was requested with @major=0 parameter
 283 *    then the return value is the allocated major number in range
 284 *    [1..255] or a negative error code otherwise
 285 */
 286int register_blkdev(unsigned int major, const char *name)
 287{
 288        struct blk_major_name **n, *p;
 289        int index, ret = 0;
 290
 291        mutex_lock(&block_class_lock);
 292
 293        /* temporary */
 294        if (major == 0) {
 295                for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
 296                        if (major_names[index] == NULL)
 297                                break;
 298                }
 299
 300                if (index == 0) {
 301                        printk("register_blkdev: failed to get major for %s\n",
 302                               name);
 303                        ret = -EBUSY;
 304                        goto out;
 305                }
 306                major = index;
 307                ret = major;
 308        }
 309
 310        p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
 311        if (p == NULL) {
 312                ret = -ENOMEM;
 313                goto out;
 314        }
 315
 316        p->major = major;
 317        strlcpy(p->name, name, sizeof(p->name));
 318        p->next = NULL;
 319        index = major_to_index(major);
 320
 321        for (n = &major_names[index]; *n; n = &(*n)->next) {
 322                if ((*n)->major == major)
 323                        break;
 324        }
 325        if (!*n)
 326                *n = p;
 327        else
 328                ret = -EBUSY;
 329
 330        if (ret < 0) {
 331                printk("register_blkdev: cannot get major %d for %s\n",
 332                       major, name);
 333                kfree(p);
 334        }
 335out:
 336        mutex_unlock(&block_class_lock);
 337        return ret;
 338}
 339
 340EXPORT_SYMBOL(register_blkdev);
 341
 342void unregister_blkdev(unsigned int major, const char *name)
 343{
 344        struct blk_major_name **n;
 345        struct blk_major_name *p = NULL;
 346        int index = major_to_index(major);
 347
 348        mutex_lock(&block_class_lock);
 349        for (n = &major_names[index]; *n; n = &(*n)->next)
 350                if ((*n)->major == major)
 351                        break;
 352        if (!*n || strcmp((*n)->name, name)) {
 353                WARN_ON(1);
 354        } else {
 355                p = *n;
 356                *n = p->next;
 357        }
 358        mutex_unlock(&block_class_lock);
 359        kfree(p);
 360}
 361
 362EXPORT_SYMBOL(unregister_blkdev);
 363
/*
 * dev_t range map for block devices: created by kobj_map_init() in
 * genhd_device_init(), filled through blk_register_region() and
 * queried with kobj_lookup() in get_gendisk().
 */
static struct kobj_map *bdev_map;
 365
 366/**
 367 * blk_mangle_minor - scatter minor numbers apart
 368 * @minor: minor number to mangle
 369 *
 370 * Scatter consecutively allocated @minor number apart if MANGLE_DEVT
 371 * is enabled.  Mangling twice gives the original value.
 372 *
 373 * RETURNS:
 374 * Mangled value.
 375 *
 376 * CONTEXT:
 377 * Don't care.
 378 */
 379static int blk_mangle_minor(int minor)
 380{
 381#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
 382        int i;
 383
 384        for (i = 0; i < MINORBITS / 2; i++) {
 385                int low = minor & (1 << i);
 386                int high = minor & (1 << (MINORBITS - 1 - i));
 387                int distance = MINORBITS - 1 - 2 * i;
 388
 389                minor ^= low | high;    /* clear both bits */
 390                low <<= distance;       /* swap the positions */
 391                high >>= distance;
 392                minor |= low | high;    /* and set */
 393        }
 394#endif
 395        return minor;
 396}
 397
 398/**
 399 * blk_alloc_devt - allocate a dev_t for a partition
 400 * @part: partition to allocate dev_t for
 401 * @devt: out parameter for resulting dev_t
 402 *
 403 * Allocate a dev_t for block device.
 404 *
 405 * RETURNS:
 406 * 0 on success, allocated dev_t is returned in *@devt.  -errno on
 407 * failure.
 408 *
 409 * CONTEXT:
 410 * Might sleep.
 411 */
 412int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
 413{
 414        struct gendisk *disk = part_to_disk(part);
 415        int idx;
 416
 417        /* in consecutive minor range? */
 418        if (part->partno < disk->minors) {
 419                *devt = MKDEV(disk->major, disk->first_minor + part->partno);
 420                return 0;
 421        }
 422
 423        /* allocate ext devt */
 424        idr_preload(GFP_KERNEL);
 425
 426        spin_lock_bh(&ext_devt_lock);
 427        idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT);
 428        spin_unlock_bh(&ext_devt_lock);
 429
 430        idr_preload_end();
 431        if (idx < 0)
 432                return idx == -ENOSPC ? -EBUSY : idx;
 433
 434        *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx));
 435        return 0;
 436}
 437
 438/**
 439 * blk_free_devt - free a dev_t
 440 * @devt: dev_t to free
 441 *
 442 * Free @devt which was allocated using blk_alloc_devt().
 443 *
 444 * CONTEXT:
 445 * Might sleep.
 446 */
 447void blk_free_devt(dev_t devt)
 448{
 449        if (devt == MKDEV(0, 0))
 450                return;
 451
 452        if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
 453                spin_lock_bh(&ext_devt_lock);
 454                idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
 455                spin_unlock_bh(&ext_devt_lock);
 456        }
 457}
 458
 459static char *bdevt_str(dev_t devt, char *buf)
 460{
 461        if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) {
 462                char tbuf[BDEVT_SIZE];
 463                snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
 464                snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
 465        } else
 466                snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
 467
 468        return buf;
 469}
 470
/*
 * Register device numbers dev..(dev+range-1)
 * range must be nonzero
 * The hash chain is sorted on range, so that subranges can override.
 *
 * Thin wrapper: forwards all arguments to kobj_map() on bdev_map.
 */
void blk_register_region(dev_t devt, unsigned long range, struct module *module,
                         struct kobject *(*probe)(dev_t, int *, void *),
                         int (*lock)(dev_t, void *), void *data)
{
        kobj_map(bdev_map, devt, range, module, probe, lock, data);
}

EXPORT_SYMBOL(blk_register_region);
 484
/*
 * Drop the registration of device numbers devt..(devt+range-1) by
 * forwarding to kobj_unmap() on bdev_map.
 */
void blk_unregister_region(dev_t devt, unsigned long range)
{
        kobj_unmap(bdev_map, devt, range);
}

EXPORT_SYMBOL(blk_unregister_region);
 491
 492static struct kobject *exact_match(dev_t devt, int *partno, void *data)
 493{
 494        struct gendisk *p = data;
 495
 496        return &disk_to_dev(p)->kobj;
 497}
 498
 499static int exact_lock(dev_t devt, void *data)
 500{
 501        struct gendisk *p = data;
 502
 503        if (!get_disk(p))
 504                return -1;
 505        return 0;
 506}
 507
 508static void register_disk(struct gendisk *disk)
 509{
 510        struct device *ddev = disk_to_dev(disk);
 511        struct block_device *bdev;
 512        struct disk_part_iter piter;
 513        struct hd_struct *part;
 514        int err;
 515
 516        ddev->parent = disk->driverfs_dev;
 517
 518        dev_set_name(ddev, "%s", disk->disk_name);
 519
 520        /* delay uevents, until we scanned partition table */
 521        dev_set_uevent_suppress(ddev, 1);
 522
 523        if (device_add(ddev))
 524                return;
 525        if (!sysfs_deprecated) {
 526                err = sysfs_create_link(block_depr, &ddev->kobj,
 527                                        kobject_name(&ddev->kobj));
 528                if (err) {
 529                        device_del(ddev);
 530                        return;
 531                }
 532        }
 533
 534        /*
 535         * avoid probable deadlock caused by allocating memory with
 536         * GFP_KERNEL in runtime_resume callback of its all ancestor
 537         * devices
 538         */
 539        pm_runtime_set_memalloc_noio(ddev, true);
 540
 541        disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
 542        disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
 543
 544        /* No minors to use for partitions */
 545        if (!disk_part_scan_enabled(disk))
 546                goto exit;
 547
 548        /* No such device (e.g., media were just removed) */
 549        if (!get_capacity(disk))
 550                goto exit;
 551
 552        bdev = bdget_disk(disk, 0);
 553        if (!bdev)
 554                goto exit;
 555
 556        bdev->bd_invalidated = 1;
 557        err = blkdev_get(bdev, FMODE_READ, NULL);
 558        if (err < 0)
 559                goto exit;
 560        blkdev_put(bdev, FMODE_READ);
 561
 562exit:
 563        /* announce disk after possible partitions are created */
 564        dev_set_uevent_suppress(ddev, 0);
 565        kobject_uevent(&ddev->kobj, KOBJ_ADD);
 566
 567        /* announce possible partitions */
 568        disk_part_iter_init(&piter, disk, 0);
 569        while ((part = disk_part_iter_next(&piter)))
 570                kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
 571        disk_part_iter_exit(&piter);
 572}
 573
 574/**
 575 * add_disk - add partitioning information to kernel list
 576 * @disk: per-device partitioning information
 577 *
 578 * This function registers the partitioning information in @disk
 579 * with the kernel.
 580 *
 581 * FIXME: error handling
 582 */
 583void add_disk(struct gendisk *disk)
 584{
 585        struct backing_dev_info *bdi;
 586        dev_t devt;
 587        int retval;
 588
 589        /* minors == 0 indicates to use ext devt from part0 and should
 590         * be accompanied with EXT_DEVT flag.  Make sure all
 591         * parameters make sense.
 592         */
 593        WARN_ON(disk->minors && !(disk->major || disk->first_minor));
 594        WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT));
 595
 596        disk->flags |= GENHD_FL_UP;
 597
 598        retval = blk_alloc_devt(&disk->part0, &devt);
 599        if (retval) {
 600                WARN_ON(1);
 601                return;
 602        }
 603        disk_to_dev(disk)->devt = devt;
 604
 605        /* ->major and ->first_minor aren't supposed to be
 606         * dereferenced from here on, but set them just in case.
 607         */
 608        disk->major = MAJOR(devt);
 609        disk->first_minor = MINOR(devt);
 610
 611        disk_alloc_events(disk);
 612
 613        /* Register BDI before referencing it from bdev */
 614        bdi = &disk->queue->backing_dev_info;
 615        bdi_register_dev(bdi, disk_devt(disk));
 616
 617        blk_register_region(disk_devt(disk), disk->minors, NULL,
 618                            exact_match, exact_lock, disk);
 619        register_disk(disk);
 620        blk_register_queue(disk);
 621
 622        /*
 623         * Take an extra ref on queue which will be put on disk_release()
 624         * so that it sticks around as long as @disk is there.
 625         */
 626        WARN_ON_ONCE(!blk_get_queue(disk->queue));
 627
 628        retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
 629                                   "bdi");
 630        WARN_ON(retval);
 631
 632        disk_add_events(disk);
 633}
 634EXPORT_SYMBOL(add_disk);
 635
 636void del_gendisk(struct gendisk *disk)
 637{
 638        struct disk_part_iter piter;
 639        struct hd_struct *part;
 640
 641        disk_del_events(disk);
 642
 643        /* invalidate stuff */
 644        disk_part_iter_init(&piter, disk,
 645                             DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
 646        while ((part = disk_part_iter_next(&piter))) {
 647                invalidate_partition(disk, part->partno);
 648                delete_partition(disk, part->partno);
 649        }
 650        disk_part_iter_exit(&piter);
 651
 652        invalidate_partition(disk, 0);
 653        set_capacity(disk, 0);
 654        disk->flags &= ~GENHD_FL_UP;
 655
 656        sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
 657        bdi_unregister(&disk->queue->backing_dev_info);
 658        blk_unregister_queue(disk);
 659        blk_unregister_region(disk_devt(disk), disk->minors);
 660
 661        part_stat_set_all(&disk->part0, 0);
 662        disk->part0.stamp = 0;
 663
 664        kobject_put(disk->part0.holder_dir);
 665        kobject_put(disk->slave_dir);
 666        if (!sysfs_deprecated)
 667                sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
 668        pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
 669        device_del(disk_to_dev(disk));
 670}
 671EXPORT_SYMBOL(del_gendisk);
 672
 673/* sysfs access to bad-blocks list. */
 674static ssize_t disk_badblocks_show(struct device *dev,
 675                                        struct device_attribute *attr,
 676                                        char *page)
 677{
 678        struct gendisk *disk = dev_to_disk(dev);
 679
 680        if (!disk->bb)
 681                return sprintf(page, "\n");
 682
 683        return badblocks_show(disk->bb, page, 0);
 684}
 685
 686static ssize_t disk_badblocks_store(struct device *dev,
 687                                        struct device_attribute *attr,
 688                                        const char *page, size_t len)
 689{
 690        struct gendisk *disk = dev_to_disk(dev);
 691
 692        if (!disk->bb)
 693                return -ENXIO;
 694
 695        return badblocks_store(disk->bb, page, len, 0);
 696}
 697
 698/**
 699 * get_gendisk - get partitioning information for a given device
 700 * @devt: device to get partitioning information for
 701 * @partno: returned partition index
 702 *
 703 * This function gets the structure containing partitioning
 704 * information for the given device @devt.
 705 */
 706struct gendisk *get_gendisk(dev_t devt, int *partno)
 707{
 708        struct gendisk *disk = NULL;
 709
 710        if (MAJOR(devt) != BLOCK_EXT_MAJOR) {
 711                struct kobject *kobj;
 712
 713                kobj = kobj_lookup(bdev_map, devt, partno);
 714                if (kobj)
 715                        disk = dev_to_disk(kobj_to_dev(kobj));
 716        } else {
 717                struct hd_struct *part;
 718
 719                spin_lock_bh(&ext_devt_lock);
 720                part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
 721                if (part && get_disk(part_to_disk(part))) {
 722                        *partno = part->partno;
 723                        disk = part_to_disk(part);
 724                }
 725                spin_unlock_bh(&ext_devt_lock);
 726        }
 727
 728        return disk;
 729}
 730EXPORT_SYMBOL(get_gendisk);
 731
 732/**
 733 * bdget_disk - do bdget() by gendisk and partition number
 734 * @disk: gendisk of interest
 735 * @partno: partition number
 736 *
 737 * Find partition @partno from @disk, do bdget() on it.
 738 *
 739 * CONTEXT:
 740 * Don't care.
 741 *
 742 * RETURNS:
 743 * Resulting block_device on success, NULL on failure.
 744 */
 745struct block_device *bdget_disk(struct gendisk *disk, int partno)
 746{
 747        struct hd_struct *part;
 748        struct block_device *bdev = NULL;
 749
 750        part = disk_get_part(disk, partno);
 751        if (part)
 752                bdev = bdget(part_devt(part));
 753        disk_put_part(part);
 754
 755        return bdev;
 756}
 757EXPORT_SYMBOL(bdget_disk);
 758
 759/*
 760 * print a full list of all partitions - intended for places where the root
 761 * filesystem can't be mounted and thus to give the victim some idea of what
 762 * went wrong
 763 */
 764void __init printk_all_partitions(void)
 765{
 766        struct class_dev_iter iter;
 767        struct device *dev;
 768
 769        class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
 770        while ((dev = class_dev_iter_next(&iter))) {
 771                struct gendisk *disk = dev_to_disk(dev);
 772                struct disk_part_iter piter;
 773                struct hd_struct *part;
 774                char name_buf[BDEVNAME_SIZE];
 775                char devt_buf[BDEVT_SIZE];
 776
 777                /*
 778                 * Don't show empty devices or things that have been
 779                 * suppressed
 780                 */
 781                if (get_capacity(disk) == 0 ||
 782                    (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
 783                        continue;
 784
 785                /*
 786                 * Note, unlike /proc/partitions, I am showing the
 787                 * numbers in hex - the same format as the root=
 788                 * option takes.
 789                 */
 790                disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
 791                while ((part = disk_part_iter_next(&piter))) {
 792                        bool is_part0 = part == &disk->part0;
 793
 794                        printk("%s%s %10llu %s %s", is_part0 ? "" : "  ",
 795                               bdevt_str(part_devt(part), devt_buf),
 796                               (unsigned long long)part_nr_sects_read(part) >> 1
 797                               , disk_name(disk, part->partno, name_buf),
 798                               part->info ? part->info->uuid : "");
 799                        if (is_part0) {
 800                                if (disk->driverfs_dev != NULL &&
 801                                    disk->driverfs_dev->driver != NULL)
 802                                        printk(" driver: %s\n",
 803                                              disk->driverfs_dev->driver->name);
 804                                else
 805                                        printk(" (driver?)\n");
 806                        } else
 807                                printk("\n");
 808                }
 809                disk_part_iter_exit(&piter);
 810        }
 811        class_dev_iter_exit(&iter);
 812}
 813
 814#ifdef CONFIG_PROC_FS
 815/* iterator */
 816static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
 817{
 818        loff_t skip = *pos;
 819        struct class_dev_iter *iter;
 820        struct device *dev;
 821
 822        iter = kmalloc(sizeof(*iter), GFP_KERNEL);
 823        if (!iter)
 824                return ERR_PTR(-ENOMEM);
 825
 826        seqf->private = iter;
 827        class_dev_iter_init(iter, &block_class, NULL, &disk_type);
 828        do {
 829                dev = class_dev_iter_next(iter);
 830                if (!dev)
 831                        return NULL;
 832        } while (skip--);
 833
 834        return dev_to_disk(dev);
 835}
 836
 837static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
 838{
 839        struct device *dev;
 840
 841        (*pos)++;
 842        dev = class_dev_iter_next(seqf->private);
 843        if (dev)
 844                return dev_to_disk(dev);
 845
 846        return NULL;
 847}
 848
 849static void disk_seqf_stop(struct seq_file *seqf, void *v)
 850{
 851        struct class_dev_iter *iter = seqf->private;
 852
 853        /* stop is called even after start failed :-( */
 854        if (iter) {
 855                class_dev_iter_exit(iter);
 856                kfree(iter);
 857                seqf->private = NULL;
 858        }
 859}
 860
 861static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
 862{
 863        void *p;
 864
 865        p = disk_seqf_start(seqf, pos);
 866        if (!IS_ERR_OR_NULL(p) && !*pos)
 867                seq_puts(seqf, "major minor  #blocks  name\n\n");
 868        return p;
 869}
 870
 871static int show_partition(struct seq_file *seqf, void *v)
 872{
 873        struct gendisk *sgp = v;
 874        struct disk_part_iter piter;
 875        struct hd_struct *part;
 876        char buf[BDEVNAME_SIZE];
 877
 878        /* Don't show non-partitionable removeable devices or empty devices */
 879        if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
 880                                   (sgp->flags & GENHD_FL_REMOVABLE)))
 881                return 0;
 882        if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
 883                return 0;
 884
 885        /* show the full disk and all non-0 size partitions of it */
 886        disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0);
 887        while ((part = disk_part_iter_next(&piter)))
 888                seq_printf(seqf, "%4d  %7d %10llu %s\n",
 889                           MAJOR(part_devt(part)), MINOR(part_devt(part)),
 890                           (unsigned long long)part_nr_sects_read(part) >> 1,
 891                           disk_name(sgp, part->partno, buf));
 892        disk_part_iter_exit(&piter);
 893
 894        return 0;
 895}
 896
/*
 * seq_file callbacks for the partition listing: the start hook adds
 * the header line, next/stop are the shared disk iterator helpers and
 * show prints one disk with its partitions.
 */
static const struct seq_operations partitions_op = {
        .start  = show_partition_start,
        .next   = disk_seqf_next,
        .stop   = disk_seqf_stop,
        .show   = show_partition
};
 903
/* ->open handler: attach the partitions_op seq_file iterator to @file. */
static int partitions_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &partitions_op);
}
 908
/*
 * File operations built on the seq_file helpers, opened through
 * partitions_open().
 * NOTE(review): presumably registered as /proc/partitions elsewhere in
 * this file - the registration is not visible here, confirm.
 */
static const struct file_operations proc_partitions_operations = {
        .open           = partitions_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};
 915#endif
 916
 917
 918static struct kobject *base_probe(dev_t devt, int *partno, void *data)
 919{
 920        if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
 921                /* Make old-style 2.4 aliases work */
 922                request_module("block-major-%d", MAJOR(devt));
 923        return NULL;
 924}
 925
 926static int __init genhd_device_init(void)
 927{
 928        int error;
 929
 930        block_class.dev_kobj = sysfs_dev_block_kobj;
 931        error = class_register(&block_class);
 932        if (unlikely(error))
 933                return error;
 934        bdev_map = kobj_map_init(base_probe, &block_class_lock);
 935        blk_dev_init();
 936
 937        register_blkdev(BLOCK_EXT_MAJOR, "blkext");
 938
 939        /* create top-level block dir */
 940        if (!sysfs_deprecated)
 941                block_depr = kobject_create_and_add("block", NULL);
 942        return 0;
 943}
 944
 945subsys_initcall(genhd_device_init);
 946
 947static ssize_t disk_range_show(struct device *dev,
 948                               struct device_attribute *attr, char *buf)
 949{
 950        struct gendisk *disk = dev_to_disk(dev);
 951
 952        return sprintf(buf, "%d\n", disk->minors);
 953}
 954
 955static ssize_t disk_ext_range_show(struct device *dev,
 956                                   struct device_attribute *attr, char *buf)
 957{
 958        struct gendisk *disk = dev_to_disk(dev);
 959
 960        return sprintf(buf, "%d\n", disk_max_parts(disk));
 961}
 962
 963static ssize_t disk_removable_show(struct device *dev,
 964                                   struct device_attribute *attr, char *buf)
 965{
 966        struct gendisk *disk = dev_to_disk(dev);
 967
 968        return sprintf(buf, "%d\n",
 969                       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
 970}
 971
 972static ssize_t disk_ro_show(struct device *dev,
 973                                   struct device_attribute *attr, char *buf)
 974{
 975        struct gendisk *disk = dev_to_disk(dev);
 976
 977        return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
 978}
 979
 980static ssize_t disk_capability_show(struct device *dev,
 981                                    struct device_attribute *attr, char *buf)
 982{
 983        struct gendisk *disk = dev_to_disk(dev);
 984
 985        return sprintf(buf, "%x\n", disk->flags);
 986}
 987
 988static ssize_t disk_alignment_offset_show(struct device *dev,
 989                                          struct device_attribute *attr,
 990                                          char *buf)
 991{
 992        struct gendisk *disk = dev_to_disk(dev);
 993
 994        return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
 995}
 996
 997static ssize_t disk_discard_alignment_show(struct device *dev,
 998                                           struct device_attribute *attr,
 999                                           char *buf)
1000{
1001        struct gendisk *disk = dev_to_disk(dev);
1002
1003        return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
1004}
1005
1006static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
1007static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
1008static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
1009static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
1010static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
1011static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
1012static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
1013                   NULL);
1014static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
1015static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
1016static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
1017static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show,
1018                disk_badblocks_store);
1019#ifdef CONFIG_FAIL_MAKE_REQUEST
1020static struct device_attribute dev_attr_fail =
1021        __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
1022#endif
1023#ifdef CONFIG_FAIL_IO_TIMEOUT
1024static struct device_attribute dev_attr_fail_timeout =
1025        __ATTR(io-timeout-fail,  S_IRUGO|S_IWUSR, part_timeout_show,
1026                part_timeout_store);
1027#endif
1028
1029static struct attribute *disk_attrs[] = {
1030        &dev_attr_range.attr,
1031        &dev_attr_ext_range.attr,
1032        &dev_attr_removable.attr,
1033        &dev_attr_ro.attr,
1034        &dev_attr_size.attr,
1035        &dev_attr_alignment_offset.attr,
1036        &dev_attr_discard_alignment.attr,
1037        &dev_attr_capability.attr,
1038        &dev_attr_stat.attr,
1039        &dev_attr_inflight.attr,
1040        &dev_attr_badblocks.attr,
1041#ifdef CONFIG_FAIL_MAKE_REQUEST
1042        &dev_attr_fail.attr,
1043#endif
1044#ifdef CONFIG_FAIL_IO_TIMEOUT
1045        &dev_attr_fail_timeout.attr,
1046#endif
1047        NULL
1048};
1049
1050static struct attribute_group disk_attr_group = {
1051        .attrs = disk_attrs,
1052};
1053
1054static const struct attribute_group *disk_attr_groups[] = {
1055        &disk_attr_group,
1056        NULL
1057};
1058
1059/**
1060 * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
1061 * @disk: disk to replace part_tbl for
1062 * @new_ptbl: new part_tbl to install
1063 *
1064 * Replace disk->part_tbl with @new_ptbl in RCU-safe way.  The
1065 * original ptbl is freed using RCU callback.
1066 *
1067 * LOCKING:
1068 * Matching bd_mutx locked.
1069 */
1070static void disk_replace_part_tbl(struct gendisk *disk,
1071                                  struct disk_part_tbl *new_ptbl)
1072{
1073        struct disk_part_tbl *old_ptbl = disk->part_tbl;
1074
1075        rcu_assign_pointer(disk->part_tbl, new_ptbl);
1076
1077        if (old_ptbl) {
1078                rcu_assign_pointer(old_ptbl->last_lookup, NULL);
1079                kfree_rcu(old_ptbl, rcu_head);
1080        }
1081}
1082
1083/**
1084 * disk_expand_part_tbl - expand disk->part_tbl
1085 * @disk: disk to expand part_tbl for
1086 * @partno: expand such that this partno can fit in
1087 *
1088 * Expand disk->part_tbl such that @partno can fit in.  disk->part_tbl
1089 * uses RCU to allow unlocked dereferencing for stats and other stuff.
1090 *
1091 * LOCKING:
1092 * Matching bd_mutex locked, might sleep.
1093 *
1094 * RETURNS:
1095 * 0 on success, -errno on failure.
1096 */
1097int disk_expand_part_tbl(struct gendisk *disk, int partno)
1098{
1099        struct disk_part_tbl *old_ptbl = disk->part_tbl;
1100        struct disk_part_tbl *new_ptbl;
1101        int len = old_ptbl ? old_ptbl->len : 0;
1102        int target = partno + 1;
1103        size_t size;
1104        int i;
1105
1106        /* disk_max_parts() is zero during initialization, ignore if so */
1107        if (disk_max_parts(disk) && target > disk_max_parts(disk))
1108                return -EINVAL;
1109
1110        if (target <= len)
1111                return 0;
1112
1113        size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]);
1114        new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id);
1115        if (!new_ptbl)
1116                return -ENOMEM;
1117
1118        new_ptbl->len = target;
1119
1120        for (i = 0; i < len; i++)
1121                rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
1122
1123        disk_replace_part_tbl(disk, new_ptbl);
1124        return 0;
1125}
1126
1127static void disk_release(struct device *dev)
1128{
1129        struct gendisk *disk = dev_to_disk(dev);
1130
1131        blk_free_devt(dev->devt);
1132        disk_release_events(disk);
1133        kfree(disk->random);
1134        disk_replace_part_tbl(disk, NULL);
1135        free_part_stats(&disk->part0);
1136        free_part_info(&disk->part0);
1137        if (disk->queue)
1138                blk_put_queue(disk->queue);
1139        kfree(disk);
1140}
1141struct class block_class = {
1142        .name           = "block",
1143};
1144
1145static char *block_devnode(struct device *dev, umode_t *mode,
1146                           kuid_t *uid, kgid_t *gid)
1147{
1148        struct gendisk *disk = dev_to_disk(dev);
1149
1150        if (disk->devnode)
1151                return disk->devnode(disk, mode);
1152        return NULL;
1153}
1154
1155static struct device_type disk_type = {
1156        .name           = "disk",
1157        .groups         = disk_attr_groups,
1158        .release        = disk_release,
1159        .devnode        = block_devnode,
1160};
1161
1162#ifdef CONFIG_PROC_FS
1163/*
1164 * aggregate disk stat collector.  Uses the same stats that the sysfs
1165 * entries do, above, but makes them available through one seq_file.
1166 *
1167 * The output looks suspiciously like /proc/partitions with a bunch of
1168 * extra fields.
1169 */
1170static int diskstats_show(struct seq_file *seqf, void *v)
1171{
1172        struct gendisk *gp = v;
1173        struct disk_part_iter piter;
1174        struct hd_struct *hd;
1175        char buf[BDEVNAME_SIZE];
1176        int cpu;
1177
1178        /*
1179        if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
1180                seq_puts(seqf,  "major minor name"
1181                                "     rio rmerge rsect ruse wio wmerge "
1182                                "wsect wuse running use aveq"
1183                                "\n\n");
1184        */
1185
1186        disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
1187        while ((hd = disk_part_iter_next(&piter))) {
1188                cpu = part_stat_lock();
1189                part_round_stats(cpu, hd);
1190                part_stat_unlock();
1191                seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
1192                           "%u %lu %lu %lu %u %u %u %u\n",
1193                           MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
1194                           disk_name(gp, hd->partno, buf),
1195                           part_stat_read(hd, ios[READ]),
1196                           part_stat_read(hd, merges[READ]),
1197                           part_stat_read(hd, sectors[READ]),
1198                           jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
1199                           part_stat_read(hd, ios[WRITE]),
1200                           part_stat_read(hd, merges[WRITE]),
1201                           part_stat_read(hd, sectors[WRITE]),
1202                           jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
1203                           part_in_flight(hd),
1204                           jiffies_to_msecs(part_stat_read(hd, io_ticks)),
1205                           jiffies_to_msecs(part_stat_read(hd, time_in_queue))
1206                        );
1207        }
1208        disk_part_iter_exit(&piter);
1209
1210        return 0;
1211}
1212
1213static const struct seq_operations diskstats_op = {
1214        .start  = disk_seqf_start,
1215        .next   = disk_seqf_next,
1216        .stop   = disk_seqf_stop,
1217        .show   = diskstats_show
1218};
1219
1220static int diskstats_open(struct inode *inode, struct file *file)
1221{
1222        return seq_open(file, &diskstats_op);
1223}
1224
1225static const struct file_operations proc_diskstats_operations = {
1226        .open           = diskstats_open,
1227        .read           = seq_read,
1228        .llseek         = seq_lseek,
1229        .release        = seq_release,
1230};
1231
1232static int __init proc_genhd_init(void)
1233{
1234        proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
1235        proc_create("partitions", 0, NULL, &proc_partitions_operations);
1236        return 0;
1237}
1238module_init(proc_genhd_init);
1239#endif /* CONFIG_PROC_FS */
1240
1241dev_t blk_lookup_devt(const char *name, int partno)
1242{
1243        dev_t devt = MKDEV(0, 0);
1244        struct class_dev_iter iter;
1245        struct device *dev;
1246
1247        class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
1248        while ((dev = class_dev_iter_next(&iter))) {
1249                struct gendisk *disk = dev_to_disk(dev);
1250                struct hd_struct *part;
1251
1252                if (strcmp(dev_name(dev), name))
1253                        continue;
1254
1255                if (partno < disk->minors) {
1256                        /* We need to return the right devno, even
1257                         * if the partition doesn't exist yet.
1258                         */
1259                        devt = MKDEV(MAJOR(dev->devt),
1260                                     MINOR(dev->devt) + partno);
1261                        break;
1262                }
1263                part = disk_get_part(disk, partno);
1264                if (part) {
1265                        devt = part_devt(part);
1266                        disk_put_part(part);
1267                        break;
1268                }
1269                disk_put_part(part);
1270        }
1271        class_dev_iter_exit(&iter);
1272        return devt;
1273}
1274EXPORT_SYMBOL(blk_lookup_devt);
1275
1276struct gendisk *alloc_disk(int minors)
1277{
1278        return alloc_disk_node(minors, NUMA_NO_NODE);
1279}
1280EXPORT_SYMBOL(alloc_disk);
1281
1282struct gendisk *alloc_disk_node(int minors, int node_id)
1283{
1284        struct gendisk *disk;
1285
1286        disk = kmalloc_node(sizeof(struct gendisk),
1287                                GFP_KERNEL | __GFP_ZERO, node_id);
1288        if (disk) {
1289                if (!init_part_stats(&disk->part0)) {
1290                        kfree(disk);
1291                        return NULL;
1292                }
1293                disk->node_id = node_id;
1294                if (disk_expand_part_tbl(disk, 0)) {
1295                        free_part_stats(&disk->part0);
1296                        kfree(disk);
1297                        return NULL;
1298                }
1299                disk->part_tbl->part[0] = &disk->part0;
1300
1301                /*
1302                 * set_capacity() and get_capacity() currently don't use
1303                 * seqcounter to read/update the part0->nr_sects. Still init
1304                 * the counter as we can read the sectors in IO submission
1305                 * patch using seqence counters.
1306                 *
1307                 * TODO: Ideally set_capacity() and get_capacity() should be
1308                 * converted to make use of bd_mutex and sequence counters.
1309                 */
1310                seqcount_init(&disk->part0.nr_sects_seq);
1311                hd_ref_init(&disk->part0);
1312
1313                disk->minors = minors;
1314                rand_initialize_disk(disk);
1315                disk_to_dev(disk)->class = &block_class;
1316                disk_to_dev(disk)->type = &disk_type;
1317                device_initialize(disk_to_dev(disk));
1318        }
1319        return disk;
1320}
1321EXPORT_SYMBOL(alloc_disk_node);
1322
1323struct kobject *get_disk(struct gendisk *disk)
1324{
1325        struct module *owner;
1326        struct kobject *kobj;
1327
1328        if (!disk->fops)
1329                return NULL;
1330        owner = disk->fops->owner;
1331        if (owner && !try_module_get(owner))
1332                return NULL;
1333        kobj = kobject_get(&disk_to_dev(disk)->kobj);
1334        if (kobj == NULL) {
1335                module_put(owner);
1336                return NULL;
1337        }
1338        return kobj;
1339
1340}
1341
1342EXPORT_SYMBOL(get_disk);
1343
1344void put_disk(struct gendisk *disk)
1345{
1346        if (disk)
1347                kobject_put(&disk_to_dev(disk)->kobj);
1348}
1349
1350EXPORT_SYMBOL(put_disk);
1351
1352static void set_disk_ro_uevent(struct gendisk *gd, int ro)
1353{
1354        char event[] = "DISK_RO=1";
1355        char *envp[] = { event, NULL };
1356
1357        if (!ro)
1358                event[8] = '0';
1359        kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
1360}
1361
1362void set_device_ro(struct block_device *bdev, int flag)
1363{
1364        bdev->bd_part->policy = flag;
1365}
1366
1367EXPORT_SYMBOL(set_device_ro);
1368
1369void set_disk_ro(struct gendisk *disk, int flag)
1370{
1371        struct disk_part_iter piter;
1372        struct hd_struct *part;
1373
1374        if (disk->part0.policy != flag) {
1375                set_disk_ro_uevent(disk, flag);
1376                disk->part0.policy = flag;
1377        }
1378
1379        disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
1380        while ((part = disk_part_iter_next(&piter)))
1381                part->policy = flag;
1382        disk_part_iter_exit(&piter);
1383}
1384
1385EXPORT_SYMBOL(set_disk_ro);
1386
1387int bdev_read_only(struct block_device *bdev)
1388{
1389        if (!bdev)
1390                return 0;
1391        return bdev->bd_part->policy;
1392}
1393
1394EXPORT_SYMBOL(bdev_read_only);
1395
1396int invalidate_partition(struct gendisk *disk, int partno)
1397{
1398        int res = 0;
1399        struct block_device *bdev = bdget_disk(disk, partno);
1400        if (bdev) {
1401                fsync_bdev(bdev);
1402                res = __invalidate_device(bdev, true);
1403                bdput(bdev);
1404        }
1405        return res;
1406}
1407
1408EXPORT_SYMBOL(invalidate_partition);
1409
1410/*
1411 * Disk events - monitor disk events like media change and eject request.
1412 */
1413struct disk_events {
1414        struct list_head        node;           /* all disk_event's */
1415        struct gendisk          *disk;          /* the associated disk */
1416        spinlock_t              lock;
1417
1418        struct mutex            block_mutex;    /* protects blocking */
1419        int                     block;          /* event blocking depth */
1420        unsigned int            pending;        /* events already sent out */
1421        unsigned int            clearing;       /* events being cleared */
1422
1423        long                    poll_msecs;     /* interval, -1 for default */
1424        struct delayed_work     dwork;
1425};
1426
1427static const char *disk_events_strs[] = {
1428        [ilog2(DISK_EVENT_MEDIA_CHANGE)]        = "media_change",
1429        [ilog2(DISK_EVENT_EJECT_REQUEST)]       = "eject_request",
1430};
1431
1432static char *disk_uevents[] = {
1433        [ilog2(DISK_EVENT_MEDIA_CHANGE)]        = "DISK_MEDIA_CHANGE=1",
1434        [ilog2(DISK_EVENT_EJECT_REQUEST)]       = "DISK_EJECT_REQUEST=1",
1435};
1436
/* list of all disk_events, protected by disk_events_mutex */
static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);

/*
 * Default poll interval in msecs.  Zero (the implicit initial value of
 * a static) disables in-kernel polling by default.
 */
static unsigned long disk_events_dfl_poll_msecs;
1443
1444static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
1445{
1446        struct disk_events *ev = disk->ev;
1447        long intv_msecs = 0;
1448
1449        /*
1450         * If device-specific poll interval is set, always use it.  If
1451         * the default is being used, poll iff there are events which
1452         * can't be monitored asynchronously.
1453         */
1454        if (ev->poll_msecs >= 0)
1455                intv_msecs = ev->poll_msecs;
1456        else if (disk->events & ~disk->async_events)
1457                intv_msecs = disk_events_dfl_poll_msecs;
1458
1459        return msecs_to_jiffies(intv_msecs);
1460}
1461
1462/**
1463 * disk_block_events - block and flush disk event checking
1464 * @disk: disk to block events for
1465 *
1466 * On return from this function, it is guaranteed that event checking
1467 * isn't in progress and won't happen until unblocked by
1468 * disk_unblock_events().  Events blocking is counted and the actual
1469 * unblocking happens after the matching number of unblocks are done.
1470 *
1471 * Note that this intentionally does not block event checking from
1472 * disk_clear_events().
1473 *
1474 * CONTEXT:
1475 * Might sleep.
1476 */
1477void disk_block_events(struct gendisk *disk)
1478{
1479        struct disk_events *ev = disk->ev;
1480        unsigned long flags;
1481        bool cancel;
1482
1483        if (!ev)
1484                return;
1485
1486        /*
1487         * Outer mutex ensures that the first blocker completes canceling
1488         * the event work before further blockers are allowed to finish.
1489         */
1490        mutex_lock(&ev->block_mutex);
1491
1492        spin_lock_irqsave(&ev->lock, flags);
1493        cancel = !ev->block++;
1494        spin_unlock_irqrestore(&ev->lock, flags);
1495
1496        if (cancel)
1497                cancel_delayed_work_sync(&disk->ev->dwork);
1498
1499        mutex_unlock(&ev->block_mutex);
1500}
1501
1502static void __disk_unblock_events(struct gendisk *disk, bool check_now)
1503{
1504        struct disk_events *ev = disk->ev;
1505        unsigned long intv;
1506        unsigned long flags;
1507
1508        spin_lock_irqsave(&ev->lock, flags);
1509
1510        if (WARN_ON_ONCE(ev->block <= 0))
1511                goto out_unlock;
1512
1513        if (--ev->block)
1514                goto out_unlock;
1515
1516        /*
1517         * Not exactly a latency critical operation, set poll timer
1518         * slack to 25% and kick event check.
1519         */
1520        intv = disk_events_poll_jiffies(disk);
1521        set_timer_slack(&ev->dwork.timer, intv / 4);
1522        if (check_now)
1523                queue_delayed_work(system_freezable_wq, &ev->dwork, 0);
1524        else if (intv)
1525                queue_delayed_work(system_freezable_wq, &ev->dwork, intv);
1526out_unlock:
1527        spin_unlock_irqrestore(&ev->lock, flags);
1528}
1529
1530/**
1531 * disk_unblock_events - unblock disk event checking
1532 * @disk: disk to unblock events for
1533 *
1534 * Undo disk_block_events().  When the block count reaches zero, it
1535 * starts events polling if configured.
1536 *
1537 * CONTEXT:
1538 * Don't care.  Safe to call from irq context.
1539 */
1540void disk_unblock_events(struct gendisk *disk)
1541{
1542        if (disk->ev)
1543                __disk_unblock_events(disk, false);
1544}
1545
1546/**
1547 * disk_flush_events - schedule immediate event checking and flushing
1548 * @disk: disk to check and flush events for
1549 * @mask: events to flush
1550 *
1551 * Schedule immediate event checking on @disk if not blocked.  Events in
1552 * @mask are scheduled to be cleared from the driver.  Note that this
1553 * doesn't clear the events from @disk->ev.
1554 *
1555 * CONTEXT:
1556 * If @mask is non-zero must be called with bdev->bd_mutex held.
1557 */
1558void disk_flush_events(struct gendisk *disk, unsigned int mask)
1559{
1560        struct disk_events *ev = disk->ev;
1561
1562        if (!ev)
1563                return;
1564
1565        spin_lock_irq(&ev->lock);
1566        ev->clearing |= mask;
1567        if (!ev->block)
1568                mod_delayed_work(system_freezable_wq, &ev->dwork, 0);
1569        spin_unlock_irq(&ev->lock);
1570}
1571
1572/**
1573 * disk_clear_events - synchronously check, clear and return pending events
1574 * @disk: disk to fetch and clear events from
1575 * @mask: mask of events to be fetched and clearted
1576 *
1577 * Disk events are synchronously checked and pending events in @mask
1578 * are cleared and returned.  This ignores the block count.
1579 *
1580 * CONTEXT:
1581 * Might sleep.
1582 */
1583unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
1584{
1585        const struct block_device_operations *bdops = disk->fops;
1586        struct disk_events *ev = disk->ev;
1587        unsigned int pending;
1588        unsigned int clearing = mask;
1589
1590        if (!ev) {
1591                /* for drivers still using the old ->media_changed method */
1592                if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
1593                    bdops->media_changed && bdops->media_changed(disk))
1594                        return DISK_EVENT_MEDIA_CHANGE;
1595                return 0;
1596        }
1597
1598        disk_block_events(disk);
1599
1600        /*
1601         * store the union of mask and ev->clearing on the stack so that the
1602         * race with disk_flush_events does not cause ambiguity (ev->clearing
1603         * can still be modified even if events are blocked).
1604         */
1605        spin_lock_irq(&ev->lock);
1606        clearing |= ev->clearing;
1607        ev->clearing = 0;
1608        spin_unlock_irq(&ev->lock);
1609
1610        disk_check_events(ev, &clearing);
1611        /*
1612         * if ev->clearing is not 0, the disk_flush_events got called in the
1613         * middle of this function, so we want to run the workfn without delay.
1614         */
1615        __disk_unblock_events(disk, ev->clearing ? true : false);
1616
1617        /* then, fetch and clear pending events */
1618        spin_lock_irq(&ev->lock);
1619        pending = ev->pending & mask;
1620        ev->pending &= ~mask;
1621        spin_unlock_irq(&ev->lock);
1622        WARN_ON_ONCE(clearing & mask);
1623
1624        return pending;
1625}
1626
1627/*
1628 * Separate this part out so that a different pointer for clearing_ptr can be
1629 * passed in for disk_clear_events.
1630 */
1631static void disk_events_workfn(struct work_struct *work)
1632{
1633        struct delayed_work *dwork = to_delayed_work(work);
1634        struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
1635
1636        disk_check_events(ev, &ev->clearing);
1637}
1638
1639static void disk_check_events(struct disk_events *ev,
1640                              unsigned int *clearing_ptr)
1641{
1642        struct gendisk *disk = ev->disk;
1643        char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
1644        unsigned int clearing = *clearing_ptr;
1645        unsigned int events;
1646        unsigned long intv;
1647        int nr_events = 0, i;
1648
1649        /* check events */
1650        events = disk->fops->check_events(disk, clearing);
1651
1652        /* accumulate pending events and schedule next poll if necessary */
1653        spin_lock_irq(&ev->lock);
1654
1655        events &= ~ev->pending;
1656        ev->pending |= events;
1657        *clearing_ptr &= ~clearing;
1658
1659        intv = disk_events_poll_jiffies(disk);
1660        if (!ev->block && intv)
1661                queue_delayed_work(system_freezable_wq, &ev->dwork, intv);
1662
1663        spin_unlock_irq(&ev->lock);
1664
1665        /*
1666         * Tell userland about new events.  Only the events listed in
1667         * @disk->events are reported.  Unlisted events are processed the
1668         * same internally but never get reported to userland.
1669         */
1670        for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
1671                if (events & disk->events & (1 << i))
1672                        envp[nr_events++] = disk_uevents[i];
1673
1674        if (nr_events)
1675                kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
1676}
1677
1678/*
1679 * A disk events enabled device has the following sysfs nodes under
1680 * its /sys/block/X/ directory.
1681 *
1682 * events               : list of all supported events
1683 * events_async         : list of events which can be detected w/o polling
1684 * events_poll_msecs    : polling interval, 0: disable, -1: system default
1685 */
1686static ssize_t __disk_events_show(unsigned int events, char *buf)
1687{
1688        const char *delim = "";
1689        ssize_t pos = 0;
1690        int i;
1691
1692        for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
1693                if (events & (1 << i)) {
1694                        pos += sprintf(buf + pos, "%s%s",
1695                                       delim, disk_events_strs[i]);
1696                        delim = " ";
1697                }
1698        if (pos)
1699                pos += sprintf(buf + pos, "\n");
1700        return pos;
1701}
1702
1703static ssize_t disk_events_show(struct device *dev,
1704                                struct device_attribute *attr, char *buf)
1705{
1706        struct gendisk *disk = dev_to_disk(dev);
1707
1708        return __disk_events_show(disk->events, buf);
1709}
1710
1711static ssize_t disk_events_async_show(struct device *dev,
1712                                      struct device_attribute *attr, char *buf)
1713{
1714        struct gendisk *disk = dev_to_disk(dev);
1715
1716        return __disk_events_show(disk->async_events, buf);
1717}
1718
1719static ssize_t disk_events_poll_msecs_show(struct device *dev,
1720                                           struct device_attribute *attr,
1721                                           char *buf)
1722{
1723        struct gendisk *disk = dev_to_disk(dev);
1724
1725        return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
1726}
1727
1728static ssize_t disk_events_poll_msecs_store(struct device *dev,
1729                                            struct device_attribute *attr,
1730                                            const char *buf, size_t count)
1731{
1732        struct gendisk *disk = dev_to_disk(dev);
1733        long intv;
1734
1735        if (!count || !sscanf(buf, "%ld", &intv))
1736                return -EINVAL;
1737
1738        if (intv < 0 && intv != -1)
1739                return -EINVAL;
1740
1741        disk_block_events(disk);
1742        disk->ev->poll_msecs = intv;
1743        __disk_unblock_events(disk, true);
1744
1745        return count;
1746}
1747
1748static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL);
1749static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL);
1750static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR,
1751                         disk_events_poll_msecs_show,
1752                         disk_events_poll_msecs_store);
1753
1754static const struct attribute *disk_events_attrs[] = {
1755        &dev_attr_events.attr,
1756        &dev_attr_events_async.attr,
1757        &dev_attr_events_poll_msecs.attr,
1758        NULL,
1759};
1760
1761/*
1762 * The default polling interval can be specified by the kernel
1763 * parameter block.events_dfl_poll_msecs which defaults to 0
1764 * (disable).  This can also be modified runtime by writing to
1765 * /sys/module/block/events_dfl_poll_msecs.
1766 */
1767static int disk_events_set_dfl_poll_msecs(const char *val,
1768                                          const struct kernel_param *kp)
1769{
1770        struct disk_events *ev;
1771        int ret;
1772
1773        ret = param_set_ulong(val, kp);
1774        if (ret < 0)
1775                return ret;
1776
1777        mutex_lock(&disk_events_mutex);
1778
1779        list_for_each_entry(ev, &disk_events, node)
1780                disk_flush_events(ev->disk, 0);
1781
1782        mutex_unlock(&disk_events_mutex);
1783
1784        return 0;
1785}
1786
1787static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
1788        .set    = disk_events_set_dfl_poll_msecs,
1789        .get    = param_get_ulong,
1790};
1791
1792#undef MODULE_PARAM_PREFIX
1793#define MODULE_PARAM_PREFIX     "block."
1794
1795module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
1796                &disk_events_dfl_poll_msecs, 0644);
1797
1798/*
1799 * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
1800 */
1801static void disk_alloc_events(struct gendisk *disk)
1802{
1803        struct disk_events *ev;
1804
1805        if (!disk->fops->check_events)
1806                return;
1807
1808        ev = kzalloc(sizeof(*ev), GFP_KERNEL);
1809        if (!ev) {
1810                pr_warn("%s: failed to initialize events\n", disk->disk_name);
1811                return;
1812        }
1813
1814        INIT_LIST_HEAD(&ev->node);
1815        ev->disk = disk;
1816        spin_lock_init(&ev->lock);
1817        mutex_init(&ev->block_mutex);
1818        ev->block = 1;
1819        ev->poll_msecs = -1;
1820        INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
1821
1822        disk->ev = ev;
1823}
1824
1825static void disk_add_events(struct gendisk *disk)
1826{
1827        if (!disk->ev)
1828                return;
1829
1830        /* FIXME: error handling */
1831        if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
1832                pr_warn("%s: failed to create sysfs files for events\n",
1833                        disk->disk_name);
1834
1835        mutex_lock(&disk_events_mutex);
1836        list_add_tail(&disk->ev->node, &disk_events);
1837        mutex_unlock(&disk_events_mutex);
1838
1839        /*
1840         * Block count is initialized to 1 and the following initial
1841         * unblock kicks it into action.
1842         */
1843        __disk_unblock_events(disk, true);
1844}
1845
1846static void disk_del_events(struct gendisk *disk)
1847{
1848        if (!disk->ev)
1849                return;
1850
1851        disk_block_events(disk);
1852
1853        mutex_lock(&disk_events_mutex);
1854        list_del_init(&disk->ev->node);
1855        mutex_unlock(&disk_events_mutex);
1856
1857        sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
1858}
1859
1860static void disk_release_events(struct gendisk *disk)
1861{
1862        /* the block count should be 1 from disk_del_events() */
1863        WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
1864        kfree(disk->ev);
1865}
1866