linux/block/fops.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright (C) 2016 - 2020 Christoph Hellwig
 */
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/falloc.h>
#include <linux/suspend.h>
#include <linux/fs.h>
#include "blk.h"

static struct inode *bdev_file_inode(struct file *file)
{
        return file->f_mapping->host;
}

static int blkdev_get_block(struct inode *inode, sector_t iblock,
                struct buffer_head *bh, int create)
{
        bh->b_bdev = I_BDEV(inode);
        bh->b_blocknr = iblock;
        set_buffer_mapped(bh);
        return 0;
}

static unsigned int dio_bio_write_op(struct kiocb *iocb)
{
        unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;

        /* avoid the need for an I/O completion work item */
        if (iocb->ki_flags & IOCB_DSYNC)
                op |= REQ_FUA;
        return op;
}

#define DIO_INLINE_BIO_VECS 4

static void blkdev_bio_end_io_simple(struct bio *bio)
{
        struct task_struct *waiter = bio->bi_private;

        WRITE_ONCE(bio->bi_private, NULL);
        blk_wake_io_task(waiter);
}

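/*
 * Fast path for small synchronous direct I/O: the whole request fits in a
 * single bio built on the stack, so we can submit it and wait for completion
 * inline without allocating a blkdev_dio.
 */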
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
                struct iov_iter *iter, unsigned int nr_pages)
{
        struct file *file = iocb->ki_filp;
        struct block_device *bdev = I_BDEV(bdev_file_inode(file));
        struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
        loff_t pos = iocb->ki_pos;
        bool should_dirty = false;
        struct bio bio;
        ssize_t ret;
        blk_qc_t qc;

        if ((pos | iov_iter_alignment(iter)) &
            (bdev_logical_block_size(bdev) - 1))
                return -EINVAL;

        if (nr_pages <= DIO_INLINE_BIO_VECS)
                vecs = inline_vecs;
        else {
                vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
                                     GFP_KERNEL);
                if (!vecs)
                        return -ENOMEM;
        }

        bio_init(&bio, vecs, nr_pages);
        bio_set_dev(&bio, bdev);
        bio.bi_iter.bi_sector = pos >> 9;
        bio.bi_write_hint = iocb->ki_hint;
        bio.bi_private = current;
        bio.bi_end_io = blkdev_bio_end_io_simple;
        bio.bi_ioprio = iocb->ki_ioprio;

        ret = bio_iov_iter_get_pages(&bio, iter);
        if (unlikely(ret))
                goto out;
        ret = bio.bi_iter.bi_size;

        if (iov_iter_rw(iter) == READ) {
                bio.bi_opf = REQ_OP_READ;
                if (iter_is_iovec(iter))
                        should_dirty = true;
        } else {
                bio.bi_opf = dio_bio_write_op(iocb);
                task_io_account_write(ret);
        }
        if (iocb->ki_flags & IOCB_NOWAIT)
                bio.bi_opf |= REQ_NOWAIT;
        if (iocb->ki_flags & IOCB_HIPRI)
                bio_set_polled(&bio, iocb);

        qc = submit_bio(&bio);
        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(bio.bi_private))
                        break;
                if (!(iocb->ki_flags & IOCB_HIPRI) ||
                    !blk_poll(bdev_get_queue(bdev), qc, true))
                        blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);

        bio_release_pages(&bio, should_dirty);
        if (unlikely(bio.bi_status))
                ret = blk_status_to_errno(bio.bi_status);

out:
        if (vecs != inline_vecs)
                kfree(vecs);

        bio_uninit(&bio);

        return ret;
}

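/*
 * Per-request state for the multi-bio / asynchronous direct I/O path.
 * It is embedded in front of the first bio allocated from blkdev_dio_pool,
 * so it lives exactly as long as that bio does.
 */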
struct blkdev_dio {
        union {
                struct kiocb            *iocb;
                struct task_struct      *waiter;
        };
        size_t                  size;
        atomic_t                ref;
        bool                    multi_bio : 1;
        bool                    should_dirty : 1;
        bool                    is_sync : 1;
        struct bio              bio;
};

static struct bio_set blkdev_dio_pool;

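/*
 * ->iopoll handler: poll the request queue for completion of the cookie
 * stored in the kiocb by the polled submission path.
 */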
static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
{
        struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
        struct request_queue *q = bdev_get_queue(bdev);

        return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
}

static void blkdev_bio_end_io(struct bio *bio)
{
        struct blkdev_dio *dio = bio->bi_private;
        bool should_dirty = dio->should_dirty;

        if (bio->bi_status && !dio->bio.bi_status)
                dio->bio.bi_status = bio->bi_status;

        if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
                if (!dio->is_sync) {
                        struct kiocb *iocb = dio->iocb;
                        ssize_t ret;

                        if (likely(!dio->bio.bi_status)) {
                                ret = dio->size;
                                iocb->ki_pos += ret;
                        } else {
                                ret = blk_status_to_errno(dio->bio.bi_status);
                        }

                        dio->iocb->ki_complete(iocb, ret, 0);
                        if (dio->multi_bio)
                                bio_put(&dio->bio);
                } else {
                        struct task_struct *waiter = dio->waiter;

                        WRITE_ONCE(dio->waiter, NULL);
                        blk_wake_io_task(waiter);
                }
        }

        if (should_dirty) {
                bio_check_pages_dirty(bio);
        } else {
                bio_release_pages(bio, false);
                bio_put(bio);
        }
}

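/*
 * General direct I/O path: the request may be split across multiple bios
 * and may complete asynchronously via ->ki_complete, or be waited for
 * inline when the kiocb is synchronous.
 */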
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                unsigned int nr_pages)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = bdev_file_inode(file);
        struct block_device *bdev = I_BDEV(inode);
        struct blk_plug plug;
        struct blkdev_dio *dio;
        struct bio *bio;
        bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
        bool is_read = (iov_iter_rw(iter) == READ), is_sync;
        loff_t pos = iocb->ki_pos;
        blk_qc_t qc = BLK_QC_T_NONE;
        int ret = 0;

        if ((pos | iov_iter_alignment(iter)) &
            (bdev_logical_block_size(bdev) - 1))
                return -EINVAL;

        bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);

        dio = container_of(bio, struct blkdev_dio, bio);
        dio->is_sync = is_sync = is_sync_kiocb(iocb);
        if (dio->is_sync) {
                dio->waiter = current;
                bio_get(bio);
        } else {
                dio->iocb = iocb;
        }

        dio->size = 0;
        dio->multi_bio = false;
        dio->should_dirty = is_read && iter_is_iovec(iter);

        /*
         * Don't plug for HIPRI/polled IO, as those should go straight
         * to issue
         */
        if (!is_poll)
                blk_start_plug(&plug);

        for (;;) {
                bio_set_dev(bio, bdev);
                bio->bi_iter.bi_sector = pos >> 9;
                bio->bi_write_hint = iocb->ki_hint;
                bio->bi_private = dio;
                bio->bi_end_io = blkdev_bio_end_io;
                bio->bi_ioprio = iocb->ki_ioprio;

                ret = bio_iov_iter_get_pages(bio, iter);
                if (unlikely(ret)) {
                        bio->bi_status = BLK_STS_IOERR;
                        bio_endio(bio);
                        break;
                }

                if (is_read) {
                        bio->bi_opf = REQ_OP_READ;
                        if (dio->should_dirty)
                                bio_set_pages_dirty(bio);
                } else {
                        bio->bi_opf = dio_bio_write_op(iocb);
                        task_io_account_write(bio->bi_iter.bi_size);
                }
                if (iocb->ki_flags & IOCB_NOWAIT)
                        bio->bi_opf |= REQ_NOWAIT;

                dio->size += bio->bi_iter.bi_size;
                pos += bio->bi_iter.bi_size;

                nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
                if (!nr_pages) {
                        bool polled = false;

                        if (iocb->ki_flags & IOCB_HIPRI) {
                                bio_set_polled(bio, iocb);
                                polled = true;
                        }

                        qc = submit_bio(bio);

                        if (polled)
                                WRITE_ONCE(iocb->ki_cookie, qc);
                        break;
                }

                if (!dio->multi_bio) {
                        /*
                         * AIO needs an extra reference to ensure the dio
                         * structure which is embedded into the first bio
                         * stays around.
                         */
                        if (!is_sync)
                                bio_get(bio);
                        dio->multi_bio = true;
                        atomic_set(&dio->ref, 2);
                } else {
                        atomic_inc(&dio->ref);
                }

                submit_bio(bio);
                bio = bio_alloc(GFP_KERNEL, nr_pages);
        }

        if (!is_poll)
                blk_finish_plug(&plug);

        if (!is_sync)
                return -EIOCBQUEUED;

        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(dio->waiter))
                        break;

                if (!(iocb->ki_flags & IOCB_HIPRI) ||
                    !blk_poll(bdev_get_queue(bdev), qc, true))
                        blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);

        if (!ret)
                ret = blk_status_to_errno(dio->bio.bi_status);
        if (likely(!ret))
                ret = dio->size;

        bio_put(&dio->bio);
        return ret;
}

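/*
 * ->direct_IO entry point: use the single-bio fast path for small
 * synchronous requests, and fall back to the general path otherwise.
 */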
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
        unsigned int nr_pages;

        if (!iov_iter_count(iter))
                return 0;

        nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
        if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
                return __blkdev_direct_IO_simple(iocb, iter, nr_pages);

        return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
}

static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, blkdev_get_block);
}

static void blkdev_readahead(struct readahead_control *rac)
{
        mpage_readahead(rac, blkdev_get_block);
}

static int blkdev_write_begin(struct file *file, struct address_space *mapping,
                loff_t pos, unsigned len, unsigned flags, struct page **pagep,
                void **fsdata)
{
        return block_write_begin(mapping, pos, len, flags, pagep,
                                 blkdev_get_block);
}

static int blkdev_write_end(struct file *file, struct address_space *mapping,
                loff_t pos, unsigned len, unsigned copied, struct page *page,
                void *fsdata)
{
        int ret;
        ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);

        unlock_page(page);
        put_page(page);

        return ret;
}

static int blkdev_writepages(struct address_space *mapping,
                             struct writeback_control *wbc)
{
        return generic_writepages(mapping, wbc);
}

const struct address_space_operations def_blk_aops = {
        .set_page_dirty = __set_page_dirty_buffers,
        .readpage       = blkdev_readpage,
        .readahead      = blkdev_readahead,
        .writepage      = blkdev_writepage,
        .write_begin    = blkdev_write_begin,
        .write_end      = blkdev_write_end,
        .writepages     = blkdev_writepages,
        .direct_IO      = blkdev_direct_IO,
        .migratepage    = buffer_migrate_page_norefs,
        .is_dirty_writeback = buffer_check_dirty_writeback,
};

/*
 * for a block special file file_inode(file)->i_size is zero
 * so we compute the size by hand (just as in block_read/write above)
 */
static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *bd_inode = bdev_file_inode(file);
        loff_t retval;

        inode_lock(bd_inode);
        retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
        inode_unlock(bd_inode);
        return retval;
}

static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
                int datasync)
{
        struct inode *bd_inode = bdev_file_inode(filp);
        struct block_device *bdev = I_BDEV(bd_inode);
        int error;

        error = file_write_and_wait_range(filp, start, end);
        if (error)
                return error;

        /*
         * There is no need to serialise calls to blkdev_issue_flush with
         * i_mutex and doing so causes performance issues with concurrent
         * O_SYNC writers to a block device.
         */
        error = blkdev_issue_flush(bdev);
        if (error == -EOPNOTSUPP)
                error = 0;

        return error;
}

static int blkdev_open(struct inode *inode, struct file *filp)
{
        struct block_device *bdev;

        /*
         * Preserve backwards compatibility and allow large file access
         * even if userspace doesn't ask for it explicitly. Some mkfs
         * binary needs it. We might want to drop this workaround
         * during an unstable branch.
         */
        filp->f_flags |= O_LARGEFILE;
        filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;

        if (filp->f_flags & O_NDELAY)
                filp->f_mode |= FMODE_NDELAY;
        if (filp->f_flags & O_EXCL)
                filp->f_mode |= FMODE_EXCL;
        if ((filp->f_flags & O_ACCMODE) == 3)
                filp->f_mode |= FMODE_WRITE_IOCTL;

        bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
        filp->f_mapping = bdev->bd_inode->i_mapping;
        filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
        return 0;
}

static int blkdev_close(struct inode *inode, struct file *filp)
{
        struct block_device *bdev = I_BDEV(bdev_file_inode(filp));

        blkdev_put(bdev, filp->f_mode);
        return 0;
}

static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
        struct block_device *bdev = I_BDEV(bdev_file_inode(file));
        fmode_t mode = file->f_mode;

        /*
         * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
         * to update it before every ioctl.
         */
        if (file->f_flags & O_NDELAY)
                mode |= FMODE_NDELAY;
        else
                mode &= ~FMODE_NDELAY;

        return blkdev_ioctl(bdev, mode, cmd, arg);
}

/*
 * Write data to the block device.  Only intended for the block device itself
 * and the raw driver which basically is a fake block device.
 *
 * Does not take i_mutex for the write and thus is not for general purpose
 * use.
 */
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct inode *bd_inode = bdev_file_inode(file);
        loff_t size = i_size_read(bd_inode);
        struct blk_plug plug;
        size_t shorted = 0;
        ssize_t ret;

        if (bdev_read_only(I_BDEV(bd_inode)))
                return -EPERM;

        if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
                return -ETXTBSY;

        if (!iov_iter_count(from))
                return 0;

        if (iocb->ki_pos >= size)
                return -ENOSPC;

        if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
                return -EOPNOTSUPP;

        size -= iocb->ki_pos;
        if (iov_iter_count(from) > size) {
                shorted = iov_iter_count(from) - size;
                iov_iter_truncate(from, size);
        }

        blk_start_plug(&plug);
        ret = __generic_file_write_iter(iocb, from);
        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
        iov_iter_reexpand(from, iov_iter_count(from) + shorted);
        blk_finish_plug(&plug);
        return ret;
}

static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct inode *bd_inode = bdev_file_inode(file);
        loff_t size = i_size_read(bd_inode);
        loff_t pos = iocb->ki_pos;
        size_t shorted = 0;
        ssize_t ret;

        if (pos >= size)
                return 0;

        size -= pos;
        if (iov_iter_count(to) > size) {
                shorted = iov_iter_count(to) - size;
                iov_iter_truncate(to, size);
        }

        ret = generic_file_read_iter(iocb, to);
        iov_iter_reexpand(to, iov_iter_count(to) + shorted);
        return ret;
}

#define BLKDEV_FALLOC_FL_SUPPORTED                                      \
                (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
                 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)

static long blkdev_fallocate(struct file *file, int mode, loff_t start,
                             loff_t len)
{
        struct inode *inode = bdev_file_inode(file);
        struct block_device *bdev = I_BDEV(inode);
        loff_t end = start + len - 1;
        loff_t isize;
        int error;

        /* Fail if we don't recognize the flags. */
        if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
                return -EOPNOTSUPP;

        /* Don't go off the end of the device. */
        isize = i_size_read(bdev->bd_inode);
        if (start >= isize)
                return -EINVAL;
        if (end >= isize) {
                if (mode & FALLOC_FL_KEEP_SIZE) {
                        len = isize - start;
                        end = start + len - 1;
                } else
                        return -EINVAL;
        }

        /*
         * Don't allow IO that isn't aligned to logical block size.
         */
        if ((start | len) & (bdev_logical_block_size(bdev) - 1))
                return -EINVAL;

        filemap_invalidate_lock(inode->i_mapping);

        /* Invalidate the page cache, including dirty pages. */
        error = truncate_bdev_range(bdev, file->f_mode, start, end);
        if (error)
                goto fail;

        switch (mode) {
        case FALLOC_FL_ZERO_RANGE:
        case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
                error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
                                            GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
                error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
                                             GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
                error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
                                             GFP_KERNEL, 0);
                break;
        default:
                error = -EOPNOTSUPP;
        }

 fail:
        filemap_invalidate_unlock(inode->i_mapping);
        return error;
}

const struct file_operations def_blk_fops = {
        .open           = blkdev_open,
        .release        = blkdev_close,
        .llseek         = blkdev_llseek,
        .read_iter      = blkdev_read_iter,
        .write_iter     = blkdev_write_iter,
        .iopoll         = blkdev_iopoll,
        .mmap           = generic_file_mmap,
        .fsync          = blkdev_fsync,
        .unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = compat_blkdev_ioctl,
#endif
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .fallocate      = blkdev_fallocate,
};

static __init int blkdev_init(void)
{
        return bioset_init(&blkdev_dio_pool, 4,
                                offsetof(struct blkdev_dio, bio),
                                BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE);
}
module_init(blkdev_init);