linux/fs/read_write.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/read_write.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/slab.h>
   9#include <linux/stat.h>
  10#include <linux/sched/xacct.h>
  11#include <linux/fcntl.h>
  12#include <linux/file.h>
  13#include <linux/uio.h>
  14#include <linux/fsnotify.h>
  15#include <linux/security.h>
  16#include <linux/export.h>
  17#include <linux/syscalls.h>
  18#include <linux/pagemap.h>
  19#include <linux/splice.h>
  20#include <linux/compat.h>
  21#include <linux/mount.h>
  22#include <linux/fs.h>
  23#include "internal.h"
  24
  25#include <linux/uaccess.h>
  26#include <asm/unistd.h>
  27
  28const struct file_operations generic_ro_fops = {
  29        .llseek         = generic_file_llseek,
  30        .read_iter      = generic_file_read_iter,
  31        .mmap           = generic_file_readonly_mmap,
  32        .splice_read    = generic_file_splice_read,
  33};
  34
  35EXPORT_SYMBOL(generic_ro_fops);
  36
  37static inline bool unsigned_offsets(struct file *file)
  38{
  39        return file->f_mode & FMODE_UNSIGNED_OFFSET;
  40}
  41
  42/**
  43 * vfs_setpos - update the file offset for lseek
  44 * @file:       file structure in question
  45 * @offset:     file offset to seek to
  46 * @maxsize:    maximum file size
  47 *
  48 * This is a low-level filesystem helper for updating the file offset to
  49 * the value specified by @offset if the given offset is valid and it is
  50 * not equal to the current file offset.
  51 *
  52 * Return the specified offset on success and -EINVAL on invalid offset.
  53 */
  54loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  55{
  56        if (offset < 0 && !unsigned_offsets(file))
  57                return -EINVAL;
  58        if (offset > maxsize)
  59                return -EINVAL;
  60
  61        if (offset != file->f_pos) {
  62                file->f_pos = offset;
  63                file->f_version = 0;
  64        }
  65        return offset;
  66}
  67EXPORT_SYMBOL(vfs_setpos);
  68
  69/**
  70 * generic_file_llseek_size - generic llseek implementation for regular files
  71 * @file:       file structure to seek on
  72 * @offset:     file offset to seek to
  73 * @whence:     type of seek
  74 * @size:       max size of this file in file system
  75 * @eof:        offset used for SEEK_END position
  76 *
  77 * This is a variant of generic_file_llseek that allows passing in a custom
  78 * maximum file size and a custom EOF position, for e.g. hashed directories
  79 *
  80 * Synchronization:
  81 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  82 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  83 * read/writes behave like SEEK_SET against seeks.
  84 */
  85loff_t
  86generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  87                loff_t maxsize, loff_t eof)
  88{
  89        switch (whence) {
  90        case SEEK_END:
  91                offset += eof;
  92                break;
  93        case SEEK_CUR:
  94                /*
  95                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  96                 * position-querying operation.  Avoid rewriting the "same"
  97                 * f_pos value back to the file because a concurrent read(),
  98                 * write() or lseek() might have altered it
  99                 */
 100                if (offset == 0)
 101                        return file->f_pos;
 102                /*
 103                 * f_lock protects against read/modify/write race with other
 104                 * SEEK_CURs. Note that parallel writes and reads behave
 105                 * like SEEK_SET.
 106                 */
 107                spin_lock(&file->f_lock);
 108                offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 109                spin_unlock(&file->f_lock);
 110                return offset;
 111        case SEEK_DATA:
 112                /*
 113                 * In the generic case the entire file is data, so as long as
 114                 * offset isn't at the end of the file then the offset is data.
 115                 */
 116                if ((unsigned long long)offset >= eof)
 117                        return -ENXIO;
 118                break;
 119        case SEEK_HOLE:
 120                /*
 121                 * There is a virtual hole at the end of the file, so as long as
 122                 * offset isn't i_size or larger, return i_size.
 123                 */
 124                if ((unsigned long long)offset >= eof)
 125                        return -ENXIO;
 126                offset = eof;
 127                break;
 128        }
 129
 130        return vfs_setpos(file, offset, maxsize);
 131}
 132EXPORT_SYMBOL(generic_file_llseek_size);
 133
 134/**
 135 * generic_file_llseek - generic llseek implementation for regular files
 136 * @file:       file structure to seek on
 137 * @offset:     file offset to seek to
 138 * @whence:     type of seek
 139 *
 140 * This is a generic implemenation of ->llseek useable for all normal local
 141 * filesystems.  It just updates the file offset to the value specified by
 142 * @offset and @whence.
 143 */
 144loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 145{
 146        struct inode *inode = file->f_mapping->host;
 147
 148        return generic_file_llseek_size(file, offset, whence,
 149                                        inode->i_sb->s_maxbytes,
 150                                        i_size_read(inode));
 151}
 152EXPORT_SYMBOL(generic_file_llseek);
 153
 154/**
 155 * fixed_size_llseek - llseek implementation for fixed-sized devices
 156 * @file:       file structure to seek on
 157 * @offset:     file offset to seek to
 158 * @whence:     type of seek
 159 * @size:       size of the file
 160 *
 161 */
 162loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 163{
 164        switch (whence) {
 165        case SEEK_SET: case SEEK_CUR: case SEEK_END:
 166                return generic_file_llseek_size(file, offset, whence,
 167                                                size, size);
 168        default:
 169                return -EINVAL;
 170        }
 171}
 172EXPORT_SYMBOL(fixed_size_llseek);
 173
 174/**
 175 * no_seek_end_llseek - llseek implementation for fixed-sized devices
 176 * @file:       file structure to seek on
 177 * @offset:     file offset to seek to
 178 * @whence:     type of seek
 179 *
 180 */
 181loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
 182{
 183        switch (whence) {
 184        case SEEK_SET: case SEEK_CUR:
 185                return generic_file_llseek_size(file, offset, whence,
 186                                                OFFSET_MAX, 0);
 187        default:
 188                return -EINVAL;
 189        }
 190}
 191EXPORT_SYMBOL(no_seek_end_llseek);
 192
 193/**
 194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
 195 * @file:       file structure to seek on
 196 * @offset:     file offset to seek to
 197 * @whence:     type of seek
 198 * @size:       maximal offset allowed
 199 *
 200 */
 201loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
 202{
 203        switch (whence) {
 204        case SEEK_SET: case SEEK_CUR:
 205                return generic_file_llseek_size(file, offset, whence,
 206                                                size, 0);
 207        default:
 208                return -EINVAL;
 209        }
 210}
 211EXPORT_SYMBOL(no_seek_end_llseek_size);
 212
 213/**
 214 * noop_llseek - No Operation Performed llseek implementation
 215 * @file:       file structure to seek on
 216 * @offset:     file offset to seek to
 217 * @whence:     type of seek
 218 *
 219 * This is an implementation of ->llseek useable for the rare special case when
 220 * userspace expects the seek to succeed but the (device) file is actually not
 221 * able to perform the seek. In this case you use noop_llseek() instead of
 222 * falling back to the default implementation of ->llseek.
 223 */
 224loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 225{
 226        return file->f_pos;
 227}
 228EXPORT_SYMBOL(noop_llseek);
 229
 230loff_t no_llseek(struct file *file, loff_t offset, int whence)
 231{
 232        return -ESPIPE;
 233}
 234EXPORT_SYMBOL(no_llseek);
 235
 236loff_t default_llseek(struct file *file, loff_t offset, int whence)
 237{
 238        struct inode *inode = file_inode(file);
 239        loff_t retval;
 240
 241        inode_lock(inode);
 242        switch (whence) {
 243                case SEEK_END:
 244                        offset += i_size_read(inode);
 245                        break;
 246                case SEEK_CUR:
 247                        if (offset == 0) {
 248                                retval = file->f_pos;
 249                                goto out;
 250                        }
 251                        offset += file->f_pos;
 252                        break;
 253                case SEEK_DATA:
 254                        /*
 255                         * In the generic case the entire file is data, so as
 256                         * long as offset isn't at the end of the file then the
 257                         * offset is data.
 258                         */
 259                        if (offset >= inode->i_size) {
 260                                retval = -ENXIO;
 261                                goto out;
 262                        }
 263                        break;
 264                case SEEK_HOLE:
 265                        /*
 266                         * There is a virtual hole at the end of the file, so
 267                         * as long as offset isn't i_size or larger, return
 268                         * i_size.
 269                         */
 270                        if (offset >= inode->i_size) {
 271                                retval = -ENXIO;
 272                                goto out;
 273                        }
 274                        offset = inode->i_size;
 275                        break;
 276        }
 277        retval = -EINVAL;
 278        if (offset >= 0 || unsigned_offsets(file)) {
 279                if (offset != file->f_pos) {
 280                        file->f_pos = offset;
 281                        file->f_version = 0;
 282                }
 283                retval = offset;
 284        }
 285out:
 286        inode_unlock(inode);
 287        return retval;
 288}
 289EXPORT_SYMBOL(default_llseek);
 290
 291loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 292{
 293        loff_t (*fn)(struct file *, loff_t, int);
 294
 295        fn = no_llseek;
 296        if (file->f_mode & FMODE_LSEEK) {
 297                if (file->f_op->llseek)
 298                        fn = file->f_op->llseek;
 299        }
 300        return fn(file, offset, whence);
 301}
 302EXPORT_SYMBOL(vfs_llseek);
 303
 304SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 305{
 306        off_t retval;
 307        struct fd f = fdget_pos(fd);
 308        if (!f.file)
 309                return -EBADF;
 310
 311        retval = -EINVAL;
 312        if (whence <= SEEK_MAX) {
 313                loff_t res = vfs_llseek(f.file, offset, whence);
 314                retval = res;
 315                if (res != (loff_t)retval)
 316                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
 317        }
 318        fdput_pos(f);
 319        return retval;
 320}
 321
 322#ifdef CONFIG_COMPAT
 323COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
 324{
 325        return sys_lseek(fd, offset, whence);
 326}
 327#endif
 328
 329#ifdef __ARCH_WANT_SYS_LLSEEK
 330SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 331                unsigned long, offset_low, loff_t __user *, result,
 332                unsigned int, whence)
 333{
 334        int retval;
 335        struct fd f = fdget_pos(fd);
 336        loff_t offset;
 337
 338        if (!f.file)
 339                return -EBADF;
 340
 341        retval = -EINVAL;
 342        if (whence > SEEK_MAX)
 343                goto out_putf;
 344
 345        offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 346                        whence);
 347
 348        retval = (int)offset;
 349        if (offset >= 0) {
 350                retval = -EFAULT;
 351                if (!copy_to_user(result, &offset, sizeof(offset)))
 352                        retval = 0;
 353        }
 354out_putf:
 355        fdput_pos(f);
 356        return retval;
 357}
 358#endif
 359
 360int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 361{
 362        struct inode *inode;
 363        loff_t pos;
 364        int retval = -EINVAL;
 365
 366        inode = file_inode(file);
 367        if (unlikely((ssize_t) count < 0))
 368                return retval;
 369        pos = *ppos;
 370        if (unlikely(pos < 0)) {
 371                if (!unsigned_offsets(file))
 372                        return retval;
 373                if (count >= -pos) /* both values are in 0..LLONG_MAX */
 374                        return -EOVERFLOW;
 375        } else if (unlikely((loff_t) (pos + count) < 0)) {
 376                if (!unsigned_offsets(file))
 377                        return retval;
 378        }
 379
 380        if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
 381                retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
 382                                read_write == READ ? F_RDLCK : F_WRLCK);
 383                if (retval < 0)
 384                        return retval;
 385        }
 386        return security_file_permission(file,
 387                                read_write == READ ? MAY_READ : MAY_WRITE);
 388}
 389
 390static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 391{
 392        struct iovec iov = { .iov_base = buf, .iov_len = len };
 393        struct kiocb kiocb;
 394        struct iov_iter iter;
 395        ssize_t ret;
 396
 397        init_sync_kiocb(&kiocb, filp);
 398        kiocb.ki_pos = *ppos;
 399        iov_iter_init(&iter, READ, &iov, 1, len);
 400
 401        ret = call_read_iter(filp, &kiocb, &iter);
 402        BUG_ON(ret == -EIOCBQUEUED);
 403        *ppos = kiocb.ki_pos;
 404        return ret;
 405}
 406
 407ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
 408                   loff_t *pos)
 409{
 410        if (file->f_op->read)
 411                return file->f_op->read(file, buf, count, pos);
 412        else if (file->f_op->read_iter)
 413                return new_sync_read(file, buf, count, pos);
 414        else
 415                return -EINVAL;
 416}
 417
 418ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 419{
 420        mm_segment_t old_fs;
 421        ssize_t result;
 422
 423        old_fs = get_fs();
 424        set_fs(get_ds());
 425        /* The cast to a user pointer is valid due to the set_fs() */
 426        result = vfs_read(file, (void __user *)buf, count, pos);
 427        set_fs(old_fs);
 428        return result;
 429}
 430EXPORT_SYMBOL(kernel_read);
 431
 432ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 433{
 434        ssize_t ret;
 435
 436        if (!(file->f_mode & FMODE_READ))
 437                return -EBADF;
 438        if (!(file->f_mode & FMODE_CAN_READ))
 439                return -EINVAL;
 440        if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 441                return -EFAULT;
 442
 443        ret = rw_verify_area(READ, file, pos, count);
 444        if (!ret) {
 445                if (count > MAX_RW_COUNT)
 446                        count =  MAX_RW_COUNT;
 447                ret = __vfs_read(file, buf, count, pos);
 448                if (ret > 0) {
 449                        fsnotify_access(file);
 450                        add_rchar(current, ret);
 451                }
 452                inc_syscr(current);
 453        }
 454
 455        return ret;
 456}
 457
 458static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 459{
 460        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 461        struct kiocb kiocb;
 462        struct iov_iter iter;
 463        ssize_t ret;
 464
 465        init_sync_kiocb(&kiocb, filp);
 466        kiocb.ki_pos = *ppos;
 467        iov_iter_init(&iter, WRITE, &iov, 1, len);
 468
 469        ret = call_write_iter(filp, &kiocb, &iter);
 470        BUG_ON(ret == -EIOCBQUEUED);
 471        if (ret > 0)
 472                *ppos = kiocb.ki_pos;
 473        return ret;
 474}
 475
 476ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
 477                    loff_t *pos)
 478{
 479        if (file->f_op->write)
 480                return file->f_op->write(file, p, count, pos);
 481        else if (file->f_op->write_iter)
 482                return new_sync_write(file, p, count, pos);
 483        else
 484                return -EINVAL;
 485}
 486
 487ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
 488{
 489        mm_segment_t old_fs;
 490        const char __user *p;
 491        ssize_t ret;
 492
 493        if (!(file->f_mode & FMODE_CAN_WRITE))
 494                return -EINVAL;
 495
 496        old_fs = get_fs();
 497        set_fs(get_ds());
 498        p = (__force const char __user *)buf;
 499        if (count > MAX_RW_COUNT)
 500                count =  MAX_RW_COUNT;
 501        ret = __vfs_write(file, p, count, pos);
 502        set_fs(old_fs);
 503        if (ret > 0) {
 504                fsnotify_modify(file);
 505                add_wchar(current, ret);
 506        }
 507        inc_syscw(current);
 508        return ret;
 509}
 510EXPORT_SYMBOL(__kernel_write);
 511
 512ssize_t kernel_write(struct file *file, const void *buf, size_t count,
 513                            loff_t *pos)
 514{
 515        mm_segment_t old_fs;
 516        ssize_t res;
 517
 518        old_fs = get_fs();
 519        set_fs(get_ds());
 520        /* The cast to a user pointer is valid due to the set_fs() */
 521        res = vfs_write(file, (__force const char __user *)buf, count, pos);
 522        set_fs(old_fs);
 523
 524        return res;
 525}
 526EXPORT_SYMBOL(kernel_write);
 527
 528ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 529{
 530        ssize_t ret;
 531
 532        if (!(file->f_mode & FMODE_WRITE))
 533                return -EBADF;
 534        if (!(file->f_mode & FMODE_CAN_WRITE))
 535                return -EINVAL;
 536        if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 537                return -EFAULT;
 538
 539        ret = rw_verify_area(WRITE, file, pos, count);
 540        if (!ret) {
 541                if (count > MAX_RW_COUNT)
 542                        count =  MAX_RW_COUNT;
 543                file_start_write(file);
 544                ret = __vfs_write(file, buf, count, pos);
 545                if (ret > 0) {
 546                        fsnotify_modify(file);
 547                        add_wchar(current, ret);
 548                }
 549                inc_syscw(current);
 550                file_end_write(file);
 551        }
 552
 553        return ret;
 554}
 555
 556static inline loff_t file_pos_read(struct file *file)
 557{
 558        return file->f_pos;
 559}
 560
 561static inline void file_pos_write(struct file *file, loff_t pos)
 562{
 563        file->f_pos = pos;
 564}
 565
 566SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 567{
 568        struct fd f = fdget_pos(fd);
 569        ssize_t ret = -EBADF;
 570
 571        if (f.file) {
 572                loff_t pos = file_pos_read(f.file);
 573                ret = vfs_read(f.file, buf, count, &pos);
 574                if (ret >= 0)
 575                        file_pos_write(f.file, pos);
 576                fdput_pos(f);
 577        }
 578        return ret;
 579}
 580
 581SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 582                size_t, count)
 583{
 584        struct fd f = fdget_pos(fd);
 585        ssize_t ret = -EBADF;
 586
 587        if (f.file) {
 588                loff_t pos = file_pos_read(f.file);
 589                ret = vfs_write(f.file, buf, count, &pos);
 590                if (ret >= 0)
 591                        file_pos_write(f.file, pos);
 592                fdput_pos(f);
 593        }
 594
 595        return ret;
 596}
 597
 598SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 599                        size_t, count, loff_t, pos)
 600{
 601        struct fd f;
 602        ssize_t ret = -EBADF;
 603
 604        if (pos < 0)
 605                return -EINVAL;
 606
 607        f = fdget(fd);
 608        if (f.file) {
 609                ret = -ESPIPE;
 610                if (f.file->f_mode & FMODE_PREAD)
 611                        ret = vfs_read(f.file, buf, count, &pos);
 612                fdput(f);
 613        }
 614
 615        return ret;
 616}
 617
 618SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 619                         size_t, count, loff_t, pos)
 620{
 621        struct fd f;
 622        ssize_t ret = -EBADF;
 623
 624        if (pos < 0)
 625                return -EINVAL;
 626
 627        f = fdget(fd);
 628        if (f.file) {
 629                ret = -ESPIPE;
 630                if (f.file->f_mode & FMODE_PWRITE)  
 631                        ret = vfs_write(f.file, buf, count, &pos);
 632                fdput(f);
 633        }
 634
 635        return ret;
 636}
 637
 638/*
 639 * Reduce an iovec's length in-place.  Return the resulting number of segments
 640 */
 641unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 642{
 643        unsigned long seg = 0;
 644        size_t len = 0;
 645
 646        while (seg < nr_segs) {
 647                seg++;
 648                if (len + iov->iov_len >= to) {
 649                        iov->iov_len = to - len;
 650                        break;
 651                }
 652                len += iov->iov_len;
 653                iov++;
 654        }
 655        return seg;
 656}
 657EXPORT_SYMBOL(iov_shorten);
 658
 659static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 660                loff_t *ppos, int type, rwf_t flags)
 661{
 662        struct kiocb kiocb;
 663        ssize_t ret;
 664
 665        init_sync_kiocb(&kiocb, filp);
 666        ret = kiocb_set_rw_flags(&kiocb, flags);
 667        if (ret)
 668                return ret;
 669        kiocb.ki_pos = *ppos;
 670
 671        if (type == READ)
 672                ret = call_read_iter(filp, &kiocb, iter);
 673        else
 674                ret = call_write_iter(filp, &kiocb, iter);
 675        BUG_ON(ret == -EIOCBQUEUED);
 676        *ppos = kiocb.ki_pos;
 677        return ret;
 678}
 679
 680/* Do it by hand, with file-ops */
 681static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
 682                loff_t *ppos, int type, rwf_t flags)
 683{
 684        ssize_t ret = 0;
 685
 686        if (flags & ~RWF_HIPRI)
 687                return -EOPNOTSUPP;
 688
 689        while (iov_iter_count(iter)) {
 690                struct iovec iovec = iov_iter_iovec(iter);
 691                ssize_t nr;
 692
 693                if (type == READ) {
 694                        nr = filp->f_op->read(filp, iovec.iov_base,
 695                                              iovec.iov_len, ppos);
 696                } else {
 697                        nr = filp->f_op->write(filp, iovec.iov_base,
 698                                               iovec.iov_len, ppos);
 699                }
 700
 701                if (nr < 0) {
 702                        if (!ret)
 703                                ret = nr;
 704                        break;
 705                }
 706                ret += nr;
 707                if (nr != iovec.iov_len)
 708                        break;
 709                iov_iter_advance(iter, nr);
 710        }
 711
 712        return ret;
 713}
 714
 715/* A write operation does a read from user space and vice versa */
 716#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 717
 718/**
 719 * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
 720 *     into the kernel and check that it is valid.
 721 *
 722 * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
 723 * @uvector: Pointer to the userspace array.
 724 * @nr_segs: Number of elements in userspace array.
 725 * @fast_segs: Number of elements in @fast_pointer.
 726 * @fast_pointer: Pointer to (usually small on-stack) kernel array.
 727 * @ret_pointer: (output parameter) Pointer to a variable that will point to
 728 *     either @fast_pointer, a newly allocated kernel array, or NULL,
 729 *     depending on which array was used.
 730 *
 731 * This function copies an array of &struct iovec of @nr_segs from
 732 * userspace into the kernel and checks that each element is valid (e.g.
 733 * it does not point to a kernel address or cause overflow by being too
 734 * large, etc.).
 735 *
 736 * As an optimization, the caller may provide a pointer to a small
 737 * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
 738 * (the size of this array, or 0 if unused, should be given in @fast_segs).
 739 *
 740 * @ret_pointer will always point to the array that was used, so the
 741 * caller must take care not to call kfree() on it e.g. in case the
 742 * @fast_pointer array was used and it was allocated on the stack.
 743 *
 744 * Return: The total number of bytes covered by the iovec array on success
 745 *   or a negative error code on error.
 746 */
 747ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 748                              unsigned long nr_segs, unsigned long fast_segs,
 749                              struct iovec *fast_pointer,
 750                              struct iovec **ret_pointer)
 751{
 752        unsigned long seg;
 753        ssize_t ret;
 754        struct iovec *iov = fast_pointer;
 755
 756        /*
 757         * SuS says "The readv() function *may* fail if the iovcnt argument
 758         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 759         * traditionally returned zero for zero segments, so...
 760         */
 761        if (nr_segs == 0) {
 762                ret = 0;
 763                goto out;
 764        }
 765
 766        /*
 767         * First get the "struct iovec" from user memory and
 768         * verify all the pointers
 769         */
 770        if (nr_segs > UIO_MAXIOV) {
 771                ret = -EINVAL;
 772                goto out;
 773        }
 774        if (nr_segs > fast_segs) {
 775                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 776                if (iov == NULL) {
 777                        ret = -ENOMEM;
 778                        goto out;
 779                }
 780        }
 781        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 782                ret = -EFAULT;
 783                goto out;
 784        }
 785
 786        /*
 787         * According to the Single Unix Specification we should return EINVAL
 788         * if an element length is < 0 when cast to ssize_t or if the
 789         * total length would overflow the ssize_t return value of the
 790         * system call.
 791         *
 792         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 793         * overflow case.
 794         */
 795        ret = 0;
 796        for (seg = 0; seg < nr_segs; seg++) {
 797                void __user *buf = iov[seg].iov_base;
 798                ssize_t len = (ssize_t)iov[seg].iov_len;
 799
 800                /* see if we we're about to use an invalid len or if
 801                 * it's about to overflow ssize_t */
 802                if (len < 0) {
 803                        ret = -EINVAL;
 804                        goto out;
 805                }
 806                if (type >= 0
 807                    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 808                        ret = -EFAULT;
 809                        goto out;
 810                }
 811                if (len > MAX_RW_COUNT - ret) {
 812                        len = MAX_RW_COUNT - ret;
 813                        iov[seg].iov_len = len;
 814                }
 815                ret += len;
 816        }
 817out:
 818        *ret_pointer = iov;
 819        return ret;
 820}
 821
 822#ifdef CONFIG_COMPAT
 823ssize_t compat_rw_copy_check_uvector(int type,
 824                const struct compat_iovec __user *uvector, unsigned long nr_segs,
 825                unsigned long fast_segs, struct iovec *fast_pointer,
 826                struct iovec **ret_pointer)
 827{
 828        compat_ssize_t tot_len;
 829        struct iovec *iov = *ret_pointer = fast_pointer;
 830        ssize_t ret = 0;
 831        int seg;
 832
 833        /*
 834         * SuS says "The readv() function *may* fail if the iovcnt argument
 835         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 836         * traditionally returned zero for zero segments, so...
 837         */
 838        if (nr_segs == 0)
 839                goto out;
 840
 841        ret = -EINVAL;
 842        if (nr_segs > UIO_MAXIOV)
 843                goto out;
 844        if (nr_segs > fast_segs) {
 845                ret = -ENOMEM;
 846                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 847                if (iov == NULL)
 848                        goto out;
 849        }
 850        *ret_pointer = iov;
 851
 852        ret = -EFAULT;
 853        if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
 854                goto out;
 855
 856        /*
 857         * Single unix specification:
 858         * We should -EINVAL if an element length is not >= 0 and fitting an
 859         * ssize_t.
 860         *
 861         * In Linux, the total length is limited to MAX_RW_COUNT, there is
 862         * no overflow possibility.
 863         */
 864        tot_len = 0;
 865        ret = -EINVAL;
 866        for (seg = 0; seg < nr_segs; seg++) {
 867                compat_uptr_t buf;
 868                compat_ssize_t len;
 869
 870                if (__get_user(len, &uvector->iov_len) ||
 871                   __get_user(buf, &uvector->iov_base)) {
 872                        ret = -EFAULT;
 873                        goto out;
 874                }
 875                if (len < 0)    /* size_t not fitting in compat_ssize_t .. */
 876                        goto out;
 877                if (type >= 0 &&
 878                    !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
 879                        ret = -EFAULT;
 880                        goto out;
 881                }
 882                if (len > MAX_RW_COUNT - tot_len)
 883                        len = MAX_RW_COUNT - tot_len;
 884                tot_len += len;
 885                iov->iov_base = compat_ptr(buf);
 886                iov->iov_len = (compat_size_t) len;
 887                uvector++;
 888                iov++;
 889        }
 890        ret = tot_len;
 891
 892out:
 893        return ret;
 894}
 895#endif
 896
 897static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
 898                loff_t *pos, rwf_t flags)
 899{
 900        size_t tot_len;
 901        ssize_t ret = 0;
 902
 903        if (!(file->f_mode & FMODE_READ))
 904                return -EBADF;
 905        if (!(file->f_mode & FMODE_CAN_READ))
 906                return -EINVAL;
 907
 908        tot_len = iov_iter_count(iter);
 909        if (!tot_len)
 910                goto out;
 911        ret = rw_verify_area(READ, file, pos, tot_len);
 912        if (ret < 0)
 913                return ret;
 914
 915        if (file->f_op->read_iter)
 916                ret = do_iter_readv_writev(file, iter, pos, READ, flags);
 917        else
 918                ret = do_loop_readv_writev(file, iter, pos, READ, flags);
 919out:
 920        if (ret >= 0)
 921                fsnotify_access(file);
 922        return ret;
 923}
 924
 925ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
 926                rwf_t flags)
 927{
 928        if (!file->f_op->read_iter)
 929                return -EINVAL;
 930        return do_iter_read(file, iter, ppos, flags);
 931}
 932EXPORT_SYMBOL(vfs_iter_read);
 933
 934static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
 935                loff_t *pos, rwf_t flags)
 936{
 937        size_t tot_len;
 938        ssize_t ret = 0;
 939
 940        if (!(file->f_mode & FMODE_WRITE))
 941                return -EBADF;
 942        if (!(file->f_mode & FMODE_CAN_WRITE))
 943                return -EINVAL;
 944
 945        tot_len = iov_iter_count(iter);
 946        if (!tot_len)
 947                return 0;
 948        ret = rw_verify_area(WRITE, file, pos, tot_len);
 949        if (ret < 0)
 950                return ret;
 951
 952        if (file->f_op->write_iter)
 953                ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
 954        else
 955                ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
 956        if (ret > 0)
 957                fsnotify_modify(file);
 958        return ret;
 959}
 960
 961ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
 962                rwf_t flags)
 963{
 964        if (!file->f_op->write_iter)
 965                return -EINVAL;
 966        return do_iter_write(file, iter, ppos, flags);
 967}
 968EXPORT_SYMBOL(vfs_iter_write);
 969
 970ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 971                  unsigned long vlen, loff_t *pos, rwf_t flags)
 972{
 973        struct iovec iovstack[UIO_FASTIOV];
 974        struct iovec *iov = iovstack;
 975        struct iov_iter iter;
 976        ssize_t ret;
 977
 978        ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
 979        if (ret >= 0) {
 980                ret = do_iter_read(file, &iter, pos, flags);
 981                kfree(iov);
 982        }
 983
 984        return ret;
 985}
 986
 987static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 988                   unsigned long vlen, loff_t *pos, rwf_t flags)
 989{
 990        struct iovec iovstack[UIO_FASTIOV];
 991        struct iovec *iov = iovstack;
 992        struct iov_iter iter;
 993        ssize_t ret;
 994
 995        ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
 996        if (ret >= 0) {
 997                file_start_write(file);
 998                ret = do_iter_write(file, &iter, pos, flags);
 999                file_end_write(file);
1000                kfree(iov);
1001        }
1002        return ret;
1003}
1004
1005static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
1006                        unsigned long vlen, rwf_t flags)
1007{
1008        struct fd f = fdget_pos(fd);
1009        ssize_t ret = -EBADF;
1010
1011        if (f.file) {
1012                loff_t pos = file_pos_read(f.file);
1013                ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1014                if (ret >= 0)
1015                        file_pos_write(f.file, pos);
1016                fdput_pos(f);
1017        }
1018
1019        if (ret > 0)
1020                add_rchar(current, ret);
1021        inc_syscr(current);
1022        return ret;
1023}
1024
1025static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
1026                         unsigned long vlen, rwf_t flags)
1027{
1028        struct fd f = fdget_pos(fd);
1029        ssize_t ret = -EBADF;
1030
1031        if (f.file) {
1032                loff_t pos = file_pos_read(f.file);
1033                ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1034                if (ret >= 0)
1035                        file_pos_write(f.file, pos);
1036                fdput_pos(f);
1037        }
1038
1039        if (ret > 0)
1040                add_wchar(current, ret);
1041        inc_syscw(current);
1042        return ret;
1043}
1044
1045static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
1046{
1047#define HALF_LONG_BITS (BITS_PER_LONG / 2)
1048        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
1049}
1050
1051static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1052                         unsigned long vlen, loff_t pos, rwf_t flags)
1053{
1054        struct fd f;
1055        ssize_t ret = -EBADF;
1056
1057        if (pos < 0)
1058                return -EINVAL;
1059
1060        f = fdget(fd);
1061        if (f.file) {
1062                ret = -ESPIPE;
1063                if (f.file->f_mode & FMODE_PREAD)
1064                        ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1065                fdput(f);
1066        }
1067
1068        if (ret > 0)
1069                add_rchar(current, ret);
1070        inc_syscr(current);
1071        return ret;
1072}
1073
1074static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1075                          unsigned long vlen, loff_t pos, rwf_t flags)
1076{
1077        struct fd f;
1078        ssize_t ret = -EBADF;
1079
1080        if (pos < 0)
1081                return -EINVAL;
1082
1083        f = fdget(fd);
1084        if (f.file) {
1085                ret = -ESPIPE;
1086                if (f.file->f_mode & FMODE_PWRITE)
1087                        ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1088                fdput(f);
1089        }
1090
1091        if (ret > 0)
1092                add_wchar(current, ret);
1093        inc_syscw(current);
1094        return ret;
1095}
1096
1097SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1098                unsigned long, vlen)
1099{
1100        return do_readv(fd, vec, vlen, 0);
1101}
1102
1103SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1104                unsigned long, vlen)
1105{
1106        return do_writev(fd, vec, vlen, 0);
1107}
1108
1109SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1110                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1111{
1112        loff_t pos = pos_from_hilo(pos_h, pos_l);
1113
1114        return do_preadv(fd, vec, vlen, pos, 0);
1115}
1116
1117SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1118                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1119                rwf_t, flags)
1120{
1121        loff_t pos = pos_from_hilo(pos_h, pos_l);
1122
1123        if (pos == -1)
1124                return do_readv(fd, vec, vlen, flags);
1125
1126        return do_preadv(fd, vec, vlen, pos, flags);
1127}
1128
1129SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1130                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1131{
1132        loff_t pos = pos_from_hilo(pos_h, pos_l);
1133
1134        return do_pwritev(fd, vec, vlen, pos, 0);
1135}
1136
1137SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1138                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1139                rwf_t, flags)
1140{
1141        loff_t pos = pos_from_hilo(pos_h, pos_l);
1142
1143        if (pos == -1)
1144                return do_writev(fd, vec, vlen, flags);
1145
1146        return do_pwritev(fd, vec, vlen, pos, flags);
1147}
1148
1149#ifdef CONFIG_COMPAT
1150static size_t compat_readv(struct file *file,
1151                           const struct compat_iovec __user *vec,
1152                           unsigned long vlen, loff_t *pos, rwf_t flags)
1153{
1154        struct iovec iovstack[UIO_FASTIOV];
1155        struct iovec *iov = iovstack;
1156        struct iov_iter iter;
1157        ssize_t ret;
1158
1159        ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
1160        if (ret >= 0) {
1161                ret = do_iter_read(file, &iter, pos, flags);
1162                kfree(iov);
1163        }
1164        if (ret > 0)
1165                add_rchar(current, ret);
1166        inc_syscr(current);
1167        return ret;
1168}
1169
1170static size_t do_compat_readv(compat_ulong_t fd,
1171                                 const struct compat_iovec __user *vec,
1172                                 compat_ulong_t vlen, rwf_t flags)
1173{
1174        struct fd f = fdget_pos(fd);
1175        ssize_t ret;
1176        loff_t pos;
1177
1178        if (!f.file)
1179                return -EBADF;
1180        pos = f.file->f_pos;
1181        ret = compat_readv(f.file, vec, vlen, &pos, flags);
1182        if (ret >= 0)
1183                f.file->f_pos = pos;
1184        fdput_pos(f);
1185        return ret;
1186
1187}
1188
1189COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1190                const struct compat_iovec __user *,vec,
1191                compat_ulong_t, vlen)
1192{
1193        return do_compat_readv(fd, vec, vlen, 0);
1194}
1195
1196static long do_compat_preadv64(unsigned long fd,
1197                                  const struct compat_iovec __user *vec,
1198                                  unsigned long vlen, loff_t pos, rwf_t flags)
1199{
1200        struct fd f;
1201        ssize_t ret;
1202
1203        if (pos < 0)
1204                return -EINVAL;
1205        f = fdget(fd);
1206        if (!f.file)
1207                return -EBADF;
1208        ret = -ESPIPE;
1209        if (f.file->f_mode & FMODE_PREAD)
1210                ret = compat_readv(f.file, vec, vlen, &pos, flags);
1211        fdput(f);
1212        return ret;
1213}
1214
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/* Compat preadv64: offset arrives as a single loff_t; no RWF flags. */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	const rwf_t no_flags = 0;

	return do_compat_preadv64(fd, vec, vlen, pos, no_flags);
}
#endif
1223
1224COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1225                const struct compat_iovec __user *,vec,
1226                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1227{
1228        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1229
1230        return do_compat_preadv64(fd, vec, vlen, pos, 0);
1231}
1232
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat preadv64v2: offset arrives as a single loff_t and is always
 * treated as an explicit offset; @flags are passed through.
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	long ret = do_compat_preadv64(fd, vec, vlen, pos, flags);

	return ret;
}
#endif
1241
1242COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1243                const struct compat_iovec __user *,vec,
1244                compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1245                rwf_t, flags)
1246{
1247        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1248
1249        if (pos == -1)
1250                return do_compat_readv(fd, vec, vlen, flags);
1251
1252        return do_compat_preadv64(fd, vec, vlen, pos, flags);
1253}
1254
1255static size_t compat_writev(struct file *file,
1256                            const struct compat_iovec __user *vec,
1257                            unsigned long vlen, loff_t *pos, rwf_t flags)
1258{
1259        struct iovec iovstack[UIO_FASTIOV];
1260        struct iovec *iov = iovstack;
1261        struct iov_iter iter;
1262        ssize_t ret;
1263
1264        ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
1265        if (ret >= 0) {
1266                file_start_write(file);
1267                ret = do_iter_write(file, &iter, pos, flags);
1268                file_end_write(file);
1269                kfree(iov);
1270        }
1271        if (ret > 0)
1272                add_wchar(current, ret);
1273        inc_syscw(current);
1274        return ret;
1275}
1276
1277static size_t do_compat_writev(compat_ulong_t fd,
1278                                  const struct compat_iovec __user* vec,
1279                                  compat_ulong_t vlen, rwf_t flags)
1280{
1281        struct fd f = fdget_pos(fd);
1282        ssize_t ret;
1283        loff_t pos;
1284
1285        if (!f.file)
1286                return -EBADF;
1287        pos = f.file->f_pos;
1288        ret = compat_writev(f.file, vec, vlen, &pos, flags);
1289        if (ret >= 0)
1290                f.file->f_pos = pos;
1291        fdput_pos(f);
1292        return ret;
1293}
1294
1295COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1296                const struct compat_iovec __user *, vec,
1297                compat_ulong_t, vlen)
1298{
1299        return do_compat_writev(fd, vec, vlen, 0);
1300}
1301
1302static long do_compat_pwritev64(unsigned long fd,
1303                                   const struct compat_iovec __user *vec,
1304                                   unsigned long vlen, loff_t pos, rwf_t flags)
1305{
1306        struct fd f;
1307        ssize_t ret;
1308
1309        if (pos < 0)
1310                return -EINVAL;
1311        f = fdget(fd);
1312        if (!f.file)
1313                return -EBADF;
1314        ret = -ESPIPE;
1315        if (f.file->f_mode & FMODE_PWRITE)
1316                ret = compat_writev(f.file, vec, vlen, &pos, flags);
1317        fdput(f);
1318        return ret;
1319}
1320
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/* Compat pwritev64: offset arrives as a single loff_t; no RWF flags. */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	const rwf_t no_flags = 0;

	return do_compat_pwritev64(fd, vec, vlen, pos, no_flags);
}
#endif
1329
1330COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1331                const struct compat_iovec __user *,vec,
1332                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1333{
1334        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1335
1336        return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1337}
1338
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat pwritev64v2: offset arrives as a single loff_t and is always
 * treated as an explicit offset; @flags are passed through.
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	long ret = do_compat_pwritev64(fd, vec, vlen, pos, flags);

	return ret;
}
#endif
1347
1348COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1349                const struct compat_iovec __user *,vec,
1350                compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
1351{
1352        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1353
1354        if (pos == -1)
1355                return do_compat_writev(fd, vec, vlen, flags);
1356
1357        return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1358}
1359
1360#endif
1361
1362static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1363                           size_t count, loff_t max)
1364{
1365        struct fd in, out;
1366        struct inode *in_inode, *out_inode;
1367        loff_t pos;
1368        loff_t out_pos;
1369        ssize_t retval;
1370        int fl;
1371
1372        /*
1373         * Get input file, and verify that it is ok..
1374         */
1375        retval = -EBADF;
1376        in = fdget(in_fd);
1377        if (!in.file)
1378                goto out;
1379        if (!(in.file->f_mode & FMODE_READ))
1380                goto fput_in;
1381        retval = -ESPIPE;
1382        if (!ppos) {
1383                pos = in.file->f_pos;
1384        } else {
1385                pos = *ppos;
1386                if (!(in.file->f_mode & FMODE_PREAD))
1387                        goto fput_in;
1388        }
1389        retval = rw_verify_area(READ, in.file, &pos, count);
1390        if (retval < 0)
1391                goto fput_in;
1392        if (count > MAX_RW_COUNT)
1393                count =  MAX_RW_COUNT;
1394
1395        /*
1396         * Get output file, and verify that it is ok..
1397         */
1398        retval = -EBADF;
1399        out = fdget(out_fd);
1400        if (!out.file)
1401                goto fput_in;
1402        if (!(out.file->f_mode & FMODE_WRITE))
1403                goto fput_out;
1404        retval = -EINVAL;
1405        in_inode = file_inode(in.file);
1406        out_inode = file_inode(out.file);
1407        out_pos = out.file->f_pos;
1408        retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1409        if (retval < 0)
1410                goto fput_out;
1411
1412        if (!max)
1413                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1414
1415        if (unlikely(pos + count > max)) {
1416                retval = -EOVERFLOW;
1417                if (pos >= max)
1418                        goto fput_out;
1419                count = max - pos;
1420        }
1421
1422        fl = 0;
1423#if 0
1424        /*
1425         * We need to debate whether we can enable this or not. The
1426         * man page documents EAGAIN return for the output at least,
1427         * and the application is arguably buggy if it doesn't expect
1428         * EAGAIN on a non-blocking file descriptor.
1429         */
1430        if (in.file->f_flags & O_NONBLOCK)
1431                fl = SPLICE_F_NONBLOCK;
1432#endif
1433        file_start_write(out.file);
1434        retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1435        file_end_write(out.file);
1436
1437        if (retval > 0) {
1438                add_rchar(current, retval);
1439                add_wchar(current, retval);
1440                fsnotify_access(in.file);
1441                fsnotify_modify(out.file);
1442                out.file->f_pos = out_pos;
1443                if (ppos)
1444                        *ppos = pos;
1445                else
1446                        in.file->f_pos = pos;
1447        }
1448
1449        inc_syscr(current);
1450        inc_syscw(current);
1451        if (pos > max)
1452                retval = -EOVERFLOW;
1453
1454fput_out:
1455        fdput(out);
1456fput_in:
1457        fdput(in);
1458out:
1459        return retval;
1460}
1461
1462SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1463{
1464        loff_t pos;
1465        off_t off;
1466        ssize_t ret;
1467
1468        if (offset) {
1469                if (unlikely(get_user(off, offset)))
1470                        return -EFAULT;
1471                pos = off;
1472                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1473                if (unlikely(put_user(pos, offset)))
1474                        return -EFAULT;
1475                return ret;
1476        }
1477
1478        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1479}
1480
1481SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1482{
1483        loff_t pos;
1484        ssize_t ret;
1485
1486        if (offset) {
1487                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1488                        return -EFAULT;
1489                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1490                if (unlikely(put_user(pos, offset)))
1491                        return -EFAULT;
1492                return ret;
1493        }
1494
1495        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1496}
1497
1498#ifdef CONFIG_COMPAT
1499COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1500                compat_off_t __user *, offset, compat_size_t, count)
1501{
1502        loff_t pos;
1503        off_t off;
1504        ssize_t ret;
1505
1506        if (offset) {
1507                if (unlikely(get_user(off, offset)))
1508                        return -EFAULT;
1509                pos = off;
1510                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1511                if (unlikely(put_user(pos, offset)))
1512                        return -EFAULT;
1513                return ret;
1514        }
1515
1516        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1517}
1518
1519COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1520                compat_loff_t __user *, offset, compat_size_t, count)
1521{
1522        loff_t pos;
1523        ssize_t ret;
1524
1525        if (offset) {
1526                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1527                        return -EFAULT;
1528                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1529                if (unlikely(put_user(pos, offset)))
1530                        return -EFAULT;
1531                return ret;
1532        }
1533
1534        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1535}
1536#endif
1537
1538/*
1539 * copy_file_range() differs from regular file read and write in that it
1540 * specifically allows return partial success.  When it does so is up to
1541 * the copy_file_range method.
1542 */
1543ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1544                            struct file *file_out, loff_t pos_out,
1545                            size_t len, unsigned int flags)
1546{
1547        struct inode *inode_in = file_inode(file_in);
1548        struct inode *inode_out = file_inode(file_out);
1549        ssize_t ret;
1550
1551        if (flags != 0)
1552                return -EINVAL;
1553
1554        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1555                return -EISDIR;
1556        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1557                return -EINVAL;
1558
1559        ret = rw_verify_area(READ, file_in, &pos_in, len);
1560        if (unlikely(ret))
1561                return ret;
1562
1563        ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1564        if (unlikely(ret))
1565                return ret;
1566
1567        if (!(file_in->f_mode & FMODE_READ) ||
1568            !(file_out->f_mode & FMODE_WRITE) ||
1569            (file_out->f_flags & O_APPEND))
1570                return -EBADF;
1571
1572        /* this could be relaxed once a method supports cross-fs copies */
1573        if (inode_in->i_sb != inode_out->i_sb)
1574                return -EXDEV;
1575
1576        if (len == 0)
1577                return 0;
1578
1579        file_start_write(file_out);
1580
1581        /*
1582         * Try cloning first, this is supported by more file systems, and
1583         * more efficient if both clone and copy are supported (e.g. NFS).
1584         */
1585        if (file_in->f_op->clone_file_range) {
1586                ret = file_in->f_op->clone_file_range(file_in, pos_in,
1587                                file_out, pos_out, len);
1588                if (ret == 0) {
1589                        ret = len;
1590                        goto done;
1591                }
1592        }
1593
1594        if (file_out->f_op->copy_file_range) {
1595                ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
1596                                                      pos_out, len, flags);
1597                if (ret != -EOPNOTSUPP)
1598                        goto done;
1599        }
1600
1601        ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1602                        len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1603
1604done:
1605        if (ret > 0) {
1606                fsnotify_access(file_in);
1607                add_rchar(current, ret);
1608                fsnotify_modify(file_out);
1609                add_wchar(current, ret);
1610        }
1611
1612        inc_syscr(current);
1613        inc_syscw(current);
1614
1615        file_end_write(file_out);
1616
1617        return ret;
1618}
1619EXPORT_SYMBOL(vfs_copy_file_range);
1620
1621SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1622                int, fd_out, loff_t __user *, off_out,
1623                size_t, len, unsigned int, flags)
1624{
1625        loff_t pos_in;
1626        loff_t pos_out;
1627        struct fd f_in;
1628        struct fd f_out;
1629        ssize_t ret = -EBADF;
1630
1631        f_in = fdget(fd_in);
1632        if (!f_in.file)
1633                goto out2;
1634
1635        f_out = fdget(fd_out);
1636        if (!f_out.file)
1637                goto out1;
1638
1639        ret = -EFAULT;
1640        if (off_in) {
1641                if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1642                        goto out;
1643        } else {
1644                pos_in = f_in.file->f_pos;
1645        }
1646
1647        if (off_out) {
1648                if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1649                        goto out;
1650        } else {
1651                pos_out = f_out.file->f_pos;
1652        }
1653
1654        ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
1655                                  flags);
1656        if (ret > 0) {
1657                pos_in += ret;
1658                pos_out += ret;
1659
1660                if (off_in) {
1661                        if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1662                                ret = -EFAULT;
1663                } else {
1664                        f_in.file->f_pos = pos_in;
1665                }
1666
1667                if (off_out) {
1668                        if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1669                                ret = -EFAULT;
1670                } else {
1671                        f_out.file->f_pos = pos_out;
1672                }
1673        }
1674
1675out:
1676        fdput(f_out);
1677out1:
1678        fdput(f_in);
1679out2:
1680        return ret;
1681}
1682
1683static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
1684{
1685        struct inode *inode = file_inode(file);
1686
1687        if (unlikely(pos < 0))
1688                return -EINVAL;
1689
1690         if (unlikely((loff_t) (pos + len) < 0))
1691                return -EINVAL;
1692
1693        if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
1694                loff_t end = len ? pos + len - 1 : OFFSET_MAX;
1695                int retval;
1696
1697                retval = locks_mandatory_area(inode, file, pos, end,
1698                                write ? F_WRLCK : F_RDLCK);
1699                if (retval < 0)
1700                        return retval;
1701        }
1702
1703        return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
1704}
1705
1706/*
1707 * Check that the two inodes are eligible for cloning, the ranges make
1708 * sense, and then flush all dirty data.  Caller must ensure that the
1709 * inodes have been locked against any other modifications.
1710 *
1711 * Returns: 0 for "nothing to clone", 1 for "something to clone", or
1712 * the usual negative error code.
1713 */
1714int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
1715                               struct inode *inode_out, loff_t pos_out,
1716                               u64 *len, bool is_dedupe)
1717{
1718        loff_t bs = inode_out->i_sb->s_blocksize;
1719        loff_t blen;
1720        loff_t isize;
1721        bool same_inode = (inode_in == inode_out);
1722        int ret;
1723
1724        /* Don't touch certain kinds of inodes */
1725        if (IS_IMMUTABLE(inode_out))
1726                return -EPERM;
1727
1728        if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1729                return -ETXTBSY;
1730
1731        /* Don't reflink dirs, pipes, sockets... */
1732        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1733                return -EISDIR;
1734        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1735                return -EINVAL;
1736
1737        /* Are we going all the way to the end? */
1738        isize = i_size_read(inode_in);
1739        if (isize == 0)
1740                return 0;
1741
1742        /* Zero length dedupe exits immediately; reflink goes to EOF. */
1743        if (*len == 0) {
1744                if (is_dedupe || pos_in == isize)
1745                        return 0;
1746                if (pos_in > isize)
1747                        return -EINVAL;
1748                *len = isize - pos_in;
1749        }
1750
1751        /* Ensure offsets don't wrap and the input is inside i_size */
1752        if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
1753            pos_in + *len > isize)
1754                return -EINVAL;
1755
1756        /* Don't allow dedupe past EOF in the dest file */
1757        if (is_dedupe) {
1758                loff_t  disize;
1759
1760                disize = i_size_read(inode_out);
1761                if (pos_out >= disize || pos_out + *len > disize)
1762                        return -EINVAL;
1763        }
1764
1765        /* If we're linking to EOF, continue to the block boundary. */
1766        if (pos_in + *len == isize)
1767                blen = ALIGN(isize, bs) - pos_in;
1768        else
1769                blen = *len;
1770
1771        /* Only reflink if we're aligned to block boundaries */
1772        if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
1773            !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
1774                return -EINVAL;
1775
1776        /* Don't allow overlapped reflink within the same file */
1777        if (same_inode) {
1778                if (pos_out + blen > pos_in && pos_out < pos_in + blen)
1779                        return -EINVAL;
1780        }
1781
1782        /* Wait for the completion of any pending IOs on both files */
1783        inode_dio_wait(inode_in);
1784        if (!same_inode)
1785                inode_dio_wait(inode_out);
1786
1787        ret = filemap_write_and_wait_range(inode_in->i_mapping,
1788                        pos_in, pos_in + *len - 1);
1789        if (ret)
1790                return ret;
1791
1792        ret = filemap_write_and_wait_range(inode_out->i_mapping,
1793                        pos_out, pos_out + *len - 1);
1794        if (ret)
1795                return ret;
1796
1797        /*
1798         * Check that the extents are the same.
1799         */
1800        if (is_dedupe) {
1801                bool            is_same = false;
1802
1803                ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
1804                                inode_out, pos_out, *len, &is_same);
1805                if (ret)
1806                        return ret;
1807                if (!is_same)
1808                        return -EBADE;
1809        }
1810
1811        return 1;
1812}
1813EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
1814
1815int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
1816                struct file *file_out, loff_t pos_out, u64 len)
1817{
1818        struct inode *inode_in = file_inode(file_in);
1819        struct inode *inode_out = file_inode(file_out);
1820        int ret;
1821
1822        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1823                return -EISDIR;
1824        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1825                return -EINVAL;
1826
1827        /*
1828         * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
1829         * the same mount. Practically, they only need to be on the same file
1830         * system.
1831         */
1832        if (inode_in->i_sb != inode_out->i_sb)
1833                return -EXDEV;
1834
1835        if (!(file_in->f_mode & FMODE_READ) ||
1836            !(file_out->f_mode & FMODE_WRITE) ||
1837            (file_out->f_flags & O_APPEND))
1838                return -EBADF;
1839
1840        if (!file_in->f_op->clone_file_range)
1841                return -EOPNOTSUPP;
1842
1843        ret = clone_verify_area(file_in, pos_in, len, false);
1844        if (ret)
1845                return ret;
1846
1847        ret = clone_verify_area(file_out, pos_out, len, true);
1848        if (ret)
1849                return ret;
1850
1851        if (pos_in + len > i_size_read(inode_in))
1852                return -EINVAL;
1853
1854        ret = file_in->f_op->clone_file_range(file_in, pos_in,
1855                        file_out, pos_out, len);
1856        if (!ret) {
1857                fsnotify_access(file_in);
1858                fsnotify_modify(file_out);
1859        }
1860
1861        return ret;
1862}
1863EXPORT_SYMBOL(vfs_clone_file_range);
1864
1865/*
1866 * Read a page's worth of file data into the page cache.  Return the page
1867 * locked.
1868 */
1869static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
1870{
1871        struct address_space *mapping;
1872        struct page *page;
1873        pgoff_t n;
1874
1875        n = offset >> PAGE_SHIFT;
1876        mapping = inode->i_mapping;
1877        page = read_mapping_page(mapping, n, NULL);
1878        if (IS_ERR(page))
1879                return page;
1880        if (!PageUptodate(page)) {
1881                put_page(page);
1882                return ERR_PTR(-EIO);
1883        }
1884        lock_page(page);
1885        return page;
1886}
1887
1888/*
1889 * Compare extents of two files to see if they are the same.
1890 * Caller must have locked both inodes to prevent write races.
1891 */
1892int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
1893                                  struct inode *dest, loff_t destoff,
1894                                  loff_t len, bool *is_same)
1895{
1896        loff_t src_poff;
1897        loff_t dest_poff;
1898        void *src_addr;
1899        void *dest_addr;
1900        struct page *src_page;
1901        struct page *dest_page;
1902        loff_t cmp_len;
1903        bool same;
1904        int error;
1905
1906        error = -EINVAL;
1907        same = true;
1908        while (len) {
1909                src_poff = srcoff & (PAGE_SIZE - 1);
1910                dest_poff = destoff & (PAGE_SIZE - 1);
1911                cmp_len = min(PAGE_SIZE - src_poff,
1912                              PAGE_SIZE - dest_poff);
1913                cmp_len = min(cmp_len, len);
1914                if (cmp_len <= 0)
1915                        goto out_error;
1916
1917                src_page = vfs_dedupe_get_page(src, srcoff);
1918                if (IS_ERR(src_page)) {
1919                        error = PTR_ERR(src_page);
1920                        goto out_error;
1921                }
1922                dest_page = vfs_dedupe_get_page(dest, destoff);
1923                if (IS_ERR(dest_page)) {
1924                        error = PTR_ERR(dest_page);
1925                        unlock_page(src_page);
1926                        put_page(src_page);
1927                        goto out_error;
1928                }
1929                src_addr = kmap_atomic(src_page);
1930                dest_addr = kmap_atomic(dest_page);
1931
1932                flush_dcache_page(src_page);
1933                flush_dcache_page(dest_page);
1934
1935                if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
1936                        same = false;
1937
1938                kunmap_atomic(dest_addr);
1939                kunmap_atomic(src_addr);
1940                unlock_page(dest_page);
1941                unlock_page(src_page);
1942                put_page(dest_page);
1943                put_page(src_page);
1944
1945                if (!same)
1946                        break;
1947
1948                srcoff += cmp_len;
1949                destoff += cmp_len;
1950                len -= cmp_len;
1951        }
1952
1953        *is_same = same;
1954        return 0;
1955
1956out_error:
1957        return error;
1958}
1959EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
1960
1961int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
1962{
1963        struct file_dedupe_range_info *info;
1964        struct inode *src = file_inode(file);
1965        u64 off;
1966        u64 len;
1967        int i;
1968        int ret;
1969        bool is_admin = capable(CAP_SYS_ADMIN);
1970        u16 count = same->dest_count;
1971        struct file *dst_file;
1972        loff_t dst_off;
1973        ssize_t deduped;
1974
1975        if (!(file->f_mode & FMODE_READ))
1976                return -EINVAL;
1977
1978        if (same->reserved1 || same->reserved2)
1979                return -EINVAL;
1980
1981        off = same->src_offset;
1982        len = same->src_length;
1983
1984        ret = -EISDIR;
1985        if (S_ISDIR(src->i_mode))
1986                goto out;
1987
1988        ret = -EINVAL;
1989        if (!S_ISREG(src->i_mode))
1990                goto out;
1991
1992        ret = clone_verify_area(file, off, len, false);
1993        if (ret < 0)
1994                goto out;
1995        ret = 0;
1996
1997        if (off + len > i_size_read(src))
1998                return -EINVAL;
1999
2000        /* pre-format output fields to sane values */
2001        for (i = 0; i < count; i++) {
2002                same->info[i].bytes_deduped = 0ULL;
2003                same->info[i].status = FILE_DEDUPE_RANGE_SAME;
2004        }
2005
2006        for (i = 0, info = same->info; i < count; i++, info++) {
2007                struct inode *dst;
2008                struct fd dst_fd = fdget(info->dest_fd);
2009
2010                dst_file = dst_fd.file;
2011                if (!dst_file) {
2012                        info->status = -EBADF;
2013                        goto next_loop;
2014                }
2015                dst = file_inode(dst_file);
2016
2017                ret = mnt_want_write_file(dst_file);
2018                if (ret) {
2019                        info->status = ret;
2020                        goto next_loop;
2021                }
2022
2023                dst_off = info->dest_offset;
2024                ret = clone_verify_area(dst_file, dst_off, len, true);
2025                if (ret < 0) {
2026                        info->status = ret;
2027                        goto next_file;
2028                }
2029                ret = 0;
2030
2031                if (info->reserved) {
2032                        info->status = -EINVAL;
2033                } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
2034                        info->status = -EINVAL;
2035                } else if (file->f_path.mnt != dst_file->f_path.mnt) {
2036                        info->status = -EXDEV;
2037                } else if (S_ISDIR(dst->i_mode)) {
2038                        info->status = -EISDIR;
2039                } else if (dst_file->f_op->dedupe_file_range == NULL) {
2040                        info->status = -EINVAL;
2041                } else {
2042                        deduped = dst_file->f_op->dedupe_file_range(file, off,
2043                                                        len, dst_file,
2044                                                        info->dest_offset);
2045                        if (deduped == -EBADE)
2046                                info->status = FILE_DEDUPE_RANGE_DIFFERS;
2047                        else if (deduped < 0)
2048                                info->status = deduped;
2049                        else
2050                                info->bytes_deduped += deduped;
2051                }
2052
2053next_file:
2054                mnt_drop_write_file(dst_file);
2055next_loop:
2056                fdput(dst_fd);
2057
2058                if (fatal_signal_pending(current))
2059                        goto out;
2060        }
2061
2062out:
2063        return ret;
2064}
2065EXPORT_SYMBOL(vfs_dedupe_file_range);
2066