linux/fs/read_write.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/read_write.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/slab.h>
   9#include <linux/stat.h>
  10#include <linux/sched/xacct.h>
  11#include <linux/fcntl.h>
  12#include <linux/file.h>
  13#include <linux/uio.h>
  14#include <linux/fsnotify.h>
  15#include <linux/security.h>
  16#include <linux/export.h>
  17#include <linux/syscalls.h>
  18#include <linux/pagemap.h>
  19#include <linux/splice.h>
  20#include <linux/compat.h>
  21#include <linux/mount.h>
  22#include <linux/fs.h>
  23#include "internal.h"
  24
  25#include <linux/uaccess.h>
  26#include <asm/unistd.h>
  27
  28const struct file_operations generic_ro_fops = {
  29        .llseek         = generic_file_llseek,
  30        .read_iter      = generic_file_read_iter,
  31        .mmap           = generic_file_readonly_mmap,
  32        .splice_read    = generic_file_splice_read,
  33};
  34
  35EXPORT_SYMBOL(generic_ro_fops);
  36
  37static inline bool unsigned_offsets(struct file *file)
  38{
  39        return file->f_mode & FMODE_UNSIGNED_OFFSET;
  40}
  41
  42/**
  43 * vfs_setpos - update the file offset for lseek
  44 * @file:       file structure in question
  45 * @offset:     file offset to seek to
  46 * @maxsize:    maximum file size
  47 *
  48 * This is a low-level filesystem helper for updating the file offset to
  49 * the value specified by @offset if the given offset is valid and it is
  50 * not equal to the current file offset.
  51 *
  52 * Return the specified offset on success and -EINVAL on invalid offset.
  53 */
  54loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  55{
  56        if (offset < 0 && !unsigned_offsets(file))
  57                return -EINVAL;
  58        if (offset > maxsize)
  59                return -EINVAL;
  60
  61        if (offset != file->f_pos) {
  62                file->f_pos = offset;
  63                file->f_version = 0;
  64        }
  65        return offset;
  66}
  67EXPORT_SYMBOL(vfs_setpos);
  68
  69/**
  70 * generic_file_llseek_size - generic llseek implementation for regular files
  71 * @file:       file structure to seek on
  72 * @offset:     file offset to seek to
  73 * @whence:     type of seek
  74 * @size:       max size of this file in file system
  75 * @eof:        offset used for SEEK_END position
  76 *
  77 * This is a variant of generic_file_llseek that allows passing in a custom
  78 * maximum file size and a custom EOF position, for e.g. hashed directories
  79 *
  80 * Synchronization:
  81 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  82 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  83 * read/writes behave like SEEK_SET against seeks.
  84 */
  85loff_t
  86generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  87                loff_t maxsize, loff_t eof)
  88{
  89        switch (whence) {
  90        case SEEK_END:
  91                offset += eof;
  92                break;
  93        case SEEK_CUR:
  94                /*
  95                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  96                 * position-querying operation.  Avoid rewriting the "same"
  97                 * f_pos value back to the file because a concurrent read(),
  98                 * write() or lseek() might have altered it
  99                 */
 100                if (offset == 0)
 101                        return file->f_pos;
 102                /*
 103                 * f_lock protects against read/modify/write race with other
 104                 * SEEK_CURs. Note that parallel writes and reads behave
 105                 * like SEEK_SET.
 106                 */
 107                spin_lock(&file->f_lock);
 108                offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 109                spin_unlock(&file->f_lock);
 110                return offset;
 111        case SEEK_DATA:
 112                /*
 113                 * In the generic case the entire file is data, so as long as
 114                 * offset isn't at the end of the file then the offset is data.
 115                 */
 116                if ((unsigned long long)offset >= eof)
 117                        return -ENXIO;
 118                break;
 119        case SEEK_HOLE:
 120                /*
 121                 * There is a virtual hole at the end of the file, so as long as
 122                 * offset isn't i_size or larger, return i_size.
 123                 */
 124                if ((unsigned long long)offset >= eof)
 125                        return -ENXIO;
 126                offset = eof;
 127                break;
 128        }
 129
 130        return vfs_setpos(file, offset, maxsize);
 131}
 132EXPORT_SYMBOL(generic_file_llseek_size);
 133
 134/**
 135 * generic_file_llseek - generic llseek implementation for regular files
 136 * @file:       file structure to seek on
 137 * @offset:     file offset to seek to
 138 * @whence:     type of seek
 139 *
 140 * This is a generic implemenation of ->llseek useable for all normal local
 141 * filesystems.  It just updates the file offset to the value specified by
 142 * @offset and @whence.
 143 */
 144loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 145{
 146        struct inode *inode = file->f_mapping->host;
 147
 148        return generic_file_llseek_size(file, offset, whence,
 149                                        inode->i_sb->s_maxbytes,
 150                                        i_size_read(inode));
 151}
 152EXPORT_SYMBOL(generic_file_llseek);
 153
 154/**
 155 * fixed_size_llseek - llseek implementation for fixed-sized devices
 156 * @file:       file structure to seek on
 157 * @offset:     file offset to seek to
 158 * @whence:     type of seek
 159 * @size:       size of the file
 160 *
 161 */
 162loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 163{
 164        switch (whence) {
 165        case SEEK_SET: case SEEK_CUR: case SEEK_END:
 166                return generic_file_llseek_size(file, offset, whence,
 167                                                size, size);
 168        default:
 169                return -EINVAL;
 170        }
 171}
 172EXPORT_SYMBOL(fixed_size_llseek);
 173
 174/**
 175 * no_seek_end_llseek - llseek implementation for fixed-sized devices
 176 * @file:       file structure to seek on
 177 * @offset:     file offset to seek to
 178 * @whence:     type of seek
 179 *
 180 */
 181loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
 182{
 183        switch (whence) {
 184        case SEEK_SET: case SEEK_CUR:
 185                return generic_file_llseek_size(file, offset, whence,
 186                                                OFFSET_MAX, 0);
 187        default:
 188                return -EINVAL;
 189        }
 190}
 191EXPORT_SYMBOL(no_seek_end_llseek);
 192
 193/**
 194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
 195 * @file:       file structure to seek on
 196 * @offset:     file offset to seek to
 197 * @whence:     type of seek
 198 * @size:       maximal offset allowed
 199 *
 200 */
 201loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
 202{
 203        switch (whence) {
 204        case SEEK_SET: case SEEK_CUR:
 205                return generic_file_llseek_size(file, offset, whence,
 206                                                size, 0);
 207        default:
 208                return -EINVAL;
 209        }
 210}
 211EXPORT_SYMBOL(no_seek_end_llseek_size);
 212
 213/**
 214 * noop_llseek - No Operation Performed llseek implementation
 215 * @file:       file structure to seek on
 216 * @offset:     file offset to seek to
 217 * @whence:     type of seek
 218 *
 219 * This is an implementation of ->llseek useable for the rare special case when
 220 * userspace expects the seek to succeed but the (device) file is actually not
 221 * able to perform the seek. In this case you use noop_llseek() instead of
 222 * falling back to the default implementation of ->llseek.
 223 */
 224loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 225{
 226        return file->f_pos;
 227}
 228EXPORT_SYMBOL(noop_llseek);
 229
 230loff_t no_llseek(struct file *file, loff_t offset, int whence)
 231{
 232        return -ESPIPE;
 233}
 234EXPORT_SYMBOL(no_llseek);
 235
 236loff_t default_llseek(struct file *file, loff_t offset, int whence)
 237{
 238        struct inode *inode = file_inode(file);
 239        loff_t retval;
 240
 241        inode_lock(inode);
 242        switch (whence) {
 243                case SEEK_END:
 244                        offset += i_size_read(inode);
 245                        break;
 246                case SEEK_CUR:
 247                        if (offset == 0) {
 248                                retval = file->f_pos;
 249                                goto out;
 250                        }
 251                        offset += file->f_pos;
 252                        break;
 253                case SEEK_DATA:
 254                        /*
 255                         * In the generic case the entire file is data, so as
 256                         * long as offset isn't at the end of the file then the
 257                         * offset is data.
 258                         */
 259                        if (offset >= inode->i_size) {
 260                                retval = -ENXIO;
 261                                goto out;
 262                        }
 263                        break;
 264                case SEEK_HOLE:
 265                        /*
 266                         * There is a virtual hole at the end of the file, so
 267                         * as long as offset isn't i_size or larger, return
 268                         * i_size.
 269                         */
 270                        if (offset >= inode->i_size) {
 271                                retval = -ENXIO;
 272                                goto out;
 273                        }
 274                        offset = inode->i_size;
 275                        break;
 276        }
 277        retval = -EINVAL;
 278        if (offset >= 0 || unsigned_offsets(file)) {
 279                if (offset != file->f_pos) {
 280                        file->f_pos = offset;
 281                        file->f_version = 0;
 282                }
 283                retval = offset;
 284        }
 285out:
 286        inode_unlock(inode);
 287        return retval;
 288}
 289EXPORT_SYMBOL(default_llseek);
 290
 291loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 292{
 293        loff_t (*fn)(struct file *, loff_t, int);
 294
 295        fn = no_llseek;
 296        if (file->f_mode & FMODE_LSEEK) {
 297                if (file->f_op->llseek)
 298                        fn = file->f_op->llseek;
 299        }
 300        return fn(file, offset, whence);
 301}
 302EXPORT_SYMBOL(vfs_llseek);
 303
 304static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
 305{
 306        off_t retval;
 307        struct fd f = fdget_pos(fd);
 308        if (!f.file)
 309                return -EBADF;
 310
 311        retval = -EINVAL;
 312        if (whence <= SEEK_MAX) {
 313                loff_t res = vfs_llseek(f.file, offset, whence);
 314                retval = res;
 315                if (res != (loff_t)retval)
 316                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
 317        }
 318        fdput_pos(f);
 319        return retval;
 320}
 321
 322SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 323{
 324        return ksys_lseek(fd, offset, whence);
 325}
 326
 327#ifdef CONFIG_COMPAT
 328COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
 329{
 330        return ksys_lseek(fd, offset, whence);
 331}
 332#endif
 333
 334#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
 335        defined(__ARCH_WANT_SYS_LLSEEK)
 336SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 337                unsigned long, offset_low, loff_t __user *, result,
 338                unsigned int, whence)
 339{
 340        int retval;
 341        struct fd f = fdget_pos(fd);
 342        loff_t offset;
 343
 344        if (!f.file)
 345                return -EBADF;
 346
 347        retval = -EINVAL;
 348        if (whence > SEEK_MAX)
 349                goto out_putf;
 350
 351        offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 352                        whence);
 353
 354        retval = (int)offset;
 355        if (offset >= 0) {
 356                retval = -EFAULT;
 357                if (!copy_to_user(result, &offset, sizeof(offset)))
 358                        retval = 0;
 359        }
 360out_putf:
 361        fdput_pos(f);
 362        return retval;
 363}
 364#endif
 365
 366int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 367{
 368        struct inode *inode;
 369        int retval = -EINVAL;
 370
 371        inode = file_inode(file);
 372        if (unlikely((ssize_t) count < 0))
 373                return retval;
 374
 375        /*
 376         * ranged mandatory locking does not apply to streams - it makes sense
 377         * only for files where position has a meaning.
 378         */
 379        if (ppos) {
 380                loff_t pos = *ppos;
 381
 382                if (unlikely(pos < 0)) {
 383                        if (!unsigned_offsets(file))
 384                                return retval;
 385                        if (count >= -pos) /* both values are in 0..LLONG_MAX */
 386                                return -EOVERFLOW;
 387                } else if (unlikely((loff_t) (pos + count) < 0)) {
 388                        if (!unsigned_offsets(file))
 389                                return retval;
 390                }
 391
 392                if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
 393                        retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
 394                                        read_write == READ ? F_RDLCK : F_WRLCK);
 395                        if (retval < 0)
 396                                return retval;
 397                }
 398        }
 399
 400        return security_file_permission(file,
 401                                read_write == READ ? MAY_READ : MAY_WRITE);
 402}
 403
 404static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 405{
 406        struct iovec iov = { .iov_base = buf, .iov_len = len };
 407        struct kiocb kiocb;
 408        struct iov_iter iter;
 409        ssize_t ret;
 410
 411        init_sync_kiocb(&kiocb, filp);
 412        kiocb.ki_pos = (ppos ? *ppos : 0);
 413        iov_iter_init(&iter, READ, &iov, 1, len);
 414
 415        ret = call_read_iter(filp, &kiocb, &iter);
 416        BUG_ON(ret == -EIOCBQUEUED);
 417        if (ppos)
 418                *ppos = kiocb.ki_pos;
 419        return ret;
 420}
 421
 422static int warn_unsupported(struct file *file, const char *op)
 423{
 424        pr_warn_ratelimited(
 425                "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
 426                op, file, current->pid, current->comm);
 427        return -EINVAL;
 428}
 429
 430ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 431{
 432        struct kvec iov = {
 433                .iov_base       = buf,
 434                .iov_len        = min_t(size_t, count, MAX_RW_COUNT),
 435        };
 436        struct kiocb kiocb;
 437        struct iov_iter iter;
 438        ssize_t ret;
 439
 440        if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
 441                return -EINVAL;
 442        if (!(file->f_mode & FMODE_CAN_READ))
 443                return -EINVAL;
 444        /*
 445         * Also fail if ->read_iter and ->read are both wired up as that
 446         * implies very convoluted semantics.
 447         */
 448        if (unlikely(!file->f_op->read_iter || file->f_op->read))
 449                return warn_unsupported(file, "read");
 450
 451        init_sync_kiocb(&kiocb, file);
 452        kiocb.ki_pos = pos ? *pos : 0;
 453        iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len);
 454        ret = file->f_op->read_iter(&kiocb, &iter);
 455        if (ret > 0) {
 456                if (pos)
 457                        *pos = kiocb.ki_pos;
 458                fsnotify_access(file);
 459                add_rchar(current, ret);
 460        }
 461        inc_syscr(current);
 462        return ret;
 463}
 464
 465ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 466{
 467        ssize_t ret;
 468
 469        ret = rw_verify_area(READ, file, pos, count);
 470        if (ret)
 471                return ret;
 472        return __kernel_read(file, buf, count, pos);
 473}
 474EXPORT_SYMBOL(kernel_read);
 475
 476ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 477{
 478        ssize_t ret;
 479
 480        if (!(file->f_mode & FMODE_READ))
 481                return -EBADF;
 482        if (!(file->f_mode & FMODE_CAN_READ))
 483                return -EINVAL;
 484        if (unlikely(!access_ok(buf, count)))
 485                return -EFAULT;
 486
 487        ret = rw_verify_area(READ, file, pos, count);
 488        if (ret)
 489                return ret;
 490        if (count > MAX_RW_COUNT)
 491                count =  MAX_RW_COUNT;
 492
 493        if (file->f_op->read)
 494                ret = file->f_op->read(file, buf, count, pos);
 495        else if (file->f_op->read_iter)
 496                ret = new_sync_read(file, buf, count, pos);
 497        else
 498                ret = -EINVAL;
 499        if (ret > 0) {
 500                fsnotify_access(file);
 501                add_rchar(current, ret);
 502        }
 503        inc_syscr(current);
 504        return ret;
 505}
 506
 507static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 508{
 509        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 510        struct kiocb kiocb;
 511        struct iov_iter iter;
 512        ssize_t ret;
 513
 514        init_sync_kiocb(&kiocb, filp);
 515        kiocb.ki_pos = (ppos ? *ppos : 0);
 516        iov_iter_init(&iter, WRITE, &iov, 1, len);
 517
 518        ret = call_write_iter(filp, &kiocb, &iter);
 519        BUG_ON(ret == -EIOCBQUEUED);
 520        if (ret > 0 && ppos)
 521                *ppos = kiocb.ki_pos;
 522        return ret;
 523}
 524
 525/* caller is responsible for file_start_write/file_end_write */
 526ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
 527{
 528        struct kvec iov = {
 529                .iov_base       = (void *)buf,
 530                .iov_len        = min_t(size_t, count, MAX_RW_COUNT),
 531        };
 532        struct kiocb kiocb;
 533        struct iov_iter iter;
 534        ssize_t ret;
 535
 536        if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
 537                return -EBADF;
 538        if (!(file->f_mode & FMODE_CAN_WRITE))
 539                return -EINVAL;
 540        /*
 541         * Also fail if ->write_iter and ->write are both wired up as that
 542         * implies very convoluted semantics.
 543         */
 544        if (unlikely(!file->f_op->write_iter || file->f_op->write))
 545                return warn_unsupported(file, "write");
 546
 547        init_sync_kiocb(&kiocb, file);
 548        kiocb.ki_pos = pos ? *pos : 0;
 549        iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
 550        ret = file->f_op->write_iter(&kiocb, &iter);
 551        if (ret > 0) {
 552                if (pos)
 553                        *pos = kiocb.ki_pos;
 554                fsnotify_modify(file);
 555                add_wchar(current, ret);
 556        }
 557        inc_syscw(current);
 558        return ret;
 559}
 560/*
 561 * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
 562 * but autofs is one of the few internal kernel users that actually
 563 * wants this _and_ can be built as a module. So we need to export
 564 * this symbol for autofs, even though it really isn't appropriate
 565 * for any other kernel modules.
 566 */
 567EXPORT_SYMBOL_GPL(__kernel_write);
 568
 569ssize_t kernel_write(struct file *file, const void *buf, size_t count,
 570                            loff_t *pos)
 571{
 572        ssize_t ret;
 573
 574        ret = rw_verify_area(WRITE, file, pos, count);
 575        if (ret)
 576                return ret;
 577
 578        file_start_write(file);
 579        ret =  __kernel_write(file, buf, count, pos);
 580        file_end_write(file);
 581        return ret;
 582}
 583EXPORT_SYMBOL(kernel_write);
 584
 585ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 586{
 587        ssize_t ret;
 588
 589        if (!(file->f_mode & FMODE_WRITE))
 590                return -EBADF;
 591        if (!(file->f_mode & FMODE_CAN_WRITE))
 592                return -EINVAL;
 593        if (unlikely(!access_ok(buf, count)))
 594                return -EFAULT;
 595
 596        ret = rw_verify_area(WRITE, file, pos, count);
 597        if (ret)
 598                return ret;
 599        if (count > MAX_RW_COUNT)
 600                count =  MAX_RW_COUNT;
 601        file_start_write(file);
 602        if (file->f_op->write)
 603                ret = file->f_op->write(file, buf, count, pos);
 604        else if (file->f_op->write_iter)
 605                ret = new_sync_write(file, buf, count, pos);
 606        else
 607                ret = -EINVAL;
 608        if (ret > 0) {
 609                fsnotify_modify(file);
 610                add_wchar(current, ret);
 611        }
 612        inc_syscw(current);
 613        file_end_write(file);
 614        return ret;
 615}
 616
 617/* file_ppos returns &file->f_pos or NULL if file is stream */
 618static inline loff_t *file_ppos(struct file *file)
 619{
 620        return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
 621}
 622
 623ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 624{
 625        struct fd f = fdget_pos(fd);
 626        ssize_t ret = -EBADF;
 627
 628        if (f.file) {
 629                loff_t pos, *ppos = file_ppos(f.file);
 630                if (ppos) {
 631                        pos = *ppos;
 632                        ppos = &pos;
 633                }
 634                ret = vfs_read(f.file, buf, count, ppos);
 635                if (ret >= 0 && ppos)
 636                        f.file->f_pos = pos;
 637                fdput_pos(f);
 638        }
 639        return ret;
 640}
 641
 642SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 643{
 644        return ksys_read(fd, buf, count);
 645}
 646
 647ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
 648{
 649        struct fd f = fdget_pos(fd);
 650        ssize_t ret = -EBADF;
 651
 652        if (f.file) {
 653                loff_t pos, *ppos = file_ppos(f.file);
 654                if (ppos) {
 655                        pos = *ppos;
 656                        ppos = &pos;
 657                }
 658                ret = vfs_write(f.file, buf, count, ppos);
 659                if (ret >= 0 && ppos)
 660                        f.file->f_pos = pos;
 661                fdput_pos(f);
 662        }
 663
 664        return ret;
 665}
 666
 667SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 668                size_t, count)
 669{
 670        return ksys_write(fd, buf, count);
 671}
 672
 673ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
 674                     loff_t pos)
 675{
 676        struct fd f;
 677        ssize_t ret = -EBADF;
 678
 679        if (pos < 0)
 680                return -EINVAL;
 681
 682        f = fdget(fd);
 683        if (f.file) {
 684                ret = -ESPIPE;
 685                if (f.file->f_mode & FMODE_PREAD)
 686                        ret = vfs_read(f.file, buf, count, &pos);
 687                fdput(f);
 688        }
 689
 690        return ret;
 691}
 692
 693SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 694                        size_t, count, loff_t, pos)
 695{
 696        return ksys_pread64(fd, buf, count, pos);
 697}
 698
 699ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
 700                      size_t count, loff_t pos)
 701{
 702        struct fd f;
 703        ssize_t ret = -EBADF;
 704
 705        if (pos < 0)
 706                return -EINVAL;
 707
 708        f = fdget(fd);
 709        if (f.file) {
 710                ret = -ESPIPE;
 711                if (f.file->f_mode & FMODE_PWRITE)  
 712                        ret = vfs_write(f.file, buf, count, &pos);
 713                fdput(f);
 714        }
 715
 716        return ret;
 717}
 718
 719SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 720                         size_t, count, loff_t, pos)
 721{
 722        return ksys_pwrite64(fd, buf, count, pos);
 723}
 724
 725static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 726                loff_t *ppos, int type, rwf_t flags)
 727{
 728        struct kiocb kiocb;
 729        ssize_t ret;
 730
 731        init_sync_kiocb(&kiocb, filp);
 732        ret = kiocb_set_rw_flags(&kiocb, flags);
 733        if (ret)
 734                return ret;
 735        kiocb.ki_pos = (ppos ? *ppos : 0);
 736
 737        if (type == READ)
 738                ret = call_read_iter(filp, &kiocb, iter);
 739        else
 740                ret = call_write_iter(filp, &kiocb, iter);
 741        BUG_ON(ret == -EIOCBQUEUED);
 742        if (ppos)
 743                *ppos = kiocb.ki_pos;
 744        return ret;
 745}
 746
 747/* Do it by hand, with file-ops */
 748static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
 749                loff_t *ppos, int type, rwf_t flags)
 750{
 751        ssize_t ret = 0;
 752
 753        if (flags & ~RWF_HIPRI)
 754                return -EOPNOTSUPP;
 755
 756        while (iov_iter_count(iter)) {
 757                struct iovec iovec = iov_iter_iovec(iter);
 758                ssize_t nr;
 759
 760                if (type == READ) {
 761                        nr = filp->f_op->read(filp, iovec.iov_base,
 762                                              iovec.iov_len, ppos);
 763                } else {
 764                        nr = filp->f_op->write(filp, iovec.iov_base,
 765                                               iovec.iov_len, ppos);
 766                }
 767
 768                if (nr < 0) {
 769                        if (!ret)
 770                                ret = nr;
 771                        break;
 772                }
 773                ret += nr;
 774                if (nr != iovec.iov_len)
 775                        break;
 776                iov_iter_advance(iter, nr);
 777        }
 778
 779        return ret;
 780}
 781
 782static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
 783                loff_t *pos, rwf_t flags)
 784{
 785        size_t tot_len;
 786        ssize_t ret = 0;
 787
 788        if (!(file->f_mode & FMODE_READ))
 789                return -EBADF;
 790        if (!(file->f_mode & FMODE_CAN_READ))
 791                return -EINVAL;
 792
 793        tot_len = iov_iter_count(iter);
 794        if (!tot_len)
 795                goto out;
 796        ret = rw_verify_area(READ, file, pos, tot_len);
 797        if (ret < 0)
 798                return ret;
 799
 800        if (file->f_op->read_iter)
 801                ret = do_iter_readv_writev(file, iter, pos, READ, flags);
 802        else
 803                ret = do_loop_readv_writev(file, iter, pos, READ, flags);
 804out:
 805        if (ret >= 0)
 806                fsnotify_access(file);
 807        return ret;
 808}
 809
 810ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
 811                           struct iov_iter *iter)
 812{
 813        size_t tot_len;
 814        ssize_t ret = 0;
 815
 816        if (!file->f_op->read_iter)
 817                return -EINVAL;
 818        if (!(file->f_mode & FMODE_READ))
 819                return -EBADF;
 820        if (!(file->f_mode & FMODE_CAN_READ))
 821                return -EINVAL;
 822
 823        tot_len = iov_iter_count(iter);
 824        if (!tot_len)
 825                goto out;
 826        ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
 827        if (ret < 0)
 828                return ret;
 829
 830        ret = call_read_iter(file, iocb, iter);
 831out:
 832        if (ret >= 0)
 833                fsnotify_access(file);
 834        return ret;
 835}
 836EXPORT_SYMBOL(vfs_iocb_iter_read);
 837
 838ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
 839                rwf_t flags)
 840{
 841        if (!file->f_op->read_iter)
 842                return -EINVAL;
 843        return do_iter_read(file, iter, ppos, flags);
 844}
 845EXPORT_SYMBOL(vfs_iter_read);
 846
 847static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
 848                loff_t *pos, rwf_t flags)
 849{
 850        size_t tot_len;
 851        ssize_t ret = 0;
 852
 853        if (!(file->f_mode & FMODE_WRITE))
 854                return -EBADF;
 855        if (!(file->f_mode & FMODE_CAN_WRITE))
 856                return -EINVAL;
 857
 858        tot_len = iov_iter_count(iter);
 859        if (!tot_len)
 860                return 0;
 861        ret = rw_verify_area(WRITE, file, pos, tot_len);
 862        if (ret < 0)
 863                return ret;
 864
 865        if (file->f_op->write_iter)
 866                ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
 867        else
 868                ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
 869        if (ret > 0)
 870                fsnotify_modify(file);
 871        return ret;
 872}
 873
 874ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
 875                            struct iov_iter *iter)
 876{
 877        size_t tot_len;
 878        ssize_t ret = 0;
 879
 880        if (!file->f_op->write_iter)
 881                return -EINVAL;
 882        if (!(file->f_mode & FMODE_WRITE))
 883                return -EBADF;
 884        if (!(file->f_mode & FMODE_CAN_WRITE))
 885                return -EINVAL;
 886
 887        tot_len = iov_iter_count(iter);
 888        if (!tot_len)
 889                return 0;
 890        ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
 891        if (ret < 0)
 892                return ret;
 893
 894        ret = call_write_iter(file, iocb, iter);
 895        if (ret > 0)
 896                fsnotify_modify(file);
 897
 898        return ret;
 899}
 900EXPORT_SYMBOL(vfs_iocb_iter_write);
 901
 902ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
 903                rwf_t flags)
 904{
 905        if (!file->f_op->write_iter)
 906                return -EINVAL;
 907        return do_iter_write(file, iter, ppos, flags);
 908}
 909EXPORT_SYMBOL(vfs_iter_write);
 910
 911static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 912                  unsigned long vlen, loff_t *pos, rwf_t flags)
 913{
 914        struct iovec iovstack[UIO_FASTIOV];
 915        struct iovec *iov = iovstack;
 916        struct iov_iter iter;
 917        ssize_t ret;
 918
 919        ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
 920        if (ret >= 0) {
 921                ret = do_iter_read(file, &iter, pos, flags);
 922                kfree(iov);
 923        }
 924
 925        return ret;
 926}
 927
 928static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 929                   unsigned long vlen, loff_t *pos, rwf_t flags)
 930{
 931        struct iovec iovstack[UIO_FASTIOV];
 932        struct iovec *iov = iovstack;
 933        struct iov_iter iter;
 934        ssize_t ret;
 935
 936        ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
 937        if (ret >= 0) {
 938                file_start_write(file);
 939                ret = do_iter_write(file, &iter, pos, flags);
 940                file_end_write(file);
 941                kfree(iov);
 942        }
 943        return ret;
 944}
 945
 946static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
 947                        unsigned long vlen, rwf_t flags)
 948{
 949        struct fd f = fdget_pos(fd);
 950        ssize_t ret = -EBADF;
 951
 952        if (f.file) {
 953                loff_t pos, *ppos = file_ppos(f.file);
 954                if (ppos) {
 955                        pos = *ppos;
 956                        ppos = &pos;
 957                }
 958                ret = vfs_readv(f.file, vec, vlen, ppos, flags);
 959                if (ret >= 0 && ppos)
 960                        f.file->f_pos = pos;
 961                fdput_pos(f);
 962        }
 963
 964        if (ret > 0)
 965                add_rchar(current, ret);
 966        inc_syscr(current);
 967        return ret;
 968}
 969
 970static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
 971                         unsigned long vlen, rwf_t flags)
 972{
 973        struct fd f = fdget_pos(fd);
 974        ssize_t ret = -EBADF;
 975
 976        if (f.file) {
 977                loff_t pos, *ppos = file_ppos(f.file);
 978                if (ppos) {
 979                        pos = *ppos;
 980                        ppos = &pos;
 981                }
 982                ret = vfs_writev(f.file, vec, vlen, ppos, flags);
 983                if (ret >= 0 && ppos)
 984                        f.file->f_pos = pos;
 985                fdput_pos(f);
 986        }
 987
 988        if (ret > 0)
 989                add_wchar(current, ret);
 990        inc_syscw(current);
 991        return ret;
 992}
 993
 994static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
 995{
 996#define HALF_LONG_BITS (BITS_PER_LONG / 2)
 997        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
 998}
 999
1000static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1001                         unsigned long vlen, loff_t pos, rwf_t flags)
1002{
1003        struct fd f;
1004        ssize_t ret = -EBADF;
1005
1006        if (pos < 0)
1007                return -EINVAL;
1008
1009        f = fdget(fd);
1010        if (f.file) {
1011                ret = -ESPIPE;
1012                if (f.file->f_mode & FMODE_PREAD)
1013                        ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1014                fdput(f);
1015        }
1016
1017        if (ret > 0)
1018                add_rchar(current, ret);
1019        inc_syscr(current);
1020        return ret;
1021}
1022
1023static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1024                          unsigned long vlen, loff_t pos, rwf_t flags)
1025{
1026        struct fd f;
1027        ssize_t ret = -EBADF;
1028
1029        if (pos < 0)
1030                return -EINVAL;
1031
1032        f = fdget(fd);
1033        if (f.file) {
1034                ret = -ESPIPE;
1035                if (f.file->f_mode & FMODE_PWRITE)
1036                        ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1037                fdput(f);
1038        }
1039
1040        if (ret > 0)
1041                add_wchar(current, ret);
1042        inc_syscw(current);
1043        return ret;
1044}
1045
1046SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1047                unsigned long, vlen)
1048{
1049        return do_readv(fd, vec, vlen, 0);
1050}
1051
1052SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1053                unsigned long, vlen)
1054{
1055        return do_writev(fd, vec, vlen, 0);
1056}
1057
1058SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1059                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1060{
1061        loff_t pos = pos_from_hilo(pos_h, pos_l);
1062
1063        return do_preadv(fd, vec, vlen, pos, 0);
1064}
1065
1066SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1067                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1068                rwf_t, flags)
1069{
1070        loff_t pos = pos_from_hilo(pos_h, pos_l);
1071
1072        if (pos == -1)
1073                return do_readv(fd, vec, vlen, flags);
1074
1075        return do_preadv(fd, vec, vlen, pos, flags);
1076}
1077
1078SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1079                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1080{
1081        loff_t pos = pos_from_hilo(pos_h, pos_l);
1082
1083        return do_pwritev(fd, vec, vlen, pos, 0);
1084}
1085
1086SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1087                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1088                rwf_t, flags)
1089{
1090        loff_t pos = pos_from_hilo(pos_h, pos_l);
1091
1092        if (pos == -1)
1093                return do_writev(fd, vec, vlen, flags);
1094
1095        return do_pwritev(fd, vec, vlen, pos, flags);
1096}
1097
1098/*
1099 * Various compat syscalls.  Note that they all pretend to take a native
1100 * iovec - import_iovec will properly treat those as compat_iovecs based on
1101 * in_compat_syscall().
1102 */
1103#ifdef CONFIG_COMPAT
1104#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
1105COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1106                const struct iovec __user *, vec,
1107                unsigned long, vlen, loff_t, pos)
1108{
1109        return do_preadv(fd, vec, vlen, pos, 0);
1110}
1111#endif
1112
1113COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1114                const struct iovec __user *, vec,
1115                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1116{
1117        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1118
1119        return do_preadv(fd, vec, vlen, pos, 0);
1120}
1121
1122#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
1123COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
1124                const struct iovec __user *, vec,
1125                unsigned long, vlen, loff_t, pos, rwf_t, flags)
1126{
1127        if (pos == -1)
1128                return do_readv(fd, vec, vlen, flags);
1129        return do_preadv(fd, vec, vlen, pos, flags);
1130}
1131#endif
1132
1133COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1134                const struct iovec __user *, vec,
1135                compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1136                rwf_t, flags)
1137{
1138        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1139
1140        if (pos == -1)
1141                return do_readv(fd, vec, vlen, flags);
1142        return do_preadv(fd, vec, vlen, pos, flags);
1143}
1144
1145#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
1146COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1147                const struct iovec __user *, vec,
1148                unsigned long, vlen, loff_t, pos)
1149{
1150        return do_pwritev(fd, vec, vlen, pos, 0);
1151}
1152#endif
1153
1154COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1155                const struct iovec __user *,vec,
1156                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1157{
1158        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1159
1160        return do_pwritev(fd, vec, vlen, pos, 0);
1161}
1162
1163#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
1164COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
1165                const struct iovec __user *, vec,
1166                unsigned long, vlen, loff_t, pos, rwf_t, flags)
1167{
1168        if (pos == -1)
1169                return do_writev(fd, vec, vlen, flags);
1170        return do_pwritev(fd, vec, vlen, pos, flags);
1171}
1172#endif
1173
1174COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1175                const struct iovec __user *,vec,
1176                compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
1177{
1178        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1179
1180        if (pos == -1)
1181                return do_writev(fd, vec, vlen, flags);
1182        return do_pwritev(fd, vec, vlen, pos, flags);
1183}
1184#endif /* CONFIG_COMPAT */
1185
1186static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1187                           size_t count, loff_t max)
1188{
1189        struct fd in, out;
1190        struct inode *in_inode, *out_inode;
1191        struct pipe_inode_info *opipe;
1192        loff_t pos;
1193        loff_t out_pos;
1194        ssize_t retval;
1195        int fl;
1196
1197        /*
1198         * Get input file, and verify that it is ok..
1199         */
1200        retval = -EBADF;
1201        in = fdget(in_fd);
1202        if (!in.file)
1203                goto out;
1204        if (!(in.file->f_mode & FMODE_READ))
1205                goto fput_in;
1206        retval = -ESPIPE;
1207        if (!ppos) {
1208                pos = in.file->f_pos;
1209        } else {
1210                pos = *ppos;
1211                if (!(in.file->f_mode & FMODE_PREAD))
1212                        goto fput_in;
1213        }
1214        retval = rw_verify_area(READ, in.file, &pos, count);
1215        if (retval < 0)
1216                goto fput_in;
1217        if (count > MAX_RW_COUNT)
1218                count =  MAX_RW_COUNT;
1219
1220        /*
1221         * Get output file, and verify that it is ok..
1222         */
1223        retval = -EBADF;
1224        out = fdget(out_fd);
1225        if (!out.file)
1226                goto fput_in;
1227        if (!(out.file->f_mode & FMODE_WRITE))
1228                goto fput_out;
1229        in_inode = file_inode(in.file);
1230        out_inode = file_inode(out.file);
1231        out_pos = out.file->f_pos;
1232
1233        if (!max)
1234                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1235
1236        if (unlikely(pos + count > max)) {
1237                retval = -EOVERFLOW;
1238                if (pos >= max)
1239                        goto fput_out;
1240                count = max - pos;
1241        }
1242
1243        fl = 0;
1244#if 0
1245        /*
1246         * We need to debate whether we can enable this or not. The
1247         * man page documents EAGAIN return for the output at least,
1248         * and the application is arguably buggy if it doesn't expect
1249         * EAGAIN on a non-blocking file descriptor.
1250         */
1251        if (in.file->f_flags & O_NONBLOCK)
1252                fl = SPLICE_F_NONBLOCK;
1253#endif
1254        opipe = get_pipe_info(out.file, true);
1255        if (!opipe) {
1256                retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1257                if (retval < 0)
1258                        goto fput_out;
1259                file_start_write(out.file);
1260                retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
1261                                          count, fl);
1262                file_end_write(out.file);
1263        } else {
1264                retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
1265        }
1266
1267        if (retval > 0) {
1268                add_rchar(current, retval);
1269                add_wchar(current, retval);
1270                fsnotify_access(in.file);
1271                fsnotify_modify(out.file);
1272                out.file->f_pos = out_pos;
1273                if (ppos)
1274                        *ppos = pos;
1275                else
1276                        in.file->f_pos = pos;
1277        }
1278
1279        inc_syscr(current);
1280        inc_syscw(current);
1281        if (pos > max)
1282                retval = -EOVERFLOW;
1283
1284fput_out:
1285        fdput(out);
1286fput_in:
1287        fdput(in);
1288out:
1289        return retval;
1290}
1291
1292SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1293{
1294        loff_t pos;
1295        off_t off;
1296        ssize_t ret;
1297
1298        if (offset) {
1299                if (unlikely(get_user(off, offset)))
1300                        return -EFAULT;
1301                pos = off;
1302                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1303                if (unlikely(put_user(pos, offset)))
1304                        return -EFAULT;
1305                return ret;
1306        }
1307
1308        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1309}
1310
1311SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1312{
1313        loff_t pos;
1314        ssize_t ret;
1315
1316        if (offset) {
1317                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1318                        return -EFAULT;
1319                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1320                if (unlikely(put_user(pos, offset)))
1321                        return -EFAULT;
1322                return ret;
1323        }
1324
1325        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1326}
1327
1328#ifdef CONFIG_COMPAT
1329COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1330                compat_off_t __user *, offset, compat_size_t, count)
1331{
1332        loff_t pos;
1333        off_t off;
1334        ssize_t ret;
1335
1336        if (offset) {
1337                if (unlikely(get_user(off, offset)))
1338                        return -EFAULT;
1339                pos = off;
1340                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1341                if (unlikely(put_user(pos, offset)))
1342                        return -EFAULT;
1343                return ret;
1344        }
1345
1346        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1347}
1348
1349COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1350                compat_loff_t __user *, offset, compat_size_t, count)
1351{
1352        loff_t pos;
1353        ssize_t ret;
1354
1355        if (offset) {
1356                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1357                        return -EFAULT;
1358                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1359                if (unlikely(put_user(pos, offset)))
1360                        return -EFAULT;
1361                return ret;
1362        }
1363
1364        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1365}
1366#endif
1367
1368/**
1369 * generic_copy_file_range - copy data between two files
1370 * @file_in:    file structure to read from
1371 * @pos_in:     file offset to read from
1372 * @file_out:   file structure to write data to
1373 * @pos_out:    file offset to write data to
1374 * @len:        amount of data to copy
1375 * @flags:      copy flags
1376 *
1377 * This is a generic filesystem helper to copy data from one file to another.
1378 * It has no constraints on the source or destination file owners - the files
1379 * can belong to different superblocks and different filesystem types. Short
1380 * copies are allowed.
1381 *
1382 * This should be called from the @file_out filesystem, as per the
1383 * ->copy_file_range() method.
1384 *
1385 * Returns the number of bytes copied or a negative error indicating the
1386 * failure.
1387 */
1388
1389ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
1390                                struct file *file_out, loff_t pos_out,
1391                                size_t len, unsigned int flags)
1392{
1393        return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1394                                len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1395}
1396EXPORT_SYMBOL(generic_copy_file_range);
1397
1398static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
1399                                  struct file *file_out, loff_t pos_out,
1400                                  size_t len, unsigned int flags)
1401{
1402        /*
1403         * Although we now allow filesystems to handle cross sb copy, passing
1404         * a file of the wrong filesystem type to filesystem driver can result
1405         * in an attempt to dereference the wrong type of ->private_data, so
1406         * avoid doing that until we really have a good reason.  NFS defines
1407         * several different file_system_type structures, but they all end up
1408         * using the same ->copy_file_range() function pointer.
1409         */
1410        if (file_out->f_op->copy_file_range &&
1411            file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
1412                return file_out->f_op->copy_file_range(file_in, pos_in,
1413                                                       file_out, pos_out,
1414                                                       len, flags);
1415
1416        return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1417                                       flags);
1418}
1419
1420/*
1421 * Performs necessary checks before doing a file copy
1422 *
1423 * Can adjust amount of bytes to copy via @req_count argument.
1424 * Returns appropriate error code that caller should return or
1425 * zero in case the copy should be allowed.
1426 */
1427static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
1428                                    struct file *file_out, loff_t pos_out,
1429                                    size_t *req_count, unsigned int flags)
1430{
1431        struct inode *inode_in = file_inode(file_in);
1432        struct inode *inode_out = file_inode(file_out);
1433        uint64_t count = *req_count;
1434        loff_t size_in;
1435        int ret;
1436
1437        ret = generic_file_rw_checks(file_in, file_out);
1438        if (ret)
1439                return ret;
1440
1441        /* Don't touch certain kinds of inodes */
1442        if (IS_IMMUTABLE(inode_out))
1443                return -EPERM;
1444
1445        if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1446                return -ETXTBSY;
1447
1448        /* Ensure offsets don't wrap. */
1449        if (pos_in + count < pos_in || pos_out + count < pos_out)
1450                return -EOVERFLOW;
1451
1452        /* Shorten the copy to EOF */
1453        size_in = i_size_read(inode_in);
1454        if (pos_in >= size_in)
1455                count = 0;
1456        else
1457                count = min(count, size_in - (uint64_t)pos_in);
1458
1459        ret = generic_write_check_limits(file_out, pos_out, &count);
1460        if (ret)
1461                return ret;
1462
1463        /* Don't allow overlapped copying within the same file. */
1464        if (inode_in == inode_out &&
1465            pos_out + count > pos_in &&
1466            pos_out < pos_in + count)
1467                return -EINVAL;
1468
1469        *req_count = count;
1470        return 0;
1471}
1472
1473/*
1474 * copy_file_range() differs from regular file read and write in that it
1475 * specifically allows return partial success.  When it does so is up to
1476 * the copy_file_range method.
1477 */
1478ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1479                            struct file *file_out, loff_t pos_out,
1480                            size_t len, unsigned int flags)
1481{
1482        ssize_t ret;
1483
1484        if (flags != 0)
1485                return -EINVAL;
1486
1487        ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
1488                                       flags);
1489        if (unlikely(ret))
1490                return ret;
1491
1492        ret = rw_verify_area(READ, file_in, &pos_in, len);
1493        if (unlikely(ret))
1494                return ret;
1495
1496        ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1497        if (unlikely(ret))
1498                return ret;
1499
1500        if (len == 0)
1501                return 0;
1502
1503        file_start_write(file_out);
1504
1505        /*
1506         * Try cloning first, this is supported by more file systems, and
1507         * more efficient if both clone and copy are supported (e.g. NFS).
1508         */
1509        if (file_in->f_op->remap_file_range &&
1510            file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
1511                loff_t cloned;
1512
1513                cloned = file_in->f_op->remap_file_range(file_in, pos_in,
1514                                file_out, pos_out,
1515                                min_t(loff_t, MAX_RW_COUNT, len),
1516                                REMAP_FILE_CAN_SHORTEN);
1517                if (cloned > 0) {
1518                        ret = cloned;
1519                        goto done;
1520                }
1521        }
1522
1523        ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1524                                flags);
1525        WARN_ON_ONCE(ret == -EOPNOTSUPP);
1526done:
1527        if (ret > 0) {
1528                fsnotify_access(file_in);
1529                add_rchar(current, ret);
1530                fsnotify_modify(file_out);
1531                add_wchar(current, ret);
1532        }
1533
1534        inc_syscr(current);
1535        inc_syscw(current);
1536
1537        file_end_write(file_out);
1538
1539        return ret;
1540}
1541EXPORT_SYMBOL(vfs_copy_file_range);
1542
1543SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1544                int, fd_out, loff_t __user *, off_out,
1545                size_t, len, unsigned int, flags)
1546{
1547        loff_t pos_in;
1548        loff_t pos_out;
1549        struct fd f_in;
1550        struct fd f_out;
1551        ssize_t ret = -EBADF;
1552
1553        f_in = fdget(fd_in);
1554        if (!f_in.file)
1555                goto out2;
1556
1557        f_out = fdget(fd_out);
1558        if (!f_out.file)
1559                goto out1;
1560
1561        ret = -EFAULT;
1562        if (off_in) {
1563                if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1564                        goto out;
1565        } else {
1566                pos_in = f_in.file->f_pos;
1567        }
1568
1569        if (off_out) {
1570                if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1571                        goto out;
1572        } else {
1573                pos_out = f_out.file->f_pos;
1574        }
1575
1576        ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
1577                                  flags);
1578        if (ret > 0) {
1579                pos_in += ret;
1580                pos_out += ret;
1581
1582                if (off_in) {
1583                        if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1584                                ret = -EFAULT;
1585                } else {
1586                        f_in.file->f_pos = pos_in;
1587                }
1588
1589                if (off_out) {
1590                        if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1591                                ret = -EFAULT;
1592                } else {
1593                        f_out.file->f_pos = pos_out;
1594                }
1595        }
1596
1597out:
1598        fdput(f_out);
1599out1:
1600        fdput(f_in);
1601out2:
1602        return ret;
1603}
1604
1605/*
1606 * Don't operate on ranges the page cache doesn't support, and don't exceed the
1607 * LFS limits.  If pos is under the limit it becomes a short access.  If it
1608 * exceeds the limit we return -EFBIG.
1609 */
1610int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
1611{
1612        struct inode *inode = file->f_mapping->host;
1613        loff_t max_size = inode->i_sb->s_maxbytes;
1614        loff_t limit = rlimit(RLIMIT_FSIZE);
1615
1616        if (limit != RLIM_INFINITY) {
1617                if (pos >= limit) {
1618                        send_sig(SIGXFSZ, current, 0);
1619                        return -EFBIG;
1620                }
1621                *count = min(*count, limit - pos);
1622        }
1623
1624        if (!(file->f_flags & O_LARGEFILE))
1625                max_size = MAX_NON_LFS;
1626
1627        if (unlikely(pos >= max_size))
1628                return -EFBIG;
1629
1630        *count = min(*count, max_size - pos);
1631
1632        return 0;
1633}
1634
1635/*
1636 * Performs necessary checks before doing a write
1637 *
1638 * Can adjust writing position or amount of bytes to write.
1639 * Returns appropriate error code that caller should return or
1640 * zero in case that write should be allowed.
1641 */
1642ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
1643{
1644        struct file *file = iocb->ki_filp;
1645        struct inode *inode = file->f_mapping->host;
1646        loff_t count;
1647        int ret;
1648
1649        if (IS_SWAPFILE(inode))
1650                return -ETXTBSY;
1651
1652        if (!iov_iter_count(from))
1653                return 0;
1654
1655        /* FIXME: this is for backwards compatibility with 2.4 */
1656        if (iocb->ki_flags & IOCB_APPEND)
1657                iocb->ki_pos = i_size_read(inode);
1658
1659        if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
1660                return -EINVAL;
1661
1662        count = iov_iter_count(from);
1663        ret = generic_write_check_limits(file, iocb->ki_pos, &count);
1664        if (ret)
1665                return ret;
1666
1667        iov_iter_truncate(from, count);
1668        return iov_iter_count(from);
1669}
1670EXPORT_SYMBOL(generic_write_checks);
1671
1672/*
1673 * Performs common checks before doing a file copy/clone
1674 * from @file_in to @file_out.
1675 */
1676int generic_file_rw_checks(struct file *file_in, struct file *file_out)
1677{
1678        struct inode *inode_in = file_inode(file_in);
1679        struct inode *inode_out = file_inode(file_out);
1680
1681        /* Don't copy dirs, pipes, sockets... */
1682        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1683                return -EISDIR;
1684        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1685                return -EINVAL;
1686
1687        if (!(file_in->f_mode & FMODE_READ) ||
1688            !(file_out->f_mode & FMODE_WRITE) ||
1689            (file_out->f_flags & O_APPEND))
1690                return -EBADF;
1691
1692        return 0;
1693}
1694