linux/fs/read_write.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/read_write.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/slab.h>
   9#include <linux/stat.h>
  10#include <linux/sched/xacct.h>
  11#include <linux/fcntl.h>
  12#include <linux/file.h>
  13#include <linux/uio.h>
  14#include <linux/fsnotify.h>
  15#include <linux/security.h>
  16#include <linux/export.h>
  17#include <linux/syscalls.h>
  18#include <linux/pagemap.h>
  19#include <linux/splice.h>
  20#include <linux/compat.h>
  21#include <linux/mount.h>
  22#include <linux/fs.h>
  23#include "internal.h"
  24
  25#include <linux/uaccess.h>
  26#include <asm/unistd.h>
  27
  28const struct file_operations generic_ro_fops = {
  29        .llseek         = generic_file_llseek,
  30        .read_iter      = generic_file_read_iter,
  31        .mmap           = generic_file_readonly_mmap,
  32        .splice_read    = generic_file_splice_read,
  33};
  34
  35EXPORT_SYMBOL(generic_ro_fops);
  36
  37static inline bool unsigned_offsets(struct file *file)
  38{
  39        return file->f_mode & FMODE_UNSIGNED_OFFSET;
  40}
  41
  42/**
  43 * vfs_setpos - update the file offset for lseek
  44 * @file:       file structure in question
  45 * @offset:     file offset to seek to
  46 * @maxsize:    maximum file size
  47 *
  48 * This is a low-level filesystem helper for updating the file offset to
  49 * the value specified by @offset if the given offset is valid and it is
  50 * not equal to the current file offset.
  51 *
  52 * Return the specified offset on success and -EINVAL on invalid offset.
  53 */
  54loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  55{
  56        if (offset < 0 && !unsigned_offsets(file))
  57                return -EINVAL;
  58        if (offset > maxsize)
  59                return -EINVAL;
  60
  61        if (offset != file->f_pos) {
  62                file->f_pos = offset;
  63                file->f_version = 0;
  64        }
  65        return offset;
  66}
  67EXPORT_SYMBOL(vfs_setpos);
  68
  69/**
  70 * generic_file_llseek_size - generic llseek implementation for regular files
  71 * @file:       file structure to seek on
  72 * @offset:     file offset to seek to
  73 * @whence:     type of seek
  74 * @size:       max size of this file in file system
  75 * @eof:        offset used for SEEK_END position
  76 *
  77 * This is a variant of generic_file_llseek that allows passing in a custom
  78 * maximum file size and a custom EOF position, for e.g. hashed directories
  79 *
  80 * Synchronization:
  81 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  82 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  83 * read/writes behave like SEEK_SET against seeks.
  84 */
  85loff_t
  86generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  87                loff_t maxsize, loff_t eof)
  88{
  89        switch (whence) {
  90        case SEEK_END:
  91                offset += eof;
  92                break;
  93        case SEEK_CUR:
  94                /*
  95                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  96                 * position-querying operation.  Avoid rewriting the "same"
  97                 * f_pos value back to the file because a concurrent read(),
  98                 * write() or lseek() might have altered it
  99                 */
 100                if (offset == 0)
 101                        return file->f_pos;
 102                /*
 103                 * f_lock protects against read/modify/write race with other
 104                 * SEEK_CURs. Note that parallel writes and reads behave
 105                 * like SEEK_SET.
 106                 */
 107                spin_lock(&file->f_lock);
 108                offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 109                spin_unlock(&file->f_lock);
 110                return offset;
 111        case SEEK_DATA:
 112                /*
 113                 * In the generic case the entire file is data, so as long as
 114                 * offset isn't at the end of the file then the offset is data.
 115                 */
 116                if ((unsigned long long)offset >= eof)
 117                        return -ENXIO;
 118                break;
 119        case SEEK_HOLE:
 120                /*
 121                 * There is a virtual hole at the end of the file, so as long as
 122                 * offset isn't i_size or larger, return i_size.
 123                 */
 124                if ((unsigned long long)offset >= eof)
 125                        return -ENXIO;
 126                offset = eof;
 127                break;
 128        }
 129
 130        return vfs_setpos(file, offset, maxsize);
 131}
 132EXPORT_SYMBOL(generic_file_llseek_size);
 133
 134/**
 135 * generic_file_llseek - generic llseek implementation for regular files
 136 * @file:       file structure to seek on
 137 * @offset:     file offset to seek to
 138 * @whence:     type of seek
 139 *
 140 * This is a generic implemenation of ->llseek useable for all normal local
 141 * filesystems.  It just updates the file offset to the value specified by
 142 * @offset and @whence.
 143 */
 144loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 145{
 146        struct inode *inode = file->f_mapping->host;
 147
 148        return generic_file_llseek_size(file, offset, whence,
 149                                        inode->i_sb->s_maxbytes,
 150                                        i_size_read(inode));
 151}
 152EXPORT_SYMBOL(generic_file_llseek);
 153
 154/**
 155 * fixed_size_llseek - llseek implementation for fixed-sized devices
 156 * @file:       file structure to seek on
 157 * @offset:     file offset to seek to
 158 * @whence:     type of seek
 159 * @size:       size of the file
 160 *
 161 */
 162loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 163{
 164        switch (whence) {
 165        case SEEK_SET: case SEEK_CUR: case SEEK_END:
 166                return generic_file_llseek_size(file, offset, whence,
 167                                                size, size);
 168        default:
 169                return -EINVAL;
 170        }
 171}
 172EXPORT_SYMBOL(fixed_size_llseek);
 173
 174/**
 175 * no_seek_end_llseek - llseek implementation for fixed-sized devices
 176 * @file:       file structure to seek on
 177 * @offset:     file offset to seek to
 178 * @whence:     type of seek
 179 *
 180 */
 181loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
 182{
 183        switch (whence) {
 184        case SEEK_SET: case SEEK_CUR:
 185                return generic_file_llseek_size(file, offset, whence,
 186                                                OFFSET_MAX, 0);
 187        default:
 188                return -EINVAL;
 189        }
 190}
 191EXPORT_SYMBOL(no_seek_end_llseek);
 192
 193/**
 194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
 195 * @file:       file structure to seek on
 196 * @offset:     file offset to seek to
 197 * @whence:     type of seek
 198 * @size:       maximal offset allowed
 199 *
 200 */
 201loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
 202{
 203        switch (whence) {
 204        case SEEK_SET: case SEEK_CUR:
 205                return generic_file_llseek_size(file, offset, whence,
 206                                                size, 0);
 207        default:
 208                return -EINVAL;
 209        }
 210}
 211EXPORT_SYMBOL(no_seek_end_llseek_size);
 212
 213/**
 214 * noop_llseek - No Operation Performed llseek implementation
 215 * @file:       file structure to seek on
 216 * @offset:     file offset to seek to
 217 * @whence:     type of seek
 218 *
 219 * This is an implementation of ->llseek useable for the rare special case when
 220 * userspace expects the seek to succeed but the (device) file is actually not
 221 * able to perform the seek. In this case you use noop_llseek() instead of
 222 * falling back to the default implementation of ->llseek.
 223 */
 224loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 225{
 226        return file->f_pos;
 227}
 228EXPORT_SYMBOL(noop_llseek);
 229
 230loff_t no_llseek(struct file *file, loff_t offset, int whence)
 231{
 232        return -ESPIPE;
 233}
 234EXPORT_SYMBOL(no_llseek);
 235
 236loff_t default_llseek(struct file *file, loff_t offset, int whence)
 237{
 238        struct inode *inode = file_inode(file);
 239        loff_t retval;
 240
 241        inode_lock(inode);
 242        switch (whence) {
 243                case SEEK_END:
 244                        offset += i_size_read(inode);
 245                        break;
 246                case SEEK_CUR:
 247                        if (offset == 0) {
 248                                retval = file->f_pos;
 249                                goto out;
 250                        }
 251                        offset += file->f_pos;
 252                        break;
 253                case SEEK_DATA:
 254                        /*
 255                         * In the generic case the entire file is data, so as
 256                         * long as offset isn't at the end of the file then the
 257                         * offset is data.
 258                         */
 259                        if (offset >= inode->i_size) {
 260                                retval = -ENXIO;
 261                                goto out;
 262                        }
 263                        break;
 264                case SEEK_HOLE:
 265                        /*
 266                         * There is a virtual hole at the end of the file, so
 267                         * as long as offset isn't i_size or larger, return
 268                         * i_size.
 269                         */
 270                        if (offset >= inode->i_size) {
 271                                retval = -ENXIO;
 272                                goto out;
 273                        }
 274                        offset = inode->i_size;
 275                        break;
 276        }
 277        retval = -EINVAL;
 278        if (offset >= 0 || unsigned_offsets(file)) {
 279                if (offset != file->f_pos) {
 280                        file->f_pos = offset;
 281                        file->f_version = 0;
 282                }
 283                retval = offset;
 284        }
 285out:
 286        inode_unlock(inode);
 287        return retval;
 288}
 289EXPORT_SYMBOL(default_llseek);
 290
 291loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 292{
 293        loff_t (*fn)(struct file *, loff_t, int);
 294
 295        fn = no_llseek;
 296        if (file->f_mode & FMODE_LSEEK) {
 297                if (file->f_op->llseek)
 298                        fn = file->f_op->llseek;
 299        }
 300        return fn(file, offset, whence);
 301}
 302EXPORT_SYMBOL(vfs_llseek);
 303
 304off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
 305{
 306        off_t retval;
 307        struct fd f = fdget_pos(fd);
 308        if (!f.file)
 309                return -EBADF;
 310
 311        retval = -EINVAL;
 312        if (whence <= SEEK_MAX) {
 313                loff_t res = vfs_llseek(f.file, offset, whence);
 314                retval = res;
 315                if (res != (loff_t)retval)
 316                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
 317        }
 318        fdput_pos(f);
 319        return retval;
 320}
 321
 322SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 323{
 324        return ksys_lseek(fd, offset, whence);
 325}
 326
 327#ifdef CONFIG_COMPAT
 328COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
 329{
 330        return ksys_lseek(fd, offset, whence);
 331}
 332#endif
 333
 334#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT)
 335SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 336                unsigned long, offset_low, loff_t __user *, result,
 337                unsigned int, whence)
 338{
 339        int retval;
 340        struct fd f = fdget_pos(fd);
 341        loff_t offset;
 342
 343        if (!f.file)
 344                return -EBADF;
 345
 346        retval = -EINVAL;
 347        if (whence > SEEK_MAX)
 348                goto out_putf;
 349
 350        offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 351                        whence);
 352
 353        retval = (int)offset;
 354        if (offset >= 0) {
 355                retval = -EFAULT;
 356                if (!copy_to_user(result, &offset, sizeof(offset)))
 357                        retval = 0;
 358        }
 359out_putf:
 360        fdput_pos(f);
 361        return retval;
 362}
 363#endif
 364
 365int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 366{
 367        struct inode *inode;
 368        int retval = -EINVAL;
 369
 370        inode = file_inode(file);
 371        if (unlikely((ssize_t) count < 0))
 372                return retval;
 373
 374        /*
 375         * ranged mandatory locking does not apply to streams - it makes sense
 376         * only for files where position has a meaning.
 377         */
 378        if (ppos) {
 379                loff_t pos = *ppos;
 380
 381                if (unlikely(pos < 0)) {
 382                        if (!unsigned_offsets(file))
 383                                return retval;
 384                        if (count >= -pos) /* both values are in 0..LLONG_MAX */
 385                                return -EOVERFLOW;
 386                } else if (unlikely((loff_t) (pos + count) < 0)) {
 387                        if (!unsigned_offsets(file))
 388                                return retval;
 389                }
 390
 391                if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
 392                        retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
 393                                        read_write == READ ? F_RDLCK : F_WRLCK);
 394                        if (retval < 0)
 395                                return retval;
 396                }
 397        }
 398
 399        return security_file_permission(file,
 400                                read_write == READ ? MAY_READ : MAY_WRITE);
 401}
 402
 403static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 404{
 405        struct iovec iov = { .iov_base = buf, .iov_len = len };
 406        struct kiocb kiocb;
 407        struct iov_iter iter;
 408        ssize_t ret;
 409
 410        init_sync_kiocb(&kiocb, filp);
 411        kiocb.ki_pos = (ppos ? *ppos : 0);
 412        iov_iter_init(&iter, READ, &iov, 1, len);
 413
 414        ret = call_read_iter(filp, &kiocb, &iter);
 415        BUG_ON(ret == -EIOCBQUEUED);
 416        if (ppos)
 417                *ppos = kiocb.ki_pos;
 418        return ret;
 419}
 420
 421ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
 422                   loff_t *pos)
 423{
 424        if (file->f_op->read)
 425                return file->f_op->read(file, buf, count, pos);
 426        else if (file->f_op->read_iter)
 427                return new_sync_read(file, buf, count, pos);
 428        else
 429                return -EINVAL;
 430}
 431
 432ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 433{
 434        mm_segment_t old_fs;
 435        ssize_t result;
 436
 437        old_fs = get_fs();
 438        set_fs(KERNEL_DS);
 439        /* The cast to a user pointer is valid due to the set_fs() */
 440        result = vfs_read(file, (void __user *)buf, count, pos);
 441        set_fs(old_fs);
 442        return result;
 443}
 444EXPORT_SYMBOL(kernel_read);
 445
 446ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 447{
 448        ssize_t ret;
 449
 450        if (!(file->f_mode & FMODE_READ))
 451                return -EBADF;
 452        if (!(file->f_mode & FMODE_CAN_READ))
 453                return -EINVAL;
 454        if (unlikely(!access_ok(buf, count)))
 455                return -EFAULT;
 456
 457        ret = rw_verify_area(READ, file, pos, count);
 458        if (!ret) {
 459                if (count > MAX_RW_COUNT)
 460                        count =  MAX_RW_COUNT;
 461                ret = __vfs_read(file, buf, count, pos);
 462                if (ret > 0) {
 463                        fsnotify_access(file);
 464                        add_rchar(current, ret);
 465                }
 466                inc_syscr(current);
 467        }
 468
 469        return ret;
 470}
 471
 472static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 473{
 474        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 475        struct kiocb kiocb;
 476        struct iov_iter iter;
 477        ssize_t ret;
 478
 479        init_sync_kiocb(&kiocb, filp);
 480        kiocb.ki_pos = (ppos ? *ppos : 0);
 481        iov_iter_init(&iter, WRITE, &iov, 1, len);
 482
 483        ret = call_write_iter(filp, &kiocb, &iter);
 484        BUG_ON(ret == -EIOCBQUEUED);
 485        if (ret > 0 && ppos)
 486                *ppos = kiocb.ki_pos;
 487        return ret;
 488}
 489
 490static ssize_t __vfs_write(struct file *file, const char __user *p,
 491                           size_t count, loff_t *pos)
 492{
 493        if (file->f_op->write)
 494                return file->f_op->write(file, p, count, pos);
 495        else if (file->f_op->write_iter)
 496                return new_sync_write(file, p, count, pos);
 497        else
 498                return -EINVAL;
 499}
 500
 501ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
 502{
 503        mm_segment_t old_fs;
 504        const char __user *p;
 505        ssize_t ret;
 506
 507        if (!(file->f_mode & FMODE_CAN_WRITE))
 508                return -EINVAL;
 509
 510        old_fs = get_fs();
 511        set_fs(KERNEL_DS);
 512        p = (__force const char __user *)buf;
 513        if (count > MAX_RW_COUNT)
 514                count =  MAX_RW_COUNT;
 515        ret = __vfs_write(file, p, count, pos);
 516        set_fs(old_fs);
 517        if (ret > 0) {
 518                fsnotify_modify(file);
 519                add_wchar(current, ret);
 520        }
 521        inc_syscw(current);
 522        return ret;
 523}
 524EXPORT_SYMBOL(__kernel_write);
 525
 526ssize_t kernel_write(struct file *file, const void *buf, size_t count,
 527                            loff_t *pos)
 528{
 529        mm_segment_t old_fs;
 530        ssize_t res;
 531
 532        old_fs = get_fs();
 533        set_fs(KERNEL_DS);
 534        /* The cast to a user pointer is valid due to the set_fs() */
 535        res = vfs_write(file, (__force const char __user *)buf, count, pos);
 536        set_fs(old_fs);
 537
 538        return res;
 539}
 540EXPORT_SYMBOL(kernel_write);
 541
 542ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 543{
 544        ssize_t ret;
 545
 546        if (!(file->f_mode & FMODE_WRITE))
 547                return -EBADF;
 548        if (!(file->f_mode & FMODE_CAN_WRITE))
 549                return -EINVAL;
 550        if (unlikely(!access_ok(buf, count)))
 551                return -EFAULT;
 552
 553        ret = rw_verify_area(WRITE, file, pos, count);
 554        if (!ret) {
 555                if (count > MAX_RW_COUNT)
 556                        count =  MAX_RW_COUNT;
 557                file_start_write(file);
 558                ret = __vfs_write(file, buf, count, pos);
 559                if (ret > 0) {
 560                        fsnotify_modify(file);
 561                        add_wchar(current, ret);
 562                }
 563                inc_syscw(current);
 564                file_end_write(file);
 565        }
 566
 567        return ret;
 568}
 569
 570/* file_ppos returns &file->f_pos or NULL if file is stream */
 571static inline loff_t *file_ppos(struct file *file)
 572{
 573        return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
 574}
 575
 576ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 577{
 578        struct fd f = fdget_pos(fd);
 579        ssize_t ret = -EBADF;
 580
 581        if (f.file) {
 582                loff_t pos, *ppos = file_ppos(f.file);
 583                if (ppos) {
 584                        pos = *ppos;
 585                        ppos = &pos;
 586                }
 587                ret = vfs_read(f.file, buf, count, ppos);
 588                if (ret >= 0 && ppos)
 589                        f.file->f_pos = pos;
 590                fdput_pos(f);
 591        }
 592        return ret;
 593}
 594
 595SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 596{
 597        return ksys_read(fd, buf, count);
 598}
 599
 600ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
 601{
 602        struct fd f = fdget_pos(fd);
 603        ssize_t ret = -EBADF;
 604
 605        if (f.file) {
 606                loff_t pos, *ppos = file_ppos(f.file);
 607                if (ppos) {
 608                        pos = *ppos;
 609                        ppos = &pos;
 610                }
 611                ret = vfs_write(f.file, buf, count, ppos);
 612                if (ret >= 0 && ppos)
 613                        f.file->f_pos = pos;
 614                fdput_pos(f);
 615        }
 616
 617        return ret;
 618}
 619
 620SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 621                size_t, count)
 622{
 623        return ksys_write(fd, buf, count);
 624}
 625
 626ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
 627                     loff_t pos)
 628{
 629        struct fd f;
 630        ssize_t ret = -EBADF;
 631
 632        if (pos < 0)
 633                return -EINVAL;
 634
 635        f = fdget(fd);
 636        if (f.file) {
 637                ret = -ESPIPE;
 638                if (f.file->f_mode & FMODE_PREAD)
 639                        ret = vfs_read(f.file, buf, count, &pos);
 640                fdput(f);
 641        }
 642
 643        return ret;
 644}
 645
 646SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 647                        size_t, count, loff_t, pos)
 648{
 649        return ksys_pread64(fd, buf, count, pos);
 650}
 651
 652ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
 653                      size_t count, loff_t pos)
 654{
 655        struct fd f;
 656        ssize_t ret = -EBADF;
 657
 658        if (pos < 0)
 659                return -EINVAL;
 660
 661        f = fdget(fd);
 662        if (f.file) {
 663                ret = -ESPIPE;
 664                if (f.file->f_mode & FMODE_PWRITE)  
 665                        ret = vfs_write(f.file, buf, count, &pos);
 666                fdput(f);
 667        }
 668
 669        return ret;
 670}
 671
 672SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 673                         size_t, count, loff_t, pos)
 674{
 675        return ksys_pwrite64(fd, buf, count, pos);
 676}
 677
 678static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 679                loff_t *ppos, int type, rwf_t flags)
 680{
 681        struct kiocb kiocb;
 682        ssize_t ret;
 683
 684        init_sync_kiocb(&kiocb, filp);
 685        ret = kiocb_set_rw_flags(&kiocb, flags);
 686        if (ret)
 687                return ret;
 688        kiocb.ki_pos = (ppos ? *ppos : 0);
 689
 690        if (type == READ)
 691                ret = call_read_iter(filp, &kiocb, iter);
 692        else
 693                ret = call_write_iter(filp, &kiocb, iter);
 694        BUG_ON(ret == -EIOCBQUEUED);
 695        if (ppos)
 696                *ppos = kiocb.ki_pos;
 697        return ret;
 698}
 699
 700/* Do it by hand, with file-ops */
 701static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
 702                loff_t *ppos, int type, rwf_t flags)
 703{
 704        ssize_t ret = 0;
 705
 706        if (flags & ~RWF_HIPRI)
 707                return -EOPNOTSUPP;
 708
 709        while (iov_iter_count(iter)) {
 710                struct iovec iovec = iov_iter_iovec(iter);
 711                ssize_t nr;
 712
 713                if (type == READ) {
 714                        nr = filp->f_op->read(filp, iovec.iov_base,
 715                                              iovec.iov_len, ppos);
 716                } else {
 717                        nr = filp->f_op->write(filp, iovec.iov_base,
 718                                               iovec.iov_len, ppos);
 719                }
 720
 721                if (nr < 0) {
 722                        if (!ret)
 723                                ret = nr;
 724                        break;
 725                }
 726                ret += nr;
 727                if (nr != iovec.iov_len)
 728                        break;
 729                iov_iter_advance(iter, nr);
 730        }
 731
 732        return ret;
 733}
 734
 735/**
 736 * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
 737 *     into the kernel and check that it is valid.
 738 *
 739 * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
 740 * @uvector: Pointer to the userspace array.
 741 * @nr_segs: Number of elements in userspace array.
 742 * @fast_segs: Number of elements in @fast_pointer.
 743 * @fast_pointer: Pointer to (usually small on-stack) kernel array.
 744 * @ret_pointer: (output parameter) Pointer to a variable that will point to
 745 *     either @fast_pointer, a newly allocated kernel array, or NULL,
 746 *     depending on which array was used.
 747 *
 748 * This function copies an array of &struct iovec of @nr_segs from
 749 * userspace into the kernel and checks that each element is valid (e.g.
 750 * it does not point to a kernel address or cause overflow by being too
 751 * large, etc.).
 752 *
 753 * As an optimization, the caller may provide a pointer to a small
 754 * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
 755 * (the size of this array, or 0 if unused, should be given in @fast_segs).
 756 *
 757 * @ret_pointer will always point to the array that was used, so the
 758 * caller must take care not to call kfree() on it e.g. in case the
 759 * @fast_pointer array was used and it was allocated on the stack.
 760 *
 761 * Return: The total number of bytes covered by the iovec array on success
 762 *   or a negative error code on error.
 763 */
 764ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 765                              unsigned long nr_segs, unsigned long fast_segs,
 766                              struct iovec *fast_pointer,
 767                              struct iovec **ret_pointer)
 768{
 769        unsigned long seg;
 770        ssize_t ret;
 771        struct iovec *iov = fast_pointer;
 772
 773        /*
 774         * SuS says "The readv() function *may* fail if the iovcnt argument
 775         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 776         * traditionally returned zero for zero segments, so...
 777         */
 778        if (nr_segs == 0) {
 779                ret = 0;
 780                goto out;
 781        }
 782
 783        /*
 784         * First get the "struct iovec" from user memory and
 785         * verify all the pointers
 786         */
 787        if (nr_segs > UIO_MAXIOV) {
 788                ret = -EINVAL;
 789                goto out;
 790        }
 791        if (nr_segs > fast_segs) {
 792                iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
 793                if (iov == NULL) {
 794                        ret = -ENOMEM;
 795                        goto out;
 796                }
 797        }
 798        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 799                ret = -EFAULT;
 800                goto out;
 801        }
 802
 803        /*
 804         * According to the Single Unix Specification we should return EINVAL
 805         * if an element length is < 0 when cast to ssize_t or if the
 806         * total length would overflow the ssize_t return value of the
 807         * system call.
 808         *
 809         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 810         * overflow case.
 811         */
 812        ret = 0;
 813        for (seg = 0; seg < nr_segs; seg++) {
 814                void __user *buf = iov[seg].iov_base;
 815                ssize_t len = (ssize_t)iov[seg].iov_len;
 816
 817                /* see if we we're about to use an invalid len or if
 818                 * it's about to overflow ssize_t */
 819                if (len < 0) {
 820                        ret = -EINVAL;
 821                        goto out;
 822                }
 823                if (type >= 0
 824                    && unlikely(!access_ok(buf, len))) {
 825                        ret = -EFAULT;
 826                        goto out;
 827                }
 828                if (len > MAX_RW_COUNT - ret) {
 829                        len = MAX_RW_COUNT - ret;
 830                        iov[seg].iov_len = len;
 831                }
 832                ret += len;
 833        }
 834out:
 835        *ret_pointer = iov;
 836        return ret;
 837}
 838
 839#ifdef CONFIG_COMPAT
 840ssize_t compat_rw_copy_check_uvector(int type,
 841                const struct compat_iovec __user *uvector, unsigned long nr_segs,
 842                unsigned long fast_segs, struct iovec *fast_pointer,
 843                struct iovec **ret_pointer)
 844{
 845        compat_ssize_t tot_len;
 846        struct iovec *iov = *ret_pointer = fast_pointer;
 847        ssize_t ret = 0;
 848        int seg;
 849
 850        /*
 851         * SuS says "The readv() function *may* fail if the iovcnt argument
 852         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 853         * traditionally returned zero for zero segments, so...
 854         */
 855        if (nr_segs == 0)
 856                goto out;
 857
 858        ret = -EINVAL;
 859        if (nr_segs > UIO_MAXIOV)
 860                goto out;
 861        if (nr_segs > fast_segs) {
 862                ret = -ENOMEM;
 863                iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
 864                if (iov == NULL)
 865                        goto out;
 866        }
 867        *ret_pointer = iov;
 868
 869        ret = -EFAULT;
 870        if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
 871                goto out;
 872
 873        /*
 874         * Single unix specification:
 875         * We should -EINVAL if an element length is not >= 0 and fitting an
 876         * ssize_t.
 877         *
 878         * In Linux, the total length is limited to MAX_RW_COUNT, there is
 879         * no overflow possibility.
 880         */
 881        tot_len = 0;
 882        ret = -EINVAL;
 883        for (seg = 0; seg < nr_segs; seg++) {
 884                compat_uptr_t buf;
 885                compat_ssize_t len;
 886
 887                if (__get_user(len, &uvector->iov_len) ||
 888                   __get_user(buf, &uvector->iov_base)) {
 889                        ret = -EFAULT;
 890                        goto out;
 891                }
 892                if (len < 0)    /* size_t not fitting in compat_ssize_t .. */
 893                        goto out;
 894                if (type >= 0 &&
 895                    !access_ok(compat_ptr(buf), len)) {
 896                        ret = -EFAULT;
 897                        goto out;
 898                }
 899                if (len > MAX_RW_COUNT - tot_len)
 900                        len = MAX_RW_COUNT - tot_len;
 901                tot_len += len;
 902                iov->iov_base = compat_ptr(buf);
 903                iov->iov_len = (compat_size_t) len;
 904                uvector++;
 905                iov++;
 906        }
 907        ret = tot_len;
 908
 909out:
 910        return ret;
 911}
 912#endif
 913
 914static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
 915                loff_t *pos, rwf_t flags)
 916{
 917        size_t tot_len;
 918        ssize_t ret = 0;
 919
 920        if (!(file->f_mode & FMODE_READ))
 921                return -EBADF;
 922        if (!(file->f_mode & FMODE_CAN_READ))
 923                return -EINVAL;
 924
 925        tot_len = iov_iter_count(iter);
 926        if (!tot_len)
 927                goto out;
 928        ret = rw_verify_area(READ, file, pos, tot_len);
 929        if (ret < 0)
 930                return ret;
 931
 932        if (file->f_op->read_iter)
 933                ret = do_iter_readv_writev(file, iter, pos, READ, flags);
 934        else
 935                ret = do_loop_readv_writev(file, iter, pos, READ, flags);
 936out:
 937        if (ret >= 0)
 938                fsnotify_access(file);
 939        return ret;
 940}
 941
 942ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
 943                rwf_t flags)
 944{
 945        if (!file->f_op->read_iter)
 946                return -EINVAL;
 947        return do_iter_read(file, iter, ppos, flags);
 948}
 949EXPORT_SYMBOL(vfs_iter_read);
 950
 951static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
 952                loff_t *pos, rwf_t flags)
 953{
 954        size_t tot_len;
 955        ssize_t ret = 0;
 956
 957        if (!(file->f_mode & FMODE_WRITE))
 958                return -EBADF;
 959        if (!(file->f_mode & FMODE_CAN_WRITE))
 960                return -EINVAL;
 961
 962        tot_len = iov_iter_count(iter);
 963        if (!tot_len)
 964                return 0;
 965        ret = rw_verify_area(WRITE, file, pos, tot_len);
 966        if (ret < 0)
 967                return ret;
 968
 969        if (file->f_op->write_iter)
 970                ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
 971        else
 972                ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
 973        if (ret > 0)
 974                fsnotify_modify(file);
 975        return ret;
 976}
 977
 978ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
 979                rwf_t flags)
 980{
 981        if (!file->f_op->write_iter)
 982                return -EINVAL;
 983        return do_iter_write(file, iter, ppos, flags);
 984}
 985EXPORT_SYMBOL(vfs_iter_write);
 986
 987ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 988                  unsigned long vlen, loff_t *pos, rwf_t flags)
 989{
 990        struct iovec iovstack[UIO_FASTIOV];
 991        struct iovec *iov = iovstack;
 992        struct iov_iter iter;
 993        ssize_t ret;
 994
 995        ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
 996        if (ret >= 0) {
 997                ret = do_iter_read(file, &iter, pos, flags);
 998                kfree(iov);
 999        }
1000
1001        return ret;
1002}
1003
1004static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
1005                   unsigned long vlen, loff_t *pos, rwf_t flags)
1006{
1007        struct iovec iovstack[UIO_FASTIOV];
1008        struct iovec *iov = iovstack;
1009        struct iov_iter iter;
1010        ssize_t ret;
1011
1012        ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1013        if (ret >= 0) {
1014                file_start_write(file);
1015                ret = do_iter_write(file, &iter, pos, flags);
1016                file_end_write(file);
1017                kfree(iov);
1018        }
1019        return ret;
1020}
1021
1022static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
1023                        unsigned long vlen, rwf_t flags)
1024{
1025        struct fd f = fdget_pos(fd);
1026        ssize_t ret = -EBADF;
1027
1028        if (f.file) {
1029                loff_t pos, *ppos = file_ppos(f.file);
1030                if (ppos) {
1031                        pos = *ppos;
1032                        ppos = &pos;
1033                }
1034                ret = vfs_readv(f.file, vec, vlen, ppos, flags);
1035                if (ret >= 0 && ppos)
1036                        f.file->f_pos = pos;
1037                fdput_pos(f);
1038        }
1039
1040        if (ret > 0)
1041                add_rchar(current, ret);
1042        inc_syscr(current);
1043        return ret;
1044}
1045
1046static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
1047                         unsigned long vlen, rwf_t flags)
1048{
1049        struct fd f = fdget_pos(fd);
1050        ssize_t ret = -EBADF;
1051
1052        if (f.file) {
1053                loff_t pos, *ppos = file_ppos(f.file);
1054                if (ppos) {
1055                        pos = *ppos;
1056                        ppos = &pos;
1057                }
1058                ret = vfs_writev(f.file, vec, vlen, ppos, flags);
1059                if (ret >= 0 && ppos)
1060                        f.file->f_pos = pos;
1061                fdput_pos(f);
1062        }
1063
1064        if (ret > 0)
1065                add_wchar(current, ret);
1066        inc_syscw(current);
1067        return ret;
1068}
1069
1070static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
1071{
1072#define HALF_LONG_BITS (BITS_PER_LONG / 2)
1073        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
1074}
1075
1076static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1077                         unsigned long vlen, loff_t pos, rwf_t flags)
1078{
1079        struct fd f;
1080        ssize_t ret = -EBADF;
1081
1082        if (pos < 0)
1083                return -EINVAL;
1084
1085        f = fdget(fd);
1086        if (f.file) {
1087                ret = -ESPIPE;
1088                if (f.file->f_mode & FMODE_PREAD)
1089                        ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1090                fdput(f);
1091        }
1092
1093        if (ret > 0)
1094                add_rchar(current, ret);
1095        inc_syscr(current);
1096        return ret;
1097}
1098
1099static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1100                          unsigned long vlen, loff_t pos, rwf_t flags)
1101{
1102        struct fd f;
1103        ssize_t ret = -EBADF;
1104
1105        if (pos < 0)
1106                return -EINVAL;
1107
1108        f = fdget(fd);
1109        if (f.file) {
1110                ret = -ESPIPE;
1111                if (f.file->f_mode & FMODE_PWRITE)
1112                        ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1113                fdput(f);
1114        }
1115
1116        if (ret > 0)
1117                add_wchar(current, ret);
1118        inc_syscw(current);
1119        return ret;
1120}
1121
1122SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1123                unsigned long, vlen)
1124{
1125        return do_readv(fd, vec, vlen, 0);
1126}
1127
1128SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1129                unsigned long, vlen)
1130{
1131        return do_writev(fd, vec, vlen, 0);
1132}
1133
1134SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1135                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1136{
1137        loff_t pos = pos_from_hilo(pos_h, pos_l);
1138
1139        return do_preadv(fd, vec, vlen, pos, 0);
1140}
1141
1142SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1143                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1144                rwf_t, flags)
1145{
1146        loff_t pos = pos_from_hilo(pos_h, pos_l);
1147
1148        if (pos == -1)
1149                return do_readv(fd, vec, vlen, flags);
1150
1151        return do_preadv(fd, vec, vlen, pos, flags);
1152}
1153
1154SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1155                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1156{
1157        loff_t pos = pos_from_hilo(pos_h, pos_l);
1158
1159        return do_pwritev(fd, vec, vlen, pos, 0);
1160}
1161
1162SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1163                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1164                rwf_t, flags)
1165{
1166        loff_t pos = pos_from_hilo(pos_h, pos_l);
1167
1168        if (pos == -1)
1169                return do_writev(fd, vec, vlen, flags);
1170
1171        return do_pwritev(fd, vec, vlen, pos, flags);
1172}
1173
1174#ifdef CONFIG_COMPAT
1175static size_t compat_readv(struct file *file,
1176                           const struct compat_iovec __user *vec,
1177                           unsigned long vlen, loff_t *pos, rwf_t flags)
1178{
1179        struct iovec iovstack[UIO_FASTIOV];
1180        struct iovec *iov = iovstack;
1181        struct iov_iter iter;
1182        ssize_t ret;
1183
1184        ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
1185        if (ret >= 0) {
1186                ret = do_iter_read(file, &iter, pos, flags);
1187                kfree(iov);
1188        }
1189        if (ret > 0)
1190                add_rchar(current, ret);
1191        inc_syscr(current);
1192        return ret;
1193}
1194
1195static size_t do_compat_readv(compat_ulong_t fd,
1196                                 const struct compat_iovec __user *vec,
1197                                 compat_ulong_t vlen, rwf_t flags)
1198{
1199        struct fd f = fdget_pos(fd);
1200        ssize_t ret;
1201        loff_t pos;
1202
1203        if (!f.file)
1204                return -EBADF;
1205        pos = f.file->f_pos;
1206        ret = compat_readv(f.file, vec, vlen, &pos, flags);
1207        if (ret >= 0)
1208                f.file->f_pos = pos;
1209        fdput_pos(f);
1210        return ret;
1211
1212}
1213
1214COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1215                const struct compat_iovec __user *,vec,
1216                compat_ulong_t, vlen)
1217{
1218        return do_compat_readv(fd, vec, vlen, 0);
1219}
1220
1221static long do_compat_preadv64(unsigned long fd,
1222                                  const struct compat_iovec __user *vec,
1223                                  unsigned long vlen, loff_t pos, rwf_t flags)
1224{
1225        struct fd f;
1226        ssize_t ret;
1227
1228        if (pos < 0)
1229                return -EINVAL;
1230        f = fdget(fd);
1231        if (!f.file)
1232                return -EBADF;
1233        ret = -ESPIPE;
1234        if (f.file->f_mode & FMODE_PREAD)
1235                ret = compat_readv(f.file, vec, vlen, &pos, flags);
1236        fdput(f);
1237        return ret;
1238}
1239
1240#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
1241COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1242                const struct compat_iovec __user *,vec,
1243                unsigned long, vlen, loff_t, pos)
1244{
1245        return do_compat_preadv64(fd, vec, vlen, pos, 0);
1246}
1247#endif
1248
1249COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1250                const struct compat_iovec __user *,vec,
1251                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1252{
1253        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1254
1255        return do_compat_preadv64(fd, vec, vlen, pos, 0);
1256}
1257
1258#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
1259COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
1260                const struct compat_iovec __user *,vec,
1261                unsigned long, vlen, loff_t, pos, rwf_t, flags)
1262{
1263        if (pos == -1)
1264                return do_compat_readv(fd, vec, vlen, flags);
1265
1266        return do_compat_preadv64(fd, vec, vlen, pos, flags);
1267}
1268#endif
1269
1270COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1271                const struct compat_iovec __user *,vec,
1272                compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1273                rwf_t, flags)
1274{
1275        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1276
1277        if (pos == -1)
1278                return do_compat_readv(fd, vec, vlen, flags);
1279
1280        return do_compat_preadv64(fd, vec, vlen, pos, flags);
1281}
1282
1283static size_t compat_writev(struct file *file,
1284                            const struct compat_iovec __user *vec,
1285                            unsigned long vlen, loff_t *pos, rwf_t flags)
1286{
1287        struct iovec iovstack[UIO_FASTIOV];
1288        struct iovec *iov = iovstack;
1289        struct iov_iter iter;
1290        ssize_t ret;
1291
1292        ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
1293        if (ret >= 0) {
1294                file_start_write(file);
1295                ret = do_iter_write(file, &iter, pos, flags);
1296                file_end_write(file);
1297                kfree(iov);
1298        }
1299        if (ret > 0)
1300                add_wchar(current, ret);
1301        inc_syscw(current);
1302        return ret;
1303}
1304
1305static size_t do_compat_writev(compat_ulong_t fd,
1306                                  const struct compat_iovec __user* vec,
1307                                  compat_ulong_t vlen, rwf_t flags)
1308{
1309        struct fd f = fdget_pos(fd);
1310        ssize_t ret;
1311        loff_t pos;
1312
1313        if (!f.file)
1314                return -EBADF;
1315        pos = f.file->f_pos;
1316        ret = compat_writev(f.file, vec, vlen, &pos, flags);
1317        if (ret >= 0)
1318                f.file->f_pos = pos;
1319        fdput_pos(f);
1320        return ret;
1321}
1322
1323COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1324                const struct compat_iovec __user *, vec,
1325                compat_ulong_t, vlen)
1326{
1327        return do_compat_writev(fd, vec, vlen, 0);
1328}
1329
1330static long do_compat_pwritev64(unsigned long fd,
1331                                   const struct compat_iovec __user *vec,
1332                                   unsigned long vlen, loff_t pos, rwf_t flags)
1333{
1334        struct fd f;
1335        ssize_t ret;
1336
1337        if (pos < 0)
1338                return -EINVAL;
1339        f = fdget(fd);
1340        if (!f.file)
1341                return -EBADF;
1342        ret = -ESPIPE;
1343        if (f.file->f_mode & FMODE_PWRITE)
1344                ret = compat_writev(f.file, vec, vlen, &pos, flags);
1345        fdput(f);
1346        return ret;
1347}
1348
1349#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
1350COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1351                const struct compat_iovec __user *,vec,
1352                unsigned long, vlen, loff_t, pos)
1353{
1354        return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1355}
1356#endif
1357
1358COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1359                const struct compat_iovec __user *,vec,
1360                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1361{
1362        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1363
1364        return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1365}
1366
1367#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
1368COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
1369                const struct compat_iovec __user *,vec,
1370                unsigned long, vlen, loff_t, pos, rwf_t, flags)
1371{
1372        if (pos == -1)
1373                return do_compat_writev(fd, vec, vlen, flags);
1374
1375        return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1376}
1377#endif
1378
1379COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1380                const struct compat_iovec __user *,vec,
1381                compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
1382{
1383        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1384
1385        if (pos == -1)
1386                return do_compat_writev(fd, vec, vlen, flags);
1387
1388        return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1389}
1390
1391#endif
1392
1393static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1394                           size_t count, loff_t max)
1395{
1396        struct fd in, out;
1397        struct inode *in_inode, *out_inode;
1398        loff_t pos;
1399        loff_t out_pos;
1400        ssize_t retval;
1401        int fl;
1402
1403        /*
1404         * Get input file, and verify that it is ok..
1405         */
1406        retval = -EBADF;
1407        in = fdget(in_fd);
1408        if (!in.file)
1409                goto out;
1410        if (!(in.file->f_mode & FMODE_READ))
1411                goto fput_in;
1412        retval = -ESPIPE;
1413        if (!ppos) {
1414                pos = in.file->f_pos;
1415        } else {
1416                pos = *ppos;
1417                if (!(in.file->f_mode & FMODE_PREAD))
1418                        goto fput_in;
1419        }
1420        retval = rw_verify_area(READ, in.file, &pos, count);
1421        if (retval < 0)
1422                goto fput_in;
1423        if (count > MAX_RW_COUNT)
1424                count =  MAX_RW_COUNT;
1425
1426        /*
1427         * Get output file, and verify that it is ok..
1428         */
1429        retval = -EBADF;
1430        out = fdget(out_fd);
1431        if (!out.file)
1432                goto fput_in;
1433        if (!(out.file->f_mode & FMODE_WRITE))
1434                goto fput_out;
1435        in_inode = file_inode(in.file);
1436        out_inode = file_inode(out.file);
1437        out_pos = out.file->f_pos;
1438        retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1439        if (retval < 0)
1440                goto fput_out;
1441
1442        if (!max)
1443                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1444
1445        if (unlikely(pos + count > max)) {
1446                retval = -EOVERFLOW;
1447                if (pos >= max)
1448                        goto fput_out;
1449                count = max - pos;
1450        }
1451
1452        fl = 0;
1453#if 0
1454        /*
1455         * We need to debate whether we can enable this or not. The
1456         * man page documents EAGAIN return for the output at least,
1457         * and the application is arguably buggy if it doesn't expect
1458         * EAGAIN on a non-blocking file descriptor.
1459         */
1460        if (in.file->f_flags & O_NONBLOCK)
1461                fl = SPLICE_F_NONBLOCK;
1462#endif
1463        file_start_write(out.file);
1464        retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1465        file_end_write(out.file);
1466
1467        if (retval > 0) {
1468                add_rchar(current, retval);
1469                add_wchar(current, retval);
1470                fsnotify_access(in.file);
1471                fsnotify_modify(out.file);
1472                out.file->f_pos = out_pos;
1473                if (ppos)
1474                        *ppos = pos;
1475                else
1476                        in.file->f_pos = pos;
1477        }
1478
1479        inc_syscr(current);
1480        inc_syscw(current);
1481        if (pos > max)
1482                retval = -EOVERFLOW;
1483
1484fput_out:
1485        fdput(out);
1486fput_in:
1487        fdput(in);
1488out:
1489        return retval;
1490}
1491
1492SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1493{
1494        loff_t pos;
1495        off_t off;
1496        ssize_t ret;
1497
1498        if (offset) {
1499                if (unlikely(get_user(off, offset)))
1500                        return -EFAULT;
1501                pos = off;
1502                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1503                if (unlikely(put_user(pos, offset)))
1504                        return -EFAULT;
1505                return ret;
1506        }
1507
1508        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1509}
1510
1511SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1512{
1513        loff_t pos;
1514        ssize_t ret;
1515
1516        if (offset) {
1517                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1518                        return -EFAULT;
1519                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1520                if (unlikely(put_user(pos, offset)))
1521                        return -EFAULT;
1522                return ret;
1523        }
1524
1525        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1526}
1527
1528#ifdef CONFIG_COMPAT
1529COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1530                compat_off_t __user *, offset, compat_size_t, count)
1531{
1532        loff_t pos;
1533        off_t off;
1534        ssize_t ret;
1535
1536        if (offset) {
1537                if (unlikely(get_user(off, offset)))
1538                        return -EFAULT;
1539                pos = off;
1540                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1541                if (unlikely(put_user(pos, offset)))
1542                        return -EFAULT;
1543                return ret;
1544        }
1545
1546        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1547}
1548
1549COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1550                compat_loff_t __user *, offset, compat_size_t, count)
1551{
1552        loff_t pos;
1553        ssize_t ret;
1554
1555        if (offset) {
1556                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1557                        return -EFAULT;
1558                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1559                if (unlikely(put_user(pos, offset)))
1560                        return -EFAULT;
1561                return ret;
1562        }
1563
1564        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1565}
1566#endif
1567
1568/**
1569 * generic_copy_file_range - copy data between two files
1570 * @file_in:    file structure to read from
1571 * @pos_in:     file offset to read from
1572 * @file_out:   file structure to write data to
1573 * @pos_out:    file offset to write data to
1574 * @len:        amount of data to copy
1575 * @flags:      copy flags
1576 *
1577 * This is a generic filesystem helper to copy data from one file to another.
1578 * It has no constraints on the source or destination file owners - the files
1579 * can belong to different superblocks and different filesystem types. Short
1580 * copies are allowed.
1581 *
1582 * This should be called from the @file_out filesystem, as per the
1583 * ->copy_file_range() method.
1584 *
1585 * Returns the number of bytes copied or a negative error indicating the
1586 * failure.
1587 */
1588
1589ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
1590                                struct file *file_out, loff_t pos_out,
1591                                size_t len, unsigned int flags)
1592{
1593        return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1594                                len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1595}
1596EXPORT_SYMBOL(generic_copy_file_range);
1597
1598static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
1599                                  struct file *file_out, loff_t pos_out,
1600                                  size_t len, unsigned int flags)
1601{
1602        /*
1603         * Although we now allow filesystems to handle cross sb copy, passing
1604         * a file of the wrong filesystem type to filesystem driver can result
1605         * in an attempt to dereference the wrong type of ->private_data, so
1606         * avoid doing that until we really have a good reason.  NFS defines
1607         * several different file_system_type structures, but they all end up
1608         * using the same ->copy_file_range() function pointer.
1609         */
1610        if (file_out->f_op->copy_file_range &&
1611            file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
1612                return file_out->f_op->copy_file_range(file_in, pos_in,
1613                                                       file_out, pos_out,
1614                                                       len, flags);
1615
1616        return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1617                                       flags);
1618}
1619
1620/*
1621 * copy_file_range() differs from regular file read and write in that it
1622 * specifically allows return partial success.  When it does so is up to
1623 * the copy_file_range method.
1624 */
1625ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1626                            struct file *file_out, loff_t pos_out,
1627                            size_t len, unsigned int flags)
1628{
1629        ssize_t ret;
1630
1631        if (flags != 0)
1632                return -EINVAL;
1633
1634        ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
1635                                       flags);
1636        if (unlikely(ret))
1637                return ret;
1638
1639        ret = rw_verify_area(READ, file_in, &pos_in, len);
1640        if (unlikely(ret))
1641                return ret;
1642
1643        ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1644        if (unlikely(ret))
1645                return ret;
1646
1647        if (len == 0)
1648                return 0;
1649
1650        file_start_write(file_out);
1651
1652        /*
1653         * Try cloning first, this is supported by more file systems, and
1654         * more efficient if both clone and copy are supported (e.g. NFS).
1655         */
1656        if (file_in->f_op->remap_file_range &&
1657            file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
1658                loff_t cloned;
1659
1660                cloned = file_in->f_op->remap_file_range(file_in, pos_in,
1661                                file_out, pos_out,
1662                                min_t(loff_t, MAX_RW_COUNT, len),
1663                                REMAP_FILE_CAN_SHORTEN);
1664                if (cloned > 0) {
1665                        ret = cloned;
1666                        goto done;
1667                }
1668        }
1669
1670        ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1671                                flags);
1672        WARN_ON_ONCE(ret == -EOPNOTSUPP);
1673done:
1674        if (ret > 0) {
1675                fsnotify_access(file_in);
1676                add_rchar(current, ret);
1677                fsnotify_modify(file_out);
1678                add_wchar(current, ret);
1679        }
1680
1681        inc_syscr(current);
1682        inc_syscw(current);
1683
1684        file_end_write(file_out);
1685
1686        return ret;
1687}
1688EXPORT_SYMBOL(vfs_copy_file_range);
1689
1690SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1691                int, fd_out, loff_t __user *, off_out,
1692                size_t, len, unsigned int, flags)
1693{
1694        loff_t pos_in;
1695        loff_t pos_out;
1696        struct fd f_in;
1697        struct fd f_out;
1698        ssize_t ret = -EBADF;
1699
1700        f_in = fdget(fd_in);
1701        if (!f_in.file)
1702                goto out2;
1703
1704        f_out = fdget(fd_out);
1705        if (!f_out.file)
1706                goto out1;
1707
1708        ret = -EFAULT;
1709        if (off_in) {
1710                if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1711                        goto out;
1712        } else {
1713                pos_in = f_in.file->f_pos;
1714        }
1715
1716        if (off_out) {
1717                if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1718                        goto out;
1719        } else {
1720                pos_out = f_out.file->f_pos;
1721        }
1722
1723        ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
1724                                  flags);
1725        if (ret > 0) {
1726                pos_in += ret;
1727                pos_out += ret;
1728
1729                if (off_in) {
1730                        if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1731                                ret = -EFAULT;
1732                } else {
1733                        f_in.file->f_pos = pos_in;
1734                }
1735
1736                if (off_out) {
1737                        if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1738                                ret = -EFAULT;
1739                } else {
1740                        f_out.file->f_pos = pos_out;
1741                }
1742        }
1743
1744out:
1745        fdput(f_out);
1746out1:
1747        fdput(f_in);
1748out2:
1749        return ret;
1750}
1751
1752static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
1753                             bool write)
1754{
1755        struct inode *inode = file_inode(file);
1756
1757        if (unlikely(pos < 0 || len < 0))
1758                return -EINVAL;
1759
1760         if (unlikely((loff_t) (pos + len) < 0))
1761                return -EINVAL;
1762
1763        if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
1764                loff_t end = len ? pos + len - 1 : OFFSET_MAX;
1765                int retval;
1766
1767                retval = locks_mandatory_area(inode, file, pos, end,
1768                                write ? F_WRLCK : F_RDLCK);
1769                if (retval < 0)
1770                        return retval;
1771        }
1772
1773        return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
1774}
1775/*
1776 * Ensure that we don't remap a partial EOF block in the middle of something
1777 * else.  Assume that the offsets have already been checked for block
1778 * alignment.
1779 *
1780 * For deduplication we always scale down to the previous block because we
1781 * can't meaningfully compare post-EOF contents.
1782 *
1783 * For clone we only link a partial EOF block above the destination file's EOF.
1784 *
1785 * Shorten the request if possible.
1786 */
1787static int generic_remap_check_len(struct inode *inode_in,
1788                                   struct inode *inode_out,
1789                                   loff_t pos_out,
1790                                   loff_t *len,
1791                                   unsigned int remap_flags)
1792{
1793        u64 blkmask = i_blocksize(inode_in) - 1;
1794        loff_t new_len = *len;
1795
1796        if ((*len & blkmask) == 0)
1797                return 0;
1798
1799        if ((remap_flags & REMAP_FILE_DEDUP) ||
1800            pos_out + *len < i_size_read(inode_out))
1801                new_len &= ~blkmask;
1802
1803        if (new_len == *len)
1804                return 0;
1805
1806        if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
1807                *len = new_len;
1808                return 0;
1809        }
1810
1811        return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
1812}
1813
1814/* Read a page's worth of file data into the page cache. */
1815static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
1816{
1817        struct page *page;
1818
1819        page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
1820        if (IS_ERR(page))
1821                return page;
1822        if (!PageUptodate(page)) {
1823                put_page(page);
1824                return ERR_PTR(-EIO);
1825        }
1826        return page;
1827}
1828
1829/*
1830 * Lock two pages, ensuring that we lock in offset order if the pages are from
1831 * the same file.
1832 */
1833static void vfs_lock_two_pages(struct page *page1, struct page *page2)
1834{
1835        /* Always lock in order of increasing index. */
1836        if (page1->index > page2->index)
1837                swap(page1, page2);
1838
1839        lock_page(page1);
1840        if (page1 != page2)
1841                lock_page(page2);
1842}
1843
/*
 * Release the page lock on both pages.  When both arguments refer to the
 * same page it is unlocked only once.
 */
static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
{
	unlock_page(page1);
	if (page2 == page1)
		return;

	unlock_page(page2);
}
1851
1852/*
1853 * Compare extents of two files to see if they are the same.
1854 * Caller must have locked both inodes to prevent write races.
1855 */
1856static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
1857                                         struct inode *dest, loff_t destoff,
1858                                         loff_t len, bool *is_same)
1859{
1860        loff_t src_poff;
1861        loff_t dest_poff;
1862        void *src_addr;
1863        void *dest_addr;
1864        struct page *src_page;
1865        struct page *dest_page;
1866        loff_t cmp_len;
1867        bool same;
1868        int error;
1869
1870        error = -EINVAL;
1871        same = true;
1872        while (len) {
1873                src_poff = srcoff & (PAGE_SIZE - 1);
1874                dest_poff = destoff & (PAGE_SIZE - 1);
1875                cmp_len = min(PAGE_SIZE - src_poff,
1876                              PAGE_SIZE - dest_poff);
1877                cmp_len = min(cmp_len, len);
1878                if (cmp_len <= 0)
1879                        goto out_error;
1880
1881                src_page = vfs_dedupe_get_page(src, srcoff);
1882                if (IS_ERR(src_page)) {
1883                        error = PTR_ERR(src_page);
1884                        goto out_error;
1885                }
1886                dest_page = vfs_dedupe_get_page(dest, destoff);
1887                if (IS_ERR(dest_page)) {
1888                        error = PTR_ERR(dest_page);
1889                        put_page(src_page);
1890                        goto out_error;
1891                }
1892
1893                vfs_lock_two_pages(src_page, dest_page);
1894
1895                /*
1896                 * Now that we've locked both pages, make sure they're still
1897                 * mapped to the file data we're interested in.  If not,
1898                 * someone is invalidating pages on us and we lose.
1899                 */
1900                if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
1901                    src_page->mapping != src->i_mapping ||
1902                    dest_page->mapping != dest->i_mapping) {
1903                        same = false;
1904                        goto unlock;
1905                }
1906
1907                src_addr = kmap_atomic(src_page);
1908                dest_addr = kmap_atomic(dest_page);
1909
1910                flush_dcache_page(src_page);
1911                flush_dcache_page(dest_page);
1912
1913                if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
1914                        same = false;
1915
1916                kunmap_atomic(dest_addr);
1917                kunmap_atomic(src_addr);
1918unlock:
1919                vfs_unlock_two_pages(src_page, dest_page);
1920                put_page(dest_page);
1921                put_page(src_page);
1922
1923                if (!same)
1924                        break;
1925
1926                srcoff += cmp_len;
1927                destoff += cmp_len;
1928                len -= cmp_len;
1929        }
1930
1931        *is_same = same;
1932        return 0;
1933
1934out_error:
1935        return error;
1936}
1937
1938/*
1939 * Check that the two inodes are eligible for cloning, the ranges make
1940 * sense, and then flush all dirty data.  Caller must ensure that the
1941 * inodes have been locked against any other modifications.
1942 *
1943 * If there's an error, then the usual negative error code is returned.
1944 * Otherwise returns 0 with *len set to the request length.
1945 */
1946int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
1947                                  struct file *file_out, loff_t pos_out,
1948                                  loff_t *len, unsigned int remap_flags)
1949{
1950        struct inode *inode_in = file_inode(file_in);
1951        struct inode *inode_out = file_inode(file_out);
1952        bool same_inode = (inode_in == inode_out);
1953        int ret;
1954
1955        /* Don't touch certain kinds of inodes */
1956        if (IS_IMMUTABLE(inode_out))
1957                return -EPERM;
1958
1959        if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1960                return -ETXTBSY;
1961
1962        /* Don't reflink dirs, pipes, sockets... */
1963        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1964                return -EISDIR;
1965        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1966                return -EINVAL;
1967
1968        /* Zero length dedupe exits immediately; reflink goes to EOF. */
1969        if (*len == 0) {
1970                loff_t isize = i_size_read(inode_in);
1971
1972                if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
1973                        return 0;
1974                if (pos_in > isize)
1975                        return -EINVAL;
1976                *len = isize - pos_in;
1977                if (*len == 0)
1978                        return 0;
1979        }
1980
1981        /* Check that we don't violate system file offset limits. */
1982        ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
1983                        remap_flags);
1984        if (ret)
1985                return ret;
1986
1987        /* Wait for the completion of any pending IOs on both files */
1988        inode_dio_wait(inode_in);
1989        if (!same_inode)
1990                inode_dio_wait(inode_out);
1991
1992        ret = filemap_write_and_wait_range(inode_in->i_mapping,
1993                        pos_in, pos_in + *len - 1);
1994        if (ret)
1995                return ret;
1996
1997        ret = filemap_write_and_wait_range(inode_out->i_mapping,
1998                        pos_out, pos_out + *len - 1);
1999        if (ret)
2000                return ret;
2001
2002        /*
2003         * Check that the extents are the same.
2004         */
2005        if (remap_flags & REMAP_FILE_DEDUP) {
2006                bool            is_same = false;
2007
2008                ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
2009                                inode_out, pos_out, *len, &is_same);
2010                if (ret)
2011                        return ret;
2012                if (!is_same)
2013                        return -EBADE;
2014        }
2015
2016        ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
2017                        remap_flags);
2018        if (ret)
2019                return ret;
2020
2021        /* If can't alter the file contents, we're done. */
2022        if (!(remap_flags & REMAP_FILE_DEDUP))
2023                ret = file_modified(file_out);
2024
2025        return ret;
2026}
2027EXPORT_SYMBOL(generic_remap_file_range_prep);
2028
2029loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
2030                           struct file *file_out, loff_t pos_out,
2031                           loff_t len, unsigned int remap_flags)
2032{
2033        loff_t ret;
2034
2035        WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
2036
2037        /*
2038         * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
2039         * the same mount. Practically, they only need to be on the same file
2040         * system.
2041         */
2042        if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
2043                return -EXDEV;
2044
2045        ret = generic_file_rw_checks(file_in, file_out);
2046        if (ret < 0)
2047                return ret;
2048
2049        if (!file_in->f_op->remap_file_range)
2050                return -EOPNOTSUPP;
2051
2052        ret = remap_verify_area(file_in, pos_in, len, false);
2053        if (ret)
2054                return ret;
2055
2056        ret = remap_verify_area(file_out, pos_out, len, true);
2057        if (ret)
2058                return ret;
2059
2060        ret = file_in->f_op->remap_file_range(file_in, pos_in,
2061                        file_out, pos_out, len, remap_flags);
2062        if (ret < 0)
2063                return ret;
2064
2065        fsnotify_access(file_in);
2066        fsnotify_modify(file_out);
2067        return ret;
2068}
2069EXPORT_SYMBOL(do_clone_file_range);
2070
2071loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
2072                            struct file *file_out, loff_t pos_out,
2073                            loff_t len, unsigned int remap_flags)
2074{
2075        loff_t ret;
2076
2077        file_start_write(file_out);
2078        ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
2079                                  remap_flags);
2080        file_end_write(file_out);
2081
2082        return ret;
2083}
2084EXPORT_SYMBOL(vfs_clone_file_range);
2085
2086/* Check whether we are allowed to dedupe the destination file */
2087static bool allow_file_dedupe(struct file *file)
2088{
2089        if (capable(CAP_SYS_ADMIN))
2090                return true;
2091        if (file->f_mode & FMODE_WRITE)
2092                return true;
2093        if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
2094                return true;
2095        if (!inode_permission(file_inode(file), MAY_WRITE))
2096                return true;
2097        return false;
2098}
2099
2100loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
2101                                 struct file *dst_file, loff_t dst_pos,
2102                                 loff_t len, unsigned int remap_flags)
2103{
2104        loff_t ret;
2105
2106        WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
2107                                     REMAP_FILE_CAN_SHORTEN));
2108
2109        ret = mnt_want_write_file(dst_file);
2110        if (ret)
2111                return ret;
2112
2113        ret = remap_verify_area(dst_file, dst_pos, len, true);
2114        if (ret < 0)
2115                goto out_drop_write;
2116
2117        ret = -EPERM;
2118        if (!allow_file_dedupe(dst_file))
2119                goto out_drop_write;
2120
2121        ret = -EXDEV;
2122        if (src_file->f_path.mnt != dst_file->f_path.mnt)
2123                goto out_drop_write;
2124
2125        ret = -EISDIR;
2126        if (S_ISDIR(file_inode(dst_file)->i_mode))
2127                goto out_drop_write;
2128
2129        ret = -EINVAL;
2130        if (!dst_file->f_op->remap_file_range)
2131                goto out_drop_write;
2132
2133        if (len == 0) {
2134                ret = 0;
2135                goto out_drop_write;
2136        }
2137
2138        ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
2139                        dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
2140out_drop_write:
2141        mnt_drop_write_file(dst_file);
2142
2143        return ret;
2144}
2145EXPORT_SYMBOL(vfs_dedupe_file_range_one);
2146
2147int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
2148{
2149        struct file_dedupe_range_info *info;
2150        struct inode *src = file_inode(file);
2151        u64 off;
2152        u64 len;
2153        int i;
2154        int ret;
2155        u16 count = same->dest_count;
2156        loff_t deduped;
2157
2158        if (!(file->f_mode & FMODE_READ))
2159                return -EINVAL;
2160
2161        if (same->reserved1 || same->reserved2)
2162                return -EINVAL;
2163
2164        off = same->src_offset;
2165        len = same->src_length;
2166
2167        if (S_ISDIR(src->i_mode))
2168                return -EISDIR;
2169
2170        if (!S_ISREG(src->i_mode))
2171                return -EINVAL;
2172
2173        if (!file->f_op->remap_file_range)
2174                return -EOPNOTSUPP;
2175
2176        ret = remap_verify_area(file, off, len, false);
2177        if (ret < 0)
2178                return ret;
2179        ret = 0;
2180
2181        if (off + len > i_size_read(src))
2182                return -EINVAL;
2183
2184        /* Arbitrary 1G limit on a single dedupe request, can be raised. */
2185        len = min_t(u64, len, 1 << 30);
2186
2187        /* pre-format output fields to sane values */
2188        for (i = 0; i < count; i++) {
2189                same->info[i].bytes_deduped = 0ULL;
2190                same->info[i].status = FILE_DEDUPE_RANGE_SAME;
2191        }
2192
2193        for (i = 0, info = same->info; i < count; i++, info++) {
2194                struct fd dst_fd = fdget(info->dest_fd);
2195                struct file *dst_file = dst_fd.file;
2196
2197                if (!dst_file) {
2198                        info->status = -EBADF;
2199                        goto next_loop;
2200                }
2201
2202                if (info->reserved) {
2203                        info->status = -EINVAL;
2204                        goto next_fdput;
2205                }
2206
2207                deduped = vfs_dedupe_file_range_one(file, off, dst_file,
2208                                                    info->dest_offset, len,
2209                                                    REMAP_FILE_CAN_SHORTEN);
2210                if (deduped == -EBADE)
2211                        info->status = FILE_DEDUPE_RANGE_DIFFERS;
2212                else if (deduped < 0)
2213                        info->status = deduped;
2214                else
2215                        info->bytes_deduped = len;
2216
2217next_fdput:
2218                fdput(dst_fd);
2219next_loop:
2220                if (fatal_signal_pending(current))
2221                        break;
2222        }
2223        return ret;
2224}
2225EXPORT_SYMBOL(vfs_dedupe_file_range);
2226