linux/fs/read_write.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/read_write.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/slab.h>
   9#include <linux/stat.h>
  10#include <linux/sched/xacct.h>
  11#include <linux/fcntl.h>
  12#include <linux/file.h>
  13#include <linux/uio.h>
  14#include <linux/fsnotify.h>
  15#include <linux/security.h>
  16#include <linux/export.h>
  17#include <linux/syscalls.h>
  18#include <linux/pagemap.h>
  19#include <linux/splice.h>
  20#include <linux/compat.h>
  21#include <linux/mount.h>
  22#include <linux/fs.h>
  23#include "internal.h"
  24
  25#include <linux/uaccess.h>
  26#include <asm/unistd.h>
  27
  28const struct file_operations generic_ro_fops = {
  29        .llseek         = generic_file_llseek,
  30        .read_iter      = generic_file_read_iter,
  31        .mmap           = generic_file_readonly_mmap,
  32        .splice_read    = generic_file_splice_read,
  33};
  34
  35EXPORT_SYMBOL(generic_ro_fops);
  36
  37static inline bool unsigned_offsets(struct file *file)
  38{
  39        return file->f_mode & FMODE_UNSIGNED_OFFSET;
  40}
  41
  42/**
  43 * vfs_setpos - update the file offset for lseek
  44 * @file:       file structure in question
  45 * @offset:     file offset to seek to
  46 * @maxsize:    maximum file size
  47 *
  48 * This is a low-level filesystem helper for updating the file offset to
  49 * the value specified by @offset if the given offset is valid and it is
  50 * not equal to the current file offset.
  51 *
  52 * Return the specified offset on success and -EINVAL on invalid offset.
  53 */
  54loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  55{
  56        if (offset < 0 && !unsigned_offsets(file))
  57                return -EINVAL;
  58        if (offset > maxsize)
  59                return -EINVAL;
  60
  61        if (offset != file->f_pos) {
  62                file->f_pos = offset;
  63                file->f_version = 0;
  64        }
  65        return offset;
  66}
  67EXPORT_SYMBOL(vfs_setpos);
  68
  69/**
  70 * generic_file_llseek_size - generic llseek implementation for regular files
  71 * @file:       file structure to seek on
  72 * @offset:     file offset to seek to
  73 * @whence:     type of seek
  74 * @size:       max size of this file in file system
  75 * @eof:        offset used for SEEK_END position
  76 *
  77 * This is a variant of generic_file_llseek that allows passing in a custom
  78 * maximum file size and a custom EOF position, for e.g. hashed directories
  79 *
  80 * Synchronization:
  81 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  82 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  83 * read/writes behave like SEEK_SET against seeks.
  84 */
  85loff_t
  86generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  87                loff_t maxsize, loff_t eof)
  88{
  89        switch (whence) {
  90        case SEEK_END:
  91                offset += eof;
  92                break;
  93        case SEEK_CUR:
  94                /*
  95                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  96                 * position-querying operation.  Avoid rewriting the "same"
  97                 * f_pos value back to the file because a concurrent read(),
  98                 * write() or lseek() might have altered it
  99                 */
 100                if (offset == 0)
 101                        return file->f_pos;
 102                /*
 103                 * f_lock protects against read/modify/write race with other
 104                 * SEEK_CURs. Note that parallel writes and reads behave
 105                 * like SEEK_SET.
 106                 */
 107                spin_lock(&file->f_lock);
 108                offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 109                spin_unlock(&file->f_lock);
 110                return offset;
 111        case SEEK_DATA:
 112                /*
 113                 * In the generic case the entire file is data, so as long as
 114                 * offset isn't at the end of the file then the offset is data.
 115                 */
 116                if ((unsigned long long)offset >= eof)
 117                        return -ENXIO;
 118                break;
 119        case SEEK_HOLE:
 120                /*
 121                 * There is a virtual hole at the end of the file, so as long as
 122                 * offset isn't i_size or larger, return i_size.
 123                 */
 124                if ((unsigned long long)offset >= eof)
 125                        return -ENXIO;
 126                offset = eof;
 127                break;
 128        }
 129
 130        return vfs_setpos(file, offset, maxsize);
 131}
 132EXPORT_SYMBOL(generic_file_llseek_size);
 133
 134/**
 135 * generic_file_llseek - generic llseek implementation for regular files
 136 * @file:       file structure to seek on
 137 * @offset:     file offset to seek to
 138 * @whence:     type of seek
 139 *
 140 * This is a generic implemenation of ->llseek useable for all normal local
 141 * filesystems.  It just updates the file offset to the value specified by
 142 * @offset and @whence.
 143 */
 144loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 145{
 146        struct inode *inode = file->f_mapping->host;
 147
 148        return generic_file_llseek_size(file, offset, whence,
 149                                        inode->i_sb->s_maxbytes,
 150                                        i_size_read(inode));
 151}
 152EXPORT_SYMBOL(generic_file_llseek);
 153
 154/**
 155 * fixed_size_llseek - llseek implementation for fixed-sized devices
 156 * @file:       file structure to seek on
 157 * @offset:     file offset to seek to
 158 * @whence:     type of seek
 159 * @size:       size of the file
 160 *
 161 */
 162loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 163{
 164        switch (whence) {
 165        case SEEK_SET: case SEEK_CUR: case SEEK_END:
 166                return generic_file_llseek_size(file, offset, whence,
 167                                                size, size);
 168        default:
 169                return -EINVAL;
 170        }
 171}
 172EXPORT_SYMBOL(fixed_size_llseek);
 173
 174/**
 175 * no_seek_end_llseek - llseek implementation for fixed-sized devices
 176 * @file:       file structure to seek on
 177 * @offset:     file offset to seek to
 178 * @whence:     type of seek
 179 *
 180 */
 181loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
 182{
 183        switch (whence) {
 184        case SEEK_SET: case SEEK_CUR:
 185                return generic_file_llseek_size(file, offset, whence,
 186                                                OFFSET_MAX, 0);
 187        default:
 188                return -EINVAL;
 189        }
 190}
 191EXPORT_SYMBOL(no_seek_end_llseek);
 192
 193/**
 194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
 195 * @file:       file structure to seek on
 196 * @offset:     file offset to seek to
 197 * @whence:     type of seek
 198 * @size:       maximal offset allowed
 199 *
 200 */
 201loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
 202{
 203        switch (whence) {
 204        case SEEK_SET: case SEEK_CUR:
 205                return generic_file_llseek_size(file, offset, whence,
 206                                                size, 0);
 207        default:
 208                return -EINVAL;
 209        }
 210}
 211EXPORT_SYMBOL(no_seek_end_llseek_size);
 212
 213/**
 214 * noop_llseek - No Operation Performed llseek implementation
 215 * @file:       file structure to seek on
 216 * @offset:     file offset to seek to
 217 * @whence:     type of seek
 218 *
 219 * This is an implementation of ->llseek useable for the rare special case when
 220 * userspace expects the seek to succeed but the (device) file is actually not
 221 * able to perform the seek. In this case you use noop_llseek() instead of
 222 * falling back to the default implementation of ->llseek.
 223 */
 224loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 225{
 226        return file->f_pos;
 227}
 228EXPORT_SYMBOL(noop_llseek);
 229
 230loff_t no_llseek(struct file *file, loff_t offset, int whence)
 231{
 232        return -ESPIPE;
 233}
 234EXPORT_SYMBOL(no_llseek);
 235
 236loff_t default_llseek(struct file *file, loff_t offset, int whence)
 237{
 238        struct inode *inode = file_inode(file);
 239        loff_t retval;
 240
 241        inode_lock(inode);
 242        switch (whence) {
 243                case SEEK_END:
 244                        offset += i_size_read(inode);
 245                        break;
 246                case SEEK_CUR:
 247                        if (offset == 0) {
 248                                retval = file->f_pos;
 249                                goto out;
 250                        }
 251                        offset += file->f_pos;
 252                        break;
 253                case SEEK_DATA:
 254                        /*
 255                         * In the generic case the entire file is data, so as
 256                         * long as offset isn't at the end of the file then the
 257                         * offset is data.
 258                         */
 259                        if (offset >= inode->i_size) {
 260                                retval = -ENXIO;
 261                                goto out;
 262                        }
 263                        break;
 264                case SEEK_HOLE:
 265                        /*
 266                         * There is a virtual hole at the end of the file, so
 267                         * as long as offset isn't i_size or larger, return
 268                         * i_size.
 269                         */
 270                        if (offset >= inode->i_size) {
 271                                retval = -ENXIO;
 272                                goto out;
 273                        }
 274                        offset = inode->i_size;
 275                        break;
 276        }
 277        retval = -EINVAL;
 278        if (offset >= 0 || unsigned_offsets(file)) {
 279                if (offset != file->f_pos) {
 280                        file->f_pos = offset;
 281                        file->f_version = 0;
 282                }
 283                retval = offset;
 284        }
 285out:
 286        inode_unlock(inode);
 287        return retval;
 288}
 289EXPORT_SYMBOL(default_llseek);
 290
 291loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 292{
 293        loff_t (*fn)(struct file *, loff_t, int);
 294
 295        fn = no_llseek;
 296        if (file->f_mode & FMODE_LSEEK) {
 297                if (file->f_op->llseek)
 298                        fn = file->f_op->llseek;
 299        }
 300        return fn(file, offset, whence);
 301}
 302EXPORT_SYMBOL(vfs_llseek);
 303
 304off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
 305{
 306        off_t retval;
 307        struct fd f = fdget_pos(fd);
 308        if (!f.file)
 309                return -EBADF;
 310
 311        retval = -EINVAL;
 312        if (whence <= SEEK_MAX) {
 313                loff_t res = vfs_llseek(f.file, offset, whence);
 314                retval = res;
 315                if (res != (loff_t)retval)
 316                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
 317        }
 318        fdput_pos(f);
 319        return retval;
 320}
 321
 322SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 323{
 324        return ksys_lseek(fd, offset, whence);
 325}
 326
 327#ifdef CONFIG_COMPAT
 328COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
 329{
 330        return ksys_lseek(fd, offset, whence);
 331}
 332#endif
 333
 334#ifdef __ARCH_WANT_SYS_LLSEEK
 335SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 336                unsigned long, offset_low, loff_t __user *, result,
 337                unsigned int, whence)
 338{
 339        int retval;
 340        struct fd f = fdget_pos(fd);
 341        loff_t offset;
 342
 343        if (!f.file)
 344                return -EBADF;
 345
 346        retval = -EINVAL;
 347        if (whence > SEEK_MAX)
 348                goto out_putf;
 349
 350        offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 351                        whence);
 352
 353        retval = (int)offset;
 354        if (offset >= 0) {
 355                retval = -EFAULT;
 356                if (!copy_to_user(result, &offset, sizeof(offset)))
 357                        retval = 0;
 358        }
 359out_putf:
 360        fdput_pos(f);
 361        return retval;
 362}
 363#endif
 364
 365int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 366{
 367        struct inode *inode;
 368        loff_t pos;
 369        int retval = -EINVAL;
 370
 371        inode = file_inode(file);
 372        if (unlikely((ssize_t) count < 0))
 373                return retval;
 374        pos = *ppos;
 375        if (unlikely(pos < 0)) {
 376                if (!unsigned_offsets(file))
 377                        return retval;
 378                if (count >= -pos) /* both values are in 0..LLONG_MAX */
 379                        return -EOVERFLOW;
 380        } else if (unlikely((loff_t) (pos + count) < 0)) {
 381                if (!unsigned_offsets(file))
 382                        return retval;
 383        }
 384
 385        if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
 386                retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
 387                                read_write == READ ? F_RDLCK : F_WRLCK);
 388                if (retval < 0)
 389                        return retval;
 390        }
 391        return security_file_permission(file,
 392                                read_write == READ ? MAY_READ : MAY_WRITE);
 393}
 394
 395static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 396{
 397        struct iovec iov = { .iov_base = buf, .iov_len = len };
 398        struct kiocb kiocb;
 399        struct iov_iter iter;
 400        ssize_t ret;
 401
 402        init_sync_kiocb(&kiocb, filp);
 403        kiocb.ki_pos = *ppos;
 404        iov_iter_init(&iter, READ, &iov, 1, len);
 405
 406        ret = call_read_iter(filp, &kiocb, &iter);
 407        BUG_ON(ret == -EIOCBQUEUED);
 408        *ppos = kiocb.ki_pos;
 409        return ret;
 410}
 411
 412ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
 413                   loff_t *pos)
 414{
 415        if (file->f_op->read)
 416                return file->f_op->read(file, buf, count, pos);
 417        else if (file->f_op->read_iter)
 418                return new_sync_read(file, buf, count, pos);
 419        else
 420                return -EINVAL;
 421}
 422
 423ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 424{
 425        mm_segment_t old_fs;
 426        ssize_t result;
 427
 428        old_fs = get_fs();
 429        set_fs(get_ds());
 430        /* The cast to a user pointer is valid due to the set_fs() */
 431        result = vfs_read(file, (void __user *)buf, count, pos);
 432        set_fs(old_fs);
 433        return result;
 434}
 435EXPORT_SYMBOL(kernel_read);
 436
 437ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 438{
 439        ssize_t ret;
 440
 441        if (!(file->f_mode & FMODE_READ))
 442                return -EBADF;
 443        if (!(file->f_mode & FMODE_CAN_READ))
 444                return -EINVAL;
 445        if (unlikely(!access_ok(buf, count)))
 446                return -EFAULT;
 447
 448        ret = rw_verify_area(READ, file, pos, count);
 449        if (!ret) {
 450                if (count > MAX_RW_COUNT)
 451                        count =  MAX_RW_COUNT;
 452                ret = __vfs_read(file, buf, count, pos);
 453                if (ret > 0) {
 454                        fsnotify_access(file);
 455                        add_rchar(current, ret);
 456                }
 457                inc_syscr(current);
 458        }
 459
 460        return ret;
 461}
 462
 463static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 464{
 465        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 466        struct kiocb kiocb;
 467        struct iov_iter iter;
 468        ssize_t ret;
 469
 470        init_sync_kiocb(&kiocb, filp);
 471        kiocb.ki_pos = *ppos;
 472        iov_iter_init(&iter, WRITE, &iov, 1, len);
 473
 474        ret = call_write_iter(filp, &kiocb, &iter);
 475        BUG_ON(ret == -EIOCBQUEUED);
 476        if (ret > 0)
 477                *ppos = kiocb.ki_pos;
 478        return ret;
 479}
 480
 481ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
 482                    loff_t *pos)
 483{
 484        if (file->f_op->write)
 485                return file->f_op->write(file, p, count, pos);
 486        else if (file->f_op->write_iter)
 487                return new_sync_write(file, p, count, pos);
 488        else
 489                return -EINVAL;
 490}
 491
 492ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
 493{
 494        mm_segment_t old_fs;
 495        const char __user *p;
 496        ssize_t ret;
 497
 498        if (!(file->f_mode & FMODE_CAN_WRITE))
 499                return -EINVAL;
 500
 501        old_fs = get_fs();
 502        set_fs(get_ds());
 503        p = (__force const char __user *)buf;
 504        if (count > MAX_RW_COUNT)
 505                count =  MAX_RW_COUNT;
 506        ret = __vfs_write(file, p, count, pos);
 507        set_fs(old_fs);
 508        if (ret > 0) {
 509                fsnotify_modify(file);
 510                add_wchar(current, ret);
 511        }
 512        inc_syscw(current);
 513        return ret;
 514}
 515EXPORT_SYMBOL(__kernel_write);
 516
 517ssize_t kernel_write(struct file *file, const void *buf, size_t count,
 518                            loff_t *pos)
 519{
 520        mm_segment_t old_fs;
 521        ssize_t res;
 522
 523        old_fs = get_fs();
 524        set_fs(get_ds());
 525        /* The cast to a user pointer is valid due to the set_fs() */
 526        res = vfs_write(file, (__force const char __user *)buf, count, pos);
 527        set_fs(old_fs);
 528
 529        return res;
 530}
 531EXPORT_SYMBOL(kernel_write);
 532
 533ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 534{
 535        ssize_t ret;
 536
 537        if (!(file->f_mode & FMODE_WRITE))
 538                return -EBADF;
 539        if (!(file->f_mode & FMODE_CAN_WRITE))
 540                return -EINVAL;
 541        if (unlikely(!access_ok(buf, count)))
 542                return -EFAULT;
 543
 544        ret = rw_verify_area(WRITE, file, pos, count);
 545        if (!ret) {
 546                if (count > MAX_RW_COUNT)
 547                        count =  MAX_RW_COUNT;
 548                file_start_write(file);
 549                ret = __vfs_write(file, buf, count, pos);
 550                if (ret > 0) {
 551                        fsnotify_modify(file);
 552                        add_wchar(current, ret);
 553                }
 554                inc_syscw(current);
 555                file_end_write(file);
 556        }
 557
 558        return ret;
 559}
 560
 561static inline loff_t file_pos_read(struct file *file)
 562{
 563        return file->f_pos;
 564}
 565
 566static inline void file_pos_write(struct file *file, loff_t pos)
 567{
 568        file->f_pos = pos;
 569}
 570
 571ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 572{
 573        struct fd f = fdget_pos(fd);
 574        ssize_t ret = -EBADF;
 575
 576        if (f.file) {
 577                loff_t pos = file_pos_read(f.file);
 578                ret = vfs_read(f.file, buf, count, &pos);
 579                if (ret >= 0)
 580                        file_pos_write(f.file, pos);
 581                fdput_pos(f);
 582        }
 583        return ret;
 584}
 585
 586SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 587{
 588        return ksys_read(fd, buf, count);
 589}
 590
 591ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
 592{
 593        struct fd f = fdget_pos(fd);
 594        ssize_t ret = -EBADF;
 595
 596        if (f.file) {
 597                loff_t pos = file_pos_read(f.file);
 598                ret = vfs_write(f.file, buf, count, &pos);
 599                if (ret >= 0)
 600                        file_pos_write(f.file, pos);
 601                fdput_pos(f);
 602        }
 603
 604        return ret;
 605}
 606
 607SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 608                size_t, count)
 609{
 610        return ksys_write(fd, buf, count);
 611}
 612
 613ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
 614                     loff_t pos)
 615{
 616        struct fd f;
 617        ssize_t ret = -EBADF;
 618
 619        if (pos < 0)
 620                return -EINVAL;
 621
 622        f = fdget(fd);
 623        if (f.file) {
 624                ret = -ESPIPE;
 625                if (f.file->f_mode & FMODE_PREAD)
 626                        ret = vfs_read(f.file, buf, count, &pos);
 627                fdput(f);
 628        }
 629
 630        return ret;
 631}
 632
 633SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 634                        size_t, count, loff_t, pos)
 635{
 636        return ksys_pread64(fd, buf, count, pos);
 637}
 638
 639ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
 640                      size_t count, loff_t pos)
 641{
 642        struct fd f;
 643        ssize_t ret = -EBADF;
 644
 645        if (pos < 0)
 646                return -EINVAL;
 647
 648        f = fdget(fd);
 649        if (f.file) {
 650                ret = -ESPIPE;
 651                if (f.file->f_mode & FMODE_PWRITE)  
 652                        ret = vfs_write(f.file, buf, count, &pos);
 653                fdput(f);
 654        }
 655
 656        return ret;
 657}
 658
 659SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 660                         size_t, count, loff_t, pos)
 661{
 662        return ksys_pwrite64(fd, buf, count, pos);
 663}
 664
 665static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 666                loff_t *ppos, int type, rwf_t flags)
 667{
 668        struct kiocb kiocb;
 669        ssize_t ret;
 670
 671        init_sync_kiocb(&kiocb, filp);
 672        ret = kiocb_set_rw_flags(&kiocb, flags);
 673        if (ret)
 674                return ret;
 675        kiocb.ki_pos = *ppos;
 676
 677        if (type == READ)
 678                ret = call_read_iter(filp, &kiocb, iter);
 679        else
 680                ret = call_write_iter(filp, &kiocb, iter);
 681        BUG_ON(ret == -EIOCBQUEUED);
 682        *ppos = kiocb.ki_pos;
 683        return ret;
 684}
 685
 686/* Do it by hand, with file-ops */
 687static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
 688                loff_t *ppos, int type, rwf_t flags)
 689{
 690        ssize_t ret = 0;
 691
 692        if (flags & ~RWF_HIPRI)
 693                return -EOPNOTSUPP;
 694
 695        while (iov_iter_count(iter)) {
 696                struct iovec iovec = iov_iter_iovec(iter);
 697                ssize_t nr;
 698
 699                if (type == READ) {
 700                        nr = filp->f_op->read(filp, iovec.iov_base,
 701                                              iovec.iov_len, ppos);
 702                } else {
 703                        nr = filp->f_op->write(filp, iovec.iov_base,
 704                                               iovec.iov_len, ppos);
 705                }
 706
 707                if (nr < 0) {
 708                        if (!ret)
 709                                ret = nr;
 710                        break;
 711                }
 712                ret += nr;
 713                if (nr != iovec.iov_len)
 714                        break;
 715                iov_iter_advance(iter, nr);
 716        }
 717
 718        return ret;
 719}
 720
 721/**
 722 * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
 723 *     into the kernel and check that it is valid.
 724 *
 725 * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
 726 * @uvector: Pointer to the userspace array.
 727 * @nr_segs: Number of elements in userspace array.
 728 * @fast_segs: Number of elements in @fast_pointer.
 729 * @fast_pointer: Pointer to (usually small on-stack) kernel array.
 730 * @ret_pointer: (output parameter) Pointer to a variable that will point to
 731 *     either @fast_pointer, a newly allocated kernel array, or NULL,
 732 *     depending on which array was used.
 733 *
 734 * This function copies an array of &struct iovec of @nr_segs from
 735 * userspace into the kernel and checks that each element is valid (e.g.
 736 * it does not point to a kernel address or cause overflow by being too
 737 * large, etc.).
 738 *
 739 * As an optimization, the caller may provide a pointer to a small
 740 * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
 741 * (the size of this array, or 0 if unused, should be given in @fast_segs).
 742 *
 743 * @ret_pointer will always point to the array that was used, so the
 744 * caller must take care not to call kfree() on it e.g. in case the
 745 * @fast_pointer array was used and it was allocated on the stack.
 746 *
 747 * Return: The total number of bytes covered by the iovec array on success
 748 *   or a negative error code on error.
 749 */
 750ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 751                              unsigned long nr_segs, unsigned long fast_segs,
 752                              struct iovec *fast_pointer,
 753                              struct iovec **ret_pointer)
 754{
 755        unsigned long seg;
 756        ssize_t ret;
 757        struct iovec *iov = fast_pointer;
 758
 759        /*
 760         * SuS says "The readv() function *may* fail if the iovcnt argument
 761         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 762         * traditionally returned zero for zero segments, so...
 763         */
 764        if (nr_segs == 0) {
 765                ret = 0;
 766                goto out;
 767        }
 768
 769        /*
 770         * First get the "struct iovec" from user memory and
 771         * verify all the pointers
 772         */
 773        if (nr_segs > UIO_MAXIOV) {
 774                ret = -EINVAL;
 775                goto out;
 776        }
 777        if (nr_segs > fast_segs) {
 778                iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
 779                if (iov == NULL) {
 780                        ret = -ENOMEM;
 781                        goto out;
 782                }
 783        }
 784        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 785                ret = -EFAULT;
 786                goto out;
 787        }
 788
 789        /*
 790         * According to the Single Unix Specification we should return EINVAL
 791         * if an element length is < 0 when cast to ssize_t or if the
 792         * total length would overflow the ssize_t return value of the
 793         * system call.
 794         *
 795         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 796         * overflow case.
 797         */
 798        ret = 0;
 799        for (seg = 0; seg < nr_segs; seg++) {
 800                void __user *buf = iov[seg].iov_base;
 801                ssize_t len = (ssize_t)iov[seg].iov_len;
 802
 803                /* see if we we're about to use an invalid len or if
 804                 * it's about to overflow ssize_t */
 805                if (len < 0) {
 806                        ret = -EINVAL;
 807                        goto out;
 808                }
 809                if (type >= 0
 810                    && unlikely(!access_ok(buf, len))) {
 811                        ret = -EFAULT;
 812                        goto out;
 813                }
 814                if (len > MAX_RW_COUNT - ret) {
 815                        len = MAX_RW_COUNT - ret;
 816                        iov[seg].iov_len = len;
 817                }
 818                ret += len;
 819        }
 820out:
 821        *ret_pointer = iov;
 822        return ret;
 823}
 824
 825#ifdef CONFIG_COMPAT
 826ssize_t compat_rw_copy_check_uvector(int type,
 827                const struct compat_iovec __user *uvector, unsigned long nr_segs,
 828                unsigned long fast_segs, struct iovec *fast_pointer,
 829                struct iovec **ret_pointer)
 830{
 831        compat_ssize_t tot_len;
 832        struct iovec *iov = *ret_pointer = fast_pointer;
 833        ssize_t ret = 0;
 834        int seg;
 835
 836        /*
 837         * SuS says "The readv() function *may* fail if the iovcnt argument
 838         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 839         * traditionally returned zero for zero segments, so...
 840         */
 841        if (nr_segs == 0)
 842                goto out;
 843
 844        ret = -EINVAL;
 845        if (nr_segs > UIO_MAXIOV)
 846                goto out;
 847        if (nr_segs > fast_segs) {
 848                ret = -ENOMEM;
 849                iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
 850                if (iov == NULL)
 851                        goto out;
 852        }
 853        *ret_pointer = iov;
 854
 855        ret = -EFAULT;
 856        if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
 857                goto out;
 858
 859        /*
 860         * Single unix specification:
 861         * We should -EINVAL if an element length is not >= 0 and fitting an
 862         * ssize_t.
 863         *
 864         * In Linux, the total length is limited to MAX_RW_COUNT, there is
 865         * no overflow possibility.
 866         */
 867        tot_len = 0;
 868        ret = -EINVAL;
 869        for (seg = 0; seg < nr_segs; seg++) {
 870                compat_uptr_t buf;
 871                compat_ssize_t len;
 872
 873                if (__get_user(len, &uvector->iov_len) ||
 874                   __get_user(buf, &uvector->iov_base)) {
 875                        ret = -EFAULT;
 876                        goto out;
 877                }
 878                if (len < 0)    /* size_t not fitting in compat_ssize_t .. */
 879                        goto out;
 880                if (type >= 0 &&
 881                    !access_ok(compat_ptr(buf), len)) {
 882                        ret = -EFAULT;
 883                        goto out;
 884                }
 885                if (len > MAX_RW_COUNT - tot_len)
 886                        len = MAX_RW_COUNT - tot_len;
 887                tot_len += len;
 888                iov->iov_base = compat_ptr(buf);
 889                iov->iov_len = (compat_size_t) len;
 890                uvector++;
 891                iov++;
 892        }
 893        ret = tot_len;
 894
 895out:
 896        return ret;
 897}
 898#endif
 899
 900static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
 901                loff_t *pos, rwf_t flags)
 902{
 903        size_t tot_len;
 904        ssize_t ret = 0;
 905
 906        if (!(file->f_mode & FMODE_READ))
 907                return -EBADF;
 908        if (!(file->f_mode & FMODE_CAN_READ))
 909                return -EINVAL;
 910
 911        tot_len = iov_iter_count(iter);
 912        if (!tot_len)
 913                goto out;
 914        ret = rw_verify_area(READ, file, pos, tot_len);
 915        if (ret < 0)
 916                return ret;
 917
 918        if (file->f_op->read_iter)
 919                ret = do_iter_readv_writev(file, iter, pos, READ, flags);
 920        else
 921                ret = do_loop_readv_writev(file, iter, pos, READ, flags);
 922out:
 923        if (ret >= 0)
 924                fsnotify_access(file);
 925        return ret;
 926}
 927
 928ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
 929                           struct iov_iter *iter)
 930{
 931        size_t tot_len;
 932        ssize_t ret = 0;
 933
 934        if (!file->f_op->read_iter)
 935                return -EINVAL;
 936        if (!(file->f_mode & FMODE_READ))
 937                return -EBADF;
 938        if (!(file->f_mode & FMODE_CAN_READ))
 939                return -EINVAL;
 940
 941        tot_len = iov_iter_count(iter);
 942        if (!tot_len)
 943                goto out;
 944        ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
 945        if (ret < 0)
 946                return ret;
 947
 948        ret = call_read_iter(file, iocb, iter);
 949out:
 950        if (ret >= 0)
 951                fsnotify_access(file);
 952        return ret;
 953}
 954EXPORT_SYMBOL(vfs_iocb_iter_read);
 955
 956ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
 957                rwf_t flags)
 958{
 959        if (!file->f_op->read_iter)
 960                return -EINVAL;
 961        return do_iter_read(file, iter, ppos, flags);
 962}
 963EXPORT_SYMBOL(vfs_iter_read);
 964
 965static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
 966                loff_t *pos, rwf_t flags)
 967{
 968        size_t tot_len;
 969        ssize_t ret = 0;
 970
 971        if (!(file->f_mode & FMODE_WRITE))
 972                return -EBADF;
 973        if (!(file->f_mode & FMODE_CAN_WRITE))
 974                return -EINVAL;
 975
 976        tot_len = iov_iter_count(iter);
 977        if (!tot_len)
 978                return 0;
 979        ret = rw_verify_area(WRITE, file, pos, tot_len);
 980        if (ret < 0)
 981                return ret;
 982
 983        if (file->f_op->write_iter)
 984                ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
 985        else
 986                ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
 987        if (ret > 0)
 988                fsnotify_modify(file);
 989        return ret;
 990}
 991
 992ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
 993                            struct iov_iter *iter)
 994{
 995        size_t tot_len;
 996        ssize_t ret = 0;
 997
 998        if (!file->f_op->write_iter)
 999                return -EINVAL;
1000        if (!(file->f_mode & FMODE_WRITE))
1001                return -EBADF;
1002        if (!(file->f_mode & FMODE_CAN_WRITE))
1003                return -EINVAL;
1004
1005        tot_len = iov_iter_count(iter);
1006        if (!tot_len)
1007                return 0;
1008        ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
1009        if (ret < 0)
1010                return ret;
1011
1012        ret = call_write_iter(file, iocb, iter);
1013        if (ret > 0)
1014                fsnotify_modify(file);
1015
1016        return ret;
1017}
1018EXPORT_SYMBOL(vfs_iocb_iter_write);
1019
1020ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
1021                rwf_t flags)
1022{
1023        if (!file->f_op->write_iter)
1024                return -EINVAL;
1025        return do_iter_write(file, iter, ppos, flags);
1026}
1027EXPORT_SYMBOL(vfs_iter_write);
1028
1029ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
1030                  unsigned long vlen, loff_t *pos, rwf_t flags)
1031{
1032        struct iovec iovstack[UIO_FASTIOV];
1033        struct iovec *iov = iovstack;
1034        struct iov_iter iter;
1035        ssize_t ret;
1036
1037        ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1038        if (ret >= 0) {
1039                ret = do_iter_read(file, &iter, pos, flags);
1040                kfree(iov);
1041        }
1042
1043        return ret;
1044}
1045
1046static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
1047                   unsigned long vlen, loff_t *pos, rwf_t flags)
1048{
1049        struct iovec iovstack[UIO_FASTIOV];
1050        struct iovec *iov = iovstack;
1051        struct iov_iter iter;
1052        ssize_t ret;
1053
1054        ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1055        if (ret >= 0) {
1056                file_start_write(file);
1057                ret = do_iter_write(file, &iter, pos, flags);
1058                file_end_write(file);
1059                kfree(iov);
1060        }
1061        return ret;
1062}
1063
1064static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
1065                        unsigned long vlen, rwf_t flags)
1066{
1067        struct fd f = fdget_pos(fd);
1068        ssize_t ret = -EBADF;
1069
1070        if (f.file) {
1071                loff_t pos = file_pos_read(f.file);
1072                ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1073                if (ret >= 0)
1074                        file_pos_write(f.file, pos);
1075                fdput_pos(f);
1076        }
1077
1078        if (ret > 0)
1079                add_rchar(current, ret);
1080        inc_syscr(current);
1081        return ret;
1082}
1083
1084static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
1085                         unsigned long vlen, rwf_t flags)
1086{
1087        struct fd f = fdget_pos(fd);
1088        ssize_t ret = -EBADF;
1089
1090        if (f.file) {
1091                loff_t pos = file_pos_read(f.file);
1092                ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1093                if (ret >= 0)
1094                        file_pos_write(f.file, pos);
1095                fdput_pos(f);
1096        }
1097
1098        if (ret > 0)
1099                add_wchar(current, ret);
1100        inc_syscw(current);
1101        return ret;
1102}
1103
1104static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
1105{
1106#define HALF_LONG_BITS (BITS_PER_LONG / 2)
1107        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
1108}
1109
1110static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1111                         unsigned long vlen, loff_t pos, rwf_t flags)
1112{
1113        struct fd f;
1114        ssize_t ret = -EBADF;
1115
1116        if (pos < 0)
1117                return -EINVAL;
1118
1119        f = fdget(fd);
1120        if (f.file) {
1121                ret = -ESPIPE;
1122                if (f.file->f_mode & FMODE_PREAD)
1123                        ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1124                fdput(f);
1125        }
1126
1127        if (ret > 0)
1128                add_rchar(current, ret);
1129        inc_syscr(current);
1130        return ret;
1131}
1132
1133static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1134                          unsigned long vlen, loff_t pos, rwf_t flags)
1135{
1136        struct fd f;
1137        ssize_t ret = -EBADF;
1138
1139        if (pos < 0)
1140                return -EINVAL;
1141
1142        f = fdget(fd);
1143        if (f.file) {
1144                ret = -ESPIPE;
1145                if (f.file->f_mode & FMODE_PWRITE)
1146                        ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1147                fdput(f);
1148        }
1149
1150        if (ret > 0)
1151                add_wchar(current, ret);
1152        inc_syscw(current);
1153        return ret;
1154}
1155
1156SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1157                unsigned long, vlen)
1158{
1159        return do_readv(fd, vec, vlen, 0);
1160}
1161
1162SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1163                unsigned long, vlen)
1164{
1165        return do_writev(fd, vec, vlen, 0);
1166}
1167
1168SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1169                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1170{
1171        loff_t pos = pos_from_hilo(pos_h, pos_l);
1172
1173        return do_preadv(fd, vec, vlen, pos, 0);
1174}
1175
1176SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1177                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1178                rwf_t, flags)
1179{
1180        loff_t pos = pos_from_hilo(pos_h, pos_l);
1181
1182        if (pos == -1)
1183                return do_readv(fd, vec, vlen, flags);
1184
1185        return do_preadv(fd, vec, vlen, pos, flags);
1186}
1187
1188SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1189                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1190{
1191        loff_t pos = pos_from_hilo(pos_h, pos_l);
1192
1193        return do_pwritev(fd, vec, vlen, pos, 0);
1194}
1195
1196SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1197                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1198                rwf_t, flags)
1199{
1200        loff_t pos = pos_from_hilo(pos_h, pos_l);
1201
1202        if (pos == -1)
1203                return do_writev(fd, vec, vlen, flags);
1204
1205        return do_pwritev(fd, vec, vlen, pos, flags);
1206}
1207
1208#ifdef CONFIG_COMPAT
1209static size_t compat_readv(struct file *file,
1210                           const struct compat_iovec __user *vec,
1211                           unsigned long vlen, loff_t *pos, rwf_t flags)
1212{
1213        struct iovec iovstack[UIO_FASTIOV];
1214        struct iovec *iov = iovstack;
1215        struct iov_iter iter;
1216        ssize_t ret;
1217
1218        ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
1219        if (ret >= 0) {
1220                ret = do_iter_read(file, &iter, pos, flags);
1221                kfree(iov);
1222        }
1223        if (ret > 0)
1224                add_rchar(current, ret);
1225        inc_syscr(current);
1226        return ret;
1227}
1228
1229static size_t do_compat_readv(compat_ulong_t fd,
1230                                 const struct compat_iovec __user *vec,
1231                                 compat_ulong_t vlen, rwf_t flags)
1232{
1233        struct fd f = fdget_pos(fd);
1234        ssize_t ret;
1235        loff_t pos;
1236
1237        if (!f.file)
1238                return -EBADF;
1239        pos = f.file->f_pos;
1240        ret = compat_readv(f.file, vec, vlen, &pos, flags);
1241        if (ret >= 0)
1242                f.file->f_pos = pos;
1243        fdput_pos(f);
1244        return ret;
1245
1246}
1247
1248COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1249                const struct compat_iovec __user *,vec,
1250                compat_ulong_t, vlen)
1251{
1252        return do_compat_readv(fd, vec, vlen, 0);
1253}
1254
1255static long do_compat_preadv64(unsigned long fd,
1256                                  const struct compat_iovec __user *vec,
1257                                  unsigned long vlen, loff_t pos, rwf_t flags)
1258{
1259        struct fd f;
1260        ssize_t ret;
1261
1262        if (pos < 0)
1263                return -EINVAL;
1264        f = fdget(fd);
1265        if (!f.file)
1266                return -EBADF;
1267        ret = -ESPIPE;
1268        if (f.file->f_mode & FMODE_PREAD)
1269                ret = compat_readv(f.file, vec, vlen, &pos, flags);
1270        fdput(f);
1271        return ret;
1272}
1273
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/* Compat preadv variant that receives the offset as a single loff_t. */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
                const struct compat_iovec __user *,vec,
                unsigned long, vlen, loff_t, pos)
{
        const rwf_t no_flags = 0;

        return do_compat_preadv64(fd, vec, vlen, pos, no_flags);
}
#endif
1282
1283COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1284                const struct compat_iovec __user *,vec,
1285                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1286{
1287        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1288
1289        return do_compat_preadv64(fd, vec, vlen, pos, 0);
1290}
1291
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat preadv2 variant that receives the offset as a single loff_t.
 * The offset and flags are forwarded unchanged to do_compat_preadv64().
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
                const struct compat_iovec __user *,vec,
                unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
        long ret = do_compat_preadv64(fd, vec, vlen, pos, flags);

        return ret;
}
#endif
1300
1301COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1302                const struct compat_iovec __user *,vec,
1303                compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1304                rwf_t, flags)
1305{
1306        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1307
1308        if (pos == -1)
1309                return do_compat_readv(fd, vec, vlen, flags);
1310
1311        return do_compat_preadv64(fd, vec, vlen, pos, flags);
1312}
1313
1314static size_t compat_writev(struct file *file,
1315                            const struct compat_iovec __user *vec,
1316                            unsigned long vlen, loff_t *pos, rwf_t flags)
1317{
1318        struct iovec iovstack[UIO_FASTIOV];
1319        struct iovec *iov = iovstack;
1320        struct iov_iter iter;
1321        ssize_t ret;
1322
1323        ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
1324        if (ret >= 0) {
1325                file_start_write(file);
1326                ret = do_iter_write(file, &iter, pos, flags);
1327                file_end_write(file);
1328                kfree(iov);
1329        }
1330        if (ret > 0)
1331                add_wchar(current, ret);
1332        inc_syscw(current);
1333        return ret;
1334}
1335
1336static size_t do_compat_writev(compat_ulong_t fd,
1337                                  const struct compat_iovec __user* vec,
1338                                  compat_ulong_t vlen, rwf_t flags)
1339{
1340        struct fd f = fdget_pos(fd);
1341        ssize_t ret;
1342        loff_t pos;
1343
1344        if (!f.file)
1345                return -EBADF;
1346        pos = f.file->f_pos;
1347        ret = compat_writev(f.file, vec, vlen, &pos, flags);
1348        if (ret >= 0)
1349                f.file->f_pos = pos;
1350        fdput_pos(f);
1351        return ret;
1352}
1353
1354COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1355                const struct compat_iovec __user *, vec,
1356                compat_ulong_t, vlen)
1357{
1358        return do_compat_writev(fd, vec, vlen, 0);
1359}
1360
1361static long do_compat_pwritev64(unsigned long fd,
1362                                   const struct compat_iovec __user *vec,
1363                                   unsigned long vlen, loff_t pos, rwf_t flags)
1364{
1365        struct fd f;
1366        ssize_t ret;
1367
1368        if (pos < 0)
1369                return -EINVAL;
1370        f = fdget(fd);
1371        if (!f.file)
1372                return -EBADF;
1373        ret = -ESPIPE;
1374        if (f.file->f_mode & FMODE_PWRITE)
1375                ret = compat_writev(f.file, vec, vlen, &pos, flags);
1376        fdput(f);
1377        return ret;
1378}
1379
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/* Compat pwritev variant that receives the offset as a single loff_t. */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
                const struct compat_iovec __user *,vec,
                unsigned long, vlen, loff_t, pos)
{
        const rwf_t no_flags = 0;

        return do_compat_pwritev64(fd, vec, vlen, pos, no_flags);
}
#endif
1388
1389COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1390                const struct compat_iovec __user *,vec,
1391                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1392{
1393        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1394
1395        return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1396}
1397
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat pwritev2 variant that receives the offset as a single loff_t.
 * The offset and flags are forwarded unchanged to do_compat_pwritev64().
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
                const struct compat_iovec __user *,vec,
                unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
        long ret = do_compat_pwritev64(fd, vec, vlen, pos, flags);

        return ret;
}
#endif
1406
1407COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1408                const struct compat_iovec __user *,vec,
1409                compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
1410{
1411        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1412
1413        if (pos == -1)
1414                return do_compat_writev(fd, vec, vlen, flags);
1415
1416        return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1417}
1418
1419#endif
1420
1421static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1422                           size_t count, loff_t max)
1423{
1424        struct fd in, out;
1425        struct inode *in_inode, *out_inode;
1426        loff_t pos;
1427        loff_t out_pos;
1428        ssize_t retval;
1429        int fl;
1430
1431        /*
1432         * Get input file, and verify that it is ok..
1433         */
1434        retval = -EBADF;
1435        in = fdget(in_fd);
1436        if (!in.file)
1437                goto out;
1438        if (!(in.file->f_mode & FMODE_READ))
1439                goto fput_in;
1440        retval = -ESPIPE;
1441        if (!ppos) {
1442                pos = in.file->f_pos;
1443        } else {
1444                pos = *ppos;
1445                if (!(in.file->f_mode & FMODE_PREAD))
1446                        goto fput_in;
1447        }
1448        retval = rw_verify_area(READ, in.file, &pos, count);
1449        if (retval < 0)
1450                goto fput_in;
1451        if (count > MAX_RW_COUNT)
1452                count =  MAX_RW_COUNT;
1453
1454        /*
1455         * Get output file, and verify that it is ok..
1456         */
1457        retval = -EBADF;
1458        out = fdget(out_fd);
1459        if (!out.file)
1460                goto fput_in;
1461        if (!(out.file->f_mode & FMODE_WRITE))
1462                goto fput_out;
1463        retval = -EINVAL;
1464        in_inode = file_inode(in.file);
1465        out_inode = file_inode(out.file);
1466        out_pos = out.file->f_pos;
1467        retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1468        if (retval < 0)
1469                goto fput_out;
1470
1471        if (!max)
1472                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1473
1474        if (unlikely(pos + count > max)) {
1475                retval = -EOVERFLOW;
1476                if (pos >= max)
1477                        goto fput_out;
1478                count = max - pos;
1479        }
1480
1481        fl = 0;
1482#if 0
1483        /*
1484         * We need to debate whether we can enable this or not. The
1485         * man page documents EAGAIN return for the output at least,
1486         * and the application is arguably buggy if it doesn't expect
1487         * EAGAIN on a non-blocking file descriptor.
1488         */
1489        if (in.file->f_flags & O_NONBLOCK)
1490                fl = SPLICE_F_NONBLOCK;
1491#endif
1492        file_start_write(out.file);
1493        retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1494        file_end_write(out.file);
1495
1496        if (retval > 0) {
1497                add_rchar(current, retval);
1498                add_wchar(current, retval);
1499                fsnotify_access(in.file);
1500                fsnotify_modify(out.file);
1501                out.file->f_pos = out_pos;
1502                if (ppos)
1503                        *ppos = pos;
1504                else
1505                        in.file->f_pos = pos;
1506        }
1507
1508        inc_syscr(current);
1509        inc_syscw(current);
1510        if (pos > max)
1511                retval = -EOVERFLOW;
1512
1513fput_out:
1514        fdput(out);
1515fput_in:
1516        fdput(in);
1517out:
1518        return retval;
1519}
1520
1521SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1522{
1523        loff_t pos;
1524        off_t off;
1525        ssize_t ret;
1526
1527        if (offset) {
1528                if (unlikely(get_user(off, offset)))
1529                        return -EFAULT;
1530                pos = off;
1531                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1532                if (unlikely(put_user(pos, offset)))
1533                        return -EFAULT;
1534                return ret;
1535        }
1536
1537        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1538}
1539
1540SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1541{
1542        loff_t pos;
1543        ssize_t ret;
1544
1545        if (offset) {
1546                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1547                        return -EFAULT;
1548                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1549                if (unlikely(put_user(pos, offset)))
1550                        return -EFAULT;
1551                return ret;
1552        }
1553
1554        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1555}
1556
1557#ifdef CONFIG_COMPAT
1558COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1559                compat_off_t __user *, offset, compat_size_t, count)
1560{
1561        loff_t pos;
1562        off_t off;
1563        ssize_t ret;
1564
1565        if (offset) {
1566                if (unlikely(get_user(off, offset)))
1567                        return -EFAULT;
1568                pos = off;
1569                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1570                if (unlikely(put_user(pos, offset)))
1571                        return -EFAULT;
1572                return ret;
1573        }
1574
1575        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1576}
1577
1578COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1579                compat_loff_t __user *, offset, compat_size_t, count)
1580{
1581        loff_t pos;
1582        ssize_t ret;
1583
1584        if (offset) {
1585                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1586                        return -EFAULT;
1587                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1588                if (unlikely(put_user(pos, offset)))
1589                        return -EFAULT;
1590                return ret;
1591        }
1592
1593        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1594}
1595#endif
1596
1597/**
1598 * generic_copy_file_range - copy data between two files
1599 * @file_in:    file structure to read from
1600 * @pos_in:     file offset to read from
1601 * @file_out:   file structure to write data to
1602 * @pos_out:    file offset to write data to
1603 * @len:        amount of data to copy
1604 * @flags:      copy flags
1605 *
1606 * This is a generic filesystem helper to copy data from one file to another.
1607 * It has no constraints on the source or destination file owners - the files
1608 * can belong to different superblocks and different filesystem types. Short
1609 * copies are allowed.
1610 *
1611 * This should be called from the @file_out filesystem, as per the
1612 * ->copy_file_range() method.
1613 *
1614 * Returns the number of bytes copied or a negative error indicating the
1615 * failure.
1616 */
1617
1618ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
1619                                struct file *file_out, loff_t pos_out,
1620                                size_t len, unsigned int flags)
1621{
1622        return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1623                                len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1624}
1625EXPORT_SYMBOL(generic_copy_file_range);
1626
1627static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
1628                                  struct file *file_out, loff_t pos_out,
1629                                  size_t len, unsigned int flags)
1630{
1631        /*
1632         * Although we now allow filesystems to handle cross sb copy, passing
1633         * a file of the wrong filesystem type to filesystem driver can result
1634         * in an attempt to dereference the wrong type of ->private_data, so
1635         * avoid doing that until we really have a good reason.  NFS defines
1636         * several different file_system_type structures, but they all end up
1637         * using the same ->copy_file_range() function pointer.
1638         */
1639        if (file_out->f_op->copy_file_range &&
1640            file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
1641                return file_out->f_op->copy_file_range(file_in, pos_in,
1642                                                       file_out, pos_out,
1643                                                       len, flags);
1644
1645        return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1646                                       flags);
1647}
1648
1649/*
1650 * copy_file_range() differs from regular file read and write in that it
1651 * specifically allows return partial success.  When it does so is up to
1652 * the copy_file_range method.
1653 */
1654ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1655                            struct file *file_out, loff_t pos_out,
1656                            size_t len, unsigned int flags)
1657{
1658        ssize_t ret;
1659
1660        if (flags != 0)
1661                return -EINVAL;
1662
1663        ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
1664                                       flags);
1665        if (unlikely(ret))
1666                return ret;
1667
1668        ret = rw_verify_area(READ, file_in, &pos_in, len);
1669        if (unlikely(ret))
1670                return ret;
1671
1672        ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1673        if (unlikely(ret))
1674                return ret;
1675
1676        if (len == 0)
1677                return 0;
1678
1679        file_start_write(file_out);
1680
1681        /*
1682         * Try cloning first, this is supported by more file systems, and
1683         * more efficient if both clone and copy are supported (e.g. NFS).
1684         */
1685        if (file_in->f_op->remap_file_range &&
1686            file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
1687                loff_t cloned;
1688
1689                cloned = file_in->f_op->remap_file_range(file_in, pos_in,
1690                                file_out, pos_out,
1691                                min_t(loff_t, MAX_RW_COUNT, len),
1692                                REMAP_FILE_CAN_SHORTEN);
1693                if (cloned > 0) {
1694                        ret = cloned;
1695                        goto done;
1696                }
1697        }
1698
1699        ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1700                                flags);
1701        WARN_ON_ONCE(ret == -EOPNOTSUPP);
1702done:
1703        if (ret > 0) {
1704                fsnotify_access(file_in);
1705                add_rchar(current, ret);
1706                fsnotify_modify(file_out);
1707                add_wchar(current, ret);
1708        }
1709
1710        inc_syscr(current);
1711        inc_syscw(current);
1712
1713        file_end_write(file_out);
1714
1715        return ret;
1716}
1717EXPORT_SYMBOL(vfs_copy_file_range);
1718
1719SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1720                int, fd_out, loff_t __user *, off_out,
1721                size_t, len, unsigned int, flags)
1722{
1723        loff_t pos_in;
1724        loff_t pos_out;
1725        struct fd f_in;
1726        struct fd f_out;
1727        ssize_t ret = -EBADF;
1728
1729        f_in = fdget(fd_in);
1730        if (!f_in.file)
1731                goto out2;
1732
1733        f_out = fdget(fd_out);
1734        if (!f_out.file)
1735                goto out1;
1736
1737        ret = -EFAULT;
1738        if (off_in) {
1739                if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1740                        goto out;
1741        } else {
1742                pos_in = f_in.file->f_pos;
1743        }
1744
1745        if (off_out) {
1746                if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1747                        goto out;
1748        } else {
1749                pos_out = f_out.file->f_pos;
1750        }
1751
1752        ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
1753                                  flags);
1754        if (ret > 0) {
1755                pos_in += ret;
1756                pos_out += ret;
1757
1758                if (off_in) {
1759                        if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1760                                ret = -EFAULT;
1761                } else {
1762                        f_in.file->f_pos = pos_in;
1763                }
1764
1765                if (off_out) {
1766                        if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1767                                ret = -EFAULT;
1768                } else {
1769                        f_out.file->f_pos = pos_out;
1770                }
1771        }
1772
1773out:
1774        fdput(f_out);
1775out1:
1776        fdput(f_in);
1777out2:
1778        return ret;
1779}
1780
1781static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
1782                             bool write)
1783{
1784        struct inode *inode = file_inode(file);
1785
1786        if (unlikely(pos < 0 || len < 0))
1787                return -EINVAL;
1788
1789         if (unlikely((loff_t) (pos + len) < 0))
1790                return -EINVAL;
1791
1792        if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
1793                loff_t end = len ? pos + len - 1 : OFFSET_MAX;
1794                int retval;
1795
1796                retval = locks_mandatory_area(inode, file, pos, end,
1797                                write ? F_WRLCK : F_RDLCK);
1798                if (retval < 0)
1799                        return retval;
1800        }
1801
1802        return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
1803}
1804/*
1805 * Ensure that we don't remap a partial EOF block in the middle of something
1806 * else.  Assume that the offsets have already been checked for block
1807 * alignment.
1808 *
1809 * For deduplication we always scale down to the previous block because we
1810 * can't meaningfully compare post-EOF contents.
1811 *
1812 * For clone we only link a partial EOF block above the destination file's EOF.
1813 *
1814 * Shorten the request if possible.
1815 */
1816static int generic_remap_check_len(struct inode *inode_in,
1817                                   struct inode *inode_out,
1818                                   loff_t pos_out,
1819                                   loff_t *len,
1820                                   unsigned int remap_flags)
1821{
1822        u64 blkmask = i_blocksize(inode_in) - 1;
1823        loff_t new_len = *len;
1824
1825        if ((*len & blkmask) == 0)
1826                return 0;
1827
1828        if ((remap_flags & REMAP_FILE_DEDUP) ||
1829            pos_out + *len < i_size_read(inode_out))
1830                new_len &= ~blkmask;
1831
1832        if (new_len == *len)
1833                return 0;
1834
1835        if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
1836                *len = new_len;
1837                return 0;
1838        }
1839
1840        return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
1841}
1842
1843/*
1844 * Read a page's worth of file data into the page cache.  Return the page
1845 * locked.
1846 */
1847static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
1848{
1849        struct page *page;
1850
1851        page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
1852        if (IS_ERR(page))
1853                return page;
1854        if (!PageUptodate(page)) {
1855                put_page(page);
1856                return ERR_PTR(-EIO);
1857        }
1858        lock_page(page);
1859        return page;
1860}
1861
1862/*
1863 * Compare extents of two files to see if they are the same.
1864 * Caller must have locked both inodes to prevent write races.
1865 */
1866static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
1867                                         struct inode *dest, loff_t destoff,
1868                                         loff_t len, bool *is_same)
1869{
1870        loff_t src_poff;
1871        loff_t dest_poff;
1872        void *src_addr;
1873        void *dest_addr;
1874        struct page *src_page;
1875        struct page *dest_page;
1876        loff_t cmp_len;
1877        bool same;
1878        int error;
1879
1880        error = -EINVAL;
1881        same = true;
1882        while (len) {
1883                src_poff = srcoff & (PAGE_SIZE - 1);
1884                dest_poff = destoff & (PAGE_SIZE - 1);
1885                cmp_len = min(PAGE_SIZE - src_poff,
1886                              PAGE_SIZE - dest_poff);
1887                cmp_len = min(cmp_len, len);
1888                if (cmp_len <= 0)
1889                        goto out_error;
1890
1891                src_page = vfs_dedupe_get_page(src, srcoff);
1892                if (IS_ERR(src_page)) {
1893                        error = PTR_ERR(src_page);
1894                        goto out_error;
1895                }
1896                dest_page = vfs_dedupe_get_page(dest, destoff);
1897                if (IS_ERR(dest_page)) {
1898                        error = PTR_ERR(dest_page);
1899                        unlock_page(src_page);
1900                        put_page(src_page);
1901                        goto out_error;
1902                }
1903                src_addr = kmap_atomic(src_page);
1904                dest_addr = kmap_atomic(dest_page);
1905
1906                flush_dcache_page(src_page);
1907                flush_dcache_page(dest_page);
1908
1909                if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
1910                        same = false;
1911
1912                kunmap_atomic(dest_addr);
1913                kunmap_atomic(src_addr);
1914                unlock_page(dest_page);
1915                unlock_page(src_page);
1916                put_page(dest_page);
1917                put_page(src_page);
1918
1919                if (!same)
1920                        break;
1921
1922                srcoff += cmp_len;
1923                destoff += cmp_len;
1924                len -= cmp_len;
1925        }
1926
1927        *is_same = same;
1928        return 0;
1929
1930out_error:
1931        return error;
1932}
1933
1934/*
1935 * Check that the two inodes are eligible for cloning, the ranges make
1936 * sense, and then flush all dirty data.  Caller must ensure that the
1937 * inodes have been locked against any other modifications.
1938 *
1939 * If there's an error, then the usual negative error code is returned.
1940 * Otherwise returns 0 with *len set to the request length.
1941 */
1942int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
1943                                  struct file *file_out, loff_t pos_out,
1944                                  loff_t *len, unsigned int remap_flags)
1945{
1946        struct inode *inode_in = file_inode(file_in);
1947        struct inode *inode_out = file_inode(file_out);
1948        bool same_inode = (inode_in == inode_out);
1949        int ret;
1950
1951        /* Don't touch certain kinds of inodes */
1952        if (IS_IMMUTABLE(inode_out))
1953                return -EPERM;
1954
1955        if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1956                return -ETXTBSY;
1957
1958        /* Don't reflink dirs, pipes, sockets... */
1959        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1960                return -EISDIR;
1961        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1962                return -EINVAL;
1963
1964        /* Zero length dedupe exits immediately; reflink goes to EOF. */
1965        if (*len == 0) {
1966                loff_t isize = i_size_read(inode_in);
1967
1968                if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
1969                        return 0;
1970                if (pos_in > isize)
1971                        return -EINVAL;
1972                *len = isize - pos_in;
1973                if (*len == 0)
1974                        return 0;
1975        }
1976
1977        /* Check that we don't violate system file offset limits. */
1978        ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
1979                        remap_flags);
1980        if (ret)
1981                return ret;
1982
1983        /* Wait for the completion of any pending IOs on both files */
1984        inode_dio_wait(inode_in);
1985        if (!same_inode)
1986                inode_dio_wait(inode_out);
1987
1988        ret = filemap_write_and_wait_range(inode_in->i_mapping,
1989                        pos_in, pos_in + *len - 1);
1990        if (ret)
1991                return ret;
1992
1993        ret = filemap_write_and_wait_range(inode_out->i_mapping,
1994                        pos_out, pos_out + *len - 1);
1995        if (ret)
1996                return ret;
1997
1998        /*
1999         * Check that the extents are the same.
2000         */
2001        if (remap_flags & REMAP_FILE_DEDUP) {
2002                bool            is_same = false;
2003
2004                ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
2005                                inode_out, pos_out, *len, &is_same);
2006                if (ret)
2007                        return ret;
2008                if (!is_same)
2009                        return -EBADE;
2010        }
2011
2012        ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
2013                        remap_flags);
2014        if (ret)
2015                return ret;
2016
2017        /* If can't alter the file contents, we're done. */
2018        if (!(remap_flags & REMAP_FILE_DEDUP))
2019                ret = file_modified(file_out);
2020
2021        return ret;
2022}
2023EXPORT_SYMBOL(generic_remap_file_range_prep);
2024
2025loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
2026                           struct file *file_out, loff_t pos_out,
2027                           loff_t len, unsigned int remap_flags)
2028{
2029        loff_t ret;
2030
2031        WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
2032
2033        /*
2034         * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
2035         * the same mount. Practically, they only need to be on the same file
2036         * system.
2037         */
2038        if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
2039                return -EXDEV;
2040
2041        ret = generic_file_rw_checks(file_in, file_out);
2042        if (ret < 0)
2043                return ret;
2044
2045        if (!file_in->f_op->remap_file_range)
2046                return -EOPNOTSUPP;
2047
2048        ret = remap_verify_area(file_in, pos_in, len, false);
2049        if (ret)
2050                return ret;
2051
2052        ret = remap_verify_area(file_out, pos_out, len, true);
2053        if (ret)
2054                return ret;
2055
2056        ret = file_in->f_op->remap_file_range(file_in, pos_in,
2057                        file_out, pos_out, len, remap_flags);
2058        if (ret < 0)
2059                return ret;
2060
2061        fsnotify_access(file_in);
2062        fsnotify_modify(file_out);
2063        return ret;
2064}
2065EXPORT_SYMBOL(do_clone_file_range);
2066
2067loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
2068                            struct file *file_out, loff_t pos_out,
2069                            loff_t len, unsigned int remap_flags)
2070{
2071        loff_t ret;
2072
2073        file_start_write(file_out);
2074        ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
2075                                  remap_flags);
2076        file_end_write(file_out);
2077
2078        return ret;
2079}
2080EXPORT_SYMBOL(vfs_clone_file_range);
2081
2082loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
2083                                 struct file *dst_file, loff_t dst_pos,
2084                                 loff_t len, unsigned int remap_flags)
2085{
2086        loff_t ret;
2087
2088        WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
2089                                     REMAP_FILE_CAN_SHORTEN));
2090
2091        ret = mnt_want_write_file(dst_file);
2092        if (ret)
2093                return ret;
2094
2095        ret = remap_verify_area(dst_file, dst_pos, len, true);
2096        if (ret < 0)
2097                goto out_drop_write;
2098
2099        ret = -EINVAL;
2100        if (!(capable(CAP_SYS_ADMIN) || (dst_file->f_mode & FMODE_WRITE)))
2101                goto out_drop_write;
2102
2103        ret = -EXDEV;
2104        if (src_file->f_path.mnt != dst_file->f_path.mnt)
2105                goto out_drop_write;
2106
2107        ret = -EISDIR;
2108        if (S_ISDIR(file_inode(dst_file)->i_mode))
2109                goto out_drop_write;
2110
2111        ret = -EINVAL;
2112        if (!dst_file->f_op->remap_file_range)
2113                goto out_drop_write;
2114
2115        if (len == 0) {
2116                ret = 0;
2117                goto out_drop_write;
2118        }
2119
2120        ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
2121                        dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
2122out_drop_write:
2123        mnt_drop_write_file(dst_file);
2124
2125        return ret;
2126}
2127EXPORT_SYMBOL(vfs_dedupe_file_range_one);
2128
2129int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
2130{
2131        struct file_dedupe_range_info *info;
2132        struct inode *src = file_inode(file);
2133        u64 off;
2134        u64 len;
2135        int i;
2136        int ret;
2137        u16 count = same->dest_count;
2138        loff_t deduped;
2139
2140        if (!(file->f_mode & FMODE_READ))
2141                return -EINVAL;
2142
2143        if (same->reserved1 || same->reserved2)
2144                return -EINVAL;
2145
2146        off = same->src_offset;
2147        len = same->src_length;
2148
2149        if (S_ISDIR(src->i_mode))
2150                return -EISDIR;
2151
2152        if (!S_ISREG(src->i_mode))
2153                return -EINVAL;
2154
2155        if (!file->f_op->remap_file_range)
2156                return -EOPNOTSUPP;
2157
2158        ret = remap_verify_area(file, off, len, false);
2159        if (ret < 0)
2160                return ret;
2161        ret = 0;
2162
2163        if (off + len > i_size_read(src))
2164                return -EINVAL;
2165
2166        /* Arbitrary 1G limit on a single dedupe request, can be raised. */
2167        len = min_t(u64, len, 1 << 30);
2168
2169        /* pre-format output fields to sane values */
2170        for (i = 0; i < count; i++) {
2171                same->info[i].bytes_deduped = 0ULL;
2172                same->info[i].status = FILE_DEDUPE_RANGE_SAME;
2173        }
2174
2175        for (i = 0, info = same->info; i < count; i++, info++) {
2176                struct fd dst_fd = fdget(info->dest_fd);
2177                struct file *dst_file = dst_fd.file;
2178
2179                if (!dst_file) {
2180                        info->status = -EBADF;
2181                        goto next_loop;
2182                }
2183
2184                if (info->reserved) {
2185                        info->status = -EINVAL;
2186                        goto next_fdput;
2187                }
2188
2189                deduped = vfs_dedupe_file_range_one(file, off, dst_file,
2190                                                    info->dest_offset, len,
2191                                                    REMAP_FILE_CAN_SHORTEN);
2192                if (deduped == -EBADE)
2193                        info->status = FILE_DEDUPE_RANGE_DIFFERS;
2194                else if (deduped < 0)
2195                        info->status = deduped;
2196                else
2197                        info->bytes_deduped = len;
2198
2199next_fdput:
2200                fdput(dst_fd);
2201next_loop:
2202                if (fatal_signal_pending(current))
2203                        break;
2204        }
2205        return ret;
2206}
2207EXPORT_SYMBOL(vfs_dedupe_file_range);
2208