linux/fs/read_write.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  linux/fs/read_write.c
   4 *
   5 *  Copyright (C) 1991, 1992  Linus Torvalds
   6 */
   7
   8#include <linux/slab.h>
   9#include <linux/stat.h>
  10#include <linux/sched/xacct.h>
  11#include <linux/fcntl.h>
  12#include <linux/file.h>
  13#include <linux/uio.h>
  14#include <linux/fsnotify.h>
  15#include <linux/security.h>
  16#include <linux/export.h>
  17#include <linux/syscalls.h>
  18#include <linux/pagemap.h>
  19#include <linux/splice.h>
  20#include <linux/compat.h>
  21#include <linux/mount.h>
  22#include <linux/fs.h>
  23#include "internal.h"
  24
  25#include <linux/uaccess.h>
  26#include <asm/unistd.h>
  27
  28const struct file_operations generic_ro_fops = {
  29        .llseek         = generic_file_llseek,
  30        .read_iter      = generic_file_read_iter,
  31        .mmap           = generic_file_readonly_mmap,
  32        .splice_read    = generic_file_splice_read,
  33};
  34
  35EXPORT_SYMBOL(generic_ro_fops);
  36
  37static inline bool unsigned_offsets(struct file *file)
  38{
  39        return file->f_mode & FMODE_UNSIGNED_OFFSET;
  40}
  41
  42/**
  43 * vfs_setpos - update the file offset for lseek
  44 * @file:       file structure in question
  45 * @offset:     file offset to seek to
  46 * @maxsize:    maximum file size
  47 *
  48 * This is a low-level filesystem helper for updating the file offset to
  49 * the value specified by @offset if the given offset is valid and it is
  50 * not equal to the current file offset.
  51 *
  52 * Return the specified offset on success and -EINVAL on invalid offset.
  53 */
  54loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  55{
  56        if (offset < 0 && !unsigned_offsets(file))
  57                return -EINVAL;
  58        if (offset > maxsize)
  59                return -EINVAL;
  60
  61        if (offset != file->f_pos) {
  62                file->f_pos = offset;
  63                file->f_version = 0;
  64        }
  65        return offset;
  66}
  67EXPORT_SYMBOL(vfs_setpos);
  68
  69/**
  70 * generic_file_llseek_size - generic llseek implementation for regular files
  71 * @file:       file structure to seek on
  72 * @offset:     file offset to seek to
  73 * @whence:     type of seek
  74 * @size:       max size of this file in file system
  75 * @eof:        offset used for SEEK_END position
  76 *
  77 * This is a variant of generic_file_llseek that allows passing in a custom
  78 * maximum file size and a custom EOF position, for e.g. hashed directories
  79 *
  80 * Synchronization:
  81 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  82 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  83 * read/writes behave like SEEK_SET against seeks.
  84 */
  85loff_t
  86generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  87                loff_t maxsize, loff_t eof)
  88{
  89        switch (whence) {
  90        case SEEK_END:
  91                offset += eof;
  92                break;
  93        case SEEK_CUR:
  94                /*
  95                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  96                 * position-querying operation.  Avoid rewriting the "same"
  97                 * f_pos value back to the file because a concurrent read(),
  98                 * write() or lseek() might have altered it
  99                 */
 100                if (offset == 0)
 101                        return file->f_pos;
 102                /*
 103                 * f_lock protects against read/modify/write race with other
 104                 * SEEK_CURs. Note that parallel writes and reads behave
 105                 * like SEEK_SET.
 106                 */
 107                spin_lock(&file->f_lock);
 108                offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 109                spin_unlock(&file->f_lock);
 110                return offset;
 111        case SEEK_DATA:
 112                /*
 113                 * In the generic case the entire file is data, so as long as
 114                 * offset isn't at the end of the file then the offset is data.
 115                 */
 116                if ((unsigned long long)offset >= eof)
 117                        return -ENXIO;
 118                break;
 119        case SEEK_HOLE:
 120                /*
 121                 * There is a virtual hole at the end of the file, so as long as
 122                 * offset isn't i_size or larger, return i_size.
 123                 */
 124                if ((unsigned long long)offset >= eof)
 125                        return -ENXIO;
 126                offset = eof;
 127                break;
 128        }
 129
 130        return vfs_setpos(file, offset, maxsize);
 131}
 132EXPORT_SYMBOL(generic_file_llseek_size);
 133
 134/**
 135 * generic_file_llseek - generic llseek implementation for regular files
 136 * @file:       file structure to seek on
 137 * @offset:     file offset to seek to
 138 * @whence:     type of seek
 139 *
 140 * This is a generic implemenation of ->llseek useable for all normal local
 141 * filesystems.  It just updates the file offset to the value specified by
 142 * @offset and @whence.
 143 */
 144loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 145{
 146        struct inode *inode = file->f_mapping->host;
 147
 148        return generic_file_llseek_size(file, offset, whence,
 149                                        inode->i_sb->s_maxbytes,
 150                                        i_size_read(inode));
 151}
 152EXPORT_SYMBOL(generic_file_llseek);
 153
 154/**
 155 * fixed_size_llseek - llseek implementation for fixed-sized devices
 156 * @file:       file structure to seek on
 157 * @offset:     file offset to seek to
 158 * @whence:     type of seek
 159 * @size:       size of the file
 160 *
 161 */
 162loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 163{
 164        switch (whence) {
 165        case SEEK_SET: case SEEK_CUR: case SEEK_END:
 166                return generic_file_llseek_size(file, offset, whence,
 167                                                size, size);
 168        default:
 169                return -EINVAL;
 170        }
 171}
 172EXPORT_SYMBOL(fixed_size_llseek);
 173
 174/**
 175 * no_seek_end_llseek - llseek implementation for fixed-sized devices
 176 * @file:       file structure to seek on
 177 * @offset:     file offset to seek to
 178 * @whence:     type of seek
 179 *
 180 */
 181loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
 182{
 183        switch (whence) {
 184        case SEEK_SET: case SEEK_CUR:
 185                return generic_file_llseek_size(file, offset, whence,
 186                                                OFFSET_MAX, 0);
 187        default:
 188                return -EINVAL;
 189        }
 190}
 191EXPORT_SYMBOL(no_seek_end_llseek);
 192
 193/**
 194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
 195 * @file:       file structure to seek on
 196 * @offset:     file offset to seek to
 197 * @whence:     type of seek
 198 * @size:       maximal offset allowed
 199 *
 200 */
 201loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
 202{
 203        switch (whence) {
 204        case SEEK_SET: case SEEK_CUR:
 205                return generic_file_llseek_size(file, offset, whence,
 206                                                size, 0);
 207        default:
 208                return -EINVAL;
 209        }
 210}
 211EXPORT_SYMBOL(no_seek_end_llseek_size);
 212
 213/**
 214 * noop_llseek - No Operation Performed llseek implementation
 215 * @file:       file structure to seek on
 216 * @offset:     file offset to seek to
 217 * @whence:     type of seek
 218 *
 219 * This is an implementation of ->llseek useable for the rare special case when
 220 * userspace expects the seek to succeed but the (device) file is actually not
 221 * able to perform the seek. In this case you use noop_llseek() instead of
 222 * falling back to the default implementation of ->llseek.
 223 */
 224loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 225{
 226        return file->f_pos;
 227}
 228EXPORT_SYMBOL(noop_llseek);
 229
 230loff_t no_llseek(struct file *file, loff_t offset, int whence)
 231{
 232        return -ESPIPE;
 233}
 234EXPORT_SYMBOL(no_llseek);
 235
 236loff_t default_llseek(struct file *file, loff_t offset, int whence)
 237{
 238        struct inode *inode = file_inode(file);
 239        loff_t retval;
 240
 241        inode_lock(inode);
 242        switch (whence) {
 243                case SEEK_END:
 244                        offset += i_size_read(inode);
 245                        break;
 246                case SEEK_CUR:
 247                        if (offset == 0) {
 248                                retval = file->f_pos;
 249                                goto out;
 250                        }
 251                        offset += file->f_pos;
 252                        break;
 253                case SEEK_DATA:
 254                        /*
 255                         * In the generic case the entire file is data, so as
 256                         * long as offset isn't at the end of the file then the
 257                         * offset is data.
 258                         */
 259                        if (offset >= inode->i_size) {
 260                                retval = -ENXIO;
 261                                goto out;
 262                        }
 263                        break;
 264                case SEEK_HOLE:
 265                        /*
 266                         * There is a virtual hole at the end of the file, so
 267                         * as long as offset isn't i_size or larger, return
 268                         * i_size.
 269                         */
 270                        if (offset >= inode->i_size) {
 271                                retval = -ENXIO;
 272                                goto out;
 273                        }
 274                        offset = inode->i_size;
 275                        break;
 276        }
 277        retval = -EINVAL;
 278        if (offset >= 0 || unsigned_offsets(file)) {
 279                if (offset != file->f_pos) {
 280                        file->f_pos = offset;
 281                        file->f_version = 0;
 282                }
 283                retval = offset;
 284        }
 285out:
 286        inode_unlock(inode);
 287        return retval;
 288}
 289EXPORT_SYMBOL(default_llseek);
 290
 291loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 292{
 293        loff_t (*fn)(struct file *, loff_t, int);
 294
 295        fn = no_llseek;
 296        if (file->f_mode & FMODE_LSEEK) {
 297                if (file->f_op->llseek)
 298                        fn = file->f_op->llseek;
 299        }
 300        return fn(file, offset, whence);
 301}
 302EXPORT_SYMBOL(vfs_llseek);
 303
 304off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
 305{
 306        off_t retval;
 307        struct fd f = fdget_pos(fd);
 308        if (!f.file)
 309                return -EBADF;
 310
 311        retval = -EINVAL;
 312        if (whence <= SEEK_MAX) {
 313                loff_t res = vfs_llseek(f.file, offset, whence);
 314                retval = res;
 315                if (res != (loff_t)retval)
 316                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
 317        }
 318        fdput_pos(f);
 319        return retval;
 320}
 321
 322SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 323{
 324        return ksys_lseek(fd, offset, whence);
 325}
 326
 327#ifdef CONFIG_COMPAT
 328COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
 329{
 330        return ksys_lseek(fd, offset, whence);
 331}
 332#endif
 333
 334#ifdef __ARCH_WANT_SYS_LLSEEK
 335SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 336                unsigned long, offset_low, loff_t __user *, result,
 337                unsigned int, whence)
 338{
 339        int retval;
 340        struct fd f = fdget_pos(fd);
 341        loff_t offset;
 342
 343        if (!f.file)
 344                return -EBADF;
 345
 346        retval = -EINVAL;
 347        if (whence > SEEK_MAX)
 348                goto out_putf;
 349
 350        offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 351                        whence);
 352
 353        retval = (int)offset;
 354        if (offset >= 0) {
 355                retval = -EFAULT;
 356                if (!copy_to_user(result, &offset, sizeof(offset)))
 357                        retval = 0;
 358        }
 359out_putf:
 360        fdput_pos(f);
 361        return retval;
 362}
 363#endif
 364
 365int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 366{
 367        struct inode *inode;
 368        loff_t pos;
 369        int retval = -EINVAL;
 370
 371        inode = file_inode(file);
 372        if (unlikely((ssize_t) count < 0))
 373                return retval;
 374        pos = *ppos;
 375        if (unlikely(pos < 0)) {
 376                if (!unsigned_offsets(file))
 377                        return retval;
 378                if (count >= -pos) /* both values are in 0..LLONG_MAX */
 379                        return -EOVERFLOW;
 380        } else if (unlikely((loff_t) (pos + count) < 0)) {
 381                if (!unsigned_offsets(file))
 382                        return retval;
 383        }
 384
 385        if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
 386                retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
 387                                read_write == READ ? F_RDLCK : F_WRLCK);
 388                if (retval < 0)
 389                        return retval;
 390        }
 391        return security_file_permission(file,
 392                                read_write == READ ? MAY_READ : MAY_WRITE);
 393}
 394
 395static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 396{
 397        struct iovec iov = { .iov_base = buf, .iov_len = len };
 398        struct kiocb kiocb;
 399        struct iov_iter iter;
 400        ssize_t ret;
 401
 402        init_sync_kiocb(&kiocb, filp);
 403        kiocb.ki_pos = *ppos;
 404        iov_iter_init(&iter, READ, &iov, 1, len);
 405
 406        ret = call_read_iter(filp, &kiocb, &iter);
 407        BUG_ON(ret == -EIOCBQUEUED);
 408        *ppos = kiocb.ki_pos;
 409        return ret;
 410}
 411
 412ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
 413                   loff_t *pos)
 414{
 415        if (file->f_op->read)
 416                return file->f_op->read(file, buf, count, pos);
 417        else if (file->f_op->read_iter)
 418                return new_sync_read(file, buf, count, pos);
 419        else
 420                return -EINVAL;
 421}
 422
 423ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 424{
 425        mm_segment_t old_fs;
 426        ssize_t result;
 427
 428        old_fs = get_fs();
 429        set_fs(get_ds());
 430        /* The cast to a user pointer is valid due to the set_fs() */
 431        result = vfs_read(file, (void __user *)buf, count, pos);
 432        set_fs(old_fs);
 433        return result;
 434}
 435EXPORT_SYMBOL(kernel_read);
 436
 437ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 438{
 439        ssize_t ret;
 440
 441        if (!(file->f_mode & FMODE_READ))
 442                return -EBADF;
 443        if (!(file->f_mode & FMODE_CAN_READ))
 444                return -EINVAL;
 445        if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 446                return -EFAULT;
 447
 448        ret = rw_verify_area(READ, file, pos, count);
 449        if (!ret) {
 450                if (count > MAX_RW_COUNT)
 451                        count =  MAX_RW_COUNT;
 452                ret = __vfs_read(file, buf, count, pos);
 453                if (ret > 0) {
 454                        fsnotify_access(file);
 455                        add_rchar(current, ret);
 456                }
 457                inc_syscr(current);
 458        }
 459
 460        return ret;
 461}
 462
 463static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 464{
 465        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 466        struct kiocb kiocb;
 467        struct iov_iter iter;
 468        ssize_t ret;
 469
 470        init_sync_kiocb(&kiocb, filp);
 471        kiocb.ki_pos = *ppos;
 472        iov_iter_init(&iter, WRITE, &iov, 1, len);
 473
 474        ret = call_write_iter(filp, &kiocb, &iter);
 475        BUG_ON(ret == -EIOCBQUEUED);
 476        if (ret > 0)
 477                *ppos = kiocb.ki_pos;
 478        return ret;
 479}
 480
 481ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
 482                    loff_t *pos)
 483{
 484        if (file->f_op->write)
 485                return file->f_op->write(file, p, count, pos);
 486        else if (file->f_op->write_iter)
 487                return new_sync_write(file, p, count, pos);
 488        else
 489                return -EINVAL;
 490}
 491
 492ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
 493{
 494        mm_segment_t old_fs;
 495        const char __user *p;
 496        ssize_t ret;
 497
 498        if (!(file->f_mode & FMODE_CAN_WRITE))
 499                return -EINVAL;
 500
 501        old_fs = get_fs();
 502        set_fs(get_ds());
 503        p = (__force const char __user *)buf;
 504        if (count > MAX_RW_COUNT)
 505                count =  MAX_RW_COUNT;
 506        ret = __vfs_write(file, p, count, pos);
 507        set_fs(old_fs);
 508        if (ret > 0) {
 509                fsnotify_modify(file);
 510                add_wchar(current, ret);
 511        }
 512        inc_syscw(current);
 513        return ret;
 514}
 515EXPORT_SYMBOL(__kernel_write);
 516
 517ssize_t kernel_write(struct file *file, const void *buf, size_t count,
 518                            loff_t *pos)
 519{
 520        mm_segment_t old_fs;
 521        ssize_t res;
 522
 523        old_fs = get_fs();
 524        set_fs(get_ds());
 525        /* The cast to a user pointer is valid due to the set_fs() */
 526        res = vfs_write(file, (__force const char __user *)buf, count, pos);
 527        set_fs(old_fs);
 528
 529        return res;
 530}
 531EXPORT_SYMBOL(kernel_write);
 532
 533ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 534{
 535        ssize_t ret;
 536
 537        if (!(file->f_mode & FMODE_WRITE))
 538                return -EBADF;
 539        if (!(file->f_mode & FMODE_CAN_WRITE))
 540                return -EINVAL;
 541        if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 542                return -EFAULT;
 543
 544        ret = rw_verify_area(WRITE, file, pos, count);
 545        if (!ret) {
 546                if (count > MAX_RW_COUNT)
 547                        count =  MAX_RW_COUNT;
 548                file_start_write(file);
 549                ret = __vfs_write(file, buf, count, pos);
 550                if (ret > 0) {
 551                        fsnotify_modify(file);
 552                        add_wchar(current, ret);
 553                }
 554                inc_syscw(current);
 555                file_end_write(file);
 556        }
 557
 558        return ret;
 559}
 560
 561static inline loff_t file_pos_read(struct file *file)
 562{
 563        return file->f_pos;
 564}
 565
 566static inline void file_pos_write(struct file *file, loff_t pos)
 567{
 568        file->f_pos = pos;
 569}
 570
 571ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 572{
 573        struct fd f = fdget_pos(fd);
 574        ssize_t ret = -EBADF;
 575
 576        if (f.file) {
 577                loff_t pos = file_pos_read(f.file);
 578                ret = vfs_read(f.file, buf, count, &pos);
 579                if (ret >= 0)
 580                        file_pos_write(f.file, pos);
 581                fdput_pos(f);
 582        }
 583        return ret;
 584}
 585
 586SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 587{
 588        return ksys_read(fd, buf, count);
 589}
 590
 591ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
 592{
 593        struct fd f = fdget_pos(fd);
 594        ssize_t ret = -EBADF;
 595
 596        if (f.file) {
 597                loff_t pos = file_pos_read(f.file);
 598                ret = vfs_write(f.file, buf, count, &pos);
 599                if (ret >= 0)
 600                        file_pos_write(f.file, pos);
 601                fdput_pos(f);
 602        }
 603
 604        return ret;
 605}
 606
 607SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 608                size_t, count)
 609{
 610        return ksys_write(fd, buf, count);
 611}
 612
 613ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
 614                     loff_t pos)
 615{
 616        struct fd f;
 617        ssize_t ret = -EBADF;
 618
 619        if (pos < 0)
 620                return -EINVAL;
 621
 622        f = fdget(fd);
 623        if (f.file) {
 624                ret = -ESPIPE;
 625                if (f.file->f_mode & FMODE_PREAD)
 626                        ret = vfs_read(f.file, buf, count, &pos);
 627                fdput(f);
 628        }
 629
 630        return ret;
 631}
 632
 633SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 634                        size_t, count, loff_t, pos)
 635{
 636        return ksys_pread64(fd, buf, count, pos);
 637}
 638
 639ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
 640                      size_t count, loff_t pos)
 641{
 642        struct fd f;
 643        ssize_t ret = -EBADF;
 644
 645        if (pos < 0)
 646                return -EINVAL;
 647
 648        f = fdget(fd);
 649        if (f.file) {
 650                ret = -ESPIPE;
 651                if (f.file->f_mode & FMODE_PWRITE)  
 652                        ret = vfs_write(f.file, buf, count, &pos);
 653                fdput(f);
 654        }
 655
 656        return ret;
 657}
 658
 659SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 660                         size_t, count, loff_t, pos)
 661{
 662        return ksys_pwrite64(fd, buf, count, pos);
 663}
 664
 665static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 666                loff_t *ppos, int type, rwf_t flags)
 667{
 668        struct kiocb kiocb;
 669        ssize_t ret;
 670
 671        init_sync_kiocb(&kiocb, filp);
 672        ret = kiocb_set_rw_flags(&kiocb, flags);
 673        if (ret)
 674                return ret;
 675        kiocb.ki_pos = *ppos;
 676
 677        if (type == READ)
 678                ret = call_read_iter(filp, &kiocb, iter);
 679        else
 680                ret = call_write_iter(filp, &kiocb, iter);
 681        BUG_ON(ret == -EIOCBQUEUED);
 682        *ppos = kiocb.ki_pos;
 683        return ret;
 684}
 685
 686/* Do it by hand, with file-ops */
 687static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
 688                loff_t *ppos, int type, rwf_t flags)
 689{
 690        ssize_t ret = 0;
 691
 692        if (flags & ~RWF_HIPRI)
 693                return -EOPNOTSUPP;
 694
 695        while (iov_iter_count(iter)) {
 696                struct iovec iovec = iov_iter_iovec(iter);
 697                ssize_t nr;
 698
 699                if (type == READ) {
 700                        nr = filp->f_op->read(filp, iovec.iov_base,
 701                                              iovec.iov_len, ppos);
 702                } else {
 703                        nr = filp->f_op->write(filp, iovec.iov_base,
 704                                               iovec.iov_len, ppos);
 705                }
 706
 707                if (nr < 0) {
 708                        if (!ret)
 709                                ret = nr;
 710                        break;
 711                }
 712                ret += nr;
 713                if (nr != iovec.iov_len)
 714                        break;
 715                iov_iter_advance(iter, nr);
 716        }
 717
 718        return ret;
 719}
 720
 721/* A write operation does a read from user space and vice versa */
 722#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 723
 724/**
 725 * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
 726 *     into the kernel and check that it is valid.
 727 *
 728 * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
 729 * @uvector: Pointer to the userspace array.
 730 * @nr_segs: Number of elements in userspace array.
 731 * @fast_segs: Number of elements in @fast_pointer.
 732 * @fast_pointer: Pointer to (usually small on-stack) kernel array.
 733 * @ret_pointer: (output parameter) Pointer to a variable that will point to
 734 *     either @fast_pointer, a newly allocated kernel array, or NULL,
 735 *     depending on which array was used.
 736 *
 737 * This function copies an array of &struct iovec of @nr_segs from
 738 * userspace into the kernel and checks that each element is valid (e.g.
 739 * it does not point to a kernel address or cause overflow by being too
 740 * large, etc.).
 741 *
 742 * As an optimization, the caller may provide a pointer to a small
 743 * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
 744 * (the size of this array, or 0 if unused, should be given in @fast_segs).
 745 *
 746 * @ret_pointer will always point to the array that was used, so the
 747 * caller must take care not to call kfree() on it e.g. in case the
 748 * @fast_pointer array was used and it was allocated on the stack.
 749 *
 750 * Return: The total number of bytes covered by the iovec array on success
 751 *   or a negative error code on error.
 752 */
 753ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 754                              unsigned long nr_segs, unsigned long fast_segs,
 755                              struct iovec *fast_pointer,
 756                              struct iovec **ret_pointer)
 757{
 758        unsigned long seg;
 759        ssize_t ret;
 760        struct iovec *iov = fast_pointer;
 761
 762        /*
 763         * SuS says "The readv() function *may* fail if the iovcnt argument
 764         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 765         * traditionally returned zero for zero segments, so...
 766         */
 767        if (nr_segs == 0) {
 768                ret = 0;
 769                goto out;
 770        }
 771
 772        /*
 773         * First get the "struct iovec" from user memory and
 774         * verify all the pointers
 775         */
 776        if (nr_segs > UIO_MAXIOV) {
 777                ret = -EINVAL;
 778                goto out;
 779        }
 780        if (nr_segs > fast_segs) {
 781                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 782                if (iov == NULL) {
 783                        ret = -ENOMEM;
 784                        goto out;
 785                }
 786        }
 787        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 788                ret = -EFAULT;
 789                goto out;
 790        }
 791
 792        /*
 793         * According to the Single Unix Specification we should return EINVAL
 794         * if an element length is < 0 when cast to ssize_t or if the
 795         * total length would overflow the ssize_t return value of the
 796         * system call.
 797         *
 798         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 799         * overflow case.
 800         */
 801        ret = 0;
 802        for (seg = 0; seg < nr_segs; seg++) {
 803                void __user *buf = iov[seg].iov_base;
 804                ssize_t len = (ssize_t)iov[seg].iov_len;
 805
 806                /* see if we we're about to use an invalid len or if
 807                 * it's about to overflow ssize_t */
 808                if (len < 0) {
 809                        ret = -EINVAL;
 810                        goto out;
 811                }
 812                if (type >= 0
 813                    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 814                        ret = -EFAULT;
 815                        goto out;
 816                }
 817                if (len > MAX_RW_COUNT - ret) {
 818                        len = MAX_RW_COUNT - ret;
 819                        iov[seg].iov_len = len;
 820                }
 821                ret += len;
 822        }
 823out:
 824        *ret_pointer = iov;
 825        return ret;
 826}
 827
 828#ifdef CONFIG_COMPAT
 829ssize_t compat_rw_copy_check_uvector(int type,
 830                const struct compat_iovec __user *uvector, unsigned long nr_segs,
 831                unsigned long fast_segs, struct iovec *fast_pointer,
 832                struct iovec **ret_pointer)
 833{
 834        compat_ssize_t tot_len;
 835        struct iovec *iov = *ret_pointer = fast_pointer;
 836        ssize_t ret = 0;
 837        int seg;
 838
 839        /*
 840         * SuS says "The readv() function *may* fail if the iovcnt argument
 841         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 842         * traditionally returned zero for zero segments, so...
 843         */
 844        if (nr_segs == 0)
 845                goto out;
 846
 847        ret = -EINVAL;
 848        if (nr_segs > UIO_MAXIOV)
 849                goto out;
 850        if (nr_segs > fast_segs) {
 851                ret = -ENOMEM;
 852                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 853                if (iov == NULL)
 854                        goto out;
 855        }
 856        *ret_pointer = iov;
 857
 858        ret = -EFAULT;
 859        if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
 860                goto out;
 861
 862        /*
 863         * Single unix specification:
 864         * We should -EINVAL if an element length is not >= 0 and fitting an
 865         * ssize_t.
 866         *
 867         * In Linux, the total length is limited to MAX_RW_COUNT, there is
 868         * no overflow possibility.
 869         */
 870        tot_len = 0;
 871        ret = -EINVAL;
 872        for (seg = 0; seg < nr_segs; seg++) {
 873                compat_uptr_t buf;
 874                compat_ssize_t len;
 875
 876                if (__get_user(len, &uvector->iov_len) ||
 877                   __get_user(buf, &uvector->iov_base)) {
 878                        ret = -EFAULT;
 879                        goto out;
 880                }
 881                if (len < 0)    /* size_t not fitting in compat_ssize_t .. */
 882                        goto out;
 883                if (type >= 0 &&
 884                    !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
 885                        ret = -EFAULT;
 886                        goto out;
 887                }
 888                if (len > MAX_RW_COUNT - tot_len)
 889                        len = MAX_RW_COUNT - tot_len;
 890                tot_len += len;
 891                iov->iov_base = compat_ptr(buf);
 892                iov->iov_len = (compat_size_t) len;
 893                uvector++;
 894                iov++;
 895        }
 896        ret = tot_len;
 897
 898out:
 899        return ret;
 900}
 901#endif
 902
 903static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
 904                loff_t *pos, rwf_t flags)
 905{
 906        size_t tot_len;
 907        ssize_t ret = 0;
 908
 909        if (!(file->f_mode & FMODE_READ))
 910                return -EBADF;
 911        if (!(file->f_mode & FMODE_CAN_READ))
 912                return -EINVAL;
 913
 914        tot_len = iov_iter_count(iter);
 915        if (!tot_len)
 916                goto out;
 917        ret = rw_verify_area(READ, file, pos, tot_len);
 918        if (ret < 0)
 919                return ret;
 920
 921        if (file->f_op->read_iter)
 922                ret = do_iter_readv_writev(file, iter, pos, READ, flags);
 923        else
 924                ret = do_loop_readv_writev(file, iter, pos, READ, flags);
 925out:
 926        if (ret >= 0)
 927                fsnotify_access(file);
 928        return ret;
 929}
 930
 931ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
 932                rwf_t flags)
 933{
 934        if (!file->f_op->read_iter)
 935                return -EINVAL;
 936        return do_iter_read(file, iter, ppos, flags);
 937}
 938EXPORT_SYMBOL(vfs_iter_read);
 939
 940static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
 941                loff_t *pos, rwf_t flags)
 942{
 943        size_t tot_len;
 944        ssize_t ret = 0;
 945
 946        if (!(file->f_mode & FMODE_WRITE))
 947                return -EBADF;
 948        if (!(file->f_mode & FMODE_CAN_WRITE))
 949                return -EINVAL;
 950
 951        tot_len = iov_iter_count(iter);
 952        if (!tot_len)
 953                return 0;
 954        ret = rw_verify_area(WRITE, file, pos, tot_len);
 955        if (ret < 0)
 956                return ret;
 957
 958        if (file->f_op->write_iter)
 959                ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
 960        else
 961                ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
 962        if (ret > 0)
 963                fsnotify_modify(file);
 964        return ret;
 965}
 966
 967ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
 968                rwf_t flags)
 969{
 970        if (!file->f_op->write_iter)
 971                return -EINVAL;
 972        return do_iter_write(file, iter, ppos, flags);
 973}
 974EXPORT_SYMBOL(vfs_iter_write);
 975
 976ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 977                  unsigned long vlen, loff_t *pos, rwf_t flags)
 978{
 979        struct iovec iovstack[UIO_FASTIOV];
 980        struct iovec *iov = iovstack;
 981        struct iov_iter iter;
 982        ssize_t ret;
 983
 984        ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
 985        if (ret >= 0) {
 986                ret = do_iter_read(file, &iter, pos, flags);
 987                kfree(iov);
 988        }
 989
 990        return ret;
 991}
 992
 993static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 994                   unsigned long vlen, loff_t *pos, rwf_t flags)
 995{
 996        struct iovec iovstack[UIO_FASTIOV];
 997        struct iovec *iov = iovstack;
 998        struct iov_iter iter;
 999        ssize_t ret;
1000
1001        ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1002        if (ret >= 0) {
1003                file_start_write(file);
1004                ret = do_iter_write(file, &iter, pos, flags);
1005                file_end_write(file);
1006                kfree(iov);
1007        }
1008        return ret;
1009}
1010
1011static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
1012                        unsigned long vlen, rwf_t flags)
1013{
1014        struct fd f = fdget_pos(fd);
1015        ssize_t ret = -EBADF;
1016
1017        if (f.file) {
1018                loff_t pos = file_pos_read(f.file);
1019                ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1020                if (ret >= 0)
1021                        file_pos_write(f.file, pos);
1022                fdput_pos(f);
1023        }
1024
1025        if (ret > 0)
1026                add_rchar(current, ret);
1027        inc_syscr(current);
1028        return ret;
1029}
1030
1031static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
1032                         unsigned long vlen, rwf_t flags)
1033{
1034        struct fd f = fdget_pos(fd);
1035        ssize_t ret = -EBADF;
1036
1037        if (f.file) {
1038                loff_t pos = file_pos_read(f.file);
1039                ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1040                if (ret >= 0)
1041                        file_pos_write(f.file, pos);
1042                fdput_pos(f);
1043        }
1044
1045        if (ret > 0)
1046                add_wchar(current, ret);
1047        inc_syscw(current);
1048        return ret;
1049}
1050
1051static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
1052{
1053#define HALF_LONG_BITS (BITS_PER_LONG / 2)
1054        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
1055}
1056
1057static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1058                         unsigned long vlen, loff_t pos, rwf_t flags)
1059{
1060        struct fd f;
1061        ssize_t ret = -EBADF;
1062
1063        if (pos < 0)
1064                return -EINVAL;
1065
1066        f = fdget(fd);
1067        if (f.file) {
1068                ret = -ESPIPE;
1069                if (f.file->f_mode & FMODE_PREAD)
1070                        ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1071                fdput(f);
1072        }
1073
1074        if (ret > 0)
1075                add_rchar(current, ret);
1076        inc_syscr(current);
1077        return ret;
1078}
1079
1080static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1081                          unsigned long vlen, loff_t pos, rwf_t flags)
1082{
1083        struct fd f;
1084        ssize_t ret = -EBADF;
1085
1086        if (pos < 0)
1087                return -EINVAL;
1088
1089        f = fdget(fd);
1090        if (f.file) {
1091                ret = -ESPIPE;
1092                if (f.file->f_mode & FMODE_PWRITE)
1093                        ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1094                fdput(f);
1095        }
1096
1097        if (ret > 0)
1098                add_wchar(current, ret);
1099        inc_syscw(current);
1100        return ret;
1101}
1102
1103SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1104                unsigned long, vlen)
1105{
1106        return do_readv(fd, vec, vlen, 0);
1107}
1108
1109SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1110                unsigned long, vlen)
1111{
1112        return do_writev(fd, vec, vlen, 0);
1113}
1114
1115SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1116                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1117{
1118        loff_t pos = pos_from_hilo(pos_h, pos_l);
1119
1120        return do_preadv(fd, vec, vlen, pos, 0);
1121}
1122
1123SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1124                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1125                rwf_t, flags)
1126{
1127        loff_t pos = pos_from_hilo(pos_h, pos_l);
1128
1129        if (pos == -1)
1130                return do_readv(fd, vec, vlen, flags);
1131
1132        return do_preadv(fd, vec, vlen, pos, flags);
1133}
1134
1135SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1136                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1137{
1138        loff_t pos = pos_from_hilo(pos_h, pos_l);
1139
1140        return do_pwritev(fd, vec, vlen, pos, 0);
1141}
1142
1143SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1144                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1145                rwf_t, flags)
1146{
1147        loff_t pos = pos_from_hilo(pos_h, pos_l);
1148
1149        if (pos == -1)
1150                return do_writev(fd, vec, vlen, flags);
1151
1152        return do_pwritev(fd, vec, vlen, pos, flags);
1153}
1154
1155#ifdef CONFIG_COMPAT
1156static size_t compat_readv(struct file *file,
1157                           const struct compat_iovec __user *vec,
1158                           unsigned long vlen, loff_t *pos, rwf_t flags)
1159{
1160        struct iovec iovstack[UIO_FASTIOV];
1161        struct iovec *iov = iovstack;
1162        struct iov_iter iter;
1163        ssize_t ret;
1164
1165        ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
1166        if (ret >= 0) {
1167                ret = do_iter_read(file, &iter, pos, flags);
1168                kfree(iov);
1169        }
1170        if (ret > 0)
1171                add_rchar(current, ret);
1172        inc_syscr(current);
1173        return ret;
1174}
1175
1176static size_t do_compat_readv(compat_ulong_t fd,
1177                                 const struct compat_iovec __user *vec,
1178                                 compat_ulong_t vlen, rwf_t flags)
1179{
1180        struct fd f = fdget_pos(fd);
1181        ssize_t ret;
1182        loff_t pos;
1183
1184        if (!f.file)
1185                return -EBADF;
1186        pos = f.file->f_pos;
1187        ret = compat_readv(f.file, vec, vlen, &pos, flags);
1188        if (ret >= 0)
1189                f.file->f_pos = pos;
1190        fdput_pos(f);
1191        return ret;
1192
1193}
1194
1195COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1196                const struct compat_iovec __user *,vec,
1197                compat_ulong_t, vlen)
1198{
1199        return do_compat_readv(fd, vec, vlen, 0);
1200}
1201
1202static long do_compat_preadv64(unsigned long fd,
1203                                  const struct compat_iovec __user *vec,
1204                                  unsigned long vlen, loff_t pos, rwf_t flags)
1205{
1206        struct fd f;
1207        ssize_t ret;
1208
1209        if (pos < 0)
1210                return -EINVAL;
1211        f = fdget(fd);
1212        if (!f.file)
1213                return -EBADF;
1214        ret = -ESPIPE;
1215        if (f.file->f_mode & FMODE_PREAD)
1216                ret = compat_readv(f.file, vec, vlen, &pos, flags);
1217        fdput(f);
1218        return ret;
1219}
1220
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/* Compat preadv64: the offset arrives as a single loff_t; no RWF flags. */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct compat_iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	const rwf_t no_flags = 0;

	return do_compat_preadv64(fd, vec, vlen, pos, no_flags);
}
#endif
1229
1230COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1231                const struct compat_iovec __user *,vec,
1232                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1233{
1234        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1235
1236        return do_compat_preadv64(fd, vec, vlen, pos, 0);
1237}
1238
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat preadv64v2: single loff_t offset plus caller-supplied @flags, passed
 * straight to do_compat_preadv64().
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
		const struct compat_iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	long ret = do_compat_preadv64(fd, vec, vlen, pos, flags);

	return ret;
}
#endif
1247
1248COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1249                const struct compat_iovec __user *,vec,
1250                compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1251                rwf_t, flags)
1252{
1253        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1254
1255        if (pos == -1)
1256                return do_compat_readv(fd, vec, vlen, flags);
1257
1258        return do_compat_preadv64(fd, vec, vlen, pos, flags);
1259}
1260
1261static size_t compat_writev(struct file *file,
1262                            const struct compat_iovec __user *vec,
1263                            unsigned long vlen, loff_t *pos, rwf_t flags)
1264{
1265        struct iovec iovstack[UIO_FASTIOV];
1266        struct iovec *iov = iovstack;
1267        struct iov_iter iter;
1268        ssize_t ret;
1269
1270        ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
1271        if (ret >= 0) {
1272                file_start_write(file);
1273                ret = do_iter_write(file, &iter, pos, flags);
1274                file_end_write(file);
1275                kfree(iov);
1276        }
1277        if (ret > 0)
1278                add_wchar(current, ret);
1279        inc_syscw(current);
1280        return ret;
1281}
1282
1283static size_t do_compat_writev(compat_ulong_t fd,
1284                                  const struct compat_iovec __user* vec,
1285                                  compat_ulong_t vlen, rwf_t flags)
1286{
1287        struct fd f = fdget_pos(fd);
1288        ssize_t ret;
1289        loff_t pos;
1290
1291        if (!f.file)
1292                return -EBADF;
1293        pos = f.file->f_pos;
1294        ret = compat_writev(f.file, vec, vlen, &pos, flags);
1295        if (ret >= 0)
1296                f.file->f_pos = pos;
1297        fdput_pos(f);
1298        return ret;
1299}
1300
1301COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1302                const struct compat_iovec __user *, vec,
1303                compat_ulong_t, vlen)
1304{
1305        return do_compat_writev(fd, vec, vlen, 0);
1306}
1307
1308static long do_compat_pwritev64(unsigned long fd,
1309                                   const struct compat_iovec __user *vec,
1310                                   unsigned long vlen, loff_t pos, rwf_t flags)
1311{
1312        struct fd f;
1313        ssize_t ret;
1314
1315        if (pos < 0)
1316                return -EINVAL;
1317        f = fdget(fd);
1318        if (!f.file)
1319                return -EBADF;
1320        ret = -ESPIPE;
1321        if (f.file->f_mode & FMODE_PWRITE)
1322                ret = compat_writev(f.file, vec, vlen, &pos, flags);
1323        fdput(f);
1324        return ret;
1325}
1326
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/* Compat pwritev64: the offset arrives as a single loff_t; no RWF flags. */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct compat_iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	const rwf_t no_flags = 0;

	return do_compat_pwritev64(fd, vec, vlen, pos, no_flags);
}
#endif
1335
1336COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1337                const struct compat_iovec __user *,vec,
1338                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1339{
1340        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1341
1342        return do_compat_pwritev64(fd, vec, vlen, pos, 0);
1343}
1344
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat pwritev64v2: single loff_t offset plus caller-supplied @flags,
 * passed straight to do_compat_pwritev64().
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
		const struct compat_iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	long ret = do_compat_pwritev64(fd, vec, vlen, pos, flags);

	return ret;
}
#endif
1353
1354COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1355                const struct compat_iovec __user *,vec,
1356                compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
1357{
1358        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1359
1360        if (pos == -1)
1361                return do_compat_writev(fd, vec, vlen, flags);
1362
1363        return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1364}
1365
1366#endif
1367
1368static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1369                           size_t count, loff_t max)
1370{
1371        struct fd in, out;
1372        struct inode *in_inode, *out_inode;
1373        loff_t pos;
1374        loff_t out_pos;
1375        ssize_t retval;
1376        int fl;
1377
1378        /*
1379         * Get input file, and verify that it is ok..
1380         */
1381        retval = -EBADF;
1382        in = fdget(in_fd);
1383        if (!in.file)
1384                goto out;
1385        if (!(in.file->f_mode & FMODE_READ))
1386                goto fput_in;
1387        retval = -ESPIPE;
1388        if (!ppos) {
1389                pos = in.file->f_pos;
1390        } else {
1391                pos = *ppos;
1392                if (!(in.file->f_mode & FMODE_PREAD))
1393                        goto fput_in;
1394        }
1395        retval = rw_verify_area(READ, in.file, &pos, count);
1396        if (retval < 0)
1397                goto fput_in;
1398        if (count > MAX_RW_COUNT)
1399                count =  MAX_RW_COUNT;
1400
1401        /*
1402         * Get output file, and verify that it is ok..
1403         */
1404        retval = -EBADF;
1405        out = fdget(out_fd);
1406        if (!out.file)
1407                goto fput_in;
1408        if (!(out.file->f_mode & FMODE_WRITE))
1409                goto fput_out;
1410        retval = -EINVAL;
1411        in_inode = file_inode(in.file);
1412        out_inode = file_inode(out.file);
1413        out_pos = out.file->f_pos;
1414        retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1415        if (retval < 0)
1416                goto fput_out;
1417
1418        if (!max)
1419                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1420
1421        if (unlikely(pos + count > max)) {
1422                retval = -EOVERFLOW;
1423                if (pos >= max)
1424                        goto fput_out;
1425                count = max - pos;
1426        }
1427
1428        fl = 0;
1429#if 0
1430        /*
1431         * We need to debate whether we can enable this or not. The
1432         * man page documents EAGAIN return for the output at least,
1433         * and the application is arguably buggy if it doesn't expect
1434         * EAGAIN on a non-blocking file descriptor.
1435         */
1436        if (in.file->f_flags & O_NONBLOCK)
1437                fl = SPLICE_F_NONBLOCK;
1438#endif
1439        file_start_write(out.file);
1440        retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1441        file_end_write(out.file);
1442
1443        if (retval > 0) {
1444                add_rchar(current, retval);
1445                add_wchar(current, retval);
1446                fsnotify_access(in.file);
1447                fsnotify_modify(out.file);
1448                out.file->f_pos = out_pos;
1449                if (ppos)
1450                        *ppos = pos;
1451                else
1452                        in.file->f_pos = pos;
1453        }
1454
1455        inc_syscr(current);
1456        inc_syscw(current);
1457        if (pos > max)
1458                retval = -EOVERFLOW;
1459
1460fput_out:
1461        fdput(out);
1462fput_in:
1463        fdput(in);
1464out:
1465        return retval;
1466}
1467
1468SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1469{
1470        loff_t pos;
1471        off_t off;
1472        ssize_t ret;
1473
1474        if (offset) {
1475                if (unlikely(get_user(off, offset)))
1476                        return -EFAULT;
1477                pos = off;
1478                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1479                if (unlikely(put_user(pos, offset)))
1480                        return -EFAULT;
1481                return ret;
1482        }
1483
1484        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1485}
1486
1487SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1488{
1489        loff_t pos;
1490        ssize_t ret;
1491
1492        if (offset) {
1493                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1494                        return -EFAULT;
1495                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1496                if (unlikely(put_user(pos, offset)))
1497                        return -EFAULT;
1498                return ret;
1499        }
1500
1501        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1502}
1503
1504#ifdef CONFIG_COMPAT
1505COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1506                compat_off_t __user *, offset, compat_size_t, count)
1507{
1508        loff_t pos;
1509        off_t off;
1510        ssize_t ret;
1511
1512        if (offset) {
1513                if (unlikely(get_user(off, offset)))
1514                        return -EFAULT;
1515                pos = off;
1516                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1517                if (unlikely(put_user(pos, offset)))
1518                        return -EFAULT;
1519                return ret;
1520        }
1521
1522        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1523}
1524
1525COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1526                compat_loff_t __user *, offset, compat_size_t, count)
1527{
1528        loff_t pos;
1529        ssize_t ret;
1530
1531        if (offset) {
1532                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1533                        return -EFAULT;
1534                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1535                if (unlikely(put_user(pos, offset)))
1536                        return -EFAULT;
1537                return ret;
1538        }
1539
1540        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1541}
1542#endif
1543
1544/*
1545 * copy_file_range() differs from regular file read and write in that it
1546 * specifically allows return partial success.  When it does so is up to
1547 * the copy_file_range method.
1548 */
1549ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1550                            struct file *file_out, loff_t pos_out,
1551                            size_t len, unsigned int flags)
1552{
1553        struct inode *inode_in = file_inode(file_in);
1554        struct inode *inode_out = file_inode(file_out);
1555        ssize_t ret;
1556
1557        if (flags != 0)
1558                return -EINVAL;
1559
1560        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1561                return -EISDIR;
1562        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1563                return -EINVAL;
1564
1565        ret = rw_verify_area(READ, file_in, &pos_in, len);
1566        if (unlikely(ret))
1567                return ret;
1568
1569        ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1570        if (unlikely(ret))
1571                return ret;
1572
1573        if (!(file_in->f_mode & FMODE_READ) ||
1574            !(file_out->f_mode & FMODE_WRITE) ||
1575            (file_out->f_flags & O_APPEND))
1576                return -EBADF;
1577
1578        /* this could be relaxed once a method supports cross-fs copies */
1579        if (inode_in->i_sb != inode_out->i_sb)
1580                return -EXDEV;
1581
1582        if (len == 0)
1583                return 0;
1584
1585        file_start_write(file_out);
1586
1587        /*
1588         * Try cloning first, this is supported by more file systems, and
1589         * more efficient if both clone and copy are supported (e.g. NFS).
1590         */
1591        if (file_in->f_op->clone_file_range) {
1592                ret = file_in->f_op->clone_file_range(file_in, pos_in,
1593                                file_out, pos_out, len);
1594                if (ret == 0) {
1595                        ret = len;
1596                        goto done;
1597                }
1598        }
1599
1600        if (file_out->f_op->copy_file_range) {
1601                ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
1602                                                      pos_out, len, flags);
1603                if (ret != -EOPNOTSUPP)
1604                        goto done;
1605        }
1606
1607        ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1608                        len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1609
1610done:
1611        if (ret > 0) {
1612                fsnotify_access(file_in);
1613                add_rchar(current, ret);
1614                fsnotify_modify(file_out);
1615                add_wchar(current, ret);
1616        }
1617
1618        inc_syscr(current);
1619        inc_syscw(current);
1620
1621        file_end_write(file_out);
1622
1623        return ret;
1624}
1625EXPORT_SYMBOL(vfs_copy_file_range);
1626
1627SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1628                int, fd_out, loff_t __user *, off_out,
1629                size_t, len, unsigned int, flags)
1630{
1631        loff_t pos_in;
1632        loff_t pos_out;
1633        struct fd f_in;
1634        struct fd f_out;
1635        ssize_t ret = -EBADF;
1636
1637        f_in = fdget(fd_in);
1638        if (!f_in.file)
1639                goto out2;
1640
1641        f_out = fdget(fd_out);
1642        if (!f_out.file)
1643                goto out1;
1644
1645        ret = -EFAULT;
1646        if (off_in) {
1647                if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1648                        goto out;
1649        } else {
1650                pos_in = f_in.file->f_pos;
1651        }
1652
1653        if (off_out) {
1654                if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1655                        goto out;
1656        } else {
1657                pos_out = f_out.file->f_pos;
1658        }
1659
1660        ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
1661                                  flags);
1662        if (ret > 0) {
1663                pos_in += ret;
1664                pos_out += ret;
1665
1666                if (off_in) {
1667                        if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1668                                ret = -EFAULT;
1669                } else {
1670                        f_in.file->f_pos = pos_in;
1671                }
1672
1673                if (off_out) {
1674                        if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1675                                ret = -EFAULT;
1676                } else {
1677                        f_out.file->f_pos = pos_out;
1678                }
1679        }
1680
1681out:
1682        fdput(f_out);
1683out1:
1684        fdput(f_in);
1685out2:
1686        return ret;
1687}
1688
1689static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
1690{
1691        struct inode *inode = file_inode(file);
1692
1693        if (unlikely(pos < 0))
1694                return -EINVAL;
1695
1696         if (unlikely((loff_t) (pos + len) < 0))
1697                return -EINVAL;
1698
1699        if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
1700                loff_t end = len ? pos + len - 1 : OFFSET_MAX;
1701                int retval;
1702
1703                retval = locks_mandatory_area(inode, file, pos, end,
1704                                write ? F_WRLCK : F_RDLCK);
1705                if (retval < 0)
1706                        return retval;
1707        }
1708
1709        return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
1710}
1711
1712/*
1713 * Check that the two inodes are eligible for cloning, the ranges make
1714 * sense, and then flush all dirty data.  Caller must ensure that the
1715 * inodes have been locked against any other modifications.
1716 *
1717 * Returns: 0 for "nothing to clone", 1 for "something to clone", or
1718 * the usual negative error code.
1719 */
1720int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
1721                               struct inode *inode_out, loff_t pos_out,
1722                               u64 *len, bool is_dedupe)
1723{
1724        loff_t bs = inode_out->i_sb->s_blocksize;
1725        loff_t blen;
1726        loff_t isize;
1727        bool same_inode = (inode_in == inode_out);
1728        int ret;
1729
1730        /* Don't touch certain kinds of inodes */
1731        if (IS_IMMUTABLE(inode_out))
1732                return -EPERM;
1733
1734        if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1735                return -ETXTBSY;
1736
1737        /* Don't reflink dirs, pipes, sockets... */
1738        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1739                return -EISDIR;
1740        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1741                return -EINVAL;
1742
1743        /* Are we going all the way to the end? */
1744        isize = i_size_read(inode_in);
1745        if (isize == 0)
1746                return 0;
1747
1748        /* Zero length dedupe exits immediately; reflink goes to EOF. */
1749        if (*len == 0) {
1750                if (is_dedupe || pos_in == isize)
1751                        return 0;
1752                if (pos_in > isize)
1753                        return -EINVAL;
1754                *len = isize - pos_in;
1755        }
1756
1757        /* Ensure offsets don't wrap and the input is inside i_size */
1758        if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
1759            pos_in + *len > isize)
1760                return -EINVAL;
1761
1762        /* Don't allow dedupe past EOF in the dest file */
1763        if (is_dedupe) {
1764                loff_t  disize;
1765
1766                disize = i_size_read(inode_out);
1767                if (pos_out >= disize || pos_out + *len > disize)
1768                        return -EINVAL;
1769        }
1770
1771        /* If we're linking to EOF, continue to the block boundary. */
1772        if (pos_in + *len == isize)
1773                blen = ALIGN(isize, bs) - pos_in;
1774        else
1775                blen = *len;
1776
1777        /* Only reflink if we're aligned to block boundaries */
1778        if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
1779            !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
1780                return -EINVAL;
1781
1782        /* Don't allow overlapped reflink within the same file */
1783        if (same_inode) {
1784                if (pos_out + blen > pos_in && pos_out < pos_in + blen)
1785                        return -EINVAL;
1786        }
1787
1788        /* Wait for the completion of any pending IOs on both files */
1789        inode_dio_wait(inode_in);
1790        if (!same_inode)
1791                inode_dio_wait(inode_out);
1792
1793        ret = filemap_write_and_wait_range(inode_in->i_mapping,
1794                        pos_in, pos_in + *len - 1);
1795        if (ret)
1796                return ret;
1797
1798        ret = filemap_write_and_wait_range(inode_out->i_mapping,
1799                        pos_out, pos_out + *len - 1);
1800        if (ret)
1801                return ret;
1802
1803        /*
1804         * Check that the extents are the same.
1805         */
1806        if (is_dedupe) {
1807                bool            is_same = false;
1808
1809                ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
1810                                inode_out, pos_out, *len, &is_same);
1811                if (ret)
1812                        return ret;
1813                if (!is_same)
1814                        return -EBADE;
1815        }
1816
1817        return 1;
1818}
1819EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
1820
1821int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
1822                struct file *file_out, loff_t pos_out, u64 len)
1823{
1824        struct inode *inode_in = file_inode(file_in);
1825        struct inode *inode_out = file_inode(file_out);
1826        int ret;
1827
1828        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1829                return -EISDIR;
1830        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1831                return -EINVAL;
1832
1833        /*
1834         * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
1835         * the same mount. Practically, they only need to be on the same file
1836         * system.
1837         */
1838        if (inode_in->i_sb != inode_out->i_sb)
1839                return -EXDEV;
1840
1841        if (!(file_in->f_mode & FMODE_READ) ||
1842            !(file_out->f_mode & FMODE_WRITE) ||
1843            (file_out->f_flags & O_APPEND))
1844                return -EBADF;
1845
1846        if (!file_in->f_op->clone_file_range)
1847                return -EOPNOTSUPP;
1848
1849        ret = clone_verify_area(file_in, pos_in, len, false);
1850        if (ret)
1851                return ret;
1852
1853        ret = clone_verify_area(file_out, pos_out, len, true);
1854        if (ret)
1855                return ret;
1856
1857        if (pos_in + len > i_size_read(inode_in))
1858                return -EINVAL;
1859
1860        ret = file_in->f_op->clone_file_range(file_in, pos_in,
1861                        file_out, pos_out, len);
1862        if (!ret) {
1863                fsnotify_access(file_in);
1864                fsnotify_modify(file_out);
1865        }
1866
1867        return ret;
1868}
1869EXPORT_SYMBOL(vfs_clone_file_range);
1870
1871/*
1872 * Read a page's worth of file data into the page cache.  Return the page
1873 * locked.
1874 */
1875static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
1876{
1877        struct address_space *mapping;
1878        struct page *page;
1879        pgoff_t n;
1880
1881        n = offset >> PAGE_SHIFT;
1882        mapping = inode->i_mapping;
1883        page = read_mapping_page(mapping, n, NULL);
1884        if (IS_ERR(page))
1885                return page;
1886        if (!PageUptodate(page)) {
1887                put_page(page);
1888                return ERR_PTR(-EIO);
1889        }
1890        lock_page(page);
1891        return page;
1892}
1893
1894/*
1895 * Compare extents of two files to see if they are the same.
1896 * Caller must have locked both inodes to prevent write races.
1897 */
1898int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
1899                                  struct inode *dest, loff_t destoff,
1900                                  loff_t len, bool *is_same)
1901{
1902        loff_t src_poff;
1903        loff_t dest_poff;
1904        void *src_addr;
1905        void *dest_addr;
1906        struct page *src_page;
1907        struct page *dest_page;
1908        loff_t cmp_len;
1909        bool same;
1910        int error;
1911
1912        error = -EINVAL;
1913        same = true;
1914        while (len) {
1915                src_poff = srcoff & (PAGE_SIZE - 1);
1916                dest_poff = destoff & (PAGE_SIZE - 1);
1917                cmp_len = min(PAGE_SIZE - src_poff,
1918                              PAGE_SIZE - dest_poff);
1919                cmp_len = min(cmp_len, len);
1920                if (cmp_len <= 0)
1921                        goto out_error;
1922
1923                src_page = vfs_dedupe_get_page(src, srcoff);
1924                if (IS_ERR(src_page)) {
1925                        error = PTR_ERR(src_page);
1926                        goto out_error;
1927                }
1928                dest_page = vfs_dedupe_get_page(dest, destoff);
1929                if (IS_ERR(dest_page)) {
1930                        error = PTR_ERR(dest_page);
1931                        unlock_page(src_page);
1932                        put_page(src_page);
1933                        goto out_error;
1934                }
1935                src_addr = kmap_atomic(src_page);
1936                dest_addr = kmap_atomic(dest_page);
1937
1938                flush_dcache_page(src_page);
1939                flush_dcache_page(dest_page);
1940
1941                if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
1942                        same = false;
1943
1944                kunmap_atomic(dest_addr);
1945                kunmap_atomic(src_addr);
1946                unlock_page(dest_page);
1947                unlock_page(src_page);
1948                put_page(dest_page);
1949                put_page(src_page);
1950
1951                if (!same)
1952                        break;
1953
1954                srcoff += cmp_len;
1955                destoff += cmp_len;
1956                len -= cmp_len;
1957        }
1958
1959        *is_same = same;
1960        return 0;
1961
1962out_error:
1963        return error;
1964}
1965EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
1966
1967int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
1968{
1969        struct file_dedupe_range_info *info;
1970        struct inode *src = file_inode(file);
1971        u64 off;
1972        u64 len;
1973        int i;
1974        int ret;
1975        bool is_admin = capable(CAP_SYS_ADMIN);
1976        u16 count = same->dest_count;
1977        struct file *dst_file;
1978        loff_t dst_off;
1979        ssize_t deduped;
1980
1981        if (!(file->f_mode & FMODE_READ))
1982                return -EINVAL;
1983
1984        if (same->reserved1 || same->reserved2)
1985                return -EINVAL;
1986
1987        off = same->src_offset;
1988        len = same->src_length;
1989
1990        ret = -EISDIR;
1991        if (S_ISDIR(src->i_mode))
1992                goto out;
1993
1994        ret = -EINVAL;
1995        if (!S_ISREG(src->i_mode))
1996                goto out;
1997
1998        ret = clone_verify_area(file, off, len, false);
1999        if (ret < 0)
2000                goto out;
2001        ret = 0;
2002
2003        if (off + len > i_size_read(src))
2004                return -EINVAL;
2005
2006        /* pre-format output fields to sane values */
2007        for (i = 0; i < count; i++) {
2008                same->info[i].bytes_deduped = 0ULL;
2009                same->info[i].status = FILE_DEDUPE_RANGE_SAME;
2010        }
2011
2012        for (i = 0, info = same->info; i < count; i++, info++) {
2013                struct inode *dst;
2014                struct fd dst_fd = fdget(info->dest_fd);
2015
2016                dst_file = dst_fd.file;
2017                if (!dst_file) {
2018                        info->status = -EBADF;
2019                        goto next_loop;
2020                }
2021                dst = file_inode(dst_file);
2022
2023                ret = mnt_want_write_file(dst_file);
2024                if (ret) {
2025                        info->status = ret;
2026                        goto next_loop;
2027                }
2028
2029                dst_off = info->dest_offset;
2030                ret = clone_verify_area(dst_file, dst_off, len, true);
2031                if (ret < 0) {
2032                        info->status = ret;
2033                        goto next_file;
2034                }
2035                ret = 0;
2036
2037                if (info->reserved) {
2038                        info->status = -EINVAL;
2039                } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
2040                        info->status = -EINVAL;
2041                } else if (file->f_path.mnt != dst_file->f_path.mnt) {
2042                        info->status = -EXDEV;
2043                } else if (S_ISDIR(dst->i_mode)) {
2044                        info->status = -EISDIR;
2045                } else if (dst_file->f_op->dedupe_file_range == NULL) {
2046                        info->status = -EINVAL;
2047                } else {
2048                        deduped = dst_file->f_op->dedupe_file_range(file, off,
2049                                                        len, dst_file,
2050                                                        info->dest_offset);
2051                        if (deduped == -EBADE)
2052                                info->status = FILE_DEDUPE_RANGE_DIFFERS;
2053                        else if (deduped < 0)
2054                                info->status = deduped;
2055                        else
2056                                info->bytes_deduped += deduped;
2057                }
2058
2059next_file:
2060                mnt_drop_write_file(dst_file);
2061next_loop:
2062                fdput(dst_fd);
2063
2064                if (fatal_signal_pending(current))
2065                        goto out;
2066        }
2067
2068out:
2069        return ret;
2070}
2071EXPORT_SYMBOL(vfs_dedupe_file_range);
2072