linux/fs/read_write.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/read_write.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7#include <linux/slab.h> 
   8#include <linux/stat.h>
   9#include <linux/fcntl.h>
  10#include <linux/file.h>
  11#include <linux/uio.h>
  12#include <linux/aio.h>
  13#include <linux/fsnotify.h>
  14#include <linux/security.h>
  15#include <linux/export.h>
  16#include <linux/syscalls.h>
  17#include <linux/pagemap.h>
  18#include <linux/splice.h>
  19#include <linux/compat.h>
  20#include "internal.h"
  21
  22#include <asm/uaccess.h>
  23#include <asm/unistd.h>
  24
  25typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
  26typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
  27                unsigned long, loff_t);
  28
  29const struct file_operations generic_ro_fops = {
  30        .llseek         = generic_file_llseek,
  31        .read           = do_sync_read,
  32        .aio_read       = generic_file_aio_read,
  33        .mmap           = generic_file_readonly_mmap,
  34        .splice_read    = generic_file_splice_read,
  35};
  36
  37EXPORT_SYMBOL(generic_ro_fops);
  38
  39static inline int unsigned_offsets(struct file *file)
  40{
  41        return file->f_mode & FMODE_UNSIGNED_OFFSET;
  42}
  43
  44/**
  45 * vfs_setpos - update the file offset for lseek
  46 * @file:       file structure in question
  47 * @offset:     file offset to seek to
  48 * @maxsize:    maximum file size
  49 *
  50 * This is a low-level filesystem helper for updating the file offset to
  51 * the value specified by @offset if the given offset is valid and it is
  52 * not equal to the current file offset.
  53 *
  54 * Return the specified offset on success and -EINVAL on invalid offset.
  55 */
  56loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  57{
  58        if (offset < 0 && !unsigned_offsets(file))
  59                return -EINVAL;
  60        if (offset > maxsize)
  61                return -EINVAL;
  62
  63        if (offset != file->f_pos) {
  64                file->f_pos = offset;
  65                file->f_version = 0;
  66        }
  67        return offset;
  68}
  69EXPORT_SYMBOL(vfs_setpos);
  70
  71/**
  72 * generic_file_llseek_size - generic llseek implementation for regular files
  73 * @file:       file structure to seek on
  74 * @offset:     file offset to seek to
  75 * @whence:     type of seek
  76 * @size:       max size of this file in file system
  77 * @eof:        offset used for SEEK_END position
  78 *
  79 * This is a variant of generic_file_llseek that allows passing in a custom
  80 * maximum file size and a custom EOF position, for e.g. hashed directories
  81 *
  82 * Synchronization:
  83 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  84 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  85 * read/writes behave like SEEK_SET against seeks.
  86 */
  87loff_t
  88generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  89                loff_t maxsize, loff_t eof)
  90{
  91        switch (whence) {
  92        case SEEK_END:
  93                offset += eof;
  94                break;
  95        case SEEK_CUR:
  96                /*
  97                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  98                 * position-querying operation.  Avoid rewriting the "same"
  99                 * f_pos value back to the file because a concurrent read(),
 100                 * write() or lseek() might have altered it
 101                 */
 102                if (offset == 0)
 103                        return file->f_pos;
 104                /*
 105                 * f_lock protects against read/modify/write race with other
 106                 * SEEK_CURs. Note that parallel writes and reads behave
 107                 * like SEEK_SET.
 108                 */
 109                spin_lock(&file->f_lock);
 110                offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 111                spin_unlock(&file->f_lock);
 112                return offset;
 113        case SEEK_DATA:
 114                /*
 115                 * In the generic case the entire file is data, so as long as
 116                 * offset isn't at the end of the file then the offset is data.
 117                 */
 118                if (offset >= eof)
 119                        return -ENXIO;
 120                break;
 121        case SEEK_HOLE:
 122                /*
 123                 * There is a virtual hole at the end of the file, so as long as
 124                 * offset isn't i_size or larger, return i_size.
 125                 */
 126                if (offset >= eof)
 127                        return -ENXIO;
 128                offset = eof;
 129                break;
 130        }
 131
 132        return vfs_setpos(file, offset, maxsize);
 133}
 134EXPORT_SYMBOL(generic_file_llseek_size);
 135
 136/**
 137 * generic_file_llseek - generic llseek implementation for regular files
 138 * @file:       file structure to seek on
 139 * @offset:     file offset to seek to
 140 * @whence:     type of seek
 141 *
 142 * This is a generic implemenation of ->llseek useable for all normal local
 143 * filesystems.  It just updates the file offset to the value specified by
 144 * @offset and @whence.
 145 */
 146loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 147{
 148        struct inode *inode = file->f_mapping->host;
 149
 150        return generic_file_llseek_size(file, offset, whence,
 151                                        inode->i_sb->s_maxbytes,
 152                                        i_size_read(inode));
 153}
 154EXPORT_SYMBOL(generic_file_llseek);
 155
 156/**
 157 * fixed_size_llseek - llseek implementation for fixed-sized devices
 158 * @file:       file structure to seek on
 159 * @offset:     file offset to seek to
 160 * @whence:     type of seek
 161 * @size:       size of the file
 162 *
 163 */
 164loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 165{
 166        switch (whence) {
 167        case SEEK_SET: case SEEK_CUR: case SEEK_END:
 168                return generic_file_llseek_size(file, offset, whence,
 169                                                size, size);
 170        default:
 171                return -EINVAL;
 172        }
 173}
 174EXPORT_SYMBOL(fixed_size_llseek);
 175
 176/**
 177 * noop_llseek - No Operation Performed llseek implementation
 178 * @file:       file structure to seek on
 179 * @offset:     file offset to seek to
 180 * @whence:     type of seek
 181 *
 182 * This is an implementation of ->llseek useable for the rare special case when
 183 * userspace expects the seek to succeed but the (device) file is actually not
 184 * able to perform the seek. In this case you use noop_llseek() instead of
 185 * falling back to the default implementation of ->llseek.
 186 */
 187loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 188{
 189        return file->f_pos;
 190}
 191EXPORT_SYMBOL(noop_llseek);
 192
 193loff_t no_llseek(struct file *file, loff_t offset, int whence)
 194{
 195        return -ESPIPE;
 196}
 197EXPORT_SYMBOL(no_llseek);
 198
 199loff_t default_llseek(struct file *file, loff_t offset, int whence)
 200{
 201        struct inode *inode = file_inode(file);
 202        loff_t retval;
 203
 204        mutex_lock(&inode->i_mutex);
 205        switch (whence) {
 206                case SEEK_END:
 207                        offset += i_size_read(inode);
 208                        break;
 209                case SEEK_CUR:
 210                        if (offset == 0) {
 211                                retval = file->f_pos;
 212                                goto out;
 213                        }
 214                        offset += file->f_pos;
 215                        break;
 216                case SEEK_DATA:
 217                        /*
 218                         * In the generic case the entire file is data, so as
 219                         * long as offset isn't at the end of the file then the
 220                         * offset is data.
 221                         */
 222                        if (offset >= inode->i_size) {
 223                                retval = -ENXIO;
 224                                goto out;
 225                        }
 226                        break;
 227                case SEEK_HOLE:
 228                        /*
 229                         * There is a virtual hole at the end of the file, so
 230                         * as long as offset isn't i_size or larger, return
 231                         * i_size.
 232                         */
 233                        if (offset >= inode->i_size) {
 234                                retval = -ENXIO;
 235                                goto out;
 236                        }
 237                        offset = inode->i_size;
 238                        break;
 239        }
 240        retval = -EINVAL;
 241        if (offset >= 0 || unsigned_offsets(file)) {
 242                if (offset != file->f_pos) {
 243                        file->f_pos = offset;
 244                        file->f_version = 0;
 245                }
 246                retval = offset;
 247        }
 248out:
 249        mutex_unlock(&inode->i_mutex);
 250        return retval;
 251}
 252EXPORT_SYMBOL(default_llseek);
 253
 254loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 255{
 256        loff_t (*fn)(struct file *, loff_t, int);
 257
 258        fn = no_llseek;
 259        if (file->f_mode & FMODE_LSEEK) {
 260                if (file->f_op && file->f_op->llseek)
 261                        fn = file->f_op->llseek;
 262        }
 263        return fn(file, offset, whence);
 264}
 265EXPORT_SYMBOL(vfs_llseek);
 266
 267SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 268{
 269        off_t retval;
 270        struct fd f = fdget(fd);
 271        if (!f.file)
 272                return -EBADF;
 273
 274        retval = -EINVAL;
 275        if (whence <= SEEK_MAX) {
 276                loff_t res = vfs_llseek(f.file, offset, whence);
 277                retval = res;
 278                if (res != (loff_t)retval)
 279                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
 280        }
 281        fdput(f);
 282        return retval;
 283}
 284
 285#ifdef CONFIG_COMPAT
 286COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
 287{
 288        return sys_lseek(fd, offset, whence);
 289}
 290#endif
 291
 292#ifdef __ARCH_WANT_SYS_LLSEEK
 293SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 294                unsigned long, offset_low, loff_t __user *, result,
 295                unsigned int, whence)
 296{
 297        int retval;
 298        struct fd f = fdget(fd);
 299        loff_t offset;
 300
 301        if (!f.file)
 302                return -EBADF;
 303
 304        retval = -EINVAL;
 305        if (whence > SEEK_MAX)
 306                goto out_putf;
 307
 308        offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 309                        whence);
 310
 311        retval = (int)offset;
 312        if (offset >= 0) {
 313                retval = -EFAULT;
 314                if (!copy_to_user(result, &offset, sizeof(offset)))
 315                        retval = 0;
 316        }
 317out_putf:
 318        fdput(f);
 319        return retval;
 320}
 321#endif
 322
 323/*
 324 * rw_verify_area doesn't like huge counts. We limit
 325 * them to something that fits in "int" so that others
 326 * won't have to do range checks all the time.
 327 */
 328int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 329{
 330        struct inode *inode;
 331        loff_t pos;
 332        int retval = -EINVAL;
 333
 334        inode = file_inode(file);
 335        if (unlikely((ssize_t) count < 0))
 336                return retval;
 337        pos = *ppos;
 338        if (unlikely(pos < 0)) {
 339                if (!unsigned_offsets(file))
 340                        return retval;
 341                if (count >= -pos) /* both values are in 0..LLONG_MAX */
 342                        return -EOVERFLOW;
 343        } else if (unlikely((loff_t) (pos + count) < 0)) {
 344                if (!unsigned_offsets(file))
 345                        return retval;
 346        }
 347
 348        if (unlikely(inode->i_flock && mandatory_lock(inode))) {
 349                retval = locks_mandatory_area(
 350                        read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
 351                        inode, file, pos, count);
 352                if (retval < 0)
 353                        return retval;
 354        }
 355        retval = security_file_permission(file,
 356                                read_write == READ ? MAY_READ : MAY_WRITE);
 357        if (retval)
 358                return retval;
 359        return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
 360}
 361
 362ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 363{
 364        struct iovec iov = { .iov_base = buf, .iov_len = len };
 365        struct kiocb kiocb;
 366        ssize_t ret;
 367
 368        init_sync_kiocb(&kiocb, filp);
 369        kiocb.ki_pos = *ppos;
 370        kiocb.ki_nbytes = len;
 371
 372        ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
 373        if (-EIOCBQUEUED == ret)
 374                ret = wait_on_sync_kiocb(&kiocb);
 375        *ppos = kiocb.ki_pos;
 376        return ret;
 377}
 378
 379EXPORT_SYMBOL(do_sync_read);
 380
 381ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 382{
 383        ssize_t ret;
 384
 385        if (!(file->f_mode & FMODE_READ))
 386                return -EBADF;
 387        if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
 388                return -EINVAL;
 389        if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 390                return -EFAULT;
 391
 392        ret = rw_verify_area(READ, file, pos, count);
 393        if (ret >= 0) {
 394                count = ret;
 395                if (file->f_op->read)
 396                        ret = file->f_op->read(file, buf, count, pos);
 397                else
 398                        ret = do_sync_read(file, buf, count, pos);
 399                if (ret > 0) {
 400                        fsnotify_access(file);
 401                        add_rchar(current, ret);
 402                }
 403                inc_syscr(current);
 404        }
 405
 406        return ret;
 407}
 408
 409EXPORT_SYMBOL(vfs_read);
 410
 411ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 412{
 413        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 414        struct kiocb kiocb;
 415        ssize_t ret;
 416
 417        init_sync_kiocb(&kiocb, filp);
 418        kiocb.ki_pos = *ppos;
 419        kiocb.ki_nbytes = len;
 420
 421        ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
 422        if (-EIOCBQUEUED == ret)
 423                ret = wait_on_sync_kiocb(&kiocb);
 424        *ppos = kiocb.ki_pos;
 425        return ret;
 426}
 427
 428EXPORT_SYMBOL(do_sync_write);
 429
 430ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
 431{
 432        mm_segment_t old_fs;
 433        const char __user *p;
 434        ssize_t ret;
 435
 436        if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
 437                return -EINVAL;
 438
 439        old_fs = get_fs();
 440        set_fs(get_ds());
 441        p = (__force const char __user *)buf;
 442        if (count > MAX_RW_COUNT)
 443                count =  MAX_RW_COUNT;
 444        if (file->f_op->write)
 445                ret = file->f_op->write(file, p, count, pos);
 446        else
 447                ret = do_sync_write(file, p, count, pos);
 448        set_fs(old_fs);
 449        if (ret > 0) {
 450                fsnotify_modify(file);
 451                add_wchar(current, ret);
 452        }
 453        inc_syscw(current);
 454        return ret;
 455}
 456
 457ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 458{
 459        ssize_t ret;
 460
 461        if (!(file->f_mode & FMODE_WRITE))
 462                return -EBADF;
 463        if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
 464                return -EINVAL;
 465        if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 466                return -EFAULT;
 467
 468        ret = rw_verify_area(WRITE, file, pos, count);
 469        if (ret >= 0) {
 470                count = ret;
 471                file_start_write(file);
 472                if (file->f_op->write)
 473                        ret = file->f_op->write(file, buf, count, pos);
 474                else
 475                        ret = do_sync_write(file, buf, count, pos);
 476                if (ret > 0) {
 477                        fsnotify_modify(file);
 478                        add_wchar(current, ret);
 479                }
 480                inc_syscw(current);
 481                file_end_write(file);
 482        }
 483
 484        return ret;
 485}
 486
 487EXPORT_SYMBOL(vfs_write);
 488
 489static inline loff_t file_pos_read(struct file *file)
 490{
 491        return file->f_pos;
 492}
 493
 494static inline void file_pos_write(struct file *file, loff_t pos)
 495{
 496        file->f_pos = pos;
 497}
 498
 499SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 500{
 501        struct fd f = fdget(fd);
 502        ssize_t ret = -EBADF;
 503
 504        if (f.file) {
 505                loff_t pos = file_pos_read(f.file);
 506                ret = vfs_read(f.file, buf, count, &pos);
 507                if (ret >= 0)
 508                        file_pos_write(f.file, pos);
 509                fdput(f);
 510        }
 511        return ret;
 512}
 513
 514SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 515                size_t, count)
 516{
 517        struct fd f = fdget(fd);
 518        ssize_t ret = -EBADF;
 519
 520        if (f.file) {
 521                loff_t pos = file_pos_read(f.file);
 522                ret = vfs_write(f.file, buf, count, &pos);
 523                if (ret >= 0)
 524                        file_pos_write(f.file, pos);
 525                fdput(f);
 526        }
 527
 528        return ret;
 529}
 530
 531SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 532                        size_t, count, loff_t, pos)
 533{
 534        struct fd f;
 535        ssize_t ret = -EBADF;
 536
 537        if (pos < 0)
 538                return -EINVAL;
 539
 540        f = fdget(fd);
 541        if (f.file) {
 542                ret = -ESPIPE;
 543                if (f.file->f_mode & FMODE_PREAD)
 544                        ret = vfs_read(f.file, buf, count, &pos);
 545                fdput(f);
 546        }
 547
 548        return ret;
 549}
 550
 551SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 552                         size_t, count, loff_t, pos)
 553{
 554        struct fd f;
 555        ssize_t ret = -EBADF;
 556
 557        if (pos < 0)
 558                return -EINVAL;
 559
 560        f = fdget(fd);
 561        if (f.file) {
 562                ret = -ESPIPE;
 563                if (f.file->f_mode & FMODE_PWRITE)  
 564                        ret = vfs_write(f.file, buf, count, &pos);
 565                fdput(f);
 566        }
 567
 568        return ret;
 569}
 570
 571/*
 572 * Reduce an iovec's length in-place.  Return the resulting number of segments
 573 */
 574unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 575{
 576        unsigned long seg = 0;
 577        size_t len = 0;
 578
 579        while (seg < nr_segs) {
 580                seg++;
 581                if (len + iov->iov_len >= to) {
 582                        iov->iov_len = to - len;
 583                        break;
 584                }
 585                len += iov->iov_len;
 586                iov++;
 587        }
 588        return seg;
 589}
 590EXPORT_SYMBOL(iov_shorten);
 591
 592static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 593                unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
 594{
 595        struct kiocb kiocb;
 596        ssize_t ret;
 597
 598        init_sync_kiocb(&kiocb, filp);
 599        kiocb.ki_pos = *ppos;
 600        kiocb.ki_nbytes = len;
 601
 602        ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
 603        if (ret == -EIOCBQUEUED)
 604                ret = wait_on_sync_kiocb(&kiocb);
 605        *ppos = kiocb.ki_pos;
 606        return ret;
 607}
 608
 609/* Do it by hand, with file-ops */
 610static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
 611                unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
 612{
 613        struct iovec *vector = iov;
 614        ssize_t ret = 0;
 615
 616        while (nr_segs > 0) {
 617                void __user *base;
 618                size_t len;
 619                ssize_t nr;
 620
 621                base = vector->iov_base;
 622                len = vector->iov_len;
 623                vector++;
 624                nr_segs--;
 625
 626                nr = fn(filp, base, len, ppos);
 627
 628                if (nr < 0) {
 629                        if (!ret)
 630                                ret = nr;
 631                        break;
 632                }
 633                ret += nr;
 634                if (nr != len)
 635                        break;
 636        }
 637
 638        return ret;
 639}
 640
 641/* A write operation does a read from user space and vice versa */
 642#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 643
 644ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 645                              unsigned long nr_segs, unsigned long fast_segs,
 646                              struct iovec *fast_pointer,
 647                              struct iovec **ret_pointer)
 648{
 649        unsigned long seg;
 650        ssize_t ret;
 651        struct iovec *iov = fast_pointer;
 652
 653        /*
 654         * SuS says "The readv() function *may* fail if the iovcnt argument
 655         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 656         * traditionally returned zero for zero segments, so...
 657         */
 658        if (nr_segs == 0) {
 659                ret = 0;
 660                goto out;
 661        }
 662
 663        /*
 664         * First get the "struct iovec" from user memory and
 665         * verify all the pointers
 666         */
 667        if (nr_segs > UIO_MAXIOV) {
 668                ret = -EINVAL;
 669                goto out;
 670        }
 671        if (nr_segs > fast_segs) {
 672                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 673                if (iov == NULL) {
 674                        ret = -ENOMEM;
 675                        goto out;
 676                }
 677        }
 678        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 679                ret = -EFAULT;
 680                goto out;
 681        }
 682
 683        /*
 684         * According to the Single Unix Specification we should return EINVAL
 685         * if an element length is < 0 when cast to ssize_t or if the
 686         * total length would overflow the ssize_t return value of the
 687         * system call.
 688         *
 689         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 690         * overflow case.
 691         */
 692        ret = 0;
 693        for (seg = 0; seg < nr_segs; seg++) {
 694                void __user *buf = iov[seg].iov_base;
 695                ssize_t len = (ssize_t)iov[seg].iov_len;
 696
 697                /* see if we we're about to use an invalid len or if
 698                 * it's about to overflow ssize_t */
 699                if (len < 0) {
 700                        ret = -EINVAL;
 701                        goto out;
 702                }
 703                if (type >= 0
 704                    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 705                        ret = -EFAULT;
 706                        goto out;
 707                }
 708                if (len > MAX_RW_COUNT - ret) {
 709                        len = MAX_RW_COUNT - ret;
 710                        iov[seg].iov_len = len;
 711                }
 712                ret += len;
 713        }
 714out:
 715        *ret_pointer = iov;
 716        return ret;
 717}
 718
 719static ssize_t do_readv_writev(int type, struct file *file,
 720                               const struct iovec __user * uvector,
 721                               unsigned long nr_segs, loff_t *pos)
 722{
 723        size_t tot_len;
 724        struct iovec iovstack[UIO_FASTIOV];
 725        struct iovec *iov = iovstack;
 726        ssize_t ret;
 727        io_fn_t fn;
 728        iov_fn_t fnv;
 729
 730        if (!file->f_op) {
 731                ret = -EINVAL;
 732                goto out;
 733        }
 734
 735        ret = rw_copy_check_uvector(type, uvector, nr_segs,
 736                                    ARRAY_SIZE(iovstack), iovstack, &iov);
 737        if (ret <= 0)
 738                goto out;
 739
 740        tot_len = ret;
 741        ret = rw_verify_area(type, file, pos, tot_len);
 742        if (ret < 0)
 743                goto out;
 744
 745        fnv = NULL;
 746        if (type == READ) {
 747                fn = file->f_op->read;
 748                fnv = file->f_op->aio_read;
 749        } else {
 750                fn = (io_fn_t)file->f_op->write;
 751                fnv = file->f_op->aio_write;
 752                file_start_write(file);
 753        }
 754
 755        if (fnv)
 756                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
 757                                                pos, fnv);
 758        else
 759                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 760
 761        if (type != READ)
 762                file_end_write(file);
 763
 764out:
 765        if (iov != iovstack)
 766                kfree(iov);
 767        if ((ret + (type == READ)) > 0) {
 768                if (type == READ)
 769                        fsnotify_access(file);
 770                else
 771                        fsnotify_modify(file);
 772        }
 773        return ret;
 774}
 775
 776ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 777                  unsigned long vlen, loff_t *pos)
 778{
 779        if (!(file->f_mode & FMODE_READ))
 780                return -EBADF;
 781        if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
 782                return -EINVAL;
 783
 784        return do_readv_writev(READ, file, vec, vlen, pos);
 785}
 786
 787EXPORT_SYMBOL(vfs_readv);
 788
 789ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 790                   unsigned long vlen, loff_t *pos)
 791{
 792        if (!(file->f_mode & FMODE_WRITE))
 793                return -EBADF;
 794        if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
 795                return -EINVAL;
 796
 797        return do_readv_writev(WRITE, file, vec, vlen, pos);
 798}
 799
 800EXPORT_SYMBOL(vfs_writev);
 801
 802SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 803                unsigned long, vlen)
 804{
 805        struct fd f = fdget(fd);
 806        ssize_t ret = -EBADF;
 807
 808        if (f.file) {
 809                loff_t pos = file_pos_read(f.file);
 810                ret = vfs_readv(f.file, vec, vlen, &pos);
 811                if (ret >= 0)
 812                        file_pos_write(f.file, pos);
 813                fdput(f);
 814        }
 815
 816        if (ret > 0)
 817                add_rchar(current, ret);
 818        inc_syscr(current);
 819        return ret;
 820}
 821
 822SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 823                unsigned long, vlen)
 824{
 825        struct fd f = fdget(fd);
 826        ssize_t ret = -EBADF;
 827
 828        if (f.file) {
 829                loff_t pos = file_pos_read(f.file);
 830                ret = vfs_writev(f.file, vec, vlen, &pos);
 831                if (ret >= 0)
 832                        file_pos_write(f.file, pos);
 833                fdput(f);
 834        }
 835
 836        if (ret > 0)
 837                add_wchar(current, ret);
 838        inc_syscw(current);
 839        return ret;
 840}
 841
 842static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
 843{
 844#define HALF_LONG_BITS (BITS_PER_LONG / 2)
 845        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
 846}
 847
 848SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 849                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 850{
 851        loff_t pos = pos_from_hilo(pos_h, pos_l);
 852        struct fd f;
 853        ssize_t ret = -EBADF;
 854
 855        if (pos < 0)
 856                return -EINVAL;
 857
 858        f = fdget(fd);
 859        if (f.file) {
 860                ret = -ESPIPE;
 861                if (f.file->f_mode & FMODE_PREAD)
 862                        ret = vfs_readv(f.file, vec, vlen, &pos);
 863                fdput(f);
 864        }
 865
 866        if (ret > 0)
 867                add_rchar(current, ret);
 868        inc_syscr(current);
 869        return ret;
 870}
 871
 872SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 873                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 874{
 875        loff_t pos = pos_from_hilo(pos_h, pos_l);
 876        struct fd f;
 877        ssize_t ret = -EBADF;
 878
 879        if (pos < 0)
 880                return -EINVAL;
 881
 882        f = fdget(fd);
 883        if (f.file) {
 884                ret = -ESPIPE;
 885                if (f.file->f_mode & FMODE_PWRITE)
 886                        ret = vfs_writev(f.file, vec, vlen, &pos);
 887                fdput(f);
 888        }
 889
 890        if (ret > 0)
 891                add_wchar(current, ret);
 892        inc_syscw(current);
 893        return ret;
 894}
 895
 896#ifdef CONFIG_COMPAT
 897
 898static ssize_t compat_do_readv_writev(int type, struct file *file,
 899                               const struct compat_iovec __user *uvector,
 900                               unsigned long nr_segs, loff_t *pos)
 901{
 902        compat_ssize_t tot_len;
 903        struct iovec iovstack[UIO_FASTIOV];
 904        struct iovec *iov = iovstack;
 905        ssize_t ret;
 906        io_fn_t fn;
 907        iov_fn_t fnv;
 908
 909        ret = -EINVAL;
 910        if (!file->f_op)
 911                goto out;
 912
 913        ret = -EFAULT;
 914        if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
 915                goto out;
 916
 917        ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
 918                                               UIO_FASTIOV, iovstack, &iov);
 919        if (ret <= 0)
 920                goto out;
 921
 922        tot_len = ret;
 923        ret = rw_verify_area(type, file, pos, tot_len);
 924        if (ret < 0)
 925                goto out;
 926
 927        fnv = NULL;
 928        if (type == READ) {
 929                fn = file->f_op->read;
 930                fnv = file->f_op->aio_read;
 931        } else {
 932                fn = (io_fn_t)file->f_op->write;
 933                fnv = file->f_op->aio_write;
 934                file_start_write(file);
 935        }
 936
 937        if (fnv)
 938                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
 939                                                pos, fnv);
 940        else
 941                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 942
 943        if (type != READ)
 944                file_end_write(file);
 945
 946out:
 947        if (iov != iovstack)
 948                kfree(iov);
 949        if ((ret + (type == READ)) > 0) {
 950                if (type == READ)
 951                        fsnotify_access(file);
 952                else
 953                        fsnotify_modify(file);
 954        }
 955        return ret;
 956}
 957
 958static size_t compat_readv(struct file *file,
 959                           const struct compat_iovec __user *vec,
 960                           unsigned long vlen, loff_t *pos)
 961{
 962        ssize_t ret = -EBADF;
 963
 964        if (!(file->f_mode & FMODE_READ))
 965                goto out;
 966
 967        ret = -EINVAL;
 968        if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
 969                goto out;
 970
 971        ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
 972
 973out:
 974        if (ret > 0)
 975                add_rchar(current, ret);
 976        inc_syscr(current);
 977        return ret;
 978}
 979
 980COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
 981                const struct compat_iovec __user *,vec,
 982                unsigned long, vlen)
 983{
 984        struct fd f = fdget(fd);
 985        ssize_t ret;
 986        loff_t pos;
 987
 988        if (!f.file)
 989                return -EBADF;
 990        pos = f.file->f_pos;
 991        ret = compat_readv(f.file, vec, vlen, &pos);
 992        if (ret >= 0)
 993                f.file->f_pos = pos;
 994        fdput(f);
 995        return ret;
 996}
 997
 998COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
 999                const struct compat_iovec __user *,vec,
1000                unsigned long, vlen, loff_t, pos)
1001{
1002        struct fd f;
1003        ssize_t ret;
1004
1005        if (pos < 0)
1006                return -EINVAL;
1007        f = fdget(fd);
1008        if (!f.file)
1009                return -EBADF;
1010        ret = -ESPIPE;
1011        if (f.file->f_mode & FMODE_PREAD)
1012                ret = compat_readv(f.file, vec, vlen, &pos);
1013        fdput(f);
1014        return ret;
1015}
1016
1017COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
1018                const struct compat_iovec __user *,vec,
1019                unsigned long, vlen, u32, pos_low, u32, pos_high)
1020{
1021        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1022        return compat_sys_preadv64(fd, vec, vlen, pos);
1023}
1024
1025static size_t compat_writev(struct file *file,
1026                            const struct compat_iovec __user *vec,
1027                            unsigned long vlen, loff_t *pos)
1028{
1029        ssize_t ret = -EBADF;
1030
1031        if (!(file->f_mode & FMODE_WRITE))
1032                goto out;
1033
1034        ret = -EINVAL;
1035        if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
1036                goto out;
1037
1038        ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1039
1040out:
1041        if (ret > 0)
1042                add_wchar(current, ret);
1043        inc_syscw(current);
1044        return ret;
1045}
1046
1047COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
1048                const struct compat_iovec __user *, vec,
1049                unsigned long, vlen)
1050{
1051        struct fd f = fdget(fd);
1052        ssize_t ret;
1053        loff_t pos;
1054
1055        if (!f.file)
1056                return -EBADF;
1057        pos = f.file->f_pos;
1058        ret = compat_writev(f.file, vec, vlen, &pos);
1059        if (ret >= 0)
1060                f.file->f_pos = pos;
1061        fdput(f);
1062        return ret;
1063}
1064
1065COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1066                const struct compat_iovec __user *,vec,
1067                unsigned long, vlen, loff_t, pos)
1068{
1069        struct fd f;
1070        ssize_t ret;
1071
1072        if (pos < 0)
1073                return -EINVAL;
1074        f = fdget(fd);
1075        if (!f.file)
1076                return -EBADF;
1077        ret = -ESPIPE;
1078        if (f.file->f_mode & FMODE_PWRITE)
1079                ret = compat_writev(f.file, vec, vlen, &pos);
1080        fdput(f);
1081        return ret;
1082}
1083
1084COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
1085                const struct compat_iovec __user *,vec,
1086                unsigned long, vlen, u32, pos_low, u32, pos_high)
1087{
1088        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1089        return compat_sys_pwritev64(fd, vec, vlen, pos);
1090}
1091#endif
1092
1093static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1094                           size_t count, loff_t max)
1095{
1096        struct fd in, out;
1097        struct inode *in_inode, *out_inode;
1098        loff_t pos;
1099        loff_t out_pos;
1100        ssize_t retval;
1101        int fl;
1102
1103        /*
1104         * Get input file, and verify that it is ok..
1105         */
1106        retval = -EBADF;
1107        in = fdget(in_fd);
1108        if (!in.file)
1109                goto out;
1110        if (!(in.file->f_mode & FMODE_READ))
1111                goto fput_in;
1112        retval = -ESPIPE;
1113        if (!ppos) {
1114                pos = in.file->f_pos;
1115        } else {
1116                pos = *ppos;
1117                if (!(in.file->f_mode & FMODE_PREAD))
1118                        goto fput_in;
1119        }
1120        retval = rw_verify_area(READ, in.file, &pos, count);
1121        if (retval < 0)
1122                goto fput_in;
1123        count = retval;
1124
1125        /*
1126         * Get output file, and verify that it is ok..
1127         */
1128        retval = -EBADF;
1129        out = fdget(out_fd);
1130        if (!out.file)
1131                goto fput_in;
1132        if (!(out.file->f_mode & FMODE_WRITE))
1133                goto fput_out;
1134        retval = -EINVAL;
1135        in_inode = file_inode(in.file);
1136        out_inode = file_inode(out.file);
1137        out_pos = out.file->f_pos;
1138        retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1139        if (retval < 0)
1140                goto fput_out;
1141        count = retval;
1142
1143        if (!max)
1144                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1145
1146        if (unlikely(pos + count > max)) {
1147                retval = -EOVERFLOW;
1148                if (pos >= max)
1149                        goto fput_out;
1150                count = max - pos;
1151        }
1152
1153        fl = 0;
1154#if 0
1155        /*
1156         * We need to debate whether we can enable this or not. The
1157         * man page documents EAGAIN return for the output at least,
1158         * and the application is arguably buggy if it doesn't expect
1159         * EAGAIN on a non-blocking file descriptor.
1160         */
1161        if (in.file->f_flags & O_NONBLOCK)
1162                fl = SPLICE_F_NONBLOCK;
1163#endif
1164        file_start_write(out.file);
1165        retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1166        file_end_write(out.file);
1167
1168        if (retval > 0) {
1169                add_rchar(current, retval);
1170                add_wchar(current, retval);
1171                fsnotify_access(in.file);
1172                fsnotify_modify(out.file);
1173                out.file->f_pos = out_pos;
1174                if (ppos)
1175                        *ppos = pos;
1176                else
1177                        in.file->f_pos = pos;
1178        }
1179
1180        inc_syscr(current);
1181        inc_syscw(current);
1182        if (pos > max)
1183                retval = -EOVERFLOW;
1184
1185fput_out:
1186        fdput(out);
1187fput_in:
1188        fdput(in);
1189out:
1190        return retval;
1191}
1192
1193SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1194{
1195        loff_t pos;
1196        off_t off;
1197        ssize_t ret;
1198
1199        if (offset) {
1200                if (unlikely(get_user(off, offset)))
1201                        return -EFAULT;
1202                pos = off;
1203                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1204                if (unlikely(put_user(pos, offset)))
1205                        return -EFAULT;
1206                return ret;
1207        }
1208
1209        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1210}
1211
1212SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1213{
1214        loff_t pos;
1215        ssize_t ret;
1216
1217        if (offset) {
1218                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1219                        return -EFAULT;
1220                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1221                if (unlikely(put_user(pos, offset)))
1222                        return -EFAULT;
1223                return ret;
1224        }
1225
1226        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1227}
1228
1229#ifdef CONFIG_COMPAT
1230COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1231                compat_off_t __user *, offset, compat_size_t, count)
1232{
1233        loff_t pos;
1234        off_t off;
1235        ssize_t ret;
1236
1237        if (offset) {
1238                if (unlikely(get_user(off, offset)))
1239                        return -EFAULT;
1240                pos = off;
1241                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1242                if (unlikely(put_user(pos, offset)))
1243                        return -EFAULT;
1244                return ret;
1245        }
1246
1247        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1248}
1249
1250COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1251                compat_loff_t __user *, offset, compat_size_t, count)
1252{
1253        loff_t pos;
1254        ssize_t ret;
1255
1256        if (offset) {
1257                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1258                        return -EFAULT;
1259                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1260                if (unlikely(put_user(pos, offset)))
1261                        return -EFAULT;
1262                return ret;
1263        }
1264
1265        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1266}
1267#endif
1268