linux/fs/read_write.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/read_write.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7#include <linux/slab.h> 
   8#include <linux/stat.h>
   9#include <linux/fcntl.h>
  10#include <linux/file.h>
  11#include <linux/uio.h>
  12#include <linux/aio.h>
  13#include <linux/fsnotify.h>
  14#include <linux/security.h>
  15#include <linux/export.h>
  16#include <linux/syscalls.h>
  17#include <linux/pagemap.h>
  18#include <linux/splice.h>
  19#include <linux/compat.h>
  20#include "internal.h"
  21
  22#include <asm/uaccess.h>
  23#include <asm/unistd.h>
  24
/*
 * io_fn_t:  signature of a single-buffer I/O method.  In this file it holds
 *           f_op->read, and f_op->write after a cast (see do_readv_writev()).
 * iov_fn_t: signature of a vectored kiocb method; in this file it holds
 *           f_op->aio_read or f_op->aio_write.
 */
typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
                unsigned long, loff_t);
  28
/*
 * File operations table that wires llseek, read, aio_read, mmap and
 * splice_read to the generic helpers.  No write-side method is set.
 */
const struct file_operations generic_ro_fops = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .aio_read       = generic_file_aio_read,
        .mmap           = generic_file_readonly_mmap,
        .splice_read    = generic_file_splice_read,
};

EXPORT_SYMBOL(generic_ro_fops);
  38
  39static inline int unsigned_offsets(struct file *file)
  40{
  41        return file->f_mode & FMODE_UNSIGNED_OFFSET;
  42}
  43
  44/**
  45 * vfs_setpos - update the file offset for lseek
  46 * @file:       file structure in question
  47 * @offset:     file offset to seek to
  48 * @maxsize:    maximum file size
  49 *
  50 * This is a low-level filesystem helper for updating the file offset to
  51 * the value specified by @offset if the given offset is valid and it is
  52 * not equal to the current file offset.
  53 *
  54 * Return the specified offset on success and -EINVAL on invalid offset.
  55 */
  56loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  57{
  58        if (offset < 0 && !unsigned_offsets(file))
  59                return -EINVAL;
  60        if (offset > maxsize)
  61                return -EINVAL;
  62
  63        if (offset != file->f_pos) {
  64                file->f_pos = offset;
  65                file->f_version = 0;
  66        }
  67        return offset;
  68}
  69EXPORT_SYMBOL(vfs_setpos);
  70
  71/**
  72 * generic_file_llseek_size - generic llseek implementation for regular files
  73 * @file:       file structure to seek on
  74 * @offset:     file offset to seek to
  75 * @whence:     type of seek
  76 * @size:       max size of this file in file system
  77 * @eof:        offset used for SEEK_END position
  78 *
  79 * This is a variant of generic_file_llseek that allows passing in a custom
  80 * maximum file size and a custom EOF position, for e.g. hashed directories
  81 *
  82 * Synchronization:
  83 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  84 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  85 * read/writes behave like SEEK_SET against seeks.
  86 */
  87loff_t
  88generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  89                loff_t maxsize, loff_t eof)
  90{
  91        switch (whence) {
  92        case SEEK_END:
  93                offset += eof;
  94                break;
  95        case SEEK_CUR:
  96                /*
  97                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  98                 * position-querying operation.  Avoid rewriting the "same"
  99                 * f_pos value back to the file because a concurrent read(),
 100                 * write() or lseek() might have altered it
 101                 */
 102                if (offset == 0)
 103                        return file->f_pos;
 104                /*
 105                 * f_lock protects against read/modify/write race with other
 106                 * SEEK_CURs. Note that parallel writes and reads behave
 107                 * like SEEK_SET.
 108                 */
 109                spin_lock(&file->f_lock);
 110                offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 111                spin_unlock(&file->f_lock);
 112                return offset;
 113        case SEEK_DATA:
 114                /*
 115                 * In the generic case the entire file is data, so as long as
 116                 * offset isn't at the end of the file then the offset is data.
 117                 */
 118                if (offset >= eof)
 119                        return -ENXIO;
 120                break;
 121        case SEEK_HOLE:
 122                /*
 123                 * There is a virtual hole at the end of the file, so as long as
 124                 * offset isn't i_size or larger, return i_size.
 125                 */
 126                if (offset >= eof)
 127                        return -ENXIO;
 128                offset = eof;
 129                break;
 130        }
 131
 132        return vfs_setpos(file, offset, maxsize);
 133}
 134EXPORT_SYMBOL(generic_file_llseek_size);
 135
 136/**
 137 * generic_file_llseek - generic llseek implementation for regular files
 138 * @file:       file structure to seek on
 139 * @offset:     file offset to seek to
 140 * @whence:     type of seek
 141 *
 142 * This is a generic implemenation of ->llseek useable for all normal local
 143 * filesystems.  It just updates the file offset to the value specified by
 144 * @offset and @whence.
 145 */
 146loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 147{
 148        struct inode *inode = file->f_mapping->host;
 149
 150        return generic_file_llseek_size(file, offset, whence,
 151                                        inode->i_sb->s_maxbytes,
 152                                        i_size_read(inode));
 153}
 154EXPORT_SYMBOL(generic_file_llseek);
 155
 156/**
 157 * fixed_size_llseek - llseek implementation for fixed-sized devices
 158 * @file:       file structure to seek on
 159 * @offset:     file offset to seek to
 160 * @whence:     type of seek
 161 * @size:       size of the file
 162 *
 163 */
 164loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 165{
 166        switch (whence) {
 167        case SEEK_SET: case SEEK_CUR: case SEEK_END:
 168                return generic_file_llseek_size(file, offset, whence,
 169                                                size, size);
 170        default:
 171                return -EINVAL;
 172        }
 173}
 174EXPORT_SYMBOL(fixed_size_llseek);
 175
 176/**
 177 * noop_llseek - No Operation Performed llseek implementation
 178 * @file:       file structure to seek on
 179 * @offset:     file offset to seek to
 180 * @whence:     type of seek
 181 *
 182 * This is an implementation of ->llseek useable for the rare special case when
 183 * userspace expects the seek to succeed but the (device) file is actually not
 184 * able to perform the seek. In this case you use noop_llseek() instead of
 185 * falling back to the default implementation of ->llseek.
 186 */
 187loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 188{
 189        return file->f_pos;
 190}
 191EXPORT_SYMBOL(noop_llseek);
 192
/*
 * llseek implementation that always fails with -ESPIPE; all arguments are
 * ignored.  vfs_llseek() uses it as the fallback when no ->llseek applies.
 */
loff_t no_llseek(struct file *file, loff_t offset, int whence)
{
        return -ESPIPE;
}
EXPORT_SYMBOL(no_llseek);
 198
 199loff_t default_llseek(struct file *file, loff_t offset, int whence)
 200{
 201        struct inode *inode = file_inode(file);
 202        loff_t retval;
 203
 204        mutex_lock(&inode->i_mutex);
 205        switch (whence) {
 206                case SEEK_END:
 207                        offset += i_size_read(inode);
 208                        break;
 209                case SEEK_CUR:
 210                        if (offset == 0) {
 211                                retval = file->f_pos;
 212                                goto out;
 213                        }
 214                        offset += file->f_pos;
 215                        break;
 216                case SEEK_DATA:
 217                        /*
 218                         * In the generic case the entire file is data, so as
 219                         * long as offset isn't at the end of the file then the
 220                         * offset is data.
 221                         */
 222                        if (offset >= inode->i_size) {
 223                                retval = -ENXIO;
 224                                goto out;
 225                        }
 226                        break;
 227                case SEEK_HOLE:
 228                        /*
 229                         * There is a virtual hole at the end of the file, so
 230                         * as long as offset isn't i_size or larger, return
 231                         * i_size.
 232                         */
 233                        if (offset >= inode->i_size) {
 234                                retval = -ENXIO;
 235                                goto out;
 236                        }
 237                        offset = inode->i_size;
 238                        break;
 239        }
 240        retval = -EINVAL;
 241        if (offset >= 0 || unsigned_offsets(file)) {
 242                if (offset != file->f_pos) {
 243                        file->f_pos = offset;
 244                        file->f_version = 0;
 245                }
 246                retval = offset;
 247        }
 248out:
 249        mutex_unlock(&inode->i_mutex);
 250        return retval;
 251}
 252EXPORT_SYMBOL(default_llseek);
 253
 254loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 255{
 256        loff_t (*fn)(struct file *, loff_t, int);
 257
 258        fn = no_llseek;
 259        if (file->f_mode & FMODE_LSEEK) {
 260                if (file->f_op && file->f_op->llseek)
 261                        fn = file->f_op->llseek;
 262        }
 263        return fn(file, offset, whence);
 264}
 265EXPORT_SYMBOL(vfs_llseek);
 266
 267SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 268{
 269        off_t retval;
 270        struct fd f = fdget(fd);
 271        if (!f.file)
 272                return -EBADF;
 273
 274        retval = -EINVAL;
 275        if (whence <= SEEK_MAX) {
 276                loff_t res = vfs_llseek(f.file, offset, whence);
 277                retval = res;
 278                if (res != (loff_t)retval)
 279                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
 280        }
 281        fdput(f);
 282        return retval;
 283}
 284
 285#ifdef CONFIG_COMPAT
/* Compat entry point for lseek: forwards its arguments to sys_lseek(). */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
        return sys_lseek(fd, offset, whence);
}
 290#endif
 291
 292#ifdef __ARCH_WANT_SYS_LLSEEK
 293SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 294                unsigned long, offset_low, loff_t __user *, result,
 295                unsigned int, whence)
 296{
 297        int retval;
 298        struct fd f = fdget(fd);
 299        loff_t offset;
 300
 301        if (!f.file)
 302                return -EBADF;
 303
 304        retval = -EINVAL;
 305        if (whence > SEEK_MAX)
 306                goto out_putf;
 307
 308        offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 309                        whence);
 310
 311        retval = (int)offset;
 312        if (offset >= 0) {
 313                retval = -EFAULT;
 314                if (!copy_to_user(result, &offset, sizeof(offset)))
 315                        retval = 0;
 316        }
 317out_putf:
 318        fdput(f);
 319        return retval;
 320}
 321#endif
 322
 323/*
 324 * rw_verify_area doesn't like huge counts. We limit
 325 * them to something that fits in "int" so that others
 326 * won't have to do range checks all the time.
 327 */
 328int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 329{
 330        struct inode *inode;
 331        loff_t pos;
 332        int retval = -EINVAL;
 333
 334        inode = file_inode(file);
 335        if (unlikely((ssize_t) count < 0))
 336                return retval;
 337        pos = *ppos;
 338        if (unlikely(pos < 0)) {
 339                if (!unsigned_offsets(file))
 340                        return retval;
 341                if (count >= -pos) /* both values are in 0..LLONG_MAX */
 342                        return -EOVERFLOW;
 343        } else if (unlikely((loff_t) (pos + count) < 0)) {
 344                if (!unsigned_offsets(file))
 345                        return retval;
 346        }
 347
 348        if (unlikely(inode->i_flock && mandatory_lock(inode))) {
 349                retval = locks_mandatory_area(
 350                        read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
 351                        inode, file, pos, count);
 352                if (retval < 0)
 353                        return retval;
 354        }
 355        retval = security_file_permission(file,
 356                                read_write == READ ? MAY_READ : MAY_WRITE);
 357        if (retval)
 358                return retval;
 359        return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
 360}
 361
 362ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 363{
 364        struct iovec iov = { .iov_base = buf, .iov_len = len };
 365        struct kiocb kiocb;
 366        ssize_t ret;
 367
 368        init_sync_kiocb(&kiocb, filp);
 369        kiocb.ki_pos = *ppos;
 370        kiocb.ki_left = len;
 371        kiocb.ki_nbytes = len;
 372
 373        ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
 374        if (-EIOCBQUEUED == ret)
 375                ret = wait_on_sync_kiocb(&kiocb);
 376        *ppos = kiocb.ki_pos;
 377        return ret;
 378}
 379
 380EXPORT_SYMBOL(do_sync_read);
 381
 382ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 383{
 384        ssize_t ret;
 385
 386        if (!(file->f_mode & FMODE_READ))
 387                return -EBADF;
 388        if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
 389                return -EINVAL;
 390        if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 391                return -EFAULT;
 392
 393        ret = rw_verify_area(READ, file, pos, count);
 394        if (ret >= 0) {
 395                count = ret;
 396                if (file->f_op->read)
 397                        ret = file->f_op->read(file, buf, count, pos);
 398                else
 399                        ret = do_sync_read(file, buf, count, pos);
 400                if (ret > 0) {
 401                        fsnotify_access(file);
 402                        add_rchar(current, ret);
 403                }
 404                inc_syscr(current);
 405        }
 406
 407        return ret;
 408}
 409
 410EXPORT_SYMBOL(vfs_read);
 411
 412ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 413{
 414        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 415        struct kiocb kiocb;
 416        ssize_t ret;
 417
 418        init_sync_kiocb(&kiocb, filp);
 419        kiocb.ki_pos = *ppos;
 420        kiocb.ki_left = len;
 421        kiocb.ki_nbytes = len;
 422
 423        ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
 424        if (-EIOCBQUEUED == ret)
 425                ret = wait_on_sync_kiocb(&kiocb);
 426        *ppos = kiocb.ki_pos;
 427        return ret;
 428}
 429
 430EXPORT_SYMBOL(do_sync_write);
 431
 432ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
 433{
 434        mm_segment_t old_fs;
 435        const char __user *p;
 436        ssize_t ret;
 437
 438        if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
 439                return -EINVAL;
 440
 441        old_fs = get_fs();
 442        set_fs(get_ds());
 443        p = (__force const char __user *)buf;
 444        if (count > MAX_RW_COUNT)
 445                count =  MAX_RW_COUNT;
 446        if (file->f_op->write)
 447                ret = file->f_op->write(file, p, count, pos);
 448        else
 449                ret = do_sync_write(file, p, count, pos);
 450        set_fs(old_fs);
 451        if (ret > 0) {
 452                fsnotify_modify(file);
 453                add_wchar(current, ret);
 454        }
 455        inc_syscw(current);
 456        return ret;
 457}
 458
 459ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 460{
 461        ssize_t ret;
 462
 463        if (!(file->f_mode & FMODE_WRITE))
 464                return -EBADF;
 465        if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
 466                return -EINVAL;
 467        if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 468                return -EFAULT;
 469
 470        ret = rw_verify_area(WRITE, file, pos, count);
 471        if (ret >= 0) {
 472                count = ret;
 473                file_start_write(file);
 474                if (file->f_op->write)
 475                        ret = file->f_op->write(file, buf, count, pos);
 476                else
 477                        ret = do_sync_write(file, buf, count, pos);
 478                if (ret > 0) {
 479                        fsnotify_modify(file);
 480                        add_wchar(current, ret);
 481                }
 482                inc_syscw(current);
 483                file_end_write(file);
 484        }
 485
 486        return ret;
 487}
 488
 489EXPORT_SYMBOL(vfs_write);
 490
/* Return the current position stored in @file->f_pos. */
static inline loff_t file_pos_read(struct file *file)
{
        return file->f_pos;
}
 495
/* Store @pos into @file->f_pos. */
static inline void file_pos_write(struct file *file, loff_t pos)
{
        file->f_pos = pos;
}
 500
 501SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 502{
 503        struct fd f = fdget(fd);
 504        ssize_t ret = -EBADF;
 505
 506        if (f.file) {
 507                loff_t pos = file_pos_read(f.file);
 508                ret = vfs_read(f.file, buf, count, &pos);
 509                if (ret >= 0)
 510                        file_pos_write(f.file, pos);
 511                fdput(f);
 512        }
 513        return ret;
 514}
 515
 516SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 517                size_t, count)
 518{
 519        struct fd f = fdget(fd);
 520        ssize_t ret = -EBADF;
 521
 522        if (f.file) {
 523                loff_t pos = file_pos_read(f.file);
 524                ret = vfs_write(f.file, buf, count, &pos);
 525                if (ret >= 0)
 526                        file_pos_write(f.file, pos);
 527                fdput(f);
 528        }
 529
 530        return ret;
 531}
 532
 533SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 534                        size_t, count, loff_t, pos)
 535{
 536        struct fd f;
 537        ssize_t ret = -EBADF;
 538
 539        if (pos < 0)
 540                return -EINVAL;
 541
 542        f = fdget(fd);
 543        if (f.file) {
 544                ret = -ESPIPE;
 545                if (f.file->f_mode & FMODE_PREAD)
 546                        ret = vfs_read(f.file, buf, count, &pos);
 547                fdput(f);
 548        }
 549
 550        return ret;
 551}
 552
 553SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 554                         size_t, count, loff_t, pos)
 555{
 556        struct fd f;
 557        ssize_t ret = -EBADF;
 558
 559        if (pos < 0)
 560                return -EINVAL;
 561
 562        f = fdget(fd);
 563        if (f.file) {
 564                ret = -ESPIPE;
 565                if (f.file->f_mode & FMODE_PWRITE)  
 566                        ret = vfs_write(f.file, buf, count, &pos);
 567                fdput(f);
 568        }
 569
 570        return ret;
 571}
 572
 573/*
 574 * Reduce an iovec's length in-place.  Return the resulting number of segments
 575 */
 576unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 577{
 578        unsigned long seg = 0;
 579        size_t len = 0;
 580
 581        while (seg < nr_segs) {
 582                seg++;
 583                if (len + iov->iov_len >= to) {
 584                        iov->iov_len = to - len;
 585                        break;
 586                }
 587                len += iov->iov_len;
 588                iov++;
 589        }
 590        return seg;
 591}
 592EXPORT_SYMBOL(iov_shorten);
 593
 594static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 595                unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
 596{
 597        struct kiocb kiocb;
 598        ssize_t ret;
 599
 600        init_sync_kiocb(&kiocb, filp);
 601        kiocb.ki_pos = *ppos;
 602        kiocb.ki_left = len;
 603        kiocb.ki_nbytes = len;
 604
 605        ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
 606        if (ret == -EIOCBQUEUED)
 607                ret = wait_on_sync_kiocb(&kiocb);
 608        *ppos = kiocb.ki_pos;
 609        return ret;
 610}
 611
 612/* Do it by hand, with file-ops */
 613static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
 614                unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
 615{
 616        struct iovec *vector = iov;
 617        ssize_t ret = 0;
 618
 619        while (nr_segs > 0) {
 620                void __user *base;
 621                size_t len;
 622                ssize_t nr;
 623
 624                base = vector->iov_base;
 625                len = vector->iov_len;
 626                vector++;
 627                nr_segs--;
 628
 629                nr = fn(filp, base, len, ppos);
 630
 631                if (nr < 0) {
 632                        if (!ret)
 633                                ret = nr;
 634                        break;
 635                }
 636                ret += nr;
 637                if (nr != len)
 638                        break;
 639        }
 640
 641        return ret;
 642}
 643
 644/* A write operation does a read from user space and vice versa */
 645#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 646
 647ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 648                              unsigned long nr_segs, unsigned long fast_segs,
 649                              struct iovec *fast_pointer,
 650                              struct iovec **ret_pointer)
 651{
 652        unsigned long seg;
 653        ssize_t ret;
 654        struct iovec *iov = fast_pointer;
 655
 656        /*
 657         * SuS says "The readv() function *may* fail if the iovcnt argument
 658         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 659         * traditionally returned zero for zero segments, so...
 660         */
 661        if (nr_segs == 0) {
 662                ret = 0;
 663                goto out;
 664        }
 665
 666        /*
 667         * First get the "struct iovec" from user memory and
 668         * verify all the pointers
 669         */
 670        if (nr_segs > UIO_MAXIOV) {
 671                ret = -EINVAL;
 672                goto out;
 673        }
 674        if (nr_segs > fast_segs) {
 675                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 676                if (iov == NULL) {
 677                        ret = -ENOMEM;
 678                        goto out;
 679                }
 680        }
 681        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 682                ret = -EFAULT;
 683                goto out;
 684        }
 685
 686        /*
 687         * According to the Single Unix Specification we should return EINVAL
 688         * if an element length is < 0 when cast to ssize_t or if the
 689         * total length would overflow the ssize_t return value of the
 690         * system call.
 691         *
 692         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 693         * overflow case.
 694         */
 695        ret = 0;
 696        for (seg = 0; seg < nr_segs; seg++) {
 697                void __user *buf = iov[seg].iov_base;
 698                ssize_t len = (ssize_t)iov[seg].iov_len;
 699
 700                /* see if we we're about to use an invalid len or if
 701                 * it's about to overflow ssize_t */
 702                if (len < 0) {
 703                        ret = -EINVAL;
 704                        goto out;
 705                }
 706                if (type >= 0
 707                    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 708                        ret = -EFAULT;
 709                        goto out;
 710                }
 711                if (len > MAX_RW_COUNT - ret) {
 712                        len = MAX_RW_COUNT - ret;
 713                        iov[seg].iov_len = len;
 714                }
 715                ret += len;
 716        }
 717out:
 718        *ret_pointer = iov;
 719        return ret;
 720}
 721
 722static ssize_t do_readv_writev(int type, struct file *file,
 723                               const struct iovec __user * uvector,
 724                               unsigned long nr_segs, loff_t *pos)
 725{
 726        size_t tot_len;
 727        struct iovec iovstack[UIO_FASTIOV];
 728        struct iovec *iov = iovstack;
 729        ssize_t ret;
 730        io_fn_t fn;
 731        iov_fn_t fnv;
 732
 733        if (!file->f_op) {
 734                ret = -EINVAL;
 735                goto out;
 736        }
 737
 738        ret = rw_copy_check_uvector(type, uvector, nr_segs,
 739                                    ARRAY_SIZE(iovstack), iovstack, &iov);
 740        if (ret <= 0)
 741                goto out;
 742
 743        tot_len = ret;
 744        ret = rw_verify_area(type, file, pos, tot_len);
 745        if (ret < 0)
 746                goto out;
 747
 748        fnv = NULL;
 749        if (type == READ) {
 750                fn = file->f_op->read;
 751                fnv = file->f_op->aio_read;
 752        } else {
 753                fn = (io_fn_t)file->f_op->write;
 754                fnv = file->f_op->aio_write;
 755                file_start_write(file);
 756        }
 757
 758        if (fnv)
 759                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
 760                                                pos, fnv);
 761        else
 762                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 763
 764        if (type != READ)
 765                file_end_write(file);
 766
 767out:
 768        if (iov != iovstack)
 769                kfree(iov);
 770        if ((ret + (type == READ)) > 0) {
 771                if (type == READ)
 772                        fsnotify_access(file);
 773                else
 774                        fsnotify_modify(file);
 775        }
 776        return ret;
 777}
 778
 779ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 780                  unsigned long vlen, loff_t *pos)
 781{
 782        if (!(file->f_mode & FMODE_READ))
 783                return -EBADF;
 784        if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
 785                return -EINVAL;
 786
 787        return do_readv_writev(READ, file, vec, vlen, pos);
 788}
 789
 790EXPORT_SYMBOL(vfs_readv);
 791
 792ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 793                   unsigned long vlen, loff_t *pos)
 794{
 795        if (!(file->f_mode & FMODE_WRITE))
 796                return -EBADF;
 797        if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
 798                return -EINVAL;
 799
 800        return do_readv_writev(WRITE, file, vec, vlen, pos);
 801}
 802
 803EXPORT_SYMBOL(vfs_writev);
 804
 805SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 806                unsigned long, vlen)
 807{
 808        struct fd f = fdget(fd);
 809        ssize_t ret = -EBADF;
 810
 811        if (f.file) {
 812                loff_t pos = file_pos_read(f.file);
 813                ret = vfs_readv(f.file, vec, vlen, &pos);
 814                if (ret >= 0)
 815                        file_pos_write(f.file, pos);
 816                fdput(f);
 817        }
 818
 819        if (ret > 0)
 820                add_rchar(current, ret);
 821        inc_syscr(current);
 822        return ret;
 823}
 824
 825SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 826                unsigned long, vlen)
 827{
 828        struct fd f = fdget(fd);
 829        ssize_t ret = -EBADF;
 830
 831        if (f.file) {
 832                loff_t pos = file_pos_read(f.file);
 833                ret = vfs_writev(f.file, vec, vlen, &pos);
 834                if (ret >= 0)
 835                        file_pos_write(f.file, pos);
 836                fdput(f);
 837        }
 838
 839        if (ret > 0)
 840                add_wchar(current, ret);
 841        inc_syscw(current);
 842        return ret;
 843}
 844
 845static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
 846{
 847#define HALF_LONG_BITS (BITS_PER_LONG / 2)
 848        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
 849}
 850
 851SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 852                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 853{
 854        loff_t pos = pos_from_hilo(pos_h, pos_l);
 855        struct fd f;
 856        ssize_t ret = -EBADF;
 857
 858        if (pos < 0)
 859                return -EINVAL;
 860
 861        f = fdget(fd);
 862        if (f.file) {
 863                ret = -ESPIPE;
 864                if (f.file->f_mode & FMODE_PREAD)
 865                        ret = vfs_readv(f.file, vec, vlen, &pos);
 866                fdput(f);
 867        }
 868
 869        if (ret > 0)
 870                add_rchar(current, ret);
 871        inc_syscr(current);
 872        return ret;
 873}
 874
 875SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 876                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 877{
 878        loff_t pos = pos_from_hilo(pos_h, pos_l);
 879        struct fd f;
 880        ssize_t ret = -EBADF;
 881
 882        if (pos < 0)
 883                return -EINVAL;
 884
 885        f = fdget(fd);
 886        if (f.file) {
 887                ret = -ESPIPE;
 888                if (f.file->f_mode & FMODE_PWRITE)
 889                        ret = vfs_writev(f.file, vec, vlen, &pos);
 890                fdput(f);
 891        }
 892
 893        if (ret > 0)
 894                add_wchar(current, ret);
 895        inc_syscw(current);
 896        return ret;
 897}
 898
 899#ifdef CONFIG_COMPAT
 900
 901static ssize_t compat_do_readv_writev(int type, struct file *file,
 902                               const struct compat_iovec __user *uvector,
 903                               unsigned long nr_segs, loff_t *pos)
 904{
 905        compat_ssize_t tot_len;
 906        struct iovec iovstack[UIO_FASTIOV];
 907        struct iovec *iov = iovstack;
 908        ssize_t ret;
 909        io_fn_t fn;
 910        iov_fn_t fnv;
 911
 912        ret = -EINVAL;
 913        if (!file->f_op)
 914                goto out;
 915
 916        ret = -EFAULT;
 917        if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
 918                goto out;
 919
 920        ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
 921                                               UIO_FASTIOV, iovstack, &iov);
 922        if (ret <= 0)
 923                goto out;
 924
 925        tot_len = ret;
 926        ret = rw_verify_area(type, file, pos, tot_len);
 927        if (ret < 0)
 928                goto out;
 929
 930        fnv = NULL;
 931        if (type == READ) {
 932                fn = file->f_op->read;
 933                fnv = file->f_op->aio_read;
 934        } else {
 935                fn = (io_fn_t)file->f_op->write;
 936                fnv = file->f_op->aio_write;
 937                file_start_write(file);
 938        }
 939
 940        if (fnv)
 941                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
 942                                                pos, fnv);
 943        else
 944                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 945
 946        if (type != READ)
 947                file_end_write(file);
 948
 949out:
 950        if (iov != iovstack)
 951                kfree(iov);
 952        if ((ret + (type == READ)) > 0) {
 953                if (type == READ)
 954                        fsnotify_access(file);
 955                else
 956                        fsnotify_modify(file);
 957        }
 958        return ret;
 959}
 960
 961static size_t compat_readv(struct file *file,
 962                           const struct compat_iovec __user *vec,
 963                           unsigned long vlen, loff_t *pos)
 964{
 965        ssize_t ret = -EBADF;
 966
 967        if (!(file->f_mode & FMODE_READ))
 968                goto out;
 969
 970        ret = -EINVAL;
 971        if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
 972                goto out;
 973
 974        ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
 975
 976out:
 977        if (ret > 0)
 978                add_rchar(current, ret);
 979        inc_syscr(current);
 980        return ret;
 981}
 982
 983COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
 984                const struct compat_iovec __user *,vec,
 985                unsigned long, vlen)
 986{
 987        struct fd f = fdget(fd);
 988        ssize_t ret;
 989        loff_t pos;
 990
 991        if (!f.file)
 992                return -EBADF;
 993        pos = f.file->f_pos;
 994        ret = compat_readv(f.file, vec, vlen, &pos);
 995        if (ret >= 0)
 996                f.file->f_pos = pos;
 997        fdput(f);
 998        return ret;
 999}
1000
1001COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1002                const struct compat_iovec __user *,vec,
1003                unsigned long, vlen, loff_t, pos)
1004{
1005        struct fd f;
1006        ssize_t ret;
1007
1008        if (pos < 0)
1009                return -EINVAL;
1010        f = fdget(fd);
1011        if (!f.file)
1012                return -EBADF;
1013        ret = -ESPIPE;
1014        if (f.file->f_mode & FMODE_PREAD)
1015                ret = compat_readv(f.file, vec, vlen, &pos);
1016        fdput(f);
1017        return ret;
1018}
1019
1020COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
1021                const struct compat_iovec __user *,vec,
1022                unsigned long, vlen, u32, pos_low, u32, pos_high)
1023{
1024        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1025        return compat_sys_preadv64(fd, vec, vlen, pos);
1026}
1027
1028static size_t compat_writev(struct file *file,
1029                            const struct compat_iovec __user *vec,
1030                            unsigned long vlen, loff_t *pos)
1031{
1032        ssize_t ret = -EBADF;
1033
1034        if (!(file->f_mode & FMODE_WRITE))
1035                goto out;
1036
1037        ret = -EINVAL;
1038        if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
1039                goto out;
1040
1041        ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1042
1043out:
1044        if (ret > 0)
1045                add_wchar(current, ret);
1046        inc_syscw(current);
1047        return ret;
1048}
1049
1050COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
1051                const struct compat_iovec __user *, vec,
1052                unsigned long, vlen)
1053{
1054        struct fd f = fdget(fd);
1055        ssize_t ret;
1056        loff_t pos;
1057
1058        if (!f.file)
1059                return -EBADF;
1060        pos = f.file->f_pos;
1061        ret = compat_writev(f.file, vec, vlen, &pos);
1062        if (ret >= 0)
1063                f.file->f_pos = pos;
1064        fdput(f);
1065        return ret;
1066}
1067
1068COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1069                const struct compat_iovec __user *,vec,
1070                unsigned long, vlen, loff_t, pos)
1071{
1072        struct fd f;
1073        ssize_t ret;
1074
1075        if (pos < 0)
1076                return -EINVAL;
1077        f = fdget(fd);
1078        if (!f.file)
1079                return -EBADF;
1080        ret = -ESPIPE;
1081        if (f.file->f_mode & FMODE_PWRITE)
1082                ret = compat_writev(f.file, vec, vlen, &pos);
1083        fdput(f);
1084        return ret;
1085}
1086
1087COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
1088                const struct compat_iovec __user *,vec,
1089                unsigned long, vlen, u32, pos_low, u32, pos_high)
1090{
1091        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1092        return compat_sys_pwritev64(fd, vec, vlen, pos);
1093}
1094#endif
1095
1096static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1097                           size_t count, loff_t max)
1098{
1099        struct fd in, out;
1100        struct inode *in_inode, *out_inode;
1101        loff_t pos;
1102        loff_t out_pos;
1103        ssize_t retval;
1104        int fl;
1105
1106        /*
1107         * Get input file, and verify that it is ok..
1108         */
1109        retval = -EBADF;
1110        in = fdget(in_fd);
1111        if (!in.file)
1112                goto out;
1113        if (!(in.file->f_mode & FMODE_READ))
1114                goto fput_in;
1115        retval = -ESPIPE;
1116        if (!ppos) {
1117                pos = in.file->f_pos;
1118        } else {
1119                pos = *ppos;
1120                if (!(in.file->f_mode & FMODE_PREAD))
1121                        goto fput_in;
1122        }
1123        retval = rw_verify_area(READ, in.file, &pos, count);
1124        if (retval < 0)
1125                goto fput_in;
1126        count = retval;
1127
1128        /*
1129         * Get output file, and verify that it is ok..
1130         */
1131        retval = -EBADF;
1132        out = fdget(out_fd);
1133        if (!out.file)
1134                goto fput_in;
1135        if (!(out.file->f_mode & FMODE_WRITE))
1136                goto fput_out;
1137        retval = -EINVAL;
1138        in_inode = file_inode(in.file);
1139        out_inode = file_inode(out.file);
1140        out_pos = out.file->f_pos;
1141        retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1142        if (retval < 0)
1143                goto fput_out;
1144        count = retval;
1145
1146        if (!max)
1147                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1148
1149        if (unlikely(pos + count > max)) {
1150                retval = -EOVERFLOW;
1151                if (pos >= max)
1152                        goto fput_out;
1153                count = max - pos;
1154        }
1155
1156        fl = 0;
1157#if 0
1158        /*
1159         * We need to debate whether we can enable this or not. The
1160         * man page documents EAGAIN return for the output at least,
1161         * and the application is arguably buggy if it doesn't expect
1162         * EAGAIN on a non-blocking file descriptor.
1163         */
1164        if (in.file->f_flags & O_NONBLOCK)
1165                fl = SPLICE_F_NONBLOCK;
1166#endif
1167        file_start_write(out.file);
1168        retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1169        file_end_write(out.file);
1170
1171        if (retval > 0) {
1172                add_rchar(current, retval);
1173                add_wchar(current, retval);
1174                fsnotify_access(in.file);
1175                fsnotify_modify(out.file);
1176                out.file->f_pos = out_pos;
1177                if (ppos)
1178                        *ppos = pos;
1179                else
1180                        in.file->f_pos = pos;
1181        }
1182
1183        inc_syscr(current);
1184        inc_syscw(current);
1185        if (pos > max)
1186                retval = -EOVERFLOW;
1187
1188fput_out:
1189        fdput(out);
1190fput_in:
1191        fdput(in);
1192out:
1193        return retval;
1194}
1195
1196SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1197{
1198        loff_t pos;
1199        off_t off;
1200        ssize_t ret;
1201
1202        if (offset) {
1203                if (unlikely(get_user(off, offset)))
1204                        return -EFAULT;
1205                pos = off;
1206                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1207                if (unlikely(put_user(pos, offset)))
1208                        return -EFAULT;
1209                return ret;
1210        }
1211
1212        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1213}
1214
1215SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1216{
1217        loff_t pos;
1218        ssize_t ret;
1219
1220        if (offset) {
1221                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1222                        return -EFAULT;
1223                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1224                if (unlikely(put_user(pos, offset)))
1225                        return -EFAULT;
1226                return ret;
1227        }
1228
1229        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1230}
1231
1232#ifdef CONFIG_COMPAT
1233COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1234                compat_off_t __user *, offset, compat_size_t, count)
1235{
1236        loff_t pos;
1237        off_t off;
1238        ssize_t ret;
1239
1240        if (offset) {
1241                if (unlikely(get_user(off, offset)))
1242                        return -EFAULT;
1243                pos = off;
1244                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1245                if (unlikely(put_user(pos, offset)))
1246                        return -EFAULT;
1247                return ret;
1248        }
1249
1250        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1251}
1252
1253COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1254                compat_loff_t __user *, offset, compat_size_t, count)
1255{
1256        loff_t pos;
1257        ssize_t ret;
1258
1259        if (offset) {
1260                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1261                        return -EFAULT;
1262                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1263                if (unlikely(put_user(pos, offset)))
1264                        return -EFAULT;
1265                return ret;
1266        }
1267
1268        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1269}
1270#endif
1271