linux/fs/read_write.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/read_write.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7#include <linux/slab.h> 
   8#include <linux/stat.h>
   9#include <linux/fcntl.h>
  10#include <linux/file.h>
  11#include <linux/uio.h>
  12#include <linux/fsnotify.h>
  13#include <linux/security.h>
  14#include <linux/export.h>
  15#include <linux/syscalls.h>
  16#include <linux/pagemap.h>
  17#include <linux/splice.h>
  18#include "read_write.h"
  19
  20#include <asm/uaccess.h>
  21#include <asm/unistd.h>
  22
  23const struct file_operations generic_ro_fops = {
  24        .llseek         = generic_file_llseek,
  25        .read           = do_sync_read,
  26        .aio_read       = generic_file_aio_read,
  27        .mmap           = generic_file_readonly_mmap,
  28        .splice_read    = generic_file_splice_read,
  29};
  30
  31EXPORT_SYMBOL(generic_ro_fops);
  32
  33static inline int unsigned_offsets(struct file *file)
  34{
  35        return file->f_mode & FMODE_UNSIGNED_OFFSET;
  36}
  37
  38static loff_t lseek_execute(struct file *file, struct inode *inode,
  39                loff_t offset, loff_t maxsize)
  40{
  41        if (offset < 0 && !unsigned_offsets(file))
  42                return -EINVAL;
  43        if (offset > maxsize)
  44                return -EINVAL;
  45
  46        if (offset != file->f_pos) {
  47                file->f_pos = offset;
  48                file->f_version = 0;
  49        }
  50        return offset;
  51}
  52
  53/**
  54 * generic_file_llseek_size - generic llseek implementation for regular files
  55 * @file:       file structure to seek on
  56 * @offset:     file offset to seek to
  57 * @origin:     type of seek
  58 * @size:       max size of this file in file system
  59 * @eof:        offset used for SEEK_END position
  60 *
  61 * This is a variant of generic_file_llseek that allows passing in a custom
  62 * maximum file size and a custom EOF position, for e.g. hashed directories
  63 *
  64 * Synchronization:
  65 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  66 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  67 * read/writes behave like SEEK_SET against seeks.
  68 */
  69loff_t
  70generic_file_llseek_size(struct file *file, loff_t offset, int origin,
  71                loff_t maxsize, loff_t eof)
  72{
  73        struct inode *inode = file->f_mapping->host;
  74
  75        switch (origin) {
  76        case SEEK_END:
  77                offset += eof;
  78                break;
  79        case SEEK_CUR:
  80                /*
  81                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  82                 * position-querying operation.  Avoid rewriting the "same"
  83                 * f_pos value back to the file because a concurrent read(),
  84                 * write() or lseek() might have altered it
  85                 */
  86                if (offset == 0)
  87                        return file->f_pos;
  88                /*
  89                 * f_lock protects against read/modify/write race with other
  90                 * SEEK_CURs. Note that parallel writes and reads behave
  91                 * like SEEK_SET.
  92                 */
  93                spin_lock(&file->f_lock);
  94                offset = lseek_execute(file, inode, file->f_pos + offset,
  95                                       maxsize);
  96                spin_unlock(&file->f_lock);
  97                return offset;
  98        case SEEK_DATA:
  99                /*
 100                 * In the generic case the entire file is data, so as long as
 101                 * offset isn't at the end of the file then the offset is data.
 102                 */
 103                if (offset >= eof)
 104                        return -ENXIO;
 105                break;
 106        case SEEK_HOLE:
 107                /*
 108                 * There is a virtual hole at the end of the file, so as long as
 109                 * offset isn't i_size or larger, return i_size.
 110                 */
 111                if (offset >= eof)
 112                        return -ENXIO;
 113                offset = eof;
 114                break;
 115        }
 116
 117        return lseek_execute(file, inode, offset, maxsize);
 118}
 119EXPORT_SYMBOL(generic_file_llseek_size);
 120
 121/**
 122 * generic_file_llseek - generic llseek implementation for regular files
 123 * @file:       file structure to seek on
 124 * @offset:     file offset to seek to
 125 * @origin:     type of seek
 126 *
 127 * This is a generic implemenation of ->llseek useable for all normal local
 128 * filesystems.  It just updates the file offset to the value specified by
 129 * @offset and @origin under i_mutex.
 130 */
 131loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 132{
 133        struct inode *inode = file->f_mapping->host;
 134
 135        return generic_file_llseek_size(file, offset, origin,
 136                                        inode->i_sb->s_maxbytes,
 137                                        i_size_read(inode));
 138}
 139EXPORT_SYMBOL(generic_file_llseek);
 140
 141/**
 142 * noop_llseek - No Operation Performed llseek implementation
 143 * @file:       file structure to seek on
 144 * @offset:     file offset to seek to
 145 * @origin:     type of seek
 146 *
 147 * This is an implementation of ->llseek useable for the rare special case when
 148 * userspace expects the seek to succeed but the (device) file is actually not
 149 * able to perform the seek. In this case you use noop_llseek() instead of
 150 * falling back to the default implementation of ->llseek.
 151 */
 152loff_t noop_llseek(struct file *file, loff_t offset, int origin)
 153{
 154        return file->f_pos;
 155}
 156EXPORT_SYMBOL(noop_llseek);
 157
 158loff_t no_llseek(struct file *file, loff_t offset, int origin)
 159{
 160        return -ESPIPE;
 161}
 162EXPORT_SYMBOL(no_llseek);
 163
 164loff_t default_llseek(struct file *file, loff_t offset, int origin)
 165{
 166        struct inode *inode = file->f_path.dentry->d_inode;
 167        loff_t retval;
 168
 169        mutex_lock(&inode->i_mutex);
 170        switch (origin) {
 171                case SEEK_END:
 172                        offset += i_size_read(inode);
 173                        break;
 174                case SEEK_CUR:
 175                        if (offset == 0) {
 176                                retval = file->f_pos;
 177                                goto out;
 178                        }
 179                        offset += file->f_pos;
 180                        break;
 181                case SEEK_DATA:
 182                        /*
 183                         * In the generic case the entire file is data, so as
 184                         * long as offset isn't at the end of the file then the
 185                         * offset is data.
 186                         */
 187                        if (offset >= inode->i_size) {
 188                                retval = -ENXIO;
 189                                goto out;
 190                        }
 191                        break;
 192                case SEEK_HOLE:
 193                        /*
 194                         * There is a virtual hole at the end of the file, so
 195                         * as long as offset isn't i_size or larger, return
 196                         * i_size.
 197                         */
 198                        if (offset >= inode->i_size) {
 199                                retval = -ENXIO;
 200                                goto out;
 201                        }
 202                        offset = inode->i_size;
 203                        break;
 204        }
 205        retval = -EINVAL;
 206        if (offset >= 0 || unsigned_offsets(file)) {
 207                if (offset != file->f_pos) {
 208                        file->f_pos = offset;
 209                        file->f_version = 0;
 210                }
 211                retval = offset;
 212        }
 213out:
 214        mutex_unlock(&inode->i_mutex);
 215        return retval;
 216}
 217EXPORT_SYMBOL(default_llseek);
 218
 219loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 220{
 221        loff_t (*fn)(struct file *, loff_t, int);
 222
 223        fn = no_llseek;
 224        if (file->f_mode & FMODE_LSEEK) {
 225                if (file->f_op && file->f_op->llseek)
 226                        fn = file->f_op->llseek;
 227        }
 228        return fn(file, offset, origin);
 229}
 230EXPORT_SYMBOL(vfs_llseek);
 231
 232SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
 233{
 234        off_t retval;
 235        struct file * file;
 236        int fput_needed;
 237
 238        retval = -EBADF;
 239        file = fget_light(fd, &fput_needed);
 240        if (!file)
 241                goto bad;
 242
 243        retval = -EINVAL;
 244        if (origin <= SEEK_MAX) {
 245                loff_t res = vfs_llseek(file, offset, origin);
 246                retval = res;
 247                if (res != (loff_t)retval)
 248                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
 249        }
 250        fput_light(file, fput_needed);
 251bad:
 252        return retval;
 253}
 254
 255#ifdef __ARCH_WANT_SYS_LLSEEK
 256SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 257                unsigned long, offset_low, loff_t __user *, result,
 258                unsigned int, origin)
 259{
 260        int retval;
 261        struct file * file;
 262        loff_t offset;
 263        int fput_needed;
 264
 265        retval = -EBADF;
 266        file = fget_light(fd, &fput_needed);
 267        if (!file)
 268                goto bad;
 269
 270        retval = -EINVAL;
 271        if (origin > SEEK_MAX)
 272                goto out_putf;
 273
 274        offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low,
 275                        origin);
 276
 277        retval = (int)offset;
 278        if (offset >= 0) {
 279                retval = -EFAULT;
 280                if (!copy_to_user(result, &offset, sizeof(offset)))
 281                        retval = 0;
 282        }
 283out_putf:
 284        fput_light(file, fput_needed);
 285bad:
 286        return retval;
 287}
 288#endif
 289
 290
 291/*
 292 * rw_verify_area doesn't like huge counts. We limit
 293 * them to something that fits in "int" so that others
 294 * won't have to do range checks all the time.
 295 */
 296int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
 297{
 298        struct inode *inode;
 299        loff_t pos;
 300        int retval = -EINVAL;
 301
 302        inode = file->f_path.dentry->d_inode;
 303        if (unlikely((ssize_t) count < 0))
 304                return retval;
 305        pos = *ppos;
 306        if (unlikely(pos < 0)) {
 307                if (!unsigned_offsets(file))
 308                        return retval;
 309                if (count >= -pos) /* both values are in 0..LLONG_MAX */
 310                        return -EOVERFLOW;
 311        } else if (unlikely((loff_t) (pos + count) < 0)) {
 312                if (!unsigned_offsets(file))
 313                        return retval;
 314        }
 315
 316        if (unlikely(inode->i_flock && mandatory_lock(inode))) {
 317                retval = locks_mandatory_area(
 318                        read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
 319                        inode, file, pos, count);
 320                if (retval < 0)
 321                        return retval;
 322        }
 323        retval = security_file_permission(file,
 324                                read_write == READ ? MAY_READ : MAY_WRITE);
 325        if (retval)
 326                return retval;
 327        return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
 328}
 329
 330static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
 331{
 332        set_current_state(TASK_UNINTERRUPTIBLE);
 333        if (!kiocbIsKicked(iocb))
 334                schedule();
 335        else
 336                kiocbClearKicked(iocb);
 337        __set_current_state(TASK_RUNNING);
 338}
 339
 340ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 341{
 342        struct iovec iov = { .iov_base = buf, .iov_len = len };
 343        struct kiocb kiocb;
 344        ssize_t ret;
 345
 346        init_sync_kiocb(&kiocb, filp);
 347        kiocb.ki_pos = *ppos;
 348        kiocb.ki_left = len;
 349        kiocb.ki_nbytes = len;
 350
 351        for (;;) {
 352                ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
 353                if (ret != -EIOCBRETRY)
 354                        break;
 355                wait_on_retry_sync_kiocb(&kiocb);
 356        }
 357
 358        if (-EIOCBQUEUED == ret)
 359                ret = wait_on_sync_kiocb(&kiocb);
 360        *ppos = kiocb.ki_pos;
 361        return ret;
 362}
 363
 364EXPORT_SYMBOL(do_sync_read);
 365
 366ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 367{
 368        ssize_t ret;
 369
 370        if (!(file->f_mode & FMODE_READ))
 371                return -EBADF;
 372        if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
 373                return -EINVAL;
 374        if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 375                return -EFAULT;
 376
 377        ret = rw_verify_area(READ, file, pos, count);
 378        if (ret >= 0) {
 379                count = ret;
 380                if (file->f_op->read)
 381                        ret = file->f_op->read(file, buf, count, pos);
 382                else
 383                        ret = do_sync_read(file, buf, count, pos);
 384                if (ret > 0) {
 385                        fsnotify_access(file);
 386                        add_rchar(current, ret);
 387                }
 388                inc_syscr(current);
 389        }
 390
 391        return ret;
 392}
 393
 394EXPORT_SYMBOL(vfs_read);
 395
 396ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 397{
 398        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 399        struct kiocb kiocb;
 400        ssize_t ret;
 401
 402        init_sync_kiocb(&kiocb, filp);
 403        kiocb.ki_pos = *ppos;
 404        kiocb.ki_left = len;
 405        kiocb.ki_nbytes = len;
 406
 407        for (;;) {
 408                ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
 409                if (ret != -EIOCBRETRY)
 410                        break;
 411                wait_on_retry_sync_kiocb(&kiocb);
 412        }
 413
 414        if (-EIOCBQUEUED == ret)
 415                ret = wait_on_sync_kiocb(&kiocb);
 416        *ppos = kiocb.ki_pos;
 417        return ret;
 418}
 419
 420EXPORT_SYMBOL(do_sync_write);
 421
 422ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 423{
 424        ssize_t ret;
 425
 426        if (!(file->f_mode & FMODE_WRITE))
 427                return -EBADF;
 428        if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
 429                return -EINVAL;
 430        if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 431                return -EFAULT;
 432
 433        ret = rw_verify_area(WRITE, file, pos, count);
 434        if (ret >= 0) {
 435                count = ret;
 436                if (file->f_op->write)
 437                        ret = file->f_op->write(file, buf, count, pos);
 438                else
 439                        ret = do_sync_write(file, buf, count, pos);
 440                if (ret > 0) {
 441                        fsnotify_modify(file);
 442                        add_wchar(current, ret);
 443                }
 444                inc_syscw(current);
 445        }
 446
 447        return ret;
 448}
 449
 450EXPORT_SYMBOL(vfs_write);
 451
 452static inline loff_t file_pos_read(struct file *file)
 453{
 454        return file->f_pos;
 455}
 456
 457static inline void file_pos_write(struct file *file, loff_t pos)
 458{
 459        file->f_pos = pos;
 460}
 461
 462SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 463{
 464        struct file *file;
 465        ssize_t ret = -EBADF;
 466        int fput_needed;
 467
 468        file = fget_light(fd, &fput_needed);
 469        if (file) {
 470                loff_t pos = file_pos_read(file);
 471                ret = vfs_read(file, buf, count, &pos);
 472                file_pos_write(file, pos);
 473                fput_light(file, fput_needed);
 474        }
 475
 476        return ret;
 477}
 478
 479SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 480                size_t, count)
 481{
 482        struct file *file;
 483        ssize_t ret = -EBADF;
 484        int fput_needed;
 485
 486        file = fget_light(fd, &fput_needed);
 487        if (file) {
 488                loff_t pos = file_pos_read(file);
 489                ret = vfs_write(file, buf, count, &pos);
 490                file_pos_write(file, pos);
 491                fput_light(file, fput_needed);
 492        }
 493
 494        return ret;
 495}
 496
 497SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
 498                        size_t count, loff_t pos)
 499{
 500        struct file *file;
 501        ssize_t ret = -EBADF;
 502        int fput_needed;
 503
 504        if (pos < 0)
 505                return -EINVAL;
 506
 507        file = fget_light(fd, &fput_needed);
 508        if (file) {
 509                ret = -ESPIPE;
 510                if (file->f_mode & FMODE_PREAD)
 511                        ret = vfs_read(file, buf, count, &pos);
 512                fput_light(file, fput_needed);
 513        }
 514
 515        return ret;
 516}
 517#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 518asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
 519{
 520        return SYSC_pread64((unsigned int) fd, (char __user *) buf,
 521                            (size_t) count, pos);
 522}
 523SYSCALL_ALIAS(sys_pread64, SyS_pread64);
 524#endif
 525
 526SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
 527                         size_t count, loff_t pos)
 528{
 529        struct file *file;
 530        ssize_t ret = -EBADF;
 531        int fput_needed;
 532
 533        if (pos < 0)
 534                return -EINVAL;
 535
 536        file = fget_light(fd, &fput_needed);
 537        if (file) {
 538                ret = -ESPIPE;
 539                if (file->f_mode & FMODE_PWRITE)  
 540                        ret = vfs_write(file, buf, count, &pos);
 541                fput_light(file, fput_needed);
 542        }
 543
 544        return ret;
 545}
 546#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 547asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
 548{
 549        return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
 550                             (size_t) count, pos);
 551}
 552SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
 553#endif
 554
 555/*
 556 * Reduce an iovec's length in-place.  Return the resulting number of segments
 557 */
 558unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 559{
 560        unsigned long seg = 0;
 561        size_t len = 0;
 562
 563        while (seg < nr_segs) {
 564                seg++;
 565                if (len + iov->iov_len >= to) {
 566                        iov->iov_len = to - len;
 567                        break;
 568                }
 569                len += iov->iov_len;
 570                iov++;
 571        }
 572        return seg;
 573}
 574EXPORT_SYMBOL(iov_shorten);
 575
 576ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 577                unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
 578{
 579        struct kiocb kiocb;
 580        ssize_t ret;
 581
 582        init_sync_kiocb(&kiocb, filp);
 583        kiocb.ki_pos = *ppos;
 584        kiocb.ki_left = len;
 585        kiocb.ki_nbytes = len;
 586
 587        for (;;) {
 588                ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
 589                if (ret != -EIOCBRETRY)
 590                        break;
 591                wait_on_retry_sync_kiocb(&kiocb);
 592        }
 593
 594        if (ret == -EIOCBQUEUED)
 595                ret = wait_on_sync_kiocb(&kiocb);
 596        *ppos = kiocb.ki_pos;
 597        return ret;
 598}
 599
 600/* Do it by hand, with file-ops */
 601ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
 602                unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
 603{
 604        struct iovec *vector = iov;
 605        ssize_t ret = 0;
 606
 607        while (nr_segs > 0) {
 608                void __user *base;
 609                size_t len;
 610                ssize_t nr;
 611
 612                base = vector->iov_base;
 613                len = vector->iov_len;
 614                vector++;
 615                nr_segs--;
 616
 617                nr = fn(filp, base, len, ppos);
 618
 619                if (nr < 0) {
 620                        if (!ret)
 621                                ret = nr;
 622                        break;
 623                }
 624                ret += nr;
 625                if (nr != len)
 626                        break;
 627        }
 628
 629        return ret;
 630}
 631
 632/* A write operation does a read from user space and vice versa */
 633#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 634
 635ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 636                              unsigned long nr_segs, unsigned long fast_segs,
 637                              struct iovec *fast_pointer,
 638                              struct iovec **ret_pointer)
 639{
 640        unsigned long seg;
 641        ssize_t ret;
 642        struct iovec *iov = fast_pointer;
 643
 644        /*
 645         * SuS says "The readv() function *may* fail if the iovcnt argument
 646         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 647         * traditionally returned zero for zero segments, so...
 648         */
 649        if (nr_segs == 0) {
 650                ret = 0;
 651                goto out;
 652        }
 653
 654        /*
 655         * First get the "struct iovec" from user memory and
 656         * verify all the pointers
 657         */
 658        if (nr_segs > UIO_MAXIOV) {
 659                ret = -EINVAL;
 660                goto out;
 661        }
 662        if (nr_segs > fast_segs) {
 663                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 664                if (iov == NULL) {
 665                        ret = -ENOMEM;
 666                        goto out;
 667                }
 668        }
 669        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 670                ret = -EFAULT;
 671                goto out;
 672        }
 673
 674        /*
 675         * According to the Single Unix Specification we should return EINVAL
 676         * if an element length is < 0 when cast to ssize_t or if the
 677         * total length would overflow the ssize_t return value of the
 678         * system call.
 679         *
 680         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 681         * overflow case.
 682         */
 683        ret = 0;
 684        for (seg = 0; seg < nr_segs; seg++) {
 685                void __user *buf = iov[seg].iov_base;
 686                ssize_t len = (ssize_t)iov[seg].iov_len;
 687
 688                /* see if we we're about to use an invalid len or if
 689                 * it's about to overflow ssize_t */
 690                if (len < 0) {
 691                        ret = -EINVAL;
 692                        goto out;
 693                }
 694                if (type >= 0
 695                    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 696                        ret = -EFAULT;
 697                        goto out;
 698                }
 699                if (len > MAX_RW_COUNT - ret) {
 700                        len = MAX_RW_COUNT - ret;
 701                        iov[seg].iov_len = len;
 702                }
 703                ret += len;
 704        }
 705out:
 706        *ret_pointer = iov;
 707        return ret;
 708}
 709
 710static ssize_t do_readv_writev(int type, struct file *file,
 711                               const struct iovec __user * uvector,
 712                               unsigned long nr_segs, loff_t *pos)
 713{
 714        size_t tot_len;
 715        struct iovec iovstack[UIO_FASTIOV];
 716        struct iovec *iov = iovstack;
 717        ssize_t ret;
 718        io_fn_t fn;
 719        iov_fn_t fnv;
 720
 721        if (!file->f_op) {
 722                ret = -EINVAL;
 723                goto out;
 724        }
 725
 726        ret = rw_copy_check_uvector(type, uvector, nr_segs,
 727                                    ARRAY_SIZE(iovstack), iovstack, &iov);
 728        if (ret <= 0)
 729                goto out;
 730
 731        tot_len = ret;
 732        ret = rw_verify_area(type, file, pos, tot_len);
 733        if (ret < 0)
 734                goto out;
 735
 736        fnv = NULL;
 737        if (type == READ) {
 738                fn = file->f_op->read;
 739                fnv = file->f_op->aio_read;
 740        } else {
 741                fn = (io_fn_t)file->f_op->write;
 742                fnv = file->f_op->aio_write;
 743        }
 744
 745        if (fnv)
 746                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
 747                                                pos, fnv);
 748        else
 749                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 750
 751out:
 752        if (iov != iovstack)
 753                kfree(iov);
 754        if ((ret + (type == READ)) > 0) {
 755                if (type == READ)
 756                        fsnotify_access(file);
 757                else
 758                        fsnotify_modify(file);
 759        }
 760        return ret;
 761}
 762
 763ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 764                  unsigned long vlen, loff_t *pos)
 765{
 766        if (!(file->f_mode & FMODE_READ))
 767                return -EBADF;
 768        if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
 769                return -EINVAL;
 770
 771        return do_readv_writev(READ, file, vec, vlen, pos);
 772}
 773
 774EXPORT_SYMBOL(vfs_readv);
 775
 776ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 777                   unsigned long vlen, loff_t *pos)
 778{
 779        if (!(file->f_mode & FMODE_WRITE))
 780                return -EBADF;
 781        if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
 782                return -EINVAL;
 783
 784        return do_readv_writev(WRITE, file, vec, vlen, pos);
 785}
 786
 787EXPORT_SYMBOL(vfs_writev);
 788
 789SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 790                unsigned long, vlen)
 791{
 792        struct file *file;
 793        ssize_t ret = -EBADF;
 794        int fput_needed;
 795
 796        file = fget_light(fd, &fput_needed);
 797        if (file) {
 798                loff_t pos = file_pos_read(file);
 799                ret = vfs_readv(file, vec, vlen, &pos);
 800                file_pos_write(file, pos);
 801                fput_light(file, fput_needed);
 802        }
 803
 804        if (ret > 0)
 805                add_rchar(current, ret);
 806        inc_syscr(current);
 807        return ret;
 808}
 809
 810SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 811                unsigned long, vlen)
 812{
 813        struct file *file;
 814        ssize_t ret = -EBADF;
 815        int fput_needed;
 816
 817        file = fget_light(fd, &fput_needed);
 818        if (file) {
 819                loff_t pos = file_pos_read(file);
 820                ret = vfs_writev(file, vec, vlen, &pos);
 821                file_pos_write(file, pos);
 822                fput_light(file, fput_needed);
 823        }
 824
 825        if (ret > 0)
 826                add_wchar(current, ret);
 827        inc_syscw(current);
 828        return ret;
 829}
 830
 831static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
 832{
 833#define HALF_LONG_BITS (BITS_PER_LONG / 2)
 834        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
 835}
 836
 837SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 838                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 839{
 840        loff_t pos = pos_from_hilo(pos_h, pos_l);
 841        struct file *file;
 842        ssize_t ret = -EBADF;
 843        int fput_needed;
 844
 845        if (pos < 0)
 846                return -EINVAL;
 847
 848        file = fget_light(fd, &fput_needed);
 849        if (file) {
 850                ret = -ESPIPE;
 851                if (file->f_mode & FMODE_PREAD)
 852                        ret = vfs_readv(file, vec, vlen, &pos);
 853                fput_light(file, fput_needed);
 854        }
 855
 856        if (ret > 0)
 857                add_rchar(current, ret);
 858        inc_syscr(current);
 859        return ret;
 860}
 861
 862SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 863                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 864{
 865        loff_t pos = pos_from_hilo(pos_h, pos_l);
 866        struct file *file;
 867        ssize_t ret = -EBADF;
 868        int fput_needed;
 869
 870        if (pos < 0)
 871                return -EINVAL;
 872
 873        file = fget_light(fd, &fput_needed);
 874        if (file) {
 875                ret = -ESPIPE;
 876                if (file->f_mode & FMODE_PWRITE)
 877                        ret = vfs_writev(file, vec, vlen, &pos);
 878                fput_light(file, fput_needed);
 879        }
 880
 881        if (ret > 0)
 882                add_wchar(current, ret);
 883        inc_syscw(current);
 884        return ret;
 885}
 886
 887static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 888                           size_t count, loff_t max)
 889{
 890        struct file * in_file, * out_file;
 891        struct inode * in_inode, * out_inode;
 892        loff_t pos;
 893        ssize_t retval;
 894        int fput_needed_in, fput_needed_out, fl;
 895
 896        /*
 897         * Get input file, and verify that it is ok..
 898         */
 899        retval = -EBADF;
 900        in_file = fget_light(in_fd, &fput_needed_in);
 901        if (!in_file)
 902                goto out;
 903        if (!(in_file->f_mode & FMODE_READ))
 904                goto fput_in;
 905        retval = -ESPIPE;
 906        if (!ppos)
 907                ppos = &in_file->f_pos;
 908        else
 909                if (!(in_file->f_mode & FMODE_PREAD))
 910                        goto fput_in;
 911        retval = rw_verify_area(READ, in_file, ppos, count);
 912        if (retval < 0)
 913                goto fput_in;
 914        count = retval;
 915
 916        /*
 917         * Get output file, and verify that it is ok..
 918         */
 919        retval = -EBADF;
 920        out_file = fget_light(out_fd, &fput_needed_out);
 921        if (!out_file)
 922                goto fput_in;
 923        if (!(out_file->f_mode & FMODE_WRITE))
 924                goto fput_out;
 925        retval = -EINVAL;
 926        in_inode = in_file->f_path.dentry->d_inode;
 927        out_inode = out_file->f_path.dentry->d_inode;
 928        retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
 929        if (retval < 0)
 930                goto fput_out;
 931        count = retval;
 932
 933        if (!max)
 934                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
 935
 936        pos = *ppos;
 937        if (unlikely(pos + count > max)) {
 938                retval = -EOVERFLOW;
 939                if (pos >= max)
 940                        goto fput_out;
 941                count = max - pos;
 942        }
 943
 944        fl = 0;
 945#if 0
 946        /*
 947         * We need to debate whether we can enable this or not. The
 948         * man page documents EAGAIN return for the output at least,
 949         * and the application is arguably buggy if it doesn't expect
 950         * EAGAIN on a non-blocking file descriptor.
 951         */
 952        if (in_file->f_flags & O_NONBLOCK)
 953                fl = SPLICE_F_NONBLOCK;
 954#endif
 955        retval = do_splice_direct(in_file, ppos, out_file, count, fl);
 956
 957        if (retval > 0) {
 958                add_rchar(current, retval);
 959                add_wchar(current, retval);
 960        }
 961
 962        inc_syscr(current);
 963        inc_syscw(current);
 964        if (*ppos > max)
 965                retval = -EOVERFLOW;
 966
 967fput_out:
 968        fput_light(out_file, fput_needed_out);
 969fput_in:
 970        fput_light(in_file, fput_needed_in);
 971out:
 972        return retval;
 973}
 974
 975SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
 976{
 977        loff_t pos;
 978        off_t off;
 979        ssize_t ret;
 980
 981        if (offset) {
 982                if (unlikely(get_user(off, offset)))
 983                        return -EFAULT;
 984                pos = off;
 985                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
 986                if (unlikely(put_user(pos, offset)))
 987                        return -EFAULT;
 988                return ret;
 989        }
 990
 991        return do_sendfile(out_fd, in_fd, NULL, count, 0);
 992}
 993
 994SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
 995{
 996        loff_t pos;
 997        ssize_t ret;
 998
 999        if (offset) {
1000                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1001                        return -EFAULT;
1002                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1003                if (unlikely(put_user(pos, offset)))
1004                        return -EFAULT;
1005                return ret;
1006        }
1007
1008        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1009}
1010