linux/fs/read_write.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/read_write.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7#include <linux/slab.h> 
   8#include <linux/stat.h>
   9#include <linux/fcntl.h>
  10#include <linux/file.h>
  11#include <linux/uio.h>
  12#include <linux/fsnotify.h>
  13#include <linux/security.h>
  14#include <linux/module.h>
  15#include <linux/syscalls.h>
  16#include <linux/pagemap.h>
  17#include <linux/splice.h>
  18#include "read_write.h"
  19
  20#include <asm/uaccess.h>
  21#include <asm/unistd.h>
  22
  23const struct file_operations generic_ro_fops = {
  24        .llseek         = generic_file_llseek,
  25        .read           = do_sync_read,
  26        .aio_read       = generic_file_aio_read,
  27        .mmap           = generic_file_readonly_mmap,
  28        .splice_read    = generic_file_splice_read,
  29};
  30
  31EXPORT_SYMBOL(generic_ro_fops);
  32
  33static inline int unsigned_offsets(struct file *file)
  34{
  35        return file->f_mode & FMODE_UNSIGNED_OFFSET;
  36}
  37
  38/**
  39 * generic_file_llseek_unlocked - lockless generic llseek implementation
  40 * @file:       file structure to seek on
  41 * @offset:     file offset to seek to
  42 * @origin:     type of seek
  43 *
  44 * Updates the file offset to the value specified by @offset and @origin.
  45 * Locking must be provided by the caller.
  46 */
  47loff_t
  48generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
  49{
  50        struct inode *inode = file->f_mapping->host;
  51
  52        switch (origin) {
  53        case SEEK_END:
  54                offset += inode->i_size;
  55                break;
  56        case SEEK_CUR:
  57                /*
  58                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  59                 * position-querying operation.  Avoid rewriting the "same"
  60                 * f_pos value back to the file because a concurrent read(),
  61                 * write() or lseek() might have altered it
  62                 */
  63                if (offset == 0)
  64                        return file->f_pos;
  65                offset += file->f_pos;
  66                break;
  67        }
  68
  69        if (offset < 0 && !unsigned_offsets(file))
  70                return -EINVAL;
  71        if (offset > inode->i_sb->s_maxbytes)
  72                return -EINVAL;
  73
  74        /* Special lock needed here? */
  75        if (offset != file->f_pos) {
  76                file->f_pos = offset;
  77                file->f_version = 0;
  78        }
  79
  80        return offset;
  81}
  82EXPORT_SYMBOL(generic_file_llseek_unlocked);
  83
  84/**
  85 * generic_file_llseek - generic llseek implementation for regular files
  86 * @file:       file structure to seek on
  87 * @offset:     file offset to seek to
  88 * @origin:     type of seek
  89 *
  90 * This is a generic implemenation of ->llseek useable for all normal local
  91 * filesystems.  It just updates the file offset to the value specified by
  92 * @offset and @origin under i_mutex.
  93 */
  94loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
  95{
  96        loff_t rval;
  97
  98        mutex_lock(&file->f_dentry->d_inode->i_mutex);
  99        rval = generic_file_llseek_unlocked(file, offset, origin);
 100        mutex_unlock(&file->f_dentry->d_inode->i_mutex);
 101
 102        return rval;
 103}
 104EXPORT_SYMBOL(generic_file_llseek);
 105
 106/**
 107 * noop_llseek - No Operation Performed llseek implementation
 108 * @file:       file structure to seek on
 109 * @offset:     file offset to seek to
 110 * @origin:     type of seek
 111 *
 112 * This is an implementation of ->llseek useable for the rare special case when
 113 * userspace expects the seek to succeed but the (device) file is actually not
 114 * able to perform the seek. In this case you use noop_llseek() instead of
 115 * falling back to the default implementation of ->llseek.
 116 */
 117loff_t noop_llseek(struct file *file, loff_t offset, int origin)
 118{
 119        return file->f_pos;
 120}
 121EXPORT_SYMBOL(noop_llseek);
 122
 123loff_t no_llseek(struct file *file, loff_t offset, int origin)
 124{
 125        return -ESPIPE;
 126}
 127EXPORT_SYMBOL(no_llseek);
 128
 129loff_t default_llseek(struct file *file, loff_t offset, int origin)
 130{
 131        loff_t retval;
 132
 133        mutex_lock(&file->f_dentry->d_inode->i_mutex);
 134        switch (origin) {
 135                case SEEK_END:
 136                        offset += i_size_read(file->f_path.dentry->d_inode);
 137                        break;
 138                case SEEK_CUR:
 139                        if (offset == 0) {
 140                                retval = file->f_pos;
 141                                goto out;
 142                        }
 143                        offset += file->f_pos;
 144        }
 145        retval = -EINVAL;
 146        if (offset >= 0 || unsigned_offsets(file)) {
 147                if (offset != file->f_pos) {
 148                        file->f_pos = offset;
 149                        file->f_version = 0;
 150                }
 151                retval = offset;
 152        }
 153out:
 154        mutex_unlock(&file->f_dentry->d_inode->i_mutex);
 155        return retval;
 156}
 157EXPORT_SYMBOL(default_llseek);
 158
 159loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 160{
 161        loff_t (*fn)(struct file *, loff_t, int);
 162
 163        fn = no_llseek;
 164        if (file->f_mode & FMODE_LSEEK) {
 165                if (file->f_op && file->f_op->llseek)
 166                        fn = file->f_op->llseek;
 167        }
 168        return fn(file, offset, origin);
 169}
 170EXPORT_SYMBOL(vfs_llseek);
 171
 172SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
 173{
 174        off_t retval;
 175        struct file * file;
 176        int fput_needed;
 177
 178        retval = -EBADF;
 179        file = fget_light(fd, &fput_needed);
 180        if (!file)
 181                goto bad;
 182
 183        retval = -EINVAL;
 184        if (origin <= SEEK_MAX) {
 185                loff_t res = vfs_llseek(file, offset, origin);
 186                retval = res;
 187                if (res != (loff_t)retval)
 188                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
 189        }
 190        fput_light(file, fput_needed);
 191bad:
 192        return retval;
 193}
 194
 195#ifdef __ARCH_WANT_SYS_LLSEEK
 196SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 197                unsigned long, offset_low, loff_t __user *, result,
 198                unsigned int, origin)
 199{
 200        int retval;
 201        struct file * file;
 202        loff_t offset;
 203        int fput_needed;
 204
 205        retval = -EBADF;
 206        file = fget_light(fd, &fput_needed);
 207        if (!file)
 208                goto bad;
 209
 210        retval = -EINVAL;
 211        if (origin > SEEK_MAX)
 212                goto out_putf;
 213
 214        offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low,
 215                        origin);
 216
 217        retval = (int)offset;
 218        if (offset >= 0) {
 219                retval = -EFAULT;
 220                if (!copy_to_user(result, &offset, sizeof(offset)))
 221                        retval = 0;
 222        }
 223out_putf:
 224        fput_light(file, fput_needed);
 225bad:
 226        return retval;
 227}
 228#endif
 229
 230
 231/*
 232 * rw_verify_area doesn't like huge counts. We limit
 233 * them to something that fits in "int" so that others
 234 * won't have to do range checks all the time.
 235 */
 236int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
 237{
 238        struct inode *inode;
 239        loff_t pos;
 240        int retval = -EINVAL;
 241
 242        inode = file->f_path.dentry->d_inode;
 243        if (unlikely((ssize_t) count < 0))
 244                return retval;
 245        pos = *ppos;
 246        if (unlikely(pos < 0)) {
 247                if (!unsigned_offsets(file))
 248                        return retval;
 249                if (count >= -pos) /* both values are in 0..LLONG_MAX */
 250                        return -EOVERFLOW;
 251        } else if (unlikely((loff_t) (pos + count) < 0)) {
 252                if (!unsigned_offsets(file))
 253                        return retval;
 254        }
 255
 256        if (unlikely(inode->i_flock && mandatory_lock(inode))) {
 257                retval = locks_mandatory_area(
 258                        read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
 259                        inode, file, pos, count);
 260                if (retval < 0)
 261                        return retval;
 262        }
 263        retval = security_file_permission(file,
 264                                read_write == READ ? MAY_READ : MAY_WRITE);
 265        if (retval)
 266                return retval;
 267        return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
 268}
 269
 270static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
 271{
 272        set_current_state(TASK_UNINTERRUPTIBLE);
 273        if (!kiocbIsKicked(iocb))
 274                schedule();
 275        else
 276                kiocbClearKicked(iocb);
 277        __set_current_state(TASK_RUNNING);
 278}
 279
 280ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 281{
 282        struct iovec iov = { .iov_base = buf, .iov_len = len };
 283        struct kiocb kiocb;
 284        ssize_t ret;
 285
 286        init_sync_kiocb(&kiocb, filp);
 287        kiocb.ki_pos = *ppos;
 288        kiocb.ki_left = len;
 289        kiocb.ki_nbytes = len;
 290
 291        for (;;) {
 292                ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
 293                if (ret != -EIOCBRETRY)
 294                        break;
 295                wait_on_retry_sync_kiocb(&kiocb);
 296        }
 297
 298        if (-EIOCBQUEUED == ret)
 299                ret = wait_on_sync_kiocb(&kiocb);
 300        *ppos = kiocb.ki_pos;
 301        return ret;
 302}
 303
 304EXPORT_SYMBOL(do_sync_read);
 305
 306ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 307{
 308        ssize_t ret;
 309
 310        if (!(file->f_mode & FMODE_READ))
 311                return -EBADF;
 312        if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
 313                return -EINVAL;
 314        if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 315                return -EFAULT;
 316
 317        ret = rw_verify_area(READ, file, pos, count);
 318        if (ret >= 0) {
 319                count = ret;
 320                if (file->f_op->read)
 321                        ret = file->f_op->read(file, buf, count, pos);
 322                else
 323                        ret = do_sync_read(file, buf, count, pos);
 324                if (ret > 0) {
 325                        fsnotify_access(file);
 326                        add_rchar(current, ret);
 327                }
 328                inc_syscr(current);
 329        }
 330
 331        return ret;
 332}
 333
 334EXPORT_SYMBOL(vfs_read);
 335
 336ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 337{
 338        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 339        struct kiocb kiocb;
 340        ssize_t ret;
 341
 342        init_sync_kiocb(&kiocb, filp);
 343        kiocb.ki_pos = *ppos;
 344        kiocb.ki_left = len;
 345        kiocb.ki_nbytes = len;
 346
 347        for (;;) {
 348                ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
 349                if (ret != -EIOCBRETRY)
 350                        break;
 351                wait_on_retry_sync_kiocb(&kiocb);
 352        }
 353
 354        if (-EIOCBQUEUED == ret)
 355                ret = wait_on_sync_kiocb(&kiocb);
 356        *ppos = kiocb.ki_pos;
 357        return ret;
 358}
 359
 360EXPORT_SYMBOL(do_sync_write);
 361
 362ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 363{
 364        ssize_t ret;
 365
 366        if (!(file->f_mode & FMODE_WRITE))
 367                return -EBADF;
 368        if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
 369                return -EINVAL;
 370        if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 371                return -EFAULT;
 372
 373        ret = rw_verify_area(WRITE, file, pos, count);
 374        if (ret >= 0) {
 375                count = ret;
 376                if (file->f_op->write)
 377                        ret = file->f_op->write(file, buf, count, pos);
 378                else
 379                        ret = do_sync_write(file, buf, count, pos);
 380                if (ret > 0) {
 381                        fsnotify_modify(file);
 382                        add_wchar(current, ret);
 383                }
 384                inc_syscw(current);
 385        }
 386
 387        return ret;
 388}
 389
 390EXPORT_SYMBOL(vfs_write);
 391
 392static inline loff_t file_pos_read(struct file *file)
 393{
 394        return file->f_pos;
 395}
 396
 397static inline void file_pos_write(struct file *file, loff_t pos)
 398{
 399        file->f_pos = pos;
 400}
 401
 402SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 403{
 404        struct file *file;
 405        ssize_t ret = -EBADF;
 406        int fput_needed;
 407
 408        file = fget_light(fd, &fput_needed);
 409        if (file) {
 410                loff_t pos = file_pos_read(file);
 411                ret = vfs_read(file, buf, count, &pos);
 412                file_pos_write(file, pos);
 413                fput_light(file, fput_needed);
 414        }
 415
 416        return ret;
 417}
 418
 419SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 420                size_t, count)
 421{
 422        struct file *file;
 423        ssize_t ret = -EBADF;
 424        int fput_needed;
 425
 426        file = fget_light(fd, &fput_needed);
 427        if (file) {
 428                loff_t pos = file_pos_read(file);
 429                ret = vfs_write(file, buf, count, &pos);
 430                file_pos_write(file, pos);
 431                fput_light(file, fput_needed);
 432        }
 433
 434        return ret;
 435}
 436
 437SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
 438                        size_t count, loff_t pos)
 439{
 440        struct file *file;
 441        ssize_t ret = -EBADF;
 442        int fput_needed;
 443
 444        if (pos < 0)
 445                return -EINVAL;
 446
 447        file = fget_light(fd, &fput_needed);
 448        if (file) {
 449                ret = -ESPIPE;
 450                if (file->f_mode & FMODE_PREAD)
 451                        ret = vfs_read(file, buf, count, &pos);
 452                fput_light(file, fput_needed);
 453        }
 454
 455        return ret;
 456}
 457#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 458asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
 459{
 460        return SYSC_pread64((unsigned int) fd, (char __user *) buf,
 461                            (size_t) count, pos);
 462}
 463SYSCALL_ALIAS(sys_pread64, SyS_pread64);
 464#endif
 465
 466SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
 467                         size_t count, loff_t pos)
 468{
 469        struct file *file;
 470        ssize_t ret = -EBADF;
 471        int fput_needed;
 472
 473        if (pos < 0)
 474                return -EINVAL;
 475
 476        file = fget_light(fd, &fput_needed);
 477        if (file) {
 478                ret = -ESPIPE;
 479                if (file->f_mode & FMODE_PWRITE)  
 480                        ret = vfs_write(file, buf, count, &pos);
 481                fput_light(file, fput_needed);
 482        }
 483
 484        return ret;
 485}
 486#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 487asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
 488{
 489        return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
 490                             (size_t) count, pos);
 491}
 492SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
 493#endif
 494
 495/*
 496 * Reduce an iovec's length in-place.  Return the resulting number of segments
 497 */
 498unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 499{
 500        unsigned long seg = 0;
 501        size_t len = 0;
 502
 503        while (seg < nr_segs) {
 504                seg++;
 505                if (len + iov->iov_len >= to) {
 506                        iov->iov_len = to - len;
 507                        break;
 508                }
 509                len += iov->iov_len;
 510                iov++;
 511        }
 512        return seg;
 513}
 514EXPORT_SYMBOL(iov_shorten);
 515
 516ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 517                unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
 518{
 519        struct kiocb kiocb;
 520        ssize_t ret;
 521
 522        init_sync_kiocb(&kiocb, filp);
 523        kiocb.ki_pos = *ppos;
 524        kiocb.ki_left = len;
 525        kiocb.ki_nbytes = len;
 526
 527        for (;;) {
 528                ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
 529                if (ret != -EIOCBRETRY)
 530                        break;
 531                wait_on_retry_sync_kiocb(&kiocb);
 532        }
 533
 534        if (ret == -EIOCBQUEUED)
 535                ret = wait_on_sync_kiocb(&kiocb);
 536        *ppos = kiocb.ki_pos;
 537        return ret;
 538}
 539
 540/* Do it by hand, with file-ops */
 541ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
 542                unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
 543{
 544        struct iovec *vector = iov;
 545        ssize_t ret = 0;
 546
 547        while (nr_segs > 0) {
 548                void __user *base;
 549                size_t len;
 550                ssize_t nr;
 551
 552                base = vector->iov_base;
 553                len = vector->iov_len;
 554                vector++;
 555                nr_segs--;
 556
 557                nr = fn(filp, base, len, ppos);
 558
 559                if (nr < 0) {
 560                        if (!ret)
 561                                ret = nr;
 562                        break;
 563                }
 564                ret += nr;
 565                if (nr != len)
 566                        break;
 567        }
 568
 569        return ret;
 570}
 571
 572/* A write operation does a read from user space and vice versa */
 573#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 574
 575ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 576                              unsigned long nr_segs, unsigned long fast_segs,
 577                              struct iovec *fast_pointer,
 578                              struct iovec **ret_pointer)
 579{
 580        unsigned long seg;
 581        ssize_t ret;
 582        struct iovec *iov = fast_pointer;
 583
 584        /*
 585         * SuS says "The readv() function *may* fail if the iovcnt argument
 586         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 587         * traditionally returned zero for zero segments, so...
 588         */
 589        if (nr_segs == 0) {
 590                ret = 0;
 591                goto out;
 592        }
 593
 594        /*
 595         * First get the "struct iovec" from user memory and
 596         * verify all the pointers
 597         */
 598        if (nr_segs > UIO_MAXIOV) {
 599                ret = -EINVAL;
 600                goto out;
 601        }
 602        if (nr_segs > fast_segs) {
 603                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 604                if (iov == NULL) {
 605                        ret = -ENOMEM;
 606                        goto out;
 607                }
 608        }
 609        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 610                ret = -EFAULT;
 611                goto out;
 612        }
 613
 614        /*
 615         * According to the Single Unix Specification we should return EINVAL
 616         * if an element length is < 0 when cast to ssize_t or if the
 617         * total length would overflow the ssize_t return value of the
 618         * system call.
 619         *
 620         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 621         * overflow case.
 622         */
 623        ret = 0;
 624        for (seg = 0; seg < nr_segs; seg++) {
 625                void __user *buf = iov[seg].iov_base;
 626                ssize_t len = (ssize_t)iov[seg].iov_len;
 627
 628                /* see if we we're about to use an invalid len or if
 629                 * it's about to overflow ssize_t */
 630                if (len < 0) {
 631                        ret = -EINVAL;
 632                        goto out;
 633                }
 634                if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 635                        ret = -EFAULT;
 636                        goto out;
 637                }
 638                if (len > MAX_RW_COUNT - ret) {
 639                        len = MAX_RW_COUNT - ret;
 640                        iov[seg].iov_len = len;
 641                }
 642                ret += len;
 643        }
 644out:
 645        *ret_pointer = iov;
 646        return ret;
 647}
 648
 649static ssize_t do_readv_writev(int type, struct file *file,
 650                               const struct iovec __user * uvector,
 651                               unsigned long nr_segs, loff_t *pos)
 652{
 653        size_t tot_len;
 654        struct iovec iovstack[UIO_FASTIOV];
 655        struct iovec *iov = iovstack;
 656        ssize_t ret;
 657        io_fn_t fn;
 658        iov_fn_t fnv;
 659
 660        if (!file->f_op) {
 661                ret = -EINVAL;
 662                goto out;
 663        }
 664
 665        ret = rw_copy_check_uvector(type, uvector, nr_segs,
 666                        ARRAY_SIZE(iovstack), iovstack, &iov);
 667        if (ret <= 0)
 668                goto out;
 669
 670        tot_len = ret;
 671        ret = rw_verify_area(type, file, pos, tot_len);
 672        if (ret < 0)
 673                goto out;
 674
 675        fnv = NULL;
 676        if (type == READ) {
 677                fn = file->f_op->read;
 678                fnv = file->f_op->aio_read;
 679        } else {
 680                fn = (io_fn_t)file->f_op->write;
 681                fnv = file->f_op->aio_write;
 682        }
 683
 684        if (fnv)
 685                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
 686                                                pos, fnv);
 687        else
 688                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 689
 690out:
 691        if (iov != iovstack)
 692                kfree(iov);
 693        if ((ret + (type == READ)) > 0) {
 694                if (type == READ)
 695                        fsnotify_access(file);
 696                else
 697                        fsnotify_modify(file);
 698        }
 699        return ret;
 700}
 701
 702ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 703                  unsigned long vlen, loff_t *pos)
 704{
 705        if (!(file->f_mode & FMODE_READ))
 706                return -EBADF;
 707        if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
 708                return -EINVAL;
 709
 710        return do_readv_writev(READ, file, vec, vlen, pos);
 711}
 712
 713EXPORT_SYMBOL(vfs_readv);
 714
 715ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 716                   unsigned long vlen, loff_t *pos)
 717{
 718        if (!(file->f_mode & FMODE_WRITE))
 719                return -EBADF;
 720        if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
 721                return -EINVAL;
 722
 723        return do_readv_writev(WRITE, file, vec, vlen, pos);
 724}
 725
 726EXPORT_SYMBOL(vfs_writev);
 727
 728SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 729                unsigned long, vlen)
 730{
 731        struct file *file;
 732        ssize_t ret = -EBADF;
 733        int fput_needed;
 734
 735        file = fget_light(fd, &fput_needed);
 736        if (file) {
 737                loff_t pos = file_pos_read(file);
 738                ret = vfs_readv(file, vec, vlen, &pos);
 739                file_pos_write(file, pos);
 740                fput_light(file, fput_needed);
 741        }
 742
 743        if (ret > 0)
 744                add_rchar(current, ret);
 745        inc_syscr(current);
 746        return ret;
 747}
 748
 749SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 750                unsigned long, vlen)
 751{
 752        struct file *file;
 753        ssize_t ret = -EBADF;
 754        int fput_needed;
 755
 756        file = fget_light(fd, &fput_needed);
 757        if (file) {
 758                loff_t pos = file_pos_read(file);
 759                ret = vfs_writev(file, vec, vlen, &pos);
 760                file_pos_write(file, pos);
 761                fput_light(file, fput_needed);
 762        }
 763
 764        if (ret > 0)
 765                add_wchar(current, ret);
 766        inc_syscw(current);
 767        return ret;
 768}
 769
 770static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
 771{
 772#define HALF_LONG_BITS (BITS_PER_LONG / 2)
 773        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
 774}
 775
 776SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 777                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 778{
 779        loff_t pos = pos_from_hilo(pos_h, pos_l);
 780        struct file *file;
 781        ssize_t ret = -EBADF;
 782        int fput_needed;
 783
 784        if (pos < 0)
 785                return -EINVAL;
 786
 787        file = fget_light(fd, &fput_needed);
 788        if (file) {
 789                ret = -ESPIPE;
 790                if (file->f_mode & FMODE_PREAD)
 791                        ret = vfs_readv(file, vec, vlen, &pos);
 792                fput_light(file, fput_needed);
 793        }
 794
 795        if (ret > 0)
 796                add_rchar(current, ret);
 797        inc_syscr(current);
 798        return ret;
 799}
 800
 801SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 802                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 803{
 804        loff_t pos = pos_from_hilo(pos_h, pos_l);
 805        struct file *file;
 806        ssize_t ret = -EBADF;
 807        int fput_needed;
 808
 809        if (pos < 0)
 810                return -EINVAL;
 811
 812        file = fget_light(fd, &fput_needed);
 813        if (file) {
 814                ret = -ESPIPE;
 815                if (file->f_mode & FMODE_PWRITE)
 816                        ret = vfs_writev(file, vec, vlen, &pos);
 817                fput_light(file, fput_needed);
 818        }
 819
 820        if (ret > 0)
 821                add_wchar(current, ret);
 822        inc_syscw(current);
 823        return ret;
 824}
 825
 826static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 827                           size_t count, loff_t max)
 828{
 829        struct file * in_file, * out_file;
 830        struct inode * in_inode, * out_inode;
 831        loff_t pos;
 832        ssize_t retval;
 833        int fput_needed_in, fput_needed_out, fl;
 834
 835        /*
 836         * Get input file, and verify that it is ok..
 837         */
 838        retval = -EBADF;
 839        in_file = fget_light(in_fd, &fput_needed_in);
 840        if (!in_file)
 841                goto out;
 842        if (!(in_file->f_mode & FMODE_READ))
 843                goto fput_in;
 844        retval = -ESPIPE;
 845        if (!ppos)
 846                ppos = &in_file->f_pos;
 847        else
 848                if (!(in_file->f_mode & FMODE_PREAD))
 849                        goto fput_in;
 850        retval = rw_verify_area(READ, in_file, ppos, count);
 851        if (retval < 0)
 852                goto fput_in;
 853        count = retval;
 854
 855        /*
 856         * Get output file, and verify that it is ok..
 857         */
 858        retval = -EBADF;
 859        out_file = fget_light(out_fd, &fput_needed_out);
 860        if (!out_file)
 861                goto fput_in;
 862        if (!(out_file->f_mode & FMODE_WRITE))
 863                goto fput_out;
 864        retval = -EINVAL;
 865        in_inode = in_file->f_path.dentry->d_inode;
 866        out_inode = out_file->f_path.dentry->d_inode;
 867        retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
 868        if (retval < 0)
 869                goto fput_out;
 870        count = retval;
 871
 872        if (!max)
 873                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
 874
 875        pos = *ppos;
 876        if (unlikely(pos + count > max)) {
 877                retval = -EOVERFLOW;
 878                if (pos >= max)
 879                        goto fput_out;
 880                count = max - pos;
 881        }
 882
 883        fl = 0;
 884#if 0
 885        /*
 886         * We need to debate whether we can enable this or not. The
 887         * man page documents EAGAIN return for the output at least,
 888         * and the application is arguably buggy if it doesn't expect
 889         * EAGAIN on a non-blocking file descriptor.
 890         */
 891        if (in_file->f_flags & O_NONBLOCK)
 892                fl = SPLICE_F_NONBLOCK;
 893#endif
 894        retval = do_splice_direct(in_file, ppos, out_file, count, fl);
 895
 896        if (retval > 0) {
 897                add_rchar(current, retval);
 898                add_wchar(current, retval);
 899        }
 900
 901        inc_syscr(current);
 902        inc_syscw(current);
 903        if (*ppos > max)
 904                retval = -EOVERFLOW;
 905
 906fput_out:
 907        fput_light(out_file, fput_needed_out);
 908fput_in:
 909        fput_light(in_file, fput_needed_in);
 910out:
 911        return retval;
 912}
 913
 914SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
 915{
 916        loff_t pos;
 917        off_t off;
 918        ssize_t ret;
 919
 920        if (offset) {
 921                if (unlikely(get_user(off, offset)))
 922                        return -EFAULT;
 923                pos = off;
 924                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
 925                if (unlikely(put_user(pos, offset)))
 926                        return -EFAULT;
 927                return ret;
 928        }
 929
 930        return do_sendfile(out_fd, in_fd, NULL, count, 0);
 931}
 932
 933SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
 934{
 935        loff_t pos;
 936        ssize_t ret;
 937
 938        if (offset) {
 939                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
 940                        return -EFAULT;
 941                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
 942                if (unlikely(put_user(pos, offset)))
 943                        return -EFAULT;
 944                return ret;
 945        }
 946
 947        return do_sendfile(out_fd, in_fd, NULL, count, 0);
 948}
 949