linux/fs/read_write.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/read_write.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7#include <linux/slab.h> 
   8#include <linux/stat.h>
   9#include <linux/fcntl.h>
  10#include <linux/file.h>
  11#include <linux/uio.h>
  12#include <linux/fsnotify.h>
  13#include <linux/security.h>
  14#include <linux/module.h>
  15#include <linux/syscalls.h>
  16#include <linux/pagemap.h>
  17#include <linux/splice.h>
  18#include "read_write.h"
  19
  20#include <asm/uaccess.h>
  21#include <asm/unistd.h>
  22
  23const struct file_operations generic_ro_fops = {
  24        .llseek         = generic_file_llseek,
  25        .read           = do_sync_read,
  26        .aio_read       = generic_file_aio_read,
  27        .mmap           = generic_file_readonly_mmap,
  28        .splice_read    = generic_file_splice_read,
  29};
  30
  31EXPORT_SYMBOL(generic_ro_fops);
  32
  33static inline int unsigned_offsets(struct file *file)
  34{
  35        return file->f_mode & FMODE_UNSIGNED_OFFSET;
  36}
  37
  38static loff_t lseek_execute(struct file *file, struct inode *inode,
  39                loff_t offset, loff_t maxsize)
  40{
  41        if (offset < 0 && !unsigned_offsets(file))
  42                return -EINVAL;
  43        if (offset > maxsize)
  44                return -EINVAL;
  45
  46        if (offset != file->f_pos) {
  47                file->f_pos = offset;
  48                file->f_version = 0;
  49        }
  50        return offset;
  51}
  52
  53/**
  54 * generic_file_llseek_size - generic llseek implementation for regular files
  55 * @file:       file structure to seek on
  56 * @offset:     file offset to seek to
  57 * @origin:     type of seek
  58 * @size:       max size of file system
  59 *
  60 * This is a variant of generic_file_llseek that allows passing in a custom
  61 * file size.
  62 *
  63 * Synchronization:
  64 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  65 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  66 * read/writes behave like SEEK_SET against seeks.
  67 */
  68loff_t
  69generic_file_llseek_size(struct file *file, loff_t offset, int origin,
  70                loff_t maxsize)
  71{
  72        struct inode *inode = file->f_mapping->host;
  73
  74        switch (origin) {
  75        case SEEK_END:
  76                offset += i_size_read(inode);
  77                break;
  78        case SEEK_CUR:
  79                /*
  80                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  81                 * position-querying operation.  Avoid rewriting the "same"
  82                 * f_pos value back to the file because a concurrent read(),
  83                 * write() or lseek() might have altered it
  84                 */
  85                if (offset == 0)
  86                        return file->f_pos;
  87                /*
  88                 * f_lock protects against read/modify/write race with other
  89                 * SEEK_CURs. Note that parallel writes and reads behave
  90                 * like SEEK_SET.
  91                 */
  92                spin_lock(&file->f_lock);
  93                offset = lseek_execute(file, inode, file->f_pos + offset,
  94                                       maxsize);
  95                spin_unlock(&file->f_lock);
  96                return offset;
  97        case SEEK_DATA:
  98                /*
  99                 * In the generic case the entire file is data, so as long as
 100                 * offset isn't at the end of the file then the offset is data.
 101                 */
 102                if (offset >= i_size_read(inode))
 103                        return -ENXIO;
 104                break;
 105        case SEEK_HOLE:
 106                /*
 107                 * There is a virtual hole at the end of the file, so as long as
 108                 * offset isn't i_size or larger, return i_size.
 109                 */
 110                if (offset >= i_size_read(inode))
 111                        return -ENXIO;
 112                offset = i_size_read(inode);
 113                break;
 114        }
 115
 116        return lseek_execute(file, inode, offset, maxsize);
 117}
 118EXPORT_SYMBOL(generic_file_llseek_size);
 119
 120/**
 121 * generic_file_llseek - generic llseek implementation for regular files
 122 * @file:       file structure to seek on
 123 * @offset:     file offset to seek to
 124 * @origin:     type of seek
 125 *
 126 * This is a generic implemenation of ->llseek useable for all normal local
 127 * filesystems.  It just updates the file offset to the value specified by
 128 * @offset and @origin under i_mutex.
 129 */
 130loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 131{
 132        struct inode *inode = file->f_mapping->host;
 133
 134        return generic_file_llseek_size(file, offset, origin,
 135                                        inode->i_sb->s_maxbytes);
 136}
 137EXPORT_SYMBOL(generic_file_llseek);
 138
 139/**
 140 * noop_llseek - No Operation Performed llseek implementation
 141 * @file:       file structure to seek on
 142 * @offset:     file offset to seek to
 143 * @origin:     type of seek
 144 *
 145 * This is an implementation of ->llseek useable for the rare special case when
 146 * userspace expects the seek to succeed but the (device) file is actually not
 147 * able to perform the seek. In this case you use noop_llseek() instead of
 148 * falling back to the default implementation of ->llseek.
 149 */
 150loff_t noop_llseek(struct file *file, loff_t offset, int origin)
 151{
 152        return file->f_pos;
 153}
 154EXPORT_SYMBOL(noop_llseek);
 155
 156loff_t no_llseek(struct file *file, loff_t offset, int origin)
 157{
 158        return -ESPIPE;
 159}
 160EXPORT_SYMBOL(no_llseek);
 161
 162loff_t default_llseek(struct file *file, loff_t offset, int origin)
 163{
 164        struct inode *inode = file->f_path.dentry->d_inode;
 165        loff_t retval;
 166
 167        mutex_lock(&inode->i_mutex);
 168        switch (origin) {
 169                case SEEK_END:
 170                        offset += i_size_read(inode);
 171                        break;
 172                case SEEK_CUR:
 173                        if (offset == 0) {
 174                                retval = file->f_pos;
 175                                goto out;
 176                        }
 177                        offset += file->f_pos;
 178                        break;
 179                case SEEK_DATA:
 180                        /*
 181                         * In the generic case the entire file is data, so as
 182                         * long as offset isn't at the end of the file then the
 183                         * offset is data.
 184                         */
 185                        if (offset >= inode->i_size) {
 186                                retval = -ENXIO;
 187                                goto out;
 188                        }
 189                        break;
 190                case SEEK_HOLE:
 191                        /*
 192                         * There is a virtual hole at the end of the file, so
 193                         * as long as offset isn't i_size or larger, return
 194                         * i_size.
 195                         */
 196                        if (offset >= inode->i_size) {
 197                                retval = -ENXIO;
 198                                goto out;
 199                        }
 200                        offset = inode->i_size;
 201                        break;
 202        }
 203        retval = -EINVAL;
 204        if (offset >= 0 || unsigned_offsets(file)) {
 205                if (offset != file->f_pos) {
 206                        file->f_pos = offset;
 207                        file->f_version = 0;
 208                }
 209                retval = offset;
 210        }
 211out:
 212        mutex_unlock(&inode->i_mutex);
 213        return retval;
 214}
 215EXPORT_SYMBOL(default_llseek);
 216
 217loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 218{
 219        loff_t (*fn)(struct file *, loff_t, int);
 220
 221        fn = no_llseek;
 222        if (file->f_mode & FMODE_LSEEK) {
 223                if (file->f_op && file->f_op->llseek)
 224                        fn = file->f_op->llseek;
 225        }
 226        return fn(file, offset, origin);
 227}
 228EXPORT_SYMBOL(vfs_llseek);
 229
 230SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
 231{
 232        off_t retval;
 233        struct file * file;
 234        int fput_needed;
 235
 236        retval = -EBADF;
 237        file = fget_light(fd, &fput_needed);
 238        if (!file)
 239                goto bad;
 240
 241        retval = -EINVAL;
 242        if (origin <= SEEK_MAX) {
 243                loff_t res = vfs_llseek(file, offset, origin);
 244                retval = res;
 245                if (res != (loff_t)retval)
 246                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
 247        }
 248        fput_light(file, fput_needed);
 249bad:
 250        return retval;
 251}
 252
 253#ifdef __ARCH_WANT_SYS_LLSEEK
 254SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 255                unsigned long, offset_low, loff_t __user *, result,
 256                unsigned int, origin)
 257{
 258        int retval;
 259        struct file * file;
 260        loff_t offset;
 261        int fput_needed;
 262
 263        retval = -EBADF;
 264        file = fget_light(fd, &fput_needed);
 265        if (!file)
 266                goto bad;
 267
 268        retval = -EINVAL;
 269        if (origin > SEEK_MAX)
 270                goto out_putf;
 271
 272        offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low,
 273                        origin);
 274
 275        retval = (int)offset;
 276        if (offset >= 0) {
 277                retval = -EFAULT;
 278                if (!copy_to_user(result, &offset, sizeof(offset)))
 279                        retval = 0;
 280        }
 281out_putf:
 282        fput_light(file, fput_needed);
 283bad:
 284        return retval;
 285}
 286#endif
 287
 288
 289/*
 290 * rw_verify_area doesn't like huge counts. We limit
 291 * them to something that fits in "int" so that others
 292 * won't have to do range checks all the time.
 293 */
 294int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
 295{
 296        struct inode *inode;
 297        loff_t pos;
 298        int retval = -EINVAL;
 299
 300        inode = file->f_path.dentry->d_inode;
 301        if (unlikely((ssize_t) count < 0))
 302                return retval;
 303        pos = *ppos;
 304        if (unlikely(pos < 0)) {
 305                if (!unsigned_offsets(file))
 306                        return retval;
 307                if (count >= -pos) /* both values are in 0..LLONG_MAX */
 308                        return -EOVERFLOW;
 309        } else if (unlikely((loff_t) (pos + count) < 0)) {
 310                if (!unsigned_offsets(file))
 311                        return retval;
 312        }
 313
 314        if (unlikely(inode->i_flock && mandatory_lock(inode))) {
 315                retval = locks_mandatory_area(
 316                        read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
 317                        inode, file, pos, count);
 318                if (retval < 0)
 319                        return retval;
 320        }
 321        retval = security_file_permission(file,
 322                                read_write == READ ? MAY_READ : MAY_WRITE);
 323        if (retval)
 324                return retval;
 325        return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
 326}
 327
 328static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
 329{
 330        set_current_state(TASK_UNINTERRUPTIBLE);
 331        if (!kiocbIsKicked(iocb))
 332                schedule();
 333        else
 334                kiocbClearKicked(iocb);
 335        __set_current_state(TASK_RUNNING);
 336}
 337
 338ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 339{
 340        struct iovec iov = { .iov_base = buf, .iov_len = len };
 341        struct kiocb kiocb;
 342        ssize_t ret;
 343
 344        init_sync_kiocb(&kiocb, filp);
 345        kiocb.ki_pos = *ppos;
 346        kiocb.ki_left = len;
 347        kiocb.ki_nbytes = len;
 348
 349        for (;;) {
 350                ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
 351                if (ret != -EIOCBRETRY)
 352                        break;
 353                wait_on_retry_sync_kiocb(&kiocb);
 354        }
 355
 356        if (-EIOCBQUEUED == ret)
 357                ret = wait_on_sync_kiocb(&kiocb);
 358        *ppos = kiocb.ki_pos;
 359        return ret;
 360}
 361
 362EXPORT_SYMBOL(do_sync_read);
 363
 364ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 365{
 366        ssize_t ret;
 367
 368        if (!(file->f_mode & FMODE_READ))
 369                return -EBADF;
 370        if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
 371                return -EINVAL;
 372        if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 373                return -EFAULT;
 374
 375        ret = rw_verify_area(READ, file, pos, count);
 376        if (ret >= 0) {
 377                count = ret;
 378                if (file->f_op->read)
 379                        ret = file->f_op->read(file, buf, count, pos);
 380                else
 381                        ret = do_sync_read(file, buf, count, pos);
 382                if (ret > 0) {
 383                        fsnotify_access(file);
 384                        add_rchar(current, ret);
 385                }
 386                inc_syscr(current);
 387        }
 388
 389        return ret;
 390}
 391
 392EXPORT_SYMBOL(vfs_read);
 393
 394ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 395{
 396        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 397        struct kiocb kiocb;
 398        ssize_t ret;
 399
 400        init_sync_kiocb(&kiocb, filp);
 401        kiocb.ki_pos = *ppos;
 402        kiocb.ki_left = len;
 403        kiocb.ki_nbytes = len;
 404
 405        for (;;) {
 406                ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
 407                if (ret != -EIOCBRETRY)
 408                        break;
 409                wait_on_retry_sync_kiocb(&kiocb);
 410        }
 411
 412        if (-EIOCBQUEUED == ret)
 413                ret = wait_on_sync_kiocb(&kiocb);
 414        *ppos = kiocb.ki_pos;
 415        return ret;
 416}
 417
 418EXPORT_SYMBOL(do_sync_write);
 419
 420ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 421{
 422        ssize_t ret;
 423
 424        if (!(file->f_mode & FMODE_WRITE))
 425                return -EBADF;
 426        if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
 427                return -EINVAL;
 428        if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 429                return -EFAULT;
 430
 431        ret = rw_verify_area(WRITE, file, pos, count);
 432        if (ret >= 0) {
 433                count = ret;
 434                if (file->f_op->write)
 435                        ret = file->f_op->write(file, buf, count, pos);
 436                else
 437                        ret = do_sync_write(file, buf, count, pos);
 438                if (ret > 0) {
 439                        fsnotify_modify(file);
 440                        add_wchar(current, ret);
 441                }
 442                inc_syscw(current);
 443        }
 444
 445        return ret;
 446}
 447
 448EXPORT_SYMBOL(vfs_write);
 449
 450static inline loff_t file_pos_read(struct file *file)
 451{
 452        return file->f_pos;
 453}
 454
 455static inline void file_pos_write(struct file *file, loff_t pos)
 456{
 457        file->f_pos = pos;
 458}
 459
 460SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 461{
 462        struct file *file;
 463        ssize_t ret = -EBADF;
 464        int fput_needed;
 465
 466        file = fget_light(fd, &fput_needed);
 467        if (file) {
 468                loff_t pos = file_pos_read(file);
 469                ret = vfs_read(file, buf, count, &pos);
 470                file_pos_write(file, pos);
 471                fput_light(file, fput_needed);
 472        }
 473
 474        return ret;
 475}
 476
 477SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 478                size_t, count)
 479{
 480        struct file *file;
 481        ssize_t ret = -EBADF;
 482        int fput_needed;
 483
 484        file = fget_light(fd, &fput_needed);
 485        if (file) {
 486                loff_t pos = file_pos_read(file);
 487                ret = vfs_write(file, buf, count, &pos);
 488                file_pos_write(file, pos);
 489                fput_light(file, fput_needed);
 490        }
 491
 492        return ret;
 493}
 494
 495SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
 496                        size_t count, loff_t pos)
 497{
 498        struct file *file;
 499        ssize_t ret = -EBADF;
 500        int fput_needed;
 501
 502        if (pos < 0)
 503                return -EINVAL;
 504
 505        file = fget_light(fd, &fput_needed);
 506        if (file) {
 507                ret = -ESPIPE;
 508                if (file->f_mode & FMODE_PREAD)
 509                        ret = vfs_read(file, buf, count, &pos);
 510                fput_light(file, fput_needed);
 511        }
 512
 513        return ret;
 514}
 515#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 516asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
 517{
 518        return SYSC_pread64((unsigned int) fd, (char __user *) buf,
 519                            (size_t) count, pos);
 520}
 521SYSCALL_ALIAS(sys_pread64, SyS_pread64);
 522#endif
 523
 524SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
 525                         size_t count, loff_t pos)
 526{
 527        struct file *file;
 528        ssize_t ret = -EBADF;
 529        int fput_needed;
 530
 531        if (pos < 0)
 532                return -EINVAL;
 533
 534        file = fget_light(fd, &fput_needed);
 535        if (file) {
 536                ret = -ESPIPE;
 537                if (file->f_mode & FMODE_PWRITE)  
 538                        ret = vfs_write(file, buf, count, &pos);
 539                fput_light(file, fput_needed);
 540        }
 541
 542        return ret;
 543}
 544#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 545asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
 546{
 547        return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
 548                             (size_t) count, pos);
 549}
 550SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
 551#endif
 552
 553/*
 554 * Reduce an iovec's length in-place.  Return the resulting number of segments
 555 */
 556unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 557{
 558        unsigned long seg = 0;
 559        size_t len = 0;
 560
 561        while (seg < nr_segs) {
 562                seg++;
 563                if (len + iov->iov_len >= to) {
 564                        iov->iov_len = to - len;
 565                        break;
 566                }
 567                len += iov->iov_len;
 568                iov++;
 569        }
 570        return seg;
 571}
 572EXPORT_SYMBOL(iov_shorten);
 573
 574ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 575                unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
 576{
 577        struct kiocb kiocb;
 578        ssize_t ret;
 579
 580        init_sync_kiocb(&kiocb, filp);
 581        kiocb.ki_pos = *ppos;
 582        kiocb.ki_left = len;
 583        kiocb.ki_nbytes = len;
 584
 585        for (;;) {
 586                ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
 587                if (ret != -EIOCBRETRY)
 588                        break;
 589                wait_on_retry_sync_kiocb(&kiocb);
 590        }
 591
 592        if (ret == -EIOCBQUEUED)
 593                ret = wait_on_sync_kiocb(&kiocb);
 594        *ppos = kiocb.ki_pos;
 595        return ret;
 596}
 597
 598/* Do it by hand, with file-ops */
 599ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
 600                unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
 601{
 602        struct iovec *vector = iov;
 603        ssize_t ret = 0;
 604
 605        while (nr_segs > 0) {
 606                void __user *base;
 607                size_t len;
 608                ssize_t nr;
 609
 610                base = vector->iov_base;
 611                len = vector->iov_len;
 612                vector++;
 613                nr_segs--;
 614
 615                nr = fn(filp, base, len, ppos);
 616
 617                if (nr < 0) {
 618                        if (!ret)
 619                                ret = nr;
 620                        break;
 621                }
 622                ret += nr;
 623                if (nr != len)
 624                        break;
 625        }
 626
 627        return ret;
 628}
 629
 630/* A write operation does a read from user space and vice versa */
 631#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 632
 633ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 634                              unsigned long nr_segs, unsigned long fast_segs,
 635                              struct iovec *fast_pointer,
 636                              struct iovec **ret_pointer,
 637                              int check_access)
 638{
 639        unsigned long seg;
 640        ssize_t ret;
 641        struct iovec *iov = fast_pointer;
 642
 643        /*
 644         * SuS says "The readv() function *may* fail if the iovcnt argument
 645         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 646         * traditionally returned zero for zero segments, so...
 647         */
 648        if (nr_segs == 0) {
 649                ret = 0;
 650                goto out;
 651        }
 652
 653        /*
 654         * First get the "struct iovec" from user memory and
 655         * verify all the pointers
 656         */
 657        if (nr_segs > UIO_MAXIOV) {
 658                ret = -EINVAL;
 659                goto out;
 660        }
 661        if (nr_segs > fast_segs) {
 662                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 663                if (iov == NULL) {
 664                        ret = -ENOMEM;
 665                        goto out;
 666                }
 667        }
 668        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 669                ret = -EFAULT;
 670                goto out;
 671        }
 672
 673        /*
 674         * According to the Single Unix Specification we should return EINVAL
 675         * if an element length is < 0 when cast to ssize_t or if the
 676         * total length would overflow the ssize_t return value of the
 677         * system call.
 678         *
 679         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 680         * overflow case.
 681         */
 682        ret = 0;
 683        for (seg = 0; seg < nr_segs; seg++) {
 684                void __user *buf = iov[seg].iov_base;
 685                ssize_t len = (ssize_t)iov[seg].iov_len;
 686
 687                /* see if we we're about to use an invalid len or if
 688                 * it's about to overflow ssize_t */
 689                if (len < 0) {
 690                        ret = -EINVAL;
 691                        goto out;
 692                }
 693                if (check_access
 694                    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 695                        ret = -EFAULT;
 696                        goto out;
 697                }
 698                if (len > MAX_RW_COUNT - ret) {
 699                        len = MAX_RW_COUNT - ret;
 700                        iov[seg].iov_len = len;
 701                }
 702                ret += len;
 703        }
 704out:
 705        *ret_pointer = iov;
 706        return ret;
 707}
 708
 709static ssize_t do_readv_writev(int type, struct file *file,
 710                               const struct iovec __user * uvector,
 711                               unsigned long nr_segs, loff_t *pos)
 712{
 713        size_t tot_len;
 714        struct iovec iovstack[UIO_FASTIOV];
 715        struct iovec *iov = iovstack;
 716        ssize_t ret;
 717        io_fn_t fn;
 718        iov_fn_t fnv;
 719
 720        if (!file->f_op) {
 721                ret = -EINVAL;
 722                goto out;
 723        }
 724
 725        ret = rw_copy_check_uvector(type, uvector, nr_segs,
 726                                    ARRAY_SIZE(iovstack), iovstack, &iov, 1);
 727        if (ret <= 0)
 728                goto out;
 729
 730        tot_len = ret;
 731        ret = rw_verify_area(type, file, pos, tot_len);
 732        if (ret < 0)
 733                goto out;
 734
 735        fnv = NULL;
 736        if (type == READ) {
 737                fn = file->f_op->read;
 738                fnv = file->f_op->aio_read;
 739        } else {
 740                fn = (io_fn_t)file->f_op->write;
 741                fnv = file->f_op->aio_write;
 742        }
 743
 744        if (fnv)
 745                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
 746                                                pos, fnv);
 747        else
 748                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 749
 750out:
 751        if (iov != iovstack)
 752                kfree(iov);
 753        if ((ret + (type == READ)) > 0) {
 754                if (type == READ)
 755                        fsnotify_access(file);
 756                else
 757                        fsnotify_modify(file);
 758        }
 759        return ret;
 760}
 761
 762ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 763                  unsigned long vlen, loff_t *pos)
 764{
 765        if (!(file->f_mode & FMODE_READ))
 766                return -EBADF;
 767        if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
 768                return -EINVAL;
 769
 770        return do_readv_writev(READ, file, vec, vlen, pos);
 771}
 772
 773EXPORT_SYMBOL(vfs_readv);
 774
 775ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 776                   unsigned long vlen, loff_t *pos)
 777{
 778        if (!(file->f_mode & FMODE_WRITE))
 779                return -EBADF;
 780        if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
 781                return -EINVAL;
 782
 783        return do_readv_writev(WRITE, file, vec, vlen, pos);
 784}
 785
 786EXPORT_SYMBOL(vfs_writev);
 787
 788SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 789                unsigned long, vlen)
 790{
 791        struct file *file;
 792        ssize_t ret = -EBADF;
 793        int fput_needed;
 794
 795        file = fget_light(fd, &fput_needed);
 796        if (file) {
 797                loff_t pos = file_pos_read(file);
 798                ret = vfs_readv(file, vec, vlen, &pos);
 799                file_pos_write(file, pos);
 800                fput_light(file, fput_needed);
 801        }
 802
 803        if (ret > 0)
 804                add_rchar(current, ret);
 805        inc_syscr(current);
 806        return ret;
 807}
 808
 809SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 810                unsigned long, vlen)
 811{
 812        struct file *file;
 813        ssize_t ret = -EBADF;
 814        int fput_needed;
 815
 816        file = fget_light(fd, &fput_needed);
 817        if (file) {
 818                loff_t pos = file_pos_read(file);
 819                ret = vfs_writev(file, vec, vlen, &pos);
 820                file_pos_write(file, pos);
 821                fput_light(file, fput_needed);
 822        }
 823
 824        if (ret > 0)
 825                add_wchar(current, ret);
 826        inc_syscw(current);
 827        return ret;
 828}
 829
 830static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
 831{
 832#define HALF_LONG_BITS (BITS_PER_LONG / 2)
 833        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
 834}
 835
 836SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 837                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 838{
 839        loff_t pos = pos_from_hilo(pos_h, pos_l);
 840        struct file *file;
 841        ssize_t ret = -EBADF;
 842        int fput_needed;
 843
 844        if (pos < 0)
 845                return -EINVAL;
 846
 847        file = fget_light(fd, &fput_needed);
 848        if (file) {
 849                ret = -ESPIPE;
 850                if (file->f_mode & FMODE_PREAD)
 851                        ret = vfs_readv(file, vec, vlen, &pos);
 852                fput_light(file, fput_needed);
 853        }
 854
 855        if (ret > 0)
 856                add_rchar(current, ret);
 857        inc_syscr(current);
 858        return ret;
 859}
 860
 861SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 862                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 863{
 864        loff_t pos = pos_from_hilo(pos_h, pos_l);
 865        struct file *file;
 866        ssize_t ret = -EBADF;
 867        int fput_needed;
 868
 869        if (pos < 0)
 870                return -EINVAL;
 871
 872        file = fget_light(fd, &fput_needed);
 873        if (file) {
 874                ret = -ESPIPE;
 875                if (file->f_mode & FMODE_PWRITE)
 876                        ret = vfs_writev(file, vec, vlen, &pos);
 877                fput_light(file, fput_needed);
 878        }
 879
 880        if (ret > 0)
 881                add_wchar(current, ret);
 882        inc_syscw(current);
 883        return ret;
 884}
 885
 886static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 887                           size_t count, loff_t max)
 888{
 889        struct file * in_file, * out_file;
 890        struct inode * in_inode, * out_inode;
 891        loff_t pos;
 892        ssize_t retval;
 893        int fput_needed_in, fput_needed_out, fl;
 894
 895        /*
 896         * Get input file, and verify that it is ok..
 897         */
 898        retval = -EBADF;
 899        in_file = fget_light(in_fd, &fput_needed_in);
 900        if (!in_file)
 901                goto out;
 902        if (!(in_file->f_mode & FMODE_READ))
 903                goto fput_in;
 904        retval = -ESPIPE;
 905        if (!ppos)
 906                ppos = &in_file->f_pos;
 907        else
 908                if (!(in_file->f_mode & FMODE_PREAD))
 909                        goto fput_in;
 910        retval = rw_verify_area(READ, in_file, ppos, count);
 911        if (retval < 0)
 912                goto fput_in;
 913        count = retval;
 914
 915        /*
 916         * Get output file, and verify that it is ok..
 917         */
 918        retval = -EBADF;
 919        out_file = fget_light(out_fd, &fput_needed_out);
 920        if (!out_file)
 921                goto fput_in;
 922        if (!(out_file->f_mode & FMODE_WRITE))
 923                goto fput_out;
 924        retval = -EINVAL;
 925        in_inode = in_file->f_path.dentry->d_inode;
 926        out_inode = out_file->f_path.dentry->d_inode;
 927        retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
 928        if (retval < 0)
 929                goto fput_out;
 930        count = retval;
 931
 932        if (!max)
 933                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
 934
 935        pos = *ppos;
 936        if (unlikely(pos + count > max)) {
 937                retval = -EOVERFLOW;
 938                if (pos >= max)
 939                        goto fput_out;
 940                count = max - pos;
 941        }
 942
 943        fl = 0;
 944#if 0
 945        /*
 946         * We need to debate whether we can enable this or not. The
 947         * man page documents EAGAIN return for the output at least,
 948         * and the application is arguably buggy if it doesn't expect
 949         * EAGAIN on a non-blocking file descriptor.
 950         */
 951        if (in_file->f_flags & O_NONBLOCK)
 952                fl = SPLICE_F_NONBLOCK;
 953#endif
 954        retval = do_splice_direct(in_file, ppos, out_file, count, fl);
 955
 956        if (retval > 0) {
 957                add_rchar(current, retval);
 958                add_wchar(current, retval);
 959        }
 960
 961        inc_syscr(current);
 962        inc_syscw(current);
 963        if (*ppos > max)
 964                retval = -EOVERFLOW;
 965
 966fput_out:
 967        fput_light(out_file, fput_needed_out);
 968fput_in:
 969        fput_light(in_file, fput_needed_in);
 970out:
 971        return retval;
 972}
 973
 974SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
 975{
 976        loff_t pos;
 977        off_t off;
 978        ssize_t ret;
 979
 980        if (offset) {
 981                if (unlikely(get_user(off, offset)))
 982                        return -EFAULT;
 983                pos = off;
 984                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
 985                if (unlikely(put_user(pos, offset)))
 986                        return -EFAULT;
 987                return ret;
 988        }
 989
 990        return do_sendfile(out_fd, in_fd, NULL, count, 0);
 991}
 992
 993SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
 994{
 995        loff_t pos;
 996        ssize_t ret;
 997
 998        if (offset) {
 999                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1000                        return -EFAULT;
1001                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1002                if (unlikely(put_user(pos, offset)))
1003                        return -EFAULT;
1004                return ret;
1005        }
1006
1007        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1008}
1009