linux/fs/read_write.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/read_write.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7#include <linux/slab.h> 
   8#include <linux/stat.h>
   9#include <linux/fcntl.h>
  10#include <linux/file.h>
  11#include <linux/uio.h>
  12#include <linux/aio.h>
  13#include <linux/fsnotify.h>
  14#include <linux/security.h>
  15#include <linux/export.h>
  16#include <linux/syscalls.h>
  17#include <linux/pagemap.h>
  18#include <linux/splice.h>
  19#include <linux/compat.h>
  20#include "internal.h"
  21
  22#include <asm/uaccess.h>
  23#include <asm/unistd.h>
  24
/*
 * Function-pointer shapes used by the vectored read/write helpers in this
 * file to hold whichever file operation is going to perform the I/O:
 *   io_fn_t   - (file, user buffer, length, position pointer)
 *   iov_fn_t  - (kiocb, iovec array, segment count, position)
 *   iter_fn_t - (kiocb, iov_iter)
 */
typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
                unsigned long, loff_t);
typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
  29
/*
 * File operations table that wires up the generic seek, read, read-only
 * mmap and splice-read helpers.  No write-side method is set.
 */
const struct file_operations generic_ro_fops = {
        .llseek         = generic_file_llseek,
        .read           = new_sync_read,
        .read_iter      = generic_file_read_iter,
        .mmap           = generic_file_readonly_mmap,
        .splice_read    = generic_file_splice_read,
};

EXPORT_SYMBOL(generic_ro_fops);
  39
/*
 * Returns non-zero when FMODE_UNSIGNED_OFFSET is set in @file's f_mode.
 * Callers in this file use it to decide whether a negative loff_t value
 * is acceptable as a file position.
 */
static inline int unsigned_offsets(struct file *file)
{
        return file->f_mode & FMODE_UNSIGNED_OFFSET;
}
  44
  45/**
  46 * vfs_setpos - update the file offset for lseek
  47 * @file:       file structure in question
  48 * @offset:     file offset to seek to
  49 * @maxsize:    maximum file size
  50 *
  51 * This is a low-level filesystem helper for updating the file offset to
  52 * the value specified by @offset if the given offset is valid and it is
  53 * not equal to the current file offset.
  54 *
  55 * Return the specified offset on success and -EINVAL on invalid offset.
  56 */
  57loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  58{
  59        if (offset < 0 && !unsigned_offsets(file))
  60                return -EINVAL;
  61        if (offset > maxsize)
  62                return -EINVAL;
  63
  64        if (offset != file->f_pos) {
  65                file->f_pos = offset;
  66                file->f_version = 0;
  67        }
  68        return offset;
  69}
  70EXPORT_SYMBOL(vfs_setpos);
  71
/**
 * generic_file_llseek_size - generic llseek implementation for regular files
 * @file:       file structure to seek on
 * @offset:     file offset to seek to
 * @whence:     type of seek
 * @maxsize:    max size of this file in file system
 * @eof:        offset used for SEEK_END position
 *
 * This is a variant of generic_file_llseek that allows passing in a custom
 * maximum file size and a custom EOF position, e.g. for hashed directories.
 *
 * Any @whence value without its own case below (SEEK_SET included) falls
 * out of the switch with @offset untouched and is handed to vfs_setpos().
 *
 * Synchronization:
 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
 * read/writes behave like SEEK_SET against seeks.
 *
 * Return: the resulting offset, -ENXIO for SEEK_DATA/SEEK_HOLE at or past
 * @eof, or whatever error vfs_setpos() reports.
 */
loff_t
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
                loff_t maxsize, loff_t eof)
{
        switch (whence) {
        case SEEK_END:
                /* Make @offset absolute, relative to the caller's EOF. */
                offset += eof;
                break;
        case SEEK_CUR:
                /*
                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
                 * position-querying operation.  Avoid rewriting the "same"
                 * f_pos value back to the file because a concurrent read(),
                 * write() or lseek() might have altered it
                 */
                if (offset == 0)
                        return file->f_pos;
                /*
                 * f_lock protects against read/modify/write race with other
                 * SEEK_CURs. Note that parallel writes and reads behave
                 * like SEEK_SET.
                 */
                spin_lock(&file->f_lock);
                offset = vfs_setpos(file, file->f_pos + offset, maxsize);
                spin_unlock(&file->f_lock);
                return offset;
        case SEEK_DATA:
                /*
                 * In the generic case the entire file is data, so as long as
                 * offset isn't at the end of the file then the offset is data.
                 */
                if (offset >= eof)
                        return -ENXIO;
                break;
        case SEEK_HOLE:
                /*
                 * There is a virtual hole at the end of the file, so as long as
                 * offset isn't i_size or larger, return i_size.
                 */
                if (offset >= eof)
                        return -ENXIO;
                offset = eof;
                break;
        }

        return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);
 136
 137/**
 138 * generic_file_llseek - generic llseek implementation for regular files
 139 * @file:       file structure to seek on
 140 * @offset:     file offset to seek to
 141 * @whence:     type of seek
 142 *
 143 * This is a generic implemenation of ->llseek useable for all normal local
 144 * filesystems.  It just updates the file offset to the value specified by
 145 * @offset and @whence.
 146 */
 147loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 148{
 149        struct inode *inode = file->f_mapping->host;
 150
 151        return generic_file_llseek_size(file, offset, whence,
 152                                        inode->i_sb->s_maxbytes,
 153                                        i_size_read(inode));
 154}
 155EXPORT_SYMBOL(generic_file_llseek);
 156
 157/**
 158 * fixed_size_llseek - llseek implementation for fixed-sized devices
 159 * @file:       file structure to seek on
 160 * @offset:     file offset to seek to
 161 * @whence:     type of seek
 162 * @size:       size of the file
 163 *
 164 */
 165loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 166{
 167        switch (whence) {
 168        case SEEK_SET: case SEEK_CUR: case SEEK_END:
 169                return generic_file_llseek_size(file, offset, whence,
 170                                                size, size);
 171        default:
 172                return -EINVAL;
 173        }
 174}
 175EXPORT_SYMBOL(fixed_size_llseek);
 176
/**
 * noop_llseek - No Operation Performed llseek implementation
 * @file:       file structure to seek on
 * @offset:     file offset to seek to (ignored)
 * @whence:     type of seek (ignored)
 *
 * This is an implementation of ->llseek usable for the rare special case when
 * userspace expects the seek to succeed but the (device) file is actually not
 * able to perform the seek. In this case you use noop_llseek() instead of
 * falling back to the default implementation of ->llseek.
 *
 * Return: the current, unmodified value of @file->f_pos.
 */
loff_t noop_llseek(struct file *file, loff_t offset, int whence)
{
        return file->f_pos;
}
EXPORT_SYMBOL(noop_llseek);
 193
/*
 * llseek implementation that refuses every seek: all three arguments are
 * ignored and -ESPIPE is returned.  vfs_llseek() in this file starts from
 * this function and only replaces it when the file allows seeking.
 */
loff_t no_llseek(struct file *file, loff_t offset, int whence)
{
        return -ESPIPE;
}
EXPORT_SYMBOL(no_llseek);
 199
 200loff_t default_llseek(struct file *file, loff_t offset, int whence)
 201{
 202        struct inode *inode = file_inode(file);
 203        loff_t retval;
 204
 205        mutex_lock(&inode->i_mutex);
 206        switch (whence) {
 207                case SEEK_END:
 208                        offset += i_size_read(inode);
 209                        break;
 210                case SEEK_CUR:
 211                        if (offset == 0) {
 212                                retval = file->f_pos;
 213                                goto out;
 214                        }
 215                        offset += file->f_pos;
 216                        break;
 217                case SEEK_DATA:
 218                        /*
 219                         * In the generic case the entire file is data, so as
 220                         * long as offset isn't at the end of the file then the
 221                         * offset is data.
 222                         */
 223                        if (offset >= inode->i_size) {
 224                                retval = -ENXIO;
 225                                goto out;
 226                        }
 227                        break;
 228                case SEEK_HOLE:
 229                        /*
 230                         * There is a virtual hole at the end of the file, so
 231                         * as long as offset isn't i_size or larger, return
 232                         * i_size.
 233                         */
 234                        if (offset >= inode->i_size) {
 235                                retval = -ENXIO;
 236                                goto out;
 237                        }
 238                        offset = inode->i_size;
 239                        break;
 240        }
 241        retval = -EINVAL;
 242        if (offset >= 0 || unsigned_offsets(file)) {
 243                if (offset != file->f_pos) {
 244                        file->f_pos = offset;
 245                        file->f_version = 0;
 246                }
 247                retval = offset;
 248        }
 249out:
 250        mutex_unlock(&inode->i_mutex);
 251        return retval;
 252}
 253EXPORT_SYMBOL(default_llseek);
 254
 255loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 256{
 257        loff_t (*fn)(struct file *, loff_t, int);
 258
 259        fn = no_llseek;
 260        if (file->f_mode & FMODE_LSEEK) {
 261                if (file->f_op->llseek)
 262                        fn = file->f_op->llseek;
 263        }
 264        return fn(file, offset, whence);
 265}
 266EXPORT_SYMBOL(vfs_llseek);
 267
/*
 * Look up descriptor @fd through __fdget_pos() and convert the result to a
 * struct fd with __to_fd().  Callers in this file test the returned .file
 * for NULL and release the reference with fdput_pos().
 */
static inline struct fd fdget_pos(int fd)
{
        return __to_fd(__fdget_pos(fd));
}
 272
/*
 * Release a struct fd obtained from fdget_pos().  When FDPUT_POS_UNLOCK is
 * set in @f.flags the file's f_pos_lock mutex is unlocked first; the
 * reference itself is then dropped with fdput().
 */
static inline void fdput_pos(struct fd f)
{
        /* Unlock before fdput(): f.file is still dereferenced here. */
        if (f.flags & FDPUT_POS_UNLOCK)
                mutex_unlock(&f.file->f_pos_lock);
        fdput(f);
}
 279
 280SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 281{
 282        off_t retval;
 283        struct fd f = fdget_pos(fd);
 284        if (!f.file)
 285                return -EBADF;
 286
 287        retval = -EINVAL;
 288        if (whence <= SEEK_MAX) {
 289                loff_t res = vfs_llseek(f.file, offset, whence);
 290                retval = res;
 291                if (res != (loff_t)retval)
 292                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
 293        }
 294        fdput_pos(f);
 295        return retval;
 296}
 297
#ifdef CONFIG_COMPAT
/*
 * Compat entry point for lseek: takes the offset as compat_off_t and
 * forwards all three arguments unchanged to sys_lseek().
 */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
        return sys_lseek(fd, offset, whence);
}
#endif
 304
 305#ifdef __ARCH_WANT_SYS_LLSEEK
 306SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 307                unsigned long, offset_low, loff_t __user *, result,
 308                unsigned int, whence)
 309{
 310        int retval;
 311        struct fd f = fdget_pos(fd);
 312        loff_t offset;
 313
 314        if (!f.file)
 315                return -EBADF;
 316
 317        retval = -EINVAL;
 318        if (whence > SEEK_MAX)
 319                goto out_putf;
 320
 321        offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 322                        whence);
 323
 324        retval = (int)offset;
 325        if (offset >= 0) {
 326                retval = -EFAULT;
 327                if (!copy_to_user(result, &offset, sizeof(offset)))
 328                        retval = 0;
 329        }
 330out_putf:
 331        fdput_pos(f);
 332        return retval;
 333}
 334#endif
 335
/*
 * rw_verify_area doesn't like huge counts. We limit
 * them to something that fits in "int" so that others
 * won't have to do range checks all the time.
 *
 * @read_write: READ or WRITE; selects the lock-verification mode and the
 *              MAY_READ/MAY_WRITE permission that is checked.
 * @file:       file the I/O is aimed at.
 * @ppos:       pointer to the starting position (only read, never written).
 * @count:      number of bytes requested.
 *
 * Returns the byte count to use, capped at MAX_RW_COUNT, or a negative
 * errno: -EINVAL for a count that is negative as ssize_t or a position
 * range that is not acceptable for this file, -EOVERFLOW for a negative
 * position whose range would reach zero, or the error reported by the
 * mandatory-lock or security checks.
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
        struct inode *inode;
        loff_t pos;
        int retval = -EINVAL;

        inode = file_inode(file);
        if (unlikely((ssize_t) count < 0))
                return retval;
        pos = *ppos;
        if (unlikely(pos < 0)) {
                /* Negative positions are only valid with unsigned offsets. */
                if (!unsigned_offsets(file))
                        return retval;
                if (count >= -pos) /* both values are in 0..LLONG_MAX */
                        return -EOVERFLOW;
        } else if (unlikely((loff_t) (pos + count) < 0)) {
                /* pos + count no longer fits in a non-negative loff_t. */
                if (!unsigned_offsets(file))
                        return retval;
        }

        /* Only consult mandatory locking when the inode has locks at all. */
        if (unlikely(inode->i_flock && mandatory_lock(inode))) {
                retval = locks_mandatory_area(
                        read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
                        inode, file, pos, count);
                if (retval < 0)
                        return retval;
        }
        retval = security_file_permission(file,
                                read_write == READ ? MAY_READ : MAY_WRITE);
        if (retval)
                return retval;
        return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
}
 374
 375ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 376{
 377        struct iovec iov = { .iov_base = buf, .iov_len = len };
 378        struct kiocb kiocb;
 379        ssize_t ret;
 380
 381        init_sync_kiocb(&kiocb, filp);
 382        kiocb.ki_pos = *ppos;
 383        kiocb.ki_nbytes = len;
 384
 385        ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
 386        if (-EIOCBQUEUED == ret)
 387                ret = wait_on_sync_kiocb(&kiocb);
 388        *ppos = kiocb.ki_pos;
 389        return ret;
 390}
 391
 392EXPORT_SYMBOL(do_sync_read);
 393
 394ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 395{
 396        struct iovec iov = { .iov_base = buf, .iov_len = len };
 397        struct kiocb kiocb;
 398        struct iov_iter iter;
 399        ssize_t ret;
 400
 401        init_sync_kiocb(&kiocb, filp);
 402        kiocb.ki_pos = *ppos;
 403        kiocb.ki_nbytes = len;
 404        iov_iter_init(&iter, READ, &iov, 1, len);
 405
 406        ret = filp->f_op->read_iter(&kiocb, &iter);
 407        if (-EIOCBQUEUED == ret)
 408                ret = wait_on_sync_kiocb(&kiocb);
 409        *ppos = kiocb.ki_pos;
 410        return ret;
 411}
 412
 413EXPORT_SYMBOL(new_sync_read);
 414
 415ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 416{
 417        ssize_t ret;
 418
 419        if (!(file->f_mode & FMODE_READ))
 420                return -EBADF;
 421        if (!(file->f_mode & FMODE_CAN_READ))
 422                return -EINVAL;
 423        if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 424                return -EFAULT;
 425
 426        ret = rw_verify_area(READ, file, pos, count);
 427        if (ret >= 0) {
 428                count = ret;
 429                if (file->f_op->read)
 430                        ret = file->f_op->read(file, buf, count, pos);
 431                else if (file->f_op->aio_read)
 432                        ret = do_sync_read(file, buf, count, pos);
 433                else
 434                        ret = new_sync_read(file, buf, count, pos);
 435                if (ret > 0) {
 436                        fsnotify_access(file);
 437                        add_rchar(current, ret);
 438                }
 439                inc_syscr(current);
 440        }
 441
 442        return ret;
 443}
 444
 445EXPORT_SYMBOL(vfs_read);
 446
 447ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 448{
 449        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 450        struct kiocb kiocb;
 451        ssize_t ret;
 452
 453        init_sync_kiocb(&kiocb, filp);
 454        kiocb.ki_pos = *ppos;
 455        kiocb.ki_nbytes = len;
 456
 457        ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
 458        if (-EIOCBQUEUED == ret)
 459                ret = wait_on_sync_kiocb(&kiocb);
 460        *ppos = kiocb.ki_pos;
 461        return ret;
 462}
 463
 464EXPORT_SYMBOL(do_sync_write);
 465
 466ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 467{
 468        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 469        struct kiocb kiocb;
 470        struct iov_iter iter;
 471        ssize_t ret;
 472
 473        init_sync_kiocb(&kiocb, filp);
 474        kiocb.ki_pos = *ppos;
 475        kiocb.ki_nbytes = len;
 476        iov_iter_init(&iter, WRITE, &iov, 1, len);
 477
 478        ret = filp->f_op->write_iter(&kiocb, &iter);
 479        if (-EIOCBQUEUED == ret)
 480                ret = wait_on_sync_kiocb(&kiocb);
 481        *ppos = kiocb.ki_pos;
 482        return ret;
 483}
 484
 485EXPORT_SYMBOL(new_sync_write);
 486
/*
 * __kernel_write - write @count bytes from the kernel buffer @buf to @file.
 *
 * Returns -EINVAL when FMODE_CAN_WRITE is not set.  Otherwise @count is
 * capped at MAX_RW_COUNT and the data is passed to ->write, else to
 * ->aio_write via do_sync_write(), else to new_sync_write(); that result
 * is returned.  Unlike vfs_write() in this file, this path does not check
 * FMODE_WRITE and calls neither rw_verify_area() nor file_start_write().
 */
ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
{
        mm_segment_t old_fs;
        const char __user *p;
        ssize_t ret;

        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;

        /*
         * Switch the address limit for the duration of the write and
         * restore it afterwards.  NOTE(review): presumably this is what
         * lets the write methods accept a kernel pointer cast to __user.
         */
        old_fs = get_fs();
        set_fs(get_ds());
        p = (__force const char __user *)buf;
        if (count > MAX_RW_COUNT)
                count =  MAX_RW_COUNT;
        if (file->f_op->write)
                ret = file->f_op->write(file, p, count, pos);
        else if (file->f_op->aio_write)
                ret = do_sync_write(file, p, count, pos);
        else
                ret = new_sync_write(file, p, count, pos);
        set_fs(old_fs);
        if (ret > 0) {
                fsnotify_modify(file);
                add_wchar(current, ret);
        }
        /* The write syscall counter is bumped regardless of the outcome. */
        inc_syscw(current);
        return ret;
}
 515
 516ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 517{
 518        ssize_t ret;
 519
 520        if (!(file->f_mode & FMODE_WRITE))
 521                return -EBADF;
 522        if (!(file->f_mode & FMODE_CAN_WRITE))
 523                return -EINVAL;
 524        if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 525                return -EFAULT;
 526
 527        ret = rw_verify_area(WRITE, file, pos, count);
 528        if (ret >= 0) {
 529                count = ret;
 530                file_start_write(file);
 531                if (file->f_op->write)
 532                        ret = file->f_op->write(file, buf, count, pos);
 533                else if (file->f_op->aio_write)
 534                        ret = do_sync_write(file, buf, count, pos);
 535                else
 536                        ret = new_sync_write(file, buf, count, pos);
 537                if (ret > 0) {
 538                        fsnotify_modify(file);
 539                        add_wchar(current, ret);
 540                }
 541                inc_syscw(current);
 542                file_end_write(file);
 543        }
 544
 545        return ret;
 546}
 547
 548EXPORT_SYMBOL(vfs_write);
 549
/* Return the current position (f_pos) stored in @file. */
static inline loff_t file_pos_read(struct file *file)
{
        return file->f_pos;
}
 554
/* Store @pos as the current position (f_pos) of @file. */
static inline void file_pos_write(struct file *file, loff_t pos)
{
        file->f_pos = pos;
}
 559
 560SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 561{
 562        struct fd f = fdget_pos(fd);
 563        ssize_t ret = -EBADF;
 564
 565        if (f.file) {
 566                loff_t pos = file_pos_read(f.file);
 567                ret = vfs_read(f.file, buf, count, &pos);
 568                if (ret >= 0)
 569                        file_pos_write(f.file, pos);
 570                fdput_pos(f);
 571        }
 572        return ret;
 573}
 574
 575SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 576                size_t, count)
 577{
 578        struct fd f = fdget_pos(fd);
 579        ssize_t ret = -EBADF;
 580
 581        if (f.file) {
 582                loff_t pos = file_pos_read(f.file);
 583                ret = vfs_write(f.file, buf, count, &pos);
 584                if (ret >= 0)
 585                        file_pos_write(f.file, pos);
 586                fdput_pos(f);
 587        }
 588
 589        return ret;
 590}
 591
 592SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 593                        size_t, count, loff_t, pos)
 594{
 595        struct fd f;
 596        ssize_t ret = -EBADF;
 597
 598        if (pos < 0)
 599                return -EINVAL;
 600
 601        f = fdget(fd);
 602        if (f.file) {
 603                ret = -ESPIPE;
 604                if (f.file->f_mode & FMODE_PREAD)
 605                        ret = vfs_read(f.file, buf, count, &pos);
 606                fdput(f);
 607        }
 608
 609        return ret;
 610}
 611
 612SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 613                         size_t, count, loff_t, pos)
 614{
 615        struct fd f;
 616        ssize_t ret = -EBADF;
 617
 618        if (pos < 0)
 619                return -EINVAL;
 620
 621        f = fdget(fd);
 622        if (f.file) {
 623                ret = -ESPIPE;
 624                if (f.file->f_mode & FMODE_PWRITE)  
 625                        ret = vfs_write(f.file, buf, count, &pos);
 626                fdput(f);
 627        }
 628
 629        return ret;
 630}
 631
 632/*
 633 * Reduce an iovec's length in-place.  Return the resulting number of segments
 634 */
 635unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 636{
 637        unsigned long seg = 0;
 638        size_t len = 0;
 639
 640        while (seg < nr_segs) {
 641                seg++;
 642                if (len + iov->iov_len >= to) {
 643                        iov->iov_len = to - len;
 644                        break;
 645                }
 646                len += iov->iov_len;
 647                iov++;
 648        }
 649        return seg;
 650}
 651EXPORT_SYMBOL(iov_shorten);
 652
 653static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov,
 654                unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn)
 655{
 656        struct kiocb kiocb;
 657        struct iov_iter iter;
 658        ssize_t ret;
 659
 660        init_sync_kiocb(&kiocb, filp);
 661        kiocb.ki_pos = *ppos;
 662        kiocb.ki_nbytes = len;
 663
 664        iov_iter_init(&iter, rw, iov, nr_segs, len);
 665        ret = fn(&kiocb, &iter);
 666        if (ret == -EIOCBQUEUED)
 667                ret = wait_on_sync_kiocb(&kiocb);
 668        *ppos = kiocb.ki_pos;
 669        return ret;
 670}
 671
 672static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 673                unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
 674{
 675        struct kiocb kiocb;
 676        ssize_t ret;
 677
 678        init_sync_kiocb(&kiocb, filp);
 679        kiocb.ki_pos = *ppos;
 680        kiocb.ki_nbytes = len;
 681
 682        ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
 683        if (ret == -EIOCBQUEUED)
 684                ret = wait_on_sync_kiocb(&kiocb);
 685        *ppos = kiocb.ki_pos;
 686        return ret;
 687}
 688
 689/* Do it by hand, with file-ops */
 690static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
 691                unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
 692{
 693        struct iovec *vector = iov;
 694        ssize_t ret = 0;
 695
 696        while (nr_segs > 0) {
 697                void __user *base;
 698                size_t len;
 699                ssize_t nr;
 700
 701                base = vector->iov_base;
 702                len = vector->iov_len;
 703                vector++;
 704                nr_segs--;
 705
 706                nr = fn(filp, base, len, ppos);
 707
 708                if (nr < 0) {
 709                        if (!ret)
 710                                ret = nr;
 711                        break;
 712                }
 713                ret += nr;
 714                if (nr != len)
 715                        break;
 716        }
 717
 718        return ret;
 719}
 720
/*
 * A write operation does a read from user space and vice versa, so the
 * access_ok() direction is the opposite of the I/O direction: READ maps to
 * VERIFY_WRITE, anything else to VERIFY_READ.
 */
#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 723
/*
 * rw_copy_check_uvector - copy an iovec array from user space and validate it.
 * @type:         READ or WRITE; a negative value skips the access_ok() check
 *                on each segment's buffer.
 * @uvector:      user-space iovec array.
 * @nr_segs:      number of entries in @uvector.
 * @fast_segs:    capacity of @fast_pointer.
 * @fast_pointer: caller-provided array used when @nr_segs fits in it.
 * @ret_pointer:  receives the array actually used.
 *
 * Returns the total length of all segments (capped at MAX_RW_COUNT, with
 * the segment that crosses the cap trimmed and later ones set to zero
 * length), 0 for zero segments, or -EINVAL / -ENOMEM / -EFAULT.
 *
 * *@ret_pointer is set on every path, including errors.  When it differs
 * from @fast_pointer it was obtained from kmalloc(); do_readv_writev() in
 * this file shows the expected handling (kfree() when the pointers differ).
 */
ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
                              unsigned long nr_segs, unsigned long fast_segs,
                              struct iovec *fast_pointer,
                              struct iovec **ret_pointer)
{
        unsigned long seg;
        ssize_t ret;
        struct iovec *iov = fast_pointer;

        /*
         * SuS says "The readv() function *may* fail if the iovcnt argument
         * was less than or equal to 0, or greater than {IOV_MAX}."  Linux has
         * traditionally returned zero for zero segments, so...
         */
        if (nr_segs == 0) {
                ret = 0;
                goto out;
        }

        /*
         * First get the "struct iovec" from user memory and
         * verify all the pointers
         */
        if (nr_segs > UIO_MAXIOV) {
                ret = -EINVAL;
                goto out;
        }
        if (nr_segs > fast_segs) {
                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
                if (iov == NULL) {
                        ret = -ENOMEM;
                        goto out;
                }
        }
        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
                ret = -EFAULT;
                goto out;
        }

        /*
         * According to the Single Unix Specification we should return EINVAL
         * if an element length is < 0 when cast to ssize_t or if the
         * total length would overflow the ssize_t return value of the
         * system call.
         *
         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
         * overflow case.
         */
        ret = 0;
        for (seg = 0; seg < nr_segs; seg++) {
                void __user *buf = iov[seg].iov_base;
                ssize_t len = (ssize_t)iov[seg].iov_len;

                /* see if we're about to use an invalid len or if
                 * it's about to overflow ssize_t */
                if (len < 0) {
                        ret = -EINVAL;
                        goto out;
                }
                if (type >= 0
                    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
                        ret = -EFAULT;
                        goto out;
                }
                /* Trim this segment so the running total stops at the cap. */
                if (len > MAX_RW_COUNT - ret) {
                        len = MAX_RW_COUNT - ret;
                        iov[seg].iov_len = len;
                }
                ret += len;
        }
out:
        *ret_pointer = iov;
        return ret;
}
 798
/*
 * do_readv_writev - common body of the vectored read and write paths.
 * @type:    READ or WRITE.
 * @file:    file to transfer to or from.
 * @uvector: user-space iovec array.
 * @nr_segs: number of entries in @uvector.
 * @pos:     position to start at; updated by the method that does the I/O.
 *
 * Copies and validates the iovec array, verifies the area, then performs
 * the transfer through the first available method in this order: the
 * iov_iter method (->read_iter / ->write_iter), the iovec-array method
 * (->aio_read / ->aio_write), or a per-segment loop over ->read / ->write.
 * Writes are bracketed by file_start_write()/file_end_write().
 *
 * Returns the number of bytes transferred, 0 for an empty vector, or a
 * negative errno from validation or from the method.
 */
static ssize_t do_readv_writev(int type, struct file *file,
                               const struct iovec __user * uvector,
                               unsigned long nr_segs, loff_t *pos)
{
        size_t tot_len;
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        ssize_t ret;
        io_fn_t fn;
        iov_fn_t fnv;
        iter_fn_t iter_fn;

        ret = rw_copy_check_uvector(type, uvector, nr_segs,
                                    ARRAY_SIZE(iovstack), iovstack, &iov);
        /* Error or nothing to transfer: skip straight to cleanup. */
        if (ret <= 0)
                goto out;

        tot_len = ret;
        ret = rw_verify_area(type, file, pos, tot_len);
        if (ret < 0)
                goto out;

        fnv = NULL;
        if (type == READ) {
                fn = file->f_op->read;
                fnv = file->f_op->aio_read;
                iter_fn = file->f_op->read_iter;
        } else {
                /*
                 * NOTE(review): the cast presumably reconciles ->write's
                 * const buffer parameter with io_fn_t - confirm against
                 * struct file_operations.
                 */
                fn = (io_fn_t)file->f_op->write;
                fnv = file->f_op->aio_write;
                iter_fn = file->f_op->write_iter;
                file_start_write(file);
        }

        if (iter_fn)
                ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
                                                pos, iter_fn);
        else if (fnv)
                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
                                                pos, fnv);
        else
                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);

        if (type != READ)
                file_end_write(file);

out:
        /* rw_copy_check_uvector() may have replaced iovstack with a heap array. */
        if (iov != iovstack)
                kfree(iov);
        /*
         * Adding (type == READ) shifts the threshold by one: reads notify
         * whenever ret >= 0, writes only when ret > 0.  This also runs for
         * the early exits above.
         */
        if ((ret + (type == READ)) > 0) {
                if (type == READ)
                        fsnotify_access(file);
                else
                        fsnotify_modify(file);
        }
        return ret;
}
 856
/*
 * vfs_readv - vectored read from @file.
 *
 * Returns -EBADF without FMODE_READ and -EINVAL without FMODE_CAN_READ;
 * otherwise the result of do_readv_writev() for a READ of the @vlen
 * segments described by the user-space array @vec, starting at *@pos.
 */
ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
                  unsigned long vlen, loff_t *pos)
{
        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;

        return do_readv_writev(READ, file, vec, vlen, pos);
}

EXPORT_SYMBOL(vfs_readv);
 869
/*
 * vfs_writev - vectored write to @file.
 *
 * Returns -EBADF without FMODE_WRITE and -EINVAL without FMODE_CAN_WRITE;
 * otherwise the result of do_readv_writev() for a WRITE of the @vlen
 * segments described by the user-space array @vec, starting at *@pos.
 */
ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
                   unsigned long vlen, loff_t *pos)
{
        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;

        return do_readv_writev(WRITE, file, vec, vlen, pos);
}

EXPORT_SYMBOL(vfs_writev);
 882
 883SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 884                unsigned long, vlen)
 885{
 886        struct fd f = fdget_pos(fd);
 887        ssize_t ret = -EBADF;
 888
 889        if (f.file) {
 890                loff_t pos = file_pos_read(f.file);
 891                ret = vfs_readv(f.file, vec, vlen, &pos);
 892                if (ret >= 0)
 893                        file_pos_write(f.file, pos);
 894                fdput_pos(f);
 895        }
 896
 897        if (ret > 0)
 898                add_rchar(current, ret);
 899        inc_syscr(current);
 900        return ret;
 901}
 902
 903SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 904                unsigned long, vlen)
 905{
 906        struct fd f = fdget_pos(fd);
 907        ssize_t ret = -EBADF;
 908
 909        if (f.file) {
 910                loff_t pos = file_pos_read(f.file);
 911                ret = vfs_writev(f.file, vec, vlen, &pos);
 912                if (ret >= 0)
 913                        file_pos_write(f.file, pos);
 914                fdput_pos(f);
 915        }
 916
 917        if (ret > 0)
 918                add_wchar(current, ret);
 919        inc_syscw(current);
 920        return ret;
 921}
 922
/*
 * Combine the two unsigned long halves passed to preadv/pwritev into one
 * loff_t: @high shifted left by BITS_PER_LONG in total, OR-ed with @low.
 *
 * The shift is done as two steps of BITS_PER_LONG / 2 each.
 * NOTE(review): presumably so that no single shift count reaches the width
 * of loff_t when BITS_PER_LONG is 64 - confirm.
 */
static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
{
#define HALF_LONG_BITS (BITS_PER_LONG / 2)
        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}
 928
 929SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 930                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 931{
 932        loff_t pos = pos_from_hilo(pos_h, pos_l);
 933        struct fd f;
 934        ssize_t ret = -EBADF;
 935
 936        if (pos < 0)
 937                return -EINVAL;
 938
 939        f = fdget(fd);
 940        if (f.file) {
 941                ret = -ESPIPE;
 942                if (f.file->f_mode & FMODE_PREAD)
 943                        ret = vfs_readv(f.file, vec, vlen, &pos);
 944                fdput(f);
 945        }
 946
 947        if (ret > 0)
 948                add_rchar(current, ret);
 949        inc_syscr(current);
 950        return ret;
 951}
 952
 953SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 954                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 955{
 956        loff_t pos = pos_from_hilo(pos_h, pos_l);
 957        struct fd f;
 958        ssize_t ret = -EBADF;
 959
 960        if (pos < 0)
 961                return -EINVAL;
 962
 963        f = fdget(fd);
 964        if (f.file) {
 965                ret = -ESPIPE;
 966                if (f.file->f_mode & FMODE_PWRITE)
 967                        ret = vfs_writev(f.file, vec, vlen, &pos);
 968                fdput(f);
 969        }
 970
 971        if (ret > 0)
 972                add_wchar(current, ret);
 973        inc_syscw(current);
 974        return ret;
 975}
 976
 977#ifdef CONFIG_COMPAT
 978
 979static ssize_t compat_do_readv_writev(int type, struct file *file,
 980                               const struct compat_iovec __user *uvector,
 981                               unsigned long nr_segs, loff_t *pos)
 982{
 983        compat_ssize_t tot_len;
 984        struct iovec iovstack[UIO_FASTIOV];
 985        struct iovec *iov = iovstack;
 986        ssize_t ret;
 987        io_fn_t fn;
 988        iov_fn_t fnv;
 989        iter_fn_t iter_fn;
 990
 991        ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
 992                                               UIO_FASTIOV, iovstack, &iov);
 993        if (ret <= 0)
 994                goto out;
 995
 996        tot_len = ret;
 997        ret = rw_verify_area(type, file, pos, tot_len);
 998        if (ret < 0)
 999                goto out;
1000
1001        fnv = NULL;
1002        if (type == READ) {
1003                fn = file->f_op->read;
1004                fnv = file->f_op->aio_read;
1005                iter_fn = file->f_op->read_iter;
1006        } else {
1007                fn = (io_fn_t)file->f_op->write;
1008                fnv = file->f_op->aio_write;
1009                iter_fn = file->f_op->write_iter;
1010                file_start_write(file);
1011        }
1012
1013        if (iter_fn)
1014                ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
1015                                                pos, iter_fn);
1016        else if (fnv)
1017                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
1018                                                pos, fnv);
1019        else
1020                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
1021
1022        if (type != READ)
1023                file_end_write(file);
1024
1025out:
1026        if (iov != iovstack)
1027                kfree(iov);
1028        if ((ret + (type == READ)) > 0) {
1029                if (type == READ)
1030                        fsnotify_access(file);
1031                else
1032                        fsnotify_modify(file);
1033        }
1034        return ret;
1035}
1036
1037static size_t compat_readv(struct file *file,
1038                           const struct compat_iovec __user *vec,
1039                           unsigned long vlen, loff_t *pos)
1040{
1041        ssize_t ret = -EBADF;
1042
1043        if (!(file->f_mode & FMODE_READ))
1044                goto out;
1045
1046        ret = -EINVAL;
1047        if (!(file->f_mode & FMODE_CAN_READ))
1048                goto out;
1049
1050        ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
1051
1052out:
1053        if (ret > 0)
1054                add_rchar(current, ret);
1055        inc_syscr(current);
1056        return ret;
1057}
1058
1059COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1060                const struct compat_iovec __user *,vec,
1061                compat_ulong_t, vlen)
1062{
1063        struct fd f = fdget_pos(fd);
1064        ssize_t ret;
1065        loff_t pos;
1066
1067        if (!f.file)
1068                return -EBADF;
1069        pos = f.file->f_pos;
1070        ret = compat_readv(f.file, vec, vlen, &pos);
1071        if (ret >= 0)
1072                f.file->f_pos = pos;
1073        fdput_pos(f);
1074        return ret;
1075}
1076
1077static long __compat_sys_preadv64(unsigned long fd,
1078                                  const struct compat_iovec __user *vec,
1079                                  unsigned long vlen, loff_t pos)
1080{
1081        struct fd f;
1082        ssize_t ret;
1083
1084        if (pos < 0)
1085                return -EINVAL;
1086        f = fdget(fd);
1087        if (!f.file)
1088                return -EBADF;
1089        ret = -ESPIPE;
1090        if (f.file->f_mode & FMODE_PREAD)
1091                ret = compat_readv(f.file, vec, vlen, &pos);
1092        fdput(f);
1093        return ret;
1094}
1095
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/*
 * Compat preadv64: the offset arrives as a single loff_t argument and is
 * forwarded unchanged to __compat_sys_preadv64().
 */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
                const struct compat_iovec __user *, vec,
                unsigned long, vlen, loff_t, pos)
{
        long ret = __compat_sys_preadv64(fd, vec, vlen, pos);

        return ret;
}
#endif
1104
1105COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1106                const struct compat_iovec __user *,vec,
1107                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1108{
1109        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1110
1111        return __compat_sys_preadv64(fd, vec, vlen, pos);
1112}
1113
1114static size_t compat_writev(struct file *file,
1115                            const struct compat_iovec __user *vec,
1116                            unsigned long vlen, loff_t *pos)
1117{
1118        ssize_t ret = -EBADF;
1119
1120        if (!(file->f_mode & FMODE_WRITE))
1121                goto out;
1122
1123        ret = -EINVAL;
1124        if (!(file->f_mode & FMODE_CAN_WRITE))
1125                goto out;
1126
1127        ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1128
1129out:
1130        if (ret > 0)
1131                add_wchar(current, ret);
1132        inc_syscw(current);
1133        return ret;
1134}
1135
1136COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1137                const struct compat_iovec __user *, vec,
1138                compat_ulong_t, vlen)
1139{
1140        struct fd f = fdget_pos(fd);
1141        ssize_t ret;
1142        loff_t pos;
1143
1144        if (!f.file)
1145                return -EBADF;
1146        pos = f.file->f_pos;
1147        ret = compat_writev(f.file, vec, vlen, &pos);
1148        if (ret >= 0)
1149                f.file->f_pos = pos;
1150        fdput_pos(f);
1151        return ret;
1152}
1153
1154static long __compat_sys_pwritev64(unsigned long fd,
1155                                   const struct compat_iovec __user *vec,
1156                                   unsigned long vlen, loff_t pos)
1157{
1158        struct fd f;
1159        ssize_t ret;
1160
1161        if (pos < 0)
1162                return -EINVAL;
1163        f = fdget(fd);
1164        if (!f.file)
1165                return -EBADF;
1166        ret = -ESPIPE;
1167        if (f.file->f_mode & FMODE_PWRITE)
1168                ret = compat_writev(f.file, vec, vlen, &pos);
1169        fdput(f);
1170        return ret;
1171}
1172
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/*
 * Compat pwritev64: the offset arrives as a single loff_t argument and is
 * forwarded unchanged to __compat_sys_pwritev64().
 */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
                const struct compat_iovec __user *, vec,
                unsigned long, vlen, loff_t, pos)
{
        long ret = __compat_sys_pwritev64(fd, vec, vlen, pos);

        return ret;
}
#endif
1181
1182COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1183                const struct compat_iovec __user *,vec,
1184                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1185{
1186        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1187
1188        return __compat_sys_pwritev64(fd, vec, vlen, pos);
1189}
1190#endif
1191
1192static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1193                           size_t count, loff_t max)
1194{
1195        struct fd in, out;
1196        struct inode *in_inode, *out_inode;
1197        loff_t pos;
1198        loff_t out_pos;
1199        ssize_t retval;
1200        int fl;
1201
1202        /*
1203         * Get input file, and verify that it is ok..
1204         */
1205        retval = -EBADF;
1206        in = fdget(in_fd);
1207        if (!in.file)
1208                goto out;
1209        if (!(in.file->f_mode & FMODE_READ))
1210                goto fput_in;
1211        retval = -ESPIPE;
1212        if (!ppos) {
1213                pos = in.file->f_pos;
1214        } else {
1215                pos = *ppos;
1216                if (!(in.file->f_mode & FMODE_PREAD))
1217                        goto fput_in;
1218        }
1219        retval = rw_verify_area(READ, in.file, &pos, count);
1220        if (retval < 0)
1221                goto fput_in;
1222        count = retval;
1223
1224        /*
1225         * Get output file, and verify that it is ok..
1226         */
1227        retval = -EBADF;
1228        out = fdget(out_fd);
1229        if (!out.file)
1230                goto fput_in;
1231        if (!(out.file->f_mode & FMODE_WRITE))
1232                goto fput_out;
1233        retval = -EINVAL;
1234        in_inode = file_inode(in.file);
1235        out_inode = file_inode(out.file);
1236        out_pos = out.file->f_pos;
1237        retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1238        if (retval < 0)
1239                goto fput_out;
1240        count = retval;
1241
1242        if (!max)
1243                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1244
1245        if (unlikely(pos + count > max)) {
1246                retval = -EOVERFLOW;
1247                if (pos >= max)
1248                        goto fput_out;
1249                count = max - pos;
1250        }
1251
1252        fl = 0;
1253#if 0
1254        /*
1255         * We need to debate whether we can enable this or not. The
1256         * man page documents EAGAIN return for the output at least,
1257         * and the application is arguably buggy if it doesn't expect
1258         * EAGAIN on a non-blocking file descriptor.
1259         */
1260        if (in.file->f_flags & O_NONBLOCK)
1261                fl = SPLICE_F_NONBLOCK;
1262#endif
1263        file_start_write(out.file);
1264        retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1265        file_end_write(out.file);
1266
1267        if (retval > 0) {
1268                add_rchar(current, retval);
1269                add_wchar(current, retval);
1270                fsnotify_access(in.file);
1271                fsnotify_modify(out.file);
1272                out.file->f_pos = out_pos;
1273                if (ppos)
1274                        *ppos = pos;
1275                else
1276                        in.file->f_pos = pos;
1277        }
1278
1279        inc_syscr(current);
1280        inc_syscw(current);
1281        if (pos > max)
1282                retval = -EOVERFLOW;
1283
1284fput_out:
1285        fdput(out);
1286fput_in:
1287        fdput(in);
1288out:
1289        return retval;
1290}
1291
1292SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1293{
1294        loff_t pos;
1295        off_t off;
1296        ssize_t ret;
1297
1298        if (offset) {
1299                if (unlikely(get_user(off, offset)))
1300                        return -EFAULT;
1301                pos = off;
1302                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1303                if (unlikely(put_user(pos, offset)))
1304                        return -EFAULT;
1305                return ret;
1306        }
1307
1308        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1309}
1310
1311SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1312{
1313        loff_t pos;
1314        ssize_t ret;
1315
1316        if (offset) {
1317                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1318                        return -EFAULT;
1319                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1320                if (unlikely(put_user(pos, offset)))
1321                        return -EFAULT;
1322                return ret;
1323        }
1324
1325        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1326}
1327
1328#ifdef CONFIG_COMPAT
1329COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1330                compat_off_t __user *, offset, compat_size_t, count)
1331{
1332        loff_t pos;
1333        off_t off;
1334        ssize_t ret;
1335
1336        if (offset) {
1337                if (unlikely(get_user(off, offset)))
1338                        return -EFAULT;
1339                pos = off;
1340                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1341                if (unlikely(put_user(pos, offset)))
1342                        return -EFAULT;
1343                return ret;
1344        }
1345
1346        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1347}
1348
1349COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1350                compat_loff_t __user *, offset, compat_size_t, count)
1351{
1352        loff_t pos;
1353        ssize_t ret;
1354
1355        if (offset) {
1356                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1357                        return -EFAULT;
1358                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1359                if (unlikely(put_user(pos, offset)))
1360                        return -EFAULT;
1361                return ret;
1362        }
1363
1364        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1365}
1366#endif
1367