linux/fs/read_write.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/read_write.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7#include <linux/slab.h> 
   8#include <linux/stat.h>
   9#include <linux/fcntl.h>
  10#include <linux/file.h>
  11#include <linux/uio.h>
  12#include <linux/aio.h>
  13#include <linux/fsnotify.h>
  14#include <linux/security.h>
  15#include <linux/export.h>
  16#include <linux/syscalls.h>
  17#include <linux/pagemap.h>
  18#include <linux/splice.h>
  19#include <linux/compat.h>
  20#include "internal.h"
  21
  22#include <asm/uaccess.h>
  23#include <asm/unistd.h>
  24
  25typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
  26typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
  27                unsigned long, loff_t);
  28typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
  29
  30const struct file_operations generic_ro_fops = {
  31        .llseek         = generic_file_llseek,
  32        .read           = new_sync_read,
  33        .read_iter      = generic_file_read_iter,
  34        .mmap           = generic_file_readonly_mmap,
  35        .splice_read    = generic_file_splice_read,
  36};
  37
  38EXPORT_SYMBOL(generic_ro_fops);
  39
  40static inline int unsigned_offsets(struct file *file)
  41{
  42        return file->f_mode & FMODE_UNSIGNED_OFFSET;
  43}
  44
  45/**
  46 * vfs_setpos - update the file offset for lseek
  47 * @file:       file structure in question
  48 * @offset:     file offset to seek to
  49 * @maxsize:    maximum file size
  50 *
  51 * This is a low-level filesystem helper for updating the file offset to
  52 * the value specified by @offset if the given offset is valid and it is
  53 * not equal to the current file offset.
  54 *
  55 * Return the specified offset on success and -EINVAL on invalid offset.
  56 */
  57loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  58{
  59        if (offset < 0 && !unsigned_offsets(file))
  60                return -EINVAL;
  61        if (offset > maxsize)
  62                return -EINVAL;
  63
  64        if (offset != file->f_pos) {
  65                file->f_pos = offset;
  66                file->f_version = 0;
  67        }
  68        return offset;
  69}
  70EXPORT_SYMBOL(vfs_setpos);
  71
  72/**
  73 * generic_file_llseek_size - generic llseek implementation for regular files
  74 * @file:       file structure to seek on
  75 * @offset:     file offset to seek to
  76 * @whence:     type of seek
  77 * @size:       max size of this file in file system
  78 * @eof:        offset used for SEEK_END position
  79 *
  80 * This is a variant of generic_file_llseek that allows passing in a custom
  81 * maximum file size and a custom EOF position, for e.g. hashed directories
  82 *
  83 * Synchronization:
  84 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  85 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  86 * read/writes behave like SEEK_SET against seeks.
  87 */
  88loff_t
  89generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  90                loff_t maxsize, loff_t eof)
  91{
  92        switch (whence) {
  93        case SEEK_END:
  94                offset += eof;
  95                break;
  96        case SEEK_CUR:
  97                /*
  98                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
  99                 * position-querying operation.  Avoid rewriting the "same"
 100                 * f_pos value back to the file because a concurrent read(),
 101                 * write() or lseek() might have altered it
 102                 */
 103                if (offset == 0)
 104                        return file->f_pos;
 105                /*
 106                 * f_lock protects against read/modify/write race with other
 107                 * SEEK_CURs. Note that parallel writes and reads behave
 108                 * like SEEK_SET.
 109                 */
 110                spin_lock(&file->f_lock);
 111                offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 112                spin_unlock(&file->f_lock);
 113                return offset;
 114        case SEEK_DATA:
 115                /*
 116                 * In the generic case the entire file is data, so as long as
 117                 * offset isn't at the end of the file then the offset is data.
 118                 */
 119                if (offset >= eof)
 120                        return -ENXIO;
 121                break;
 122        case SEEK_HOLE:
 123                /*
 124                 * There is a virtual hole at the end of the file, so as long as
 125                 * offset isn't i_size or larger, return i_size.
 126                 */
 127                if (offset >= eof)
 128                        return -ENXIO;
 129                offset = eof;
 130                break;
 131        }
 132
 133        return vfs_setpos(file, offset, maxsize);
 134}
 135EXPORT_SYMBOL(generic_file_llseek_size);
 136
 137/**
 138 * generic_file_llseek - generic llseek implementation for regular files
 139 * @file:       file structure to seek on
 140 * @offset:     file offset to seek to
 141 * @whence:     type of seek
 142 *
 143 * This is a generic implemenation of ->llseek useable for all normal local
 144 * filesystems.  It just updates the file offset to the value specified by
 145 * @offset and @whence.
 146 */
 147loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 148{
 149        struct inode *inode = file->f_mapping->host;
 150
 151        return generic_file_llseek_size(file, offset, whence,
 152                                        inode->i_sb->s_maxbytes,
 153                                        i_size_read(inode));
 154}
 155EXPORT_SYMBOL(generic_file_llseek);
 156
 157/**
 158 * fixed_size_llseek - llseek implementation for fixed-sized devices
 159 * @file:       file structure to seek on
 160 * @offset:     file offset to seek to
 161 * @whence:     type of seek
 162 * @size:       size of the file
 163 *
 164 */
 165loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
 166{
 167        switch (whence) {
 168        case SEEK_SET: case SEEK_CUR: case SEEK_END:
 169                return generic_file_llseek_size(file, offset, whence,
 170                                                size, size);
 171        default:
 172                return -EINVAL;
 173        }
 174}
 175EXPORT_SYMBOL(fixed_size_llseek);
 176
 177/**
 178 * noop_llseek - No Operation Performed llseek implementation
 179 * @file:       file structure to seek on
 180 * @offset:     file offset to seek to
 181 * @whence:     type of seek
 182 *
 183 * This is an implementation of ->llseek useable for the rare special case when
 184 * userspace expects the seek to succeed but the (device) file is actually not
 185 * able to perform the seek. In this case you use noop_llseek() instead of
 186 * falling back to the default implementation of ->llseek.
 187 */
 188loff_t noop_llseek(struct file *file, loff_t offset, int whence)
 189{
 190        return file->f_pos;
 191}
 192EXPORT_SYMBOL(noop_llseek);
 193
 194loff_t no_llseek(struct file *file, loff_t offset, int whence)
 195{
 196        return -ESPIPE;
 197}
 198EXPORT_SYMBOL(no_llseek);
 199
 200loff_t default_llseek(struct file *file, loff_t offset, int whence)
 201{
 202        struct inode *inode = file_inode(file);
 203        loff_t retval;
 204
 205        mutex_lock(&inode->i_mutex);
 206        switch (whence) {
 207                case SEEK_END:
 208                        offset += i_size_read(inode);
 209                        break;
 210                case SEEK_CUR:
 211                        if (offset == 0) {
 212                                retval = file->f_pos;
 213                                goto out;
 214                        }
 215                        offset += file->f_pos;
 216                        break;
 217                case SEEK_DATA:
 218                        /*
 219                         * In the generic case the entire file is data, so as
 220                         * long as offset isn't at the end of the file then the
 221                         * offset is data.
 222                         */
 223                        if (offset >= inode->i_size) {
 224                                retval = -ENXIO;
 225                                goto out;
 226                        }
 227                        break;
 228                case SEEK_HOLE:
 229                        /*
 230                         * There is a virtual hole at the end of the file, so
 231                         * as long as offset isn't i_size or larger, return
 232                         * i_size.
 233                         */
 234                        if (offset >= inode->i_size) {
 235                                retval = -ENXIO;
 236                                goto out;
 237                        }
 238                        offset = inode->i_size;
 239                        break;
 240        }
 241        retval = -EINVAL;
 242        if (offset >= 0 || unsigned_offsets(file)) {
 243                if (offset != file->f_pos) {
 244                        file->f_pos = offset;
 245                        file->f_version = 0;
 246                }
 247                retval = offset;
 248        }
 249out:
 250        mutex_unlock(&inode->i_mutex);
 251        return retval;
 252}
 253EXPORT_SYMBOL(default_llseek);
 254
 255loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 256{
 257        loff_t (*fn)(struct file *, loff_t, int);
 258
 259        fn = no_llseek;
 260        if (file->f_mode & FMODE_LSEEK) {
 261                if (file->f_op->llseek)
 262                        fn = file->f_op->llseek;
 263        }
 264        return fn(file, offset, whence);
 265}
 266EXPORT_SYMBOL(vfs_llseek);
 267
 268static inline struct fd fdget_pos(int fd)
 269{
 270        return __to_fd(__fdget_pos(fd));
 271}
 272
 273static inline void fdput_pos(struct fd f)
 274{
 275        if (f.flags & FDPUT_POS_UNLOCK)
 276                mutex_unlock(&f.file->f_pos_lock);
 277        fdput(f);
 278}
 279
 280SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 281{
 282        off_t retval;
 283        struct fd f = fdget_pos(fd);
 284        if (!f.file)
 285                return -EBADF;
 286
 287        retval = -EINVAL;
 288        if (whence <= SEEK_MAX) {
 289                loff_t res = vfs_llseek(f.file, offset, whence);
 290                retval = res;
 291                if (res != (loff_t)retval)
 292                        retval = -EOVERFLOW;    /* LFS: should only happen on 32 bit platforms */
 293        }
 294        fdput_pos(f);
 295        return retval;
 296}
 297
 298#ifdef CONFIG_COMPAT
 299COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
 300{
 301        return sys_lseek(fd, offset, whence);
 302}
 303#endif
 304
 305#ifdef __ARCH_WANT_SYS_LLSEEK
 306SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 307                unsigned long, offset_low, loff_t __user *, result,
 308                unsigned int, whence)
 309{
 310        int retval;
 311        struct fd f = fdget_pos(fd);
 312        loff_t offset;
 313
 314        if (!f.file)
 315                return -EBADF;
 316
 317        retval = -EINVAL;
 318        if (whence > SEEK_MAX)
 319                goto out_putf;
 320
 321        offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 322                        whence);
 323
 324        retval = (int)offset;
 325        if (offset >= 0) {
 326                retval = -EFAULT;
 327                if (!copy_to_user(result, &offset, sizeof(offset)))
 328                        retval = 0;
 329        }
 330out_putf:
 331        fdput_pos(f);
 332        return retval;
 333}
 334#endif
 335
 336ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
 337{
 338        struct kiocb kiocb;
 339        ssize_t ret;
 340
 341        if (!file->f_op->read_iter)
 342                return -EINVAL;
 343
 344        init_sync_kiocb(&kiocb, file);
 345        kiocb.ki_pos = *ppos;
 346        kiocb.ki_nbytes = iov_iter_count(iter);
 347
 348        iter->type |= READ;
 349        ret = file->f_op->read_iter(&kiocb, iter);
 350        if (ret == -EIOCBQUEUED)
 351                ret = wait_on_sync_kiocb(&kiocb);
 352
 353        if (ret > 0)
 354                *ppos = kiocb.ki_pos;
 355        return ret;
 356}
 357EXPORT_SYMBOL(vfs_iter_read);
 358
 359ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
 360{
 361        struct kiocb kiocb;
 362        ssize_t ret;
 363
 364        if (!file->f_op->write_iter)
 365                return -EINVAL;
 366
 367        init_sync_kiocb(&kiocb, file);
 368        kiocb.ki_pos = *ppos;
 369        kiocb.ki_nbytes = iov_iter_count(iter);
 370
 371        iter->type |= WRITE;
 372        ret = file->f_op->write_iter(&kiocb, iter);
 373        if (ret == -EIOCBQUEUED)
 374                ret = wait_on_sync_kiocb(&kiocb);
 375
 376        if (ret > 0)
 377                *ppos = kiocb.ki_pos;
 378        return ret;
 379}
 380EXPORT_SYMBOL(vfs_iter_write);
 381
 382/*
 383 * rw_verify_area doesn't like huge counts. We limit
 384 * them to something that fits in "int" so that others
 385 * won't have to do range checks all the time.
 386 */
 387int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 388{
 389        struct inode *inode;
 390        loff_t pos;
 391        int retval = -EINVAL;
 392
 393        inode = file_inode(file);
 394        if (unlikely((ssize_t) count < 0))
 395                return retval;
 396        pos = *ppos;
 397        if (unlikely(pos < 0)) {
 398                if (!unsigned_offsets(file))
 399                        return retval;
 400                if (count >= -pos) /* both values are in 0..LLONG_MAX */
 401                        return -EOVERFLOW;
 402        } else if (unlikely((loff_t) (pos + count) < 0)) {
 403                if (!unsigned_offsets(file))
 404                        return retval;
 405        }
 406
 407        if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
 408                retval = locks_mandatory_area(
 409                        read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
 410                        inode, file, pos, count);
 411                if (retval < 0)
 412                        return retval;
 413        }
 414        retval = security_file_permission(file,
 415                                read_write == READ ? MAY_READ : MAY_WRITE);
 416        if (retval)
 417                return retval;
 418        return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
 419}
 420
 421ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 422{
 423        struct iovec iov = { .iov_base = buf, .iov_len = len };
 424        struct kiocb kiocb;
 425        ssize_t ret;
 426
 427        init_sync_kiocb(&kiocb, filp);
 428        kiocb.ki_pos = *ppos;
 429        kiocb.ki_nbytes = len;
 430
 431        ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
 432        if (-EIOCBQUEUED == ret)
 433                ret = wait_on_sync_kiocb(&kiocb);
 434        *ppos = kiocb.ki_pos;
 435        return ret;
 436}
 437
 438EXPORT_SYMBOL(do_sync_read);
 439
 440ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 441{
 442        struct iovec iov = { .iov_base = buf, .iov_len = len };
 443        struct kiocb kiocb;
 444        struct iov_iter iter;
 445        ssize_t ret;
 446
 447        init_sync_kiocb(&kiocb, filp);
 448        kiocb.ki_pos = *ppos;
 449        kiocb.ki_nbytes = len;
 450        iov_iter_init(&iter, READ, &iov, 1, len);
 451
 452        ret = filp->f_op->read_iter(&kiocb, &iter);
 453        if (-EIOCBQUEUED == ret)
 454                ret = wait_on_sync_kiocb(&kiocb);
 455        *ppos = kiocb.ki_pos;
 456        return ret;
 457}
 458
 459EXPORT_SYMBOL(new_sync_read);
 460
 461ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
 462                   loff_t *pos)
 463{
 464        ssize_t ret;
 465
 466        if (file->f_op->read)
 467                ret = file->f_op->read(file, buf, count, pos);
 468        else if (file->f_op->aio_read)
 469                ret = do_sync_read(file, buf, count, pos);
 470        else if (file->f_op->read_iter)
 471                ret = new_sync_read(file, buf, count, pos);
 472        else
 473                ret = -EINVAL;
 474
 475        return ret;
 476}
 477
 478ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 479{
 480        ssize_t ret;
 481
 482        if (!(file->f_mode & FMODE_READ))
 483                return -EBADF;
 484        if (!(file->f_mode & FMODE_CAN_READ))
 485                return -EINVAL;
 486        if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
 487                return -EFAULT;
 488
 489        ret = rw_verify_area(READ, file, pos, count);
 490        if (ret >= 0) {
 491                count = ret;
 492                ret = __vfs_read(file, buf, count, pos);
 493                if (ret > 0) {
 494                        fsnotify_access(file);
 495                        add_rchar(current, ret);
 496                }
 497                inc_syscr(current);
 498        }
 499
 500        return ret;
 501}
 502
 503EXPORT_SYMBOL(vfs_read);
 504
 505ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 506{
 507        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 508        struct kiocb kiocb;
 509        ssize_t ret;
 510
 511        init_sync_kiocb(&kiocb, filp);
 512        kiocb.ki_pos = *ppos;
 513        kiocb.ki_nbytes = len;
 514
 515        ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
 516        if (-EIOCBQUEUED == ret)
 517                ret = wait_on_sync_kiocb(&kiocb);
 518        *ppos = kiocb.ki_pos;
 519        return ret;
 520}
 521
 522EXPORT_SYMBOL(do_sync_write);
 523
 524ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 525{
 526        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 527        struct kiocb kiocb;
 528        struct iov_iter iter;
 529        ssize_t ret;
 530
 531        init_sync_kiocb(&kiocb, filp);
 532        kiocb.ki_pos = *ppos;
 533        kiocb.ki_nbytes = len;
 534        iov_iter_init(&iter, WRITE, &iov, 1, len);
 535
 536        ret = filp->f_op->write_iter(&kiocb, &iter);
 537        if (-EIOCBQUEUED == ret)
 538                ret = wait_on_sync_kiocb(&kiocb);
 539        *ppos = kiocb.ki_pos;
 540        return ret;
 541}
 542
 543EXPORT_SYMBOL(new_sync_write);
 544
 545ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
 546{
 547        mm_segment_t old_fs;
 548        const char __user *p;
 549        ssize_t ret;
 550
 551        if (!(file->f_mode & FMODE_CAN_WRITE))
 552                return -EINVAL;
 553
 554        old_fs = get_fs();
 555        set_fs(get_ds());
 556        p = (__force const char __user *)buf;
 557        if (count > MAX_RW_COUNT)
 558                count =  MAX_RW_COUNT;
 559        if (file->f_op->write)
 560                ret = file->f_op->write(file, p, count, pos);
 561        else if (file->f_op->aio_write)
 562                ret = do_sync_write(file, p, count, pos);
 563        else
 564                ret = new_sync_write(file, p, count, pos);
 565        set_fs(old_fs);
 566        if (ret > 0) {
 567                fsnotify_modify(file);
 568                add_wchar(current, ret);
 569        }
 570        inc_syscw(current);
 571        return ret;
 572}
 573
 574EXPORT_SYMBOL(__kernel_write);
 575
 576ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 577{
 578        ssize_t ret;
 579
 580        if (!(file->f_mode & FMODE_WRITE))
 581                return -EBADF;
 582        if (!(file->f_mode & FMODE_CAN_WRITE))
 583                return -EINVAL;
 584        if (unlikely(!access_ok(VERIFY_READ, buf, count)))
 585                return -EFAULT;
 586
 587        ret = rw_verify_area(WRITE, file, pos, count);
 588        if (ret >= 0) {
 589                count = ret;
 590                file_start_write(file);
 591                if (file->f_op->write)
 592                        ret = file->f_op->write(file, buf, count, pos);
 593                else if (file->f_op->aio_write)
 594                        ret = do_sync_write(file, buf, count, pos);
 595                else
 596                        ret = new_sync_write(file, buf, count, pos);
 597                if (ret > 0) {
 598                        fsnotify_modify(file);
 599                        add_wchar(current, ret);
 600                }
 601                inc_syscw(current);
 602                file_end_write(file);
 603        }
 604
 605        return ret;
 606}
 607
 608EXPORT_SYMBOL(vfs_write);
 609
 610static inline loff_t file_pos_read(struct file *file)
 611{
 612        return file->f_pos;
 613}
 614
 615static inline void file_pos_write(struct file *file, loff_t pos)
 616{
 617        file->f_pos = pos;
 618}
 619
 620SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 621{
 622        struct fd f = fdget_pos(fd);
 623        ssize_t ret = -EBADF;
 624
 625        if (f.file) {
 626                loff_t pos = file_pos_read(f.file);
 627                ret = vfs_read(f.file, buf, count, &pos);
 628                if (ret >= 0)
 629                        file_pos_write(f.file, pos);
 630                fdput_pos(f);
 631        }
 632        return ret;
 633}
 634
 635SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 636                size_t, count)
 637{
 638        struct fd f = fdget_pos(fd);
 639        ssize_t ret = -EBADF;
 640
 641        if (f.file) {
 642                loff_t pos = file_pos_read(f.file);
 643                ret = vfs_write(f.file, buf, count, &pos);
 644                if (ret >= 0)
 645                        file_pos_write(f.file, pos);
 646                fdput_pos(f);
 647        }
 648
 649        return ret;
 650}
 651
 652SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
 653                        size_t, count, loff_t, pos)
 654{
 655        struct fd f;
 656        ssize_t ret = -EBADF;
 657
 658        if (pos < 0)
 659                return -EINVAL;
 660
 661        f = fdget(fd);
 662        if (f.file) {
 663                ret = -ESPIPE;
 664                if (f.file->f_mode & FMODE_PREAD)
 665                        ret = vfs_read(f.file, buf, count, &pos);
 666                fdput(f);
 667        }
 668
 669        return ret;
 670}
 671
 672SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
 673                         size_t, count, loff_t, pos)
 674{
 675        struct fd f;
 676        ssize_t ret = -EBADF;
 677
 678        if (pos < 0)
 679                return -EINVAL;
 680
 681        f = fdget(fd);
 682        if (f.file) {
 683                ret = -ESPIPE;
 684                if (f.file->f_mode & FMODE_PWRITE)  
 685                        ret = vfs_write(f.file, buf, count, &pos);
 686                fdput(f);
 687        }
 688
 689        return ret;
 690}
 691
 692/*
 693 * Reduce an iovec's length in-place.  Return the resulting number of segments
 694 */
 695unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 696{
 697        unsigned long seg = 0;
 698        size_t len = 0;
 699
 700        while (seg < nr_segs) {
 701                seg++;
 702                if (len + iov->iov_len >= to) {
 703                        iov->iov_len = to - len;
 704                        break;
 705                }
 706                len += iov->iov_len;
 707                iov++;
 708        }
 709        return seg;
 710}
 711EXPORT_SYMBOL(iov_shorten);
 712
 713static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov,
 714                unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn)
 715{
 716        struct kiocb kiocb;
 717        struct iov_iter iter;
 718        ssize_t ret;
 719
 720        init_sync_kiocb(&kiocb, filp);
 721        kiocb.ki_pos = *ppos;
 722        kiocb.ki_nbytes = len;
 723
 724        iov_iter_init(&iter, rw, iov, nr_segs, len);
 725        ret = fn(&kiocb, &iter);
 726        if (ret == -EIOCBQUEUED)
 727                ret = wait_on_sync_kiocb(&kiocb);
 728        *ppos = kiocb.ki_pos;
 729        return ret;
 730}
 731
 732static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 733                unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
 734{
 735        struct kiocb kiocb;
 736        ssize_t ret;
 737
 738        init_sync_kiocb(&kiocb, filp);
 739        kiocb.ki_pos = *ppos;
 740        kiocb.ki_nbytes = len;
 741
 742        ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
 743        if (ret == -EIOCBQUEUED)
 744                ret = wait_on_sync_kiocb(&kiocb);
 745        *ppos = kiocb.ki_pos;
 746        return ret;
 747}
 748
 749/* Do it by hand, with file-ops */
 750static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
 751                unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
 752{
 753        struct iovec *vector = iov;
 754        ssize_t ret = 0;
 755
 756        while (nr_segs > 0) {
 757                void __user *base;
 758                size_t len;
 759                ssize_t nr;
 760
 761                base = vector->iov_base;
 762                len = vector->iov_len;
 763                vector++;
 764                nr_segs--;
 765
 766                nr = fn(filp, base, len, ppos);
 767
 768                if (nr < 0) {
 769                        if (!ret)
 770                                ret = nr;
 771                        break;
 772                }
 773                ret += nr;
 774                if (nr != len)
 775                        break;
 776        }
 777
 778        return ret;
 779}
 780
 781/* A write operation does a read from user space and vice versa */
 782#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 783
 784ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 785                              unsigned long nr_segs, unsigned long fast_segs,
 786                              struct iovec *fast_pointer,
 787                              struct iovec **ret_pointer)
 788{
 789        unsigned long seg;
 790        ssize_t ret;
 791        struct iovec *iov = fast_pointer;
 792
 793        /*
 794         * SuS says "The readv() function *may* fail if the iovcnt argument
 795         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
 796         * traditionally returned zero for zero segments, so...
 797         */
 798        if (nr_segs == 0) {
 799                ret = 0;
 800                goto out;
 801        }
 802
 803        /*
 804         * First get the "struct iovec" from user memory and
 805         * verify all the pointers
 806         */
 807        if (nr_segs > UIO_MAXIOV) {
 808                ret = -EINVAL;
 809                goto out;
 810        }
 811        if (nr_segs > fast_segs) {
 812                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
 813                if (iov == NULL) {
 814                        ret = -ENOMEM;
 815                        goto out;
 816                }
 817        }
 818        if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
 819                ret = -EFAULT;
 820                goto out;
 821        }
 822
 823        /*
 824         * According to the Single Unix Specification we should return EINVAL
 825         * if an element length is < 0 when cast to ssize_t or if the
 826         * total length would overflow the ssize_t return value of the
 827         * system call.
 828         *
 829         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
 830         * overflow case.
 831         */
 832        ret = 0;
 833        for (seg = 0; seg < nr_segs; seg++) {
 834                void __user *buf = iov[seg].iov_base;
 835                ssize_t len = (ssize_t)iov[seg].iov_len;
 836
 837                /* see if we we're about to use an invalid len or if
 838                 * it's about to overflow ssize_t */
 839                if (len < 0) {
 840                        ret = -EINVAL;
 841                        goto out;
 842                }
 843                if (type >= 0
 844                    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
 845                        ret = -EFAULT;
 846                        goto out;
 847                }
 848                if (len > MAX_RW_COUNT - ret) {
 849                        len = MAX_RW_COUNT - ret;
 850                        iov[seg].iov_len = len;
 851                }
 852                ret += len;
 853        }
 854out:
 855        *ret_pointer = iov;
 856        return ret;
 857}
 858
 859static ssize_t do_readv_writev(int type, struct file *file,
 860                               const struct iovec __user * uvector,
 861                               unsigned long nr_segs, loff_t *pos)
 862{
 863        size_t tot_len;
 864        struct iovec iovstack[UIO_FASTIOV];
 865        struct iovec *iov = iovstack;
 866        ssize_t ret;
 867        io_fn_t fn;
 868        iov_fn_t fnv;
 869        iter_fn_t iter_fn;
 870
 871        ret = rw_copy_check_uvector(type, uvector, nr_segs,
 872                                    ARRAY_SIZE(iovstack), iovstack, &iov);
 873        if (ret <= 0)
 874                goto out;
 875
 876        tot_len = ret;
 877        ret = rw_verify_area(type, file, pos, tot_len);
 878        if (ret < 0)
 879                goto out;
 880
 881        fnv = NULL;
 882        if (type == READ) {
 883                fn = file->f_op->read;
 884                fnv = file->f_op->aio_read;
 885                iter_fn = file->f_op->read_iter;
 886        } else {
 887                fn = (io_fn_t)file->f_op->write;
 888                fnv = file->f_op->aio_write;
 889                iter_fn = file->f_op->write_iter;
 890                file_start_write(file);
 891        }
 892
 893        if (iter_fn)
 894                ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
 895                                                pos, iter_fn);
 896        else if (fnv)
 897                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
 898                                                pos, fnv);
 899        else
 900                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 901
 902        if (type != READ)
 903                file_end_write(file);
 904
 905out:
 906        if (iov != iovstack)
 907                kfree(iov);
 908        if ((ret + (type == READ)) > 0) {
 909                if (type == READ)
 910                        fsnotify_access(file);
 911                else
 912                        fsnotify_modify(file);
 913        }
 914        return ret;
 915}
 916
 917ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 918                  unsigned long vlen, loff_t *pos)
 919{
 920        if (!(file->f_mode & FMODE_READ))
 921                return -EBADF;
 922        if (!(file->f_mode & FMODE_CAN_READ))
 923                return -EINVAL;
 924
 925        return do_readv_writev(READ, file, vec, vlen, pos);
 926}
 927
 928EXPORT_SYMBOL(vfs_readv);
 929
 930ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 931                   unsigned long vlen, loff_t *pos)
 932{
 933        if (!(file->f_mode & FMODE_WRITE))
 934                return -EBADF;
 935        if (!(file->f_mode & FMODE_CAN_WRITE))
 936                return -EINVAL;
 937
 938        return do_readv_writev(WRITE, file, vec, vlen, pos);
 939}
 940
 941EXPORT_SYMBOL(vfs_writev);
 942
 943SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 944                unsigned long, vlen)
 945{
 946        struct fd f = fdget_pos(fd);
 947        ssize_t ret = -EBADF;
 948
 949        if (f.file) {
 950                loff_t pos = file_pos_read(f.file);
 951                ret = vfs_readv(f.file, vec, vlen, &pos);
 952                if (ret >= 0)
 953                        file_pos_write(f.file, pos);
 954                fdput_pos(f);
 955        }
 956
 957        if (ret > 0)
 958                add_rchar(current, ret);
 959        inc_syscr(current);
 960        return ret;
 961}
 962
 963SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 964                unsigned long, vlen)
 965{
 966        struct fd f = fdget_pos(fd);
 967        ssize_t ret = -EBADF;
 968
 969        if (f.file) {
 970                loff_t pos = file_pos_read(f.file);
 971                ret = vfs_writev(f.file, vec, vlen, &pos);
 972                if (ret >= 0)
 973                        file_pos_write(f.file, pos);
 974                fdput_pos(f);
 975        }
 976
 977        if (ret > 0)
 978                add_wchar(current, ret);
 979        inc_syscw(current);
 980        return ret;
 981}
 982
 983static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
 984{
 985#define HALF_LONG_BITS (BITS_PER_LONG / 2)
 986        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
 987}
 988
 989SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 990                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 991{
 992        loff_t pos = pos_from_hilo(pos_h, pos_l);
 993        struct fd f;
 994        ssize_t ret = -EBADF;
 995
 996        if (pos < 0)
 997                return -EINVAL;
 998
 999        f = fdget(fd);
1000        if (f.file) {
1001                ret = -ESPIPE;
1002                if (f.file->f_mode & FMODE_PREAD)
1003                        ret = vfs_readv(f.file, vec, vlen, &pos);
1004                fdput(f);
1005        }
1006
1007        if (ret > 0)
1008                add_rchar(current, ret);
1009        inc_syscr(current);
1010        return ret;
1011}
1012
1013SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1014                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1015{
1016        loff_t pos = pos_from_hilo(pos_h, pos_l);
1017        struct fd f;
1018        ssize_t ret = -EBADF;
1019
1020        if (pos < 0)
1021                return -EINVAL;
1022
1023        f = fdget(fd);
1024        if (f.file) {
1025                ret = -ESPIPE;
1026                if (f.file->f_mode & FMODE_PWRITE)
1027                        ret = vfs_writev(f.file, vec, vlen, &pos);
1028                fdput(f);
1029        }
1030
1031        if (ret > 0)
1032                add_wchar(current, ret);
1033        inc_syscw(current);
1034        return ret;
1035}
1036
1037#ifdef CONFIG_COMPAT
1038
1039static ssize_t compat_do_readv_writev(int type, struct file *file,
1040                               const struct compat_iovec __user *uvector,
1041                               unsigned long nr_segs, loff_t *pos)
1042{
1043        compat_ssize_t tot_len;
1044        struct iovec iovstack[UIO_FASTIOV];
1045        struct iovec *iov = iovstack;
1046        ssize_t ret;
1047        io_fn_t fn;
1048        iov_fn_t fnv;
1049        iter_fn_t iter_fn;
1050
1051        ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
1052                                               UIO_FASTIOV, iovstack, &iov);
1053        if (ret <= 0)
1054                goto out;
1055
1056        tot_len = ret;
1057        ret = rw_verify_area(type, file, pos, tot_len);
1058        if (ret < 0)
1059                goto out;
1060
1061        fnv = NULL;
1062        if (type == READ) {
1063                fn = file->f_op->read;
1064                fnv = file->f_op->aio_read;
1065                iter_fn = file->f_op->read_iter;
1066        } else {
1067                fn = (io_fn_t)file->f_op->write;
1068                fnv = file->f_op->aio_write;
1069                iter_fn = file->f_op->write_iter;
1070                file_start_write(file);
1071        }
1072
1073        if (iter_fn)
1074                ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
1075                                                pos, iter_fn);
1076        else if (fnv)
1077                ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
1078                                                pos, fnv);
1079        else
1080                ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
1081
1082        if (type != READ)
1083                file_end_write(file);
1084
1085out:
1086        if (iov != iovstack)
1087                kfree(iov);
1088        if ((ret + (type == READ)) > 0) {
1089                if (type == READ)
1090                        fsnotify_access(file);
1091                else
1092                        fsnotify_modify(file);
1093        }
1094        return ret;
1095}
1096
1097static size_t compat_readv(struct file *file,
1098                           const struct compat_iovec __user *vec,
1099                           unsigned long vlen, loff_t *pos)
1100{
1101        ssize_t ret = -EBADF;
1102
1103        if (!(file->f_mode & FMODE_READ))
1104                goto out;
1105
1106        ret = -EINVAL;
1107        if (!(file->f_mode & FMODE_CAN_READ))
1108                goto out;
1109
1110        ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
1111
1112out:
1113        if (ret > 0)
1114                add_rchar(current, ret);
1115        inc_syscr(current);
1116        return ret;
1117}
1118
1119COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1120                const struct compat_iovec __user *,vec,
1121                compat_ulong_t, vlen)
1122{
1123        struct fd f = fdget_pos(fd);
1124        ssize_t ret;
1125        loff_t pos;
1126
1127        if (!f.file)
1128                return -EBADF;
1129        pos = f.file->f_pos;
1130        ret = compat_readv(f.file, vec, vlen, &pos);
1131        if (ret >= 0)
1132                f.file->f_pos = pos;
1133        fdput_pos(f);
1134        return ret;
1135}
1136
1137static long __compat_sys_preadv64(unsigned long fd,
1138                                  const struct compat_iovec __user *vec,
1139                                  unsigned long vlen, loff_t pos)
1140{
1141        struct fd f;
1142        ssize_t ret;
1143
1144        if (pos < 0)
1145                return -EINVAL;
1146        f = fdget(fd);
1147        if (!f.file)
1148                return -EBADF;
1149        ret = -ESPIPE;
1150        if (f.file->f_mode & FMODE_PREAD)
1151                ret = compat_readv(f.file, vec, vlen, &pos);
1152        fdput(f);
1153        return ret;
1154}
1155
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/*
 * Compat preadv64: the offset arrives as a single loff_t argument, so it
 * is forwarded unchanged to __compat_sys_preadv64().  Only built when the
 * architecture defines __ARCH_WANT_COMPAT_SYS_PREADV64.
 */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
                const struct compat_iovec __user *,vec,
                unsigned long, vlen, loff_t, pos)
{
        return __compat_sys_preadv64(fd, vec, vlen, pos);
}
#endif
1164
1165COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1166                const struct compat_iovec __user *,vec,
1167                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1168{
1169        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1170
1171        return __compat_sys_preadv64(fd, vec, vlen, pos);
1172}
1173
1174static size_t compat_writev(struct file *file,
1175                            const struct compat_iovec __user *vec,
1176                            unsigned long vlen, loff_t *pos)
1177{
1178        ssize_t ret = -EBADF;
1179
1180        if (!(file->f_mode & FMODE_WRITE))
1181                goto out;
1182
1183        ret = -EINVAL;
1184        if (!(file->f_mode & FMODE_CAN_WRITE))
1185                goto out;
1186
1187        ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1188
1189out:
1190        if (ret > 0)
1191                add_wchar(current, ret);
1192        inc_syscw(current);
1193        return ret;
1194}
1195
1196COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1197                const struct compat_iovec __user *, vec,
1198                compat_ulong_t, vlen)
1199{
1200        struct fd f = fdget_pos(fd);
1201        ssize_t ret;
1202        loff_t pos;
1203
1204        if (!f.file)
1205                return -EBADF;
1206        pos = f.file->f_pos;
1207        ret = compat_writev(f.file, vec, vlen, &pos);
1208        if (ret >= 0)
1209                f.file->f_pos = pos;
1210        fdput_pos(f);
1211        return ret;
1212}
1213
1214static long __compat_sys_pwritev64(unsigned long fd,
1215                                   const struct compat_iovec __user *vec,
1216                                   unsigned long vlen, loff_t pos)
1217{
1218        struct fd f;
1219        ssize_t ret;
1220
1221        if (pos < 0)
1222                return -EINVAL;
1223        f = fdget(fd);
1224        if (!f.file)
1225                return -EBADF;
1226        ret = -ESPIPE;
1227        if (f.file->f_mode & FMODE_PWRITE)
1228                ret = compat_writev(f.file, vec, vlen, &pos);
1229        fdput(f);
1230        return ret;
1231}
1232
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/*
 * Compat pwritev64: the offset arrives as a single loff_t argument, so it
 * is forwarded unchanged to __compat_sys_pwritev64().  Only built when the
 * architecture defines __ARCH_WANT_COMPAT_SYS_PWRITEV64.
 */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
                const struct compat_iovec __user *,vec,
                unsigned long, vlen, loff_t, pos)
{
        return __compat_sys_pwritev64(fd, vec, vlen, pos);
}
#endif
1241
1242COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1243                const struct compat_iovec __user *,vec,
1244                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1245{
1246        loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1247
1248        return __compat_sys_pwritev64(fd, vec, vlen, pos);
1249}
1250#endif
1251
1252static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1253                           size_t count, loff_t max)
1254{
1255        struct fd in, out;
1256        struct inode *in_inode, *out_inode;
1257        loff_t pos;
1258        loff_t out_pos;
1259        ssize_t retval;
1260        int fl;
1261
1262        /*
1263         * Get input file, and verify that it is ok..
1264         */
1265        retval = -EBADF;
1266        in = fdget(in_fd);
1267        if (!in.file)
1268                goto out;
1269        if (!(in.file->f_mode & FMODE_READ))
1270                goto fput_in;
1271        retval = -ESPIPE;
1272        if (!ppos) {
1273                pos = in.file->f_pos;
1274        } else {
1275                pos = *ppos;
1276                if (!(in.file->f_mode & FMODE_PREAD))
1277                        goto fput_in;
1278        }
1279        retval = rw_verify_area(READ, in.file, &pos, count);
1280        if (retval < 0)
1281                goto fput_in;
1282        count = retval;
1283
1284        /*
1285         * Get output file, and verify that it is ok..
1286         */
1287        retval = -EBADF;
1288        out = fdget(out_fd);
1289        if (!out.file)
1290                goto fput_in;
1291        if (!(out.file->f_mode & FMODE_WRITE))
1292                goto fput_out;
1293        retval = -EINVAL;
1294        in_inode = file_inode(in.file);
1295        out_inode = file_inode(out.file);
1296        out_pos = out.file->f_pos;
1297        retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1298        if (retval < 0)
1299                goto fput_out;
1300        count = retval;
1301
1302        if (!max)
1303                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1304
1305        if (unlikely(pos + count > max)) {
1306                retval = -EOVERFLOW;
1307                if (pos >= max)
1308                        goto fput_out;
1309                count = max - pos;
1310        }
1311
1312        fl = 0;
1313#if 0
1314        /*
1315         * We need to debate whether we can enable this or not. The
1316         * man page documents EAGAIN return for the output at least,
1317         * and the application is arguably buggy if it doesn't expect
1318         * EAGAIN on a non-blocking file descriptor.
1319         */
1320        if (in.file->f_flags & O_NONBLOCK)
1321                fl = SPLICE_F_NONBLOCK;
1322#endif
1323        file_start_write(out.file);
1324        retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1325        file_end_write(out.file);
1326
1327        if (retval > 0) {
1328                add_rchar(current, retval);
1329                add_wchar(current, retval);
1330                fsnotify_access(in.file);
1331                fsnotify_modify(out.file);
1332                out.file->f_pos = out_pos;
1333                if (ppos)
1334                        *ppos = pos;
1335                else
1336                        in.file->f_pos = pos;
1337        }
1338
1339        inc_syscr(current);
1340        inc_syscw(current);
1341        if (pos > max)
1342                retval = -EOVERFLOW;
1343
1344fput_out:
1345        fdput(out);
1346fput_in:
1347        fdput(in);
1348out:
1349        return retval;
1350}
1351
1352SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1353{
1354        loff_t pos;
1355        off_t off;
1356        ssize_t ret;
1357
1358        if (offset) {
1359                if (unlikely(get_user(off, offset)))
1360                        return -EFAULT;
1361                pos = off;
1362                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1363                if (unlikely(put_user(pos, offset)))
1364                        return -EFAULT;
1365                return ret;
1366        }
1367
1368        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1369}
1370
1371SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1372{
1373        loff_t pos;
1374        ssize_t ret;
1375
1376        if (offset) {
1377                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1378                        return -EFAULT;
1379                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1380                if (unlikely(put_user(pos, offset)))
1381                        return -EFAULT;
1382                return ret;
1383        }
1384
1385        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1386}
1387
1388#ifdef CONFIG_COMPAT
1389COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1390                compat_off_t __user *, offset, compat_size_t, count)
1391{
1392        loff_t pos;
1393        off_t off;
1394        ssize_t ret;
1395
1396        if (offset) {
1397                if (unlikely(get_user(off, offset)))
1398                        return -EFAULT;
1399                pos = off;
1400                ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1401                if (unlikely(put_user(pos, offset)))
1402                        return -EFAULT;
1403                return ret;
1404        }
1405
1406        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1407}
1408
1409COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1410                compat_loff_t __user *, offset, compat_size_t, count)
1411{
1412        loff_t pos;
1413        ssize_t ret;
1414
1415        if (offset) {
1416                if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1417                        return -EFAULT;
1418                ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1419                if (unlikely(put_user(pos, offset)))
1420                        return -EFAULT;
1421                return ret;
1422        }
1423
1424        return do_sendfile(out_fd, in_fd, NULL, count, 0);
1425}
1426#endif
1427