linux/fs/fuse/file.c
/*
  FUSE: Filesystem in Userspace
  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>

  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
*/

#include "fuse_i.h"

#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/fs.h>

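/*
 * Send FUSE_OPEN or FUSE_OPENDIR and return the server's reply in
 * @outargp.  O_CREAT/O_EXCL/O_NOCTTY have already been handled by the
 * VFS, so they are masked out before the request is sent.
 */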
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
			  unsigned int open_flags, int opcode,
			  struct fuse_open_out *outargp)
{
	struct fuse_open_in inarg;
	FUSE_ARGS(args);

	memset(&inarg, 0, sizeof(inarg));
	inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
	if (!fm->fc->atomic_o_trunc)
		inarg.flags &= ~O_TRUNC;

	if (fm->fc->handle_killpriv_v2 &&
	    (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) {
		inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
	}

	args.opcode = opcode;
	args.nodeid = nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(*outargp);
	args.out_args[0].value = outargp;

	return fuse_simple_request(fm, &args);
}

struct fuse_release_args {
	struct fuse_args args;
	struct fuse_release_in inarg;
	struct inode *inode;
};

struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
{
	struct fuse_file *ff;

	ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
	if (unlikely(!ff))
		return NULL;

	ff->fm = fm;
	ff->release_args = kzalloc(sizeof(*ff->release_args),
				   GFP_KERNEL_ACCOUNT);
	if (!ff->release_args) {
		kfree(ff);
		return NULL;
	}

	INIT_LIST_HEAD(&ff->write_entry);
	mutex_init(&ff->readdir.lock);
	refcount_set(&ff->count, 1);
	RB_CLEAR_NODE(&ff->polled_node);
	init_waitqueue_head(&ff->poll_wait);

	ff->kh = atomic64_inc_return(&fm->fc->khctr);

	return ff;
}

void fuse_file_free(struct fuse_file *ff)
{
	kfree(ff->release_args);
	mutex_destroy(&ff->readdir.lock);
	kfree(ff);
}

static struct fuse_file *fuse_file_get(struct fuse_file *ff)
{
	refcount_inc(&ff->count);
	return ff;
}

static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
			     int error)
{
	struct fuse_release_args *ra = container_of(args, typeof(*ra), args);

	iput(ra->inode);
	kfree(ra);
}

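/*
 * Drop a reference to a fuse_file.  When the last reference goes away,
 * the RELEASE/RELEASEDIR request prepared in ff->release_args is sent:
 * synchronously if @sync is set, in the background otherwise.  If the
 * server never implemented open, no request is sent at all.
 */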
static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
{
	if (refcount_dec_and_test(&ff->count)) {
		struct fuse_args *args = &ff->release_args->args;

		if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) {
			/* Do nothing when client does not implement 'open' */
			fuse_release_end(ff->fm, args, 0);
		} else if (sync) {
			fuse_simple_request(ff->fm, args);
			fuse_release_end(ff->fm, args, 0);
		} else {
			args->end = fuse_release_end;
			if (fuse_simple_background(ff->fm, args,
						   GFP_KERNEL | __GFP_NOFAIL))
				fuse_release_end(ff->fm, args, -ENOTCONN);
		}
		kfree(ff);
	}
}

struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
				 unsigned int open_flags, bool isdir)
{
	struct fuse_conn *fc = fm->fc;
	struct fuse_file *ff;
	int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;

	ff = fuse_file_alloc(fm);
	if (!ff)
		return ERR_PTR(-ENOMEM);

	ff->fh = 0;
	/* Default for no-open */
	ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
	if (isdir ? !fc->no_opendir : !fc->no_open) {
		struct fuse_open_out outarg;
		int err;

		err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg);
		if (!err) {
			ff->fh = outarg.fh;
			ff->open_flags = outarg.open_flags;
		} else if (err != -ENOSYS) {
			fuse_file_free(ff);
			return ERR_PTR(err);
		} else {
			if (isdir)
				fc->no_opendir = 1;
			else
				fc->no_open = 1;
		}
	}

	if (isdir)
		ff->open_flags &= ~FOPEN_DIRECT_IO;

	ff->nodeid = nodeid;

	return ff;
}

int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
		 bool isdir)
{
	struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir);

	if (!IS_ERR(ff))
		file->private_data = ff;

	return PTR_ERR_OR_ZERO(ff);
}
EXPORT_SYMBOL_GPL(fuse_do_open);

static void fuse_link_write_file(struct file *file)
{
	struct inode *inode = file_inode(file);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_file *ff = file->private_data;
	/*
	 * The file may be written through mmap, so chain it onto the
	 * inode's write_files list
	 */
	spin_lock(&fi->lock);
	if (list_empty(&ff->write_entry))
		list_add(&ff->write_entry, &fi->write_files);
	spin_unlock(&fi->lock);
}

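/*
 * Apply the server's FOPEN_* flags and, for atomic O_TRUNC opens, bring
 * the page cache and cached i_size in line with the truncation that the
 * server has already performed.
 */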
void fuse_finish_open(struct inode *inode, struct file *file)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = get_fuse_conn(inode);

	if (ff->open_flags & FOPEN_STREAM)
		stream_open(inode, file);
	else if (ff->open_flags & FOPEN_NONSEEKABLE)
		nonseekable_open(inode, file);

	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
		struct fuse_inode *fi = get_fuse_inode(inode);

		spin_lock(&fi->lock);
		fi->attr_version = atomic64_inc_return(&fc->attr_version);
		i_size_write(inode, 0);
		spin_unlock(&fi->lock);
		truncate_pagecache(inode, 0);
		fuse_invalidate_attr(inode);
		if (fc->writeback_cache)
			file_update_time(file);
	} else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
		invalidate_inode_pages2(inode->i_mapping);
	}

	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
		fuse_link_write_file(file);
}

int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_conn *fc = fm->fc;
	int err;
	bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
			  fc->atomic_o_trunc &&
			  fc->writeback_cache;
	bool dax_truncate = (file->f_flags & O_TRUNC) &&
			  fc->atomic_o_trunc && FUSE_IS_DAX(inode);

	if (fuse_is_bad(inode))
		return -EIO;

	err = generic_file_open(inode, file);
	if (err)
		return err;

	if (is_wb_truncate || dax_truncate) {
		inode_lock(inode);
		fuse_set_nowrite(inode);
	}

	if (dax_truncate) {
		filemap_invalidate_lock(inode->i_mapping);
		err = fuse_dax_break_layouts(inode, 0, 0);
		if (err)
			goto out;
	}

	err = fuse_do_open(fm, get_node_id(inode), file, isdir);
	if (!err)
		fuse_finish_open(inode, file);

out:
	if (dax_truncate)
		filemap_invalidate_unlock(inode->i_mapping);

	if (is_wb_truncate || dax_truncate) {
		fuse_release_nowrite(inode);
		inode_unlock(inode);
	}

	return err;
}

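/*
 * Fill in ff->release_args so the RELEASE/RELEASEDIR request is ready to
 * go, and unhash the file from the inode's write_files list and the
 * connection's polled-files rbtree.  The request itself is sent later,
 * from fuse_file_put().
 */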
static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
				 unsigned int flags, int opcode)
{
	struct fuse_conn *fc = ff->fm->fc;
	struct fuse_release_args *ra = ff->release_args;

	/* Inode is NULL on error path of fuse_create_open() */
	if (likely(fi)) {
		spin_lock(&fi->lock);
		list_del(&ff->write_entry);
		spin_unlock(&fi->lock);
	}
	spin_lock(&fc->lock);
	if (!RB_EMPTY_NODE(&ff->polled_node))
		rb_erase(&ff->polled_node, &fc->polled_files);
	spin_unlock(&fc->lock);

	wake_up_interruptible_all(&ff->poll_wait);

	ra->inarg.fh = ff->fh;
	ra->inarg.flags = flags;
	ra->args.in_numargs = 1;
	ra->args.in_args[0].size = sizeof(struct fuse_release_in);
	ra->args.in_args[0].value = &ra->inarg;
	ra->args.opcode = opcode;
	ra->args.nodeid = ff->nodeid;
	ra->args.force = true;
	ra->args.nocreds = true;
}

void fuse_file_release(struct inode *inode, struct fuse_file *ff,
		       unsigned int open_flags, fl_owner_t id, bool isdir)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_release_args *ra = ff->release_args;
	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;

	fuse_prepare_release(fi, ff, open_flags, opcode);

	if (ff->flock) {
		ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
		ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
	}
	/* Hold inode until release is finished */
	ra->inode = igrab(inode);

	/*
	 * Normally this will send the RELEASE request, however if
	 * some asynchronous READ or WRITE requests are outstanding,
	 * the sending will be delayed.
	 *
	 * Make the release synchronous if this is a fuseblk mount:
	 * synchronous RELEASE is allowed (and desirable) in this case
	 * because the server can be trusted not to screw up.
	 */
	fuse_file_put(ff, ff->fm->fc->destroy, isdir);
}

void fuse_release_common(struct file *file, bool isdir)
{
	fuse_file_release(file_inode(file), file->private_data, file->f_flags,
			  (fl_owner_t) file, isdir);
}

static int fuse_open(struct inode *inode, struct file *file)
{
	return fuse_open_common(inode, file, false);
}

static int fuse_release(struct inode *inode, struct file *file)
{
	struct fuse_conn *fc = get_fuse_conn(inode);

	/* see fuse_vma_close() for !writeback_cache case */
	if (fc->writeback_cache)
		write_inode_now(inode, 1);

	fuse_release_common(file, false);

	/* return value is ignored by VFS */
	return 0;
}

void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
		       unsigned int flags)
{
	WARN_ON(refcount_read(&ff->count) > 1);
	fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
	/*
	 * iput(NULL) is a no-op and since the refcount is 1 and everything's
	 * synchronous, we are fine with not doing igrab() here
	 */
	fuse_file_put(ff, true, false);
}
EXPORT_SYMBOL_GPL(fuse_sync_release);

/*
 * Scramble the ID space with XTEA, so that the value of the files_struct
 * pointer is not exposed to userspace.
 */
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
{
	u32 *k = fc->scramble_key;
	u64 v = (unsigned long) id;
	u32 v0 = v;
	u32 v1 = v >> 32;
	u32 sum = 0;
	int i;

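	/* 32 rounds of XTEA encipherment; 0x9E3779B9 is the golden-ratio delta */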
	for (i = 0; i < 32; i++) {
		v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
		sum += 0x9E3779B9;
		v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
	}

	return (u64) v0 + ((u64) v1 << 32);
}

struct fuse_writepage_args {
	struct fuse_io_args ia;
	struct rb_node writepages_entry;
	struct list_head queue_entry;
	struct fuse_writepage_args *next;
	struct inode *inode;
	struct fuse_sync_bucket *bucket;
};

static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
					    pgoff_t idx_from, pgoff_t idx_to)
{
	struct rb_node *n;

	n = fi->writepages.rb_node;

	while (n) {
		struct fuse_writepage_args *wpa;
		pgoff_t curr_index;

		wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
		WARN_ON(get_fuse_inode(wpa->inode) != fi);
		curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
		if (idx_from >= curr_index + wpa->ia.ap.num_pages)
			n = n->rb_right;
		else if (idx_to < curr_index)
			n = n->rb_left;
		else
			return wpa;
	}
	return NULL;
}

/*
 * Check if any page in a range is under writeback
 *
 * This is currently done by walking the list of writepage requests
 * for the inode, which can be pretty inefficient.
 */
static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
				   pgoff_t idx_to)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	bool found;

	spin_lock(&fi->lock);
	found = fuse_find_writeback(fi, idx_from, idx_to);
	spin_unlock(&fi->lock);

	return found;
}

static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
{
	return fuse_range_is_writeback(inode, index, index);
}

/*
 * Wait for page writeback to be completed.
 *
 * Since fuse doesn't rely on the VM writeback tracking, this has to
 * use some other means.
 */
static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
{
	struct fuse_inode *fi = get_fuse_inode(inode);

	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
}

/*
 * Wait for all pending writepages on the inode to finish.
 *
 * This is currently done by blocking further writes with FUSE_NOWRITE
 * and waiting for all sent writes to complete.
 *
 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
 * could conflict with truncation.
 */
static void fuse_sync_writes(struct inode *inode)
{
	fuse_set_nowrite(inode);
	fuse_release_nowrite(inode);
}

static int fuse_flush(struct file *file, fl_owner_t id)
{
	struct inode *inode = file_inode(file);
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_file *ff = file->private_data;
	struct fuse_flush_in inarg;
	FUSE_ARGS(args);
	int err;

	if (fuse_is_bad(inode))
		return -EIO;

	err = write_inode_now(inode, 1);
	if (err)
		return err;

	inode_lock(inode);
	fuse_sync_writes(inode);
	inode_unlock(inode);

	err = filemap_check_errors(file->f_mapping);
	if (err)
		return err;

	err = 0;
	if (fm->fc->no_flush)
		goto inval_attr_out;

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
	args.opcode = FUSE_FLUSH;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.force = true;

	err = fuse_simple_request(fm, &args);
	if (err == -ENOSYS) {
		fm->fc->no_flush = 1;
		err = 0;
	}

inval_attr_out:
	/*
	 * In-memory i_blocks is not maintained by fuse; if writeback cache
	 * is enabled, i_blocks from the cached attr may not be accurate.
	 */
	if (!err && fm->fc->writeback_cache)
		fuse_invalidate_attr(inode);
	return err;
}

int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
		      int datasync, int opcode)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_file *ff = file->private_data;
	FUSE_ARGS(args);
	struct fuse_fsync_in inarg;

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
	args.opcode = opcode;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	return fuse_simple_request(fm, &args);
}

static int fuse_fsync(struct file *file, loff_t start, loff_t end,
		      int datasync)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;

	if (fuse_is_bad(inode))
		return -EIO;

	inode_lock(inode);

	/*
	 * Start writeback against all dirty pages of the inode, then
	 * wait for all outstanding writes, before sending the FSYNC
	 * request.
	 */
	err = file_write_and_wait_range(file, start, end);
	if (err)
		goto out;

	fuse_sync_writes(inode);

	/*
	 * Due to the implementation of fuse writeback,
	 * file_write_and_wait_range() does not catch errors.
	 * We have to do this directly after fuse_sync_writes()
	 */
	err = file_check_and_advance_wb_err(file);
	if (err)
		goto out;

	err = sync_inode_metadata(inode, 1);
	if (err)
		goto out;

	if (fc->no_fsync)
		goto out;

	err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
	if (err == -ENOSYS) {
		fc->no_fsync = 1;
		err = 0;
	}
out:
	inode_unlock(inode);

	return err;
}

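/*
 * Fill in the common parts of a FUSE_READ (or, via @opcode, FUSE_READDIR)
 * request.  The reply is variable length (out_argvar), capped at @count
 * bytes.
 */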
void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
			 size_t count, int opcode)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_args *args = &ia->ap.args;

	ia->read.in.fh = ff->fh;
	ia->read.in.offset = pos;
	ia->read.in.size = count;
	ia->read.in.flags = file->f_flags;
	args->opcode = opcode;
	args->nodeid = ff->nodeid;
	args->in_numargs = 1;
	args->in_args[0].size = sizeof(ia->read.in);
	args->in_args[0].value = &ia->read.in;
	args->out_argvar = true;
	args->out_numargs = 1;
	args->out_args[0].size = count;
}

static void fuse_release_user_pages(struct fuse_args_pages *ap,
				    bool should_dirty)
{
	unsigned int i;

	for (i = 0; i < ap->num_pages; i++) {
		if (should_dirty)
			set_page_dirty_lock(ap->pages[i]);
		put_page(ap->pages[i]);
	}
}

static void fuse_io_release(struct kref *kref)
{
	kfree(container_of(kref, struct fuse_io_priv, refcnt));
}

static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
{
	if (io->err)
		return io->err;

	if (io->bytes >= 0 && io->write)
		return -EIO;

	return io->bytes < 0 ? io->size : io->bytes;
}

/**
 * In case of short read, the caller sets 'pos' to the position of
 * actual end of fuse request in IO request. Otherwise, if bytes_requested
 * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
 *
 * An example:
 * User requested DIO read of 64K. It was split into two 32K fuse requests,
 * both submitted asynchronously. The first of them was ACKed by userspace as
 * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
 * second request was ACKed as short, e.g. only 1K was read, resulting in
 * pos == 33K.
 *
 * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
 * will be equal to the length of the longest contiguous fragment of
 * transferred data starting from the beginning of IO request.
 */
static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
{
	int left;

	spin_lock(&io->lock);
	if (err)
		io->err = io->err ? : err;
	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
		io->bytes = pos;

	left = --io->reqs;
	if (!left && io->blocking)
		complete(io->done);
	spin_unlock(&io->lock);

	if (!left && !io->blocking) {
		ssize_t res = fuse_get_res_by_io(io);

		if (res >= 0) {
			struct inode *inode = file_inode(io->iocb->ki_filp);
			struct fuse_conn *fc = get_fuse_conn(inode);
			struct fuse_inode *fi = get_fuse_inode(inode);

			spin_lock(&fi->lock);
			fi->attr_version = atomic64_inc_return(&fc->attr_version);
			spin_unlock(&fi->lock);
		}

		io->iocb->ki_complete(io->iocb, res, 0);
	}

	kref_put(&io->refcnt, fuse_io_release);
}

static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
					  unsigned int npages)
{
	struct fuse_io_args *ia;

	ia = kzalloc(sizeof(*ia), GFP_KERNEL);
	if (ia) {
		ia->io = io;
		ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL,
						&ia->ap.descs);
		if (!ia->ap.pages) {
			kfree(ia);
			ia = NULL;
		}
	}
	return ia;
}

static void fuse_io_free(struct fuse_io_args *ia)
{
	kfree(ia->ap.pages);
	kfree(ia);
}

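/*
 * Completion handler for one async request: work out how far the transfer
 * got (for short reads and writes) and fold the result into the parent
 * fuse_io_priv via fuse_aio_complete().
 */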
static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
				  int err)
{
	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
	struct fuse_io_priv *io = ia->io;
	ssize_t pos = -1;

	fuse_release_user_pages(&ia->ap, io->should_dirty);

	if (err) {
		/* Nothing */
	} else if (io->write) {
		if (ia->write.out.size > ia->write.in.size) {
			err = -EIO;
		} else if (ia->write.in.size != ia->write.out.size) {
			pos = ia->write.in.offset - io->offset +
				ia->write.out.size;
		}
	} else {
		u32 outsize = args->out_args[0].size;

		if (ia->read.in.size != outsize)
			pos = ia->read.in.offset - io->offset + outsize;
	}

	fuse_aio_complete(io, err, pos);
	fuse_io_free(ia);
}

static ssize_t fuse_async_req_send(struct fuse_mount *fm,
				   struct fuse_io_args *ia, size_t num_bytes)
{
	ssize_t err;
	struct fuse_io_priv *io = ia->io;

	spin_lock(&io->lock);
	kref_get(&io->refcnt);
	io->size += num_bytes;
	io->reqs++;
	spin_unlock(&io->lock);

	ia->ap.args.end = fuse_aio_complete_req;
	ia->ap.args.may_block = io->should_dirty;
	err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
	if (err)
		fuse_aio_complete_req(fm, &ia->ap.args, err);

	return num_bytes;
}

static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
			      fl_owner_t owner)
{
	struct file *file = ia->io->iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;

	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
	if (owner != NULL) {
		ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
		ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
	}

	if (ia->io->async)
		return fuse_async_req_send(fm, ia, count);

	return fuse_simple_request(fm, &ia->ap.args);
}

static void fuse_read_update_size(struct inode *inode, loff_t size,
				  u64 attr_ver)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);

	spin_lock(&fi->lock);
	if (attr_ver == fi->attr_version && size < inode->i_size &&
	    !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
		fi->attr_version = atomic64_inc_return(&fc->attr_version);
		i_size_write(inode, size);
	}
	spin_unlock(&fi->lock);
}

static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
			    struct fuse_args_pages *ap)
{
	struct fuse_conn *fc = get_fuse_conn(inode);

	/*
	 * If writeback_cache is enabled, a short read means there's a hole in
	 * the file.  Some data after the hole is in page cache, but has not
	 * reached the client fs yet.  So the hole is not present there.
	 */
	if (!fc->writeback_cache) {
		loff_t pos = page_offset(ap->pages[0]) + num_read;
		fuse_read_update_size(inode, pos, attr_ver);
	}
}

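/*
 * Read a single page synchronously.  A short reply means EOF, in which
 * case the cached i_size may be shrunk (see fuse_short_read()).
 */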
static int fuse_do_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	struct fuse_mount *fm = get_fuse_mount(inode);
	loff_t pos = page_offset(page);
	struct fuse_page_desc desc = { .length = PAGE_SIZE };
	struct fuse_io_args ia = {
		.ap.args.page_zeroing = true,
		.ap.args.out_pages = true,
		.ap.num_pages = 1,
		.ap.pages = &page,
		.ap.descs = &desc,
	};
	ssize_t res;
	u64 attr_ver;

	/*
	 * Page writeback can extend beyond the lifetime of the
	 * page-cache page, so make sure we read a properly synced
	 * page.
	 */
	fuse_wait_on_page_writeback(inode, page->index);

	attr_ver = fuse_get_attr_version(fm->fc);

	/* Don't overflow end offset */
	if (pos + (desc.length - 1) == LLONG_MAX)
		desc.length--;

	fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
	res = fuse_simple_request(fm, &ia.ap.args);
	if (res < 0)
		return res;
	/*
	 * Short read means EOF.  If file size is larger, truncate it
	 */
	if (res < desc.length)
		fuse_short_read(inode, attr_ver, res, &ia.ap);

	SetPageUptodate(page);

	return 0;
}

static int fuse_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	int err;

	err = -EIO;
	if (fuse_is_bad(inode))
		goto out;

	err = fuse_do_readpage(file, page);
	fuse_invalidate_atime(inode);
 out:
	unlock_page(page);
	return err;
}

static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
			       int err)
{
	int i;
	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
	struct fuse_args_pages *ap = &ia->ap;
	size_t count = ia->read.in.size;
	size_t num_read = args->out_args[0].size;
	struct address_space *mapping = NULL;

	for (i = 0; mapping == NULL && i < ap->num_pages; i++)
		mapping = ap->pages[i]->mapping;

	if (mapping) {
		struct inode *inode = mapping->host;

		/*
		 * Short read means EOF. If file size is larger, truncate it
		 */
		if (!err && num_read < count)
			fuse_short_read(inode, ia->read.attr_ver, num_read, ap);

		fuse_invalidate_atime(inode);
	}

	for (i = 0; i < ap->num_pages; i++) {
		struct page *page = ap->pages[i];

		if (!err)
			SetPageUptodate(page);
		else
			SetPageError(page);
		unlock_page(page);
		put_page(page);
	}
	if (ia->ff)
		fuse_file_put(ia->ff, false, false);

	fuse_io_free(ia);
}

static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;
	struct fuse_args_pages *ap = &ia->ap;
	loff_t pos = page_offset(ap->pages[0]);
	size_t count = ap->num_pages << PAGE_SHIFT;
	ssize_t res;
	int err;

	ap->args.out_pages = true;
	ap->args.page_zeroing = true;
	ap->args.page_replace = true;

	/* Don't overflow end offset */
	if (pos + (count - 1) == LLONG_MAX) {
		count--;
		ap->descs[ap->num_pages - 1].length--;
	}
	WARN_ON((loff_t) (pos + count) < 0);

	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
	ia->read.attr_ver = fuse_get_attr_version(fm->fc);
	if (fm->fc->async_read) {
		ia->ff = fuse_file_get(ff);
		ap->args.end = fuse_readpages_end;
		err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
		if (!err)
			return;
	} else {
		res = fuse_simple_request(fm, &ap->args);
		err = res < 0 ? res : 0;
	}
	fuse_readpages_end(fm, &ap->args, err);
}

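/*
 * Readahead entry point: pull locked pages off the readahead control in
 * batches of at most min(fc->max_pages, fc->max_read / PAGE_SIZE) pages
 * and send one FUSE_READ request per batch.
 */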
static void fuse_readahead(struct readahead_control *rac)
{
	struct inode *inode = rac->mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	unsigned int i, max_pages, nr_pages = 0;

	if (fuse_is_bad(inode))
		return;

	max_pages = min_t(unsigned int, fc->max_pages,
			fc->max_read / PAGE_SIZE);

	for (;;) {
		struct fuse_io_args *ia;
		struct fuse_args_pages *ap;

		nr_pages = readahead_count(rac) - nr_pages;
		if (nr_pages > max_pages)
			nr_pages = max_pages;
		if (nr_pages == 0)
			break;
		ia = fuse_io_alloc(NULL, nr_pages);
		if (!ia)
			return;
		ap = &ia->ap;
		nr_pages = __readahead_batch(rac, ap->pages, nr_pages);
		for (i = 0; i < nr_pages; i++) {
			fuse_wait_on_page_writeback(inode,
						    readahead_index(rac) + i);
			ap->descs[i].length = PAGE_SIZE;
		}
		ap->num_pages = nr_pages;
		fuse_send_readpages(ia, rac->file);
	}
}

static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);

	/*
	 * In auto invalidate mode, always update attributes on read.
	 * Otherwise, only update if we attempt to read past EOF (to ensure
	 * i_size is up to date).
	 */
	if (fc->auto_inval_data ||
	    (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
		int err;
		err = fuse_update_attributes(inode, iocb->ki_filp);
		if (err)
			return err;
	}

	return generic_file_read_iter(iocb, to);
}

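/*
 * Fill in the common parts of a FUSE_WRITE request.  Servers speaking a
 * protocol minor version older than 9 get the shorter compat
 * fuse_write_in layout.
 */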
static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
				 loff_t pos, size_t count)
{
	struct fuse_args *args = &ia->ap.args;

	ia->write.in.fh = ff->fh;
	ia->write.in.offset = pos;
	ia->write.in.size = count;
	args->opcode = FUSE_WRITE;
	args->nodeid = ff->nodeid;
	args->in_numargs = 2;
	if (ff->fm->fc->minor < 9)
		args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
	else
		args->in_args[0].size = sizeof(ia->write.in);
	args->in_args[0].value = &ia->write.in;
	args->in_args[1].size = count;
	args->out_numargs = 1;
	args->out_args[0].size = sizeof(ia->write.out);
	args->out_args[0].value = &ia->write.out;
}

static unsigned int fuse_write_flags(struct kiocb *iocb)
{
	unsigned int flags = iocb->ki_filp->f_flags;

	if (iocb->ki_flags & IOCB_DSYNC)
		flags |= O_DSYNC;
	if (iocb->ki_flags & IOCB_SYNC)
		flags |= O_SYNC;

	return flags;
}

static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
			       size_t count, fl_owner_t owner)
{
	struct kiocb *iocb = ia->io->iocb;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;
	struct fuse_write_in *inarg = &ia->write.in;
	ssize_t err;

	fuse_write_args_fill(ia, ff, pos, count);
	inarg->flags = fuse_write_flags(iocb);
	if (owner != NULL) {
		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
		inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
	}

	if (ia->io->async)
		return fuse_async_req_send(fm, ia, count);

	err = fuse_simple_request(fm, &ia->ap.args);
	if (!err && ia->write.out.size > count)
		err = -EIO;

	return err ?: ia->write.out.size;
}

bool fuse_write_update_size(struct inode *inode, loff_t pos)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	bool ret = false;

	spin_lock(&fi->lock);
	fi->attr_version = atomic64_inc_return(&fc->attr_version);
	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		ret = true;
	}
	spin_unlock(&fi->lock);

	return ret;
}

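/*
 * Send one synchronous FUSE_WRITE covering the pages collected by
 * fuse_fill_write_pages(), then drop the page references.  On a short
 * write, the partially-written page loses its uptodate state so that it
 * will be re-read later.
 */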
static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
				     struct kiocb *iocb, struct inode *inode,
				     loff_t pos, size_t count)
{
	struct fuse_args_pages *ap = &ia->ap;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;
	unsigned int offset, i;
	bool short_write;
	int err;

	for (i = 0; i < ap->num_pages; i++)
		fuse_wait_on_page_writeback(inode, ap->pages[i]->index);

	fuse_write_args_fill(ia, ff, pos, count);
	ia->write.in.flags = fuse_write_flags(iocb);
	if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID))
		ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;

	err = fuse_simple_request(fm, &ap->args);
	if (!err && ia->write.out.size > count)
		err = -EIO;

	short_write = ia->write.out.size < count;
	offset = ap->descs[0].offset;
	count = ia->write.out.size;
	for (i = 0; i < ap->num_pages; i++) {
		struct page *page = ap->pages[i];

		if (err) {
			ClearPageUptodate(page);
		} else {
			if (count >= PAGE_SIZE - offset)
				count -= PAGE_SIZE - offset;
			else {
				if (short_write)
					ClearPageUptodate(page);
				count = 0;
			}
			offset = 0;
		}
		if (ia->write.page_locked && (i == ap->num_pages - 1))
			unlock_page(page);
		put_page(page);
	}

	return err;
}

static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
				     struct address_space *mapping,
				     struct iov_iter *ii, loff_t pos,
				     unsigned int max_pages)
{
	struct fuse_args_pages *ap = &ia->ap;
	struct fuse_conn *fc = get_fuse_conn(mapping->host);
	unsigned offset = pos & (PAGE_SIZE - 1);
	size_t count = 0;
	int err;

	ap->args.in_pages = true;
	ap->descs[0].offset = offset;

	do {
		size_t tmp;
		struct page *page;
		pgoff_t index = pos >> PAGE_SHIFT;
		size_t bytes = min_t(size_t, PAGE_SIZE - offset,
				     iov_iter_count(ii));

		bytes = min_t(size_t, bytes, fc->max_write - count);

 again:
		err = -EFAULT;
		if (iov_iter_fault_in_readable(ii, bytes))
			break;

		err = -ENOMEM;
		page = grab_cache_page_write_begin(mapping, index, 0);
		if (!page)
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		tmp = copy_page_from_iter_atomic(page, offset, bytes, ii);
		flush_dcache_page(page);

		if (!tmp) {
			unlock_page(page);
			put_page(page);
			goto again;
		}

		err = 0;
		ap->pages[ap->num_pages] = page;
		ap->descs[ap->num_pages].length = tmp;
		ap->num_pages++;

		count += tmp;
		pos += tmp;
		offset += tmp;
		if (offset == PAGE_SIZE)
			offset = 0;

		/* If we copied full page, mark it uptodate */
		if (tmp == PAGE_SIZE)
			SetPageUptodate(page);

		if (PageUptodate(page)) {
			unlock_page(page);
		} else {
			ia->write.page_locked = true;
			break;
		}
		if (!fc->big_writes)
			break;
	} while (iov_iter_count(ii) && count < fc->max_write &&
		 ap->num_pages < max_pages && offset == 0);

	return count > 0 ? count : err;
}

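/*
 * Number of page-cache pages spanned by [pos, pos + len), capped at
 * max_pages.  E.g. with 4K pages, pos == 4095 and len == 2 spans two
 * pages even though len < PAGE_SIZE.
 */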
static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
				     unsigned int max_pages)
{
	return min_t(unsigned int,
		     ((pos + len - 1) >> PAGE_SHIFT) -
		     (pos >> PAGE_SHIFT) + 1,
		     max_pages);
}

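/*
 * Write-through path: copy user data into freshly grabbed page-cache
 * pages and push them to the server one request at a time, updating
 * i_size on success.
 */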
static ssize_t fuse_perform_write(struct kiocb *iocb,
				  struct address_space *mapping,
				  struct iov_iter *ii, loff_t pos)
{
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	int err = 0;
	ssize_t res = 0;

	if (inode->i_size < pos + iov_iter_count(ii))
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	do {
		ssize_t count;
		struct fuse_io_args ia = {};
		struct fuse_args_pages *ap = &ia.ap;
		unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
						      fc->max_pages);

		ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
		if (!ap->pages) {
			err = -ENOMEM;
			break;
		}

		count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
		if (count <= 0) {
			err = count;
		} else {
			err = fuse_send_write_pages(&ia, iocb, inode,
						    pos, count);
			if (!err) {
				size_t num_written = ia.write.out.size;

				res += num_written;
				pos += num_written;

				/* break out of the loop on short write */
				if (num_written != count)
					err = -EIO;
			}
		}
		kfree(ap->pages);
	} while (!err && iov_iter_count(ii));

	if (res > 0)
		fuse_write_update_size(inode, pos);

	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
	fuse_invalidate_attr(inode);

	return res > 0 ? res : err;
}

static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	ssize_t written = 0;
	ssize_t written_buffered = 0;
	struct inode *inode = mapping->host;
	ssize_t err;
	struct fuse_conn *fc = get_fuse_conn(inode);
	loff_t endbyte = 0;

	if (fc->writeback_cache) {
		/* Update size (EOF optimization) and mode (SUID clearing) */
		err = fuse_update_attributes(mapping->host, file);
		if (err)
			return err;

		if (fc->handle_killpriv_v2 &&
		    should_remove_suid(file_dentry(file))) {
			goto writethrough;
		}

		return generic_file_write_iter(iocb, from);
	}

writethrough:
	inode_lock(inode);

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);

	err = generic_write_checks(iocb, from);
	if (err <= 0)
		goto out;

	err = file_remove_privs(file);
	if (err)
		goto out;

	err = file_update_time(file);
	if (err)
		goto out;

	if (iocb->ki_flags & IOCB_DIRECT) {
		loff_t pos = iocb->ki_pos;
		written = generic_file_direct_write(iocb, from);
		if (written < 0 || !iov_iter_count(from))
			goto out;

		pos += written;

		written_buffered = fuse_perform_write(iocb, mapping, from, pos);
		if (written_buffered < 0) {
			err = written_buffered;
			goto out;
		}
		endbyte = pos + written_buffered - 1;

		err = filemap_write_and_wait_range(file->f_mapping, pos,
						   endbyte);
		if (err)
			goto out;

		invalidate_mapping_pages(file->f_mapping,
					 pos >> PAGE_SHIFT,
					 endbyte >> PAGE_SHIFT);

		written += written_buffered;
		iocb->ki_pos = pos + written_buffered;
	} else {
		written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos);
		if (written >= 0)
			iocb->ki_pos += written;
	}
out:
	current->backing_dev_info = NULL;
	inode_unlock(inode);
	if (written > 0)
		written = generic_write_sync(iocb, written);

	return written ? written : err;
}

static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
{
	return (unsigned long)ii->iov->iov_base + ii->iov_offset;
}

static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
					size_t max_size)
{
	return min(iov_iter_single_seg_count(ii), max_size);
}

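/*
 * Pin the user pages backing @ii and hang them off the request.  KVEC
 * iterators (kernel I/O) bypass pinning: the buffer address is passed
 * directly.
 */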
static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
			       size_t *nbytesp, int write,
			       unsigned int max_pages)
{
	size_t nbytes = 0;  /* # bytes already packed in req */
	ssize_t ret = 0;

	/* Special case for kernel I/O: can copy directly into the buffer */
	if (iov_iter_is_kvec(ii)) {
		unsigned long user_addr = fuse_get_user_addr(ii);
		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);

		if (write)
			ap->args.in_args[1].value = (void *) user_addr;
		else
			ap->args.out_args[0].value = (void *) user_addr;

		iov_iter_advance(ii, frag_size);
		*nbytesp = frag_size;
		return 0;
	}

	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
		unsigned npages;
		size_t start;
		ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
					*nbytesp - nbytes,
					max_pages - ap->num_pages,
					&start);
		if (ret < 0)
			break;

		iov_iter_advance(ii, ret);
		nbytes += ret;

		ret += start;
		npages = DIV_ROUND_UP(ret, PAGE_SIZE);

		ap->descs[ap->num_pages].offset = start;
		fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);

		ap->num_pages += npages;
		ap->descs[ap->num_pages - 1].length -=
			(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
	}

	if (write)
		ap->args.in_pages = true;
	else
		ap->args.out_pages = true;

	*nbytesp = nbytes;

	return ret < 0 ? ret : 0;
}

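/*
 * Core of the FOPEN_DIRECT_IO read/write path: split the iterator into
 * chunks of at most max_write/max_read bytes and send one request per
 * chunk, synchronously or in the background depending on io->async.
 */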
ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
		       loff_t *ppos, int flags)
{
	int write = flags & FUSE_DIO_WRITE;
	int cuse = flags & FUSE_DIO_CUSE;
	struct file *file = io->iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fm->fc;
	size_t nmax = write ? fc->max_write : fc->max_read;
	loff_t pos = *ppos;
	size_t count = iov_iter_count(iter);
	pgoff_t idx_from = pos >> PAGE_SHIFT;
	pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
	ssize_t res = 0;
	int err = 0;
	struct fuse_io_args *ia;
	unsigned int max_pages;

	max_pages = iov_iter_npages(iter, fc->max_pages);
	ia = fuse_io_alloc(io, max_pages);
	if (!ia)
		return -ENOMEM;

	ia->io = io;
	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
		if (!write)
			inode_lock(inode);
		fuse_sync_writes(inode);
		if (!write)
			inode_unlock(inode);
	}

	io->should_dirty = !write && iter_is_iovec(iter);
	while (count) {
		ssize_t nres;
		fl_owner_t owner = current->files;
		size_t nbytes = min(count, nmax);

		err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
					  max_pages);
		if (err && !nbytes)
			break;

		if (write) {
			if (!capable(CAP_FSETID))
				ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;

			nres = fuse_send_write(ia, pos, nbytes, owner);
		} else {
			nres = fuse_send_read(ia, pos, nbytes, owner);
		}

		if (!io->async || nres < 0) {
			fuse_release_user_pages(&ia->ap, io->should_dirty);
			fuse_io_free(ia);
		}
		ia = NULL;
		if (nres < 0) {
			iov_iter_revert(iter, nbytes);
			err = nres;
			break;
		}
		WARN_ON(nres > nbytes);

		count -= nres;
		res += nres;
		pos += nres;
		if (nres != nbytes) {
			iov_iter_revert(iter, nbytes - nres);
			break;
		}
		if (count) {
			max_pages = iov_iter_npages(iter, fc->max_pages);
			ia = fuse_io_alloc(io, max_pages);
			if (!ia)
				break;
		}
	}
	if (ia)
		fuse_io_free(ia);
	if (res > 0)
		*ppos = pos;

	return res > 0 ? res : err;
}
EXPORT_SYMBOL_GPL(fuse_direct_io);

static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
				  struct iov_iter *iter,
				  loff_t *ppos)
{
	ssize_t res;
	struct inode *inode = file_inode(io->iocb->ki_filp);

	res = fuse_direct_io(io, iter, ppos, 0);

	fuse_invalidate_atime(inode);

	return res;
}

static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);

static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t res;

	if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
		res = fuse_direct_IO(iocb, to);
	} else {
		struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);

		res = __fuse_direct_read(&io, to, &iocb->ki_pos);
	}

	return res;
}

static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
	ssize_t res;

	/* Don't allow parallel writes to the same file */
	inode_lock(inode);
	res = generic_write_checks(iocb, from);
	if (res > 0) {
		if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
			res = fuse_direct_IO(iocb, from);
		} else {
			res = fuse_direct_io(&io, from, &iocb->ki_pos,
					     FUSE_DIO_WRITE);
		}
	}
	fuse_invalidate_attr(inode);
	if (res > 0)
		fuse_write_update_size(inode, iocb->ki_pos);
	inode_unlock(inode);

	return res;
}

static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct inode *inode = file_inode(file);

	if (fuse_is_bad(inode))
		return -EIO;

	if (FUSE_IS_DAX(inode))
		return fuse_dax_read_iter(iocb, to);

	if (!(ff->open_flags & FOPEN_DIRECT_IO))
		return fuse_cache_read_iter(iocb, to);
	else
		return fuse_direct_read_iter(iocb, to);
}

static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct inode *inode = file_inode(file);

	if (fuse_is_bad(inode))
		return -EIO;

	if (FUSE_IS_DAX(inode))
		return fuse_dax_write_iter(iocb, from);

	if (!(ff->open_flags & FOPEN_DIRECT_IO))
		return fuse_cache_write_iter(iocb, from);
	else
		return fuse_direct_write_iter(iocb, from);
}

static void fuse_writepage_free(struct fuse_writepage_args *wpa)
{
	struct fuse_args_pages *ap = &wpa->ia.ap;
	int i;

	if (wpa->bucket)
		fuse_sync_bucket_dec(wpa->bucket);

	for (i = 0; i < ap->num_pages; i++)
		__free_page(ap->pages[i]);

	if (wpa->ia.ff)
		fuse_file_put(wpa->ia.ff, false, false);

	kfree(ap->pages);
	kfree(wpa);
}

static void fuse_writepage_finish(struct fuse_mount *fm,
				  struct fuse_writepage_args *wpa)
{
	struct fuse_args_pages *ap = &wpa->ia.ap;
	struct inode *inode = wpa->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	int i;

	for (i = 0; i < ap->num_pages; i++) {
		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
		dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
		wb_writeout_inc(&bdi->wb);
	}
	wake_up(&fi->page_waitq);
}

1645/* Called under fi->lock, may release and reacquire it */
1646static void fuse_send_writepage(struct fuse_mount *fm,
1647                                struct fuse_writepage_args *wpa, loff_t size)
1648__releases(fi->lock)
1649__acquires(fi->lock)
1650{
1651        struct fuse_writepage_args *aux, *next;
1652        struct fuse_inode *fi = get_fuse_inode(wpa->inode);
1653        struct fuse_write_in *inarg = &wpa->ia.write.in;
1654        struct fuse_args *args = &wpa->ia.ap.args;
1655        __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
1656        int err;
1657
1658        fi->writectr++;
1659        if (inarg->offset + data_size <= size) {
1660                inarg->size = data_size;
1661        } else if (inarg->offset < size) {
1662                inarg->size = size - inarg->offset;
1663        } else {
1664                /* Got truncated off completely */
1665                goto out_free;
1666        }
1667
1668        args->in_args[1].size = inarg->size;
1669        args->force = true;
1670        args->nocreds = true;
1671
1672        err = fuse_simple_background(fm, args, GFP_ATOMIC);
1673        if (err == -ENOMEM) {
1674                spin_unlock(&fi->lock);
1675                err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
1676                spin_lock(&fi->lock);
1677        }
1678
1679        /* Fails on broken connection only */
1680        if (unlikely(err))
1681                goto out_free;
1682
1683        return;
1684
1685 out_free:
1686        fi->writectr--;
1687        rb_erase(&wpa->writepages_entry, &fi->writepages);
1688        fuse_writepage_finish(fm, wpa);
1689        spin_unlock(&fi->lock);
1690
1691        /* After fuse_writepage_finish() aux request list is private */
1692        for (aux = wpa->next; aux; aux = next) {
1693                next = aux->next;
1694                aux->next = NULL;
1695                fuse_writepage_free(aux);
1696        }
1697
1698        fuse_writepage_free(wpa);
1699        spin_lock(&fi->lock);
1700}
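
/*
 * The crop above reduces to simple arithmetic on the write offset, the
 * number of bytes backed by temporary pages, and the current i_size.
 * A minimal userspace sketch (illustrative only; crop_write_size() is
 * a hypothetical helper, not part of this file):
 *
 *	#include <stdint.h>
 *
 *	static uint64_t crop_write_size(uint64_t offset, uint64_t data_size,
 *					uint64_t isize)
 *	{
 *		if (offset + data_size <= isize)
 *			return data_size;	// entirely below i_size
 *		if (offset < isize)
 *			return isize - offset;	// crop the tail
 *		return 0;			// truncated off completely
 *	}
 *
 * e.g. a two-page (8192 byte) request at offset 4096 against i_size
 * 6000 is cropped to 1904 bytes; against i_size 4096 it is dropped.
 */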
1701
1702/*
1703 * If fi->writectr is positive (no truncate or fsync going on), send
1704 * all queued writepage requests.
1705 *
1706 * Called with fi->lock held
1707 */
1708void fuse_flush_writepages(struct inode *inode)
1709__releases(fi->lock)
1710__acquires(fi->lock)
1711{
1712        struct fuse_mount *fm = get_fuse_mount(inode);
1713        struct fuse_inode *fi = get_fuse_inode(inode);
1714        loff_t crop = i_size_read(inode);
1715        struct fuse_writepage_args *wpa;
1716
1717        while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
1718                wpa = list_entry(fi->queued_writes.next,
1719                                 struct fuse_writepage_args, queue_entry);
1720                list_del_init(&wpa->queue_entry);
1721                fuse_send_writepage(fm, wpa, crop);
1722        }
1723}
1724
1725static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
1726                                                struct fuse_writepage_args *wpa)
1727{
1728        pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
1729        pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1;
1730        struct rb_node **p = &root->rb_node;
1731        struct rb_node  *parent = NULL;
1732
1733        WARN_ON(!wpa->ia.ap.num_pages);
1734        while (*p) {
1735                struct fuse_writepage_args *curr;
1736                pgoff_t curr_index;
1737
1738                parent = *p;
1739                curr = rb_entry(parent, struct fuse_writepage_args,
1740                                writepages_entry);
1741                WARN_ON(curr->inode != wpa->inode);
1742                curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;
1743
1744                if (idx_from >= curr_index + curr->ia.ap.num_pages)
1745                        p = &(*p)->rb_right;
1746                else if (idx_to < curr_index)
1747                        p = &(*p)->rb_left;
1748                else
1749                        return curr;
1750        }
1751
1752        rb_link_node(&wpa->writepages_entry, parent, p);
1753        rb_insert_color(&wpa->writepages_entry, root);
1754        return NULL;
1755}
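
/*
 * The descent above is a plain interval search: two page ranges
 * overlap unless one ends strictly before the other begins.  A minimal
 * sketch of the predicate (illustrative only, not part of this file):
 *
 *	#include <stdbool.h>
 *	#include <stdint.h>
 *
 *	struct pgrange { uint64_t from, to; };	// inclusive page indexes
 *
 *	static bool ranges_overlap(struct pgrange a, struct pgrange b)
 *	{
 *		return a.from <= b.to && b.from <= a.to;
 *	}
 *
 * fuse_insert_writeback() goes right when the new range lies wholly
 * after the current node, left when wholly before, and returns the
 * overlapping node so the caller can treat the request as auxiliary.
 */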
1756
1757static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
1758{
1759        WARN_ON(fuse_insert_writeback(root, wpa));
1760}
1761
1762static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
1763                               int error)
1764{
1765        struct fuse_writepage_args *wpa =
1766                container_of(args, typeof(*wpa), ia.ap.args);
1767        struct inode *inode = wpa->inode;
1768        struct fuse_inode *fi = get_fuse_inode(inode);
1769        struct fuse_conn *fc = get_fuse_conn(inode);
1770
1771        mapping_set_error(inode->i_mapping, error);
1772        /*
1773         * A writeback finished and this might have updated mtime/ctime on
1774         * the server, making local mtime/ctime stale.  Hence invalidate attrs.
1775         * Do this only if writeback_cache is not enabled.  If writeback_cache
1776         * is enabled, we trust local ctime/mtime.
1777         */
1778        if (!fc->writeback_cache)
1779                fuse_invalidate_attr(inode);
1780        spin_lock(&fi->lock);
1781        rb_erase(&wpa->writepages_entry, &fi->writepages);
1782        while (wpa->next) {
1783                struct fuse_mount *fm = get_fuse_mount(inode);
1784                struct fuse_write_in *inarg = &wpa->ia.write.in;
1785                struct fuse_writepage_args *next = wpa->next;
1786
1787                wpa->next = next->next;
1788                next->next = NULL;
1789                next->ia.ff = fuse_file_get(wpa->ia.ff);
1790                tree_insert(&fi->writepages, next);
1791
1792                /*
1793                 * Skip fuse_flush_writepages() to make it easy to crop requests
1794                 * based on primary request size.
1795                 *
1796                 * 1st case (trivial): there are no concurrent activities using
1797                 * fuse_set/release_nowrite.  Then we're on the safe side because
1798                 * fuse_flush_writepages() would call fuse_send_writepage()
1799                 * anyway.
1800                 *
1801                 * 2nd case: someone called fuse_set_nowrite and it is waiting
1802                 * now for completion of all in-flight requests.  This happens
1803                 * rarely and no more than once per page, so this should be
1804                 * okay.
1805                 *
1806                 * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
1807                 * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
1808                 * that fuse_set_nowrite returned implies that all in-flight
1809                 * requests were completed along with all of their secondary
1810                 * requests.  Further primary requests are blocked by negative
1811                 * writectr.  Hence there cannot be any in-flight requests and
1812                 * no invocations of fuse_writepage_end() while we're in
1813                 * fuse_set_nowrite..fuse_release_nowrite section.
1814                 */
1815                fuse_send_writepage(fm, next, inarg->offset + inarg->size);
1816        }
1817        fi->writectr--;
1818        fuse_writepage_finish(fm, wpa);
1819        spin_unlock(&fi->lock);
1820        fuse_writepage_free(wpa);
1821}
1822
1823static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
1824{
1825        struct fuse_file *ff = NULL;
1826
1827        spin_lock(&fi->lock);
1828        if (!list_empty(&fi->write_files)) {
1829                ff = list_entry(fi->write_files.next, struct fuse_file,
1830                                write_entry);
1831                fuse_file_get(ff);
1832        }
1833        spin_unlock(&fi->lock);
1834
1835        return ff;
1836}
1837
1838static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi)
1839{
1840        struct fuse_file *ff = __fuse_write_file_get(fi);
1841        WARN_ON(!ff);
1842        return ff;
1843}
1844
1845int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
1846{
1847        struct fuse_inode *fi = get_fuse_inode(inode);
1848        struct fuse_file *ff;
1849        int err;
1850
1851        ff = __fuse_write_file_get(fi);
1852        err = fuse_flush_times(inode, ff);
1853        if (ff)
1854                fuse_file_put(ff, false, false);
1855
1856        return err;
1857}
1858
1859static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
1860{
1861        struct fuse_writepage_args *wpa;
1862        struct fuse_args_pages *ap;
1863
1864        wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
1865        if (wpa) {
1866                ap = &wpa->ia.ap;
1867                ap->num_pages = 0;
1868                ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs);
1869                if (!ap->pages) {
1870                        kfree(wpa);
1871                        wpa = NULL;
1872                }
1873        }
1874        return wpa;
1875
1876}
1877
1878static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
1879                                         struct fuse_writepage_args *wpa)
1880{
1881        if (!fc->sync_fs)
1882                return;
1883
1884        rcu_read_lock();
1885        /* Prevent resurrection of dead bucket in unlikely race with syncfs */
1886        do {
1887                wpa->bucket = rcu_dereference(fc->curr_bucket);
1888        } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
1889        rcu_read_unlock();
1890}
1891
1892static int fuse_writepage_locked(struct page *page)
1893{
1894        struct address_space *mapping = page->mapping;
1895        struct inode *inode = mapping->host;
1896        struct fuse_conn *fc = get_fuse_conn(inode);
1897        struct fuse_inode *fi = get_fuse_inode(inode);
1898        struct fuse_writepage_args *wpa;
1899        struct fuse_args_pages *ap;
1900        struct page *tmp_page;
1901        int error = -ENOMEM;
1902
1903        set_page_writeback(page);
1904
1905        wpa = fuse_writepage_args_alloc();
1906        if (!wpa)
1907                goto err;
1908        ap = &wpa->ia.ap;
1909
1910        tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1911        if (!tmp_page)
1912                goto err_free;
1913
1914        error = -EIO;
1915        wpa->ia.ff = fuse_write_file_get(fi);
1916        if (!wpa->ia.ff)
1917                goto err_nofile;
1918
1919        fuse_writepage_add_to_bucket(fc, wpa);
1920        fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
1921
1922        copy_highpage(tmp_page, page);
1923        wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
1924        wpa->next = NULL;
1925        ap->args.in_pages = true;
1926        ap->num_pages = 1;
1927        ap->pages[0] = tmp_page;
1928        ap->descs[0].offset = 0;
1929        ap->descs[0].length = PAGE_SIZE;
1930        ap->args.end = fuse_writepage_end;
1931        wpa->inode = inode;
1932
1933        inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
1934        inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
1935
1936        spin_lock(&fi->lock);
1937        tree_insert(&fi->writepages, wpa);
1938        list_add_tail(&wpa->queue_entry, &fi->queued_writes);
1939        fuse_flush_writepages(inode);
1940        spin_unlock(&fi->lock);
1941
1942        end_page_writeback(page);
1943
1944        return 0;
1945
1946err_nofile:
1947        __free_page(tmp_page);
1948err_free:
1949        kfree(wpa);
1950err:
1951        mapping_set_error(page->mapping, error);
1952        end_page_writeback(page);
1953        return error;
1954}
1955
1956static int fuse_writepage(struct page *page, struct writeback_control *wbc)
1957{
1958        int err;
1959
1960        if (fuse_page_is_writeback(page->mapping->host, page->index)) {
1961                /*
1962                 * ->writepages() should be called for sync() and friends.  We
1963                 * should only get here on direct reclaim and then we are
1964                 * allowed to skip a page which is already in flight
1965                 */
1966                WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
1967
1968                redirty_page_for_writepage(wbc, page);
1969                unlock_page(page);
1970                return 0;
1971        }
1972
1973        err = fuse_writepage_locked(page);
1974        unlock_page(page);
1975
1976        return err;
1977}
1978
1979struct fuse_fill_wb_data {
1980        struct fuse_writepage_args *wpa;
1981        struct fuse_file *ff;
1982        struct inode *inode;
1983        struct page **orig_pages;
1984        unsigned int max_pages;
1985};
1986
1987static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
1988{
1989        struct fuse_args_pages *ap = &data->wpa->ia.ap;
1990        struct fuse_conn *fc = get_fuse_conn(data->inode);
1991        struct page **pages;
1992        struct fuse_page_desc *descs;
1993        unsigned int npages = min_t(unsigned int,
1994                                    max_t(unsigned int, data->max_pages * 2,
1995                                          FUSE_DEFAULT_MAX_PAGES_PER_REQ),
1996                                    fc->max_pages);
1997        WARN_ON(npages <= data->max_pages);
1998
1999        pages = fuse_pages_alloc(npages, GFP_NOFS, &descs);
2000        if (!pages)
2001                return false;
2002
2003        memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages);
2004        memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages);
2005        kfree(ap->pages);
2006        ap->pages = pages;
2007        ap->descs = descs;
2008        data->max_pages = npages;
2009
2010        return true;
2011}
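
/*
 * Growth is geometric with a floor and a ceiling: at least double,
 * never less than the default request size, never more than the
 * connection limit.  Sketch of the size computation (illustrative
 * only; 32 stands for FUSE_DEFAULT_MAX_PAGES_PER_REQ):
 *
 *	static unsigned int next_npages(unsigned int cur, unsigned int limit)
 *	{
 *		unsigned int want = cur * 2 > 32 ? cur * 2 : 32;
 *
 *		return want < limit ? want : limit;
 *	}
 *
 * e.g. with limit = 256 the array grows 1 -> 32 -> 64 -> 128 -> 256.
 */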
2012
2013static void fuse_writepages_send(struct fuse_fill_wb_data *data)
2014{
2015        struct fuse_writepage_args *wpa = data->wpa;
2016        struct inode *inode = data->inode;
2017        struct fuse_inode *fi = get_fuse_inode(inode);
2018        int num_pages = wpa->ia.ap.num_pages;
2019        int i;
2020
2021        wpa->ia.ff = fuse_file_get(data->ff);
2022        spin_lock(&fi->lock);
2023        list_add_tail(&wpa->queue_entry, &fi->queued_writes);
2024        fuse_flush_writepages(inode);
2025        spin_unlock(&fi->lock);
2026
2027        for (i = 0; i < num_pages; i++)
2028                end_page_writeback(data->orig_pages[i]);
2029}
2030
2031/*
2032 * Check under fi->lock if the page is under writeback, and insert it onto the
2033 * rb_tree if not.  Otherwise iterate auxiliary write requests to see if there's
2034 * one already added for a page at this offset.  If there's none, then insert
2035 * this new request onto the auxiliary list, otherwise reuse the existing one by
2036 * swapping the new temp page with the old one.
2037 */
2038static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
2039                               struct page *page)
2040{
2041        struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
2042        struct fuse_writepage_args *tmp;
2043        struct fuse_writepage_args *old_wpa;
2044        struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
2045
2046        WARN_ON(new_ap->num_pages != 0);
2047        new_ap->num_pages = 1;
2048
2049        spin_lock(&fi->lock);
2050        old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
2051        if (!old_wpa) {
2052                spin_unlock(&fi->lock);
2053                return true;
2054        }
2055
2056        for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
2057                pgoff_t curr_index;
2058
2059                WARN_ON(tmp->inode != new_wpa->inode);
2060                curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
2061                if (curr_index == page->index) {
2062                        WARN_ON(tmp->ia.ap.num_pages != 1);
2063                        swap(tmp->ia.ap.pages[0], new_ap->pages[0]);
2064                        break;
2065                }
2066        }
2067
2068        if (!tmp) {
2069                new_wpa->next = old_wpa->next;
2070                old_wpa->next = new_wpa;
2071        }
2072
2073        spin_unlock(&fi->lock);
2074
2075        if (tmp) {
2076                struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode);
2077
2078                dec_wb_stat(&bdi->wb, WB_WRITEBACK);
2079                dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP);
2080                wb_writeout_inc(&bdi->wb);
2081                fuse_writepage_free(new_wpa);
2082        }
2083
2084        return false;
2085}
2086
2087static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
2088                                     struct fuse_args_pages *ap,
2089                                     struct fuse_fill_wb_data *data)
2090{
2091        WARN_ON(!ap->num_pages);
2092
2093        /*
2094         * Being under writeback is unlikely but possible.  For example, a
2095         * direct read into an mmapped fuse file will set the page dirty twice:
2096         * once when the pages are faulted with get_user_pages(), and again
2097         * after the read has completed.
2098         */
2099        if (fuse_page_is_writeback(data->inode, page->index))
2100                return true;
2101
2102        /* Reached max pages */
2103        if (ap->num_pages == fc->max_pages)
2104                return true;
2105
2106        /* Reached max write bytes */
2107        if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write)
2108                return true;
2109
2110        /* Discontinuity */
2111        if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)
2112                return true;
2113
2114        /* Need to grow the pages array?  If so, did the expansion fail? */
2115        if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
2116                return true;
2117
2118        return false;
2119}
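
/*
 * Stripped of the writeback and realloc checks, the batching limits
 * above reduce to a pure predicate over the request state.  Sketch
 * (illustrative only; page_size, max_pages and max_write stand for
 * PAGE_SIZE, fc->max_pages and fc->max_write):
 *
 *	static bool batch_full(unsigned int num_pages, uint64_t last_index,
 *			       uint64_t index, unsigned int max_pages,
 *			       uint32_t max_write, uint32_t page_size)
 *	{
 *		return num_pages == max_pages ||
 *		       (num_pages + 1) * (uint64_t)page_size > max_write ||
 *		       last_index + 1 != index;	// discontiguous page
 *	}
 *
 * Any true condition flushes the current request and starts a new one.
 */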
2120
2121static int fuse_writepages_fill(struct page *page,
2122                struct writeback_control *wbc, void *_data)
2123{
2124        struct fuse_fill_wb_data *data = _data;
2125        struct fuse_writepage_args *wpa = data->wpa;
2126        struct fuse_args_pages *ap = &wpa->ia.ap;
2127        struct inode *inode = data->inode;
2128        struct fuse_inode *fi = get_fuse_inode(inode);
2129        struct fuse_conn *fc = get_fuse_conn(inode);
2130        struct page *tmp_page;
2131        int err;
2132
2133        if (!data->ff) {
2134                err = -EIO;
2135                data->ff = fuse_write_file_get(fi);
2136                if (!data->ff)
2137                        goto out_unlock;
2138        }
2139
2140        if (wpa && fuse_writepage_need_send(fc, page, ap, data)) {
2141                fuse_writepages_send(data);
2142                data->wpa = NULL;
2143        }
2144
2145        err = -ENOMEM;
2146        tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2147        if (!tmp_page)
2148                goto out_unlock;
2149
2150        /*
2151         * The page must not be redirtied until the writeout is completed
2152         * (i.e. userspace has sent a reply to the write request).  Otherwise
2153         * there could be more than one temporary page instance for each real
2154         * page.
2155         *
2156         * This is ensured by holding the page lock in page_mkwrite() while
2157         * checking fuse_page_is_writeback().  We already hold the page lock
2158         * since clear_page_dirty_for_io() and keep it held until we add the
2159         * request to the fi->writepages list and increment ap->num_pages.
2160         * After this fuse_page_is_writeback() will indicate that the page is
2161         * under writeback, so we can release the page lock.
2162         */
2163        if (data->wpa == NULL) {
2164                err = -ENOMEM;
2165                wpa = fuse_writepage_args_alloc();
2166                if (!wpa) {
2167                        __free_page(tmp_page);
2168                        goto out_unlock;
2169                }
2170                fuse_writepage_add_to_bucket(fc, wpa);
2171
2172                data->max_pages = 1;
2173
2174                ap = &wpa->ia.ap;
2175                fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0);
2176                wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
2177                wpa->next = NULL;
2178                ap->args.in_pages = true;
2179                ap->args.end = fuse_writepage_end;
2180                ap->num_pages = 0;
2181                wpa->inode = inode;
2182        }
2183        set_page_writeback(page);
2184
2185        copy_highpage(tmp_page, page);
2186        ap->pages[ap->num_pages] = tmp_page;
2187        ap->descs[ap->num_pages].offset = 0;
2188        ap->descs[ap->num_pages].length = PAGE_SIZE;
2189        data->orig_pages[ap->num_pages] = page;
2190
2191        inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
2192        inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
2193
2194        err = 0;
2195        if (data->wpa) {
2196                /*
2197                 * Protected by fi->lock against concurrent access by
2198                 * fuse_page_is_writeback().
2199                 */
2200                spin_lock(&fi->lock);
2201                ap->num_pages++;
2202                spin_unlock(&fi->lock);
2203        } else if (fuse_writepage_add(wpa, page)) {
2204                data->wpa = wpa;
2205        } else {
2206                end_page_writeback(page);
2207        }
2208out_unlock:
2209        unlock_page(page);
2210
2211        return err;
2212}
2213
2214static int fuse_writepages(struct address_space *mapping,
2215                           struct writeback_control *wbc)
2216{
2217        struct inode *inode = mapping->host;
2218        struct fuse_conn *fc = get_fuse_conn(inode);
2219        struct fuse_fill_wb_data data;
2220        int err;
2221
2222        err = -EIO;
2223        if (fuse_is_bad(inode))
2224                goto out;
2225
2226        data.inode = inode;
2227        data.wpa = NULL;
2228        data.ff = NULL;
2229
2230        err = -ENOMEM;
2231        data.orig_pages = kcalloc(fc->max_pages,
2232                                  sizeof(struct page *),
2233                                  GFP_NOFS);
2234        if (!data.orig_pages)
2235                goto out;
2236
2237        err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
2238        if (data.wpa) {
2239                WARN_ON(!data.wpa->ia.ap.num_pages);
2240                fuse_writepages_send(&data);
2241        }
2242        if (data.ff)
2243                fuse_file_put(data.ff, false, false);
2244
2245        kfree(data.orig_pages);
2246out:
2247        return err;
2248}
2249
2250/*
2251 * It would be worth ensuring that space is reserved on disk for the write,
2252 * but how to implement that without killing performance needs more thought.
2253 */
2254static int fuse_write_begin(struct file *file, struct address_space *mapping,
2255                loff_t pos, unsigned len, unsigned flags,
2256                struct page **pagep, void **fsdata)
2257{
2258        pgoff_t index = pos >> PAGE_SHIFT;
2259        struct fuse_conn *fc = get_fuse_conn(file_inode(file));
2260        struct page *page;
2261        loff_t fsize;
2262        int err = -ENOMEM;
2263
2264        WARN_ON(!fc->writeback_cache);
2265
2266        page = grab_cache_page_write_begin(mapping, index, flags);
2267        if (!page)
2268                goto error;
2269
2270        fuse_wait_on_page_writeback(mapping->host, page->index);
2271
2272        if (PageUptodate(page) || len == PAGE_SIZE)
2273                goto success;
2274        /*
2275         * Check if the start of this page comes after the end of the file,
2276         * in which case the readpage can be optimized away.
2277         */
2278        fsize = i_size_read(mapping->host);
2279        if (fsize <= (pos & PAGE_MASK)) {
2280                size_t off = pos & ~PAGE_MASK;
2281                if (off)
2282                        zero_user_segment(page, 0, off);
2283                goto success;
2284        }
2285        err = fuse_do_readpage(file, page);
2286        if (err)
2287                goto cleanup;
2288success:
2289        *pagep = page;
2290        return 0;
2291
2292cleanup:
2293        unlock_page(page);
2294        put_page(page);
2295error:
2296        return err;
2297}
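
/*
 * Whether the read can be skipped depends only on pos, len and the
 * current file size.  A minimal sketch of the decision (illustrative
 * only, not part of this file):
 *
 *	#include <stdbool.h>
 *
 *	// true if the page must be read before a partial write
 *	static bool need_read(bool uptodate, unsigned int len,
 *			      long long pos, long long fsize,
 *			      unsigned int page_size)
 *	{
 *		if (uptodate || len == page_size)
 *			return false;	// cache valid or fully overwritten
 *		return fsize > (pos & ~((long long)page_size - 1));
 *	}
 *
 * When the page starts at or beyond EOF only its head, up to the write
 * offset, needs zeroing, which is what zero_user_segment() does above.
 */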
2298
2299static int fuse_write_end(struct file *file, struct address_space *mapping,
2300                loff_t pos, unsigned len, unsigned copied,
2301                struct page *page, void *fsdata)
2302{
2303        struct inode *inode = page->mapping->host;
2304
2305        /* Haven't copied anything?  Skip zeroing, size extending, dirtying. */
2306        if (!copied)
2307                goto unlock;
2308
2309        if (!PageUptodate(page)) {
2310                /* Zero any unwritten bytes at the end of the page */
2311                size_t endoff = (pos + copied) & ~PAGE_MASK;
2312                if (endoff)
2313                        zero_user_segment(page, endoff, PAGE_SIZE);
2314                SetPageUptodate(page);
2315        }
2316
2317        fuse_write_update_size(inode, pos + copied);
2318        set_page_dirty(page);
2319
2320unlock:
2321        unlock_page(page);
2322        put_page(page);
2323
2324        return copied;
2325}
2326
2327static int fuse_launder_page(struct page *page)
2328{
2329        int err = 0;
2330        if (clear_page_dirty_for_io(page)) {
2331                struct inode *inode = page->mapping->host;
2332
2333                /* Serialize with pending writeback for the same page */
2334                fuse_wait_on_page_writeback(inode, page->index);
2335                err = fuse_writepage_locked(page);
2336                if (!err)
2337                        fuse_wait_on_page_writeback(inode, page->index);
2338        }
2339        return err;
2340}
2341
2342/*
2343 * Write back dirty pages now, because there may not be any suitable
2344 * open files later
2345 */
2346static void fuse_vma_close(struct vm_area_struct *vma)
2347{
2348        filemap_write_and_wait(vma->vm_file->f_mapping);
2349}
2350
2351/*
2352 * Wait for writeback against this page to complete before allowing it
2353 * to be marked dirty again, and hence written back again, possibly
2354 * before the previous writepage completed.
2355 *
2356 * Block here, instead of in ->writepage(), so that the userspace fs
2357 * can only block processes actually operating on the filesystem.
2358 *
2359 * Otherwise an unprivileged userspace fs would be able to block
2360 * unrelated:
2361 *
2362 * - page migration
2363 * - sync(2)
2364 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
2365 */
2366static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
2367{
2368        struct page *page = vmf->page;
2369        struct inode *inode = file_inode(vmf->vma->vm_file);
2370
2371        file_update_time(vmf->vma->vm_file);
2372        lock_page(page);
2373        if (page->mapping != inode->i_mapping) {
2374                unlock_page(page);
2375                return VM_FAULT_NOPAGE;
2376        }
2377
2378        fuse_wait_on_page_writeback(inode, page->index);
2379        return VM_FAULT_LOCKED;
2380}
2381
2382static const struct vm_operations_struct fuse_file_vm_ops = {
2383        .close          = fuse_vma_close,
2384        .fault          = filemap_fault,
2385        .map_pages      = filemap_map_pages,
2386        .page_mkwrite   = fuse_page_mkwrite,
2387};
2388
2389static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
2390{
2391        struct fuse_file *ff = file->private_data;
2392
2393        /* DAX mmap is superior to direct_io mmap */
2394        if (FUSE_IS_DAX(file_inode(file)))
2395                return fuse_dax_mmap(file, vma);
2396
2397        if (ff->open_flags & FOPEN_DIRECT_IO) {
2398                /* Can't provide the coherency needed for MAP_SHARED */
2399                if (vma->vm_flags & VM_MAYSHARE)
2400                        return -ENODEV;
2401
2402                invalidate_inode_pages2(file->f_mapping);
2403
2404                return generic_file_mmap(file, vma);
2405        }
2406
2407        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
2408                fuse_link_write_file(file);
2409
2410        file_accessed(file);
2411        vma->vm_ops = &fuse_file_vm_ops;
2412        return 0;
2413}
2414
2415static int convert_fuse_file_lock(struct fuse_conn *fc,
2416                                  const struct fuse_file_lock *ffl,
2417                                  struct file_lock *fl)
2418{
2419        switch (ffl->type) {
2420        case F_UNLCK:
2421                break;
2422
2423        case F_RDLCK:
2424        case F_WRLCK:
2425                if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
2426                    ffl->end < ffl->start)
2427                        return -EIO;
2428
2429                fl->fl_start = ffl->start;
2430                fl->fl_end = ffl->end;
2431
2432                /*
2433                 * Convert pid into init's pid namespace.  The locks API will
2434                 * translate it into the caller's pid namespace.
2435                 */
2436                rcu_read_lock();
2437                fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
2438                rcu_read_unlock();
2439                break;
2440
2441        default:
2442                return -EIO;
2443        }
2444        fl->fl_type = ffl->type;
2445        return 0;
2446}
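
/*
 * The server's lock range is trusted only after a sanity check: both
 * ends must fit in the kernel's maximum file offset and must not be
 * inverted.  Sketch (illustrative only; offset_max stands for
 * OFFSET_MAX):
 *
 *	#include <stdbool.h>
 *	#include <stdint.h>
 *
 *	static bool lock_range_valid(uint64_t start, uint64_t end,
 *				     uint64_t offset_max)
 *	{
 *		return start <= offset_max && end <= offset_max &&
 *		       start <= end;
 *	}
 *
 * Anything else makes the request fail with -EIO rather than handing a
 * corrupt range to the locks API.
 */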
2447
2448static void fuse_lk_fill(struct fuse_args *args, struct file *file,
2449                         const struct file_lock *fl, int opcode, pid_t pid,
2450                         int flock, struct fuse_lk_in *inarg)
2451{
2452        struct inode *inode = file_inode(file);
2453        struct fuse_conn *fc = get_fuse_conn(inode);
2454        struct fuse_file *ff = file->private_data;
2455
2456        memset(inarg, 0, sizeof(*inarg));
2457        inarg->fh = ff->fh;
2458        inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
2459        inarg->lk.start = fl->fl_start;
2460        inarg->lk.end = fl->fl_end;
2461        inarg->lk.type = fl->fl_type;
2462        inarg->lk.pid = pid;
2463        if (flock)
2464                inarg->lk_flags |= FUSE_LK_FLOCK;
2465        args->opcode = opcode;
2466        args->nodeid = get_node_id(inode);
2467        args->in_numargs = 1;
2468        args->in_args[0].size = sizeof(*inarg);
2469        args->in_args[0].value = inarg;
2470}
2471
2472static int fuse_getlk(struct file *file, struct file_lock *fl)
2473{
2474        struct inode *inode = file_inode(file);
2475        struct fuse_mount *fm = get_fuse_mount(inode);
2476        FUSE_ARGS(args);
2477        struct fuse_lk_in inarg;
2478        struct fuse_lk_out outarg;
2479        int err;
2480
2481        fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
2482        args.out_numargs = 1;
2483        args.out_args[0].size = sizeof(outarg);
2484        args.out_args[0].value = &outarg;
2485        err = fuse_simple_request(fm, &args);
2486        if (!err)
2487                err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl);
2488
2489        return err;
2490}
2491
2492static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
2493{
2494        struct inode *inode = file_inode(file);
2495        struct fuse_mount *fm = get_fuse_mount(inode);
2496        FUSE_ARGS(args);
2497        struct fuse_lk_in inarg;
2498        int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
2499        struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
2500        pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
2501        int err;
2502
2503        if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
2504                /* NLM needs asynchronous locks, which we don't support yet */
2505                return -ENOLCK;
2506        }
2507
2508        /* Unlock on close is handled by the flush method */
2509        if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
2510                return 0;
2511
2512        fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
2513        err = fuse_simple_request(fm, &args);
2514
2515        /* locking is restartable */
2516        if (err == -EINTR)
2517                err = -ERESTARTSYS;
2518
2519        return err;
2520}
2521
2522static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
2523{
2524        struct inode *inode = file_inode(file);
2525        struct fuse_conn *fc = get_fuse_conn(inode);
2526        int err;
2527
2528        if (cmd == F_CANCELLK) {
2529                err = 0;
2530        } else if (cmd == F_GETLK) {
2531                if (fc->no_lock) {
2532                        posix_test_lock(file, fl);
2533                        err = 0;
2534                } else
2535                        err = fuse_getlk(file, fl);
2536        } else {
2537                if (fc->no_lock)
2538                        err = posix_lock_file(file, fl, NULL);
2539                else
2540                        err = fuse_setlk(file, fl, 0);
2541        }
2542        return err;
2543}
2544
2545static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
2546{
2547        struct inode *inode = file_inode(file);
2548        struct fuse_conn *fc = get_fuse_conn(inode);
2549        int err;
2550
2551        if (fc->no_flock) {
2552                err = locks_lock_file_wait(file, fl);
2553        } else {
2554                struct fuse_file *ff = file->private_data;
2555
2556                /* emulate flock with POSIX locks */
2557                ff->flock = true;
2558                err = fuse_setlk(file, fl, 1);
2559        }
2560
2561        return err;
2562}
2563
2564static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
2565{
2566        struct inode *inode = mapping->host;
2567        struct fuse_mount *fm = get_fuse_mount(inode);
2568        FUSE_ARGS(args);
2569        struct fuse_bmap_in inarg;
2570        struct fuse_bmap_out outarg;
2571        int err;
2572
2573        if (!inode->i_sb->s_bdev || fm->fc->no_bmap)
2574                return 0;
2575
2576        memset(&inarg, 0, sizeof(inarg));
2577        inarg.block = block;
2578        inarg.blocksize = inode->i_sb->s_blocksize;
2579        args.opcode = FUSE_BMAP;
2580        args.nodeid = get_node_id(inode);
2581        args.in_numargs = 1;
2582        args.in_args[0].size = sizeof(inarg);
2583        args.in_args[0].value = &inarg;
2584        args.out_numargs = 1;
2585        args.out_args[0].size = sizeof(outarg);
2586        args.out_args[0].value = &outarg;
2587        err = fuse_simple_request(fm, &args);
2588        if (err == -ENOSYS)
2589                fm->fc->no_bmap = 1;
2590
2591        return err ? 0 : outarg.block;
2592}
2593
2594static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
2595{
2596        struct inode *inode = file->f_mapping->host;
2597        struct fuse_mount *fm = get_fuse_mount(inode);
2598        struct fuse_file *ff = file->private_data;
2599        FUSE_ARGS(args);
2600        struct fuse_lseek_in inarg = {
2601                .fh = ff->fh,
2602                .offset = offset,
2603                .whence = whence
2604        };
2605        struct fuse_lseek_out outarg;
2606        int err;
2607
2608        if (fm->fc->no_lseek)
2609                goto fallback;
2610
2611        args.opcode = FUSE_LSEEK;
2612        args.nodeid = ff->nodeid;
2613        args.in_numargs = 1;
2614        args.in_args[0].size = sizeof(inarg);
2615        args.in_args[0].value = &inarg;
2616        args.out_numargs = 1;
2617        args.out_args[0].size = sizeof(outarg);
2618        args.out_args[0].value = &outarg;
2619        err = fuse_simple_request(fm, &args);
2620        if (err) {
2621                if (err == -ENOSYS) {
2622                        fm->fc->no_lseek = 1;
2623                        goto fallback;
2624                }
2625                return err;
2626        }
2627
2628        return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
2629
2630fallback:
2631        err = fuse_update_attributes(inode, file);
2632        if (!err)
2633                return generic_file_llseek(file, offset, whence);
2634        else
2635                return err;
2636}
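
/*
 * FUSE_LSEEK follows the same degrade-gracefully pattern as FUSE_BMAP
 * above and FUSE_POLL and FUSE_FALLOCATE below: the first -ENOSYS
 * reply latches a no_<op> flag on the connection so later calls skip
 * the round trip.  The pattern, sketched (illustrative only):
 *
 *	err = send_request(...);
 *	if (err == -ENOSYS) {
 *		fc->no_lseek = 1;	// server lacks the op, remember
 *		err = fallback(...);	// generic_file_llseek() here
 *	}
 *
 * The flag is per connection, so an old server pays the probe cost
 * only once per mount.
 */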
2637
2638static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
2639{
2640        loff_t retval;
2641        struct inode *inode = file_inode(file);
2642
2643        switch (whence) {
2644        case SEEK_SET:
2645        case SEEK_CUR:
2646                 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
2647                retval = generic_file_llseek(file, offset, whence);
2648                break;
2649        case SEEK_END:
2650                inode_lock(inode);
2651                retval = fuse_update_attributes(inode, file);
2652                if (!retval)
2653                        retval = generic_file_llseek(file, offset, whence);
2654                inode_unlock(inode);
2655                break;
2656        case SEEK_HOLE:
2657        case SEEK_DATA:
2658                inode_lock(inode);
2659                retval = fuse_lseek(file, offset, whence);
2660                inode_unlock(inode);
2661                break;
2662        default:
2663                retval = -EINVAL;
2664        }
2665
2666        return retval;
2667}
2668
2669/*
2670 * All files which have been polled are linked to the RB tree
2671 * fuse_conn->polled_files, which is indexed by kh.  Walk the tree and
2672 * find the matching one.
2673 */
2674static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
2675                                              struct rb_node **parent_out)
2676{
2677        struct rb_node **link = &fc->polled_files.rb_node;
2678        struct rb_node *last = NULL;
2679
2680        while (*link) {
2681                struct fuse_file *ff;
2682
2683                last = *link;
2684                ff = rb_entry(last, struct fuse_file, polled_node);
2685
2686                if (kh < ff->kh)
2687                        link = &last->rb_left;
2688                else if (kh > ff->kh)
2689                        link = &last->rb_right;
2690                else
2691                        return link;
2692        }
2693
2694        if (parent_out)
2695                *parent_out = last;
2696        return link;
2697}
2698
2699/*
2700 * The file is about to be polled.  Make sure it's on the polled_files
2701 * RB tree.  Note that files once added to the polled_files tree are
2702 * not removed before the file is released.  This is because a file
2703 * polled once is likely to be polled again.
2704 */
2705static void fuse_register_polled_file(struct fuse_conn *fc,
2706                                      struct fuse_file *ff)
2707{
2708        spin_lock(&fc->lock);
2709        if (RB_EMPTY_NODE(&ff->polled_node)) {
2710                struct rb_node **link, *parent;
2711
2712                link = fuse_find_polled_node(fc, ff->kh, &parent);
2713                BUG_ON(*link);
2714                rb_link_node(&ff->polled_node, parent, link);
2715                rb_insert_color(&ff->polled_node, &fc->polled_files);
2716        }
2717        spin_unlock(&fc->lock);
2718}
2719
2720__poll_t fuse_file_poll(struct file *file, poll_table *wait)
2721{
2722        struct fuse_file *ff = file->private_data;
2723        struct fuse_mount *fm = ff->fm;
2724        struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
2725        struct fuse_poll_out outarg;
2726        FUSE_ARGS(args);
2727        int err;
2728
2729        if (fm->fc->no_poll)
2730                return DEFAULT_POLLMASK;
2731
2732        poll_wait(file, &ff->poll_wait, wait);
2733        inarg.events = mangle_poll(poll_requested_events(wait));
2734
2735        /*
2736         * Ask for notification iff there's someone waiting for it.
2737         * The client may ignore the flag and always notify.
2738         */
2739        if (waitqueue_active(&ff->poll_wait)) {
2740                inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
2741                fuse_register_polled_file(fm->fc, ff);
2742        }
2743
2744        args.opcode = FUSE_POLL;
2745        args.nodeid = ff->nodeid;
2746        args.in_numargs = 1;
2747        args.in_args[0].size = sizeof(inarg);
2748        args.in_args[0].value = &inarg;
2749        args.out_numargs = 1;
2750        args.out_args[0].size = sizeof(outarg);
2751        args.out_args[0].value = &outarg;
2752        err = fuse_simple_request(fm, &args);
2753
2754        if (!err)
2755                return demangle_poll(outarg.revents);
2756        if (err == -ENOSYS) {
2757                fm->fc->no_poll = 1;
2758                return DEFAULT_POLLMASK;
2759        }
2760        return EPOLLERR;
2761}
2762EXPORT_SYMBOL_GPL(fuse_file_poll);
2763
2764/*
2765 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
2766 * wakes up the poll waiters.
2767 */
2768int fuse_notify_poll_wakeup(struct fuse_conn *fc,
2769                            struct fuse_notify_poll_wakeup_out *outarg)
2770{
2771        u64 kh = outarg->kh;
2772        struct rb_node **link;
2773
2774        spin_lock(&fc->lock);
2775
2776        link = fuse_find_polled_node(fc, kh, NULL);
2777        if (*link) {
2778                struct fuse_file *ff;
2779
2780                ff = rb_entry(*link, struct fuse_file, polled_node);
2781                wake_up_interruptible_sync(&ff->poll_wait);
2782        }
2783
2784        spin_unlock(&fc->lock);
2785        return 0;
2786}
2787
2788static void fuse_do_truncate(struct file *file)
2789{
2790        struct inode *inode = file->f_mapping->host;
2791        struct iattr attr;
2792
2793        attr.ia_valid = ATTR_SIZE;
2794        attr.ia_size = i_size_read(inode);
2795
2796        attr.ia_file = file;
2797        attr.ia_valid |= ATTR_FILE;
2798
2799        fuse_do_setattr(file_dentry(file), &attr, file);
2800}
2801
2802static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
2803{
2804        return round_up(off, fc->max_pages << PAGE_SHIFT);
2805}
2806
2807static ssize_t
2808fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
2809{
2810        DECLARE_COMPLETION_ONSTACK(wait);
2811        ssize_t ret = 0;
2812        struct file *file = iocb->ki_filp;
2813        struct fuse_file *ff = file->private_data;
2814        loff_t pos = 0;
2815        struct inode *inode;
2816        loff_t i_size;
2817        size_t count = iov_iter_count(iter), shortened = 0;
2818        loff_t offset = iocb->ki_pos;
2819        struct fuse_io_priv *io;
2820
2821        pos = offset;
2822        inode = file->f_mapping->host;
2823        i_size = i_size_read(inode);
2824
2825        if ((iov_iter_rw(iter) == READ) && (offset >= i_size))
2826                return 0;
2827
2828        io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
2829        if (!io)
2830                return -ENOMEM;
2831        spin_lock_init(&io->lock);
2832        kref_init(&io->refcnt);
2833        io->reqs = 1;
2834        io->bytes = -1;
2835        io->size = 0;
2836        io->offset = offset;
2837        io->write = (iov_iter_rw(iter) == WRITE);
2838        io->err = 0;
2839        /*
2840         * By default, we want to optimize all I/Os with async request
2841         * submission to the client filesystem if supported.
2842         */
2843        io->async = ff->fm->fc->async_dio;
2844        io->iocb = iocb;
2845        io->blocking = is_sync_kiocb(iocb);
2846
2847        /* optimization for short read */
2848        if (io->async && !io->write && offset + count > i_size) {
2849                iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
2850                shortened = count - iov_iter_count(iter);
2851                count -= shortened;
2852        }
2853
2854        /*
2855         * We cannot asynchronously extend the size of a file.
2856         * In such a case the aio will behave exactly like sync io.
2857         */
2858        if ((offset + count > i_size) && io->write)
2859                io->blocking = true;
2860
2861        if (io->async && io->blocking) {
2862                /*
2863                 * Additional reference to keep io around after
2864                 * calling fuse_aio_complete()
2865                 */
2866                kref_get(&io->refcnt);
2867                io->done = &wait;
2868        }
2869
2870        if (iov_iter_rw(iter) == WRITE) {
2871                ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
2872                fuse_invalidate_attr(inode);
2873        } else {
2874                ret = __fuse_direct_read(io, iter, &pos);
2875        }
2876        iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);
2877
2878        if (io->async) {
2879                bool blocking = io->blocking;
2880
2881                fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
2882
2883                /* we have a non-extending, async request, so return */
2884                if (!blocking)
2885                        return -EIOCBQUEUED;
2886
2887                wait_for_completion(&wait);
2888                ret = fuse_get_res_by_io(io);
2889        }
2890
2891        kref_put(&io->refcnt, fuse_io_release);
2892
2893        if (iov_iter_rw(iter) == WRITE) {
2894                if (ret > 0)
2895                        fuse_write_update_size(inode, pos);
2896                else if (ret < 0 && offset + count > i_size)
2897                        fuse_do_truncate(file);
2898        }
2899
2900        return ret;
2901}
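
/*
 * The short-read optimization trims the iterator to i_size rounded up
 * to a whole request, then restores the trimmed bytes afterwards with
 * iov_iter_reexpand().  The arithmetic, sketched (illustrative only;
 * 4K pages and max_pages = 256, i.e. 1M requests, assumed):
 *
 *	static uint64_t round_up_to(uint64_t off, uint64_t step)
 *	{
 *		return (off + step - 1) / step * step;
 *	}
 *
 * A 4M read at offset 0 against i_size 100 is trimmed to
 * round_up_to(100, 1 << 20) = 1M, so shortened = 3M; re-expanding by
 * shortened after the transfer leaves the caller's iterator state
 * consistent whether the requests completed synchronously or not.
 */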
2902
2903static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
2904{
2905        int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX);
2906
2907        if (!err)
2908                fuse_sync_writes(inode);
2909
2910        return err;
2911}
2912
2913static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
2914                                loff_t length)
2915{
2916        struct fuse_file *ff = file->private_data;
2917        struct inode *inode = file_inode(file);
2918        struct fuse_inode *fi = get_fuse_inode(inode);
2919        struct fuse_mount *fm = ff->fm;
2920        FUSE_ARGS(args);
2921        struct fuse_fallocate_in inarg = {
2922                .fh = ff->fh,
2923                .offset = offset,
2924                .length = length,
2925                .mode = mode
2926        };
2927        int err;
2928        bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
2929                           (mode & (FALLOC_FL_PUNCH_HOLE |
2930                                    FALLOC_FL_ZERO_RANGE));
2931
2932        bool block_faults = FUSE_IS_DAX(inode) && lock_inode;
2933
2934        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
2935                     FALLOC_FL_ZERO_RANGE))
2936                return -EOPNOTSUPP;
2937
2938        if (fm->fc->no_fallocate)
2939                return -EOPNOTSUPP;
2940
2941        if (lock_inode) {
2942                inode_lock(inode);
2943                if (block_faults) {
2944                        filemap_invalidate_lock(inode->i_mapping);
2945                        err = fuse_dax_break_layouts(inode, 0, 0);
2946                        if (err)
2947                                goto out;
2948                }
2949
2950                if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
2951                        loff_t endbyte = offset + length - 1;
2952
2953                        err = fuse_writeback_range(inode, offset, endbyte);
2954                        if (err)
2955                                goto out;
2956                }
2957        }
2958
2959        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
2960            offset + length > i_size_read(inode)) {
2961                err = inode_newsize_ok(inode, offset + length);
2962                if (err)
2963                        goto out;
2964        }
2965
2966        if (!(mode & FALLOC_FL_KEEP_SIZE))
2967                set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
2968
2969        args.opcode = FUSE_FALLOCATE;
2970        args.nodeid = ff->nodeid;
2971        args.in_numargs = 1;
2972        args.in_args[0].size = sizeof(inarg);
2973        args.in_args[0].value = &inarg;
2974        err = fuse_simple_request(fm, &args);
2975        if (err == -ENOSYS) {
2976                fm->fc->no_fallocate = 1;
2977                err = -EOPNOTSUPP;
2978        }
2979        if (err)
2980                goto out;
2981
2982        /* we could have extended the file */
2983        if (!(mode & FALLOC_FL_KEEP_SIZE)) {
2984                bool changed = fuse_write_update_size(inode, offset + length);
2985
2986                if (changed && fm->fc->writeback_cache)
2987                        file_update_time(file);
2988        }
2989
2990        if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
2991                truncate_pagecache_range(inode, offset, offset + length - 1);
2992
2993        fuse_invalidate_attr(inode);
2994
2995out:
2996        if (!(mode & FALLOC_FL_KEEP_SIZE))
2997                clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
2998
2999        if (block_faults)
3000                filemap_invalidate_unlock(inode->i_mapping);
3001
3002        if (lock_inode)
3003                inode_unlock(inode);
3004
3005        return err;
3006}
3007
3008static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
3009                                      struct file *file_out, loff_t pos_out,
3010                                      size_t len, unsigned int flags)
3011{
3012        struct fuse_file *ff_in = file_in->private_data;
3013        struct fuse_file *ff_out = file_out->private_data;
3014        struct inode *inode_in = file_inode(file_in);
3015        struct inode *inode_out = file_inode(file_out);
3016        struct fuse_inode *fi_out = get_fuse_inode(inode_out);
3017        struct fuse_mount *fm = ff_in->fm;
3018        struct fuse_conn *fc = fm->fc;
3019        FUSE_ARGS(args);
3020        struct fuse_copy_file_range_in inarg = {
3021                .fh_in = ff_in->fh,
3022                .off_in = pos_in,
3023                .nodeid_out = ff_out->nodeid,
3024                .fh_out = ff_out->fh,
3025                .off_out = pos_out,
3026                .len = len,
3027                .flags = flags
3028        };
3029        struct fuse_write_out outarg;
3030        ssize_t err;
3031        /* mark unstable when write-back is not used, and file_out gets
3032         * extended */
3033        bool is_unstable = (!fc->writeback_cache) &&
3034                           ((pos_out + len) > inode_out->i_size);
3035
3036        if (fc->no_copy_file_range)
3037                return -EOPNOTSUPP;
3038
3039        if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
3040                return -EXDEV;
3041
3042        inode_lock(inode_in);
3043        err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
3044        inode_unlock(inode_in);
3045        if (err)
3046                return err;
3047
3048        inode_lock(inode_out);
3049
3050        err = file_modified(file_out);
3051        if (err)
3052                goto out;
3053
3054        /*
3055         * Write out dirty pages in the destination file before sending the COPY
3056         * request to userspace.  After the request is completed, truncate off
3057         * pages (including partial ones) from the cache that have been copied,
3058         * since these contain stale data at that point.
3059         *
3060         * This should be mostly correct, but if the COPY writes to partial
3061         * pages (at the start or end) and the parts not covered by the COPY are
3062         * written through a memory map after calling fuse_writeback_range(),
3063         * then these partial page modifications will be lost on truncation.
3064         *
3065         * It is unlikely that someone would rely on such mixed style
3066         * modifications.  Yet this does give less guarantees than if the
3067         * copying was performed with write(2).
3068         *
3069         * To fix this a mapping->invalidate_lock could be used to prevent new
3070         * faults while the copy is ongoing.
3071         */
3072        err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
3073        if (err)
3074                goto out;
3075
3076        if (is_unstable)
3077                set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
3078
3079        args.opcode = FUSE_COPY_FILE_RANGE;
3080        args.nodeid = ff_in->nodeid;
3081        args.in_numargs = 1;
3082        args.in_args[0].size = sizeof(inarg);
3083        args.in_args[0].value = &inarg;
3084        args.out_numargs = 1;
3085        args.out_args[0].size = sizeof(outarg);
3086        args.out_args[0].value = &outarg;
3087        err = fuse_simple_request(fm, &args);
3088        if (err == -ENOSYS) {
3089                fc->no_copy_file_range = 1;
3090                err = -EOPNOTSUPP;
3091        }
3092        if (err)
3093                goto out;
3094
3095        truncate_inode_pages_range(inode_out->i_mapping,
3096                                   ALIGN_DOWN(pos_out, PAGE_SIZE),
3097                                   ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
3098
3099        if (fc->writeback_cache) {
3100                fuse_write_update_size(inode_out, pos_out + outarg.size);
3101                file_update_time(file_out);
3102        }
3103
3104        fuse_invalidate_attr(inode_out);
3105
3106        err = outarg.size;
3107out:
3108        if (is_unstable)
3109                clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
3110
3111        inode_unlock(inode_out);
3112        file_accessed(file_in);
3113
3114        return err;
3115}
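
/*
 * The invalidated range above is widened to whole pages: ALIGN_DOWN()
 * picks the first byte of the first copied page, ALIGN() - 1 the last
 * byte of the last one.  Sketch of the arithmetic (illustrative only;
 * 4K pages assumed):
 *
 *	start = pos_out & ~4095ULL;
 *	end = ((pos_out + copied + 4095) & ~4095ULL) - 1;
 *
 * e.g. a 100-byte copy at pos_out = 5000 truncates cached pages in
 * [4096, 8191], deliberately covering the partial head and tail pages
 * discussed in the comment above.
 */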
3116
3117static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
3118                                    struct file *dst_file, loff_t dst_off,
3119                                    size_t len, unsigned int flags)
3120{
3121        ssize_t ret;
3122
3123        ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
3124                                     len, flags);
3125
3126        if (ret == -EOPNOTSUPP || ret == -EXDEV)
3127                ret = generic_copy_file_range(src_file, src_off, dst_file,
3128                                              dst_off, len, flags);
3129        return ret;
3130}
3131
3132static const struct file_operations fuse_file_operations = {
3133        .llseek         = fuse_file_llseek,
3134        .read_iter      = fuse_file_read_iter,
3135        .write_iter     = fuse_file_write_iter,
3136        .mmap           = fuse_file_mmap,
3137        .open           = fuse_open,
3138        .flush          = fuse_flush,
3139        .release        = fuse_release,
3140        .fsync          = fuse_fsync,
3141        .lock           = fuse_file_lock,
3142        .get_unmapped_area = thp_get_unmapped_area,
3143        .flock          = fuse_file_flock,
3144        .splice_read    = generic_file_splice_read,
3145        .splice_write   = iter_file_splice_write,
3146        .unlocked_ioctl = fuse_file_ioctl,
3147        .compat_ioctl   = fuse_file_compat_ioctl,
3148        .poll           = fuse_file_poll,
3149        .fallocate      = fuse_file_fallocate,
3150        .copy_file_range = fuse_copy_file_range,
3151};
3152
3153static const struct address_space_operations fuse_file_aops  = {
3154        .readpage       = fuse_readpage,
3155        .readahead      = fuse_readahead,
3156        .writepage      = fuse_writepage,
3157        .writepages     = fuse_writepages,
3158        .launder_page   = fuse_launder_page,
3159        .set_page_dirty = __set_page_dirty_nobuffers,
3160        .bmap           = fuse_bmap,
3161        .direct_IO      = fuse_direct_IO,
3162        .write_begin    = fuse_write_begin,
3163        .write_end      = fuse_write_end,
3164};
3165
3166void fuse_init_file_inode(struct inode *inode)
3167{
3168        struct fuse_inode *fi = get_fuse_inode(inode);
3169
3170        inode->i_fop = &fuse_file_operations;
3171        inode->i_data.a_ops = &fuse_file_aops;
3172
3173        INIT_LIST_HEAD(&fi->write_files);
3174        INIT_LIST_HEAD(&fi->queued_writes);
3175        fi->writectr = 0;
3176        init_waitqueue_head(&fi->page_waitq);
3177        fi->writepages = RB_ROOT;
3178
3179        if (IS_ENABLED(CONFIG_FUSE_DAX))
3180                fuse_dax_inode_init(inode);
3181}
3182