linux/fs/orangefs/inode.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * (C) 2001 Clemson University and The University of Chicago
   4 * Copyright 2018 Omnibond Systems, L.L.C.
   5 *
   6 * See COPYING in top-level directory.
   7 */
   8
   9/*
  10 *  Linux VFS inode operations.
  11 */
  12
  13#include <linux/bvec.h>
  14#include <linux/fileattr.h>
  15#include "protocol.h"
  16#include "orangefs-kernel.h"
  17#include "orangefs-bufmap.h"
  18
  19static int orangefs_writepage_locked(struct page *page,
  20    struct writeback_control *wbc)
  21{
  22        struct inode *inode = page->mapping->host;
  23        struct orangefs_write_range *wr = NULL;
  24        struct iov_iter iter;
  25        struct bio_vec bv;
  26        size_t len, wlen;
  27        ssize_t ret;
  28        loff_t off;
  29
  30        set_page_writeback(page);
  31
  32        len = i_size_read(inode);
  33        if (PagePrivate(page)) {
  34                wr = (struct orangefs_write_range *)page_private(page);
  35                WARN_ON(wr->pos >= len);
  36                off = wr->pos;
  37                if (off + wr->len > len)
  38                        wlen = len - off;
  39                else
  40                        wlen = wr->len;
  41        } else {
  42                WARN_ON(1);
  43                off = page_offset(page);
  44                if (off + PAGE_SIZE > len)
  45                        wlen = len - off;
  46                else
  47                        wlen = PAGE_SIZE;
  48        }
  49        /* Should've been handled in orangefs_invalidatepage. */
  50        WARN_ON(off == len || off + wlen > len);
  51
  52        bv.bv_page = page;
  53        bv.bv_len = wlen;
  54        bv.bv_offset = off % PAGE_SIZE;
  55        WARN_ON(wlen == 0);
  56        iov_iter_bvec(&iter, WRITE, &bv, 1, wlen);
  57
  58        ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
  59            len, wr, NULL, NULL);
  60        if (ret < 0) {
  61                SetPageError(page);
  62                mapping_set_error(page->mapping, ret);
  63        } else {
  64                ret = 0;
  65        }
  66        kfree(detach_page_private(page));
  67        return ret;
  68}
  69
  70static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
  71{
  72        int ret;
  73        ret = orangefs_writepage_locked(page, wbc);
  74        unlock_page(page);
  75        end_page_writeback(page);
  76        return ret;
  77}
  78
  79struct orangefs_writepages {
  80        loff_t off;
  81        size_t len;
  82        kuid_t uid;
  83        kgid_t gid;
  84        int maxpages;
  85        int npages;
  86        struct page **pages;
  87        struct bio_vec *bv;
  88};
  89
  90static int orangefs_writepages_work(struct orangefs_writepages *ow,
  91    struct writeback_control *wbc)
  92{
  93        struct inode *inode = ow->pages[0]->mapping->host;
  94        struct orangefs_write_range *wrp, wr;
  95        struct iov_iter iter;
  96        ssize_t ret;
  97        size_t len;
  98        loff_t off;
  99        int i;
 100
 101        len = i_size_read(inode);
 102
 103        for (i = 0; i < ow->npages; i++) {
 104                set_page_writeback(ow->pages[i]);
 105                ow->bv[i].bv_page = ow->pages[i];
 106                ow->bv[i].bv_len = min(page_offset(ow->pages[i]) + PAGE_SIZE,
 107                    ow->off + ow->len) -
 108                    max(ow->off, page_offset(ow->pages[i]));
 109                if (i == 0)
 110                        ow->bv[i].bv_offset = ow->off -
 111                            page_offset(ow->pages[i]);
 112                else
 113                        ow->bv[i].bv_offset = 0;
 114        }
 115        iov_iter_bvec(&iter, WRITE, ow->bv, ow->npages, ow->len);
 116
 117        WARN_ON(ow->off >= len);
 118        if (ow->off + ow->len > len)
 119                ow->len = len - ow->off;
 120
 121        off = ow->off;
 122        wr.uid = ow->uid;
 123        wr.gid = ow->gid;
 124        ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len,
 125            0, &wr, NULL, NULL);
 126        if (ret < 0) {
 127                for (i = 0; i < ow->npages; i++) {
 128                        SetPageError(ow->pages[i]);
 129                        mapping_set_error(ow->pages[i]->mapping, ret);
 130                        if (PagePrivate(ow->pages[i])) {
 131                                wrp = (struct orangefs_write_range *)
 132                                    page_private(ow->pages[i]);
 133                                ClearPagePrivate(ow->pages[i]);
 134                                put_page(ow->pages[i]);
 135                                kfree(wrp);
 136                        }
 137                        end_page_writeback(ow->pages[i]);
 138                        unlock_page(ow->pages[i]);
 139                }
 140        } else {
 141                ret = 0;
 142                for (i = 0; i < ow->npages; i++) {
 143                        if (PagePrivate(ow->pages[i])) {
 144                                wrp = (struct orangefs_write_range *)
 145                                    page_private(ow->pages[i]);
 146                                ClearPagePrivate(ow->pages[i]);
 147                                put_page(ow->pages[i]);
 148                                kfree(wrp);
 149                        }
 150                        end_page_writeback(ow->pages[i]);
 151                        unlock_page(ow->pages[i]);
 152                }
 153        }
 154        return ret;
 155}
 156
 157static int orangefs_writepages_callback(struct page *page,
 158    struct writeback_control *wbc, void *data)
 159{
 160        struct orangefs_writepages *ow = data;
 161        struct orangefs_write_range *wr;
 162        int ret;
 163
 164        if (!PagePrivate(page)) {
 165                unlock_page(page);
 166                /* It's not private so there's nothing to write, right? */
 167                printk("writepages_callback not private!\n");
 168                BUG();
 169                return 0;
 170        }
 171        wr = (struct orangefs_write_range *)page_private(page);
 172
 173        ret = -1;
 174        if (ow->npages == 0) {
 175                ow->off = wr->pos;
 176                ow->len = wr->len;
 177                ow->uid = wr->uid;
 178                ow->gid = wr->gid;
 179                ow->pages[ow->npages++] = page;
 180                ret = 0;
 181                goto done;
 182        }
 183        if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) {
 184                orangefs_writepages_work(ow, wbc);
 185                ow->npages = 0;
 186                ret = -1;
 187                goto done;
 188        }
 189        if (ow->off + ow->len == wr->pos) {
 190                ow->len += wr->len;
 191                ow->pages[ow->npages++] = page;
 192                ret = 0;
 193                goto done;
 194        }
 195done:
 196        if (ret == -1) {
 197                if (ow->npages) {
 198                        orangefs_writepages_work(ow, wbc);
 199                        ow->npages = 0;
 200                }
 201                ret = orangefs_writepage_locked(page, wbc);
 202                mapping_set_error(page->mapping, ret);
 203                unlock_page(page);
 204                end_page_writeback(page);
 205        } else {
 206                if (ow->npages == ow->maxpages) {
 207                        orangefs_writepages_work(ow, wbc);
 208                        ow->npages = 0;
 209                }
 210        }
 211        return ret;
 212}
 213
 214static int orangefs_writepages(struct address_space *mapping,
 215    struct writeback_control *wbc)
 216{
 217        struct orangefs_writepages *ow;
 218        struct blk_plug plug;
 219        int ret;
 220        ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL);
 221        if (!ow)
 222                return -ENOMEM;
 223        ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE;
 224        ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL);
 225        if (!ow->pages) {
 226                kfree(ow);
 227                return -ENOMEM;
 228        }
 229        ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL);
 230        if (!ow->bv) {
 231                kfree(ow->pages);
 232                kfree(ow);
 233                return -ENOMEM;
 234        }
 235        blk_start_plug(&plug);
 236        ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow);
 237        if (ow->npages)
 238                ret = orangefs_writepages_work(ow, wbc);
 239        blk_finish_plug(&plug);
 240        kfree(ow->pages);
 241        kfree(ow->bv);
 242        kfree(ow);
 243        return ret;
 244}
 245
 246static int orangefs_launder_page(struct page *);
 247
 248static void orangefs_readahead(struct readahead_control *rac)
 249{
 250        loff_t offset;
 251        struct iov_iter iter;
 252        struct inode *inode = rac->mapping->host;
 253        struct xarray *i_pages;
 254        struct page *page;
 255        loff_t new_start = readahead_pos(rac);
 256        int ret;
 257        size_t new_len = 0;
 258
 259        loff_t bytes_remaining = inode->i_size - readahead_pos(rac);
 260        loff_t pages_remaining = bytes_remaining / PAGE_SIZE;
 261
 262        if (pages_remaining >= 1024)
 263                new_len = 4194304;
 264        else if (pages_remaining > readahead_count(rac))
 265                new_len = bytes_remaining;
 266
 267        if (new_len)
 268                readahead_expand(rac, new_start, new_len);
 269
 270        offset = readahead_pos(rac);
 271        i_pages = &rac->mapping->i_pages;
 272
 273        iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac));
 274
 275        /* read in the pages. */
 276        if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode,
 277                        &offset, &iter, readahead_length(rac),
 278                        inode->i_size, NULL, NULL, rac->file)) < 0)
 279                gossip_debug(GOSSIP_FILE_DEBUG,
 280                        "%s: wait_for_direct_io failed. \n", __func__);
 281        else
 282                ret = 0;
 283
 284        /* clean up. */
 285        while ((page = readahead_page(rac))) {
 286                page_endio(page, false, ret);
 287                put_page(page);
 288        }
 289}
 290
 291static int orangefs_readpage(struct file *file, struct page *page)
 292{
 293        struct inode *inode = page->mapping->host;
 294        struct iov_iter iter;
 295        struct bio_vec bv;
 296        ssize_t ret;
 297        loff_t off; /* offset into this page */
 298
 299        if (PageDirty(page))
 300                orangefs_launder_page(page);
 301
 302        off = page_offset(page);
 303        bv.bv_page = page;
 304        bv.bv_len = PAGE_SIZE;
 305        bv.bv_offset = 0;
 306        iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);
 307
 308        ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
 309            PAGE_SIZE, inode->i_size, NULL, NULL, file);
 310        /* this will only zero remaining unread portions of the page data */
 311        iov_iter_zero(~0U, &iter);
 312        /* takes care of potential aliasing */
 313        flush_dcache_page(page);
 314        if (ret < 0) {
 315                SetPageError(page);
 316        } else {
 317                SetPageUptodate(page);
 318                if (PageError(page))
 319                        ClearPageError(page);
 320                ret = 0;
 321        }
 322        /* unlock the page after the ->readpage() routine completes */
 323        unlock_page(page);
 324        return ret;
 325}
 326
 327static int orangefs_write_begin(struct file *file,
 328    struct address_space *mapping,
 329    loff_t pos, unsigned len, unsigned flags, struct page **pagep,
 330    void **fsdata)
 331{
 332        struct orangefs_write_range *wr;
 333        struct page *page;
 334        pgoff_t index;
 335        int ret;
 336
 337        index = pos >> PAGE_SHIFT;
 338
 339        page = grab_cache_page_write_begin(mapping, index, flags);
 340        if (!page)
 341                return -ENOMEM;
 342
 343        *pagep = page;
 344
 345        if (PageDirty(page) && !PagePrivate(page)) {
 346                /*
 347                 * Should be impossible.  If it happens, launder the page
 348                 * since we don't know what's dirty.  This will WARN in
 349                 * orangefs_writepage_locked.
 350                 */
 351                ret = orangefs_launder_page(page);
 352                if (ret)
 353                        return ret;
 354        }
 355        if (PagePrivate(page)) {
 356                struct orangefs_write_range *wr;
 357                wr = (struct orangefs_write_range *)page_private(page);
 358                if (wr->pos + wr->len == pos &&
 359                    uid_eq(wr->uid, current_fsuid()) &&
 360                    gid_eq(wr->gid, current_fsgid())) {
 361                        wr->len += len;
 362                        goto okay;
 363                } else {
 364                        ret = orangefs_launder_page(page);
 365                        if (ret)
 366                                return ret;
 367                }
 368        }
 369
 370        wr = kmalloc(sizeof *wr, GFP_KERNEL);
 371        if (!wr)
 372                return -ENOMEM;
 373
 374        wr->pos = pos;
 375        wr->len = len;
 376        wr->uid = current_fsuid();
 377        wr->gid = current_fsgid();
 378        attach_page_private(page, wr);
 379okay:
 380        return 0;
 381}
 382
 383static int orangefs_write_end(struct file *file, struct address_space *mapping,
 384    loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata)
 385{
 386        struct inode *inode = page->mapping->host;
 387        loff_t last_pos = pos + copied;
 388
 389        /*
 390         * No need to use i_size_read() here, the i_size
 391         * cannot change under us because we hold the i_mutex.
 392         */
 393        if (last_pos > inode->i_size)
 394                i_size_write(inode, last_pos);
 395
 396        /* zero the stale part of the page if we did a short copy */
 397        if (!PageUptodate(page)) {
 398                unsigned from = pos & (PAGE_SIZE - 1);
 399                if (copied < len) {
 400                        zero_user(page, from + copied, len - copied);
 401                }
 402                /* Set fully written pages uptodate. */
 403                if (pos == page_offset(page) &&
 404                    (len == PAGE_SIZE || pos + len == inode->i_size)) {
 405                        zero_user_segment(page, from + copied, PAGE_SIZE);
 406                        SetPageUptodate(page);
 407                }
 408        }
 409
 410        set_page_dirty(page);
 411        unlock_page(page);
 412        put_page(page);
 413
 414        mark_inode_dirty_sync(file_inode(file));
 415        return copied;
 416}
 417
 418static void orangefs_invalidatepage(struct page *page,
 419                                 unsigned int offset,
 420                                 unsigned int length)
 421{
 422        struct orangefs_write_range *wr;
 423        wr = (struct orangefs_write_range *)page_private(page);
 424
 425        if (offset == 0 && length == PAGE_SIZE) {
 426                kfree(detach_page_private(page));
 427                return;
 428        /* write range entirely within invalidate range (or equal) */
 429        } else if (page_offset(page) + offset <= wr->pos &&
 430            wr->pos + wr->len <= page_offset(page) + offset + length) {
 431                kfree(detach_page_private(page));
 432                /* XXX is this right? only caller in fs */
 433                cancel_dirty_page(page);
 434                return;
 435        /* invalidate range chops off end of write range */
 436        } else if (wr->pos < page_offset(page) + offset &&
 437            wr->pos + wr->len <= page_offset(page) + offset + length &&
 438             page_offset(page) + offset < wr->pos + wr->len) {
 439                size_t x;
 440                x = wr->pos + wr->len - (page_offset(page) + offset);
 441                WARN_ON(x > wr->len);
 442                wr->len -= x;
 443                wr->uid = current_fsuid();
 444                wr->gid = current_fsgid();
 445        /* invalidate range chops off beginning of write range */
 446        } else if (page_offset(page) + offset <= wr->pos &&
 447            page_offset(page) + offset + length < wr->pos + wr->len &&
 448            wr->pos < page_offset(page) + offset + length) {
 449                size_t x;
 450                x = page_offset(page) + offset + length - wr->pos;
 451                WARN_ON(x > wr->len);
 452                wr->pos += x;
 453                wr->len -= x;
 454                wr->uid = current_fsuid();
 455                wr->gid = current_fsgid();
 456        /* invalidate range entirely within write range (punch hole) */
 457        } else if (wr->pos < page_offset(page) + offset &&
 458            page_offset(page) + offset + length < wr->pos + wr->len) {
 459                /* XXX what do we do here... should not WARN_ON */
 460                WARN_ON(1);
 461                /* punch hole */
 462                /*
 463                 * should we just ignore this and write it out anyway?
 464                 * it hardly makes sense
 465                 */
 466                return;
 467        /* non-overlapping ranges */
 468        } else {
 469                /* WARN if they do overlap */
 470                if (!((page_offset(page) + offset + length <= wr->pos) ^
 471                    (wr->pos + wr->len <= page_offset(page) + offset))) {
 472                        WARN_ON(1);
 473                        printk("invalidate range offset %llu length %u\n",
 474                            page_offset(page) + offset, length);
 475                        printk("write range offset %llu length %zu\n",
 476                            wr->pos, wr->len);
 477                }
 478                return;
 479        }
 480
 481        /*
 482         * Above there are returns where wr is freed or where we WARN.
 483         * Thus the following runs if wr was modified above.
 484         */
 485
 486        orangefs_launder_page(page);
 487}
 488
 489static int orangefs_releasepage(struct page *page, gfp_t foo)
 490{
 491        return !PagePrivate(page);
 492}
 493
 494static void orangefs_freepage(struct page *page)
 495{
 496        kfree(detach_page_private(page));
 497}
 498
 499static int orangefs_launder_page(struct page *page)
 500{
 501        int r = 0;
 502        struct writeback_control wbc = {
 503                .sync_mode = WB_SYNC_ALL,
 504                .nr_to_write = 0,
 505        };
 506        wait_on_page_writeback(page);
 507        if (clear_page_dirty_for_io(page)) {
 508                r = orangefs_writepage_locked(page, &wbc);
 509                end_page_writeback(page);
 510        }
 511        return r;
 512}
 513
 514static ssize_t orangefs_direct_IO(struct kiocb *iocb,
 515                                  struct iov_iter *iter)
 516{
 517        /*
 518         * Comment from original do_readv_writev:
 519         * Common entry point for read/write/readv/writev
 520         * This function will dispatch it to either the direct I/O
 521         * or buffered I/O path depending on the mount options and/or
 522         * augmented/extended metadata attached to the file.
 523         * Note: File extended attributes override any mount options.
 524         */
 525        struct file *file = iocb->ki_filp;
 526        loff_t pos = iocb->ki_pos;
 527        enum ORANGEFS_io_type type = iov_iter_rw(iter) == WRITE ?
 528            ORANGEFS_IO_WRITE : ORANGEFS_IO_READ;
 529        loff_t *offset = &pos;
 530        struct inode *inode = file->f_mapping->host;
 531        struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
 532        struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
 533        size_t count = iov_iter_count(iter);
 534        ssize_t total_count = 0;
 535        ssize_t ret = -EINVAL;
 536        int i = 0;
 537
 538        gossip_debug(GOSSIP_FILE_DEBUG,
 539                "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
 540                __func__,
 541                handle,
 542                (int)count);
 543
 544        if (type == ORANGEFS_IO_WRITE) {
 545                gossip_debug(GOSSIP_FILE_DEBUG,
 546                             "%s(%pU): proceeding with offset : %llu, "
 547                             "size %d\n",
 548                             __func__,
 549                             handle,
 550                             llu(*offset),
 551                             (int)count);
 552        }
 553
 554        if (count == 0) {
 555                ret = 0;
 556                goto out;
 557        }
 558
 559        while (iov_iter_count(iter)) {
 560                size_t each_count = iov_iter_count(iter);
 561                size_t amt_complete;
 562                i++;
 563
 564                /* how much to transfer in this loop iteration */
 565                if (each_count > orangefs_bufmap_size_query())
 566                        each_count = orangefs_bufmap_size_query();
 567
 568                gossip_debug(GOSSIP_FILE_DEBUG,
 569                             "%s(%pU): size of each_count(%d)\n",
 570                             __func__,
 571                             handle,
 572                             (int)each_count);
 573                gossip_debug(GOSSIP_FILE_DEBUG,
 574                             "%s(%pU): BEFORE wait_for_io: offset is %d\n",
 575                             __func__,
 576                             handle,
 577                             (int)*offset);
 578
 579                ret = wait_for_direct_io(type, inode, offset, iter,
 580                                each_count, 0, NULL, NULL, file);
 581                gossip_debug(GOSSIP_FILE_DEBUG,
 582                             "%s(%pU): return from wait_for_io:%d\n",
 583                             __func__,
 584                             handle,
 585                             (int)ret);
 586
 587                if (ret < 0)
 588                        goto out;
 589
 590                *offset += ret;
 591                total_count += ret;
 592                amt_complete = ret;
 593
 594                gossip_debug(GOSSIP_FILE_DEBUG,
 595                             "%s(%pU): AFTER wait_for_io: offset is %d\n",
 596                             __func__,
 597                             handle,
 598                             (int)*offset);
 599
 600                /*
 601                 * if we got a short I/O operations,
 602                 * fall out and return what we got so far
 603                 */
 604                if (amt_complete < each_count)
 605                        break;
 606        } /*end while */
 607
 608out:
 609        if (total_count > 0)
 610                ret = total_count;
 611        if (ret > 0) {
 612                if (type == ORANGEFS_IO_READ) {
 613                        file_accessed(file);
 614                } else {
 615                        file_update_time(file);
 616                        if (*offset > i_size_read(inode))
 617                                i_size_write(inode, *offset);
 618                }
 619        }
 620
 621        gossip_debug(GOSSIP_FILE_DEBUG,
 622                     "%s(%pU): Value(%d) returned.\n",
 623                     __func__,
 624                     handle,
 625                     (int)ret);
 626
 627        return ret;
 628}
 629
 630/** ORANGEFS2 implementation of address space operations */
 631static const struct address_space_operations orangefs_address_operations = {
 632        .writepage = orangefs_writepage,
 633        .readahead = orangefs_readahead,
 634        .readpage = orangefs_readpage,
 635        .writepages = orangefs_writepages,
 636        .set_page_dirty = __set_page_dirty_nobuffers,
 637        .write_begin = orangefs_write_begin,
 638        .write_end = orangefs_write_end,
 639        .invalidatepage = orangefs_invalidatepage,
 640        .releasepage = orangefs_releasepage,
 641        .freepage = orangefs_freepage,
 642        .launder_page = orangefs_launder_page,
 643        .direct_IO = orangefs_direct_IO,
 644};
 645
 646vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
 647{
 648        struct page *page = vmf->page;
 649        struct inode *inode = file_inode(vmf->vma->vm_file);
 650        struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
 651        unsigned long *bitlock = &orangefs_inode->bitlock;
 652        vm_fault_t ret;
 653        struct orangefs_write_range *wr;
 654
 655        sb_start_pagefault(inode->i_sb);
 656
 657        if (wait_on_bit(bitlock, 1, TASK_KILLABLE)) {
 658                ret = VM_FAULT_RETRY;
 659                goto out;
 660        }
 661
 662        lock_page(page);
 663        if (PageDirty(page) && !PagePrivate(page)) {
 664                /*
 665                 * Should be impossible.  If it happens, launder the page
 666                 * since we don't know what's dirty.  This will WARN in
 667                 * orangefs_writepage_locked.
 668                 */
 669                if (orangefs_launder_page(page)) {
 670                        ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
 671                        goto out;
 672                }
 673        }
 674        if (PagePrivate(page)) {
 675                wr = (struct orangefs_write_range *)page_private(page);
 676                if (uid_eq(wr->uid, current_fsuid()) &&
 677                    gid_eq(wr->gid, current_fsgid())) {
 678                        wr->pos = page_offset(page);
 679                        wr->len = PAGE_SIZE;
 680                        goto okay;
 681                } else {
 682                        if (orangefs_launder_page(page)) {
 683                                ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
 684                                goto out;
 685                        }
 686                }
 687        }
 688        wr = kmalloc(sizeof *wr, GFP_KERNEL);
 689        if (!wr) {
 690                ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
 691                goto out;
 692        }
 693        wr->pos = page_offset(page);
 694        wr->len = PAGE_SIZE;
 695        wr->uid = current_fsuid();
 696        wr->gid = current_fsgid();
 697        attach_page_private(page, wr);
 698okay:
 699
 700        file_update_time(vmf->vma->vm_file);
 701        if (page->mapping != inode->i_mapping) {
 702                unlock_page(page);
 703                ret = VM_FAULT_LOCKED|VM_FAULT_NOPAGE;
 704                goto out;
 705        }
 706
 707        /*
 708         * We mark the page dirty already here so that when freeze is in
 709         * progress, we are guaranteed that writeback during freezing will
 710         * see the dirty page and writeprotect it again.
 711         */
 712        set_page_dirty(page);
 713        wait_for_stable_page(page);
 714        ret = VM_FAULT_LOCKED;
 715out:
 716        sb_end_pagefault(inode->i_sb);
 717        return ret;
 718}
 719
 720static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
 721{
 722        struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
 723        struct orangefs_kernel_op_s *new_op;
 724        loff_t orig_size;
 725        int ret = -EINVAL;
 726
 727        gossip_debug(GOSSIP_INODE_DEBUG,
 728                     "%s: %pU: Handle is %pU | fs_id %d | size is %llu\n",
 729                     __func__,
 730                     get_khandle_from_ino(inode),
 731                     &orangefs_inode->refn.khandle,
 732                     orangefs_inode->refn.fs_id,
 733                     iattr->ia_size);
 734
 735        /* Ensure that we have a up to date size, so we know if it changed. */
 736        ret = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_SIZE);
 737        if (ret == -ESTALE)
 738                ret = -EIO;
 739        if (ret) {
 740                gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n",
 741                    __func__, ret);
 742                return ret;
 743        }
 744        orig_size = i_size_read(inode);
 745
 746        /* This is truncate_setsize in a different order. */
 747        truncate_pagecache(inode, iattr->ia_size);
 748        i_size_write(inode, iattr->ia_size);
 749        if (iattr->ia_size > orig_size)
 750                pagecache_isize_extended(inode, orig_size, iattr->ia_size);
 751
 752        new_op = op_alloc(ORANGEFS_VFS_OP_TRUNCATE);
 753        if (!new_op)
 754                return -ENOMEM;
 755
 756        new_op->upcall.req.truncate.refn = orangefs_inode->refn;
 757        new_op->upcall.req.truncate.size = (__s64) iattr->ia_size;
 758
 759        ret = service_operation(new_op,
 760                __func__,
 761                get_interruptible_flag(inode));
 762
 763        /*
 764         * the truncate has no downcall members to retrieve, but
 765         * the status value tells us if it went through ok or not
 766         */
 767        gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret);
 768
 769        op_release(new_op);
 770
 771        if (ret != 0)
 772                return ret;
 773
 774        if (orig_size != i_size_read(inode))
 775                iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
 776
 777        return ret;
 778}
 779
 780int __orangefs_setattr(struct inode *inode, struct iattr *iattr)
 781{
 782        int ret;
 783
 784        if (iattr->ia_valid & ATTR_MODE) {
 785                if (iattr->ia_mode & (S_ISVTX)) {
 786                        if (is_root_handle(inode)) {
 787                                /*
 788                                 * allow sticky bit to be set on root (since
 789                                 * it shows up that way by default anyhow),
 790                                 * but don't show it to the server
 791                                 */
 792                                iattr->ia_mode -= S_ISVTX;
 793                        } else {
 794                                gossip_debug(GOSSIP_UTILS_DEBUG,
 795                                             "User attempted to set sticky bit on non-root directory; returning EINVAL.\n");
 796                                ret = -EINVAL;
 797                                goto out;
 798                        }
 799                }
 800                if (iattr->ia_mode & (S_ISUID)) {
 801                        gossip_debug(GOSSIP_UTILS_DEBUG,
 802                                     "Attempting to set setuid bit (not supported); returning EINVAL.\n");
 803                        ret = -EINVAL;
 804                        goto out;
 805                }
 806        }
 807
 808        if (iattr->ia_valid & ATTR_SIZE) {
 809                ret = orangefs_setattr_size(inode, iattr);
 810                if (ret)
 811                        goto out;
 812        }
 813
 814again:
 815        spin_lock(&inode->i_lock);
 816        if (ORANGEFS_I(inode)->attr_valid) {
 817                if (uid_eq(ORANGEFS_I(inode)->attr_uid, current_fsuid()) &&
 818                    gid_eq(ORANGEFS_I(inode)->attr_gid, current_fsgid())) {
 819                        ORANGEFS_I(inode)->attr_valid = iattr->ia_valid;
 820                } else {
 821                        spin_unlock(&inode->i_lock);
 822                        write_inode_now(inode, 1);
 823                        goto again;
 824                }
 825        } else {
 826                ORANGEFS_I(inode)->attr_valid = iattr->ia_valid;
 827                ORANGEFS_I(inode)->attr_uid = current_fsuid();
 828                ORANGEFS_I(inode)->attr_gid = current_fsgid();
 829        }
 830        setattr_copy(&init_user_ns, inode, iattr);
 831        spin_unlock(&inode->i_lock);
 832        mark_inode_dirty(inode);
 833
 834        if (iattr->ia_valid & ATTR_MODE)
 835                /* change mod on a file that has ACLs */
 836                ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
 837
 838        ret = 0;
 839out:
 840        return ret;
 841}
 842
 843/*
 844 * Change attributes of an object referenced by dentry.
 845 */
 846int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 847                     struct iattr *iattr)
 848{
 849        int ret;
 850        gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n",
 851            dentry);
 852        ret = setattr_prepare(&init_user_ns, dentry, iattr);
 853        if (ret)
 854                goto out;
 855        ret = __orangefs_setattr(d_inode(dentry), iattr);
 856        sync_inode_metadata(d_inode(dentry), 1);
 857out:
 858        gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n",
 859            ret);
 860        return ret;
 861}
 862
 863/*
 864 * Obtain attributes of an object given a dentry
 865 */
 866int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path,
 867                     struct kstat *stat, u32 request_mask, unsigned int flags)
 868{
 869        int ret;
 870        struct inode *inode = path->dentry->d_inode;
 871
 872        gossip_debug(GOSSIP_INODE_DEBUG,
 873                     "orangefs_getattr: called on %pd mask %u\n",
 874                     path->dentry, request_mask);
 875
 876        ret = orangefs_inode_getattr(inode,
 877            request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0);
 878        if (ret == 0) {
 879                generic_fillattr(&init_user_ns, inode, stat);
 880
 881                /* override block size reported to stat */
 882                if (!(request_mask & STATX_SIZE))
 883                        stat->result_mask &= ~STATX_SIZE;
 884
 885                generic_fill_statx_attr(inode, stat);
 886        }
 887        return ret;
 888}
 889
 890int orangefs_permission(struct user_namespace *mnt_userns,
 891                        struct inode *inode, int mask)
 892{
 893        int ret;
 894
 895        if (mask & MAY_NOT_BLOCK)
 896                return -ECHILD;
 897
 898        gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__);
 899
 900        /* Make sure the permission (and other common attrs) are up to date. */
 901        ret = orangefs_inode_getattr(inode, 0);
 902        if (ret < 0)
 903                return ret;
 904
 905        return generic_permission(&init_user_ns, inode, mask);
 906}
 907
 908int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags)
 909{
 910        struct iattr iattr;
 911        gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n",
 912            get_khandle_from_ino(inode));
 913        generic_update_time(inode, time, flags);
 914        memset(&iattr, 0, sizeof iattr);
 915        if (flags & S_ATIME)
 916                iattr.ia_valid |= ATTR_ATIME;
 917        if (flags & S_CTIME)
 918                iattr.ia_valid |= ATTR_CTIME;
 919        if (flags & S_MTIME)
 920                iattr.ia_valid |= ATTR_MTIME;
 921        return __orangefs_setattr(inode, &iattr);
 922}
 923
 924static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 925{
 926        u64 val = 0;
 927        int ret;
 928
 929        gossip_debug(GOSSIP_FILE_DEBUG, "%s: called on %pd\n", __func__,
 930                     dentry);
 931
 932        ret = orangefs_inode_getxattr(d_inode(dentry),
 933                                      "user.pvfs2.meta_hint",
 934                                      &val, sizeof(val));
 935        if (ret < 0 && ret != -ENODATA)
 936                return ret;
 937
 938        gossip_debug(GOSSIP_FILE_DEBUG, "%s: flags=%u\n", __func__, (u32) val);
 939
 940        fileattr_fill_flags(fa, val);
 941        return 0;
 942}
 943
 944static int orangefs_fileattr_set(struct user_namespace *mnt_userns,
 945                                 struct dentry *dentry, struct fileattr *fa)
 946{
 947        u64 val = 0;
 948
 949        gossip_debug(GOSSIP_FILE_DEBUG, "%s: called on %pd\n", __func__,
 950                     dentry);
 951        /*
 952         * ORANGEFS_MIRROR_FL is set internally when the mirroring mode is
 953         * turned on for a file. The user is not allowed to turn on this bit,
 954         * but the bit is present if the user first gets the flags and then
 955         * updates the flags with some new settings. So, we ignore it in the
 956         * following edit. bligon.
 957         */
 958        if (fileattr_has_fsx(fa) ||
 959            (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL | ORANGEFS_MIRROR_FL))) {
 960                gossip_err("%s: only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n",
 961                           __func__);
 962                return -EOPNOTSUPP;
 963        }
 964        val = fa->flags;
 965        gossip_debug(GOSSIP_FILE_DEBUG, "%s: flags=%u\n", __func__, (u32) val);
 966        return orangefs_inode_setxattr(d_inode(dentry),
 967                                       "user.pvfs2.meta_hint",
 968                                       &val, sizeof(val), 0);
 969}
 970
 971/* ORANGEFS2 implementation of VFS inode operations for files */
 972static const struct inode_operations orangefs_file_inode_operations = {
 973        .get_acl = orangefs_get_acl,
 974        .set_acl = orangefs_set_acl,
 975        .setattr = orangefs_setattr,
 976        .getattr = orangefs_getattr,
 977        .listxattr = orangefs_listxattr,
 978        .permission = orangefs_permission,
 979        .update_time = orangefs_update_time,
 980        .fileattr_get = orangefs_fileattr_get,
 981        .fileattr_set = orangefs_fileattr_set,
 982};
 983
 984static int orangefs_init_iops(struct inode *inode)
 985{
 986        inode->i_mapping->a_ops = &orangefs_address_operations;
 987
 988        switch (inode->i_mode & S_IFMT) {
 989        case S_IFREG:
 990                inode->i_op = &orangefs_file_inode_operations;
 991                inode->i_fop = &orangefs_file_operations;
 992                break;
 993        case S_IFLNK:
 994                inode->i_op = &orangefs_symlink_inode_operations;
 995                break;
 996        case S_IFDIR:
 997                inode->i_op = &orangefs_dir_inode_operations;
 998                inode->i_fop = &orangefs_dir_operations;
 999                break;
1000        default:
1001                gossip_debug(GOSSIP_INODE_DEBUG,
1002                             "%s: unsupported mode\n",
1003                             __func__);
1004                return -EINVAL;
1005        }
1006
1007        return 0;
1008}
1009
1010/*
1011 * Given an ORANGEFS object identifier (fsid, handle), convert it into
1012 * a ino_t type that will be used as a hash-index from where the handle will
1013 * be searched for in the VFS hash table of inodes.
1014 */
1015static inline ino_t orangefs_handle_hash(struct orangefs_object_kref *ref)
1016{
1017        if (!ref)
1018                return 0;
1019        return orangefs_khandle_to_ino(&(ref->khandle));
1020}
1021
1022/*
1023 * Called to set up an inode from iget5_locked.
1024 */
1025static int orangefs_set_inode(struct inode *inode, void *data)
1026{
1027        struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
1028        ORANGEFS_I(inode)->refn.fs_id = ref->fs_id;
1029        ORANGEFS_I(inode)->refn.khandle = ref->khandle;
1030        ORANGEFS_I(inode)->attr_valid = 0;
1031        hash_init(ORANGEFS_I(inode)->xattr_cache);
1032        ORANGEFS_I(inode)->mapping_time = jiffies - 1;
1033        ORANGEFS_I(inode)->bitlock = 0;
1034        return 0;
1035}
1036
1037/*
1038 * Called to determine if handles match.
1039 */
1040static int orangefs_test_inode(struct inode *inode, void *data)
1041{
1042        struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data;
1043        struct orangefs_inode_s *orangefs_inode = NULL;
1044
1045        orangefs_inode = ORANGEFS_I(inode);
1046        /* test handles and fs_ids... */
1047        return (!ORANGEFS_khandle_cmp(&(orangefs_inode->refn.khandle),
1048                                &(ref->khandle)) &&
1049                        orangefs_inode->refn.fs_id == ref->fs_id);
1050}
1051
1052/*
1053 * Front-end to lookup the inode-cache maintained by the VFS using the ORANGEFS
1054 * file handle.
1055 *
1056 * @sb: the file system super block instance.
1057 * @ref: The ORANGEFS object for which we are trying to locate an inode.
1058 */
1059struct inode *orangefs_iget(struct super_block *sb,
1060                struct orangefs_object_kref *ref)
1061{
1062        struct inode *inode = NULL;
1063        unsigned long hash;
1064        int error;
1065
1066        hash = orangefs_handle_hash(ref);
1067        inode = iget5_locked(sb,
1068                        hash,
1069                        orangefs_test_inode,
1070                        orangefs_set_inode,
1071                        ref);
1072
1073        if (!inode)
1074                return ERR_PTR(-ENOMEM);
1075
1076        if (!(inode->i_state & I_NEW))
1077                return inode;
1078
1079        error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW);
1080        if (error) {
1081                iget_failed(inode);
1082                return ERR_PTR(error);
1083        }
1084
1085        inode->i_ino = hash;    /* needed for stat etc */
1086        orangefs_init_iops(inode);
1087        unlock_new_inode(inode);
1088
1089        gossip_debug(GOSSIP_INODE_DEBUG,
1090                     "iget handle %pU, fsid %d hash %ld i_ino %lu\n",
1091                     &ref->khandle,
1092                     ref->fs_id,
1093                     hash,
1094                     inode->i_ino);
1095
1096        return inode;
1097}
1098
1099/*
1100 * Allocate an inode for a newly created file and insert it into the inode hash.
1101 */
1102struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
1103                int mode, dev_t dev, struct orangefs_object_kref *ref)
1104{
1105        unsigned long hash = orangefs_handle_hash(ref);
1106        struct inode *inode;
1107        int error;
1108
1109        gossip_debug(GOSSIP_INODE_DEBUG,
1110                     "%s:(sb is %p | MAJOR(dev)=%u | MINOR(dev)=%u mode=%o)\n",
1111                     __func__,
1112                     sb,
1113                     MAJOR(dev),
1114                     MINOR(dev),
1115                     mode);
1116
1117        inode = new_inode(sb);
1118        if (!inode)
1119                return ERR_PTR(-ENOMEM);
1120
1121        orangefs_set_inode(inode, ref);
1122        inode->i_ino = hash;    /* needed for stat etc */
1123
1124        error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW);
1125        if (error)
1126                goto out_iput;
1127
1128        orangefs_init_iops(inode);
1129        inode->i_rdev = dev;
1130
1131        error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
1132        if (error < 0)
1133                goto out_iput;
1134
1135        gossip_debug(GOSSIP_INODE_DEBUG,
1136                     "Initializing ACL's for inode %pU\n",
1137                     get_khandle_from_ino(inode));
1138        orangefs_init_acl(inode, dir);
1139        return inode;
1140
1141out_iput:
1142        iput(inode);
1143        return ERR_PTR(error);
1144}
1145