linux/fs/ntfs/file.c
   1/*
   2 * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
   3 *
   4 * Copyright (c) 2001-2007 Anton Altaparmakov
   5 *
   6 * This program/include file is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU General Public License as published
   8 * by the Free Software Foundation; either version 2 of the License, or
   9 * (at your option) any later version.
  10 *
  11 * This program/include file is distributed in the hope that it will be
  12 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
  13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 * GNU General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program (in the main directory of the Linux-NTFS
  18 * distribution in the file COPYING); if not, write to the Free Software
  19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20 */
  21
  22#include <linux/buffer_head.h>
  23#include <linux/pagemap.h>
  24#include <linux/pagevec.h>
  25#include <linux/sched.h>
  26#include <linux/swap.h>
  27#include <linux/uio.h>
  28#include <linux/writeback.h>
  29
  30#include <asm/page.h>
  31#include <asm/uaccess.h>
  32
  33#include "attrib.h"
  34#include "bitmap.h"
  35#include "inode.h"
  36#include "debug.h"
  37#include "lcnalloc.h"
  38#include "malloc.h"
  39#include "mft.h"
  40#include "ntfs.h"
  41
  42/**
  43 * ntfs_file_open - called when an inode is about to be opened
  44 * @vi:         inode to be opened
  45 * @filp:       file structure describing the inode
  46 *
  47 * Limit file size to the page cache limit on architectures where unsigned long
  48 * is 32-bits. This is the most we can do for now without overflowing the page
  49 * cache page index. Doing it this way means we don't run into problems with
  50 * existing files that are too large. It would be better to allow the user to read
  51 * the beginning of the file but I doubt very much anyone is going to hit this
  52 * check on a 32-bit architecture, so there is no point in adding the extra
  53 * complexity required to support this.
  54 *
  55 * On 64-bit architectures, the check is hopefully optimized away by the
  56 * compiler.
  57 *
  58 * After the check passes, just call generic_file_open() to do its work.
  59 */
  60static int ntfs_file_open(struct inode *vi, struct file *filp)
  61{
  62        if (sizeof(unsigned long) < 8) {
  63                if (i_size_read(vi) > MAX_LFS_FILESIZE)
  64                        return -EOVERFLOW;
  65        }
  66        return generic_file_open(vi, filp);
  67}
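
/*
 * Illustration only (not part of the driver): with the classic 32-bit
 * definition MAX_LFS_FILESIZE == (((loff_t)PAGE_CACHE_SIZE <<
 * (BITS_PER_LONG - 1)) - 1), a 4 KiB page size gives 4096 << 31 =
 * 2^43 - 1 bytes, i.e. just under 8 TiB, the largest size whose page
 * cache index still fits comfortably in a 32-bit unsigned long.  On
 * 64-bit builds sizeof(unsigned long) is 8, so the compiler can drop
 * the branch entirely.
 */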
  68
  69#ifdef NTFS_RW
  70
  71/**
  72 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
  73 * @ni:                 ntfs inode of the attribute to extend
  74 * @new_init_size:      requested new initialized size in bytes
  75 * @cached_page:        store any allocated but unused page here
  76 * @lru_pvec:           lru-buffering pagevec of the caller
  77 *
  78 * Extend the initialized size of an attribute described by the ntfs inode @ni
  79 * to @new_init_size bytes.  This involves zeroing any non-sparse space between
  80 * the old initialized size and @new_init_size both in the page cache and on
  81 * disk (if relevant complete pages are already uptodate in the page cache then
  82 * these are simply marked dirty).
  83 *
  84 * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
  85 * in the resident attribute case, it is tied to the initialized size and, in
  86 * the non-resident attribute case, it may not fall below the initialized size.
  87 *
  88 * Note that if the attribute is resident, we do not need to touch the page
  89 * cache at all.  This is because if the page cache page is not uptodate we
  90 * bring it uptodate later, when doing the write to the mft record since we
  91 * then already have the page mapped.  And if the page is uptodate, the
  92 * non-initialized region will already have been zeroed when the page was
  93 * brought uptodate and the region may in fact already have been overwritten
  94 * with new data via mmap() based writes, so we cannot just zero it.  And since
  95 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
  96 * is unspecified, we choose not to do zeroing and thus we do not need to touch
  97 * the page at all.  For a more detailed explanation see ntfs_truncate() in
  98 * fs/ntfs/inode.c.
  99 *
 100 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
 101 * pages.
 102 *
 103 * Return 0 on success and -errno on error.  In the case that an error is
 104 * encountered it is possible that the initialized size will already have been
 105 * incremented some way towards @new_init_size but it is guaranteed that if
 106 * this is the case, the necessary zeroing will also have happened and that all
 107 * metadata is self-consistent.
 108 *
 109 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be
 110 *          held by the caller.
 111 */
 112static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size,
 113                struct page **cached_page, struct pagevec *lru_pvec)
 114{
 115        s64 old_init_size;
 116        loff_t old_i_size;
 117        pgoff_t index, end_index;
 118        unsigned long flags;
 119        struct inode *vi = VFS_I(ni);
 120        ntfs_inode *base_ni;
 121        MFT_RECORD *m = NULL;
 122        ATTR_RECORD *a;
 123        ntfs_attr_search_ctx *ctx = NULL;
 124        struct address_space *mapping;
 125        struct page *page = NULL;
 126        u8 *kattr;
 127        int err;
 128        u32 attr_len;
 129
 130        read_lock_irqsave(&ni->size_lock, flags);
 131        old_init_size = ni->initialized_size;
 132        old_i_size = i_size_read(vi);
 133        BUG_ON(new_init_size > ni->allocated_size);
 134        read_unlock_irqrestore(&ni->size_lock, flags);
 135        ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
 136                        "old_initialized_size 0x%llx, "
 137                        "new_initialized_size 0x%llx, i_size 0x%llx.",
 138                        vi->i_ino, (unsigned)le32_to_cpu(ni->type),
 139                        (unsigned long long)old_init_size,
 140                        (unsigned long long)new_init_size, old_i_size);
 141        if (!NInoAttr(ni))
 142                base_ni = ni;
 143        else
 144                base_ni = ni->ext.base_ntfs_ino;
 145        /* Use goto to reduce indentation and we need the label below anyway. */
 146        if (NInoNonResident(ni))
 147                goto do_non_resident_extend;
 148        BUG_ON(old_init_size != old_i_size);
 149        m = map_mft_record(base_ni);
 150        if (IS_ERR(m)) {
 151                err = PTR_ERR(m);
 152                m = NULL;
 153                goto err_out;
 154        }
 155        ctx = ntfs_attr_get_search_ctx(base_ni, m);
 156        if (unlikely(!ctx)) {
 157                err = -ENOMEM;
 158                goto err_out;
 159        }
 160        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
 161                        CASE_SENSITIVE, 0, NULL, 0, ctx);
 162        if (unlikely(err)) {
 163                if (err == -ENOENT)
 164                        err = -EIO;
 165                goto err_out;
 166        }
 167        m = ctx->mrec;
 168        a = ctx->attr;
 169        BUG_ON(a->non_resident);
 170        /* The total length of the attribute value. */
 171        attr_len = le32_to_cpu(a->data.resident.value_length);
 172        BUG_ON(old_i_size != (loff_t)attr_len);
 173        /*
 174         * Do the zeroing in the mft record and update the attribute size in
 175         * the mft record.
 176         */
 177        kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
 178        memset(kattr + attr_len, 0, new_init_size - attr_len);
 179        a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
 180        /* Finally, update the sizes in the vfs and ntfs inodes. */
 181        write_lock_irqsave(&ni->size_lock, flags);
 182        i_size_write(vi, new_init_size);
 183        ni->initialized_size = new_init_size;
 184        write_unlock_irqrestore(&ni->size_lock, flags);
 185        goto done;
 186do_non_resident_extend:
 187        /*
 188         * If the new initialized size @new_init_size exceeds the current file
 189         * size (vfs inode->i_size), we need to extend the file size to the
 190         * new initialized size.
 191         */
 192        if (new_init_size > old_i_size) {
 193                m = map_mft_record(base_ni);
 194                if (IS_ERR(m)) {
 195                        err = PTR_ERR(m);
 196                        m = NULL;
 197                        goto err_out;
 198                }
 199                ctx = ntfs_attr_get_search_ctx(base_ni, m);
 200                if (unlikely(!ctx)) {
 201                        err = -ENOMEM;
 202                        goto err_out;
 203                }
 204                err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
 205                                CASE_SENSITIVE, 0, NULL, 0, ctx);
 206                if (unlikely(err)) {
 207                        if (err == -ENOENT)
 208                                err = -EIO;
 209                        goto err_out;
 210                }
 211                m = ctx->mrec;
 212                a = ctx->attr;
 213                BUG_ON(!a->non_resident);
 214                BUG_ON(old_i_size != (loff_t)
 215                                sle64_to_cpu(a->data.non_resident.data_size));
 216                a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
 217                flush_dcache_mft_record_page(ctx->ntfs_ino);
 218                mark_mft_record_dirty(ctx->ntfs_ino);
 219                /* Update the file size in the vfs inode. */
 220                i_size_write(vi, new_init_size);
 221                ntfs_attr_put_search_ctx(ctx);
 222                ctx = NULL;
 223                unmap_mft_record(base_ni);
 224                m = NULL;
 225        }
 226        mapping = vi->i_mapping;
 227        index = old_init_size >> PAGE_CACHE_SHIFT;
 228        end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 229        do {
 230                /*
 231                 * Read the page.  If the page is not present, this will zero
 232                 * the uninitialized regions for us.
 233                 */
 234                page = read_mapping_page(mapping, index, NULL);
 235                if (IS_ERR(page)) {
 236                        err = PTR_ERR(page);
 237                        goto init_err_out;
 238                }
 239                if (unlikely(PageError(page))) {
 240                        page_cache_release(page);
 241                        err = -EIO;
 242                        goto init_err_out;
 243                }
 244                /*
 245                 * Update the initialized size in the ntfs inode.  This is
 246                 * enough to make ntfs_writepage() work.
 247                 */
 248                write_lock_irqsave(&ni->size_lock, flags);
 249                ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT;
 250                if (ni->initialized_size > new_init_size)
 251                        ni->initialized_size = new_init_size;
 252                write_unlock_irqrestore(&ni->size_lock, flags);
 253                /* Set the page dirty so it gets written out. */
 254                set_page_dirty(page);
 255                page_cache_release(page);
 256                /*
 257                 * Play nice with the vm and the rest of the system.  This is
 258                 * very much needed as we can potentially be modifying the
 259                 * initialised size from a very small value to a really huge
 260                 * value, e.g.
 261                 *      f = open(somefile, O_TRUNC);
 262                 *      truncate(f, 10GiB);
 263                 *      seek(f, 10GiB);
 264                 *      write(f, 1);
 265                 * And this would mean we would be marking dirty hundreds of
 266                 * thousands of pages or as in the above example more than
 267                 * two and a half million pages!
 268                 *
 269                 * TODO: For sparse pages could optimize this workload by using
 270                 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit.  This
 271                 * would be set in readpage for sparse pages and here we would
 272                 * not need to mark dirty any pages which have this bit set.
 273                 * The only caveat is that we have to clear the bit everywhere
 274                 * where we allocate any clusters that lie in the page or that
 275                 * contain the page.
 276                 *
 277                 * TODO: An even greater optimization would be for us to only
 278                 * call readpage() on pages which are not in sparse regions as
 279                 * determined from the runlist.  This would greatly reduce the
 280                 * number of pages we read and make dirty in the case of sparse
 281                 * files.
 282                 */
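                /*
                 * Worked example (assuming 4 KiB pages): 10 GiB / 4 KiB =
                 * 2,621,440 page cache pages would pass through this loop,
                 * hence the explicit throttling below.
                 */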
 283                balance_dirty_pages_ratelimited(mapping);
 284                cond_resched();
 285        } while (++index < end_index);
 286        read_lock_irqsave(&ni->size_lock, flags);
 287        BUG_ON(ni->initialized_size != new_init_size);
 288        read_unlock_irqrestore(&ni->size_lock, flags);
 289        /* Now bring the initialized_size in the mft record into sync. */
 290        m = map_mft_record(base_ni);
 291        if (IS_ERR(m)) {
 292                err = PTR_ERR(m);
 293                m = NULL;
 294                goto init_err_out;
 295        }
 296        ctx = ntfs_attr_get_search_ctx(base_ni, m);
 297        if (unlikely(!ctx)) {
 298                err = -ENOMEM;
 299                goto init_err_out;
 300        }
 301        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
 302                        CASE_SENSITIVE, 0, NULL, 0, ctx);
 303        if (unlikely(err)) {
 304                if (err == -ENOENT)
 305                        err = -EIO;
 306                goto init_err_out;
 307        }
 308        m = ctx->mrec;
 309        a = ctx->attr;
 310        BUG_ON(!a->non_resident);
 311        a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
 312done:
 313        flush_dcache_mft_record_page(ctx->ntfs_ino);
 314        mark_mft_record_dirty(ctx->ntfs_ino);
 315        if (ctx)
 316                ntfs_attr_put_search_ctx(ctx);
 317        if (m)
 318                unmap_mft_record(base_ni);
 319        ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
 320                        (unsigned long long)new_init_size, i_size_read(vi));
 321        return 0;
 322init_err_out:
 323        write_lock_irqsave(&ni->size_lock, flags);
 324        ni->initialized_size = old_init_size;
 325        write_unlock_irqrestore(&ni->size_lock, flags);
 326err_out:
 327        if (ctx)
 328                ntfs_attr_put_search_ctx(ctx);
 329        if (m)
 330                unmap_mft_record(base_ni);
 331        ntfs_debug("Failed.  Returning error code %i.", err);
 332        return err;
 333}
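
/*
 * A minimal sketch (illustration only, not verbatim) of how the buffered
 * write path later in this file is expected to use the helper above: if a
 * write begins beyond the current initialized size, the gap must be zeroed
 * first so that reads of the skipped region return zeroes.
 *
 *	read_lock_irqsave(&ni->size_lock, flags);
 *	ll = ni->initialized_size;
 *	read_unlock_irqrestore(&ni->size_lock, flags);
 *	if (pos > ll) {
 *		err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
 *				&lru_pvec);
 *		if (unlikely(err < 0))
 *			goto out;	// abort the write
 *	}
 */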
 334
 335/**
 336 * ntfs_fault_in_pages_readable - fault in a range of userspace pages
 337 *
 338 * Fault a number of userspace pages into pagetables.
 339 *
 340 * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
 341 * with more than two userspace pages as well as handling the single page case
 342 * elegantly.
 343 *
 344 * If you find this difficult to understand, then think of the while loop as
 345 * being equivalent to the following code, except without the integer variable ret:
 346 *
 347 *      do {
 348 *              ret = __get_user(c, uaddr);
 349 *              uaddr += PAGE_SIZE;
 350 *      } while (!ret && uaddr < end);
 351 *
 352 * Note, the final __get_user() may well run out-of-bounds of the user buffer,
 353 * but _not_ out-of-bounds of the page the user buffer belongs to, and since
 354 * this is only a read and not a write, and since it is still in the same page,
 355 * it should not matter and this makes the code much simpler.
 356 */
 357static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
 358                int bytes)
 359{
 360        const char __user *end;
 361        volatile char c;
 362
 363        /* Set @end to the first byte outside the last page we care about. */
 364        end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes);
 365
 366        while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
 367                ;
 368}
 369
 370/**
 371 * ntfs_fault_in_pages_readable_iovec - fault in pages of an iovec array
 372 *
 373 * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
 374 */
 375static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
 376                size_t iov_ofs, int bytes)
 377{
 378        do {
 379                const char __user *buf;
 380                unsigned len;
 381
 382                buf = iov->iov_base + iov_ofs;
 383                len = iov->iov_len - iov_ofs;
 384                if (len > bytes)
 385                        len = bytes;
 386                ntfs_fault_in_pages_readable(buf, len);
 387                bytes -= len;
 388                iov++;
 389                iov_ofs = 0;
 390        } while (bytes);
 391}
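
/*
 * Example with hypothetical values: given the iovec array
 *
 *	{ { .iov_base = buf0, .iov_len = 100 },
 *	  { .iov_base = buf1, .iov_len = 8192 } }
 *
 * a call with @iov_ofs 60 and @bytes 5000 faults in the last 40 bytes of
 * buf0 and then the first 4960 bytes of buf1, resetting @iov_ofs to zero
 * as it advances to the second iovec.
 */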
 392
 393/**
 394 * __ntfs_grab_cache_pages - obtain a number of locked pages
 395 * @mapping:    address space mapping from which to obtain page cache pages
 396 * @index:      starting index in @mapping at which to begin obtaining pages
 397 * @nr_pages:   number of page cache pages to obtain
 398 * @pages:      array of pages in which to return the obtained page cache pages
 399 * @cached_page: allocated but as yet unused page
 400 * @lru_pvec:   lru-buffering pagevec of caller
 401 *
 402 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
 403 * starting at index @index.
 404 *
 405 * If a page is newly created, increment its refcount and add it to the
 406 * caller's lru-buffering pagevec @lru_pvec.
 407 *
 408 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
 409 * are obtained at once instead of just one page and that 0 is returned on
 410 * success and -errno on error.
 411 *
 412 * Note, the page locks are obtained in ascending page index order.
 413 */
 414static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 415                pgoff_t index, const unsigned nr_pages, struct page **pages,
 416                struct page **cached_page, struct pagevec *lru_pvec)
 417{
 418        int err, nr;
 419
 420        BUG_ON(!nr_pages);
 421        err = nr = 0;
 422        do {
 423                pages[nr] = find_lock_page(mapping, index);
 424                if (!pages[nr]) {
 425                        if (!*cached_page) {
 426                                *cached_page = page_cache_alloc(mapping);
 427                                if (unlikely(!*cached_page)) {
 428                                        err = -ENOMEM;
 429                                        goto err_out;
 430                                }
 431                        }
 432                        err = add_to_page_cache(*cached_page, mapping, index,
 433                                        GFP_KERNEL);
 434                        if (unlikely(err)) {
 435                                if (err == -EEXIST)
 436                                        continue;
 437                                goto err_out;
 438                        }
 439                        pages[nr] = *cached_page;
 440                        page_cache_get(*cached_page);
 441                        if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
 442                                __pagevec_lru_add_file(lru_pvec);
 443                        *cached_page = NULL;
 444                }
 445                index++;
 446                nr++;
 447        } while (nr < nr_pages);
 448out:
 449        return err;
 450err_out:
 451        while (nr > 0) {
 452                unlock_page(pages[--nr]);
 453                page_cache_release(pages[nr]);
 454        }
 455        goto out;
 456}
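
/*
 * A minimal usage sketch (illustration only, the variable names are made
 * up): grab the locked pages backing the byte range [pos, pos + bytes)
 * before copying user data into them.
 *
 *	pgoff_t start_idx = pos >> PAGE_CACHE_SHIFT;
 *	unsigned nr = ((pos + bytes + PAGE_CACHE_SIZE - 1) >>
 *			PAGE_CACHE_SHIFT) - start_idx;
 *
 *	err = __ntfs_grab_cache_pages(mapping, start_idx, nr, pages,
 *			&cached_page, &lru_pvec);
 *	if (unlikely(err))
 *		return err;	// no pages are left locked on failure
 *
 * On success all @nr pages are locked, in ascending index order, and any
 * newly created ones have been added to @lru_pvec.
 */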
 457
 458static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
 459{
 460        lock_buffer(bh);
 461        get_bh(bh);
 462        bh->b_end_io = end_buffer_read_sync;
 463        return submit_bh(READ, bh);
 464}
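
/*
 * Note: callers below stash each buffer head submitted through this helper
 * in a small wait[] array and only wait_on_buffer() on them once every page
 * has been prepared, so the reads are allowed to complete in parallel with
 * the rest of the setup.
 */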
 465
 466/**
 467 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
 468 * @pages:      array of destination pages
 469 * @nr_pages:   number of pages in @pages
 470 * @pos:        byte position in file at which the write begins
 471 * @bytes:      number of bytes to be written
 472 *
 473 * This is called for non-resident attributes from ntfs_file_buffered_write()
 474 * with i_mutex held on the inode (@pages[0]->mapping->host).  There are
 475 * @nr_pages pages in @pages which are locked but not kmap()ped.  The source
 476 * data has not yet been copied into the @pages.
 477 * 
 478 * Need to fill any holes with actual clusters, allocate buffers if necessary,
 479 * ensure all the buffers are mapped, and bring uptodate any buffers that are
 480 * only partially being written to.
 481 *
 482 * If @nr_pages is greater than one, we are guaranteed that the cluster size is
 483 * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside
 484 * the same cluster and that they are the entirety of that cluster, and that
 485 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
 486 *
 487 * i_size is not to be modified yet.
 488 *
 489 * Return 0 on success or -errno on error.
 490 */
 491static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
 492                unsigned nr_pages, s64 pos, size_t bytes)
 493{
 494        VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
 495        LCN lcn;
 496        s64 bh_pos, vcn_len, end, initialized_size;
 497        sector_t lcn_block;
 498        struct page *page;
 499        struct inode *vi;
 500        ntfs_inode *ni, *base_ni = NULL;
 501        ntfs_volume *vol;
 502        runlist_element *rl, *rl2;
 503        struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
 504        ntfs_attr_search_ctx *ctx = NULL;
 505        MFT_RECORD *m = NULL;
 506        ATTR_RECORD *a = NULL;
 507        unsigned long flags;
 508        u32 attr_rec_len = 0;
 509        unsigned blocksize, u;
 510        int err, mp_size;
 511        bool rl_write_locked, was_hole, is_retry;
 512        unsigned char blocksize_bits;
 513        struct {
 514                u8 runlist_merged:1;
 515                u8 mft_attr_mapped:1;
 516                u8 mp_rebuilt:1;
 517                u8 attr_switched:1;
 518        } status = { 0, 0, 0, 0 };
 519
 520        BUG_ON(!nr_pages);
 521        BUG_ON(!pages);
 522        BUG_ON(!*pages);
 523        vi = pages[0]->mapping->host;
 524        ni = NTFS_I(vi);
 525        vol = ni->vol;
 526        ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
 527                        "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
 528                        vi->i_ino, ni->type, pages[0]->index, nr_pages,
 529                        (long long)pos, bytes);
 530        blocksize = vol->sb->s_blocksize;
 531        blocksize_bits = vol->sb->s_blocksize_bits;
 532        u = 0;
 533        do {
 534                page = pages[u];
 535                BUG_ON(!page);
 536                /*
 537                 * create_empty_buffers() will create uptodate/dirty buffers if
 538                 * the page is uptodate/dirty.
 539                 */
 540                if (!page_has_buffers(page)) {
 541                        create_empty_buffers(page, blocksize, 0);
 542                        if (unlikely(!page_has_buffers(page)))
 543                                return -ENOMEM;
 544                }
 545        } while (++u < nr_pages);
 546        rl_write_locked = false;
 547        rl = NULL;
 548        err = 0;
 549        vcn = lcn = -1;
 550        vcn_len = 0;
 551        lcn_block = -1;
 552        was_hole = false;
 553        cpos = pos >> vol->cluster_size_bits;
 554        end = pos + bytes;
 555        cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
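        /*
         * Worked example (assuming 4 KiB clusters): pos = 5000, bytes = 3000
         * gives cpos = 5000 >> 12 = 1, end = 8000 and cend =
         * (8000 + 4095) >> 12 = 2, i.e. the write touches only cluster 1
         * (cend is exclusive).
         */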
 556        /*
 557         * Loop over each page and for each page over each buffer.  Use goto to
 558         * reduce indentation.
 559         */
 560        u = 0;
 561do_next_page:
 562        page = pages[u];
 563        bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
 564        bh = head = page_buffers(page);
 565        do {
 566                VCN cdelta;
 567                s64 bh_end;
 568                unsigned bh_cofs;
 569
 570                /* Clear buffer_new on all buffers to reinitialise state. */
 571                if (buffer_new(bh))
 572                        clear_buffer_new(bh);
 573                bh_end = bh_pos + blocksize;
 574                bh_cpos = bh_pos >> vol->cluster_size_bits;
 575                bh_cofs = bh_pos & vol->cluster_size_mask;
 576                if (buffer_mapped(bh)) {
 577                        /*
 578                         * The buffer is already mapped.  If it is uptodate,
 579                         * ignore it.
 580                         */
 581                        if (buffer_uptodate(bh))
 582                                continue;
 583                        /*
 584                         * The buffer is not uptodate.  If the page is uptodate
 585                         * set the buffer uptodate and otherwise ignore it.
 586                         */
 587                        if (PageUptodate(page)) {
 588                                set_buffer_uptodate(bh);
 589                                continue;
 590                        }
 591                        /*
 592                         * Neither the page nor the buffer are uptodate.  If
 593                         * the buffer is only partially being written to, we
 594                         * need to read it in before the write, i.e. now.
 595                         */
 596                        if ((bh_pos < pos && bh_end > pos) ||
 597                                        (bh_pos < end && bh_end > end)) {
 598                                /*
 599                                 * If the buffer is fully or partially within
 600                                 * the initialized size, do an actual read.
 601                                 * Otherwise, simply zero the buffer.
 602                                 */
 603                                read_lock_irqsave(&ni->size_lock, flags);
 604                                initialized_size = ni->initialized_size;
 605                                read_unlock_irqrestore(&ni->size_lock, flags);
 606                                if (bh_pos < initialized_size) {
 607                                        ntfs_submit_bh_for_read(bh);
 608                                        *wait_bh++ = bh;
 609                                } else {
 610                                        zero_user(page, bh_offset(bh),
 611                                                        blocksize);
 612                                        set_buffer_uptodate(bh);
 613                                }
 614                        }
 615                        continue;
 616                }
 617                /* Unmapped buffer.  Need to map it. */
 618                bh->b_bdev = vol->sb->s_bdev;
 619                /*
 620                 * If the current buffer is in the same clusters as the map
 621                 * cache, there is no need to check the runlist again.  The
 622                 * map cache is made up of @vcn, which is the first cached file
 623                 * cluster, @vcn_len which is the number of cached file
 624                 * clusters, @lcn is the device cluster corresponding to @vcn,
 625                 * and @lcn_block is the block number corresponding to @lcn.
 626                 */
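                /*
                 * Worked example (assuming 4 KiB clusters and 512 byte
                 * blocks): a buffer two clusters past @vcn at byte offset
                 * 1024 inside its cluster gets b_blocknr = lcn_block +
                 * (2 << (12 - 9)) + (1024 >> 9) = lcn_block + 18.
                 */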
 627                cdelta = bh_cpos - vcn;
 628                if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
 629map_buffer_cached:
 630                        BUG_ON(lcn < 0);
 631                        bh->b_blocknr = lcn_block +
 632                                        (cdelta << (vol->cluster_size_bits -
 633                                        blocksize_bits)) +
 634                                        (bh_cofs >> blocksize_bits);
 635                        set_buffer_mapped(bh);
 636                        /*
 637                         * If the page is uptodate so is the buffer.  If the
 638                         * buffer is fully outside the write, we ignore it if
 639                         * it was already allocated and we mark it dirty so it
 640                         * gets written out if we allocated it.  On the other
 641                         * hand, if we allocated the buffer but we are not
 642                         * marking it dirty we set buffer_new so we can do
 643                         * error recovery.
 644                         */
 645                        if (PageUptodate(page)) {
 646                                if (!buffer_uptodate(bh))
 647                                        set_buffer_uptodate(bh);
 648                                if (unlikely(was_hole)) {
 649                                        /* We allocated the buffer. */
 650                                        unmap_underlying_metadata(bh->b_bdev,
 651                                                        bh->b_blocknr);
 652                                        if (bh_end <= pos || bh_pos >= end)
 653                                                mark_buffer_dirty(bh);
 654                                        else
 655                                                set_buffer_new(bh);
 656                                }
 657                                continue;
 658                        }
 659                        /* Page is _not_ uptodate. */
 660                        if (likely(!was_hole)) {
 661                                /*
 662                                 * Buffer was already allocated.  If it is not
 663                                 * uptodate and is only partially being written
 664                                 * to, we need to read it in before the write,
 665                                 * i.e. now.
 666                                 */
 667                                if (!buffer_uptodate(bh) && bh_pos < end &&
 668                                                bh_end > pos &&
 669                                                (bh_pos < pos ||
 670                                                bh_end > end)) {
 671                                        /*
 672                                         * If the buffer is fully or partially
 673                                         * within the initialized size, do an
 674                                         * actual read.  Otherwise, simply zero
 675                                         * the buffer.
 676                                         */
 677                                        read_lock_irqsave(&ni->size_lock,
 678                                                        flags);
 679                                        initialized_size = ni->initialized_size;
 680                                        read_unlock_irqrestore(&ni->size_lock,
 681                                                        flags);
 682                                        if (bh_pos < initialized_size) {
 683                                                ntfs_submit_bh_for_read(bh);
 684                                                *wait_bh++ = bh;
 685                                        } else {
 686                                                zero_user(page, bh_offset(bh),
 687                                                                blocksize);
 688                                                set_buffer_uptodate(bh);
 689                                        }
 690                                }
 691                                continue;
 692                        }
 693                        /* We allocated the buffer. */
 694                        unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
 695                        /*
 696                         * If the buffer is fully outside the write, zero it,
 697                         * set it uptodate, and mark it dirty so it gets
 698                         * written out.  If it is partially being written to,
 699                         * zero region surrounding the write but leave it to
 700                         * commit write to do anything else.  Finally, if the
 701                         * buffer is fully being overwritten, do nothing.
 702                         */
 703                        if (bh_end <= pos || bh_pos >= end) {
 704                                if (!buffer_uptodate(bh)) {
 705                                        zero_user(page, bh_offset(bh),
 706                                                        blocksize);
 707                                        set_buffer_uptodate(bh);
 708                                }
 709                                mark_buffer_dirty(bh);
 710                                continue;
 711                        }
 712                        set_buffer_new(bh);
 713                        if (!buffer_uptodate(bh) &&
 714                                        (bh_pos < pos || bh_end > end)) {
 715                                u8 *kaddr;
 716                                unsigned pofs;
 717
 718                                kaddr = kmap_atomic(page, KM_USER0);
 719                                if (bh_pos < pos) {
 720                                        pofs = bh_pos & ~PAGE_CACHE_MASK;
 721                                        memset(kaddr + pofs, 0, pos - bh_pos);
 722                                }
 723                                if (bh_end > end) {
 724                                        pofs = end & ~PAGE_CACHE_MASK;
 725                                        memset(kaddr + pofs, 0, bh_end - end);
 726                                }
 727                                kunmap_atomic(kaddr, KM_USER0);
 728                                flush_dcache_page(page);
 729                        }
 730                        continue;
 731                }
 732                /*
 733                 * Slow path: this is the first buffer in the cluster.  If it
 734                 * is outside allocated size and is not uptodate, zero it and
 735                 * set it uptodate.
 736                 */
 737                read_lock_irqsave(&ni->size_lock, flags);
 738                initialized_size = ni->allocated_size;
 739                read_unlock_irqrestore(&ni->size_lock, flags);
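                /*
                 * Note: the initialized_size variable is reused here to hold
                 * the allocated size read above; buffers beyond it have no
                 * clusters behind them at all.
                 */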
 740                if (bh_pos > initialized_size) {
 741                        if (PageUptodate(page)) {
 742                                if (!buffer_uptodate(bh))
 743                                        set_buffer_uptodate(bh);
 744                        } else if (!buffer_uptodate(bh)) {
 745                                zero_user(page, bh_offset(bh), blocksize);
 746                                set_buffer_uptodate(bh);
 747                        }
 748                        continue;
 749                }
 750                is_retry = false;
 751                if (!rl) {
 752                        down_read(&ni->runlist.lock);
 753retry_remap:
 754                        rl = ni->runlist.rl;
 755                }
 756                if (likely(rl != NULL)) {
 757                        /* Seek to element containing target cluster. */
 758                        while (rl->length && rl[1].vcn <= bh_cpos)
 759                                rl++;
 760                        lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
 761                        if (likely(lcn >= 0)) {
 762                                /*
 763                                 * Successful remap, setup the map cache and
 764                                 * use that to deal with the buffer.
 765                                 */
 766                                was_hole = false;
 767                                vcn = bh_cpos;
 768                                vcn_len = rl[1].vcn - vcn;
 769                                lcn_block = lcn << (vol->cluster_size_bits -
 770                                                blocksize_bits);
 771                                cdelta = 0;
 772                                /*
 773                                 * If the number of remaining clusters touched
 774                                 * by the write is smaller or equal to the
 775                                 * number of cached clusters, unlock the
 776                                 * runlist as the map cache will be used from
 777                                 * now on.
 778                                 */
 779                                if (likely(vcn + vcn_len >= cend)) {
 780                                        if (rl_write_locked) {
 781                                                up_write(&ni->runlist.lock);
 782                                                rl_write_locked = false;
 783                                        } else
 784                                                up_read(&ni->runlist.lock);
 785                                        rl = NULL;
 786                                }
 787                                goto map_buffer_cached;
 788                        }
 789                } else
 790                        lcn = LCN_RL_NOT_MAPPED;
 791                /*
 792                 * If it is not a hole and not out of bounds, the runlist is
 793                 * probably unmapped so try to map it now.
 794                 */
 795                if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
 796                        if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
 797                                /* Attempt to map runlist. */
 798                                if (!rl_write_locked) {
 799                                        /*
 800                                         * We need the runlist locked for
 801                                         * writing, so if it is locked for
 802                                         * reading relock it now and retry in
 803                                         * case it changed whilst we dropped
 804                                         * the lock.
 805                                         */
 806                                        up_read(&ni->runlist.lock);
 807                                        down_write(&ni->runlist.lock);
 808                                        rl_write_locked = true;
 809                                        goto retry_remap;
 810                                }
 811                                err = ntfs_map_runlist_nolock(ni, bh_cpos,
 812                                                NULL);
 813                                if (likely(!err)) {
 814                                        is_retry = true;
 815                                        goto retry_remap;
 816                                }
 817                                /*
 818                                 * If @vcn is out of bounds, pretend @lcn is
 819                                 * LCN_ENOENT.  As long as the buffer is out
 820                                 * of bounds this will work fine.
 821                                 */
 822                                if (err == -ENOENT) {
 823                                        lcn = LCN_ENOENT;
 824                                        err = 0;
 825                                        goto rl_not_mapped_enoent;
 826                                }
 827                        } else
 828                                err = -EIO;
 829                        /* Failed to map the buffer, even after retrying. */
 830                        bh->b_blocknr = -1;
 831                        ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
 832                                        "attribute type 0x%x, vcn 0x%llx, "
 833                                        "vcn offset 0x%x, because its "
 834                                        "location on disk could not be "
 835                                        "determined%s (error code %i).",
 836                                        ni->mft_no, ni->type,
 837                                        (unsigned long long)bh_cpos,
 838                                        (unsigned)bh_pos &
 839                                        vol->cluster_size_mask,
 840                                        is_retry ? " even after retrying" : "",
 841                                        err);
 842                        break;
 843                }
 844rl_not_mapped_enoent:
 845                /*
 846                 * The buffer is in a hole or out of bounds.  We need to fill
 847                 * the hole, unless the buffer is in a cluster which is not
 848                 * touched by the write, in which case we just leave the buffer
 849                 * unmapped.  This can only happen when the cluster size is
 850                 * less than the page cache size.
 851                 */
 852                if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) {
 853                        bh_cend = (bh_end + vol->cluster_size - 1) >>
 854                                        vol->cluster_size_bits;
 855                        if ((bh_cend <= cpos || bh_cpos >= cend)) {
 856                                bh->b_blocknr = -1;
 857                                /*
 858                                 * If the buffer is uptodate we skip it.  If it
 859                                 * is not but the page is uptodate, we can set
 860                                 * the buffer uptodate.  If the page is not
 861                                 * uptodate, we can clear the buffer and set it
 862                                 * uptodate.  Whether this is worthwhile is
 863                                 * debatable and this could be removed.
 864                                 */
 865                                if (PageUptodate(page)) {
 866                                        if (!buffer_uptodate(bh))
 867                                                set_buffer_uptodate(bh);
 868                                } else if (!buffer_uptodate(bh)) {
 869                                        zero_user(page, bh_offset(bh),
 870                                                blocksize);
 871                                        set_buffer_uptodate(bh);
 872                                }
 873                                continue;
 874                        }
 875                }
 876                /*
 877                 * Only a hole can remain here: buffers beyond the allocated
 878                 * size and clusters untouched by the write were dealt with above.
 879                 */
 880                BUG_ON(lcn != LCN_HOLE);
 881                /*
 882                 * We need the runlist locked for writing, so if it is locked
 883                 * for reading relock it now and retry in case it changed
 884                 * whilst we dropped the lock.
 885                 */
 886                BUG_ON(!rl);
 887                if (!rl_write_locked) {
 888                        up_read(&ni->runlist.lock);
 889                        down_write(&ni->runlist.lock);
 890                        rl_write_locked = true;
 891                        goto retry_remap;
 892                }
 893                /* Find the previous last allocated cluster. */
 894                BUG_ON(rl->lcn != LCN_HOLE);
 895                lcn = -1;
 896                rl2 = rl;
 897                while (--rl2 >= ni->runlist.rl) {
 898                        if (rl2->lcn >= 0) {
 899                                lcn = rl2->lcn + rl2->length;
 900                                break;
 901                        }
 902                }
 903                rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
 904                                false);
 905                if (IS_ERR(rl2)) {
 906                        err = PTR_ERR(rl2);
 907                        ntfs_debug("Failed to allocate cluster, error code %i.",
 908                                        err);
 909                        break;
 910                }
 911                lcn = rl2->lcn;
 912                rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
 913                if (IS_ERR(rl)) {
 914                        err = PTR_ERR(rl);
 915                        if (err != -ENOMEM)
 916                                err = -EIO;
 917                        if (ntfs_cluster_free_from_rl(vol, rl2)) {
 918                                ntfs_error(vol->sb, "Failed to release "
 919                                                "allocated cluster in error "
 920                                                "code path.  Run chkdsk to "
 921                                                "recover the lost cluster.");
 922                                NVolSetErrors(vol);
 923                        }
 924                        ntfs_free(rl2);
 925                        break;
 926                }
 927                ni->runlist.rl = rl;
 928                status.runlist_merged = 1;
 929                ntfs_debug("Allocated cluster, lcn 0x%llx.",
 930                                (unsigned long long)lcn);
 931                /* Map and lock the mft record and get the attribute record. */
 932                if (!NInoAttr(ni))
 933                        base_ni = ni;
 934                else
 935                        base_ni = ni->ext.base_ntfs_ino;
 936                m = map_mft_record(base_ni);
 937                if (IS_ERR(m)) {
 938                        err = PTR_ERR(m);
 939                        break;
 940                }
 941                ctx = ntfs_attr_get_search_ctx(base_ni, m);
 942                if (unlikely(!ctx)) {
 943                        err = -ENOMEM;
 944                        unmap_mft_record(base_ni);
 945                        break;
 946                }
 947                status.mft_attr_mapped = 1;
 948                err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
 949                                CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
 950                if (unlikely(err)) {
 951                        if (err == -ENOENT)
 952                                err = -EIO;
 953                        break;
 954                }
 955                m = ctx->mrec;
 956                a = ctx->attr;
 957                /*
 958                 * Find the runlist element with which the attribute extent
 959                 * starts.  Note, we cannot use the _attr_ version because we
 960                 * have mapped the mft record.  That is ok because we know the
 961                 * runlist fragment must be mapped already to have ever gotten
 962                 * here, so we can just use the _rl_ version.
 963                 */
 964                vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
 965                rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
 966                BUG_ON(!rl2);
 967                BUG_ON(!rl2->length);
 968                BUG_ON(rl2->lcn < LCN_HOLE);
 969                highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
 970                /*
 971                 * If @highest_vcn is zero, calculate the real highest_vcn
 972                 * (which can really be zero).
 973                 */
 974                if (!highest_vcn)
 975                        highest_vcn = (sle64_to_cpu(
 976                                        a->data.non_resident.allocated_size) >>
 977                                        vol->cluster_size_bits) - 1;
 978                /*
 979                 * Determine the size of the mapping pairs array for the new
 980                 * extent, i.e. the old extent with the hole filled.
 981                 */
 982                mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
 983                                highest_vcn);
 984                if (unlikely(mp_size <= 0)) {
 985                        if (!(err = mp_size))
 986                                err = -EIO;
 987                        ntfs_debug("Failed to get size for mapping pairs "
 988                                        "array, error code %i.", err);
 989                        break;
 990                }
 991                /*
 992                 * Resize the attribute record to fit the new mapping pairs
 993                 * array.
 994                 */
 995                attr_rec_len = le32_to_cpu(a->length);
 996                err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
 997                                a->data.non_resident.mapping_pairs_offset));
 998                if (unlikely(err)) {
 999                        BUG_ON(err != -ENOSPC);
1000                        // TODO: Deal with this by using the current attribute
1001                        // and fill it with as much of the mapping pairs
1002                        // array as possible.  Then loop over each attribute
1003                        // extent rewriting the mapping pairs arrays as we go
1004                        // along and if when we reach the end we have not
1005                        // enough space, try to resize the last attribute
1006                        // extent and if even that fails, add a new attribute
1007                        // extent.
1008                        // We could also try to resize at each step in the hope
1009                        // that we will not need to rewrite every single extent.
1010                        // Note, we may need to decompress some extents to fill
1011                        // the runlist as we are walking the extents...
1012                        ntfs_error(vol->sb, "Not enough space in the mft "
1013                                        "record for the extended attribute "
1014                                        "record.  This case is not "
1015                                        "implemented yet.");
1016                        err = -EOPNOTSUPP;
1017                        break;
1018                }
1019                status.mp_rebuilt = 1;
1020                /*
1021                 * Generate the mapping pairs array directly into the attribute
1022                 * record.
1023                 */
1024                err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1025                                a->data.non_resident.mapping_pairs_offset),
1026                                mp_size, rl2, vcn, highest_vcn, NULL);
1027                if (unlikely(err)) {
1028                        ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
1029                                        "attribute type 0x%x, because building "
1030                                        "the mapping pairs failed with error "
1031                                        "code %i.", vi->i_ino,
1032                                        (unsigned)le32_to_cpu(ni->type), err);
1033                        err = -EIO;
1034                        break;
1035                }
1036                /* Update the highest_vcn but only if it was not set. */
1037                if (unlikely(!a->data.non_resident.highest_vcn))
1038                        a->data.non_resident.highest_vcn =
1039                                        cpu_to_sle64(highest_vcn);
1040                /*
1041                 * If the attribute is sparse/compressed, update the compressed
1042                 * size in the ntfs_inode structure and the attribute record.
1043                 */
1044                if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
1045                        /*
1046                         * If we are not in the first attribute extent, switch
1047                         * to it, but first ensure the changes will make it to
1048                         * disk later.
1049                         */
1050                        if (a->data.non_resident.lowest_vcn) {
1051                                flush_dcache_mft_record_page(ctx->ntfs_ino);
1052                                mark_mft_record_dirty(ctx->ntfs_ino);
1053                                ntfs_attr_reinit_search_ctx(ctx);
1054                                err = ntfs_attr_lookup(ni->type, ni->name,
1055                                                ni->name_len, CASE_SENSITIVE,
1056                                                0, NULL, 0, ctx);
1057                                if (unlikely(err)) {
1058                                        status.attr_switched = 1;
1059                                        break;
1060                                }
1061                                /* @m is not used any more so do not set it. */
1062                                a = ctx->attr;
1063                        }
1064                        write_lock_irqsave(&ni->size_lock, flags);
1065                        ni->itype.compressed.size += vol->cluster_size;
1066                        a->data.non_resident.compressed_size =
1067                                        cpu_to_sle64(ni->itype.compressed.size);
1068                        write_unlock_irqrestore(&ni->size_lock, flags);
1069                }
1070                /* Ensure the changes make it to disk. */
1071                flush_dcache_mft_record_page(ctx->ntfs_ino);
1072                mark_mft_record_dirty(ctx->ntfs_ino);
1073                ntfs_attr_put_search_ctx(ctx);
1074                unmap_mft_record(base_ni);
1075                /* Successfully filled the hole. */
1076                status.runlist_merged = 0;
1077                status.mft_attr_mapped = 0;
1078                status.mp_rebuilt = 0;
1079                /* Setup the map cache and use that to deal with the buffer. */
1080                was_hole = true;
1081                vcn = bh_cpos;
1082                vcn_len = 1;
1083                lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
1084                cdelta = 0;
1085                /*
1086                 * If the number of remaining clusters in the @pages is smaller
1087                 * or equal to the number of cached clusters, unlock the
1088                 * runlist as the map cache will be used from now on.
1089                 */
1090                if (likely(vcn + vcn_len >= cend)) {
1091                        up_write(&ni->runlist.lock);
1092                        rl_write_locked = false;
1093                        rl = NULL;
1094                }
1095                goto map_buffer_cached;
1096        } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1097        /* If there are no errors, do the next page. */
1098        if (likely(!err && ++u < nr_pages))
1099                goto do_next_page;
1100        /* If there are no errors, release the runlist lock if we took it. */
1101        if (likely(!err)) {
1102                if (unlikely(rl_write_locked)) {
1103                        up_write(&ni->runlist.lock);
1104                        rl_write_locked = false;
1105                } else if (unlikely(rl))
1106                        up_read(&ni->runlist.lock);
1107                rl = NULL;
1108        }
1109        /* If we issued read requests, let them complete. */
1110        read_lock_irqsave(&ni->size_lock, flags);
1111        initialized_size = ni->initialized_size;
1112        read_unlock_irqrestore(&ni->size_lock, flags);
1113        while (wait_bh > wait) {
1114                bh = *--wait_bh;
1115                wait_on_buffer(bh);
1116                if (likely(buffer_uptodate(bh))) {
1117                        page = bh->b_page;
1118                        bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) +
1119                                        bh_offset(bh);
1120                        /*
1121                         * If the buffer overflows the initialized size, need
1122                         * to zero the overflowing region.
1123                         */
1124                        if (unlikely(bh_pos + blocksize > initialized_size)) {
1125                                int ofs = 0;
1126
1127                                if (likely(bh_pos < initialized_size))
1128                                        ofs = initialized_size - bh_pos;
1129                                zero_user_segment(page, bh_offset(bh) + ofs,
1130                                                blocksize);
1131                        }
1132                } else /* if (unlikely(!buffer_uptodate(bh))) */
1133                        err = -EIO;
1134        }
1135        if (likely(!err)) {
1136                /* Clear buffer_new on all buffers. */
1137                u = 0;
1138                do {
1139                        bh = head = page_buffers(pages[u]);
1140                        do {
1141                                if (buffer_new(bh))
1142                                        clear_buffer_new(bh);
1143                        } while ((bh = bh->b_this_page) != head);
1144                } while (++u < nr_pages);
1145                ntfs_debug("Done.");
1146                return err;
1147        }
1148        if (status.attr_switched) {
1149                /* Get back to the attribute extent we modified. */
1150                ntfs_attr_reinit_search_ctx(ctx);
1151                if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1152                                CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
1153                        ntfs_error(vol->sb, "Failed to find required "
1154                                        "attribute extent of attribute in "
1155                                        "error code path.  Run chkdsk to "
1156                                        "recover.");
1157                        write_lock_irqsave(&ni->size_lock, flags);
1158                        ni->itype.compressed.size += vol->cluster_size;
1159                        write_unlock_irqrestore(&ni->size_lock, flags);
1160                        flush_dcache_mft_record_page(ctx->ntfs_ino);
1161                        mark_mft_record_dirty(ctx->ntfs_ino);
1162                        /*
1163                         * The only thing that is now wrong is the compressed
1164                         * size of the base attribute extent which chkdsk
1165                         * should be able to fix.
1166                         */
1167                        NVolSetErrors(vol);
1168                } else {
1169                        m = ctx->mrec;
1170                        a = ctx->attr;
1171                        status.attr_switched = 0;
1172                }
1173        }
1174        /*
1175         * If the runlist has been modified, need to restore it by punching a
1176         * hole into it and we then need to deallocate the on-disk cluster as
1177         * well.  Note, we only modify the runlist if we are able to generate a
1178         * new mapping pairs array, i.e. only when the mapped attribute extent
1179         * is not switched.
1180         */
1181        if (status.runlist_merged && !status.attr_switched) {
1182                BUG_ON(!rl_write_locked);
1183                /* Make the file cluster we allocated sparse in the runlist. */
1184                if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
1185                        ntfs_error(vol->sb, "Failed to punch hole into "
1186                                        "attribute runlist in error code "
1187                                        "path.  Run chkdsk to recover the "
1188                                        "lost cluster.");
1189                        NVolSetErrors(vol);
1190                } else /* if (success) */ {
1191                        status.runlist_merged = 0;
1192                        /*
1193                         * Deallocate the on-disk cluster we allocated but only
1194                         * if we succeeded in punching its vcn out of the
1195                         * runlist.
1196                         */
1197                        down_write(&vol->lcnbmp_lock);
1198                        if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
1199                                ntfs_error(vol->sb, "Failed to release "
1200                                                "allocated cluster in error "
1201                                                "code path.  Run chkdsk to "
1202                                                "recover the lost cluster.");
1203                                NVolSetErrors(vol);
1204                        }
1205                        up_write(&vol->lcnbmp_lock);
1206                }
1207        }
1208        /*
1209         * Resize the attribute record to its old size and rebuild the mapping
1210         * pairs array.  Note, we only can do this if the runlist has been
1211         * restored to its old state which also implies that the mapped
1212         * attribute extent is not switched.
1213         */
1214        if (status.mp_rebuilt && !status.runlist_merged) {
1215                if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
1216                        ntfs_error(vol->sb, "Failed to restore attribute "
1217                                        "record in error code path.  Run "
1218                                        "chkdsk to recover.");
1219                        NVolSetErrors(vol);
1220                } else /* if (success) */ {
1221                        if (ntfs_mapping_pairs_build(vol, (u8*)a +
1222                                        le16_to_cpu(a->data.non_resident.
1223                                        mapping_pairs_offset), attr_rec_len -
1224                                        le16_to_cpu(a->data.non_resident.
1225                                        mapping_pairs_offset), ni->runlist.rl,
1226                                        vcn, highest_vcn, NULL)) {
1227                                ntfs_error(vol->sb, "Failed to restore "
1228                                                "mapping pairs array in error "
1229                                                "code path.  Run chkdsk to "
1230                                                "recover.");
1231                                NVolSetErrors(vol);
1232                        }
1233                        flush_dcache_mft_record_page(ctx->ntfs_ino);
1234                        mark_mft_record_dirty(ctx->ntfs_ino);
1235                }
1236        }
1237        /* Release the mft record and the attribute. */
1238        if (status.mft_attr_mapped) {
1239                ntfs_attr_put_search_ctx(ctx);
1240                unmap_mft_record(base_ni);
1241        }
1242        /* Release the runlist lock. */
1243        if (rl_write_locked)
1244                up_write(&ni->runlist.lock);
1245        else if (rl)
1246                up_read(&ni->runlist.lock);
1247        /*
1248         * Zero out any newly allocated blocks to avoid exposing stale data.
1249         * If BH_New is set, we know that the block was newly allocated above
1250         * and that it has not been fully zeroed and marked dirty yet.
1251         */
1252        nr_pages = u;
1253        u = 0;
1254        end = bh_cpos << vol->cluster_size_bits;
1255        do {
1256                page = pages[u];
1257                bh = head = page_buffers(page);
1258                do {
1259                        if (u == nr_pages &&
1260                                        ((s64)page->index << PAGE_CACHE_SHIFT) +
1261                                        bh_offset(bh) >= end)
1262                                break;
1263                        if (!buffer_new(bh))
1264                                continue;
1265                        clear_buffer_new(bh);
1266                        if (!buffer_uptodate(bh)) {
1267                                if (PageUptodate(page))
1268                                        set_buffer_uptodate(bh);
1269                                else {
1270                                        zero_user(page, bh_offset(bh),
1271                                                        blocksize);
1272                                        set_buffer_uptodate(bh);
1273                                }
1274                        }
1275                        mark_buffer_dirty(bh);
1276                } while ((bh = bh->b_this_page) != head);
1277        } while (++u <= nr_pages);
1278        ntfs_error(vol->sb, "Failed.  Returning error code %i.", err);
1279        return err;
1280}
1281
1282/*
1283 * Copy as much as we can into the pages and return the number of bytes which
1284 * were successfully copied.  If a fault is encountered then clear the pages
1285 * out to (ofs + bytes) and return the number of bytes which were copied.
1286 */
1287static inline size_t ntfs_copy_from_user(struct page **pages,
1288                unsigned nr_pages, unsigned ofs, const char __user *buf,
1289                size_t bytes)
1290{
1291        struct page **last_page = pages + nr_pages;
1292        char *addr;
1293        size_t total = 0;
1294        unsigned len;
1295        int left;
1296
1297        do {
1298                len = PAGE_CACHE_SIZE - ofs;
1299                if (len > bytes)
1300                        len = bytes;
1301                addr = kmap_atomic(*pages, KM_USER0);
1302                left = __copy_from_user_inatomic(addr + ofs, buf, len);
1303                kunmap_atomic(addr, KM_USER0);
1304                if (unlikely(left)) {
1305                        /* Do it the slow way. */
1306                        addr = kmap(*pages);
1307                        left = __copy_from_user(addr + ofs, buf, len);
1308                        kunmap(*pages);
1309                        if (unlikely(left))
1310                                goto err_out;
1311                }
1312                total += len;
1313                bytes -= len;
1314                if (!bytes)
1315                        break;
1316                buf += len;
1317                ofs = 0;
1318        } while (++pages < last_page);
1319out:
1320        return total;
1321err_out:
1322        total += len - left;
1323        /* Zero the rest of the target like __copy_from_user(). */
1324        while (++pages < last_page) {
1325                bytes -= len;
1326                if (!bytes)
1327                        break;
1328                len = PAGE_CACHE_SIZE;
1329                if (len > bytes)
1330                        len = bytes;
1331                zero_user(*pages, 0, len);
1332        }
1333        goto out;
1334}
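
/*
 * Editorial sketch (not part of the original file): the helper above
 * guarantees that, even if the user buffer faults partway through, every
 * byte of the destination out to @ofs + @bytes is either copied or zeroed,
 * so the caller can still mark the affected buffers uptodate.  A minimal
 * user-space analogue of that contract; the function name and the explicit
 * fault position are made up for illustration.
 */
#if 0   /* illustration only, never compiled as part of the driver */
#include <stddef.h>
#include <string.h>

/* Copy @len bytes, pretending a fault occurs after @fault_at bytes. */
static size_t copy_or_zero_tail(char *dst, const char *src, size_t len,
                size_t fault_at)
{
        size_t copied = len < fault_at ? len : fault_at;

        memcpy(dst, src, copied);
        /* Emulate __copy_from_user() semantics: zero the uncopied tail. */
        memset(dst + copied, 0, len - copied);
        return copied;  /* bytes actually copied, like @total above */
}
#endif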
1335
1336static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
1337                const struct iovec *iov, size_t iov_ofs, size_t bytes)
1338{
1339        size_t total = 0;
1340
1341        while (1) {
1342                const char __user *buf = iov->iov_base + iov_ofs;
1343                unsigned len;
1344                size_t left;
1345
1346                len = iov->iov_len - iov_ofs;
1347                if (len > bytes)
1348                        len = bytes;
1349                left = __copy_from_user_inatomic(vaddr, buf, len);
1350                total += len;
1351                bytes -= len;
1352                vaddr += len;
1353                if (unlikely(left)) {
1354                        total -= left;
1355                        break;
1356                }
1357                if (!bytes)
1358                        break;
1359                iov++;
1360                iov_ofs = 0;
1361        }
1362        return total;
1363}
1364
1365static inline void ntfs_set_next_iovec(const struct iovec **iovp,
1366                size_t *iov_ofsp, size_t bytes)
1367{
1368        const struct iovec *iov = *iovp;
1369        size_t iov_ofs = *iov_ofsp;
1370
1371        while (bytes) {
1372                unsigned len;
1373
1374                len = iov->iov_len - iov_ofs;
1375                if (len > bytes)
1376                        len = bytes;
1377                bytes -= len;
1378                iov_ofs += len;
1379                if (iov->iov_len == iov_ofs) {
1380                        iov++;
1381                        iov_ofs = 0;
1382                }
1383        }
1384        *iovp = iov;
1385        *iov_ofsp = iov_ofs;
1386}
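
/*
 * Editorial sketch (not part of the original file): ntfs_set_next_iovec()
 * just advances an (iov, offset) cursor by @bytes across segment boundaries.
 * A standalone user-space rehearsal of the same walk, using the ordinary
 * <sys/uio.h> struct iovec and made-up segment sizes:
 */
#if 0   /* illustration only */
#include <stdio.h>
#include <sys/uio.h>

int main(void)
{
        char a[3], b[5];
        struct iovec vec[2] = {
                { .iov_base = a, .iov_len = sizeof(a) },
                { .iov_base = b, .iov_len = sizeof(b) },
        };
        const struct iovec *iov = vec;
        size_t iov_ofs = 0, bytes = 4;  /* consume all of a[] and 1 byte of b[] */

        while (bytes) {
                size_t len = iov->iov_len - iov_ofs;

                if (len > bytes)
                        len = bytes;
                bytes -= len;
                iov_ofs += len;
                if (iov->iov_len == iov_ofs) {
                        iov++;
                        iov_ofs = 0;
                }
        }
        /* Prints "cursor: segment 1, offset 1". */
        printf("cursor: segment %td, offset %zu\n", iov - vec, iov_ofs);
        return 0;
}
#endif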
1387
1388/*
1389 * This has the same side-effects and return value as ntfs_copy_from_user().
1390 * The difference is that on a fault we need to memset the remainder of the
1391 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1392 * single-segment behaviour.
1393 *
1394 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both
1395 * when atomic and when not atomic.  This is ok because
1396 * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic()
1397 * and it is ok to call this when non-atomic.
1398 * In fact, the only difference between __copy_from_user_inatomic() and
1399 * __copy_from_user() is that the latter calls might_sleep() and the former
1400 * should not zero the tail of the buffer on error.  And on many
1401 * architectures __copy_from_user_inatomic() is just defined to
1402 * __copy_from_user() so it makes no difference at all on those architectures.
1403 */
1404static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1405                unsigned nr_pages, unsigned ofs, const struct iovec **iov,
1406                size_t *iov_ofs, size_t bytes)
1407{
1408        struct page **last_page = pages + nr_pages;
1409        char *addr;
1410        size_t copied, len, total = 0;
1411
1412        do {
1413                len = PAGE_CACHE_SIZE - ofs;
1414                if (len > bytes)
1415                        len = bytes;
1416                addr = kmap_atomic(*pages, KM_USER0);
1417                copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
1418                                *iov, *iov_ofs, len);
1419                kunmap_atomic(addr, KM_USER0);
1420                if (unlikely(copied != len)) {
1421                        /* Do it the slow way. */
1422                        addr = kmap(*pages);
1423                        copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
1424                                        *iov, *iov_ofs, len);
1425                        /*
1426                         * Zero the rest of the target like __copy_from_user().
1427                         */
1428                        memset(addr + ofs + copied, 0, len - copied);
1429                        kunmap(*pages);
1430                        if (unlikely(copied != len))
1431                                goto err_out;
1432                }
1433                total += len;
1434                bytes -= len;
1435                if (!bytes)
1436                        break;
1437                ntfs_set_next_iovec(iov, iov_ofs, len);
1438                ofs = 0;
1439        } while (++pages < last_page);
1440out:
1441        return total;
1442err_out:
1443        total += copied;
1444        /* Zero the rest of the target like __copy_from_user(). */
1445        while (++pages < last_page) {
1446                bytes -= len;
1447                if (!bytes)
1448                        break;
1449                len = PAGE_CACHE_SIZE;
1450                if (len > bytes)
1451                        len = bytes;
1452                zero_user(*pages, 0, len);
1453        }
1454        goto out;
1455}
1456
1457static inline void ntfs_flush_dcache_pages(struct page **pages,
1458                unsigned nr_pages)
1459{
1460        BUG_ON(!nr_pages);
1461        /*
1462         * Warning: Do not do the decrement at the same time as the call to
1463         * flush_dcache_page() because it is a NULL macro on i386 and hence the
1464         * decrement never happens so the loop never terminates.
1465         */
1466        do {
1467                --nr_pages;
1468                flush_dcache_page(pages[nr_pages]);
1469        } while (nr_pages > 0);
1470}
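
/*
 * Editorial sketch (not part of the original file): the warning above is
 * about statements of the form "no_op_macro(pages[--nr_pages]);".  Where the
 * macro expands to nothing, the argument (and with it the decrement) is
 * discarded, so the loop never terminates.  A hypothetical reduction of the
 * bug being avoided:
 */
#if 0   /* illustration only */
#define flush_like_macro(page)  /* expands to nothing on some architectures */

static void broken_flush_all(void **pages, unsigned nr_pages)
{
        do {
                /* The --nr_pages vanishes with the macro: endless loop. */
                flush_like_macro(pages[--nr_pages]);
        } while (nr_pages > 0);
}
#endif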
1471
1472/**
1473 * ntfs_commit_pages_after_non_resident_write - commit the received data
1474 * @pages:      array of destination pages
1475 * @nr_pages:   number of pages in @pages
1476 * @pos:        byte position in file at which the write begins
1477 * @bytes:      number of bytes to be written
1478 *
1479 * See description of ntfs_commit_pages_after_write(), below.
1480 */
1481static inline int ntfs_commit_pages_after_non_resident_write(
1482                struct page **pages, const unsigned nr_pages,
1483                s64 pos, size_t bytes)
1484{
1485        s64 end, initialized_size;
1486        struct inode *vi;
1487        ntfs_inode *ni, *base_ni;
1488        struct buffer_head *bh, *head;
1489        ntfs_attr_search_ctx *ctx;
1490        MFT_RECORD *m;
1491        ATTR_RECORD *a;
1492        unsigned long flags;
1493        unsigned blocksize, u;
1494        int err;
1495
1496        vi = pages[0]->mapping->host;
1497        ni = NTFS_I(vi);
1498        blocksize = vi->i_sb->s_blocksize;
1499        end = pos + bytes;
1500        u = 0;
1501        do {
1502                s64 bh_pos;
1503                struct page *page;
1504                bool partial;
1505
1506                page = pages[u];
1507                bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
1508                bh = head = page_buffers(page);
1509                partial = false;
1510                do {
1511                        s64 bh_end;
1512
1513                        bh_end = bh_pos + blocksize;
1514                        if (bh_end <= pos || bh_pos >= end) {
1515                                if (!buffer_uptodate(bh))
1516                                        partial = true;
1517                        } else {
1518                                set_buffer_uptodate(bh);
1519                                mark_buffer_dirty(bh);
1520                        }
1521                } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1522                /*
1523                 * If all buffers are now uptodate but the page is not, set the
1524                 * page uptodate.
1525                 */
1526                if (!partial && !PageUptodate(page))
1527                        SetPageUptodate(page);
1528        } while (++u < nr_pages);
1529        /*
1530         * Finally, if we do not need to update initialized_size or i_size we
1531         * are finished.
1532         */
1533        read_lock_irqsave(&ni->size_lock, flags);
1534        initialized_size = ni->initialized_size;
1535        read_unlock_irqrestore(&ni->size_lock, flags);
1536        if (end <= initialized_size) {
1537                ntfs_debug("Done.");
1538                return 0;
1539        }
1540        /*
1541         * Update initialized_size/i_size as appropriate, both in the inode and
1542         * the mft record.
1543         */
1544        if (!NInoAttr(ni))
1545                base_ni = ni;
1546        else
1547                base_ni = ni->ext.base_ntfs_ino;
1548        /* Map, pin, and lock the mft record. */
1549        m = map_mft_record(base_ni);
1550        if (IS_ERR(m)) {
1551                err = PTR_ERR(m);
1552                m = NULL;
1553                ctx = NULL;
1554                goto err_out;
1555        }
1556        BUG_ON(!NInoNonResident(ni));
1557        ctx = ntfs_attr_get_search_ctx(base_ni, m);
1558        if (unlikely(!ctx)) {
1559                err = -ENOMEM;
1560                goto err_out;
1561        }
1562        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1563                        CASE_SENSITIVE, 0, NULL, 0, ctx);
1564        if (unlikely(err)) {
1565                if (err == -ENOENT)
1566                        err = -EIO;
1567                goto err_out;
1568        }
1569        a = ctx->attr;
1570        BUG_ON(!a->non_resident);
1571        write_lock_irqsave(&ni->size_lock, flags);
1572        BUG_ON(end > ni->allocated_size);
1573        ni->initialized_size = end;
1574        a->data.non_resident.initialized_size = cpu_to_sle64(end);
1575        if (end > i_size_read(vi)) {
1576                i_size_write(vi, end);
1577                a->data.non_resident.data_size =
1578                                a->data.non_resident.initialized_size;
1579        }
1580        write_unlock_irqrestore(&ni->size_lock, flags);
1581        /* Mark the mft record dirty, so it gets written back. */
1582        flush_dcache_mft_record_page(ctx->ntfs_ino);
1583        mark_mft_record_dirty(ctx->ntfs_ino);
1584        ntfs_attr_put_search_ctx(ctx);
1585        unmap_mft_record(base_ni);
1586        ntfs_debug("Done.");
1587        return 0;
1588err_out:
1589        if (ctx)
1590                ntfs_attr_put_search_ctx(ctx);
1591        if (m)
1592                unmap_mft_record(base_ni);
1593        ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
1594                        "code %i).", err);
1595        if (err != -ENOMEM)
1596                NVolSetErrors(ni->vol);
1597        return err;
1598}
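
/*
 * Editorial sketch (not part of the original file): the per-buffer loop above
 * marks a block uptodate and dirty only if it overlaps the written range
 * [pos, pos + bytes) and leaves @partial set if any untouched block is not
 * yet uptodate.  A user-space rehearsal of that overlap test with made-up
 * sizes (512-byte blocks, 4096-byte page, a 700-byte write):
 */
#if 0   /* illustration only */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        const long long blocksize = 512, page_size = 4096, page_start = 4096;
        const long long pos = 4200, end = 4900;
        long long bh_pos;

        for (bh_pos = page_start; bh_pos < page_start + page_size;
                        bh_pos += blocksize) {
                bool written = !(bh_pos + blocksize <= pos || bh_pos >= end);

                printf("block at %lld: %s\n", bh_pos,
                                written ? "uptodate + dirty" : "left alone");
        }
        return 0;
}
#endif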
1599
1600/**
1601 * ntfs_commit_pages_after_write - commit the received data
1602 * @pages:      array of destination pages
1603 * @nr_pages:   number of pages in @pages
1604 * @pos:        byte position in file at which the write begins
1605 * @bytes:      number of bytes to be written
1606 *
1607 * This is called from ntfs_file_buffered_write() with i_mutex held on the inode
1608 * (@pages[0]->mapping->host).  There are @nr_pages pages in @pages which are
1609 * locked but not kmap()ped.  The source data has already been copied into the
1610 * @pages.  ntfs_prepare_pages_for_non_resident_write() has been called before
1611 * the data was copied (for non-resident attributes only) and it returned
1612 * success.
1613 *
1614 * Need to set uptodate and mark dirty all buffers within the boundary of the
1615 * write.  If all buffers in a page are uptodate we set the page uptodate, too.
1616 *
1617 * Setting the buffers dirty ensures that they get written out later when
1618 * ntfs_writepage() is invoked by the VM.
1619 *
1620 * Finally, we need to update i_size and initialized_size as appropriate both
1621 * in the inode and the mft record.
1622 *
1623 * This is modelled after fs/buffer.c::generic_commit_write(), which marks
1624 * buffers uptodate and dirty, sets the page uptodate if all buffers in the
1625 * page are uptodate, and updates i_size if the end of io is beyond i_size.  In
1626 * that case, it also marks the inode dirty.
1627 *
1628 * If things have gone as outlined in
1629 * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
1630 * content modifications here for non-resident attributes.  For resident
1631 * attributes we need to bring the page uptodate here, which we combine with
1632 * the copying into the mft record which means we save one atomic kmap.
1633 *
1634 * Return 0 on success or -errno on error.
1635 */
1636static int ntfs_commit_pages_after_write(struct page **pages,
1637                const unsigned nr_pages, s64 pos, size_t bytes)
1638{
1639        s64 end, initialized_size;
1640        loff_t i_size;
1641        struct inode *vi;
1642        ntfs_inode *ni, *base_ni;
1643        struct page *page;
1644        ntfs_attr_search_ctx *ctx;
1645        MFT_RECORD *m;
1646        ATTR_RECORD *a;
1647        char *kattr, *kaddr;
1648        unsigned long flags;
1649        u32 attr_len;
1650        int err;
1651
1652        BUG_ON(!nr_pages);
1653        BUG_ON(!pages);
1654        page = pages[0];
1655        BUG_ON(!page);
1656        vi = page->mapping->host;
1657        ni = NTFS_I(vi);
1658        ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
1659                        "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
1660                        vi->i_ino, ni->type, page->index, nr_pages,
1661                        (long long)pos, bytes);
1662        if (NInoNonResident(ni))
1663                return ntfs_commit_pages_after_non_resident_write(pages,
1664                                nr_pages, pos, bytes);
1665        BUG_ON(nr_pages > 1);
1666        /*
1667         * Attribute is resident, implying it is not compressed, encrypted, or
1668         * sparse.
1669         */
1670        if (!NInoAttr(ni))
1671                base_ni = ni;
1672        else
1673                base_ni = ni->ext.base_ntfs_ino;
1674        BUG_ON(NInoNonResident(ni));
1675        /* Map, pin, and lock the mft record. */
1676        m = map_mft_record(base_ni);
1677        if (IS_ERR(m)) {
1678                err = PTR_ERR(m);
1679                m = NULL;
1680                ctx = NULL;
1681                goto err_out;
1682        }
1683        ctx = ntfs_attr_get_search_ctx(base_ni, m);
1684        if (unlikely(!ctx)) {
1685                err = -ENOMEM;
1686                goto err_out;
1687        }
1688        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1689                        CASE_SENSITIVE, 0, NULL, 0, ctx);
1690        if (unlikely(err)) {
1691                if (err == -ENOENT)
1692                        err = -EIO;
1693                goto err_out;
1694        }
1695        a = ctx->attr;
1696        BUG_ON(a->non_resident);
1697        /* The total length of the attribute value. */
1698        attr_len = le32_to_cpu(a->data.resident.value_length);
1699        i_size = i_size_read(vi);
1700        BUG_ON(attr_len != i_size);
1701        BUG_ON(pos > attr_len);
1702        end = pos + bytes;
1703        BUG_ON(end > le32_to_cpu(a->length) -
1704                        le16_to_cpu(a->data.resident.value_offset));
1705        kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
1706        kaddr = kmap_atomic(page, KM_USER0);
1707        /* Copy the received data from the page to the mft record. */
1708        memcpy(kattr + pos, kaddr + pos, bytes);
1709        /* Update the attribute length if necessary. */
1710        if (end > attr_len) {
1711                attr_len = end;
1712                a->data.resident.value_length = cpu_to_le32(attr_len);
1713        }
1714        /*
1715         * If the page is not uptodate, bring the out-of-bounds area(s)
1716         * uptodate by copying data from the mft record to the page.
1717         */
1718        if (!PageUptodate(page)) {
1719                if (pos > 0)
1720                        memcpy(kaddr, kattr, pos);
1721                if (end < attr_len)
1722                        memcpy(kaddr + end, kattr + end, attr_len - end);
1723                /* Zero the region outside the end of the attribute value. */
1724                memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
1725                flush_dcache_page(page);
1726                SetPageUptodate(page);
1727        }
1728        kunmap_atomic(kaddr, KM_USER0);
1729        /* Update initialized_size/i_size if necessary. */
1730        read_lock_irqsave(&ni->size_lock, flags);
1731        initialized_size = ni->initialized_size;
1732        BUG_ON(end > ni->allocated_size);
1733        read_unlock_irqrestore(&ni->size_lock, flags);
1734        BUG_ON(initialized_size != i_size);
1735        if (end > initialized_size) {
1736                write_lock_irqsave(&ni->size_lock, flags);
1737                ni->initialized_size = end;
1738                i_size_write(vi, end);
1739                write_unlock_irqrestore(&ni->size_lock, flags);
1740        }
1741        /* Mark the mft record dirty, so it gets written back. */
1742        flush_dcache_mft_record_page(ctx->ntfs_ino);
1743        mark_mft_record_dirty(ctx->ntfs_ino);
1744        ntfs_attr_put_search_ctx(ctx);
1745        unmap_mft_record(base_ni);
1746        ntfs_debug("Done.");
1747        return 0;
1748err_out:
1749        if (err == -ENOMEM) {
1750                ntfs_warning(vi->i_sb, "Error allocating memory required to "
1751                                "commit the write.");
1752                if (PageUptodate(page)) {
1753                        ntfs_warning(vi->i_sb, "Page is uptodate, setting "
1754                                        "dirty so the write will be retried "
1755                                        "later on by the VM.");
1756                        /*
1757                         * Put the page on mapping->dirty_pages, but leave its
1758                         * buffers' dirty state as-is.
1759                         */
1760                        __set_page_dirty_nobuffers(page);
1761                        err = 0;
1762                } else
1763                        ntfs_error(vi->i_sb, "Page is not uptodate.  Written "
1764                                        "data has been lost.");
1765        } else {
1766                ntfs_error(vi->i_sb, "Resident attribute commit write failed "
1767                                "with error %i.", err);
1768                NVolSetErrors(ni->vol);
1769        }
1770        if (ctx)
1771                ntfs_attr_put_search_ctx(ctx);
1772        if (m)
1773                unmap_mft_record(base_ni);
1774        return err;
1775}
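
/*
 * Editorial sketch (not part of the original file): for the resident case the
 * commit boils down to three byte-range operations between the page and the
 * attribute value held in the mft record (@kattr above).  The helper name and
 * the explicit @page_uptodate flag below are made up for illustration:
 */
#if 0   /* illustration only */
#include <string.h>

static void commit_resident_sketch(char *kattr, char *kaddr, size_t pos,
                size_t bytes, size_t attr_len, size_t page_size,
                int page_uptodate)
{
        size_t end = pos + bytes;

        /* 1. New data always flows from the page into the attribute value. */
        memcpy(kattr + pos, kaddr + pos, bytes);
        if (end > attr_len)
                attr_len = end;
        if (!page_uptodate) {
                /* 2. Old data fills the rest of the page from the record. */
                memcpy(kaddr, kattr, pos);
                if (end < attr_len)
                        memcpy(kaddr + end, kattr + end, attr_len - end);
                /* 3. Zero the page tail beyond the attribute value. */
                memset(kaddr + attr_len, 0, page_size - attr_len);
        }
}
#endif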
1776
1777/**
1778 * ntfs_file_buffered_write - write data to a file via the page cache
1779 *
1780 * Locking: The vfs is holding ->i_mutex on the inode.
1781 */
1782static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1783                const struct iovec *iov, unsigned long nr_segs,
1784                loff_t pos, loff_t *ppos, size_t count)
1785{
1786        struct file *file = iocb->ki_filp;
1787        struct address_space *mapping = file->f_mapping;
1788        struct inode *vi = mapping->host;
1789        ntfs_inode *ni = NTFS_I(vi);
1790        ntfs_volume *vol = ni->vol;
1791        struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
1792        struct page *cached_page = NULL;
1793        char __user *buf = NULL;
1794        s64 end, ll;
1795        VCN last_vcn;
1796        LCN lcn;
1797        unsigned long flags;
1798        size_t bytes, iov_ofs = 0;      /* Offset in the current iovec. */
1799        ssize_t status, written;
1800        unsigned nr_pages;
1801        int err;
1802        struct pagevec lru_pvec;
1803
1804        ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1805                        "pos 0x%llx, count 0x%lx.",
1806                        vi->i_ino, (unsigned)le32_to_cpu(ni->type),
1807                        (unsigned long long)pos, (unsigned long)count);
1808        if (unlikely(!count))
1809                return 0;
1810        BUG_ON(NInoMstProtected(ni));
1811        /*
1812         * If the attribute is not an index root and it is encrypted or
1813         * compressed, we cannot write to it yet.  Note we need to check for
1814         * AT_INDEX_ALLOCATION since this is the type of both directory and
1815         * index inodes.
1816         */
1817        if (ni->type != AT_INDEX_ALLOCATION) {
1818                /* If file is encrypted, deny access, just like NT4. */
1819                if (NInoEncrypted(ni)) {
1820                        /*
1821                         * Reminder for later: Encrypted files are _always_
1822                         * non-resident so that the content can always be
1823                         * encrypted.
1824                         */
1825                        ntfs_debug("Denying write access to encrypted file.");
1826                        return -EACCES;
1827                }
1828                if (NInoCompressed(ni)) {
1829                        /* Only unnamed $DATA attribute can be compressed. */
1830                        BUG_ON(ni->type != AT_DATA);
1831                        BUG_ON(ni->name_len);
1832                        /*
1833                         * Reminder for later: If resident, the data is not
1834                         * actually compressed.  Only on the switch to non-
1835                         * resident does compression kick in.  This is in
1836                         * contrast to encrypted files (see above).
1837                         */
1838                        ntfs_error(vi->i_sb, "Writing to compressed files is "
1839                                        "not implemented yet.  Sorry.");
1840                        return -EOPNOTSUPP;
1841                }
1842        }
1843        /*
1844         * If a previous ntfs_truncate() failed, repeat it and abort if it
1845         * fails again.
1846         */
1847        if (unlikely(NInoTruncateFailed(ni))) {
1848                down_write(&vi->i_alloc_sem);
1849                err = ntfs_truncate(vi);
1850                up_write(&vi->i_alloc_sem);
1851                if (err || NInoTruncateFailed(ni)) {
1852                        if (!err)
1853                                err = -EIO;
1854                        ntfs_error(vol->sb, "Cannot perform write to inode "
1855                                        "0x%lx, attribute type 0x%x, because "
1856                                        "ntfs_truncate() failed (error code "
1857                                        "%i).", vi->i_ino,
1858                                        (unsigned)le32_to_cpu(ni->type), err);
1859                        return err;
1860                }
1861        }
1862        /* The first byte after the write. */
1863        end = pos + count;
1864        /*
1865         * If the write goes beyond the allocated size, extend the allocation
1866         * to cover the whole of the write, rounded up to the nearest cluster.
1867         */
1868        read_lock_irqsave(&ni->size_lock, flags);
1869        ll = ni->allocated_size;
1870        read_unlock_irqrestore(&ni->size_lock, flags);
1871        if (end > ll) {
1872                /* Extend the allocation without changing the data size. */
1873                ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
1874                if (likely(ll >= 0)) {
1875                        BUG_ON(pos >= ll);
1876                        /* If the extension was partial truncate the write. */
1877                        if (end > ll) {
1878                                ntfs_debug("Truncating write to inode 0x%lx, "
1879                                                "attribute type 0x%x, because "
1880                                                "the allocation was only "
1881                                                "partially extended.",
1882                                                vi->i_ino, (unsigned)
1883                                                le32_to_cpu(ni->type));
1884                                end = ll;
1885                                count = ll - pos;
1886                        }
1887                } else {
1888                        err = ll;
1889                        read_lock_irqsave(&ni->size_lock, flags);
1890                        ll = ni->allocated_size;
1891                        read_unlock_irqrestore(&ni->size_lock, flags);
1892                        /* Perform a partial write if possible or fail. */
1893                        if (pos < ll) {
1894                                ntfs_debug("Truncating write to inode 0x%lx, "
1895                                                "attribute type 0x%x, because "
1896                                                "extending the allocation "
1897                                                "failed (error code %i).",
1898                                                vi->i_ino, (unsigned)
1899                                                le32_to_cpu(ni->type), err);
1900                                end = ll;
1901                                count = ll - pos;
1902                        } else {
1903                                ntfs_error(vol->sb, "Cannot perform write to "
1904                                                "inode 0x%lx, attribute type "
1905                                                "0x%x, because extending the "
1906                                                "allocation failed (error "
1907                                                "code %i).", vi->i_ino,
1908                                                (unsigned)
1909                                                le32_to_cpu(ni->type), err);
1910                                return err;
1911                        }
1912                }
1913        }
1914        pagevec_init(&lru_pvec, 0);
1915        written = 0;
1916        /*
1917         * If the write starts beyond the initialized size, extend it up to the
1918         * beginning of the write and initialize all non-sparse space between
1919         * the old initialized size and the new one.  This automatically also
1920         * increments the vfs inode->i_size to keep it above or equal to the
1921         * initialized_size.
1922         */
1923        read_lock_irqsave(&ni->size_lock, flags);
1924        ll = ni->initialized_size;
1925        read_unlock_irqrestore(&ni->size_lock, flags);
1926        if (pos > ll) {
1927                err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
1928                                &lru_pvec);
1929                if (err < 0) {
1930                        ntfs_error(vol->sb, "Cannot perform write to inode "
1931                                        "0x%lx, attribute type 0x%x, because "
1932                                        "extending the initialized size "
1933                                        "failed (error code %i).", vi->i_ino,
1934                                        (unsigned)le32_to_cpu(ni->type), err);
1935                        status = err;
1936                        goto err_out;
1937                }
1938        }
1939        /*
1940         * Determine the number of pages per cluster for non-resident
1941         * attributes.
1942         */
1943        nr_pages = 1;
1944        if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
1945                nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
1946        /* Finally, perform the actual write. */
1947        last_vcn = -1;
1948        if (likely(nr_segs == 1))
1949                buf = iov->iov_base;
1950        do {
1951                VCN vcn;
1952                pgoff_t idx, start_idx;
1953                unsigned ofs, do_pages, u;
1954                size_t copied;
1955
1956                start_idx = idx = pos >> PAGE_CACHE_SHIFT;
1957                ofs = pos & ~PAGE_CACHE_MASK;
1958                bytes = PAGE_CACHE_SIZE - ofs;
1959                do_pages = 1;
1960                if (nr_pages > 1) {
1961                        vcn = pos >> vol->cluster_size_bits;
1962                        if (vcn != last_vcn) {
1963                                last_vcn = vcn;
1964                                /*
1965                                 * Get the lcn of the vcn the write is in.  If
1966                                 * it is a hole, need to lock down all pages in
1967                                 * the cluster.
1968                                 */
1969                                down_read(&ni->runlist.lock);
1970                                lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
1971                                                vol->cluster_size_bits, false);
1972                                up_read(&ni->runlist.lock);
1973                                if (unlikely(lcn < LCN_HOLE)) {
1974                                        status = -EIO;
1975                                        if (lcn == LCN_ENOMEM)
1976                                                status = -ENOMEM;
1977                                        else
1978                                                ntfs_error(vol->sb, "Cannot "
1979                                                        "perform write to "
1980                                                        "inode 0x%lx, "
1981                                                        "attribute type 0x%x, "
1982                                                        "because the attribute "
1983                                                        "is corrupt.",
1984                                                        vi->i_ino, (unsigned)
1985                                                        le32_to_cpu(ni->type));
1986                                        break;
1987                                }
1988                                if (lcn == LCN_HOLE) {
1989                                        start_idx = (pos & ~(s64)
1990                                                        vol->cluster_size_mask)
1991                                                        >> PAGE_CACHE_SHIFT;
1992                                        bytes = vol->cluster_size - (pos &
1993                                                        vol->cluster_size_mask);
1994                                        do_pages = nr_pages;
1995                                }
1996                        }
1997                }
1998                if (bytes > count)
1999                        bytes = count;
2000                /*
2001                 * Bring in the user page(s) that we will copy from _first_.
2002                 * Otherwise there is a nasty deadlock on copying from the same
2003                 * page(s) as we are writing to, without it/them being marked
2004                 * up-to-date.  Note, at present there is nothing to stop the
2005                 * pages being swapped out between us bringing them into memory
2006                 * and doing the actual copying.
2007                 */
2008                if (likely(nr_segs == 1))
2009                        ntfs_fault_in_pages_readable(buf, bytes);
2010                else
2011                        ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2012                /* Get and lock @do_pages starting at index @start_idx. */
2013                status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2014                                pages, &cached_page, &lru_pvec);
2015                if (unlikely(status))
2016                        break;
2017                /*
2018                 * For non-resident attributes, we need to fill any holes with
2019                 * actual clusters and ensure all buffers are mapped.  We also
2020                 * need to bring uptodate any buffers that are only partially
2021                 * being written to.
2022                 */
2023                if (NInoNonResident(ni)) {
2024                        status = ntfs_prepare_pages_for_non_resident_write(
2025                                        pages, do_pages, pos, bytes);
2026                        if (unlikely(status)) {
2027                                loff_t i_size;
2028
2029                                do {
2030                                        unlock_page(pages[--do_pages]);
2031                                        page_cache_release(pages[do_pages]);
2032                                } while (do_pages);
2033                                /*
2034                                 * The write preparation may have instantiated
2035                                 * allocated space outside i_size.  Trim this
2036                                 * off again.  We can ignore any errors in this
2037                                 * case as we will just be wasting a bit of
2038                                 * allocated space, which is not a disaster.
2039                                 */
2040                                i_size = i_size_read(vi);
2041                                if (pos + bytes > i_size)
2042                                        vmtruncate(vi, i_size);
2043                                break;
2044                        }
2045                }
2046                u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
2047                if (likely(nr_segs == 1)) {
2048                        copied = ntfs_copy_from_user(pages + u, do_pages - u,
2049                                        ofs, buf, bytes);
2050                        buf += copied;
2051                } else
2052                        copied = ntfs_copy_from_user_iovec(pages + u,
2053                                        do_pages - u, ofs, &iov, &iov_ofs,
2054                                        bytes);
2055                ntfs_flush_dcache_pages(pages + u, do_pages - u);
2056                status = ntfs_commit_pages_after_write(pages, do_pages, pos,
2057                                bytes);
2058                if (likely(!status)) {
2059                        written += copied;
2060                        count -= copied;
2061                        pos += copied;
2062                        if (unlikely(copied != bytes))
2063                                status = -EFAULT;
2064                }
2065                do {
2066                        unlock_page(pages[--do_pages]);
2067                        mark_page_accessed(pages[do_pages]);
2068                        page_cache_release(pages[do_pages]);
2069                } while (do_pages);
2070                if (unlikely(status))
2071                        break;
2072                balance_dirty_pages_ratelimited(mapping);
2073                cond_resched();
2074        } while (count);
2075err_out:
2076        *ppos = pos;
2077        if (cached_page)
2078                page_cache_release(cached_page);
2079        pagevec_lru_add_file(&lru_pvec);
2080        ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
2081                        written ? "written" : "status", (unsigned long)written,
2082                        (long)status);
2083        return written ? written : status;
2084}
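
/*
 * Editorial sketch (not part of the original file): when the write lands in a
 * hole, the loop above widens the locked window from a single page to the
 * whole cluster.  With a hypothetical 4096-byte page and 16384-byte cluster,
 * the start index and byte count work out as follows:
 */
#if 0   /* illustration only */
#include <stdio.h>

int main(void)
{
        const long long page_size = 4096, cluster_size = 16384;
        const long long pos = 21000;    /* write position inside a hole */
        long long start_idx = (pos & ~(cluster_size - 1)) / page_size;
        long long bytes = cluster_size - (pos & (cluster_size - 1));

        /* Prints "lock pages from index 4, prepare 11768 bytes". */
        printf("lock pages from index %lld, prepare %lld bytes\n",
                        start_idx, bytes);
        return 0;
}
#endif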
2085
2086/**
2087 * ntfs_file_aio_write_nolock - write to an open file (i_mutex held by the caller)
2088 */
2089static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
2090                const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
2091{
2092        struct file *file = iocb->ki_filp;
2093        struct address_space *mapping = file->f_mapping;
2094        struct inode *inode = mapping->host;
2095        loff_t pos;
2096        size_t count;           /* after file limit checks */
2097        ssize_t written, err;
2098
2099        count = 0;
2100        err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
2101        if (err)
2102                return err;
2103        pos = *ppos;
2104        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2105        /* We can write back this queue in page reclaim. */
2106        current->backing_dev_info = mapping->backing_dev_info;
2107        written = 0;
2108        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2109        if (err)
2110                goto out;
2111        if (!count)
2112                goto out;
2113        err = file_remove_suid(file);
2114        if (err)
2115                goto out;
2116        file_update_time(file);
2117        written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
2118                        count);
2119out:
2120        current->backing_dev_info = NULL;
2121        return written ? written : err;
2122}
2123
2124/**
2125 * ntfs_file_aio_write - write data to an open file
2126 */
2127static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2128                unsigned long nr_segs, loff_t pos)
2129{
2130        struct file *file = iocb->ki_filp;
2131        struct address_space *mapping = file->f_mapping;
2132        struct inode *inode = mapping->host;
2133        ssize_t ret;
2134
2135        BUG_ON(iocb->ki_pos != pos);
2136
2137        mutex_lock(&inode->i_mutex);
2138        ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
2139        mutex_unlock(&inode->i_mutex);
2140        if (ret > 0) {
2141                int err = generic_write_sync(file, pos, ret);
2142                if (err < 0)
2143                        ret = err;
2144        }
2145        return ret;
2146}
2147
2148/**
2149 * ntfs_file_fsync - sync a file to disk
2150 * @filp:       file to be synced
2151 * @dentry:     dentry describing the file to sync
2152 * @datasync:   if non-zero only flush user data and not metadata
2153 *
2154 * Data integrity sync of a file to disk.  Used for fsync, fdatasync, and msync
2155 * system calls.  This function is inspired by fs/buffer.c::file_fsync().
2156 *
2157 * If @datasync is false, write the mft record and all associated extent mft
2158 * records as well as the $DATA attribute and then sync the block device.
2159 *
2160 * If @datasync is true and the attribute is non-resident, we skip the writing
2161 * of the mft record and all associated extent mft records (this might still
2162 * happen due to the write_inode_now() call).
2163 *
2164 * Also, if @datasync is true, we do not wait on the inode to be written out
2165 * but we always wait on the page cache pages to be written out.
2166 *
2167 * Note: In the past @filp could be NULL so we ignore it as we don't need it
2168 * anyway.
2169 *
2170 * Locking: Caller must hold i_mutex on the inode.
2171 *
2172 * TODO: We should probably also write all attribute/index inodes associated
2173 * with this inode but since we have no simple way of getting to them we ignore
2174 * this problem for now.
2175 */
2176static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
2177                int datasync)
2178{
2179        struct inode *vi = dentry->d_inode;
2180        int err, ret = 0;
2181
2182        ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
2183        BUG_ON(S_ISDIR(vi->i_mode));
2184        if (!datasync || !NInoNonResident(NTFS_I(vi)))
2185                ret = ntfs_write_inode(vi, 1);
2186        write_inode_now(vi, !datasync);
2187        /*
2188         * NOTE: If we were to use mapping->private_list (see ext2 and
2189         * fs/buffer.c) for dirty blocks then we could optimize the below to be
2190         * sync_mapping_buffers(vi->i_mapping).
2191         */
2192        err = sync_blockdev(vi->i_sb->s_bdev);
2193        if (unlikely(err && !ret))
2194                ret = err;
2195        if (likely(!ret))
2196                ntfs_debug("Done.");
2197        else
2198                ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
2199                                "%u.", datasync ? "data" : "", vi->i_ino, -ret);
2200        return ret;
2201}
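
/*
 * Editorial sketch (not part of the original file): this entry point is what
 * ultimately services fsync(2) and fdatasync(2) on an NTFS file.  A minimal
 * user-space caller, with a hypothetical path on an NTFS mount:
 */
#if 0   /* illustration only */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/ntfs/some_file", O_WRONLY);

        if (fd < 0)
                return 1;
        if (write(fd, "hello", 5) == 5)
                fsync(fd);      /* reaches ntfs_file_fsync() with datasync == 0 */
        fdatasync(fd);          /* same entry point, datasync == 1 */
        close(fd);
        return 0;
}
#endif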
2202
2203#endif /* NTFS_RW */
2204
2205const struct file_operations ntfs_file_ops = {
2206        .llseek         = generic_file_llseek,   /* Seek inside file. */
2207        .read           = do_sync_read,          /* Read from file. */
2208        .aio_read       = generic_file_aio_read, /* Async read from file. */
2209#ifdef NTFS_RW
2210        .write          = do_sync_write,         /* Write to file. */
2211        .aio_write      = ntfs_file_aio_write,   /* Async write to file. */
2212        /*.release      = ,*/                    /* Last file is closed.  See
2213                                                    fs/ext2/file.c::
2214                                                    ext2_release_file() for
2215                                                    how to use this to discard
2216                                                    preallocated space for
2217                                                    write opened files. */
2218        .fsync          = ntfs_file_fsync,       /* Sync a file to disk. */
2219        /*.aio_fsync    = ,*/                    /* Sync all outstanding async
2220                                                    i/o operations on a
2221                                                    kiocb. */
2222#endif /* NTFS_RW */
2223        /*.ioctl        = ,*/                    /* Perform function on the
2224                                                    mounted filesystem. */
2225        .mmap           = generic_file_mmap,     /* Mmap file. */
2226        .open           = ntfs_file_open,        /* Open file. */
2227        .splice_read    = generic_file_splice_read /* Zero-copy data send with
2228                                                    the data source being on
2229                                                    the ntfs partition.  We do
2230                                                    not need to care about the
2231                                                    data destination. */
2232        /*.sendpage     = ,*/                    /* Zero-copy data send with
2233                                                    the data destination being
2234                                                    on the ntfs partition.  We
2235                                                    do not need to care about
2236                                                    the data source. */
2237};
2238
2239const struct inode_operations ntfs_file_inode_ops = {
2240#ifdef NTFS_RW
2241        .truncate       = ntfs_truncate_vfs,
2242        .setattr        = ntfs_setattr,
2243#endif /* NTFS_RW */
2244};
2245
2246const struct file_operations ntfs_empty_file_ops = {};
2247
2248const struct inode_operations ntfs_empty_inode_ops = {};
2249