// SPDX-License-Identifier: GPL-2.0-or-later
/**
 * aops.c - NTFS kernel address space operations and page cache handling.
 *
 * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
 * Copyright (c) 2002 Richard Russon
 */

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/bit_spinlock.h>
#include <linux/bio.h>

#include "aops.h"
#include "attrib.h"
#include "debug.h"
#include "inode.h"
#include "mft.h"
#include "runlist.h"
#include "types.h"
#include "ntfs.h"

/**
 * ntfs_end_buffer_async_read - async io completion for reading attributes
 * @bh:         buffer head on which io is completed
 * @uptodate:   whether @bh is now uptodate or not
 *
 * Asynchronous I/O completion handler for reading pages belonging to the
 * attribute address space of an inode.  The inodes can either be files or
 * directories or they can be fake inodes describing some attribute.
 *
 * If NInoMstProtected(), perform the post read mst fixups when all IO on the
 * page has been completed and mark the page uptodate or set the error bit on
 * the page.  To determine the size of the records that need fixing up, we
 * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs
 * record size, and index_block_size_bits, to the log(base 2) of the ntfs
 * record size.
 */
static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first, *tmp;
        struct page *page;
        struct inode *vi;
        ntfs_inode *ni;
        int page_uptodate = 1;

        page = bh->b_page;
        vi = page->mapping->host;
        ni = NTFS_I(vi);

        if (likely(uptodate)) {
                loff_t i_size;
                s64 file_ofs, init_size;

                set_buffer_uptodate(bh);

                file_ofs = ((s64)page->index << PAGE_SHIFT) +
                                bh_offset(bh);
                read_lock_irqsave(&ni->size_lock, flags);
                init_size = ni->initialized_size;
                i_size = i_size_read(vi);
                read_unlock_irqrestore(&ni->size_lock, flags);
                if (unlikely(init_size > i_size)) {
                        /* Race with shrinking truncate. */
                        init_size = i_size;
                }
                /* Check for the current buffer head overflowing. */
                if (unlikely(file_ofs + bh->b_size > init_size)) {
                        int ofs;
                        void *kaddr;

                        ofs = 0;
                        if (file_ofs < init_size)
                                ofs = init_size - file_ofs;
                        kaddr = kmap_atomic(page);
                        memset(kaddr + bh_offset(bh) + ofs, 0,
                                        bh->b_size - ofs);
                        flush_dcache_page(page);
                        kunmap_atomic(kaddr);
                }
        } else {
                clear_buffer_uptodate(bh);
                SetPageError(page);
                ntfs_error(ni->vol->sb, "Buffer I/O error, logical block "
                                "0x%llx.", (unsigned long long)bh->b_blocknr);
        }
        first = page_buffers(page);
        spin_lock_irqsave(&first->b_uptodate_lock, flags);
        clear_buffer_async_read(bh);
        unlock_buffer(bh);
        tmp = bh;
        do {
                if (!buffer_uptodate(tmp))
                        page_uptodate = 0;
                if (buffer_async_read(tmp)) {
                        if (likely(buffer_locked(tmp)))
                                goto still_busy;
                        /* Async buffers must be locked. */
                        BUG();
                }
                tmp = tmp->b_this_page;
        } while (tmp != bh);
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        /*
         * If none of the buffers had errors then we can set the page uptodate,
         * but we first have to perform the post read mst fixups, if the
         * attribute is mst protected, i.e. if NInoMstProtected(ni) is true.
         * Note we ignore fixup errors as those are detected when
         * map_mft_record() is called which gives us per record granularity
         * rather than per page granularity.
         */
        if (!NInoMstProtected(ni)) {
                if (likely(page_uptodate && !PageError(page)))
                        SetPageUptodate(page);
        } else {
                u8 *kaddr;
                unsigned int i, recs;
                u32 rec_size;

                rec_size = ni->itype.index.block_size;
                recs = PAGE_SIZE / rec_size;
                /* Should have been verified before we got here... */
                BUG_ON(!recs);
                kaddr = kmap_atomic(page);
                for (i = 0; i < recs; i++)
                        post_read_mst_fixup((NTFS_RECORD*)(kaddr +
                                        i * rec_size), rec_size);
                kunmap_atomic(kaddr);
                flush_dcache_page(page);
                if (likely(page_uptodate && !PageError(page)))
                        SetPageUptodate(page);
        }
        unlock_page(page);
        return;
still_busy:
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        return;
}
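
/*
 * A minimal sketch, for illustration only, of the post read mst fixup that
 * post_read_mst_fixup() applies to each ntfs record above (the real helper
 * lives in mst.c and does more validation).  The update sequence array (usa)
 * starts at usa_ofs inside the record: usa[0] holds the update sequence
 * number (usn) that was stamped into the last two bytes of every
 * NTFS_BLOCK_SIZE (512 byte) chunk when the record was written, and
 * usa[1..usa_count-1] hold the data values those bytes originally contained.
 * The sketch_ name is hypothetical and nothing in the driver calls this.
 */
static inline int sketch_post_read_mst_fixup(NTFS_RECORD *rec, u32 size)
{
        u16 usa_ofs = le16_to_cpu(rec->usa_ofs);
        u16 usa_count = le16_to_cpu(rec->usa_count);
        le16 *usa = (le16*)((u8*)rec + usa_ofs);
        u16 usn = le16_to_cpu(usa[0]);
        u16 i;

        /* usa_count is one usn slot plus one slot per 512 byte chunk. */
        if (size != (u32)(usa_count - 1) * NTFS_BLOCK_SIZE)
                return -EINVAL;
        for (i = 1; i < usa_count; i++) {
                /* The last le16 of the i-th chunk of the record. */
                le16 *data = (le16*)((u8*)rec + i * NTFS_BLOCK_SIZE) - 1;

                /* A mismatch means a torn (incomplete) multi sector write. */
                if (le16_to_cpu(*data) != usn)
                        return -EIO;
                /* Restore the protected data bytes from the usa. */
                *data = usa[i];
        }
        return 0;
}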

/**
 * ntfs_read_block - fill a @page of an address space with data
 * @page:       page cache page to fill with data
 *
 * Fill the page @page of the address space belonging to the @page->host inode.
 * We read each buffer asynchronously and when all buffers are read in, our io
 * completion handler ntfs_end_buffer_async_read(), if required, automatically
 * applies the mst fixups to the page before finally marking it uptodate and
 * unlocking it.
 *
 * We only enforce allocated_size limit because i_size is checked for in
 * generic_file_read().
 *
 * Return 0 on success and -errno on error.
 *
 * Contains an adapted version of fs/buffer.c::block_read_full_page().
 */
static int ntfs_read_block(struct page *page)
{
        loff_t i_size;
        VCN vcn;
        LCN lcn;
        s64 init_size;
        struct inode *vi;
        ntfs_inode *ni;
        ntfs_volume *vol;
        runlist_element *rl;
        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
        sector_t iblock, lblock, zblock;
        unsigned long flags;
        unsigned int blocksize, vcn_ofs;
        int i, nr;
        unsigned char blocksize_bits;

        vi = page->mapping->host;
        ni = NTFS_I(vi);
        vol = ni->vol;

        /* $MFT/$DATA must have its complete runlist in memory at all times. */
        BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));

        blocksize = vol->sb->s_blocksize;
        blocksize_bits = vol->sb->s_blocksize_bits;

        if (!page_has_buffers(page)) {
                create_empty_buffers(page, blocksize, 0);
                if (unlikely(!page_has_buffers(page))) {
                        unlock_page(page);
                        return -ENOMEM;
                }
        }
        bh = head = page_buffers(page);
        BUG_ON(!bh);

        /*
         * We may be racing with truncate.  To avoid some of the problems we
         * now take a snapshot of the various sizes and use those for the whole
         * of the function.  In case of an extending truncate it just means we
         * may leave some buffers unmapped which are now allocated.  This is
         * not a problem since these buffers will just get mapped when a write
         * occurs.  In case of a shrinking truncate, we will detect this later
         * on due to the runlist being incomplete and if the page is being
         * fully truncated, truncate will throw it away as soon as we unlock
         * it so no need to worry what we do with it.
         */
        iblock = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
        read_lock_irqsave(&ni->size_lock, flags);
        lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
        init_size = ni->initialized_size;
        i_size = i_size_read(vi);
        read_unlock_irqrestore(&ni->size_lock, flags);
        if (unlikely(init_size > i_size)) {
                /* Race with shrinking truncate. */
                init_size = i_size;
        }
        zblock = (init_size + blocksize - 1) >> blocksize_bits;

        /* Loop through all the buffers in the page. */
        rl = NULL;
        nr = i = 0;
        do {
                int err = 0;

                if (unlikely(buffer_uptodate(bh)))
                        continue;
                if (unlikely(buffer_mapped(bh))) {
                        arr[nr++] = bh;
                        continue;
                }
                bh->b_bdev = vol->sb->s_bdev;
                /* Is the block within the allowed limits? */
                if (iblock < lblock) {
                        bool is_retry = false;

                        /* Convert iblock into corresponding vcn and offset. */
                        vcn = (VCN)iblock << blocksize_bits >>
                                        vol->cluster_size_bits;
                        vcn_ofs = ((VCN)iblock << blocksize_bits) &
                                        vol->cluster_size_mask;
                        if (!rl) {
lock_retry_remap:
                                down_read(&ni->runlist.lock);
                                rl = ni->runlist.rl;
                        }
                        if (likely(rl != NULL)) {
                                /* Seek to element containing target vcn. */
                                while (rl->length && rl[1].vcn <= vcn)
                                        rl++;
                                lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
                        } else
                                lcn = LCN_RL_NOT_MAPPED;
                        /* Successful remap. */
                        if (lcn >= 0) {
                                /* Setup buffer head to correct block. */
                                bh->b_blocknr = ((lcn << vol->cluster_size_bits)
                                                + vcn_ofs) >> blocksize_bits;
                                set_buffer_mapped(bh);
                                /* Only read initialized data blocks. */
                                if (iblock < zblock) {
                                        arr[nr++] = bh;
                                        continue;
                                }
                                /* Fully non-initialized data block, zero it. */
                                goto handle_zblock;
                        }
                        /* It is a hole, need to zero it. */
                        if (lcn == LCN_HOLE)
                                goto handle_hole;
                        /* If first try and runlist unmapped, map and retry. */
                        if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
                                is_retry = true;
                                /*
                                 * Attempt to map runlist, dropping lock for
                                 * the duration.
                                 */
                                up_read(&ni->runlist.lock);
                                err = ntfs_map_runlist(ni, vcn);
                                if (likely(!err))
                                        goto lock_retry_remap;
                                rl = NULL;
                        } else if (!rl)
                                up_read(&ni->runlist.lock);
                        /*
                         * If buffer is outside the runlist, treat it as a
                         * hole.  This can happen due to concurrent truncate
                         * for example.
                         */
                        if (err == -ENOENT || lcn == LCN_ENOENT) {
                                err = 0;
                                goto handle_hole;
                        }
                        /* Hard error, zero out region. */
                        if (!err)
                                err = -EIO;
                        bh->b_blocknr = -1;
                        SetPageError(page);
                        ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
                                        "attribute type 0x%x, vcn 0x%llx, "
                                        "offset 0x%x because its location on "
                                        "disk could not be determined%s "
                                        "(error code %i).", ni->mft_no,
                                        ni->type, (unsigned long long)vcn,
                                        vcn_ofs, is_retry ? " even after "
                                        "retrying" : "", err);
                }
                /*
                 * Either iblock was outside lblock limits or
                 * ntfs_rl_vcn_to_lcn() returned error.  Just zero that portion
                 * of the page and set the buffer uptodate.
                 */
handle_hole:
                bh->b_blocknr = -1UL;
                clear_buffer_mapped(bh);
handle_zblock:
                zero_user(page, i * blocksize, blocksize);
                if (likely(!err))
                        set_buffer_uptodate(bh);
        } while (i++, iblock++, (bh = bh->b_this_page) != head);

        /* Release the lock if we took it. */
        if (rl)
                up_read(&ni->runlist.lock);

        /* Check we have at least one buffer ready for i/o. */
        if (nr) {
                struct buffer_head *tbh;

                /* Lock the buffers. */
                for (i = 0; i < nr; i++) {
                        tbh = arr[i];
                        lock_buffer(tbh);
                        tbh->b_end_io = ntfs_end_buffer_async_read;
                        set_buffer_async_read(tbh);
                }
                /* Finally, start i/o on the buffers. */
                for (i = 0; i < nr; i++) {
                        tbh = arr[i];
                        if (likely(!buffer_uptodate(tbh)))
                                submit_bh(REQ_OP_READ, 0, tbh);
                        else
                                ntfs_end_buffer_async_read(tbh, 1);
                }
                return 0;
        }
        /* No i/o was scheduled on any of the buffers. */
        if (likely(!PageError(page)))
                SetPageUptodate(page);
        else /* Signal synchronous i/o error. */
                nr = -EIO;
        unlock_page(page);
        return nr;
}
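
/*
 * A minimal sketch, for illustration only, of the remap arithmetic that
 * ntfs_read_block() above performs per buffer: turn a block number within the
 * attribute into a vcn plus a byte offset into that vcn, look the vcn up in
 * the (already mapped) runlist, and derive the device block to read.  The
 * caller is assumed to hold ni->runlist.lock for reading; negative LCN_*
 * codes are passed through so the caller can handle holes and unmapped
 * regions.  The sketch_ name is hypothetical and nothing in the driver calls
 * this.
 */
static inline s64 sketch_attr_block_to_dev_block(ntfs_volume *vol,
                runlist_element *rl, sector_t block,
                unsigned char blocksize_bits)
{
        VCN vcn = (VCN)block << blocksize_bits >> vol->cluster_size_bits;
        unsigned int vcn_ofs = ((VCN)block << blocksize_bits) &
                        vol->cluster_size_mask;
        LCN lcn;

        /* Seek to the runlist element containing the target vcn. */
        while (rl->length && rl[1].vcn <= vcn)
                rl++;
        lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
        if (lcn < 0)
                return lcn;     /* LCN_HOLE, LCN_RL_NOT_MAPPED, LCN_ENOENT. */
        /* Cluster to device block, adding back the offset within the vcn. */
        return ((lcn << vol->cluster_size_bits) + vcn_ofs) >> blocksize_bits;
}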

/**
 * ntfs_readpage - fill a @page of a @file with data from the device
 * @file:       open file to which the page @page belongs or NULL
 * @page:       page cache page to fill with data
 *
 * For non-resident attributes, ntfs_readpage() fills the @page of the open
 * file @file by calling the ntfs version of the generic block_read_full_page()
 * function, ntfs_read_block(), which in turn creates and reads in the buffers
 * associated with the page asynchronously.
 *
 * For resident attributes, OTOH, ntfs_readpage() fills @page by copying the
 * data from the mft record (which at this stage is most likely in memory) and
 * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
 * even if the mft record is not cached at this point in time, we need to wait
 * for it to be read in before we can do the copy.
 *
 * Return 0 on success and -errno on error.
 */
static int ntfs_readpage(struct file *file, struct page *page)
{
        loff_t i_size;
        struct inode *vi;
        ntfs_inode *ni, *base_ni;
        u8 *addr;
        ntfs_attr_search_ctx *ctx;
        MFT_RECORD *mrec;
        unsigned long flags;
        u32 attr_len;
        int err = 0;

retry_readpage:
        BUG_ON(!PageLocked(page));
        vi = page->mapping->host;
        i_size = i_size_read(vi);
        /* Is the page fully outside i_size? (truncate in progress) */
        if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
                        PAGE_SHIFT)) {
                zero_user(page, 0, PAGE_SIZE);
                ntfs_debug("Read outside i_size - truncated?");
                goto done;
        }
        /*
         * This can potentially happen because we clear PageUptodate() during
         * ntfs_writepage() of MstProtected() attributes.
         */
        if (PageUptodate(page)) {
                unlock_page(page);
                return 0;
        }
        ni = NTFS_I(vi);
        /*
         * Only $DATA attributes can be encrypted and only unnamed $DATA
         * attributes can be compressed.  Index root can have the flags set but
         * this means to create compressed/encrypted files, not that the
         * attribute is compressed/encrypted.  Note we need to check for
         * AT_INDEX_ALLOCATION since this is the type of both directory and
         * index inodes.
         */
        if (ni->type != AT_INDEX_ALLOCATION) {
                /* If attribute is encrypted, deny access, just like NT4. */
                if (NInoEncrypted(ni)) {
                        BUG_ON(ni->type != AT_DATA);
                        err = -EACCES;
                        goto err_out;
                }
                /* Compressed data streams are handled in compress.c. */
                if (NInoNonResident(ni) && NInoCompressed(ni)) {
                        BUG_ON(ni->type != AT_DATA);
                        BUG_ON(ni->name_len);
                        return ntfs_read_compressed_block(page);
                }
        }
        /* NInoNonResident() == NInoIndexAllocPresent() */
        if (NInoNonResident(ni)) {
                /* Normal, non-resident data stream. */
                return ntfs_read_block(page);
        }
        /*
         * Attribute is resident, implying it is not compressed or encrypted.
         * This also means the attribute is smaller than an mft record and
         * hence smaller than a page, so can simply zero out any pages with
         * index above 0.  Note the attribute can actually be marked compressed
         * but if it is resident the actual data is not compressed so we are
         * ok to ignore the compressed flag here.
         */
        if (unlikely(page->index > 0)) {
                zero_user(page, 0, PAGE_SIZE);
                goto done;
        }
        if (!NInoAttr(ni))
                base_ni = ni;
        else
                base_ni = ni->ext.base_ntfs_ino;
        /* Map, pin, and lock the mft record. */
        mrec = map_mft_record(base_ni);
        if (IS_ERR(mrec)) {
                err = PTR_ERR(mrec);
                goto err_out;
        }
        /*
         * If a parallel write made the attribute non-resident, drop the mft
         * record and retry the readpage.
         */
        if (unlikely(NInoNonResident(ni))) {
                unmap_mft_record(base_ni);
                goto retry_readpage;
        }
        ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
        if (unlikely(!ctx)) {
                err = -ENOMEM;
                goto unm_err_out;
        }
        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
                        CASE_SENSITIVE, 0, NULL, 0, ctx);
        if (unlikely(err))
                goto put_unm_err_out;
        attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
        read_lock_irqsave(&ni->size_lock, flags);
        if (unlikely(attr_len > ni->initialized_size))
                attr_len = ni->initialized_size;
        i_size = i_size_read(vi);
        read_unlock_irqrestore(&ni->size_lock, flags);
        if (unlikely(attr_len > i_size)) {
                /* Race with shrinking truncate. */
                attr_len = i_size;
        }
        addr = kmap_atomic(page);
        /* Copy the data to the page. */
        memcpy(addr, (u8*)ctx->attr +
                        le16_to_cpu(ctx->attr->data.resident.value_offset),
                        attr_len);
        /* Zero the remainder of the page. */
        memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
        flush_dcache_page(page);
        kunmap_atomic(addr);
put_unm_err_out:
        ntfs_attr_put_search_ctx(ctx);
unm_err_out:
        unmap_mft_record(base_ni);
done:
        SetPageUptodate(page);
err_out:
        unlock_page(page);
        return err;
}

#ifdef NTFS_RW

/**
 * ntfs_write_block - write a @page to the backing store
 * @page:       page cache page to write out
 * @wbc:        writeback control structure
 *
 * This function is for writing pages belonging to non-resident, non-mst
 * protected attributes to their backing store.
 *
 * For a page with buffers, map and write the dirty buffers asynchronously
 * under page writeback. For a page without buffers, create buffers for the
 * page, then proceed as above.
 *
 * If a page doesn't have buffers the page dirty state is definitive. If a page
 * does have buffers, the page dirty state is just a hint, and the buffer dirty
 * state is definitive. (A hint which has rules: dirty buffers against a clean
 * page is illegal. Other combinations are legal and need to be handled. In
 * particular a dirty page containing clean buffers for example.)
 *
 * Return 0 on success and -errno on error.
 *
 * Based on ntfs_read_block() and __block_write_full_page().
 */
static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
{
        VCN vcn;
        LCN lcn;
        s64 initialized_size;
        loff_t i_size;
        sector_t block, dblock, iblock;
        struct inode *vi;
        ntfs_inode *ni;
        ntfs_volume *vol;
        runlist_element *rl;
        struct buffer_head *bh, *head;
        unsigned long flags;
        unsigned int blocksize, vcn_ofs;
        int err;
        bool need_end_writeback;
        unsigned char blocksize_bits;

        vi = page->mapping->host;
        ni = NTFS_I(vi);
        vol = ni->vol;

        ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
                        "0x%lx.", ni->mft_no, ni->type, page->index);

        BUG_ON(!NInoNonResident(ni));
        BUG_ON(NInoMstProtected(ni));
        blocksize = vol->sb->s_blocksize;
        blocksize_bits = vol->sb->s_blocksize_bits;
        if (!page_has_buffers(page)) {
                BUG_ON(!PageUptodate(page));
                create_empty_buffers(page, blocksize,
                                (1 << BH_Uptodate) | (1 << BH_Dirty));
                if (unlikely(!page_has_buffers(page))) {
                        ntfs_warning(vol->sb, "Error allocating page "
                                        "buffers.  Redirtying page so we try "
                                        "again later.");
                        /*
                         * Put the page back on mapping->dirty_pages, but leave
                         * its buffers' dirty state as-is.
                         */
                        redirty_page_for_writepage(wbc, page);
                        unlock_page(page);
                        return 0;
                }
        }
        bh = head = page_buffers(page);
        BUG_ON(!bh);

        /* NOTE: Different naming scheme to ntfs_read_block()! */

        /* The first block in the page. */
        block = (s64)page->index << (PAGE_SHIFT - blocksize_bits);

        read_lock_irqsave(&ni->size_lock, flags);
        i_size = i_size_read(vi);
        initialized_size = ni->initialized_size;
        read_unlock_irqrestore(&ni->size_lock, flags);

        /* The first out of bounds block for the data size. */
        dblock = (i_size + blocksize - 1) >> blocksize_bits;

        /* The last (fully or partially) initialized block. */
        iblock = initialized_size >> blocksize_bits;

        /*
         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
         * here, and the (potentially unmapped) buffers may become dirty at
         * any time.  If a buffer becomes dirty here after we've inspected it
         * then we just miss that fact, and the page stays dirty.
         *
         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
         * handle that here by just cleaning them.
         */

        /*
         * Loop through all the buffers in the page, mapping all the dirty
         * buffers to disk addresses and handling any aliases from the
         * underlying block device's mapping.
         */
        rl = NULL;
        err = 0;
        do {
                bool is_retry = false;

                if (unlikely(block >= dblock)) {
                        /*
                         * Mapped buffers outside i_size will occur, because
                         * this page can be outside i_size when there is a
                         * truncate in progress. The contents of such buffers
                         * were zeroed by ntfs_writepage().
                         *
                         * FIXME: What about the small race window where
                         * ntfs_writepage() has not done any clearing because
                         * the page was within i_size but before we get here,
                         * vmtruncate() modifies i_size?
                         */
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
                        continue;
                }

                /* Clean buffers are not written out, so no need to map them. */
                if (!buffer_dirty(bh))
                        continue;

                /* Make sure we have enough initialized size. */
                if (unlikely((block >= iblock) &&
                                (initialized_size < i_size))) {
                        /*
                         * If this page is fully outside initialized size, zero
                         * out all pages between the current initialized size
                         * and the current page. Just use ntfs_readpage() to do
                         * the zeroing transparently.
                         */
                        if (block > iblock) {
                                // TODO:
                                // For each page do:
                                // - read_cache_page()
                                // Again for each page do:
                                // - wait_on_page_locked()
                                // - Check (PageUptodate(page) &&
                                //                      !PageError(page))
                                // Update initialized size in the attribute and
                                // in the inode.
                                // Again, for each page do:
                                //      __set_page_dirty_buffers();
                                // put_page()
                                // We don't need to wait on the writes.
                                // Update iblock.
                        }
                        /*
                         * The current page straddles initialized size. Zero
                         * all non-uptodate buffers and set them uptodate (and
                         * dirty?). Note, there aren't any non-uptodate buffers
                         * if the page is uptodate.
                         * FIXME: For an uptodate page, the buffers may need to
                         * be written out because they were not initialized on
                         * disk before.
                         */
                        if (!PageUptodate(page)) {
                                // TODO:
                                // Zero any non-uptodate buffers up to i_size.
                                // Set them uptodate and dirty.
                        }
                        // TODO:
                        // Update initialized size in the attribute and in the
                        // inode (up to i_size).
                        // Update iblock.
                        // FIXME: This is inefficient. Try to batch the two
                        // size changes to happen in one go.
                        ntfs_error(vol->sb, "Writing beyond initialized size "
                                        "is not supported yet. Sorry.");
                        err = -EOPNOTSUPP;
                        break;
                        // Do NOT set_buffer_new() BUT DO clear buffer range
                        // outside write request range.
                        // set_buffer_uptodate() on complete buffers as well as
                        // set_buffer_dirty().
                }

                /* No need to map buffers that are already mapped. */
                if (buffer_mapped(bh))
                        continue;

                /* Unmapped, dirty buffer. Need to map it. */
                bh->b_bdev = vol->sb->s_bdev;

                /* Convert block into corresponding vcn and offset. */
                vcn = (VCN)block << blocksize_bits;
                vcn_ofs = vcn & vol->cluster_size_mask;
                vcn >>= vol->cluster_size_bits;
                if (!rl) {
lock_retry_remap:
                        down_read(&ni->runlist.lock);
                        rl = ni->runlist.rl;
                }
                if (likely(rl != NULL)) {
                        /* Seek to element containing target vcn. */
                        while (rl->length && rl[1].vcn <= vcn)
                                rl++;
                        lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
                } else
                        lcn = LCN_RL_NOT_MAPPED;
                /* Successful remap. */
                if (lcn >= 0) {
                        /* Setup buffer head to point to correct block. */
                        bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
                                        vcn_ofs) >> blocksize_bits;
                        set_buffer_mapped(bh);
                        continue;
                }
                /* It is a hole, need to instantiate it. */
                if (lcn == LCN_HOLE) {
                        u8 *kaddr;
                        unsigned long *bpos, *bend;

                        /* Check if the buffer is zero. */
                        kaddr = kmap_atomic(page);
                        bpos = (unsigned long *)(kaddr + bh_offset(bh));
                        bend = (unsigned long *)((u8*)bpos + blocksize);
                        do {
                                if (unlikely(*bpos))
                                        break;
                        } while (likely(++bpos < bend));
                        kunmap_atomic(kaddr);
                        if (bpos == bend) {
                                /*
                                 * Buffer is zero and sparse, no need to write
                                 * it.
                                 */
                                bh->b_blocknr = -1;
                                clear_buffer_dirty(bh);
                                continue;
                        }
                        // TODO: Instantiate the hole.
                        // clear_buffer_new(bh);
                        // clean_bdev_bh_alias(bh);
                        ntfs_error(vol->sb, "Writing into sparse regions is "
                                        "not supported yet. Sorry.");
                        err = -EOPNOTSUPP;
                        break;
                }
                /* If first try and runlist unmapped, map and retry. */
                if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
                        is_retry = true;
                        /*
                         * Attempt to map runlist, dropping lock for
                         * the duration.
                         */
                        up_read(&ni->runlist.lock);
                        err = ntfs_map_runlist(ni, vcn);
                        if (likely(!err))
                                goto lock_retry_remap;
                        rl = NULL;
                } else if (!rl)
                        up_read(&ni->runlist.lock);
                /*
                 * If buffer is outside the runlist, truncate has cut it out
                 * of the runlist.  Just clean and clear the buffer and set it
                 * uptodate so it can get discarded by the VM.
                 */
                if (err == -ENOENT || lcn == LCN_ENOENT) {
                        bh->b_blocknr = -1;
                        clear_buffer_dirty(bh);
                        zero_user(page, bh_offset(bh), blocksize);
                        set_buffer_uptodate(bh);
                        err = 0;
                        continue;
                }
                /* Failed to map the buffer, even after retrying. */
                if (!err)
                        err = -EIO;
                bh->b_blocknr = -1;
                ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
                                "attribute type 0x%x, vcn 0x%llx, offset 0x%x "
                                "because its location on disk could not be "
                                "determined%s (error code %i).", ni->mft_no,
                                ni->type, (unsigned long long)vcn,
                                vcn_ofs, is_retry ? " even after "
                                "retrying" : "", err);
                break;
        } while (block++, (bh = bh->b_this_page) != head);

        /* Release the lock if we took it. */
        if (rl)
                up_read(&ni->runlist.lock);

        /* For the error case, need to reset bh to the beginning. */
        bh = head;

        /* Just an optimization, so ->readpage() is not called later. */
        if (unlikely(!PageUptodate(page))) {
                int uptodate = 1;
                do {
                        if (!buffer_uptodate(bh)) {
                                uptodate = 0;
                                bh = head;
                                break;
                        }
                } while ((bh = bh->b_this_page) != head);
                if (uptodate)
                        SetPageUptodate(page);
        }

        /* Setup all mapped, dirty buffers for async write i/o. */
        do {
                if (buffer_mapped(bh) && buffer_dirty(bh)) {
                        lock_buffer(bh);
                        if (test_clear_buffer_dirty(bh)) {
                                BUG_ON(!buffer_uptodate(bh));
                                mark_buffer_async_write(bh);
                        } else
                                unlock_buffer(bh);
                } else if (unlikely(err)) {
                        /*
                         * For the error case. The buffer may have been set
                         * dirty during attachment to a dirty page.
                         */
                        if (err != -ENOMEM)
                                clear_buffer_dirty(bh);
                }
        } while ((bh = bh->b_this_page) != head);

        if (unlikely(err)) {
                // TODO: Remove the -EOPNOTSUPP check later on...
                if (unlikely(err == -EOPNOTSUPP))
                        err = 0;
                else if (err == -ENOMEM) {
                        ntfs_warning(vol->sb, "Error allocating memory. "
                                        "Redirtying page so we try again "
                                        "later.");
                        /*
                         * Put the page back on mapping->dirty_pages, but
                         * leave its buffers' dirty state as-is.
                         */
                        redirty_page_for_writepage(wbc, page);
                        err = 0;
                } else
                        SetPageError(page);
        }

        BUG_ON(PageWriteback(page));
        set_page_writeback(page);       /* Keeps try_to_free_buffers() away. */

        /* Submit the prepared buffers for i/o. */
        need_end_writeback = true;
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        submit_bh(REQ_OP_WRITE, 0, bh);
                        need_end_writeback = false;
                }
                bh = next;
        } while (bh != head);
        unlock_page(page);

        /* If no i/o was started, need to end_page_writeback(). */
        if (unlikely(need_end_writeback))
                end_page_writeback(page);

        ntfs_debug("Done.");
        return err;
}
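
/*
 * A minimal sketch, for illustration only, of the word-at-a-time scan that
 * ntfs_write_block() above uses on LCN_HOLE buffers: if a buffer that sits in
 * a sparse region is entirely zero, it need not be written out and the hole
 * need not be instantiated.  The sketch_ name is hypothetical and nothing in
 * the driver calls this.
 */
static inline bool sketch_buffer_is_zero(struct page *page,
                struct buffer_head *bh, unsigned int blocksize)
{
        u8 *kaddr = kmap_atomic(page);
        unsigned long *bpos = (unsigned long*)(kaddr + bh_offset(bh));
        unsigned long *bend = (unsigned long*)((u8*)bpos + blocksize);
        bool zero = true;

        /* Compare a machine word at a time rather than byte by byte. */
        do {
                if (*bpos) {
                        zero = false;
                        break;
                }
        } while (++bpos < bend);
        kunmap_atomic(kaddr);
        return zero;
}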
 874
 875/**
 876 * ntfs_write_mst_block - write a @page to the backing store
 877 * @page:       page cache page to write out
 878 * @wbc:        writeback control structure
 879 *
 880 * This function is for writing pages belonging to non-resident, mst protected
 881 * attributes to their backing store.  The only supported attributes are index
 882 * allocation and $MFT/$DATA.  Both directory inodes and index inodes are
 883 * supported for the index allocation case.
 884 *
 885 * The page must remain locked for the duration of the write because we apply
 886 * the mst fixups, write, and then undo the fixups, so if we were to unlock the
 887 * page before undoing the fixups, any other user of the page will see the
 888 * page contents as corrupt.
 889 *
 890 * We clear the page uptodate flag for the duration of the function to ensure
 891 * exclusion for the $MFT/$DATA case against someone mapping an mft record we
 892 * are about to apply the mst fixups to.
 893 *
 894 * Return 0 on success and -errno on error.
 895 *
 896 * Based on ntfs_write_block(), ntfs_mft_writepage(), and
 897 * write_mft_record_nolock().
 898 */
 899static int ntfs_write_mst_block(struct page *page,
 900                struct writeback_control *wbc)
 901{
 902        sector_t block, dblock, rec_block;
 903        struct inode *vi = page->mapping->host;
 904        ntfs_inode *ni = NTFS_I(vi);
 905        ntfs_volume *vol = ni->vol;
 906        u8 *kaddr;
 907        unsigned int rec_size = ni->itype.index.block_size;
 908        ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE];
 909        struct buffer_head *bh, *head, *tbh, *rec_start_bh;
 910        struct buffer_head *bhs[MAX_BUF_PER_PAGE];
 911        runlist_element *rl;
 912        int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
 913        unsigned bh_size, rec_size_bits;
 914        bool sync, is_mft, page_is_dirty, rec_is_dirty;
 915        unsigned char bh_size_bits;
 916
 917        if (WARN_ON(rec_size < NTFS_BLOCK_SIZE))
 918                return -EINVAL;
 919
 920        ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
 921                        "0x%lx.", vi->i_ino, ni->type, page->index);
 922        BUG_ON(!NInoNonResident(ni));
 923        BUG_ON(!NInoMstProtected(ni));
 924        is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
 925        /*
 926         * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
 927         * in its page cache were to be marked dirty.  However this should
 928         * never happen with the current driver and considering we do not
 929         * handle this case here we do want to BUG(), at least for now.
 930         */
 931        BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
 932                        (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
 933        bh_size = vol->sb->s_blocksize;
 934        bh_size_bits = vol->sb->s_blocksize_bits;
 935        max_bhs = PAGE_SIZE / bh_size;
 936        BUG_ON(!max_bhs);
 937        BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
 938
 939        /* Were we called for sync purposes? */
 940        sync = (wbc->sync_mode == WB_SYNC_ALL);
 941
 942        /* Make sure we have mapped buffers. */
 943        bh = head = page_buffers(page);
 944        BUG_ON(!bh);
 945
 946        rec_size_bits = ni->itype.index.block_size_bits;
 947        BUG_ON(!(PAGE_SIZE >> rec_size_bits));
 948        bhs_per_rec = rec_size >> bh_size_bits;
 949        BUG_ON(!bhs_per_rec);
 950
 951        /* The first block in the page. */
 952        rec_block = block = (sector_t)page->index <<
 953                        (PAGE_SHIFT - bh_size_bits);
 954
 955        /* The first out of bounds block for the data size. */
 956        dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;
 957
 958        rl = NULL;
 959        err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
 960        page_is_dirty = rec_is_dirty = false;
 961        rec_start_bh = NULL;
 962        do {
 963                bool is_retry = false;
 964
 965                if (likely(block < rec_block)) {
 966                        if (unlikely(block >= dblock)) {
 967                                clear_buffer_dirty(bh);
 968                                set_buffer_uptodate(bh);
 969                                continue;
 970                        }
 971                        /*
 972                         * This block is not the first one in the record.  We
 973                         * ignore the buffer's dirty state because we could
 974                         * have raced with a parallel mark_ntfs_record_dirty().
 975                         */
 976                        if (!rec_is_dirty)
 977                                continue;
 978                        if (unlikely(err2)) {
 979                                if (err2 != -ENOMEM)
 980                                        clear_buffer_dirty(bh);
 981                                continue;
 982                        }
 983                } else /* if (block == rec_block) */ {
 984                        BUG_ON(block > rec_block);
 985                        /* This block is the first one in the record. */
 986                        rec_block += bhs_per_rec;
 987                        err2 = 0;
 988                        if (unlikely(block >= dblock)) {
 989                                clear_buffer_dirty(bh);
 990                                continue;
 991                        }
 992                        if (!buffer_dirty(bh)) {
 993                                /* Clean records are not written out. */
 994                                rec_is_dirty = false;
 995                                continue;
 996                        }
 997                        rec_is_dirty = true;
 998                        rec_start_bh = bh;
 999                }
1000                /* Need to map the buffer if it is not mapped already. */
1001                if (unlikely(!buffer_mapped(bh))) {
1002                        VCN vcn;
1003                        LCN lcn;
1004                        unsigned int vcn_ofs;
1005
1006                        bh->b_bdev = vol->sb->s_bdev;
1007                        /* Obtain the vcn and offset of the current block. */
1008                        vcn = (VCN)block << bh_size_bits;
1009                        vcn_ofs = vcn & vol->cluster_size_mask;
1010                        vcn >>= vol->cluster_size_bits;
1011                        if (!rl) {
1012lock_retry_remap:
1013                                down_read(&ni->runlist.lock);
1014                                rl = ni->runlist.rl;
1015                        }
1016                        if (likely(rl != NULL)) {
1017                                /* Seek to element containing target vcn. */
1018                                while (rl->length && rl[1].vcn <= vcn)
1019                                        rl++;
1020                                lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
1021                        } else
1022                                lcn = LCN_RL_NOT_MAPPED;
1023                        /* Successful remap. */
1024                        if (likely(lcn >= 0)) {
1025                                /* Setup buffer head to correct block. */
1026                                bh->b_blocknr = ((lcn <<
1027                                                vol->cluster_size_bits) +
1028                                                vcn_ofs) >> bh_size_bits;
1029                                set_buffer_mapped(bh);
1030                        } else {
1031                                /*
1032                                 * Remap failed.  Retry to map the runlist once
1033                                 * unless we are working on $MFT which always
1034                                 * has the whole of its runlist in memory.
1035                                 */
1036                                if (!is_mft && !is_retry &&
1037                                                lcn == LCN_RL_NOT_MAPPED) {
1038                                        is_retry = true;
1039                                        /*
1040                                         * Attempt to map runlist, dropping
1041                                         * lock for the duration.
1042                                         */
1043                                        up_read(&ni->runlist.lock);
1044                                        err2 = ntfs_map_runlist(ni, vcn);
1045                                        if (likely(!err2))
1046                                                goto lock_retry_remap;
1047                                        if (err2 == -ENOMEM)
1048                                                page_is_dirty = true;
1049                                        lcn = err2;
1050                                } else {
1051                                        err2 = -EIO;
1052                                        if (!rl)
1053                                                up_read(&ni->runlist.lock);
1054                                }
1055                                /* Hard error.  Abort writing this record. */
1056                                if (!err || err == -ENOMEM)
1057                                        err = err2;
1058                                bh->b_blocknr = -1;
1059                                ntfs_error(vol->sb, "Cannot write ntfs record "
1060                                                "0x%llx (inode 0x%lx, "
1061                                                "attribute type 0x%x) because "
1062                                                "its location on disk could "
1063                                                "not be determined (error "
1064                                                "code %lli).",
1065                                                (long long)block <<
1066                                                bh_size_bits >>
1067                                                vol->mft_record_size_bits,
1068                                                ni->mft_no, ni->type,
1069                                                (long long)lcn);
1070                                /*
1071                                 * If this is not the first buffer, remove the
1072                                 * buffers in this record from the list of
1073                                 * buffers to write and clear their dirty bit
1074                                 * if not error -ENOMEM.
1075                                 */
1076                                if (rec_start_bh != bh) {
1077                                        while (bhs[--nr_bhs] != rec_start_bh)
1078                                                ;
1079                                        if (err2 != -ENOMEM) {
1080                                                do {
1081                                                        clear_buffer_dirty(
1082                                                                rec_start_bh);
1083                                                } while ((rec_start_bh =
1084                                                                rec_start_bh->
1085                                                                b_this_page) !=
1086                                                                bh);
1087                                        }
1088                                }
1089                                continue;
1090                        }
1091                }
1092                BUG_ON(!buffer_uptodate(bh));
1093                BUG_ON(nr_bhs >= max_bhs);
1094                bhs[nr_bhs++] = bh;
1095        } while (block++, (bh = bh->b_this_page) != head);
1096        if (unlikely(rl))
1097                up_read(&ni->runlist.lock);
1098        /* If there were no dirty buffers, we are done. */
1099        if (!nr_bhs)
1100                goto done;
1101        /* Map the page so we can access its contents. */
1102        kaddr = kmap(page);
1103        /* Clear the page uptodate flag whilst the mst fixups are applied. */
1104        BUG_ON(!PageUptodate(page));
1105        ClearPageUptodate(page);
1106        for (i = 0; i < nr_bhs; i++) {
1107                unsigned int ofs;
1108
1109                /* Skip buffers which are not at the beginning of records. */
1110                if (i % bhs_per_rec)
1111                        continue;
1112                tbh = bhs[i];
1113                ofs = bh_offset(tbh);
1114                if (is_mft) {
1115                        ntfs_inode *tni;
1116                        unsigned long mft_no;
1117
1118                        /* Get the mft record number. */
1119                        mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
1120                                        >> rec_size_bits;
1121                        /* Check whether to write this mft record. */
1122                        tni = NULL;
1123                        if (!ntfs_may_write_mft_record(vol, mft_no,
1124                                        (MFT_RECORD*)(kaddr + ofs), &tni)) {
1125                                /*
1126                                 * The record should not be written.  This
1127                                 * means we need to redirty the page before
1128                                 * returning.
1129                                 */
1130                                page_is_dirty = true;
1131                                /*
1132                                 * Remove the buffers in this mft record from
1133                                 * the list of buffers to write.
1134                                 */
1135                                do {
1136                                        bhs[i] = NULL;
1137                                } while (++i % bhs_per_rec);
1138                                continue;
1139                        }
1140                        /*
1141                         * The record should be written.  If a locked ntfs
1142                         * inode was returned, add it to the array of locked
1143                         * ntfs inodes.
1144                         */
1145                        if (tni)
1146                                locked_nis[nr_locked_nis++] = tni;
1147                }
1148                /* Apply the mst protection fixups. */
1149                err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
1150                                rec_size);
1151                if (unlikely(err2)) {
1152                        if (!err || err == -ENOMEM)
1153                                err = -EIO;
1154                        ntfs_error(vol->sb, "Failed to apply mst fixups "
1155                                        "(inode 0x%lx, attribute type 0x%x, "
1156                                        "page index 0x%lx, page offset 0x%x)!"
1157                                        "  Unmount and run chkdsk.", vi->i_ino,
1158                                        ni->type, page->index, ofs);
1159                        /*
1160                         * Mark all the buffers in this record clean as we do
1161                         * not want to write corrupt data to disk.
1162                         */
1163                        do {
1164                                clear_buffer_dirty(bhs[i]);
1165                                bhs[i] = NULL;
1166                        } while (++i % bhs_per_rec);
1167                        continue;
1168                }
1169                nr_recs++;
1170        }
1171        /* If no records are to be written out, we are done. */
1172        if (!nr_recs)
1173                goto unm_done;
1174        flush_dcache_page(page);
1175        /* Lock buffers and start synchronous write i/o on them. */
1176        for (i = 0; i < nr_bhs; i++) {
1177                tbh = bhs[i];
1178                if (!tbh)
1179                        continue;
1180                if (!trylock_buffer(tbh))
1181                        BUG();
1182                /* The buffer dirty state is now irrelevant, just clean it. */
1183                clear_buffer_dirty(tbh);
1184                BUG_ON(!buffer_uptodate(tbh));
1185                BUG_ON(!buffer_mapped(tbh));
1186                get_bh(tbh);
1187                tbh->b_end_io = end_buffer_write_sync;
1188                submit_bh(REQ_OP_WRITE, 0, tbh);
1189        }
1190        /* Synchronize the mft mirror now if not @sync. */
1191        if (is_mft && !sync)
1192                goto do_mirror;
1193do_wait:
1194        /* Wait on i/o completion of buffers. */
1195        for (i = 0; i < nr_bhs; i++) {
1196                tbh = bhs[i];
1197                if (!tbh)
1198                        continue;
1199                wait_on_buffer(tbh);
1200                if (unlikely(!buffer_uptodate(tbh))) {
1201                        ntfs_error(vol->sb, "I/O error while writing ntfs "
1202                                        "record buffer (inode 0x%lx, "
1203                                        "attribute type 0x%x, page index "
1204                                        "0x%lx, page offset 0x%lx)!  Unmount "
1205                                        "and run chkdsk.", vi->i_ino, ni->type,
1206                                        page->index, bh_offset(tbh));
1207                        if (!err || err == -ENOMEM)
1208                                err = -EIO;
1209                        /*
1210                         * Set the buffer uptodate so the page and buffer
1211                         * states do not become out of sync.
1212                         */
1213                        set_buffer_uptodate(tbh);
1214                }
1215        }
1216        /* If @sync, now synchronize the mft mirror. */
1217        if (is_mft && sync) {
1218do_mirror:
1219                for (i = 0; i < nr_bhs; i++) {
1220                        unsigned long mft_no;
1221                        unsigned int ofs;
1222
1223                        /*
1224                         * Skip buffers which are not at the beginning of
1225                         * records.
1226                         */
1227                        if (i % bhs_per_rec)
1228                                continue;
1229                        tbh = bhs[i];
1230                        /* Skip removed buffers (and hence records). */
1231                        if (!tbh)
1232                                continue;
1233                        ofs = bh_offset(tbh);
1234                        /* Get the mft record number. */
1235                        mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
1236                                        >> rec_size_bits;
1237                        if (mft_no < vol->mftmirr_size)
1238                                ntfs_sync_mft_mirror(vol, mft_no,
1239                                                (MFT_RECORD*)(kaddr + ofs),
1240                                                sync);
1241                }
1242                if (!sync)
1243                        goto do_wait;
1244        }
1245        /* Remove the mst protection fixups again. */
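        /*
         * The in-memory image must stay un-protected: the page is re-marked
         * uptodate below and stays in the page cache, so readers must not
         * see the fixed-up on-disk byte layout.
         */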
1246        for (i = 0; i < nr_bhs; i++) {
1247                if (!(i % bhs_per_rec)) {
1248                        tbh = bhs[i];
1249                        if (!tbh)
1250                                continue;
1251                        post_write_mst_fixup((NTFS_RECORD*)(kaddr +
1252                                        bh_offset(tbh)));
1253                }
1254        }
1255        flush_dcache_page(page);
1256unm_done:
1257        /* Unlock any locked inodes. */
1258        while (nr_locked_nis-- > 0) {
1259                ntfs_inode *tni, *base_tni;
1260
1261                tni = locked_nis[nr_locked_nis];
1262                /* Get the base inode. */
1263                mutex_lock(&tni->extent_lock);
1264                if (tni->nr_extents >= 0)
1265                        base_tni = tni;
1266                else {
1267                        base_tni = tni->ext.base_ntfs_ino;
1268                        BUG_ON(!base_tni);
1269                }
1270                mutex_unlock(&tni->extent_lock);
1271                ntfs_debug("Unlocking %s inode 0x%lx.",
1272                                tni == base_tni ? "base" : "extent",
1273                                tni->mft_no);
1274                mutex_unlock(&tni->mrec_lock);
1275                atomic_dec(&tni->count);
1276                iput(VFS_I(base_tni));
1277        }
1278        SetPageUptodate(page);
1279        kunmap(page);
1280done:
1281        if (unlikely(err && err != -ENOMEM)) {
1282                /*
1283                 * Set page error if there is only one ntfs record in the page.
1284                 * Otherwise we would lose per-record granularity.
1285                 */
1286                if (ni->itype.index.block_size == PAGE_SIZE)
1287                        SetPageError(page);
1288                NVolSetErrors(vol);
1289        }
1290        if (page_is_dirty) {
1291                ntfs_debug("Page still contains one or more dirty ntfs "
1292                                "records.  Redirtying the page starting at "
1293                                "record 0x%lx.", page->index <<
1294                                (PAGE_SHIFT - rec_size_bits));
1295                redirty_page_for_writepage(wbc, page);
1296                unlock_page(page);
1297        } else {
1298                /*
1299                 * Keep the VM happy.  This must be done otherwise the
1300                 * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
1301                 * the page is clean.
1302                 */
1303                BUG_ON(PageWriteback(page));
1304                set_page_writeback(page);
1305                unlock_page(page);
1306                end_page_writeback(page);
1307        }
1308        if (likely(!err))
1309                ntfs_debug("Done.");
1310        return err;
1311}
1312
1313/**
1314 * ntfs_writepage - write a @page to the backing store
1315 * @page:       page cache page to write out
1316 * @wbc:        writeback control structure
1317 *
1318 * This is called from the VM when it wants to have a dirty ntfs page cache
1319 * page cleaned.  The VM has already locked the page and marked it clean.
1320 *
1321 * For non-resident attributes, ntfs_writepage() writes the @page by calling
1322 * the ntfs version of the generic block_write_full_page() function,
1323 * ntfs_write_block(), which in turn if necessary creates and writes the
1324 * buffers associated with the page asynchronously.
1325 *
1326 * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
1327 * the data to the mft record (which at this stage is most likely in memory).
1328 * The mft record is then marked dirty and written out asynchronously via the
1329 * vfs inode dirty code path for the inode the mft record belongs to or via the
1330 * vm page dirty code path for the page the mft record is in.
1331 *
1332 * Based on ntfs_readpage() and fs/buffer.c::block_write_full_page().
1333 *
1334 * Return 0 on success and -errno on error.
1335 */
1336static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
1337{
1338        loff_t i_size;
1339        struct inode *vi = page->mapping->host;
1340        ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
1341        char *addr;
1342        ntfs_attr_search_ctx *ctx = NULL;
1343        MFT_RECORD *m = NULL;
1344        u32 attr_len;
1345        int err;
1346
1347retry_writepage:
1348        BUG_ON(!PageLocked(page));
1349        i_size = i_size_read(vi);
1350        /* Is the page fully outside i_size? (truncate in progress) */
1351        if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
1352                        PAGE_SHIFT)) {
1353                /*
1354                 * The page may have dirty, unmapped buffers.  Make them
1355                 * freeable here, so the page does not leak.
1356                 */
1357                block_invalidatepage(page, 0, PAGE_SIZE);
1358                unlock_page(page);
1359                ntfs_debug("Write outside i_size - truncated?");
1360                return 0;
1361        }
1362        /*
1363         * Only $DATA attributes can be encrypted and only unnamed $DATA
1364         * attributes can be compressed.  Index root can have the flags
1365         * set, but there they mean that new files are to be created
1366         * compressed/encrypted, not that the attribute is.  Note we need to check for
1367         * AT_INDEX_ALLOCATION since this is the type of both directory and
1368         * index inodes.
1369         */
1370        if (ni->type != AT_INDEX_ALLOCATION) {
1371                /* If file is encrypted, deny access, just like NT4. */
1372                if (NInoEncrypted(ni)) {
1373                        unlock_page(page);
1374                        BUG_ON(ni->type != AT_DATA);
1375                        ntfs_debug("Denying write access to encrypted file.");
1376                        return -EACCES;
1377                }
1378                /* Compressed data streams are handled in compress.c. */
1379                if (NInoNonResident(ni) && NInoCompressed(ni)) {
1380                        BUG_ON(ni->type != AT_DATA);
1381                        BUG_ON(ni->name_len);
1382                        // TODO: Implement and replace this with
1383                        // return ntfs_write_compressed_block(page);
1384                        unlock_page(page);
1385                        ntfs_error(vi->i_sb, "Writing to compressed files is "
1386                                        "not supported yet.  Sorry.");
1387                        return -EOPNOTSUPP;
1388                }
1389                // TODO: Implement and remove this check.
1390                if (NInoNonResident(ni) && NInoSparse(ni)) {
1391                        unlock_page(page);
1392                        ntfs_error(vi->i_sb, "Writing to sparse files is not "
1393                                        "supported yet.  Sorry.");
1394                        return -EOPNOTSUPP;
1395                }
1396        }
1397        /* NInoNonResident() == NInoIndexAllocPresent() */
1398        if (NInoNonResident(ni)) {
1399                /* We have to zero every time due to mmap-at-end-of-file. */
1400                if (page->index >= (i_size >> PAGE_SHIFT)) {
1401                        /* The page straddles i_size. */
1402                        unsigned int ofs = i_size & ~PAGE_MASK;
1403                        zero_user_segment(page, ofs, PAGE_SIZE);
1404                }
1405                /* Handle mst protected attributes. */
1406                if (NInoMstProtected(ni))
1407                        return ntfs_write_mst_block(page, wbc);
1408                /* Normal, non-resident data stream. */
1409                return ntfs_write_block(page, wbc);
1410        }
1411        /*
1412         * Attribute is resident, implying it is not compressed, encrypted, or
1413         * mst protected.  This also means the attribute is smaller than an mft
1414         * record and hence smaller than a page, so we can simply return an
1415         * error on any page with index above 0.  Note the attribute can be
1416         * marked compressed, but if it is resident the actual data is not
1417         * compressed, so we are ok to ignore the compressed flag here.
1418         */
1419        BUG_ON(page_has_buffers(page));
1420        BUG_ON(!PageUptodate(page));
1421        if (unlikely(page->index > 0)) {
1422                ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0.  "
1423                                "Aborting write.", page->index);
1424                BUG_ON(PageWriteback(page));
1425                set_page_writeback(page);
1426                unlock_page(page);
1427                end_page_writeback(page);
1428                return -EIO;
1429        }
1430        if (!NInoAttr(ni))
1431                base_ni = ni;
1432        else
1433                base_ni = ni->ext.base_ntfs_ino;
1434        /* Map, pin, and lock the mft record. */
1435        m = map_mft_record(base_ni);
1436        if (IS_ERR(m)) {
1437                err = PTR_ERR(m);
1438                m = NULL;
1439                ctx = NULL;
1440                goto err_out;
1441        }
1442        /*
1443         * If a parallel write made the attribute non-resident, drop the mft
1444         * record and retry the writepage.
1445         */
1446        if (unlikely(NInoNonResident(ni))) {
1447                unmap_mft_record(base_ni);
1448                goto retry_writepage;
1449        }
1450        ctx = ntfs_attr_get_search_ctx(base_ni, m);
1451        if (unlikely(!ctx)) {
1452                err = -ENOMEM;
1453                goto err_out;
1454        }
1455        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1456                        CASE_SENSITIVE, 0, NULL, 0, ctx);
1457        if (unlikely(err))
1458                goto err_out;
1459        /*
1460         * Keep the VM happy.  This must be done otherwise the radix-tree tag
1461         * PAGECACHE_TAG_DIRTY remains set even though the page is clean.
1462         */
1463        BUG_ON(PageWriteback(page));
1464        set_page_writeback(page);
1465        unlock_page(page);
1466        attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
1467        i_size = i_size_read(vi);
1468        if (unlikely(attr_len > i_size)) {
1469                /* Race with shrinking truncate or a failed truncate. */
1470                attr_len = i_size;
1471                /*
1472                 * If the truncate failed, fix it up now.  If a truncate is in
1473                 * progress, we do its job for it, so it does not have to do anything.
1474                 */
1475                err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr,
1476                                attr_len);
1477                /* Shrinking cannot fail. */
1478                BUG_ON(err);
1479        }
1480        addr = kmap_atomic(page);
1481        /* Copy the data from the page to the mft record. */
1482        memcpy((u8*)ctx->attr +
1483                        le16_to_cpu(ctx->attr->data.resident.value_offset),
1484                        addr, attr_len);
1485        /* Zero out of bounds area in the page cache page. */
1486        memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
1487        kunmap_atomic(addr);
1488        flush_dcache_page(page);
1489        flush_dcache_mft_record_page(ctx->ntfs_ino);
1490        /* We are done with the page. */
1491        end_page_writeback(page);
1492        /* Finally, mark the mft record dirty, so it gets written back. */
1493        mark_mft_record_dirty(ctx->ntfs_ino);
1494        ntfs_attr_put_search_ctx(ctx);
1495        unmap_mft_record(base_ni);
1496        return 0;
1497err_out:
1498        if (err == -ENOMEM) {
1499                ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
1500                                "page so we try again later.");
1501                /*
1502                 * Put the page back on mapping->dirty_pages, but leave its
1503                 * buffers' dirty state as-is.
1504                 */
1505                redirty_page_for_writepage(wbc, page);
1506                err = 0;
1507        } else {
1508                ntfs_error(vi->i_sb, "Resident attribute write failed with "
1509                                "error %i.", err);
1510                SetPageError(page);
1511                NVolSetErrors(ni->vol);
1512        }
1513        unlock_page(page);
1514        if (ctx)
1515                ntfs_attr_put_search_ctx(ctx);
1516        if (m)
1517                unmap_mft_record(base_ni);
1518        return err;
1519}
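
/*
 * Illustrative sketch (not part of this file): roughly how the VM invokes
 * ntfs_writepage() through the address space operations for a single dirty
 * page.  The helper name is made up for exposition; real callers are the
 * writeback paths in mm.  Kept under #if 0 so it is not compiled.
 */
#if 0
static int example_write_one_ntfs_page(struct page *page)
{
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = 1,
        };

        lock_page(page);
        if (clear_page_dirty_for_io(page))
                /* ->writepage unlocks the page, even on error. */
                return page->mapping->a_ops->writepage(page, &wbc);
        unlock_page(page);
        return 0;
}
#endif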
1520
1521#endif  /* NTFS_RW */
1522
1523/**
1524 * ntfs_bmap - map logical file block to physical device block
1525 * @mapping:    address space mapping to which the block to be mapped belongs
1526 * @block:      logical block to map to its physical device block
1527 *
1528 * For regular, non-resident files (i.e. not compressed and not encrypted), map
1529 * the logical @block belonging to the file described by the address space
1530 * mapping @mapping to its physical device block.
1531 *
1532 * The size of the block is equal to the @s_blocksize field of the super block
1533 * of the mounted file system, which is guaranteed to be smaller than or equal
1534 * to the cluster size.  Thus the block is guaranteed to fit entirely inside a
1535 * single cluster, which means we do not need to care how many contiguous
1536 * bytes are available after the beginning of the block.
1537 *
1538 * Return the physical device block if the mapping succeeded or 0 if the block
1539 * is sparse or there was an error.
1540 *
1541 * Note: This is a problem if someone runs bmap() on the $Boot system file,
1542 * as that really is in block zero but there is nothing we can do.  bmap() is
1543 * just broken in that respect (just like it cannot distinguish sparse from
1544 * not available or error).
1545 */
1546static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
1547{
1548        s64 ofs, size;
1549        loff_t i_size;
1550        LCN lcn;
1551        unsigned long blocksize, flags;
1552        ntfs_inode *ni = NTFS_I(mapping->host);
1553        ntfs_volume *vol = ni->vol;
1554        unsigned delta;
1555        unsigned char blocksize_bits, cluster_size_shift;
1556
1557        ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.",
1558                        ni->mft_no, (unsigned long long)block);
1559        if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) {
1560                ntfs_error(vol->sb, "BMAP does not make sense for %s "
1561                                "attributes, returning 0.",
1562                                (ni->type != AT_DATA) ? "non-data" :
1563                                (!NInoNonResident(ni) ? "resident" :
1564                                "encrypted"));
1565                return 0;
1566        }
1567        /* None of these can happen. */
1568        BUG_ON(NInoCompressed(ni));
1569        BUG_ON(NInoMstProtected(ni));
1570        blocksize = vol->sb->s_blocksize;
1571        blocksize_bits = vol->sb->s_blocksize_bits;
1572        ofs = (s64)block << blocksize_bits;
1573        read_lock_irqsave(&ni->size_lock, flags);
1574        size = ni->initialized_size;
1575        i_size = i_size_read(VFS_I(ni));
1576        read_unlock_irqrestore(&ni->size_lock, flags);
1577        /*
1578         * If the offset is outside the initialized size or the block straddles
1579         * the initialized size then pretend it is a hole unless the
1580         * initialized size equals the file size.
1581         */
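        /*
         * Worked example (hypothetical numbers): with a 512 byte block
         * size, initialized_size 1000 and i_size 2048, block 1 covers
         * bytes 512-1023, straddles the initialized size, and is thus
         * reported as a hole.
         */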
1582        if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size)))
1583                goto hole;
1584        cluster_size_shift = vol->cluster_size_bits;
1585        down_read(&ni->runlist.lock);
1586        lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false);
1587        up_read(&ni->runlist.lock);
1588        if (unlikely(lcn < LCN_HOLE)) {
1589                /*
1590                 * Step down to an integer to avoid gcc doing a long long
1591                 * comparison in the switch when we know @lcn is between
1592                 * LCN_HOLE and LCN_EIO (i.e. -1 to -5).
1593                 *
1594                 * Otherwise older gcc (at least on some architectures) will
1595                 * try to use __cmpdi2() which is of course not available in
1596                 * the kernel.
1597                 */
1598                switch ((int)lcn) {
1599                case LCN_ENOENT:
1600                        /*
1601                         * If the offset is out of bounds then pretend it is a
1602                         * hole.
1603                         */
1604                        goto hole;
1605                case LCN_ENOMEM:
1606                        ntfs_error(vol->sb, "Not enough memory to complete "
1607                                        "mapping for inode 0x%lx.  "
1608                                        "Returning 0.", ni->mft_no);
1609                        break;
1610                default:
1611                        ntfs_error(vol->sb, "Failed to complete mapping for "
1612                                        "inode 0x%lx.  Run chkdsk.  "
1613                                        "Returning 0.", ni->mft_no);
1614                        break;
1615                }
1616                return 0;
1617        }
1618        if (lcn < 0) {
1619                /* It is a hole. */
1620hole:
1621                ntfs_debug("Done (returning hole).");
1622                return 0;
1623        }
1624        /*
1625         * The block is really allocated and fulfills all our criteria.
1626         * Convert the cluster to units of block size and return the result.
1627         */
1628        delta = ofs & vol->cluster_size_mask;
1629        if (unlikely(sizeof(block) < sizeof(lcn))) {
1630                block = lcn = ((lcn << cluster_size_shift) + delta) >>
1631                                blocksize_bits;
1632                /* If the block number was truncated return 0. */
1633                if (unlikely(block != lcn)) {
1634                        ntfs_error(vol->sb, "Physical block 0x%llx is too "
1635                                        "large to be returned, returning 0.",
1636                                        (long long)lcn);
1637                        return 0;
1638                }
1639        } else
1640                block = ((lcn << cluster_size_shift) + delta) >>
1641                                blocksize_bits;
1642        ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)block);
1643        return block;
1644}
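
/*
 * Worked example for the conversion above (hypothetical numbers, kept under
 * #if 0 so it is not compiled): with 4096 byte clusters
 * (cluster_size_shift = 12), 512 byte blocks (blocksize_bits = 9), LCN 0x100
 * and an in-cluster byte offset (delta) of 1024, the device block is
 * ((0x100 << 12) + 1024) >> 9 = 2050.
 */
#if 0
static sector_t example_lcn_to_block(LCN lcn, unsigned delta,
                unsigned char cluster_size_shift, unsigned char blocksize_bits)
{
        /* Same arithmetic as the tail of ntfs_bmap(). */
        return ((lcn << cluster_size_shift) + delta) >> blocksize_bits;
}
#endif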
1645
1646/**
1647 * ntfs_normal_aops - address space operations for normal inodes and attributes
1648 *
1649 * Note these are not used for compressed or mst protected inodes and
1650 * attributes.
1651 */
1652const struct address_space_operations ntfs_normal_aops = {
1653        .readpage       = ntfs_readpage,
1654#ifdef NTFS_RW
1655        .writepage      = ntfs_writepage,
1656        .set_page_dirty = __set_page_dirty_buffers,
1657#endif /* NTFS_RW */
1658        .bmap           = ntfs_bmap,
1659        .migratepage    = buffer_migrate_page,
1660        .is_partially_uptodate = block_is_partially_uptodate,
1661        .error_remove_page = generic_error_remove_page,
1662};
1663
1664/**
1665 * ntfs_compressed_aops - address space operations for compressed inodes
1666 */
1667const struct address_space_operations ntfs_compressed_aops = {
1668        .readpage       = ntfs_readpage,
1669#ifdef NTFS_RW
1670        .writepage      = ntfs_writepage,
1671        .set_page_dirty = __set_page_dirty_buffers,
1672#endif /* NTFS_RW */
1673        .migratepage    = buffer_migrate_page,
1674        .is_partially_uptodate = block_is_partially_uptodate,
1675        .error_remove_page = generic_error_remove_page,
1676};
1677
1678/**
1679 * ntfs_mst_aops - general address space operations for mst protected inodes
1680 *                 and attributes
1681 */
1682const struct address_space_operations ntfs_mst_aops = {
1683        .readpage       = ntfs_readpage,        /* Fill page with data. */
1684#ifdef NTFS_RW
1685        .writepage      = ntfs_writepage,       /* Write dirty page to disk. */
1686        .set_page_dirty = __set_page_dirty_nobuffers,   /* Set the page dirty
1687                                                   without touching the buffers
1688                                                   belonging to the page. */
1689#endif /* NTFS_RW */
1690        .migratepage    = buffer_migrate_page,
1691        .is_partially_uptodate  = block_is_partially_uptodate,
1692        .error_remove_page = generic_error_remove_page,
1693};
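
/*
 * Illustrative sketch (not part of this file): inode setup code elsewhere in
 * the driver selects one of the three tables above from the inode state,
 * roughly as below.  The helper name is made up for exposition and the
 * snippet is kept under #if 0 so it is not compiled.
 */
#if 0
static void example_pick_ntfs_aops(struct inode *vi, ntfs_inode *ni)
{
        if (NInoMstProtected(ni))
                vi->i_mapping->a_ops = &ntfs_mst_aops;
        else if (NInoCompressed(ni))
                vi->i_mapping->a_ops = &ntfs_compressed_aops;
        else
                vi->i_mapping->a_ops = &ntfs_normal_aops;
}
#endif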
1694
1695#ifdef NTFS_RW
1696
1697/**
1698 * mark_ntfs_record_dirty - mark an ntfs record dirty
1699 * @page:       page containing the ntfs record to mark dirty
1700 * @ofs:        byte offset within @page at which the ntfs record begins
1701 *
1702 * Set the buffers and the page in which the ntfs record is located dirty.
1703 *
1704 * The latter also marks the vfs inode the ntfs record belongs to dirty
1705 * (I_DIRTY_PAGES only).
1706 *
1707 * If the page does not have buffers, we create them and set them uptodate.
1708 * The page may not be locked, which is why we need to handle the buffers
1709 * under the mapping->private_lock.  Once the buffers are marked dirty, we no
1710 * longer need the lock since try_to_free_buffers() does not free dirty buffers.
1711 */
1712void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs)
{
1713        struct address_space *mapping = page->mapping;
1714        ntfs_inode *ni = NTFS_I(mapping->host);
1715        struct buffer_head *bh, *head, *buffers_to_free = NULL;
1716        unsigned int end, bh_size, bh_ofs;
1717
1718        BUG_ON(!PageUptodate(page));
1719        end = ofs + ni->itype.index.block_size;
1720        bh_size = VFS_I(ni)->i_sb->s_blocksize;
1721        spin_lock(&mapping->private_lock);
1722        if (unlikely(!page_has_buffers(page))) {
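                /*
                 * Buffer head allocation may sleep, so drop the spinlock
                 * first and re-check after reacquiring it: another task may
                 * have attached buffers in the meantime, in which case the
                 * set we allocated is released via @buffers_to_free below.
                 */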
1723                spin_unlock(&mapping->private_lock);
1724                bh = head = alloc_page_buffers(page, bh_size, true);
1725                spin_lock(&mapping->private_lock);
1726                if (likely(!page_has_buffers(page))) {
1727                        struct buffer_head *tail;
1728
1729                        do {
1730                                set_buffer_uptodate(bh);
1731                                tail = bh;
1732                                bh = bh->b_this_page;
1733                        } while (bh);
1734                        tail->b_this_page = head;
1735                        attach_page_private(page, head);
1736                } else
1737                        buffers_to_free = bh;
1738        }
1739        bh = head = page_buffers(page);
1740        BUG_ON(!bh);
1741        do {
1742                bh_ofs = bh_offset(bh);
1743                if (bh_ofs + bh_size <= ofs)
1744                        continue;
1745                if (unlikely(bh_ofs >= end))
1746                        break;
1747                set_buffer_dirty(bh);
1748        } while ((bh = bh->b_this_page) != head);
1749        spin_unlock(&mapping->private_lock);
1750        __set_page_dirty_nobuffers(page);
1751        if (unlikely(buffers_to_free)) {
1752                do {
1753                        bh = buffers_to_free->b_this_page;
1754                        free_buffer_head(buffers_to_free);
1755                        buffers_to_free = bh;
1756                } while (buffers_to_free);
1757        }
1758}
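
/*
 * Usage sketch (illustrative only, kept under #if 0 so it is not compiled):
 * code that has just modified an ntfs record inside an uptodate page cache
 * page, e.g. an index block starting at byte offset @ofs, dirties exactly
 * that record as follows; the page does not need to be locked.
 */
#if 0
static void example_modify_and_dirty_record(struct page *page,
                const unsigned int ofs)
{
        u8 *kaddr = kmap(page);

        /* ... modify the ntfs record at kaddr + ofs here ... */
        kunmap(page);
        flush_dcache_page(page);
        mark_ntfs_record_dirty(page, ofs);
}
#endif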
1759
1760#endif /* NTFS_RW */
1761