LXR linux/fs/btrfs/file.c

   1/*
   2 * Copyright (C) 2007 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   16 * Boston, MA 02111-1307, USA.
  17 */
  18
  19#include <linux/fs.h>
  20#include <linux/pagemap.h>
  21#include <linux/highmem.h>
  22#include <linux/time.h>
  23#include <linux/init.h>
  24#include <linux/string.h>
  25#include <linux/backing-dev.h>
  26#include <linux/mpage.h>
  27#include <linux/falloc.h>
  28#include <linux/swap.h>
  29#include <linux/writeback.h>
  30#include <linux/statfs.h>
  31#include <linux/compat.h>
  32#include <linux/slab.h>
  33#include "ctree.h"
  34#include "disk-io.h"
  35#include "transaction.h"
  36#include "btrfs_inode.h"
  37#include "ioctl.h"
  38#include "print-tree.h"
  39#include "tree-log.h"
  40#include "locking.h"
  41#include "compat.h"
  42
  43
  44/* simple helper to fault in pages and copy.  This should go away
  45 * and be replaced with calls into generic code.
  46 */
  47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
  48                                         int write_bytes,
  49                                         struct page **prepared_pages,
  50                                         struct iov_iter *i)
  51{
  52        size_t copied = 0;
  53        int pg = 0;
  54        int offset = pos & (PAGE_CACHE_SIZE - 1);
  55        int total_copied = 0;
  56
  57        while (write_bytes > 0) {
  58                size_t count = min_t(size_t,
  59                                     PAGE_CACHE_SIZE - offset, write_bytes);
  60                struct page *page = prepared_pages[pg];
  61                /*
  62                 * Copy data from userspace to the current page
  63                 *
  64                 * Disable pagefault to avoid recursive lock since
  65                 * the pages are already locked
  66                 */
  67                pagefault_disable();
  68                copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
  69                pagefault_enable();
  70
  71                /* Flush processor's dcache for this page */
  72                flush_dcache_page(page);
  73
  74                /*
  75                 * if we get a partial write, we can end up with
  76                 * partially up to date pages.  These add
  77                 * a lot of complexity, so make sure they don't
  78                 * happen by forcing this copy to be retried.
  79                 *
  80                 * The rest of the btrfs_file_write code will fall
  81                 * back to page at a time copies after we return 0.
  82                 */
  83                if (!PageUptodate(page) && copied < count)
  84                        copied = 0;
  85
  86                iov_iter_advance(i, copied);
  87                write_bytes -= copied;
  88                total_copied += copied;
  89
  90                /* Return to btrfs_file_aio_write to fault page */
  91                if (unlikely(copied == 0)) {
  92                        break;
  93                }
  94
  95                if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
  96                        offset += copied;
  97                } else {
  98                        pg++;
  99                        offset = 0;
 100                }
 101        }
 102        return total_copied;
 103}
 104
 105/*
 106 * unlocks pages after btrfs_file_write is done with them
 107 */
 108static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
 109{
 110        size_t i;
 111        for (i = 0; i < num_pages; i++) {
 112                if (!pages[i])
 113                        break;
 114                /* page checked is some magic around finding pages that
 115                 * have been modified without going through btrfs_set_page_dirty
 116                 * clear it here
 117                 */
 118                ClearPageChecked(pages[i]);
 119                unlock_page(pages[i]);
 120                mark_page_accessed(pages[i]);
 121                page_cache_release(pages[i]);
 122        }
 123}
 124
 125/*
 126 * after copy_from_user, pages need to be dirtied and we need to make
 127 * sure holes are created between the current EOF and the start of
 128 * any next extents (if required).
 129 *
 130 * this also makes the decision about creating an inline extent vs
 131 * doing real data extents, marking pages dirty and delalloc as required.
 132 */
 133static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 134                                   struct btrfs_root *root,
 135                                   struct file *file,
 136                                   struct page **pages,
 137                                   size_t num_pages,
 138                                   loff_t pos,
 139                                   size_t write_bytes)
 140{
 141        int err = 0;
 142        int i;
 143        struct inode *inode = fdentry(file)->d_inode;
 144        u64 num_bytes;
 145        u64 start_pos;
 146        u64 end_of_last_block;
 147        u64 end_pos = pos + write_bytes;
 148        loff_t isize = i_size_read(inode);
 149
 150        start_pos = pos & ~((u64)root->sectorsize - 1);
 151        num_bytes = (write_bytes + pos - start_pos +
 152                    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 153
 154        end_of_last_block = start_pos + num_bytes - 1;
 155        err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
 156                                        NULL);
 157        BUG_ON(err);
 158
 159        for (i = 0; i < num_pages; i++) {
 160                struct page *p = pages[i];
 161                SetPageUptodate(p);
 162                ClearPageChecked(p);
 163                set_page_dirty(p);
 164        }
 165        if (end_pos > isize) {
 166                i_size_write(inode, end_pos);
 167                /* we've only changed i_size in ram, and we haven't updated
 168                 * the disk i_size.  There is no need to log the inode
 169                 * at this time.
 170                 */
 171        }
 172        return 0;
 173}
 174
 175/*
 176 * this drops all the extents in the cache that intersect the range
 177 * [start, end].  Existing extents are split as required.
 178 */
 179int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 180                            int skip_pinned)
 181{
 182        struct extent_map *em;
 183        struct extent_map *split = NULL;
 184        struct extent_map *split2 = NULL;
 185        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 186        u64 len = end - start + 1;
 187        int ret;
 188        int testend = 1;
 189        unsigned long flags;
 190        int compressed = 0;
 191
 192        WARN_ON(end < start);
 193        if (end == (u64)-1) {
 194                len = (u64)-1;
 195                testend = 0;
 196        }
 197        while (1) {
 198                if (!split)
 199                        split = alloc_extent_map(GFP_NOFS);
 200                if (!split2)
 201                        split2 = alloc_extent_map(GFP_NOFS);
 202                BUG_ON(!split || !split2);
 203
 204                write_lock(&em_tree->lock);
 205                em = lookup_extent_mapping(em_tree, start, len);
 206                if (!em) {
 207                        write_unlock(&em_tree->lock);
 208                        break;
 209                }
 210                flags = em->flags;
 211                if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
 212                        if (testend && em->start + em->len >= start + len) {
 213                                free_extent_map(em);
 214                                write_unlock(&em_tree->lock);
 215                                break;
 216                        }
 217                        start = em->start + em->len;
 218                        if (testend)
 219                                len = start + len - (em->start + em->len);
 220                        free_extent_map(em);
 221                        write_unlock(&em_tree->lock);
 222                        continue;
 223                }
 224                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 225                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 226                remove_extent_mapping(em_tree, em);
 227
 228                if (em->block_start < EXTENT_MAP_LAST_BYTE &&
 229                    em->start < start) {
 230                        split->start = em->start;
 231                        split->len = start - em->start;
 232                        split->orig_start = em->orig_start;
 233                        split->block_start = em->block_start;
 234
 235                        if (compressed)
 236                                split->block_len = em->block_len;
 237                        else
 238                                split->block_len = split->len;
 239
 240                        split->bdev = em->bdev;
 241                        split->flags = flags;
 242                        split->compress_type = em->compress_type;
 243                        ret = add_extent_mapping(em_tree, split);
 244                        BUG_ON(ret);
 245                        free_extent_map(split);
 246                        split = split2;
 247                        split2 = NULL;
 248                }
 249                if (em->block_start < EXTENT_MAP_LAST_BYTE &&
 250                    testend && em->start + em->len > start + len) {
 251                        u64 diff = start + len - em->start;
 252
 253                        split->start = start + len;
 254                        split->len = em->start + em->len - (start + len);
 255                        split->bdev = em->bdev;
 256                        split->flags = flags;
 257                        split->compress_type = em->compress_type;
 258
 259                        if (compressed) {
 260                                split->block_len = em->block_len;
 261                                split->block_start = em->block_start;
 262                                split->orig_start = em->orig_start;
 263                        } else {
 264                                split->block_len = split->len;
 265                                split->block_start = em->block_start + diff;
 266                                split->orig_start = split->start;
 267                        }
 268
 269                        ret = add_extent_mapping(em_tree, split);
 270                        BUG_ON(ret);
 271                        free_extent_map(split);
 272                        split = NULL;
 273                }
 274                write_unlock(&em_tree->lock);
 275
 276                /* once for us */
 277                free_extent_map(em);
 278                /* once for the tree*/
 279                free_extent_map(em);
 280        }
 281        if (split)
 282                free_extent_map(split);
 283        if (split2)
 284                free_extent_map(split2);
 285        return 0;
 286}
 287
  288/*
  289 * this is very complex, but the basic idea is to drop all extents
  290 * in the range start - end.  *hint_byte is filled in with a disk byte
  291 * number that would be a good hint to the block allocator for this file.
  292 *
  293 * If an extent intersects the range but is not entirely inside the range
  294 * it is either truncated or split.  Anything entirely inside the range
  295 * is deleted from the tree.
      *
      * The range is [start, end): end itself is not dropped.  When drop_cache
      * is non-zero the cached extent maps for the range are dropped first.
      *
      * hint_byte must not be NULL; it is written whenever an extent with a
      * non-zero disk_bytenr is touched.
      *
      * Returns 0 on success, -ENOMEM when the path cannot be allocated, or the
      * negative error returned by the extent lookup, btrfs_next_leaf() or
      * btrfs_duplicate_item().  Failures of the ref-count and delete helpers
      * trigger BUG_ON().
  296 */
  297int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
  298                       u64 start, u64 end, u64 *hint_byte, int drop_cache)
  299{
  300        struct btrfs_root *root = BTRFS_I(inode)->root;
  301        struct extent_buffer *leaf;
  302        struct btrfs_file_extent_item *fi;
  303        struct btrfs_path *path;
  304        struct btrfs_key key;
  305        struct btrfs_key new_key;
  306        u64 search_start = start;
  307        u64 disk_bytenr = 0;
  308        u64 num_bytes = 0;
  309        u64 extent_offset = 0;
  310        u64 extent_end = 0;
  311        int del_nr = 0;
  312        int del_slot = 0;
  313        int extent_type;
  314        int recow;
  315        int ret;
  316
             /* the cache helper takes an inclusive end, hence end - 1 */
  317        if (drop_cache)
  318                btrfs_drop_extent_cache(inode, start, end - 1, 0);
  319
  320        path = btrfs_alloc_path();
  321        if (!path)
  322                return -ENOMEM;
  323
             /* every pass of this loop starts with a fresh search at search_start */
  324        while (1) {
  325                recow = 0;
  326                ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
  327                                               search_start, -1);
  328                if (ret < 0)
  329                        break;
                     /*
                      * no exact match on the very first search: step back one
                      * slot if the previous item is a file extent of this
                      * inode (it may still cover start)
                      */
  330                if (ret > 0 && path->slots[0] > 0 && search_start == start) {
  331                        leaf = path->nodes[0];
  332                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
  333                        if (key.objectid == inode->i_ino &&
  334                            key.type == BTRFS_EXTENT_DATA_KEY)
  335                                path->slots[0]--;
  336                }
  337                ret = 0;
  338next_slot:
  339                leaf = path->nodes[0];
                     /*
                      * ran off the end of this leaf: move to the next one.  A
                      * positive return from btrfs_next_leaf is treated as
                      * "nothing left".  recow makes us search again before any
                      * item in the new leaf is modified.
                      */
  340                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
  341                        BUG_ON(del_nr > 0);
  342                        ret = btrfs_next_leaf(root, path);
  343                        if (ret < 0)
  344                                break;
  345                        if (ret > 0) {
  346                                ret = 0;
  347                                break;
  348                        }
  349                        leaf = path->nodes[0];
  350                        recow = 1;
  351                }
  352
                     /* stop once past this inode's file extents or at/after end */
  353                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
  354                if (key.objectid > inode->i_ino ||
  355                    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
  356                        break;
  357
  358                fi = btrfs_item_ptr(leaf, path->slots[0],
  359                                    struct btrfs_file_extent_item);
  360                extent_type = btrfs_file_extent_type(leaf, fi);
  361
                     /*
                      * work out where this extent ends in the file; an unknown
                      * type is warned about and given an end of search_start so
                      * that the check below skips it
                      */
  362                if (extent_type == BTRFS_FILE_EXTENT_REG ||
  363                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
  364                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
  365                        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
  366                        extent_offset = btrfs_file_extent_offset(leaf, fi);
  367                        extent_end = key.offset +
  368                                btrfs_file_extent_num_bytes(leaf, fi);
  369                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
  370                        extent_end = key.offset +
  371                                btrfs_file_extent_inline_len(leaf, fi);
  372                } else {
  373                        WARN_ON(1);
  374                        extent_end = search_start;
  375                }
  376
                     /* extent ends at or before the point of interest: skip it */
  377                if (extent_end <= search_start) {
  378                        path->slots[0]++;
  379                        goto next_slot;
  380                }
  381
                     /* we crossed into a new leaf above: redo the search first */
  382                search_start = max(key.offset, start);
  383                if (recow) {
  384                        btrfs_release_path(root, path);
  385                        continue;
  386                }
  387
  388                /*
  389                 *     | - range to drop - |
  390                 *  | -------- extent -------- |
  391                 */
  392                if (start > key.offset && end < extent_end) {
  393                        BUG_ON(del_nr > 0);
  394                        BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
  395
                             /* split the item in two at start */
  396                        memcpy(&new_key, &key, sizeof(new_key));
  397                        new_key.offset = start;
  398                        ret = btrfs_duplicate_item(trans, root, path,
  399                                                   &new_key);
  400                        if (ret == -EAGAIN) {
  401                                btrfs_release_path(root, path);
  402                                continue;
  403                        }
  404                        if (ret < 0)
  405                                break;
  406
                             /* left half (previous slot) now ends at start */
  407                        leaf = path->nodes[0];
  408                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
  409                                            struct btrfs_file_extent_item);
  410                        btrfs_set_file_extent_num_bytes(leaf, fi,
  411                                                        start - key.offset);
  412
                             /* right half (current slot) covers start..extent_end */
  413                        fi = btrfs_item_ptr(leaf, path->slots[0],
  414                                            struct btrfs_file_extent_item);
  415
  416                        extent_offset += start - key.offset;
  417                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
  418                        btrfs_set_file_extent_num_bytes(leaf, fi,
  419                                                        extent_end - start);
  420                        btrfs_mark_buffer_dirty(leaf);
  421
                             /*
                              * the duplicated item refers to the same disk extent,
                              * so take an extra reference on it.  NOTE(review): a
                              * disk_bytenr of 0 presumably marks a hole - confirm.
                              */
  422                        if (disk_bytenr > 0) {
  423                                ret = btrfs_inc_extent_ref(trans, root,
  424                                                disk_bytenr, num_bytes, 0,
  425                                                root->root_key.objectid,
  426                                                new_key.objectid,
  427                                                start - extent_offset);
  428                                BUG_ON(ret);
  429                                *hint_byte = disk_bytenr;
  430                        }
                             /*
                              * carry on with the right half, which now starts at
                              * start; since end < extent_end the next case trims
                              * its front and finishes
                              */
  431                        key.offset = start;
  432                }
  433                /*
  434                 *  | ---- range to drop ----- |
  435                 *      | -------- extent -------- |
  436                 */
  437                if (start <= key.offset && end < extent_end) {
  438                        BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
  439
                             /* move the item's start up to end and shrink it */
  440                        memcpy(&new_key, &key, sizeof(new_key));
  441                        new_key.offset = end;
  442                        btrfs_set_item_key_safe(trans, root, path, &new_key);
  443
  444                        extent_offset += end - key.offset;
  445                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
  446                        btrfs_set_file_extent_num_bytes(leaf, fi,
  447                                                        extent_end - end);
  448                        btrfs_mark_buffer_dirty(leaf);
  449                        if (disk_bytenr > 0) {
  450                                inode_sub_bytes(inode, end - key.offset);
  451                                *hint_byte = disk_bytenr;
  452                        }
  453                        break;
  454                }
  455
                     /* this extent ends inside the range: later searches resume after it */
  456                search_start = extent_end;
  457                /*
  458                 *       | ---- range to drop ----- |
  459                 *  | -------- extent -------- |
  460                 */
  461                if (start > key.offset && end >= extent_end) {
  462                        BUG_ON(del_nr > 0);
  463                        BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
  464
                             /* cut the tail off: the item now ends at start */
  465                        btrfs_set_file_extent_num_bytes(leaf, fi,
  466                                                        start - key.offset);
  467                        btrfs_mark_buffer_dirty(leaf);
  468                        if (disk_bytenr > 0) {
  469                                inode_sub_bytes(inode, extent_end - start);
  470                                *hint_byte = disk_bytenr;
  471                        }
  472                        if (end == extent_end)
  473                                break;
  474
  475                        path->slots[0]++;
  476                        goto next_slot;
  477                }
  478
  479                /*
  480                 *  | ---- range to drop ----- |
  481                 *    | ------ extent ------ |
  482                 */
  483                if (start <= key.offset && end >= extent_end) {
                             /*
                              * fully covered items are collected as one contiguous
                              * run of slots [del_slot, del_slot + del_nr) and
                              * deleted together
                              */
  484                        if (del_nr == 0) {
  485                                del_slot = path->slots[0];
  486                                del_nr = 1;
  487                        } else {
  488                                BUG_ON(del_slot + del_nr != path->slots[0]);
  489                                del_nr++;
  490                        }
  491
                             /*
                              * inline extents: account the bytes, then round the
                              * end up to a sector boundary before it is compared
                              * with end below
                              */
  492                        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
  493                                inode_sub_bytes(inode,
  494                                                extent_end - key.offset);
  495                                extent_end = ALIGN(extent_end,
  496                                                   root->sectorsize);
  497                        } else if (disk_bytenr > 0) {
  498                                ret = btrfs_free_extent(trans, root,
  499                                                disk_bytenr, num_bytes, 0,
  500                                                root->root_key.objectid,
  501                                                key.objectid, key.offset -
  502                                                extent_offset);
  503                                BUG_ON(ret);
  504                                inode_sub_bytes(inode,
  505                                                extent_end - key.offset);
  506                                *hint_byte = disk_bytenr;
  507                        }
  508
  509                        if (end == extent_end)
  510                                break;
  511
                             /* more items in this leaf: keep extending the run */
  512                        if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
  513                                path->slots[0]++;
  514                                goto next_slot;
  515                        }
  516
                             /* last item of the leaf: delete the run and search again */
  517                        ret = btrfs_del_items(trans, root, path, del_slot,
  518                                              del_nr);
  519                        BUG_ON(ret);
  520
  521                        del_nr = 0;
  522                        del_slot = 0;
  523
  524                        btrfs_release_path(root, path);
  525                        continue;
  526                }
  527
                     /* the four overlap cases above cover every combination */
  528                BUG_ON(1);
  529        }
  530
             /* delete any run still pending when the loop was left */
  531        if (del_nr > 0) {
  532                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
  533                BUG_ON(ret);
  534        }
  535
  536        btrfs_free_path(path);
  537        return ret;
  538}
 539
 540static int extent_mergeable(struct extent_buffer *leaf, int slot,
 541                            u64 objectid, u64 bytenr, u64 orig_offset,
 542                            u64 *start, u64 *end)
 543{
 544        struct btrfs_file_extent_item *fi;
 545        struct btrfs_key key;
 546        u64 extent_end;
 547
 548        if (slot < 0 || slot >= btrfs_header_nritems(leaf))
 549                return 0;
 550
 551        btrfs_item_key_to_cpu(leaf, &key, slot);
 552        if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
 553                return 0;
 554
 555        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 556        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
 557            btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
 558            btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
 559            btrfs_file_extent_compression(leaf, fi) ||
 560            btrfs_file_extent_encryption(leaf, fi) ||
 561            btrfs_file_extent_other_encoding(leaf, fi))
 562                return 0;
 563
 564        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
 565        if ((*start && *start != key.offset) || (*end && *end != extent_end))
 566                return 0;
 567
 568        *start = key.offset;
 569        *end = extent_end;
 570        return 1;
 571}
 572
 573/*
 574 * Mark extent in the range start - end as written.
 575 *
 576 * This changes extent type from 'pre-allocated' to 'regular'. If only
 577 * part of extent is marked as written, the extent will be split into
 578 * two or three.
 579 */
 580int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 581                              struct inode *inode, u64 start, u64 end)
 582{
 583        struct btrfs_root *root = BTRFS_I(inode)->root;
 584        struct extent_buffer *leaf;
 585        struct btrfs_path *path;
 586        struct btrfs_file_extent_item *fi;
 587        struct btrfs_key key;
 588        struct btrfs_key new_key;
 589        u64 bytenr;
 590        u64 num_bytes;
 591        u64 extent_end;
 592        u64 orig_offset;
 593        u64 other_start;
 594        u64 other_end;
 595        u64 split;
 596        int del_nr = 0;
 597        int del_slot = 0;
 598        int recow;
 599        int ret;
 600
 601        btrfs_drop_extent_cache(inode, start, end - 1, 0);
 602
 603        path = btrfs_alloc_path();
 604        BUG_ON(!path);
 605again:
 606        recow = 0;
 607        split = start;
 608        key.objectid = inode->i_ino;
 609        key.type = BTRFS_EXTENT_DATA_KEY;
 610        key.offset = split;
 611
 612        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 613        if (ret > 0 && path->slots[0] > 0)
 614                path->slots[0]--;
 615
 616        leaf = path->nodes[0];
 617        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 618        BUG_ON(key.objectid != inode->i_ino ||
 619               key.type != BTRFS_EXTENT_DATA_KEY);
 620        fi = btrfs_item_ptr(leaf, path->slots[0],
 621                            struct btrfs_file_extent_item);
 622        BUG_ON(btrfs_file_extent_type(leaf, fi) !=
 623               BTRFS_FILE_EXTENT_PREALLOC);
 624        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
 625        BUG_ON(key.offset > start || extent_end < end);
 626
 627        bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 628        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 629        orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
 630        memcpy(&new_key, &key, sizeof(new_key));
 631
 632        if (start == key.offset && end < extent_end) {
 633                other_start = 0;
 634                other_end = start;
 635                if (extent_mergeable(leaf, path->slots[0] - 1,
 636                                     inode->i_ino, bytenr, orig_offset,
 637                                     &other_start, &other_end)) {
 638                        new_key.offset = end;
 639                        btrfs_set_item_key_safe(trans, root, path, &new_key);
 640                        fi = btrfs_item_ptr(leaf, path->slots[0],
 641                                            struct btrfs_file_extent_item);
 642                        btrfs_set_file_extent_num_bytes(leaf, fi,
 643                                                        extent_end - end);
 644                        btrfs_set_file_extent_offset(leaf, fi,
 645                                                     end - orig_offset);
 646                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 647                                            struct btrfs_file_extent_item);
 648                        btrfs_set_file_extent_num_bytes(leaf, fi,
 649                                                        end - other_start);
 650                        btrfs_mark_buffer_dirty(leaf);
 651                        goto out;
 652                }
 653        }
 654
 655        if (start > key.offset && end == extent_end) {
 656                other_start = end;
 657                other_end = 0;
 658                if (extent_mergeable(leaf, path->slots[0] + 1,
 659                                     inode->i_ino, bytenr, orig_offset,
 660                                     &other_start, &other_end)) {
 661                        fi = btrfs_item_ptr(leaf, path->slots[0],
 662                                            struct btrfs_file_extent_item);
 663                        btrfs_set_file_extent_num_bytes(leaf, fi,
 664                                                        start - key.offset);
 665                        path->slots[0]++;
 666                        new_key.offset = start;
 667                        btrfs_set_item_key_safe(trans, root, path, &new_key);
 668
 669                        fi = btrfs_item_ptr(leaf, path->slots[0],
 670                                            struct btrfs_file_extent_item);
 671                        btrfs_set_file_extent_num_bytes(leaf, fi,
 672                                                        other_end - start);
 673                        btrfs_set_file_extent_offset(leaf, fi,
 674                                                     start - orig_offset);
 675                        btrfs_mark_buffer_dirty(leaf);
 676                        goto out;
 677                }
 678        }
 679
 680        while (start > key.offset || end < extent_end) {
 681                if (key.offset == start)
 682                        split = end;
 683
 684                new_key.offset = split;
 685                ret = btrfs_duplicate_item(trans, root, path, &new_key);
 686                if (ret == -EAGAIN) {
 687                        btrfs_release_path(root, path);
 688                        goto again;
 689                }
 690                BUG_ON(ret < 0);
 691
 692                leaf = path->nodes[0];
 693                fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 694                                    struct btrfs_file_extent_item);
 695                btrfs_set_file_extent_num_bytes(leaf, fi,
 696                                                split - key.offset);
 697
 698                fi = btrfs_item_ptr(leaf, path->slots[0],
 699                                    struct btrfs_file_extent_item);
 700
 701                btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
 702                btrfs_set_file_extent_num_bytes(leaf, fi,
 703                                                extent_end - split);
 704                btrfs_mark_buffer_dirty(leaf);
 705
 706                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
 707                                           root->root_key.objectid,
 708                                           inode->i_ino, orig_offset);
 709                BUG_ON(ret);
 710
 711                if (split == start) {
 712                        key.offset = start;
 713                } else {
 714                        BUG_ON(start != key.offset);
 715                        path->slots[0]--;
 716                        extent_end = end;
 717                }
 718                recow = 1;
 719        }
 720
 721        other_start = end;
 722        other_end = 0;
 723        if (extent_mergeable(leaf, path->slots[0] + 1,
 724                             inode->i_ino, bytenr, orig_offset,
 725                             &other_start, &other_end)) {
 726                if (recow) {
 727                        btrfs_release_path(root, path);
 728                        goto again;
 729                }
 730                extent_end = other_end;
 731                del_slot = path->slots[0] + 1;
 732                del_nr++;
 733                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 734                                        0, root->root_key.objectid,
 735                                        inode->i_ino, orig_offset);
 736                BUG_ON(ret);
 737        }
 738        other_start = 0;
 739        other_end = start;
 740        if (extent_mergeable(leaf, path->slots[0] - 1,
 741                             inode->i_ino, bytenr, orig_offset,
 742                             &other_start, &other_end)) {
 743                if (recow) {
 744                        btrfs_release_path(root, path);
 745                        goto again;
 746                }
 747                key.offset = other_start;
 748                del_slot = path->slots[0];
 749                del_nr++;
 750                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 751                                        0, root->root_key.objectid,
 752                                        inode->i_ino, orig_offset);
 753                BUG_ON(ret);
 754        }
 755        if (del_nr == 0) {
 756                fi = btrfs_item_ptr(leaf, path->slots[0],
 757                           struct btrfs_file_extent_item);
 758                btrfs_set_file_extent_type(leaf, fi,
 759                                           BTRFS_FILE_EXTENT_REG);
 760                btrfs_mark_buffer_dirty(leaf);
 761        } else {
 762                fi = btrfs_item_ptr(leaf, del_slot - 1,
 763                           struct btrfs_file_extent_item);
 764                btrfs_set_file_extent_type(leaf, fi,
 765                                           BTRFS_FILE_EXTENT_REG);
 766                btrfs_set_file_extent_num_bytes(leaf, fi,
 767                                                extent_end - key.offset);
 768                btrfs_mark_buffer_dirty(leaf);
 769
 770                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 771                BUG_ON(ret);
 772        }
 773out:
 774        btrfs_free_path(path);
 775        return 0;
 776}
 777
 778/*
 779 * on error we return an unlocked page and the error value
 780 * on success we return a locked page and 0
 781 */
 782static int prepare_uptodate_page(struct page *page, u64 pos)
 783{
 784        int ret = 0;
 785
 786        if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
 787                ret = btrfs_readpage(NULL, page);
 788                if (ret)
 789                        return ret;
 790                lock_page(page);
 791                if (!PageUptodate(page)) {
 792                        unlock_page(page);
 793                        return -EIO;
 794                }
 795        }
 796        return 0;
 797}
 798
 799/*
 800 * this gets pages into the page cache and locks them down, it also properly
 801 * waits for data=ordered extents to finish before allowing the pages to be
 802 * modified.
 803 */
 804static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 805                         struct page **pages, size_t num_pages,
 806                         loff_t pos, unsigned long first_index,
 807                         unsigned long last_index, size_t write_bytes)
 808{
 809        struct extent_state *cached_state = NULL;
 810        int i;
 811        unsigned long index = pos >> PAGE_CACHE_SHIFT;
 812        struct inode *inode = fdentry(file)->d_inode;
 813        int err = 0;
 814        int faili = 0;
 815        u64 start_pos;
 816        u64 last_pos;
 817
 818        start_pos = pos & ~((u64)root->sectorsize - 1);
 819        last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
 820
 821        if (start_pos > inode->i_size) {
 822                err = btrfs_cont_expand(inode, start_pos);
 823                if (err)
 824                        return err;
 825        }
 826
 827        memset(pages, 0, num_pages * sizeof(struct page *));
 828again:
 829        for (i = 0; i < num_pages; i++) {
 830                pages[i] = grab_cache_page(inode->i_mapping, index + i);
 831                if (!pages[i]) {
 832                        faili = i - 1;
 833                        err = -ENOMEM;
 834                        goto fail;
 835                }
 836
 837                if (i == 0)
 838                        err = prepare_uptodate_page(pages[i], pos);
 839                if (i == num_pages - 1)
 840                        err = prepare_uptodate_page(pages[i],
 841                                                    pos + write_bytes);
 842                if (err) {
 843                        page_cache_release(pages[i]);
 844                        faili = i - 1;
 845                        goto fail;
 846                }
 847                wait_on_page_writeback(pages[i]);
 848        }
 849        err = 0;
 850        if (start_pos < inode->i_size) {
 851                struct btrfs_ordered_extent *ordered;
 852                lock_extent_bits(&BTRFS_I(inode)->io_tree,
 853                                 start_pos, last_pos - 1, 0, &cached_state,
 854                                 GFP_NOFS);
 855                ordered = btrfs_lookup_first_ordered_extent(inode,
 856                                                            last_pos - 1);
 857                if (ordered &&
 858                    ordered->file_offset + ordered->len > start_pos &&
 859                    ordered->file_offset < last_pos) {
 860                        btrfs_put_ordered_extent(ordered);
 861                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 862                                             start_pos, last_pos - 1,
 863                                             &cached_state, GFP_NOFS);
 864                        for (i = 0; i < num_pages; i++) {
 865                                unlock_page(pages[i]);
 866                                page_cache_release(pages[i]);
 867                        }
 868                        btrfs_wait_ordered_range(inode, start_pos,
 869                                                 last_pos - start_pos);
 870                        goto again;
 871                }
 872                if (ordered)
 873                        btrfs_put_ordered_extent(ordered);
 874
 875                clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
 876                                  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
 877                                  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
 878                                  GFP_NOFS);
 879                unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 880                                     start_pos, last_pos - 1, &cached_state,
 881                                     GFP_NOFS);
 882        }
 883        for (i = 0; i < num_pages; i++) {
 884                clear_page_dirty_for_io(pages[i]);
 885                set_page_extent_mapped(pages[i]);
 886                WARN_ON(!PageLocked(pages[i]));
 887        }
 888        return 0;
 889fail:
 890        while (faili >= 0) {
 891                unlock_page(pages[faili]);
 892                page_cache_release(pages[faili]);
 893                faili--;
 894        }
 895        return err;
 896
 897}
 898
 899static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 900                                    const struct iovec *iov,
 901                                    unsigned long nr_segs, loff_t pos)
 902{
            /*
             * aio_write handler for btrfs files.
             *
             * Under i_mutex: run the generic segment and write checks, strip
             * suid, refuse the write with -EROFS when fs_state carries
             * BTRFS_SUPER_FLAG_ERROR, then write the data.  O_DIRECT goes
             * through generic_file_direct_write() first and whatever it did
             * not write is handled by the buffered loop.  The buffered loop
             * works in batches of at most nrptrs pages: fault in the user
             * buffer, reserve delalloc space, prepare_pages(), copy, dirty.
             *
             * Returns num_written when it is non-zero, otherwise err.
             */
 903        struct file *file = iocb->ki_filp;
 904        struct inode *inode = fdentry(file)->d_inode;
 905        struct btrfs_root *root = BTRFS_I(inode)->root;
 906        struct page **pages = NULL;
 907        struct iov_iter i;
 908        loff_t *ppos = &iocb->ki_pos;
 909        loff_t start_pos;
 910        ssize_t num_written = 0;
 911        ssize_t err = 0;
 912        size_t count;
 913        size_t ocount;
 914        int ret = 0;
 915        int nrptrs;
 916        unsigned long first_index;
 917        unsigned long last_index;
 918        int will_write;
 919        int buffered = 0;
 920        int copied = 0;
 921        int dirty_pages = 0;
 922
            /*
             * will_write: for these opens the copied range is started with
             * filemap_fdatawrite_range() right away and waited on after the
             * loop instead of going through dirty page balancing.
             */
 923        will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
 924                      (file->f_flags & O_DIRECT));
 925
 926        start_pos = pos;
 927
 928        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 929
 930        mutex_lock(&inode->i_mutex);
 931
 932        err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
 933        if (err)
 934                goto out;
 935        count = ocount;
 936
 937        current->backing_dev_info = inode->i_mapping->backing_dev_info;
 938        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 939        if (err)
 940                goto out;
 941
 942        if (count == 0)
 943                goto out;
 944
 945        err = file_remove_suid(file);
 946        if (err)
 947                goto out;
 948
 949        /*
 950         * If BTRFS flips readonly due to some impossible error
 951         * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
 952         * although we have opened a file as writable, we have
 953         * to stop this write operation to ensure FS consistency.
 954         */
 955        if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
 956                err = -EROFS;
 957                goto out;
 958        }
 959
 960        file_update_time(file);
 961        BTRFS_I(inode)->sequence++;
 962
 963        if (unlikely(file->f_flags & O_DIRECT)) {
 964                num_written = generic_file_direct_write(iocb, iov, &nr_segs,
 965                                                        pos, ppos, count,
 966                                                        ocount);
 967                /*
 968                 * the generic O_DIRECT will update in-memory i_size after the
 969                 * DIOs are done.  But our endio handlers that update the on
 970                 * disk i_size never update past the in memory i_size.  So we
 971                 * need one more update here to catch any additions to the
 972                 * file
 973                 */
 974                if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
 975                        btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 976                        mark_inode_dirty(inode);
 977                }
 978
 979                if (num_written < 0) {
 980                        ret = num_written;
 981                        num_written = 0;
 982                        goto out;
 983                } else if (num_written == count) {
 984                        /* pick up pos changes done by the generic code */
 985                        pos = *ppos;
 986                        goto out;
 987                }
 988                /*
 989                 * We are going to do buffered for the rest of the range, so we
 990                 * need to make sure to invalidate the buffered pages when we're
 991                 * done.
 992                 */
 993                buffered = 1;
 994                pos += num_written;
 995        }
 996
            /*
             * start the iterator num_written bytes in (non-zero only after a
             * short direct write) and size the page pointer array: enough
             * for the remaining bytes, capped at one page worth of pointers
             */
 997        iov_iter_init(&i, iov, nr_segs, count, num_written);
 998        nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
 999                     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
1000                     (sizeof(struct page *)));

1001        pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1002        if (!pages) {
1003                ret = -ENOMEM;
1004                goto out;
1005        }
1006
1007        /* generic_write_checks can change our pos */
1008        start_pos = pos;
1009
1010        first_index = pos >> PAGE_CACHE_SHIFT;
1011        last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
1012
1013        while (iov_iter_count(&i) > 0) {
1014                size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1015                size_t write_bytes = min(iov_iter_count(&i),
1016                                         nrptrs * (size_t)PAGE_CACHE_SIZE -
1017                                         offset);
1018                size_t num_pages = (write_bytes + offset +
1019                                    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1020
1021                WARN_ON(num_pages > nrptrs);
1022                memset(pages, 0, sizeof(struct page *) * nrptrs);
1023
1024                /*
1025                 * Fault pages before locking them in prepare_pages
1026                 * to avoid recursive lock
1027                 */
1028                if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) {
1029                        ret = -EFAULT;
1030                        goto out;
1031                }
1032
                    /* reserve whole pages; the unused part is released below */
1033                ret = btrfs_delalloc_reserve_space(inode,
1034                                        num_pages << PAGE_CACHE_SHIFT);
1035                if (ret)
1036                        goto out;
1037
1038                ret = prepare_pages(root, file, pages, num_pages,
1039                                    pos, first_index, last_index,
1040                                    write_bytes);
1041                if (ret) {
1042                        btrfs_delalloc_release_space(inode,
1043                                        num_pages << PAGE_CACHE_SHIFT);
1044                        goto out;
1045                }
1046
1047                copied = btrfs_copy_from_user(pos, num_pages,
1048                                           write_bytes, pages, &i);
1049
1050                /*
1051                 * if we have trouble faulting in the pages, fall
1052                 * back to one page at a time
1053                 */
1054                if (copied < write_bytes)
1055                        nrptrs = 1;
1056
                    /* number of pages that actually received data */
1057                if (copied == 0)
1058                        dirty_pages = 0;
1059                else
1060                        dirty_pages = (copied + offset +
1061                                       PAGE_CACHE_SIZE - 1) >>
1062                                       PAGE_CACHE_SHIFT;
1063
                    /* short copy: hand back the reservation of untouched pages */
1064                if (num_pages > dirty_pages) {
1065                        if (copied > 0)
1066                                atomic_inc(
1067                                        &BTRFS_I(inode)->outstanding_extents);
1068                        btrfs_delalloc_release_space(inode,
1069                                        (num_pages - dirty_pages) <<
1070                                        PAGE_CACHE_SHIFT);
1071                }
1072
1073                if (copied > 0) {
1074                        dirty_and_release_pages(NULL, root, file, pages,
1075                                                dirty_pages, pos, copied);
1076                }
1077
1078                btrfs_drop_pages(pages, num_pages);
1079
1080                if (copied > 0) {
1081                        if (will_write) {
1082                                filemap_fdatawrite_range(inode->i_mapping, pos,
1083                                                         pos + copied - 1);
1084                        } else {
1085                                balance_dirty_pages_ratelimited_nr(
1086                                                        inode->i_mapping,
1087                                                        dirty_pages);
1088                                if (dirty_pages <
1089                                (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1090                                        btrfs_btree_balance_dirty(root, 1);
1091                                btrfs_throttle(root);
1092                        }
1093                }
1094
1095                pos += copied;
1096                num_written += copied;
1097
1098                cond_resched();
1099        }
1100out:
1101        mutex_unlock(&inode->i_mutex);
            /* errors recorded in ret take precedence over err */
1102        if (ret)
1103                err = ret;
1104
1105        kfree(pages);
1106        *ppos = pos;
1107
1108        /*
1109         * we want to make sure fsync finds this change
1110         * but we haven't joined a transaction running right now.
1111         *
1112         * Later on, someone is sure to update the inode and get the
1113         * real transid recorded.
1114         *
1115         * We set last_trans now to the fs_info generation + 1,
1116         * this will either be one more than the running transaction
1117         * or the generation used for the next transaction if there isn't
1118         * one running right now.
1119         */
1120        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1121
1122        if (num_written > 0 && will_write) {
1123                struct btrfs_trans_handle *trans;
1124
1125                err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1126                if (err)
1127                        num_written = err;
1128
                    /*
                     * O_DSYNC / sync inode: log the dentry and sync the log,
                     * committing the transaction when the log can't be used
                     */
1129                if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1130                        trans = btrfs_start_transaction(root, 0);
1131                        if (IS_ERR(trans)) {
1132                                num_written = PTR_ERR(trans);
1133                                goto done;
1134                        }
1135                        mutex_lock(&inode->i_mutex);
1136                        ret = btrfs_log_dentry_safe(trans, root,
1137                                                    file->f_dentry);
1138                        mutex_unlock(&inode->i_mutex);
1139                        if (ret == 0) {
1140                                ret = btrfs_sync_log(trans, root);
1141                                if (ret == 0)
1142                                        btrfs_end_transaction(trans, root);
1143                                else
1144                                        btrfs_commit_transaction(trans, root);
1145                        } else if (ret != BTRFS_NO_LOG_SYNC) {
1146                                btrfs_commit_transaction(trans, root);
1147                        } else {
1148                                btrfs_end_transaction(trans, root);
1149                        }
1150                }
                    /* direct write that fell back to buffered: drop cached pages */
1151                if (file->f_flags & O_DIRECT && buffered) {
1152                        invalidate_mapping_pages(inode->i_mapping,
1153                              start_pos >> PAGE_CACHE_SHIFT,
1154                             (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1155                }
1156        }
1157done:
1158        current->backing_dev_info = NULL;
1159        return num_written ? num_written : err;
1160}
1161
1162int btrfs_release_file(struct inode *inode, struct file *filp)
1163{
1164        /*
1165         * ordered_data_close is set by settattr when we are about to truncate
1166         * a file from a non-zero size to a zero size.  This tries to
1167         * flush down new bytes that may have been written if the
1168         * application were using truncate to replace a file in place.
1169         */
1170        if (BTRFS_I(inode)->ordered_data_close) {
1171                BTRFS_I(inode)->ordered_data_close = 0;
1172                btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
1173                if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1174                        filemap_flush(inode->i_mapping);
1175        }
1176        if (filp->private_data)
1177                btrfs_ioctl_trans_end(filp);
1178        return 0;
1179}
1180
1181/*
1182 * fsync call for both files and directories.  This logs the inode into
1183 * the tree log instead of forcing full commits whenever possible.
1184 *
1185 * It needs to call filemap_fdatawait so that all ordered extent updates are
1186 * in the metadata btree are up to date for copying to the log.
1187 *
1188 * It drops the inode mutex before doing the tree log commit.  This is an
1189 * important optimization for directories because holding the mutex prevents
1190 * new operations on the dir while we write to disk.
1191 */
1192int btrfs_sync_file(struct file *file, int datasync)
1193{
1194        struct dentry *dentry = file->f_path.dentry;
1195        struct inode *inode = dentry->d_inode;
1196        struct btrfs_root *root = BTRFS_I(inode)->root;
1197        int ret = 0;
1198        struct btrfs_trans_handle *trans;
1199
1200
1201        /* we wait first, since the writeback may change the inode */
1202        root->log_batch++;
1203        /* the VFS called filemap_fdatawrite for us */
1204        btrfs_wait_ordered_range(inode, 0, (u64)-1);
1205        root->log_batch++;
1206
1207        /*
1208         * check the transaction that last modified this inode
1209         * and see if its already been committed
1210         */
1211        if (!BTRFS_I(inode)->last_trans)
1212                goto out;
1213
1214        /*
1215         * if the last transaction that changed this file was before
1216         * the current transaction, we can bail out now without any
1217         * syncing
1218         */
1219        mutex_lock(&root->fs_info->trans_mutex);
1220        if (BTRFS_I(inode)->last_trans <=
1221            root->fs_info->last_trans_committed) {
1222                BTRFS_I(inode)->last_trans = 0;
1223                mutex_unlock(&root->fs_info->trans_mutex);
1224                goto out;
1225        }
1226        mutex_unlock(&root->fs_info->trans_mutex);
1227
1228        /*
1229         * ok we haven't committed the transaction yet, lets do a commit
1230         */
1231        if (file->private_data)
1232                btrfs_ioctl_trans_end(file);
1233
1234        trans = btrfs_start_transaction(root, 0);
1235        if (IS_ERR(trans)) {
1236                ret = PTR_ERR(trans);
1237                goto out;
1238        }
1239
1240        ret = btrfs_log_dentry_safe(trans, root, dentry);
1241        if (ret < 0)
1242                goto out;
1243
1244        /* we've logged all the items and now have a consistent
1245         * version of the file in the log.  It is possible that
1246         * someone will come in and modify the file, but that's
1247         * fine because the log is consistent on disk, and we
1248         * have references to all of the file's extents
1249         *
1250         * It is possible that someone will come in and log the
1251         * file again, but that will end up using the synchronization
1252         * inside btrfs_sync_log to keep things safe.
1253         */
1254        mutex_unlock(&dentry->d_inode->i_mutex);
1255
1256        if (ret != BTRFS_NO_LOG_SYNC) {
1257                if (ret > 0) {
1258                        ret = btrfs_commit_transaction(trans, root);
1259                } else {
1260                        ret = btrfs_sync_log(trans, root);
1261                        if (ret == 0)
1262                                ret = btrfs_end_transaction(trans, root);
1263                        else
1264                                ret = btrfs_commit_transaction(trans, root);
1265                }
1266        } else {
1267                ret = btrfs_end_transaction(trans, root);
1268        }
1269        mutex_lock(&dentry->d_inode->i_mutex);
1270out:
1271        return ret > 0 ? -EIO : ret;
1272}
1273
1274static const struct vm_operations_struct btrfs_file_vm_ops = {
1275        .fault          = filemap_fault,
1276        .page_mkwrite   = btrfs_page_mkwrite,
1277};
1278
1279static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
1280{
1281        struct address_space *mapping = filp->f_mapping;
1282
1283        if (!mapping->a_ops->readpage)
1284                return -ENOEXEC;
1285
1286        file_accessed(filp);
1287        vma->vm_ops = &btrfs_file_vm_ops;
1288        vma->vm_flags |= VM_CAN_NONLINEAR;
1289
1290        return 0;
1291}
1292
1293static long btrfs_fallocate(struct file *file, int mode,
1294                            loff_t offset, loff_t len)
1295{
1296        struct inode *inode = file->f_path.dentry->d_inode;
1297        struct extent_state *cached_state = NULL;
1298        u64 cur_offset;
1299        u64 last_byte;
1300        u64 alloc_start;
1301        u64 alloc_end;
1302        u64 alloc_hint = 0;
1303        u64 locked_end;
1304        u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1305        struct extent_map *em;
1306        int ret;
1307
1308        alloc_start = offset & ~mask;
1309        alloc_end =  (offset + len + mask) & ~mask;
1310
1311        /* We only support the FALLOC_FL_KEEP_SIZE mode */
1312        if (mode & ~FALLOC_FL_KEEP_SIZE)
1313                return -EOPNOTSUPP;
1314
1315        /*
1316         * wait for ordered IO before we have any locks.  We'll loop again
1317         * below with the locks held.
1318         */
1319        btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
1320
1321        mutex_lock(&inode->i_mutex);
1322        ret = inode_newsize_ok(inode, alloc_end);
1323        if (ret)
1324                goto out;
1325
1326        if (alloc_start > inode->i_size) {
1327                ret = btrfs_cont_expand(inode, alloc_start);
1328                if (ret)
1329                        goto out;
1330        }
1331
1332        ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1333        if (ret)
1334                goto out;
1335
1336        locked_end = alloc_end - 1;
1337        while (1) {
1338                struct btrfs_ordered_extent *ordered;
1339
1340                /* the extent lock is ordered inside the running
1341                 * transaction
1342                 */
1343                lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
1344                                 locked_end, 0, &cached_state, GFP_NOFS);
1345                ordered = btrfs_lookup_first_ordered_extent(inode,
1346                                                            alloc_end - 1);
1347                if (ordered &&
1348                    ordered->file_offset + ordered->len > alloc_start &&
1349                    ordered->file_offset < alloc_end) {
1350                        btrfs_put_ordered_extent(ordered);
1351                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1352                                             alloc_start, locked_end,
1353                                             &cached_state, GFP_NOFS);
1354                        /*
1355                         * we can't wait on the range with the transaction
1356                         * running or with the extent lock held
1357                         */
1358                        btrfs_wait_ordered_range(inode, alloc_start,
1359                                                 alloc_end - alloc_start);
1360                } else {
1361                        if (ordered)
1362                                btrfs_put_ordered_extent(ordered);
1363                        break;
1364                }
1365        }
1366
1367        cur_offset = alloc_start;
1368        while (1) {
1369                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
1370                                      alloc_end - cur_offset, 0);
1371                BUG_ON(IS_ERR(em) || !em);
1372                last_byte = min(extent_map_end(em), alloc_end);
1373                last_byte = (last_byte + mask) & ~mask;
1374                if (em->block_start == EXTENT_MAP_HOLE ||
1375                    (cur_offset >= inode->i_size &&
1376                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1377                        ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1378                                                        last_byte - cur_offset,
1379                                                        1 << inode->i_blkbits,
1380                                                        offset + len,
1381                                                        &alloc_hint);
1382                        if (ret < 0) {
1383                                free_extent_map(em);
1384                                break;
1385                        }
1386                }
1387                free_extent_map(em);
1388
1389                cur_offset = last_byte;
1390                if (cur_offset >= alloc_end) {
1391                        ret = 0;
1392                        break;
1393                }
1394        }
1395        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1396                             &cached_state, GFP_NOFS);
1397
1398        btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1399out:
1400        mutex_unlock(&inode->i_mutex);
1401        return ret;
1402}
1403
1404const struct file_operations btrfs_file_operations = {
1405        .llseek         = generic_file_llseek,
1406        .read           = do_sync_read,
1407        .write          = do_sync_write,
1408        .aio_read       = generic_file_aio_read,
1409        .splice_read    = generic_file_splice_read,
1410        .aio_write      = btrfs_file_aio_write,
1411        .mmap           = btrfs_file_mmap,
1412        .open           = generic_file_open,
1413        .release        = btrfs_release_file,
1414        .fsync          = btrfs_sync_file,
1415        .fallocate      = btrfs_fallocate,
1416        .unlocked_ioctl = btrfs_ioctl,
1417#ifdef CONFIG_COMPAT
1418        .compat_ioctl   = btrfs_ioctl,
1419#endif
1420};
1421