linux/fs/btrfs/file.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2007 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   16 * Boston, MA 02111-1307, USA.
  17 */
  18
  19#include <linux/fs.h>
  20#include <linux/pagemap.h>
  21#include <linux/highmem.h>
  22#include <linux/time.h>
  23#include <linux/init.h>
  24#include <linux/string.h>
  25#include <linux/backing-dev.h>
  26#include <linux/mpage.h>
  27#include <linux/swap.h>
  28#include <linux/writeback.h>
  29#include <linux/statfs.h>
  30#include <linux/compat.h>
  31#include "ctree.h"
  32#include "disk-io.h"
  33#include "transaction.h"
  34#include "btrfs_inode.h"
  35#include "ioctl.h"
  36#include "print-tree.h"
  37#include "tree-log.h"
  38#include "locking.h"
  39#include "compat.h"
  40
  41
  42/* simple helper to fault in pages and copy.  This should go away
  43 * and be replaced with calls into generic code.
  44 */
  45static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
  46                                         int write_bytes,
  47                                         struct page **prepared_pages,
  48                                         const char __user *buf)
  49{
  50        long page_fault = 0;
  51        int i;
  52        int offset = pos & (PAGE_CACHE_SIZE - 1);
  53
  54        for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
  55                size_t count = min_t(size_t,
  56                                     PAGE_CACHE_SIZE - offset, write_bytes);
  57                struct page *page = prepared_pages[i];
  58                fault_in_pages_readable(buf, count);
  59
  60                /* Copy data from userspace to the current page */
  61                kmap(page);
  62                page_fault = __copy_from_user(page_address(page) + offset,
  63                                              buf, count);
  64                /* Flush processor's dcache for this page */
  65                flush_dcache_page(page);
  66                kunmap(page);
  67                buf += count;
  68                write_bytes -= count;
  69
  70                if (page_fault)
  71                        break;
  72        }
  73        return page_fault ? -EFAULT : 0;
  74}
  75
  76/*
  77 * unlocks pages after btrfs_file_write is done with them
  78 */
  79static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
  80{
  81        size_t i;
  82        for (i = 0; i < num_pages; i++) {
  83                if (!pages[i])
  84                        break;
  85                /* page checked is some magic around finding pages that
  86                 * have been modified without going through btrfs_set_page_dirty
  87                 * clear it here
  88                 */
  89                ClearPageChecked(pages[i]);
  90                unlock_page(pages[i]);
  91                mark_page_accessed(pages[i]);
  92                page_cache_release(pages[i]);
  93        }
  94}
  95
  96/*
  97 * after copy_from_user, pages need to be dirtied and we need to make
  98 * sure holes are created between the current EOF and the start of
  99 * any next extents (if required).
 100 *
 101 * this also makes the decision about creating an inline extent vs
 102 * doing real data extents, marking pages dirty and delalloc as required.
 103 */
 104static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 105                                   struct btrfs_root *root,
 106                                   struct file *file,
 107                                   struct page **pages,
 108                                   size_t num_pages,
 109                                   loff_t pos,
 110                                   size_t write_bytes)
 111{
 112        int err = 0;
 113        int i;
 114        struct inode *inode = fdentry(file)->d_inode;
 115        u64 num_bytes;
 116        u64 start_pos;
 117        u64 end_of_last_block;
 118        u64 end_pos = pos + write_bytes;
 119        loff_t isize = i_size_read(inode);
 120
 121        start_pos = pos & ~((u64)root->sectorsize - 1);
 122        num_bytes = (write_bytes + pos - start_pos +
 123                    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 124
 125        end_of_last_block = start_pos + num_bytes - 1;
 126        err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
 127        if (err)
 128                return err;
 129
 130        for (i = 0; i < num_pages; i++) {
 131                struct page *p = pages[i];
 132                SetPageUptodate(p);
 133                ClearPageChecked(p);
 134                set_page_dirty(p);
 135        }
 136        if (end_pos > isize) {
 137                i_size_write(inode, end_pos);
 138                /* we've only changed i_size in ram, and we haven't updated
 139                 * the disk i_size.  There is no need to log the inode
 140                 * at this time.
 141                 */
 142        }
 143        return err;
 144}
 145
 146/*
 147 * this drops all the extents in the cache that intersect the range
 148 * [start, end].  Existing extents are split as required.
 149 */
 150int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 151                            int skip_pinned)
 152{
 153        struct extent_map *em;
 154        struct extent_map *split = NULL;
 155        struct extent_map *split2 = NULL;
 156        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 157        u64 len = end - start + 1;
 158        int ret;
 159        int testend = 1;
 160        unsigned long flags;
 161        int compressed = 0;
 162
 163        WARN_ON(end < start);
 164        if (end == (u64)-1) {
 165                len = (u64)-1;
 166                testend = 0;
 167        }
 168        while (1) {
 169                if (!split)
 170                        split = alloc_extent_map(GFP_NOFS);
 171                if (!split2)
 172                        split2 = alloc_extent_map(GFP_NOFS);
 173
 174                write_lock(&em_tree->lock);
 175                em = lookup_extent_mapping(em_tree, start, len);
 176                if (!em) {
 177                        write_unlock(&em_tree->lock);
 178                        break;
 179                }
 180                flags = em->flags;
 181                if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
 182                        if (em->start <= start &&
 183                            (!testend || em->start + em->len >= start + len)) {
 184                                free_extent_map(em);
 185                                write_unlock(&em_tree->lock);
 186                                break;
 187                        }
 188                        if (start < em->start) {
 189                                len = em->start - start;
 190                        } else {
 191                                len = start + len - (em->start + em->len);
 192                                start = em->start + em->len;
 193                        }
 194                        free_extent_map(em);
 195                        write_unlock(&em_tree->lock);
 196                        continue;
 197                }
 198                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 199                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 200                remove_extent_mapping(em_tree, em);
 201
 202                if (em->block_start < EXTENT_MAP_LAST_BYTE &&
 203                    em->start < start) {
 204                        split->start = em->start;
 205                        split->len = start - em->start;
 206                        split->orig_start = em->orig_start;
 207                        split->block_start = em->block_start;
 208
 209                        if (compressed)
 210                                split->block_len = em->block_len;
 211                        else
 212                                split->block_len = split->len;
 213
 214                        split->bdev = em->bdev;
 215                        split->flags = flags;
 216                        ret = add_extent_mapping(em_tree, split);
 217                        BUG_ON(ret);
 218                        free_extent_map(split);
 219                        split = split2;
 220                        split2 = NULL;
 221                }
 222                if (em->block_start < EXTENT_MAP_LAST_BYTE &&
 223                    testend && em->start + em->len > start + len) {
 224                        u64 diff = start + len - em->start;
 225
 226                        split->start = start + len;
 227                        split->len = em->start + em->len - (start + len);
 228                        split->bdev = em->bdev;
 229                        split->flags = flags;
 230
 231                        if (compressed) {
 232                                split->block_len = em->block_len;
 233                                split->block_start = em->block_start;
 234                                split->orig_start = em->orig_start;
 235                        } else {
 236                                split->block_len = split->len;
 237                                split->block_start = em->block_start + diff;
 238                                split->orig_start = split->start;
 239                        }
 240
 241                        ret = add_extent_mapping(em_tree, split);
 242                        BUG_ON(ret);
 243                        free_extent_map(split);
 244                        split = NULL;
 245                }
 246                write_unlock(&em_tree->lock);
 247
 248                /* once for us */
 249                free_extent_map(em);
 250                /* once for the tree*/
 251                free_extent_map(em);
 252        }
 253        if (split)
 254                free_extent_map(split);
 255        if (split2)
 256                free_extent_map(split2);
 257        return 0;
 258}
 259
 260/*
 261 * this is very complex, but the basic idea is to drop all extents
 262 * in the range start - end.  hint_block is filled in with a block number
 263 * that would be a good hint to the block allocator for this file.
 264 *
 265 * If an extent intersects the range but is not entirely inside the range
 266 * it is either truncated or split.  Anything entirely inside the range
 267 * is deleted from the tree.
 268 *
 269 * inline_limit is used to tell this code which offsets in the file to keep
 270 * if they contain inline extents.
 271 */
 272noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 273                       struct btrfs_root *root, struct inode *inode,
 274                       u64 start, u64 end, u64 locked_end,
 275                       u64 inline_limit, u64 *hint_byte, int drop_cache)
 276{
 277        u64 extent_end = 0;
 278        u64 search_start = start;
 279        u64 ram_bytes = 0;
 280        u64 disk_bytenr = 0;
 281        u64 orig_locked_end = locked_end;
 282        u8 compression;
 283        u8 encryption;
 284        u16 other_encoding = 0;
 285        struct extent_buffer *leaf;
 286        struct btrfs_file_extent_item *extent;
 287        struct btrfs_path *path;
 288        struct btrfs_key key;
 289        struct btrfs_file_extent_item old;
 290        int keep;
 291        int slot;
 292        int bookend;
 293        int found_type = 0;
 294        int found_extent;
 295        int found_inline;
 296        int recow;
 297        int ret;
 298
 299        inline_limit = 0;
 300        if (drop_cache)
 301                btrfs_drop_extent_cache(inode, start, end - 1, 0);
 302
 303        path = btrfs_alloc_path();
 304        if (!path)
 305                return -ENOMEM;
 306        while (1) {
 307                recow = 0;
 308                btrfs_release_path(root, path);
 309                ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
 310                                               search_start, -1);
 311                if (ret < 0)
 312                        goto out;
 313                if (ret > 0) {
 314                        if (path->slots[0] == 0) {
 315                                ret = 0;
 316                                goto out;
 317                        }
 318                        path->slots[0]--;
 319                }
 320next_slot:
 321                keep = 0;
 322                bookend = 0;
 323                found_extent = 0;
 324                found_inline = 0;
 325                compression = 0;
 326                encryption = 0;
 327                extent = NULL;
 328                leaf = path->nodes[0];
 329                slot = path->slots[0];
 330                ret = 0;
 331                btrfs_item_key_to_cpu(leaf, &key, slot);
 332                if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
 333                    key.offset >= end) {
 334                        goto out;
 335                }
 336                if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
 337                    key.objectid != inode->i_ino) {
 338                        goto out;
 339                }
 340                if (recow) {
 341                        search_start = max(key.offset, start);
 342                        continue;
 343                }
 344                if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
 345                        extent = btrfs_item_ptr(leaf, slot,
 346                                                struct btrfs_file_extent_item);
 347                        found_type = btrfs_file_extent_type(leaf, extent);
 348                        compression = btrfs_file_extent_compression(leaf,
 349                                                                    extent);
 350                        encryption = btrfs_file_extent_encryption(leaf,
 351                                                                  extent);
 352                        other_encoding = btrfs_file_extent_other_encoding(leaf,
 353                                                                  extent);
 354                        if (found_type == BTRFS_FILE_EXTENT_REG ||
 355                            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 356                                extent_end =
 357                                     btrfs_file_extent_disk_bytenr(leaf,
 358                                                                   extent);
 359                                if (extent_end)
 360                                        *hint_byte = extent_end;
 361
 362                                extent_end = key.offset +
 363                                     btrfs_file_extent_num_bytes(leaf, extent);
 364                                ram_bytes = btrfs_file_extent_ram_bytes(leaf,
 365                                                                extent);
 366                                found_extent = 1;
 367                        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 368                                found_inline = 1;
 369                                extent_end = key.offset +
 370                                     btrfs_file_extent_inline_len(leaf, extent);
 371                        }
 372                } else {
 373                        extent_end = search_start;
 374                }
 375
 376                /* we found nothing we can drop */
 377                if ((!found_extent && !found_inline) ||
 378                    search_start >= extent_end) {
 379                        int nextret;
 380                        u32 nritems;
 381                        nritems = btrfs_header_nritems(leaf);
 382                        if (slot >= nritems - 1) {
 383                                nextret = btrfs_next_leaf(root, path);
 384                                if (nextret)
 385                                        goto out;
 386                                recow = 1;
 387                        } else {
 388                                path->slots[0]++;
 389                        }
 390                        goto next_slot;
 391                }
 392
 393                if (end <= extent_end && start >= key.offset && found_inline)
 394                        *hint_byte = EXTENT_MAP_INLINE;
 395
 396                if (found_extent) {
 397                        read_extent_buffer(leaf, &old, (unsigned long)extent,
 398                                           sizeof(old));
 399                }
 400
 401                if (end < extent_end && end >= key.offset) {
 402                        bookend = 1;
 403                        if (found_inline && start <= key.offset)
 404                                keep = 1;
 405                }
 406
 407                if (bookend && found_extent) {
 408                        if (locked_end < extent_end) {
 409                                ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
 410                                                locked_end, extent_end - 1,
 411                                                GFP_NOFS);
 412                                if (!ret) {
 413                                        btrfs_release_path(root, path);
 414                                        lock_extent(&BTRFS_I(inode)->io_tree,
 415                                                locked_end, extent_end - 1,
 416                                                GFP_NOFS);
 417                                        locked_end = extent_end;
 418                                        continue;
 419                                }
 420                                locked_end = extent_end;
 421                        }
 422                        disk_bytenr = le64_to_cpu(old.disk_bytenr);
 423                        if (disk_bytenr != 0) {
 424                                ret = btrfs_inc_extent_ref(trans, root,
 425                                           disk_bytenr,
 426                                           le64_to_cpu(old.disk_num_bytes), 0,
 427                                           root->root_key.objectid,
 428                                           key.objectid, key.offset -
 429                                           le64_to_cpu(old.offset));
 430                                BUG_ON(ret);
 431                        }
 432                }
 433
 434                if (found_inline) {
 435                        u64 mask = root->sectorsize - 1;
 436                        search_start = (extent_end + mask) & ~mask;
 437                } else
 438                        search_start = extent_end;
 439
 440                /* truncate existing extent */
 441                if (start > key.offset) {
 442                        u64 new_num;
 443                        u64 old_num;
 444                        keep = 1;
 445                        WARN_ON(start & (root->sectorsize - 1));
 446                        if (found_extent) {
 447                                new_num = start - key.offset;
 448                                old_num = btrfs_file_extent_num_bytes(leaf,
 449                                                                      extent);
 450                                *hint_byte =
 451                                        btrfs_file_extent_disk_bytenr(leaf,
 452                                                                      extent);
 453                                if (btrfs_file_extent_disk_bytenr(leaf,
 454                                                                  extent)) {
 455                                        inode_sub_bytes(inode, old_num -
 456                                                        new_num);
 457                                }
 458                                btrfs_set_file_extent_num_bytes(leaf,
 459                                                        extent, new_num);
 460                                btrfs_mark_buffer_dirty(leaf);
 461                        } else if (key.offset < inline_limit &&
 462                                   (end > extent_end) &&
 463                                   (inline_limit < extent_end)) {
 464                                u32 new_size;
 465                                new_size = btrfs_file_extent_calc_inline_size(
 466                                                   inline_limit - key.offset);
 467                                inode_sub_bytes(inode, extent_end -
 468                                                inline_limit);
 469                                btrfs_set_file_extent_ram_bytes(leaf, extent,
 470                                                        new_size);
 471                                if (!compression && !encryption) {
 472                                        btrfs_truncate_item(trans, root, path,
 473                                                            new_size, 1);
 474                                }
 475                        }
 476                }
 477                /* delete the entire extent */
 478                if (!keep) {
 479                        if (found_inline)
 480                                inode_sub_bytes(inode, extent_end -
 481                                                key.offset);
 482                        ret = btrfs_del_item(trans, root, path);
 483                        /* TODO update progress marker and return */
 484                        BUG_ON(ret);
 485                        extent = NULL;
 486                        btrfs_release_path(root, path);
 487                        /* the extent will be freed later */
 488                }
 489                if (bookend && found_inline && start <= key.offset) {
 490                        u32 new_size;
 491                        new_size = btrfs_file_extent_calc_inline_size(
 492                                                   extent_end - end);
 493                        inode_sub_bytes(inode, end - key.offset);
 494                        btrfs_set_file_extent_ram_bytes(leaf, extent,
 495                                                        new_size);
 496                        if (!compression && !encryption)
 497                                ret = btrfs_truncate_item(trans, root, path,
 498                                                          new_size, 0);
 499                        BUG_ON(ret);
 500                }
 501                /* create bookend, splitting the extent in two */
 502                if (bookend && found_extent) {
 503                        struct btrfs_key ins;
 504                        ins.objectid = inode->i_ino;
 505                        ins.offset = end;
 506                        btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
 507
 508                        btrfs_release_path(root, path);
 509                        path->leave_spinning = 1;
 510                        ret = btrfs_insert_empty_item(trans, root, path, &ins,
 511                                                      sizeof(*extent));
 512                        BUG_ON(ret);
 513
 514                        leaf = path->nodes[0];
 515                        extent = btrfs_item_ptr(leaf, path->slots[0],
 516                                                struct btrfs_file_extent_item);
 517                        write_extent_buffer(leaf, &old,
 518                                            (unsigned long)extent, sizeof(old));
 519
 520                        btrfs_set_file_extent_compression(leaf, extent,
 521                                                          compression);
 522                        btrfs_set_file_extent_encryption(leaf, extent,
 523                                                         encryption);
 524                        btrfs_set_file_extent_other_encoding(leaf, extent,
 525                                                             other_encoding);
 526                        btrfs_set_file_extent_offset(leaf, extent,
 527                                    le64_to_cpu(old.offset) + end - key.offset);
 528                        WARN_ON(le64_to_cpu(old.num_bytes) <
 529                                (extent_end - end));
 530                        btrfs_set_file_extent_num_bytes(leaf, extent,
 531                                                        extent_end - end);
 532
 533                        /*
 534                         * set the ram bytes to the size of the full extent
 535                         * before splitting.  This is a worst case flag,
 536                         * but its the best we can do because we don't know
 537                         * how splitting affects compression
 538                         */
 539                        btrfs_set_file_extent_ram_bytes(leaf, extent,
 540                                                        ram_bytes);
 541                        btrfs_set_file_extent_type(leaf, extent, found_type);
 542
 543                        btrfs_unlock_up_safe(path, 1);
 544                        btrfs_mark_buffer_dirty(path->nodes[0]);
 545                        btrfs_set_lock_blocking(path->nodes[0]);
 546
 547                        path->leave_spinning = 0;
 548                        btrfs_release_path(root, path);
 549                        if (disk_bytenr != 0)
 550                                inode_add_bytes(inode, extent_end - end);
 551                }
 552
 553                if (found_extent && !keep) {
 554                        u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);
 555
 556                        if (old_disk_bytenr != 0) {
 557                                inode_sub_bytes(inode,
 558                                                le64_to_cpu(old.num_bytes));
 559                                ret = btrfs_free_extent(trans, root,
 560                                                old_disk_bytenr,
 561                                                le64_to_cpu(old.disk_num_bytes),
 562                                                0, root->root_key.objectid,
 563                                                key.objectid, key.offset -
 564                                                le64_to_cpu(old.offset));
 565                                BUG_ON(ret);
 566                                *hint_byte = old_disk_bytenr;
 567                        }
 568                }
 569
 570                if (search_start >= end) {
 571                        ret = 0;
 572                        goto out;
 573                }
 574        }
 575out:
 576        btrfs_free_path(path);
 577        if (locked_end > orig_locked_end) {
 578                unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end,
 579                              locked_end - 1, GFP_NOFS);
 580        }
 581        return ret;
 582}
 583
 584static int extent_mergeable(struct extent_buffer *leaf, int slot,
 585                            u64 objectid, u64 bytenr, u64 *start, u64 *end)
 586{
 587        struct btrfs_file_extent_item *fi;
 588        struct btrfs_key key;
 589        u64 extent_end;
 590
 591        if (slot < 0 || slot >= btrfs_header_nritems(leaf))
 592                return 0;
 593
 594        btrfs_item_key_to_cpu(leaf, &key, slot);
 595        if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
 596                return 0;
 597
 598        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 599        if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
 600            btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
 601            btrfs_file_extent_compression(leaf, fi) ||
 602            btrfs_file_extent_encryption(leaf, fi) ||
 603            btrfs_file_extent_other_encoding(leaf, fi))
 604                return 0;
 605
 606        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
 607        if ((*start && *start != key.offset) || (*end && *end != extent_end))
 608                return 0;
 609
 610        *start = key.offset;
 611        *end = extent_end;
 612        return 1;
 613}
 614
 615/*
 616 * Mark extent in the range start - end as written.
 617 *
 618 * This changes extent type from 'pre-allocated' to 'regular'. If only
 619 * part of extent is marked as written, the extent will be split into
 620 * two or three.
 621 */
 622int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 623                              struct btrfs_root *root,
 624                              struct inode *inode, u64 start, u64 end)
 625{
 626        struct extent_buffer *leaf;
 627        struct btrfs_path *path;
 628        struct btrfs_file_extent_item *fi;
 629        struct btrfs_key key;
 630        u64 bytenr;
 631        u64 num_bytes;
 632        u64 extent_end;
 633        u64 orig_offset;
 634        u64 other_start;
 635        u64 other_end;
 636        u64 split = start;
 637        u64 locked_end = end;
 638        int extent_type;
 639        int split_end = 1;
 640        int ret;
 641
 642        btrfs_drop_extent_cache(inode, start, end - 1, 0);
 643
 644        path = btrfs_alloc_path();
 645        BUG_ON(!path);
 646again:
 647        key.objectid = inode->i_ino;
 648        key.type = BTRFS_EXTENT_DATA_KEY;
 649        if (split == start)
 650                key.offset = split;
 651        else
 652                key.offset = split - 1;
 653
 654        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 655        if (ret > 0 && path->slots[0] > 0)
 656                path->slots[0]--;
 657
 658        leaf = path->nodes[0];
 659        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 660        BUG_ON(key.objectid != inode->i_ino ||
 661               key.type != BTRFS_EXTENT_DATA_KEY);
 662        fi = btrfs_item_ptr(leaf, path->slots[0],
 663                            struct btrfs_file_extent_item);
 664        extent_type = btrfs_file_extent_type(leaf, fi);
 665        BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
 666        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
 667        BUG_ON(key.offset > start || extent_end < end);
 668
 669        bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 670        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 671        orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
 672
 673        if (key.offset == start)
 674                split = end;
 675
 676        if (key.offset == start && extent_end == end) {
 677                int del_nr = 0;
 678                int del_slot = 0;
 679                other_start = end;
 680                other_end = 0;
 681                if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
 682                                     bytenr, &other_start, &other_end)) {
 683                        extent_end = other_end;
 684                        del_slot = path->slots[0] + 1;
 685                        del_nr++;
 686                        ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 687                                                0, root->root_key.objectid,
 688                                                inode->i_ino, orig_offset);
 689                        BUG_ON(ret);
 690                }
 691                other_start = 0;
 692                other_end = start;
 693                if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
 694                                     bytenr, &other_start, &other_end)) {
 695                        key.offset = other_start;
 696                        del_slot = path->slots[0];
 697                        del_nr++;
 698                        ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 699                                                0, root->root_key.objectid,
 700                                                inode->i_ino, orig_offset);
 701                        BUG_ON(ret);
 702                }
 703                split_end = 0;
 704                if (del_nr == 0) {
 705                        btrfs_set_file_extent_type(leaf, fi,
 706                                                   BTRFS_FILE_EXTENT_REG);
 707                        goto done;
 708                }
 709
 710                fi = btrfs_item_ptr(leaf, del_slot - 1,
 711                                    struct btrfs_file_extent_item);
 712                btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
 713                btrfs_set_file_extent_num_bytes(leaf, fi,
 714                                                extent_end - key.offset);
 715                btrfs_mark_buffer_dirty(leaf);
 716
 717                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 718                BUG_ON(ret);
 719                goto release;
 720        } else if (split == start) {
 721                if (locked_end < extent_end) {
 722                        ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
 723                                        locked_end, extent_end - 1, GFP_NOFS);
 724                        if (!ret) {
 725                                btrfs_release_path(root, path);
 726                                lock_extent(&BTRFS_I(inode)->io_tree,
 727                                        locked_end, extent_end - 1, GFP_NOFS);
 728                                locked_end = extent_end;
 729                                goto again;
 730                        }
 731                        locked_end = extent_end;
 732                }
 733                btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
 734        } else  {
 735                BUG_ON(key.offset != start);
 736                key.offset = split;
 737                btrfs_set_file_extent_offset(leaf, fi, key.offset -
 738                                             orig_offset);
 739                btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
 740                btrfs_set_item_key_safe(trans, root, path, &key);
 741                extent_end = split;
 742        }
 743
 744        if (extent_end == end) {
 745                split_end = 0;
 746                extent_type = BTRFS_FILE_EXTENT_REG;
 747        }
 748        if (extent_end == end && split == start) {
 749                other_start = end;
 750                other_end = 0;
 751                if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
 752                                     bytenr, &other_start, &other_end)) {
 753                        path->slots[0]++;
 754                        fi = btrfs_item_ptr(leaf, path->slots[0],
 755                                            struct btrfs_file_extent_item);
 756                        key.offset = split;
 757                        btrfs_set_item_key_safe(trans, root, path, &key);
 758                        btrfs_set_file_extent_offset(leaf, fi, key.offset -
 759                                                     orig_offset);
 760                        btrfs_set_file_extent_num_bytes(leaf, fi,
 761                                                        other_end - split);
 762                        goto done;
 763                }
 764        }
 765        if (extent_end == end && split == end) {
 766                other_start = 0;
 767                other_end = start;
 768                if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
 769                                     bytenr, &other_start, &other_end)) {
 770                        path->slots[0]--;
 771                        fi = btrfs_item_ptr(leaf, path->slots[0],
 772                                            struct btrfs_file_extent_item);
 773                        btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
 774                                                        other_start);
 775                        goto done;
 776                }
 777        }
 778
 779        btrfs_mark_buffer_dirty(leaf);
 780
 781        ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
 782                                   root->root_key.objectid,
 783                                   inode->i_ino, orig_offset);
 784        BUG_ON(ret);
 785        btrfs_release_path(root, path);
 786
 787        key.offset = start;
 788        ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
 789        BUG_ON(ret);
 790
 791        leaf = path->nodes[0];
 792        fi = btrfs_item_ptr(leaf, path->slots[0],
 793                            struct btrfs_file_extent_item);
 794        btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 795        btrfs_set_file_extent_type(leaf, fi, extent_type);
 796        btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
 797        btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
 798        btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset);
 799        btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
 800        btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
 801        btrfs_set_file_extent_compression(leaf, fi, 0);
 802        btrfs_set_file_extent_encryption(leaf, fi, 0);
 803        btrfs_set_file_extent_other_encoding(leaf, fi, 0);
 804done:
 805        btrfs_mark_buffer_dirty(leaf);
 806
 807release:
 808        btrfs_release_path(root, path);
 809        if (split_end && split == start) {
 810                split = end;
 811                goto again;
 812        }
 813        if (locked_end > end) {
 814                unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
 815                              GFP_NOFS);
 816        }
 817        btrfs_free_path(path);
 818        return 0;
 819}
 820
 821/*
 822 * this gets pages into the page cache and locks them down, it also properly
 823 * waits for data=ordered extents to finish before allowing the pages to be
 824 * modified.
 825 */
 826static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 827                         struct page **pages, size_t num_pages,
 828                         loff_t pos, unsigned long first_index,
 829                         unsigned long last_index, size_t write_bytes)
 830{
 831        int i;
 832        unsigned long index = pos >> PAGE_CACHE_SHIFT;
 833        struct inode *inode = fdentry(file)->d_inode;
 834        int err = 0;
 835        u64 start_pos;
 836        u64 last_pos;
 837
 838        start_pos = pos & ~((u64)root->sectorsize - 1);
 839        last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
 840
 841        if (start_pos > inode->i_size) {
 842                err = btrfs_cont_expand(inode, start_pos);
 843                if (err)
 844                        return err;
 845        }
 846
 847        memset(pages, 0, num_pages * sizeof(struct page *));
 848again:
 849        for (i = 0; i < num_pages; i++) {
 850                pages[i] = grab_cache_page(inode->i_mapping, index + i);
 851                if (!pages[i]) {
 852                        err = -ENOMEM;
 853                        BUG_ON(1);
 854                }
 855                wait_on_page_writeback(pages[i]);
 856        }
 857        if (start_pos < inode->i_size) {
 858                struct btrfs_ordered_extent *ordered;
 859                lock_extent(&BTRFS_I(inode)->io_tree,
 860                            start_pos, last_pos - 1, GFP_NOFS);
 861                ordered = btrfs_lookup_first_ordered_extent(inode,
 862                                                            last_pos - 1);
 863                if (ordered &&
 864                    ordered->file_offset + ordered->len > start_pos &&
 865                    ordered->file_offset < last_pos) {
 866                        btrfs_put_ordered_extent(ordered);
 867                        unlock_extent(&BTRFS_I(inode)->io_tree,
 868                                      start_pos, last_pos - 1, GFP_NOFS);
 869                        for (i = 0; i < num_pages; i++) {
 870                                unlock_page(pages[i]);
 871                                page_cache_release(pages[i]);
 872                        }
 873                        btrfs_wait_ordered_range(inode, start_pos,
 874                                                 last_pos - start_pos);
 875                        goto again;
 876                }
 877                if (ordered)
 878                        btrfs_put_ordered_extent(ordered);
 879
 880                clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
 881                                  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
 882                                  EXTENT_DO_ACCOUNTING,
 883                                  GFP_NOFS);
 884                unlock_extent(&BTRFS_I(inode)->io_tree,
 885                              start_pos, last_pos - 1, GFP_NOFS);
 886        }
 887        for (i = 0; i < num_pages; i++) {
 888                clear_page_dirty_for_io(pages[i]);
 889                set_page_extent_mapped(pages[i]);
 890                WARN_ON(!PageLocked(pages[i]));
 891        }
 892        return 0;
 893}
 894
 895static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 896                                size_t count, loff_t *ppos)
 897{
 898        loff_t pos;
 899        loff_t start_pos;
 900        ssize_t num_written = 0;
 901        ssize_t err = 0;
 902        int ret = 0;
 903        struct inode *inode = fdentry(file)->d_inode;
 904        struct btrfs_root *root = BTRFS_I(inode)->root;
 905        struct page **pages = NULL;
 906        int nrptrs;
 907        struct page *pinned[2];
 908        unsigned long first_index;
 909        unsigned long last_index;
 910        int will_write;
 911
 912        will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
 913                      (file->f_flags & O_DIRECT));
 914
 915        nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
 916                     PAGE_CACHE_SIZE / (sizeof(struct page *)));
 917        pinned[0] = NULL;
 918        pinned[1] = NULL;
 919
 920        pos = *ppos;
 921        start_pos = pos;
 922
 923        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 924
 925        /* do the reserve before the mutex lock in case we have to do some
 926         * flushing.  We wouldn't deadlock, but this is more polite.
 927         */
 928        err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
 929        if (err)
 930                goto out_nolock;
 931
 932        mutex_lock(&inode->i_mutex);
 933
 934        current->backing_dev_info = inode->i_mapping->backing_dev_info;
 935        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 936        if (err)
 937                goto out;
 938
 939        if (count == 0)
 940                goto out;
 941
 942        err = file_remove_suid(file);
 943        if (err)
 944                goto out;
 945
 946        file_update_time(file);
 947
 948        pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 949
 950        /* generic_write_checks can change our pos */
 951        start_pos = pos;
 952
 953        BTRFS_I(inode)->sequence++;
 954        first_index = pos >> PAGE_CACHE_SHIFT;
 955        last_index = (pos + count) >> PAGE_CACHE_SHIFT;
 956
 957        /*
 958         * there are lots of better ways to do this, but this code
 959         * makes sure the first and last page in the file range are
 960         * up to date and ready for cow
 961         */
 962        if ((pos & (PAGE_CACHE_SIZE - 1))) {
 963                pinned[0] = grab_cache_page(inode->i_mapping, first_index);
 964                if (!PageUptodate(pinned[0])) {
 965                        ret = btrfs_readpage(NULL, pinned[0]);
 966                        BUG_ON(ret);
 967                        wait_on_page_locked(pinned[0]);
 968                } else {
 969                        unlock_page(pinned[0]);
 970                }
 971        }
 972        if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
 973                pinned[1] = grab_cache_page(inode->i_mapping, last_index);
 974                if (!PageUptodate(pinned[1])) {
 975                        ret = btrfs_readpage(NULL, pinned[1]);
 976                        BUG_ON(ret);
 977                        wait_on_page_locked(pinned[1]);
 978                } else {
 979                        unlock_page(pinned[1]);
 980                }
 981        }
 982
 983        while (count > 0) {
 984                size_t offset = pos & (PAGE_CACHE_SIZE - 1);
 985                size_t write_bytes = min(count, nrptrs *
 986                                        (size_t)PAGE_CACHE_SIZE -
 987                                         offset);
 988                size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
 989                                        PAGE_CACHE_SHIFT;
 990
 991                WARN_ON(num_pages > nrptrs);
 992                memset(pages, 0, sizeof(struct page *) * nrptrs);
 993
 994                ret = btrfs_check_data_free_space(root, inode, write_bytes);
 995                if (ret)
 996                        goto out;
 997
 998                ret = prepare_pages(root, file, pages, num_pages,
 999                                    pos, first_index, last_index,
1000                                    write_bytes);
1001                if (ret) {
1002                        btrfs_free_reserved_data_space(root, inode,
1003                                                       write_bytes);
1004                        goto out;
1005                }
1006
1007                ret = btrfs_copy_from_user(pos, num_pages,
1008                                           write_bytes, pages, buf);
1009                if (ret) {
1010                        btrfs_free_reserved_data_space(root, inode,
1011                                                       write_bytes);
1012                        btrfs_drop_pages(pages, num_pages);
1013                        goto out;
1014                }
1015
1016                ret = dirty_and_release_pages(NULL, root, file, pages,
1017                                              num_pages, pos, write_bytes);
1018                btrfs_drop_pages(pages, num_pages);
1019                if (ret) {
1020                        btrfs_free_reserved_data_space(root, inode,
1021                                                       write_bytes);
1022                        goto out;
1023                }
1024
1025                if (will_write) {
1026                        filemap_fdatawrite_range(inode->i_mapping, pos,
1027                                                 pos + write_bytes - 1);
1028                } else {
1029                        balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1030                                                           num_pages);
1031                        if (num_pages <
1032                            (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1033                                btrfs_btree_balance_dirty(root, 1);
1034                        btrfs_throttle(root);
1035                }
1036
1037                buf += write_bytes;
1038                count -= write_bytes;
1039                pos += write_bytes;
1040                num_written += write_bytes;
1041
1042                cond_resched();
1043        }
1044out:
1045        mutex_unlock(&inode->i_mutex);
1046        if (ret)
1047                err = ret;
1048        btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1049
1050out_nolock:
1051        kfree(pages);
1052        if (pinned[0])
1053                page_cache_release(pinned[0]);
1054        if (pinned[1])
1055                page_cache_release(pinned[1]);
1056        *ppos = pos;
1057
1058        /*
1059         * we want to make sure fsync finds this change
1060         * but we haven't joined a transaction running right now.
1061         *
1062         * Later on, someone is sure to update the inode and get the
1063         * real transid recorded.
1064         *
1065         * We set last_trans now to the fs_info generation + 1,
1066         * this will either be one more than the running transaction
1067         * or the generation used for the next transaction if there isn't
1068         * one running right now.
1069         */
1070        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1071
1072        if (num_written > 0 && will_write) {
1073                struct btrfs_trans_handle *trans;
1074
1075                err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1076                if (err)
1077                        num_written = err;
1078
1079                if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
1080                        trans = btrfs_start_transaction(root, 1);
1081                        ret = btrfs_log_dentry_safe(trans, root,
1082                                                    file->f_dentry);
1083                        if (ret == 0) {
1084                                ret = btrfs_sync_log(trans, root);
1085                                if (ret == 0)
1086                                        btrfs_end_transaction(trans, root);
1087                                else
1088                                        btrfs_commit_transaction(trans, root);
1089                        } else if (ret != BTRFS_NO_LOG_SYNC) {
1090                                btrfs_commit_transaction(trans, root);
1091                        } else {
1092                                btrfs_end_transaction(trans, root);
1093                        }
1094                }
1095                if (file->f_flags & O_DIRECT) {
1096                        invalidate_mapping_pages(inode->i_mapping,
1097                              start_pos >> PAGE_CACHE_SHIFT,
1098                             (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1099                }
1100        }
1101        current->backing_dev_info = NULL;
1102        return num_written ? num_written : err;
1103}
1104
1105int btrfs_release_file(struct inode *inode, struct file *filp)
1106{
1107        /*
1108         * ordered_data_close is set by settattr when we are about to truncate
1109         * a file from a non-zero size to a zero size.  This tries to
1110         * flush down new bytes that may have been written if the
1111         * application were using truncate to replace a file in place.
1112         */
1113        if (BTRFS_I(inode)->ordered_data_close) {
1114                BTRFS_I(inode)->ordered_data_close = 0;
1115                btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
1116                if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1117                        filemap_flush(inode->i_mapping);
1118        }
1119        if (filp->private_data)
1120                btrfs_ioctl_trans_end(filp);
1121        return 0;
1122}
1123
1124/*
1125 * fsync call for both files and directories.  This logs the inode into
1126 * the tree log instead of forcing full commits whenever possible.
1127 *
1128 * It needs to call filemap_fdatawait so that all ordered extent updates are
1129 * in the metadata btree are up to date for copying to the log.
1130 *
1131 * It drops the inode mutex before doing the tree log commit.  This is an
1132 * important optimization for directories because holding the mutex prevents
1133 * new operations on the dir while we write to disk.
1134 */
1135int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1136{
1137        struct inode *inode = dentry->d_inode;
1138        struct btrfs_root *root = BTRFS_I(inode)->root;
1139        int ret = 0;
1140        struct btrfs_trans_handle *trans;
1141
1142
1143        /* we wait first, since the writeback may change the inode */
1144        root->log_batch++;
1145        /* the VFS called filemap_fdatawrite for us */
1146        btrfs_wait_ordered_range(inode, 0, (u64)-1);
1147        root->log_batch++;
1148
1149        /*
1150         * check the transaction that last modified this inode
1151         * and see if its already been committed
1152         */
1153        if (!BTRFS_I(inode)->last_trans)
1154                goto out;
1155
1156        /*
1157         * if the last transaction that changed this file was before
1158         * the current transaction, we can bail out now without any
1159         * syncing
1160         */
1161        mutex_lock(&root->fs_info->trans_mutex);
1162        if (BTRFS_I(inode)->last_trans <=
1163            root->fs_info->last_trans_committed) {
1164                BTRFS_I(inode)->last_trans = 0;
1165                mutex_unlock(&root->fs_info->trans_mutex);
1166                goto out;
1167        }
1168        mutex_unlock(&root->fs_info->trans_mutex);
1169
1170        /*
1171         * ok we haven't committed the transaction yet, lets do a commit
1172         */
1173        if (file && file->private_data)
1174                btrfs_ioctl_trans_end(file);
1175
1176        trans = btrfs_start_transaction(root, 1);
1177        if (!trans) {
1178                ret = -ENOMEM;
1179                goto out;
1180        }
1181
1182        ret = btrfs_log_dentry_safe(trans, root, dentry);
1183        if (ret < 0)
1184                goto out;
1185
1186        /* we've logged all the items and now have a consistent
1187         * version of the file in the log.  It is possible that
1188         * someone will come in and modify the file, but that's
1189         * fine because the log is consistent on disk, and we
1190         * have references to all of the file's extents
1191         *
1192         * It is possible that someone will come in and log the
1193         * file again, but that will end up using the synchronization
1194         * inside btrfs_sync_log to keep things safe.
1195         */
1196        mutex_unlock(&dentry->d_inode->i_mutex);
1197
1198        if (ret != BTRFS_NO_LOG_SYNC) {
1199                if (ret > 0) {
1200                        ret = btrfs_commit_transaction(trans, root);
1201                } else {
1202                        ret = btrfs_sync_log(trans, root);
1203                        if (ret == 0)
1204                                ret = btrfs_end_transaction(trans, root);
1205                        else
1206                                ret = btrfs_commit_transaction(trans, root);
1207                }
1208        } else {
1209                ret = btrfs_end_transaction(trans, root);
1210        }
1211        mutex_lock(&dentry->d_inode->i_mutex);
1212out:
1213        return ret > 0 ? EIO : ret;
1214}
1215
1216static const struct vm_operations_struct btrfs_file_vm_ops = {
1217        .fault          = filemap_fault,
1218        .page_mkwrite   = btrfs_page_mkwrite,
1219};
1220
1221static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
1222{
1223        vma->vm_ops = &btrfs_file_vm_ops;
1224        file_accessed(filp);
1225        return 0;
1226}
1227
1228const struct file_operations btrfs_file_operations = {
1229        .llseek         = generic_file_llseek,
1230        .read           = do_sync_read,
1231        .aio_read       = generic_file_aio_read,
1232        .splice_read    = generic_file_splice_read,
1233        .write          = btrfs_file_write,
1234        .mmap           = btrfs_file_mmap,
1235        .open           = generic_file_open,
1236        .release        = btrfs_release_file,
1237        .fsync          = btrfs_sync_file,
1238        .unlocked_ioctl = btrfs_ioctl,
1239#ifdef CONFIG_COMPAT
1240        .compat_ioctl   = btrfs_ioctl,
1241#endif
1242};
1243