linux/fs/ext4/file.c
/*
 *  linux/fs/ext4/file.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/file.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 fs regular file handling primitives
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *      (jj@sunsite.ms.mff.cuni.cz)
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

/*
 * Called when an inode is released. Note that this is different
 * from ext4_file_open: open gets called at every open, but release
 * gets called only when /all/ the files are closed.
 */
static int ext4_release_file(struct inode *inode, struct file *filp)
{
        if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
                ext4_alloc_da_blocks(inode);
                ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
        }
        /* if we are the last writer on the inode, drop the block reservation */
        if ((filp->f_mode & FMODE_WRITE) &&
                        (atomic_read(&inode->i_writecount) == 1) &&
                        !EXT4_I(inode)->i_reserved_data_blocks)
        {
                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_discard_preallocations(inode);
                up_write(&EXT4_I(inode)->i_data_sem);
        }
        if (is_dx(inode) && filp->private_data)
                ext4_htree_free_dir_info(filp->private_data);

        return 0;
}

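/*
 * Wait until all pending conversions of unwritten extents on this inode
 * have completed, i.e. until EXT4_I(inode)->i_unwritten drops to zero.
 */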
static void ext4_unwritten_wait(struct inode *inode)
{
        wait_queue_head_t *wq = ext4_ioend_wq(inode);

        wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
}

/*
 * This tests whether the IO in question is block-aligned or not.
 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
 * are converted to written only after the IO is complete.  Until they are
 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
 * it needs to zero out portions of the start and/or end block.  If 2 AIO
 * threads are at work on the same unwritten block, they must be synchronized
 * or one thread will zero the other's data, causing corruption.
 */
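/*
 * For example, with a 4096-byte block size an AIO DIO write of 1024 bytes at
 * offset 2048 within i_size is unaligned: it starts and ends inside a block,
 * so dio_zero_block() may zero the untouched parts of that block while
 * another AIO to the same block is still in flight.
 */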
static int
ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
{
        struct super_block *sb = inode->i_sb;
        int blockmask = sb->s_blocksize - 1;

        if (pos >= i_size_read(inode))
                return 0;

        if ((pos | iov_iter_alignment(from)) & blockmask)
                return 1;

        return 0;
}

static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(iocb->ki_filp);
        struct mutex *aio_mutex = NULL;
        struct blk_plug plug;
        int o_direct = iocb->ki_flags & IOCB_DIRECT;
        int overwrite = 0;
        ssize_t ret;

        /*
         * Unaligned direct AIO must be serialized; see the comment above.
         * In the case of O_APPEND, assume that we must always serialize.
         */
        if (o_direct &&
            ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
            !is_sync_kiocb(iocb) &&
            (iocb->ki_flags & IOCB_APPEND ||
             ext4_unaligned_aio(inode, from, iocb->ki_pos))) {
                aio_mutex = ext4_aio_mutex(inode);
                mutex_lock(aio_mutex);
                ext4_unwritten_wait(inode);
        }

        mutex_lock(&inode->i_mutex);
        ret = generic_write_checks(iocb, from);
        if (ret <= 0)
                goto out;

        /*
         * If we have encountered a bitmap-format (indirect-mapped) file,
         * the size limit is smaller than s_maxbytes, which only applies
         * to extent-mapped files.
         */
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

                if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) {
                        ret = -EFBIG;
                        goto out;
                }
                iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
        }

        iocb->private = &overwrite;
        if (o_direct) {
                size_t length = iov_iter_count(from);
                loff_t pos = iocb->ki_pos;
                blk_start_plug(&plug);

                /* check whether we do a DIO overwrite or not */
                if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
                    !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
                        struct ext4_map_blocks map;
                        unsigned int blkbits = inode->i_blkbits;
                        int err, len;

                        map.m_lblk = pos >> blkbits;
                        map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
                                - map.m_lblk;
                        len = map.m_len;

                        err = ext4_map_blocks(NULL, inode, &map, 0);
                        /*
                         * 'err == len' means that all of the blocks have
                         * been preallocated, whether or not they are
                         * initialized.  To exclude unwritten extents we
                         * also need to check m_flags.  There are two
                         * cases that indicate an initialized extent:
                         * 1) if we hit the extent cache, the
                         * EXT4_MAP_MAPPED flag is returned; 2) if we do
                         * a real lookup, no flags are returned.  So we
                         * check both conditions.
                         */
                        if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
                                overwrite = 1;
                }
        }

        ret = __generic_file_write_iter(iocb, from);
        mutex_unlock(&inode->i_mutex);

        if (ret > 0) {
                ssize_t err;

                err = generic_write_sync(file, iocb->ki_pos - ret, ret);
                if (err < 0)
                        ret = err;
        }
        if (o_direct)
                blk_finish_plug(&plug);

        if (aio_mutex)
                mutex_unlock(aio_mutex);
        return ret;

out:
        mutex_unlock(&inode->i_mutex);
        if (aio_mutex)
                mutex_unlock(aio_mutex);
        return ret;
}

#ifdef CONFIG_FS_DAX
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        return dax_fault(vma, vmf, ext4_get_block);
                                        /* Is this the right get_block? */
}

static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        return dax_mkwrite(vma, vmf, ext4_get_block);
}

static const struct vm_operations_struct ext4_dax_vm_ops = {
        .fault          = ext4_dax_fault,
        .page_mkwrite   = ext4_dax_mkwrite,
        .pfn_mkwrite    = dax_pfn_mkwrite,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
#endif

static const struct vm_operations_struct ext4_file_vm_ops = {
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = ext4_page_mkwrite,
};

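/*
 * On DAX-capable inodes, faults map persistent memory directly and bypass
 * the page cache, so ext4_file_mmap() below installs the DAX vm_ops (and
 * VM_MIXEDMAP); all other inodes get the regular filemap-based handlers.
 */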
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct inode *inode = file->f_mapping->host;

        if (ext4_encrypted_inode(inode)) {
                int err = ext4_generate_encryption_key(inode);
                if (err)
                        return 0;
        }
        file_accessed(file);
        if (IS_DAX(file_inode(file))) {
                vma->vm_ops = &ext4_dax_vm_ops;
                vma->vm_flags |= VM_MIXEDMAP;
        } else {
                vma->vm_ops = &ext4_file_vm_ops;
        }
        return 0;
}

static int ext4_file_open(struct inode * inode, struct file * filp)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct vfsmount *mnt = filp->f_path.mnt;
        struct path path;
        char buf[64], *cp;
        int ret;

        if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
                     !(sb->s_flags & MS_RDONLY))) {
                sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
                /*
                 * Sample where the filesystem has been mounted and
                 * store it in the superblock for sysadmin convenience
                 * when trying to sort through large numbers of block
                 * devices or filesystem images.
                 */
                memset(buf, 0, sizeof(buf));
                path.mnt = mnt;
                path.dentry = mnt->mnt_root;
                cp = d_path(&path, buf, sizeof(buf));
                if (!IS_ERR(cp)) {
                        handle_t *handle;
                        int err;

                        handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
                        if (IS_ERR(handle))
                                return PTR_ERR(handle);
                        BUFFER_TRACE(sbi->s_sbh, "get_write_access");
                        err = ext4_journal_get_write_access(handle, sbi->s_sbh);
                        if (err) {
                                ext4_journal_stop(handle);
                                return err;
                        }
                        strlcpy(sbi->s_es->s_last_mounted, cp,
                                sizeof(sbi->s_es->s_last_mounted));
                        ext4_handle_dirty_super(handle, sb);
                        ext4_journal_stop(handle);
                }
        }
        /*
         * Set up the jbd2_inode if we are opening the inode for
         * writing and the journal is present
         */
        if (filp->f_mode & FMODE_WRITE) {
                ret = ext4_inode_attach_jinode(inode);
                if (ret < 0)
                        return ret;
        }
        ret = dquot_file_open(inode, filp);
        if (!ret && ext4_encrypted_inode(inode)) {
                ret = ext4_generate_encryption_key(inode);
                if (ret)
                        ret = -EACCES;
        }
        return ret;
}

/*
 * Here we use ext4_map_blocks() to get a block mapping for an extent-based
 * file rather than ext4_ext_walk_space(), because this way SEEK_DATA/SEEK_HOLE
 * can be handled for block-mapped and extent-mapped files in the same
 * function.  Once the extent status tree tracks all extent state for a file,
 * we can use it directly to retrieve the offset for SEEK_DATA/SEEK_HOLE.
 */

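/*
 * From user space these offsets are reached via lseek(2), for example:
 *
 *      data = lseek(fd, offset, SEEK_DATA);    (next offset containing data)
 *      hole = lseek(fd, data, SEEK_HOLE);      (next hole at or after data)
 *
 * both of which are dispatched from ext4_llseek() below.
 */
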
/*
 * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we also need to look
 * at the page cache to check whether there is any data in the range
 * [startoff, endoff]: if this range contains an unwritten extent, we treat
 * the extent as data or as a hole depending on whether the page cache holds
 * data for it.
 */
static int ext4_find_unwritten_pgoff(struct inode *inode,
                                     int whence,
                                     struct ext4_map_blocks *map,
                                     loff_t *offset)
{
        struct pagevec pvec;
        unsigned int blkbits;
        pgoff_t index;
        pgoff_t end;
        loff_t endoff;
        loff_t startoff;
        loff_t lastoff;
        int found = 0;

        blkbits = inode->i_sb->s_blocksize_bits;
        startoff = *offset;
        lastoff = startoff;
        endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;

        index = startoff >> PAGE_CACHE_SHIFT;
        end = endoff >> PAGE_CACHE_SHIFT;

        pagevec_init(&pvec, 0);
        do {
                int i, num;
                unsigned long nr_pages;

                num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
                nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
                                          (pgoff_t)num);
                if (nr_pages == 0) {
                        if (whence == SEEK_DATA)
                                break;

                        BUG_ON(whence != SEEK_HOLE);
                        /*
                         * If this is the first pass through the loop, or if
                         * the offset is not beyond the end offset, there is
                         * a hole at this offset.
                         */
                        if (lastoff == startoff || lastoff < endoff)
                                found = 1;
                        break;
                }

                /*
                 * If this is the first pass through the loop and the offset
                 * is smaller than the first page offset, there is a hole at
                 * this offset.
                 */
                if (lastoff == startoff && whence == SEEK_HOLE &&
                    lastoff < page_offset(pvec.pages[0])) {
                        found = 1;
                        break;
                }

                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
                        struct buffer_head *bh, *head;

                        /*
                         * If the current offset is not beyond the end of the
                         * given range, it will be a hole.
                         */
                        if (lastoff < endoff && whence == SEEK_HOLE &&
                            page->index > end) {
                                found = 1;
                                *offset = lastoff;
                                goto out;
                        }

                        lock_page(page);

                        if (unlikely(page->mapping != inode->i_mapping)) {
                                unlock_page(page);
                                continue;
                        }

                        if (!page_has_buffers(page)) {
                                unlock_page(page);
                                continue;
                        }

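                        /*
                         * Walk the buffers attached to this page: a buffer
                         * that is uptodate or unwritten means the page cache
                         * holds data at this offset (a SEEK_DATA match),
                         * while anything else is treated as a hole (a
                         * SEEK_HOLE match).
                         */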
                        if (page_has_buffers(page)) {
                                lastoff = page_offset(page);
                                bh = head = page_buffers(page);
                                do {
                                        if (buffer_uptodate(bh) ||
                                            buffer_unwritten(bh)) {
                                                if (whence == SEEK_DATA)
                                                        found = 1;
                                        } else {
                                                if (whence == SEEK_HOLE)
                                                        found = 1;
                                        }
                                        if (found) {
                                                *offset = max_t(loff_t,
                                                        startoff, lastoff);
                                                unlock_page(page);
                                                goto out;
                                        }
                                        lastoff += bh->b_size;
                                        bh = bh->b_this_page;
                                } while (bh != head);
                        }

                        lastoff = page_offset(page) + PAGE_SIZE;
                        unlock_page(page);
                }

                /*
                 * If fewer pages were returned than we asked for, there
                 * must be a hole in the remainder of the range.
                 */
                if (nr_pages < num && whence == SEEK_HOLE) {
                        found = 1;
                        *offset = lastoff;
                        break;
                }

                index = pvec.pages[i - 1]->index + 1;
                pagevec_release(&pvec);
        } while (index <= end);

out:
        pagevec_release(&pvec);
        return found;
}

/*
 * ext4_seek_data() retrieves the offset for SEEK_DATA.
 */
static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
{
        struct inode *inode = file->f_mapping->host;
        struct ext4_map_blocks map;
        struct extent_status es;
        ext4_lblk_t start, last, end;
        loff_t dataoff, isize;
        int blkbits;
        int ret = 0;

        mutex_lock(&inode->i_mutex);

        isize = i_size_read(inode);
        if (offset >= isize) {
                mutex_unlock(&inode->i_mutex);
                return -ENXIO;
        }

        blkbits = inode->i_sb->s_blocksize_bits;
        start = offset >> blkbits;
        last = start;
        end = isize >> blkbits;
        dataoff = offset;

        do {
                map.m_lblk = last;
                map.m_len = end - last + 1;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
                        if (last != start)
                                dataoff = (loff_t)last << blkbits;
                        break;
                }

                /*
                 * If there is a delayed extent at this offset, treat it
                 * as data.
                 */
                ext4_es_find_delayed_extent_range(inode, last, last, &es);
                if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
                        if (last != start)
                                dataoff = (loff_t)last << blkbits;
                        break;
                }

                /*
                 * If there is an unwritten extent at this offset, it counts
                 * as data or as a hole depending on whether the page cache
                 * has data for it.
                 */
                if (map.m_flags & EXT4_MAP_UNWRITTEN) {
                        int unwritten;
                        unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
                                                              &map, &dataoff);
                        if (unwritten)
                                break;
                }

                last++;
                dataoff = (loff_t)last << blkbits;
        } while (last <= end);

        mutex_unlock(&inode->i_mutex);

        if (dataoff > isize)
                return -ENXIO;

        return vfs_setpos(file, dataoff, maxsize);
}

/*
 * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
 */
static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
{
        struct inode *inode = file->f_mapping->host;
        struct ext4_map_blocks map;
        struct extent_status es;
        ext4_lblk_t start, last, end;
        loff_t holeoff, isize;
        int blkbits;
        int ret = 0;

        mutex_lock(&inode->i_mutex);

        isize = i_size_read(inode);
        if (offset >= isize) {
                mutex_unlock(&inode->i_mutex);
                return -ENXIO;
        }

        blkbits = inode->i_sb->s_blocksize_bits;
        start = offset >> blkbits;
        last = start;
        end = isize >> blkbits;
        holeoff = offset;

        do {
                map.m_lblk = last;
                map.m_len = end - last + 1;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
                        last += ret;
                        holeoff = (loff_t)last << blkbits;
                        continue;
                }

                /*
                 * If there is a delayed extent at this offset, skip over
                 * this extent.
                 */
                ext4_es_find_delayed_extent_range(inode, last, last, &es);
                if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
                        last = es.es_lblk + es.es_len;
                        holeoff = (loff_t)last << blkbits;
                        continue;
                }

                /*
                 * If there is an unwritten extent at this offset, it counts
                 * as data or as a hole depending on whether the page cache
                 * has data for it.
                 */
                if (map.m_flags & EXT4_MAP_UNWRITTEN) {
                        int unwritten;
                        unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
                                                              &map, &holeoff);
                        if (!unwritten) {
                                last += ret;
                                holeoff = (loff_t)last << blkbits;
                                continue;
                        }
                }

                /* found a hole */
                break;
        } while (last <= end);

        mutex_unlock(&inode->i_mutex);

        if (holeoff > isize)
                holeoff = isize;

        return vfs_setpos(file, holeoff, maxsize);
}

/*
 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
 * by calling generic_file_llseek_size() with the appropriate maxbytes
 * value for each.
 */
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file->f_mapping->host;
        loff_t maxbytes;

        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
        else
                maxbytes = inode->i_sb->s_maxbytes;

        switch (whence) {
        case SEEK_SET:
        case SEEK_CUR:
        case SEEK_END:
                return generic_file_llseek_size(file, offset, whence,
                                                maxbytes, i_size_read(inode));
        case SEEK_DATA:
                return ext4_seek_data(file, offset, maxbytes);
        case SEEK_HOLE:
                return ext4_seek_hole(file, offset, maxbytes);
        }

        return -EINVAL;
}

const struct file_operations ext4_file_operations = {
        .llseek         = ext4_llseek,
        .read_iter      = generic_file_read_iter,
        .write_iter     = ext4_file_write_iter,
        .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
#endif
        .mmap           = ext4_file_mmap,
        .open           = ext4_file_open,
        .release        = ext4_release_file,
        .fsync          = ext4_sync_file,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .fallocate      = ext4_fallocate,
};

const struct inode_operations ext4_file_inode_operations = {
        .setattr        = ext4_setattr,
        .getattr        = ext4_getattr,
        .setxattr       = generic_setxattr,
        .getxattr       = generic_getxattr,
        .listxattr      = ext4_listxattr,
        .removexattr    = generic_removexattr,
        .get_acl        = ext4_get_acl,
        .set_acl        = ext4_set_acl,
        .fiemap         = ext4_fiemap,
};