linux/fs/ext4/file.c
/*
 *  linux/fs/ext4/file.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/file.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 fs regular file handling primitives
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *      (jj@sunsite.ms.mff.cuni.cz)
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

/*
 * Called when an inode is released. Note that this is different
 * from ext4_file_open: open gets called at every open, but release
 * gets called only when /all/ the files are closed.
 */
static int ext4_release_file(struct inode *inode, struct file *filp)
{
        if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
                ext4_alloc_da_blocks(inode);
                ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
        }
        /* if we are the last writer on the inode, drop the block reservation */
        if ((filp->f_mode & FMODE_WRITE) &&
                        (atomic_read(&inode->i_writecount) == 1) &&
                        !EXT4_I(inode)->i_reserved_data_blocks)
        {
                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_discard_preallocations(inode);
                up_write(&EXT4_I(inode)->i_data_sem);
        }
        if (is_dx(inode) && filp->private_data)
                ext4_htree_free_dir_info(filp->private_data);

        return 0;
}

static void ext4_unwritten_wait(struct inode *inode)
{
        wait_queue_head_t *wq = ext4_ioend_wq(inode);

        wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
}

/*
 * This tests whether the IO in question is block-aligned or not.
 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
 * are converted to written only after the IO is complete.  Until they are
 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
 * it needs to zero out portions of the start and/or end block.  If 2 AIO
 * threads are at work on the same unwritten block, they must be synchronized
 * or one thread will zero the other's data, causing corruption.
 */
static int
ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
{
        struct super_block *sb = inode->i_sb;
        int blockmask = sb->s_blocksize - 1;

        if (pos >= i_size_read(inode))
                return 0;

        if ((pos | iov_iter_alignment(from)) & blockmask)
                return 1;

        return 0;
}
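
/*
 * Worked example for the check above (illustrative; assumes a 4KiB
 * block size, so blockmask == 0xfff, a page-aligned user buffer, and a
 * write landing below i_size, since otherwise the function returns 0
 * immediately): a 2048-byte AIO write at pos == 6144 (0x1800) gives
 * (0x1800 | 0x800) & 0xfff == 0x800, so the IO is treated as unaligned
 * and must be serialized; a 4096-byte write at pos == 8192 (0x2000)
 * yields 0 and may proceed without taking the aio mutex.
 */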

static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(iocb->ki_filp);
        struct mutex *aio_mutex = NULL;
        struct blk_plug plug;
        int o_direct = iocb->ki_flags & IOCB_DIRECT;
        int overwrite = 0;
        ssize_t ret;

        /*
         * Unaligned direct AIO must be serialized; see the comment above.
         * In the case of O_APPEND, assume that we must always serialize.
         */
        if (o_direct &&
            ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
            !is_sync_kiocb(iocb) &&
            (iocb->ki_flags & IOCB_APPEND ||
             ext4_unaligned_aio(inode, from, iocb->ki_pos))) {
                aio_mutex = ext4_aio_mutex(inode);
                mutex_lock(aio_mutex);
                ext4_unwritten_wait(inode);
        }

        mutex_lock(&inode->i_mutex);
        ret = generic_write_checks(iocb, from);
        if (ret <= 0)
                goto out;

        /*
         * If we have encountered a bitmap-format file, the size limit
         * is smaller than s_maxbytes, which is for extent-mapped files.
         */
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

                if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) {
                        ret = -EFBIG;
                        goto out;
                }
                iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
        }

        iocb->private = &overwrite;
        if (o_direct) {
                size_t length = iov_iter_count(from);
                loff_t pos = iocb->ki_pos;
                blk_start_plug(&plug);

                /* check whether we do a DIO overwrite or not */
                if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
                    !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
                        struct ext4_map_blocks map;
                        unsigned int blkbits = inode->i_blkbits;
                        int err, len;

                        map.m_lblk = pos >> blkbits;
                        map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
                                - map.m_lblk;
                        len = map.m_len;

                        err = ext4_map_blocks(NULL, inode, &map, 0);
                        /*
                         * 'err == len' means that all of the blocks have
                         * been preallocated, whether or not they are
                         * initialized.  To exclude unwritten extents, we
                         * also need to check m_flags.  There are two
                         * conditions that indicate an initialized extent:
                         * 1) if we hit the extent cache, the
                         * EXT4_MAP_MAPPED flag is returned; 2) if we do a
                         * real lookup, no flags are returned.  So we
                         * should check both conditions.
                         */
                        if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
                                overwrite = 1;
                }
        }

        ret = __generic_file_write_iter(iocb, from);
        mutex_unlock(&inode->i_mutex);

        if (ret > 0) {
                ssize_t err;

                err = generic_write_sync(file, iocb->ki_pos - ret, ret);
                if (err < 0)
                        ret = err;
        }
        if (o_direct)
                blk_finish_plug(&plug);

        if (aio_mutex)
                mutex_unlock(aio_mutex);
        return ret;

out:
        mutex_unlock(&inode->i_mutex);
        if (aio_mutex)
                mutex_unlock(aio_mutex);
        return ret;
}
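
/*
 * Locking summary for the write path above (informal sketch): the
 * per-inode aio_mutex, when it is taken at all, is acquired before
 * i_mutex and released only after i_mutex has been dropped; the block
 * plug brackets just the direct-IO portion of the write; and the out:
 * path unwinds i_mutex and aio_mutex in the same order as the normal
 * return path.
 */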

#ifdef CONFIG_FS_DAX
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
        struct inode *inode = bh->b_assoc_map->host;
        /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
        loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
        int err;
        if (!uptodate)
                return;
        WARN_ON(!buffer_unwritten(bh));
        err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
}

static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
                                        /* Is this the right get_block? */
}

static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
}

static const struct vm_operations_struct ext4_dax_vm_ops = {
        .fault          = ext4_dax_fault,
        .page_mkwrite   = ext4_dax_mkwrite,
        .pfn_mkwrite    = dax_pfn_mkwrite,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
#endif

static const struct vm_operations_struct ext4_file_vm_ops = {
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = ext4_page_mkwrite,
};

static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct inode *inode = file->f_mapping->host;

        if (ext4_encrypted_inode(inode)) {
                int err = ext4_get_encryption_info(inode);
                if (err)
                        return 0;
                if (ext4_encryption_info(inode) == NULL)
                        return -ENOKEY;
        }
        file_accessed(file);
        if (IS_DAX(file_inode(file))) {
                vma->vm_ops = &ext4_dax_vm_ops;
                vma->vm_flags |= VM_MIXEDMAP;
        } else {
                vma->vm_ops = &ext4_file_vm_ops;
        }
        return 0;
}

static int ext4_file_open(struct inode *inode, struct file *filp)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct vfsmount *mnt = filp->f_path.mnt;
        struct path path;
        char buf[64], *cp;
        int ret;

        if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
                     !(sb->s_flags & MS_RDONLY))) {
                sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
                /*
                 * Sample where the filesystem has been mounted and
                 * store it in the superblock for sysadmin convenience
                 * when trying to sort through large numbers of block
                 * devices or filesystem images.
                 */
                memset(buf, 0, sizeof(buf));
                path.mnt = mnt;
                path.dentry = mnt->mnt_root;
                cp = d_path(&path, buf, sizeof(buf));
                if (!IS_ERR(cp)) {
                        handle_t *handle;
                        int err;

                        handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
                        if (IS_ERR(handle))
                                return PTR_ERR(handle);
                        BUFFER_TRACE(sbi->s_sbh, "get_write_access");
                        err = ext4_journal_get_write_access(handle, sbi->s_sbh);
                        if (err) {
                                ext4_journal_stop(handle);
                                return err;
                        }
                        strlcpy(sbi->s_es->s_last_mounted, cp,
                                sizeof(sbi->s_es->s_last_mounted));
                        ext4_handle_dirty_super(handle, sb);
                        ext4_journal_stop(handle);
                }
        }
        if (ext4_encrypted_inode(inode)) {
                ret = ext4_get_encryption_info(inode);
                if (ret)
                        return -EACCES;
                if (ext4_encryption_info(inode) == NULL)
                        return -ENOKEY;
        }
        /*
         * Set up the jbd2_inode if we are opening the inode for
         * writing and the journal is present
         */
        if (filp->f_mode & FMODE_WRITE) {
                ret = ext4_inode_attach_jinode(inode);
                if (ret < 0)
                        return ret;
        }
        return dquot_file_open(inode, filp);
}
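
/*
 * The s_last_mounted sample recorded above can be read back from the
 * on-disk superblock; for example (illustrative usage, not part of
 * this file; the device name is a placeholder):
 *
 *      dumpe2fs -h /dev/sdXN | grep 'Last mounted'
 */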

/*
 * Here we use ext4_map_blocks() to get a block mapping for an
 * extent-based file rather than ext4_ext_walk_space(), because this way
 * we can handle SEEK_DATA/SEEK_HOLE for both block-mapped and
 * extent-mapped files in the same function.  Once the extent status
 * tree is fully implemented, it will track all extent status for a
 * file, and we will be able to use it directly to retrieve the offset
 * for SEEK_DATA/SEEK_HOLE.
 */

/*
 * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we need to look
 * up the page cache to check whether there is any data in the range
 * [startoff, endoff]: if this range contains an unwritten extent, we
 * treat the extent as data or as a hole depending on whether the page
 * cache has data for it.
 */
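
/*
 * Example (illustrative): after fallocate() preallocates an extent and
 * a buffered write dirties only one page of it, the extent is still
 * unwritten on disk, but the page-cache scan below reports the cached
 * page as data and the untouched remainder of the extent as a hole.
 */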
static int ext4_find_unwritten_pgoff(struct inode *inode,
                                     int whence,
                                     struct ext4_map_blocks *map,
                                     loff_t *offset)
{
        struct pagevec pvec;
        unsigned int blkbits;
        pgoff_t index;
        pgoff_t end;
        loff_t endoff;
        loff_t startoff;
        loff_t lastoff;
        int found = 0;

        blkbits = inode->i_sb->s_blocksize_bits;
        startoff = *offset;
        lastoff = startoff;
        endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;

        index = startoff >> PAGE_CACHE_SHIFT;
        end = endoff >> PAGE_CACHE_SHIFT;

        pagevec_init(&pvec, 0);
        do {
                int i, num;
                unsigned long nr_pages;

                num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
                nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
                                          (pgoff_t)num);
                if (nr_pages == 0) {
                        if (whence == SEEK_DATA)
                                break;

                        BUG_ON(whence != SEEK_HOLE);
                        /*
                         * If this is the first iteration of the loop, or
                         * the offset is not beyond the end offset, there
                         * is a hole at this offset.
                         */
                        if (lastoff == startoff || lastoff < endoff)
                                found = 1;
                        break;
                }

                /*
                 * If this is the first iteration of the loop and the
                 * offset is smaller than the first page offset, there is
                 * a hole at this offset.
                 */
                if (lastoff == startoff && whence == SEEK_HOLE &&
                    lastoff < page_offset(pvec.pages[0])) {
                        found = 1;
                        break;
                }

                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
                        struct buffer_head *bh, *head;

                        /*
                         * If the current offset is not beyond the end of
                         * the given range and this page lies past it, the
                         * offset is in a hole.
                         */
                        if (lastoff < endoff && whence == SEEK_HOLE &&
                            page->index > end) {
                                found = 1;
                                *offset = lastoff;
                                goto out;
                        }

                        lock_page(page);

                        if (unlikely(page->mapping != inode->i_mapping)) {
                                unlock_page(page);
                                continue;
                        }

                        if (!page_has_buffers(page)) {
                                unlock_page(page);
                                continue;
                        }

                        lastoff = page_offset(page);
                        bh = head = page_buffers(page);
                        do {
                                if (buffer_uptodate(bh) ||
                                    buffer_unwritten(bh)) {
                                        if (whence == SEEK_DATA)
                                                found = 1;
                                } else {
                                        if (whence == SEEK_HOLE)
                                                found = 1;
                                }
                                if (found) {
                                        *offset = max_t(loff_t,
                                                        startoff, lastoff);
                                        unlock_page(page);
                                        goto out;
                                }
                                lastoff += bh->b_size;
                                bh = bh->b_this_page;
                        } while (bh != head);

                        lastoff = page_offset(page) + PAGE_SIZE;
                        unlock_page(page);
                }

                /*
                 * If fewer pages were found than requested, the rest of
                 * the range is a hole.
                 */
                if (nr_pages < num && whence == SEEK_HOLE) {
                        found = 1;
                        *offset = lastoff;
                        break;
                }

                index = pvec.pages[i - 1]->index + 1;
                pagevec_release(&pvec);
        } while (index <= end);

out:
        pagevec_release(&pvec);
        return found;
}

/*
 * ext4_seek_data() retrieves the offset for SEEK_DATA.
 */
static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
{
        struct inode *inode = file->f_mapping->host;
        struct ext4_map_blocks map;
        struct extent_status es;
        ext4_lblk_t start, last, end;
        loff_t dataoff, isize;
        int blkbits;
        int ret = 0;

        mutex_lock(&inode->i_mutex);

        isize = i_size_read(inode);
        if (offset >= isize) {
                mutex_unlock(&inode->i_mutex);
                return -ENXIO;
        }

        blkbits = inode->i_sb->s_blocksize_bits;
        start = offset >> blkbits;
        last = start;
        end = isize >> blkbits;
        dataoff = offset;

        do {
                map.m_lblk = last;
                map.m_len = end - last + 1;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
                        if (last != start)
                                dataoff = (loff_t)last << blkbits;
                        break;
                }

                /*
                 * If there is a delayed-allocation extent at this
                 * offset, treat it as data.
                 */
                ext4_es_find_delayed_extent_range(inode, last, last, &es);
                if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
                        if (last != start)
                                dataoff = (loff_t)last << blkbits;
                        break;
                }

                /*
                 * If there is an unwritten extent at this offset, it
                 * counts as data or as a hole depending on whether the
                 * page cache has data for it.
                 */
                if (map.m_flags & EXT4_MAP_UNWRITTEN) {
                        int unwritten;
                        unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
                                                              &map, &dataoff);
                        if (unwritten)
                                break;
                }

                last++;
                dataoff = (loff_t)last << blkbits;
        } while (last <= end);

        mutex_unlock(&inode->i_mutex);

        if (dataoff > isize)
                return -ENXIO;

        return vfs_setpos(file, dataoff, maxsize);
}

/*
 * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
 */
static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
{
        struct inode *inode = file->f_mapping->host;
        struct ext4_map_blocks map;
        struct extent_status es;
        ext4_lblk_t start, last, end;
        loff_t holeoff, isize;
        int blkbits;
        int ret = 0;

        mutex_lock(&inode->i_mutex);

        isize = i_size_read(inode);
        if (offset >= isize) {
                mutex_unlock(&inode->i_mutex);
                return -ENXIO;
        }

        blkbits = inode->i_sb->s_blocksize_bits;
        start = offset >> blkbits;
        last = start;
        end = isize >> blkbits;
        holeoff = offset;

        do {
                map.m_lblk = last;
                map.m_len = end - last + 1;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
                        last += ret;
                        holeoff = (loff_t)last << blkbits;
                        continue;
                }

                /*
                 * If there is a delayed-allocation extent at this
                 * offset, skip over it.
                 */
                ext4_es_find_delayed_extent_range(inode, last, last, &es);
                if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
                        last = es.es_lblk + es.es_len;
                        holeoff = (loff_t)last << blkbits;
                        continue;
                }

                /*
                 * If there is an unwritten extent at this offset, it
                 * counts as data or as a hole depending on whether the
                 * page cache has data for it.
                 */
                if (map.m_flags & EXT4_MAP_UNWRITTEN) {
                        int unwritten;
                        unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
                                                              &map, &holeoff);
                        if (!unwritten) {
                                last += ret;
                                holeoff = (loff_t)last << blkbits;
                                continue;
                        }
                }

                /* found a hole */
                break;
        } while (last <= end);

        mutex_unlock(&inode->i_mutex);

        if (holeoff > isize)
                holeoff = isize;

        return vfs_setpos(file, holeoff, maxsize);
}

/*
 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
 * by calling generic_file_llseek_size() with the appropriate maxbytes
 * value for each.
 */
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file->f_mapping->host;
        loff_t maxbytes;

        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
        else
                maxbytes = inode->i_sb->s_maxbytes;

        switch (whence) {
        case SEEK_SET:
        case SEEK_CUR:
        case SEEK_END:
                return generic_file_llseek_size(file, offset, whence,
                                                maxbytes, i_size_read(inode));
        case SEEK_DATA:
                return ext4_seek_data(file, offset, maxbytes);
        case SEEK_HOLE:
                return ext4_seek_hole(file, offset, maxbytes);
        }

        return -EINVAL;
}
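
/*
 * Userspace view of the SEEK_DATA/SEEK_HOLE support above (illustrative
 * sketch, not part of this file): a sparse-file scanner can alternate
 * between the two whence values to enumerate data regions:
 *
 *      off_t data = 0, hole;
 *      while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
 *              hole = lseek(fd, data, SEEK_HOLE);
 *              process_extent(fd, data, hole - data);  // hypothetical helper
 *              data = hole;
 *      }
 *
 * lseek() returns -1 with errno == ENXIO once the offset passes the
 * last data region, terminating the loop.
 */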

const struct file_operations ext4_file_operations = {
        .llseek         = ext4_llseek,
        .read_iter      = generic_file_read_iter,
        .write_iter     = ext4_file_write_iter,
        .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
#endif
        .mmap           = ext4_file_mmap,
        .open           = ext4_file_open,
        .release        = ext4_release_file,
        .fsync          = ext4_sync_file,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .fallocate      = ext4_fallocate,
};

const struct inode_operations ext4_file_inode_operations = {
        .setattr        = ext4_setattr,
        .getattr        = ext4_getattr,
        .setxattr       = generic_setxattr,
        .getxattr       = generic_getxattr,
        .listxattr      = ext4_listxattr,
        .removexattr    = generic_removexattr,
        .get_acl        = ext4_get_acl,
        .set_acl        = ext4_set_acl,
        .fiemap         = ext4_fiemap,
};