linux/fs/f2fs/checkpoint.c
   1/*
   2 * fs/f2fs/checkpoint.c
   3 *
   4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
   5 *             http://www.samsung.com/
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License version 2 as
   9 * published by the Free Software Foundation.
  10 */
  11#include <linux/fs.h>
  12#include <linux/bio.h>
  13#include <linux/mpage.h>
  14#include <linux/writeback.h>
  15#include <linux/blkdev.h>
  16#include <linux/f2fs_fs.h>
  17#include <linux/pagevec.h>
  18#include <linux/swap.h>
  19
  20#include "f2fs.h"
  21#include "node.h"
  22#include "segment.h"
  23#include "trace.h"
  24#include <trace/events/f2fs.h>
  25
  26static struct kmem_cache *ino_entry_slab;
  27struct kmem_cache *inode_entry_slab;
  28
  29/*
  30 * We guarantee no failure on the returned page.
  31 */
  32struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
  33{
  34        struct address_space *mapping = META_MAPPING(sbi);
  35        struct page *page = NULL;
  36repeat:
  37        page = grab_cache_page(mapping, index);
  38        if (!page) {
  39                cond_resched();
  40                goto repeat;
  41        }
  42        f2fs_wait_on_page_writeback(page, META);
  43        SetPageUptodate(page);
  44        return page;
  45}
  46
  47/*
  48 * We guarantee no failure on the returned page.
  49 */
  50static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
  51                                                        bool is_meta)
  52{
  53        struct address_space *mapping = META_MAPPING(sbi);
  54        struct page *page;
  55        struct f2fs_io_info fio = {
  56                .sbi = sbi,
  57                .type = META,
  58                .rw = READ_SYNC | REQ_META | REQ_PRIO,
  59                .blk_addr = index,
  60                .encrypted_page = NULL,
  61        };
  62
  63        if (unlikely(!is_meta))
  64                fio.rw &= ~REQ_META;
  65repeat:
  66        page = grab_cache_page(mapping, index);
  67        if (!page) {
  68                cond_resched();
  69                goto repeat;
  70        }
  71        if (PageUptodate(page))
  72                goto out;
  73
  74        fio.page = page;
  75
  76        if (f2fs_submit_page_bio(&fio)) {
  77                f2fs_put_page(page, 1);
  78                goto repeat;
  79        }
  80
  81        lock_page(page);
  82        if (unlikely(page->mapping != mapping)) {
  83                f2fs_put_page(page, 1);
  84                goto repeat;
  85        }
  86
  87        /*
   88         * if there is any IO error when accessing the device, make our
   89         * filesystem read-only and make sure we do not write a checkpoint
   90         * with a non-uptodate meta page.
  91         */
  92        if (unlikely(!PageUptodate(page)))
  93                f2fs_stop_checkpoint(sbi);
  94out:
  95        return page;
  96}
  97
  98struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
  99{
 100        return __get_meta_page(sbi, index, true);
 101}
 102
  103/* for POR (power-off recovery) only */
 104struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index)
 105{
 106        return __get_meta_page(sbi, index, false);
 107}
 108
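     /*
      * Sanity check a block address against the boundaries of its meta
      * area (SIT/SSA/CP) or, for META_POR, against the main area;
      * META_NAT addresses are always accepted here.
      */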
 109bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
 110{
 111        switch (type) {
 112        case META_NAT:
 113                break;
 114        case META_SIT:
 115                if (unlikely(blkaddr >= SIT_BLK_CNT(sbi)))
 116                        return false;
 117                break;
 118        case META_SSA:
 119                if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) ||
 120                        blkaddr < SM_I(sbi)->ssa_blkaddr))
 121                        return false;
 122                break;
 123        case META_CP:
 124                if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr ||
 125                        blkaddr < __start_cp_addr(sbi)))
 126                        return false;
 127                break;
 128        case META_POR:
 129                if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
 130                        blkaddr < MAIN_BLKADDR(sbi)))
 131                        return false;
 132                break;
 133        default:
 134                BUG();
 135        }
 136
 137        return true;
 138}
 139
 140/*
 141 * Readahead CP/NAT/SIT/SSA pages
 142 */
 143int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
 144                                                        int type, bool sync)
 145{
 146        block_t prev_blk_addr = 0;
 147        struct page *page;
 148        block_t blkno = start;
 149        struct f2fs_io_info fio = {
 150                .sbi = sbi,
 151                .type = META,
 152                .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA,
 153                .encrypted_page = NULL,
 154        };
 155
 156        if (unlikely(type == META_POR))
 157                fio.rw &= ~REQ_META;
 158
 159        for (; nrpages-- > 0; blkno++) {
 160
 161                if (!is_valid_blkaddr(sbi, blkno, type))
 162                        goto out;
 163
 164                switch (type) {
 165                case META_NAT:
 166                        if (unlikely(blkno >=
 167                                        NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
 168                                blkno = 0;
 169                        /* get nat block addr */
 170                        fio.blk_addr = current_nat_addr(sbi,
 171                                        blkno * NAT_ENTRY_PER_BLOCK);
 172                        break;
 173                case META_SIT:
 174                        /* get sit block addr */
 175                        fio.blk_addr = current_sit_addr(sbi,
 176                                        blkno * SIT_ENTRY_PER_BLOCK);
 177                        if (blkno != start && prev_blk_addr + 1 != fio.blk_addr)
 178                                goto out;
 179                        prev_blk_addr = fio.blk_addr;
 180                        break;
 181                case META_SSA:
 182                case META_CP:
 183                case META_POR:
 184                        fio.blk_addr = blkno;
 185                        break;
 186                default:
 187                        BUG();
 188                }
 189
 190                page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr);
 191                if (!page)
 192                        continue;
 193                if (PageUptodate(page)) {
 194                        f2fs_put_page(page, 1);
 195                        continue;
 196                }
 197
 198                fio.page = page;
 199                f2fs_submit_page_mbio(&fio);
 200                f2fs_put_page(page, 0);
 201        }
 202out:
 203        f2fs_submit_merged_bio(sbi, META, READ);
 204        return blkno - start;
 205}
 206
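     /*
      * Readahead META_POR pages when the page at @index is either missing
      * from the meta page cache or not yet uptodate.
      */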
 207void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
 208{
 209        struct page *page;
 210        bool readahead = false;
 211
 212        page = find_get_page(META_MAPPING(sbi), index);
  213        if (!page || !PageUptodate(page))
 214                readahead = true;
 215        f2fs_put_page(page, 0);
 216
 217        if (readahead)
 218                ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true);
 219}
 220
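     /*
      * Write back one dirty meta page; the page is redirtied instead when
      * recovery (POR) is in progress, a checkpoint error was detected, or
      * reclaim asks for a page that lies below the SSA area.
      */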
 221static int f2fs_write_meta_page(struct page *page,
 222                                struct writeback_control *wbc)
 223{
 224        struct f2fs_sb_info *sbi = F2FS_P_SB(page);
 225
 226        trace_f2fs_writepage(page, META);
 227
 228        if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 229                goto redirty_out;
 230        if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0))
 231                goto redirty_out;
 232        if (unlikely(f2fs_cp_error(sbi)))
 233                goto redirty_out;
 234
 235        f2fs_wait_on_page_writeback(page, META);
 236        write_meta_page(sbi, page);
 237        dec_page_count(sbi, F2FS_DIRTY_META);
 238        unlock_page(page);
 239
 240        if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi)))
 241                f2fs_submit_merged_bio(sbi, META, WRITE);
 242        return 0;
 243
 244redirty_out:
 245        redirty_page_for_writepage(wbc, page);
 246        return AOP_WRITEPAGE_ACTIVATE;
 247}
 248
 249static int f2fs_write_meta_pages(struct address_space *mapping,
 250                                struct writeback_control *wbc)
 251{
 252        struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
 253        long diff, written;
 254
 255        trace_f2fs_writepages(mapping->host, wbc, META);
 256
  257        /* collect a number of dirty meta pages and write them together */
 258        if (wbc->for_kupdate ||
 259                get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
 260                goto skip_write;
 261
  262        /* if mounting failed, skip writing meta pages */
 263        mutex_lock(&sbi->cp_mutex);
 264        diff = nr_pages_to_write(sbi, META, wbc);
 265        written = sync_meta_pages(sbi, META, wbc->nr_to_write);
 266        mutex_unlock(&sbi->cp_mutex);
 267        wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
 268        return 0;
 269
 270skip_write:
 271        wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
 272        return 0;
 273}
 274
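     /*
      * Write back up to @nr_to_write dirty meta pages.  Unless @nr_to_write
      * is LONG_MAX, writeback stops at the first gap in the page index, so
      * that only one contiguous run is submitted per call.
      */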
 275long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 276                                                long nr_to_write)
 277{
 278        struct address_space *mapping = META_MAPPING(sbi);
 279        pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX;
 280        struct pagevec pvec;
 281        long nwritten = 0;
 282        struct writeback_control wbc = {
 283                .for_reclaim = 0,
 284        };
 285
 286        pagevec_init(&pvec, 0);
 287
 288        while (index <= end) {
 289                int i, nr_pages;
 290                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 291                                PAGECACHE_TAG_DIRTY,
 292                                min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
 293                if (unlikely(nr_pages == 0))
 294                        break;
 295
 296                for (i = 0; i < nr_pages; i++) {
 297                        struct page *page = pvec.pages[i];
 298
 299                        if (prev == LONG_MAX)
 300                                prev = page->index - 1;
 301                        if (nr_to_write != LONG_MAX && page->index != prev + 1) {
 302                                pagevec_release(&pvec);
 303                                goto stop;
 304                        }
 305
 306                        lock_page(page);
 307
 308                        if (unlikely(page->mapping != mapping)) {
 309continue_unlock:
 310                                unlock_page(page);
 311                                continue;
 312                        }
 313                        if (!PageDirty(page)) {
 314                                /* someone wrote it for us */
 315                                goto continue_unlock;
 316                        }
 317
 318                        if (!clear_page_dirty_for_io(page))
 319                                goto continue_unlock;
 320
 321                        if (mapping->a_ops->writepage(page, &wbc)) {
 322                                unlock_page(page);
 323                                break;
 324                        }
 325                        nwritten++;
 326                        prev = page->index;
 327                        if (unlikely(nwritten >= nr_to_write))
 328                                break;
 329                }
 330                pagevec_release(&pvec);
 331                cond_resched();
 332        }
 333stop:
 334        if (nwritten)
 335                f2fs_submit_merged_bio(sbi, type, WRITE);
 336
 337        return nwritten;
 338}
 339
 340static int f2fs_set_meta_page_dirty(struct page *page)
 341{
 342        trace_f2fs_set_page_dirty(page, META);
 343
 344        SetPageUptodate(page);
 345        if (!PageDirty(page)) {
 346                __set_page_dirty_nobuffers(page);
 347                inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
 348                SetPagePrivate(page);
 349                f2fs_trace_pid(page);
 350                return 1;
 351        }
 352        return 0;
 353}
 354
 355const struct address_space_operations f2fs_meta_aops = {
 356        .writepage      = f2fs_write_meta_page,
 357        .writepages     = f2fs_write_meta_pages,
 358        .set_page_dirty = f2fs_set_meta_page_dirty,
 359        .invalidatepage = f2fs_invalidate_page,
 360        .releasepage    = f2fs_release_page,
 361};
 362
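     /*
      * Insert @ino into the per-type radix tree and list; if an entry for
      * @ino already exists, the preallocated one is freed again.
      */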
 363static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
 364{
 365        struct inode_management *im = &sbi->im[type];
 366        struct ino_entry *e, *tmp;
 367
 368        tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS);
 369retry:
 370        radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
 371
 372        spin_lock(&im->ino_lock);
 373        e = radix_tree_lookup(&im->ino_root, ino);
 374        if (!e) {
 375                e = tmp;
 376                if (radix_tree_insert(&im->ino_root, ino, e)) {
 377                        spin_unlock(&im->ino_lock);
 378                        radix_tree_preload_end();
 379                        goto retry;
 380                }
 381                memset(e, 0, sizeof(struct ino_entry));
 382                e->ino = ino;
 383
 384                list_add_tail(&e->list, &im->ino_list);
 385                if (type != ORPHAN_INO)
 386                        im->ino_num++;
 387        }
 388        spin_unlock(&im->ino_lock);
 389        radix_tree_preload_end();
 390
 391        if (e != tmp)
 392                kmem_cache_free(ino_entry_slab, tmp);
 393}
 394
 395static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
 396{
 397        struct inode_management *im = &sbi->im[type];
 398        struct ino_entry *e;
 399
 400        spin_lock(&im->ino_lock);
 401        e = radix_tree_lookup(&im->ino_root, ino);
 402        if (e) {
 403                list_del(&e->list);
 404                radix_tree_delete(&im->ino_root, ino);
 405                im->ino_num--;
 406                spin_unlock(&im->ino_lock);
 407                kmem_cache_free(ino_entry_slab, e);
 408                return;
 409        }
 410        spin_unlock(&im->ino_lock);
 411}
 412
 413void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
 414{
 415        /* add new dirty ino entry into list */
 416        __add_ino_entry(sbi, ino, type);
 417}
 418
 419void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
 420{
 421        /* remove dirty ino entry from list */
 422        __remove_ino_entry(sbi, ino, type);
 423}
 424
 425/* mode should be APPEND_INO or UPDATE_INO */
 426bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
 427{
 428        struct inode_management *im = &sbi->im[mode];
 429        struct ino_entry *e;
 430
 431        spin_lock(&im->ino_lock);
 432        e = radix_tree_lookup(&im->ino_root, ino);
 433        spin_unlock(&im->ino_lock);
 434        return e ? true : false;
 435}
 436
 437void release_ino_entry(struct f2fs_sb_info *sbi)
 438{
 439        struct ino_entry *e, *tmp;
 440        int i;
 441
 442        for (i = APPEND_INO; i <= UPDATE_INO; i++) {
 443                struct inode_management *im = &sbi->im[i];
 444
 445                spin_lock(&im->ino_lock);
 446                list_for_each_entry_safe(e, tmp, &im->ino_list, list) {
 447                        list_del(&e->list);
 448                        radix_tree_delete(&im->ino_root, e->ino);
 449                        kmem_cache_free(ino_entry_slab, e);
 450                        im->ino_num--;
 451                }
 452                spin_unlock(&im->ino_lock);
 453        }
 454}
 455
 456int acquire_orphan_inode(struct f2fs_sb_info *sbi)
 457{
 458        struct inode_management *im = &sbi->im[ORPHAN_INO];
 459        int err = 0;
 460
 461        spin_lock(&im->ino_lock);
 462        if (unlikely(im->ino_num >= sbi->max_orphans))
 463                err = -ENOSPC;
 464        else
 465                im->ino_num++;
 466        spin_unlock(&im->ino_lock);
 467
 468        return err;
 469}
 470
 471void release_orphan_inode(struct f2fs_sb_info *sbi)
 472{
 473        struct inode_management *im = &sbi->im[ORPHAN_INO];
 474
 475        spin_lock(&im->ino_lock);
 476        f2fs_bug_on(sbi, im->ino_num == 0);
 477        im->ino_num--;
 478        spin_unlock(&im->ino_lock);
 479}
 480
 481void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 482{
 483        /* add new orphan ino entry into list */
 484        __add_ino_entry(sbi, ino, ORPHAN_INO);
 485}
 486
 487void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 488{
 489        /* remove orphan entry from orphan list */
 490        __remove_ino_entry(sbi, ino, ORPHAN_INO);
 491}
 492
 493static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 494{
 495        struct inode *inode;
 496
 497        inode = f2fs_iget(sbi->sb, ino);
 498        if (IS_ERR(inode)) {
 499                /*
  500                 * it is a bug if we cannot find the inode entry for an
  501                 * orphan ino recorded in the checkpoint.
 502                 */
 503                f2fs_bug_on(sbi, PTR_ERR(inode) == -ENOENT);
 504                return PTR_ERR(inode);
 505        }
 506
 507        clear_nlink(inode);
 508
 509        /* truncate all the data during iput */
 510        iput(inode);
 511        return 0;
 512}
 513
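     /*
      * Walk the orphan blocks recorded in the current checkpoint pack,
      * drop every orphan inode listed there and clear the orphan flag.
      */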
 514int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 515{
 516        block_t start_blk, orphan_blocks, i, j;
 517        int err;
 518
 519        if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
 520                return 0;
 521
 522        start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
 523        orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
 524
 525        ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true);
 526
 527        for (i = 0; i < orphan_blocks; i++) {
 528                struct page *page = get_meta_page(sbi, start_blk + i);
 529                struct f2fs_orphan_block *orphan_blk;
 530
 531                orphan_blk = (struct f2fs_orphan_block *)page_address(page);
 532                for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
 533                        nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
 534                        err = recover_orphan_inode(sbi, ino);
 535                        if (err) {
 536                                f2fs_put_page(page, 1);
 537                                return err;
 538                        }
 539                }
 540                f2fs_put_page(page, 1);
 541        }
  542        /* clear the orphan present flag */
 543        clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
 544        return 0;
 545}
 546
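     /*
      * Pack the in-memory orphan ino list into orphan blocks starting at
      * @start_blk, F2FS_ORPHANS_PER_BLOCK entries per block.
      */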
 547static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
 548{
 549        struct list_head *head;
 550        struct f2fs_orphan_block *orphan_blk = NULL;
 551        unsigned int nentries = 0;
 552        unsigned short index = 1;
 553        unsigned short orphan_blocks;
 554        struct page *page = NULL;
 555        struct ino_entry *orphan = NULL;
 556        struct inode_management *im = &sbi->im[ORPHAN_INO];
 557
 558        orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num);
 559
 560        /*
  561         * we don't need to take spin_lock(&im->ino_lock) here, since all
  562         * orphan inode operations are covered by f2fs_lock_op(), and the
  563         * spin_lock must be avoided because of the page operations below.
 564         */
 565        head = &im->ino_list;
 566
  567        /* loop over each orphan inode entry and write it into the orphan blocks */
 568        list_for_each_entry(orphan, head, list) {
 569                if (!page) {
 570                        page = grab_meta_page(sbi, start_blk++);
 571                        orphan_blk =
 572                                (struct f2fs_orphan_block *)page_address(page);
 573                        memset(orphan_blk, 0, sizeof(*orphan_blk));
 574                }
 575
 576                orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
 577
 578                if (nentries == F2FS_ORPHANS_PER_BLOCK) {
 579                        /*
  580                         * once an orphan block is full of 1020 entries,
  581                         * flush the current orphan block and bring
  582                         * another one into memory
 583                         */
 584                        orphan_blk->blk_addr = cpu_to_le16(index);
 585                        orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
 586                        orphan_blk->entry_count = cpu_to_le32(nentries);
 587                        set_page_dirty(page);
 588                        f2fs_put_page(page, 1);
 589                        index++;
 590                        nentries = 0;
 591                        page = NULL;
 592                }
 593        }
 594
 595        if (page) {
 596                orphan_blk->blk_addr = cpu_to_le16(index);
 597                orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
 598                orphan_blk->entry_count = cpu_to_le32(nentries);
 599                set_page_dirty(page);
 600                f2fs_put_page(page, 1);
 601        }
 602}
 603
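     /*
      * Read the first and the last block of one CP pack, verify both CRCs
      * and check that the two version numbers match; return the first cp
      * page on success or NULL if the pack is inconsistent.
      */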
 604static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
 605                                block_t cp_addr, unsigned long long *version)
 606{
 607        struct page *cp_page_1, *cp_page_2 = NULL;
 608        unsigned long blk_size = sbi->blocksize;
 609        struct f2fs_checkpoint *cp_block;
 610        unsigned long long cur_version = 0, pre_version = 0;
 611        size_t crc_offset;
 612        __u32 crc = 0;
 613
 614        /* Read the 1st cp block in this CP pack */
 615        cp_page_1 = get_meta_page(sbi, cp_addr);
 616
 617        /* get the version number */
 618        cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1);
 619        crc_offset = le32_to_cpu(cp_block->checksum_offset);
 620        if (crc_offset >= blk_size)
 621                goto invalid_cp1;
 622
 623        crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
 624        if (!f2fs_crc_valid(crc, cp_block, crc_offset))
 625                goto invalid_cp1;
 626
 627        pre_version = cur_cp_version(cp_block);
 628
 629        /* Read the 2nd cp block in this CP pack */
 630        cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
 631        cp_page_2 = get_meta_page(sbi, cp_addr);
 632
 633        cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2);
 634        crc_offset = le32_to_cpu(cp_block->checksum_offset);
 635        if (crc_offset >= blk_size)
 636                goto invalid_cp2;
 637
 638        crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
 639        if (!f2fs_crc_valid(crc, cp_block, crc_offset))
 640                goto invalid_cp2;
 641
 642        cur_version = cur_cp_version(cp_block);
 643
 644        if (cur_version == pre_version) {
 645                *version = cur_version;
 646                f2fs_put_page(cp_page_2, 1);
 647                return cp_page_1;
 648        }
 649invalid_cp2:
 650        f2fs_put_page(cp_page_2, 1);
 651invalid_cp1:
 652        f2fs_put_page(cp_page_1, 1);
 653        return NULL;
 654}
 655
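     /*
      * Validate both CP packs and keep whichever one carries the newer
      * version in sbi->ckpt, including any extra cp payload blocks.
      */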
 656int get_valid_checkpoint(struct f2fs_sb_info *sbi)
 657{
 658        struct f2fs_checkpoint *cp_block;
 659        struct f2fs_super_block *fsb = sbi->raw_super;
 660        struct page *cp1, *cp2, *cur_page;
 661        unsigned long blk_size = sbi->blocksize;
 662        unsigned long long cp1_version = 0, cp2_version = 0;
 663        unsigned long long cp_start_blk_no;
 664        unsigned int cp_blks = 1 + __cp_payload(sbi);
 665        block_t cp_blk_no;
 666        int i;
 667
 668        sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL);
 669        if (!sbi->ckpt)
 670                return -ENOMEM;
 671        /*
  672         * Finding the valid cp block involves reading both
  673         * sets (cp pack 1 and cp pack 2)
 674         */
 675        cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
 676        cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
 677
 678        /* The second checkpoint pack should start at the next segment */
 679        cp_start_blk_no += ((unsigned long long)1) <<
 680                                le32_to_cpu(fsb->log_blocks_per_seg);
 681        cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
 682
 683        if (cp1 && cp2) {
 684                if (ver_after(cp2_version, cp1_version))
 685                        cur_page = cp2;
 686                else
 687                        cur_page = cp1;
 688        } else if (cp1) {
 689                cur_page = cp1;
 690        } else if (cp2) {
 691                cur_page = cp2;
 692        } else {
 693                goto fail_no_cp;
 694        }
 695
 696        cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
 697        memcpy(sbi->ckpt, cp_block, blk_size);
 698
 699        if (cp_blks <= 1)
 700                goto done;
 701
 702        cp_blk_no = le32_to_cpu(fsb->cp_blkaddr);
 703        if (cur_page == cp2)
 704                cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
 705
 706        for (i = 1; i < cp_blks; i++) {
 707                void *sit_bitmap_ptr;
 708                unsigned char *ckpt = (unsigned char *)sbi->ckpt;
 709
 710                cur_page = get_meta_page(sbi, cp_blk_no + i);
 711                sit_bitmap_ptr = page_address(cur_page);
 712                memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size);
 713                f2fs_put_page(cur_page, 1);
 714        }
 715done:
 716        f2fs_put_page(cp1, 1);
 717        f2fs_put_page(cp2, 1);
 718        return 0;
 719
 720fail_no_cp:
 721        kfree(sbi->ckpt);
 722        return -EINVAL;
 723}
 724
 725static void __add_dirty_inode(struct inode *inode, enum inode_type type)
 726{
 727        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 728        struct f2fs_inode_info *fi = F2FS_I(inode);
 729        int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
 730
 731        if (is_inode_flag_set(fi, flag))
 732                return;
 733
 734        set_inode_flag(fi, flag);
 735        list_add_tail(&fi->dirty_list, &sbi->inode_list[type]);
 736        stat_inc_dirty_inode(sbi, type);
 737}
 738
 739static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
 740{
 741        struct f2fs_inode_info *fi = F2FS_I(inode);
 742        int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
 743
 744        if (get_dirty_pages(inode) ||
 745                        !is_inode_flag_set(F2FS_I(inode), flag))
 746                return;
 747
 748        list_del_init(&fi->dirty_list);
 749        clear_inode_flag(fi, flag);
 750        stat_dec_dirty_inode(F2FS_I_SB(inode), type);
 751}
 752
 753void update_dirty_page(struct inode *inode, struct page *page)
 754{
 755        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 756        enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
 757
 758        if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
 759                        !S_ISLNK(inode->i_mode))
 760                return;
 761
 762        spin_lock(&sbi->inode_lock[type]);
 763        __add_dirty_inode(inode, type);
 764        inode_inc_dirty_pages(inode);
 765        spin_unlock(&sbi->inode_lock[type]);
 766
 767        SetPagePrivate(page);
 768        f2fs_trace_pid(page);
 769}
 770
 771void add_dirty_dir_inode(struct inode *inode)
 772{
 773        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 774
 775        spin_lock(&sbi->inode_lock[DIR_INODE]);
 776        __add_dirty_inode(inode, DIR_INODE);
 777        spin_unlock(&sbi->inode_lock[DIR_INODE]);
 778}
 779
 780void remove_dirty_inode(struct inode *inode)
 781{
 782        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 783        struct f2fs_inode_info *fi = F2FS_I(inode);
 784        enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
 785
 786        if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
 787                        !S_ISLNK(inode->i_mode))
 788                return;
 789
 790        spin_lock(&sbi->inode_lock[type]);
 791        __remove_dirty_inode(inode, type);
 792        spin_unlock(&sbi->inode_lock[type]);
 793
 794        /* Only from the recovery routine */
 795        if (is_inode_flag_set(fi, FI_DELAY_IPUT)) {
 796                clear_inode_flag(fi, FI_DELAY_IPUT);
 797                iput(inode);
 798        }
 799}
 800
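     /*
      * Flush the dirty pages of every inode on the given dirty list until
      * the list becomes empty or a checkpoint error is detected.
      */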
 801int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
 802{
 803        struct list_head *head;
 804        struct inode *inode;
 805        struct f2fs_inode_info *fi;
 806        bool is_dir = (type == DIR_INODE);
 807
 808        trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir,
 809                                get_pages(sbi, is_dir ?
 810                                F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
 811retry:
 812        if (unlikely(f2fs_cp_error(sbi)))
 813                return -EIO;
 814
 815        spin_lock(&sbi->inode_lock[type]);
 816
 817        head = &sbi->inode_list[type];
 818        if (list_empty(head)) {
 819                spin_unlock(&sbi->inode_lock[type]);
 820                trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir,
 821                                get_pages(sbi, is_dir ?
 822                                F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
 823                return 0;
 824        }
 825        fi = list_entry(head->next, struct f2fs_inode_info, dirty_list);
 826        inode = igrab(&fi->vfs_inode);
 827        spin_unlock(&sbi->inode_lock[type]);
 828        if (inode) {
 829                filemap_fdatawrite(inode->i_mapping);
 830                iput(inode);
 831        } else {
 832                /*
  833                 * We should submit the bio, since several dentry pages of
  834                 * the inode being freed may still be under writeback.
 835                 */
 836                f2fs_submit_merged_bio(sbi, DATA, WRITE);
 837                cond_resched();
 838        }
 839        goto retry;
 840}
 841
 842/*
 843 * Freeze all the FS-operations for checkpoint.
 844 */
 845static int block_operations(struct f2fs_sb_info *sbi)
 846{
 847        struct writeback_control wbc = {
 848                .sync_mode = WB_SYNC_ALL,
 849                .nr_to_write = LONG_MAX,
 850                .for_reclaim = 0,
 851        };
 852        struct blk_plug plug;
 853        int err = 0;
 854
 855        blk_start_plug(&plug);
 856
 857retry_flush_dents:
 858        f2fs_lock_all(sbi);
 859        /* write all the dirty dentry pages */
 860        if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
 861                f2fs_unlock_all(sbi);
 862                err = sync_dirty_inodes(sbi, DIR_INODE);
 863                if (err)
 864                        goto out;
 865                goto retry_flush_dents;
 866        }
 867
 868        /*
  869         * POR: we should ensure that there are no dirty node pages
  870         * until the nat/sit flush is finished.
 871         */
 872retry_flush_nodes:
 873        down_write(&sbi->node_write);
 874
 875        if (get_pages(sbi, F2FS_DIRTY_NODES)) {
 876                up_write(&sbi->node_write);
 877                err = sync_node_pages(sbi, 0, &wbc);
 878                if (err) {
 879                        f2fs_unlock_all(sbi);
 880                        goto out;
 881                }
 882                goto retry_flush_nodes;
 883        }
 884out:
 885        blk_finish_plug(&plug);
 886        return err;
 887}
 888
 889static void unblock_operations(struct f2fs_sb_info *sbi)
 890{
 891        up_write(&sbi->node_write);
 892        f2fs_unlock_all(sbi);
 893}
 894
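     /*
      * Sleep on cp_wait until no page of this filesystem remains under
      * writeback.
      */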
 895static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
 896{
 897        DEFINE_WAIT(wait);
 898
 899        for (;;) {
 900                prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
 901
 902                if (!get_pages(sbi, F2FS_WRITEBACK))
 903                        break;
 904
 905                io_schedule();
 906        }
 907        finish_wait(&sbi->cp_wait, &wait);
 908}
 909
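     /*
      * Build the checkpoint pack (cp blocks, payload, orphan blocks and
      * segment summaries), write it to the meta area and wait until all
      * of it has reached the disk.
      */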
 910static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 911{
 912        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
 913        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
 914        struct f2fs_nm_info *nm_i = NM_I(sbi);
 915        unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
 916        nid_t last_nid = nm_i->next_scan_nid;
 917        block_t start_blk;
 918        unsigned int data_sum_blocks, orphan_blocks;
 919        __u32 crc32 = 0;
 920        int i;
 921        int cp_payload_blks = __cp_payload(sbi);
 922        block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg);
 923        bool invalidate = false;
 924
 925        /*
  926         * This avoids conducting wrong roll-forward operations; it also uses
  927         * meta pages, so it should be called prior to sync_meta_pages below.
 928         */
 929        if (discard_next_dnode(sbi, discard_blk))
 930                invalidate = true;
 931
 932        /* Flush all the NAT/SIT pages */
 933        while (get_pages(sbi, F2FS_DIRTY_META)) {
 934                sync_meta_pages(sbi, META, LONG_MAX);
 935                if (unlikely(f2fs_cp_error(sbi)))
 936                        return -EIO;
 937        }
 938
 939        next_free_nid(sbi, &last_nid);
 940
 941        /*
 942         * modify checkpoint
 943         * version number is already updated
 944         */
 945        ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
 946        ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
 947        ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
 948        for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
 949                ckpt->cur_node_segno[i] =
 950                        cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
 951                ckpt->cur_node_blkoff[i] =
 952                        cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE));
 953                ckpt->alloc_type[i + CURSEG_HOT_NODE] =
 954                                curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
 955        }
 956        for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
 957                ckpt->cur_data_segno[i] =
 958                        cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
 959                ckpt->cur_data_blkoff[i] =
 960                        cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA));
 961                ckpt->alloc_type[i + CURSEG_HOT_DATA] =
 962                                curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
 963        }
 964
 965        ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
 966        ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
 967        ckpt->next_free_nid = cpu_to_le32(last_nid);
 968
  969        /* 2 cp + n data seg summary + orphan inode blocks */
 970        data_sum_blocks = npages_for_summary_flush(sbi, false);
 971        if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
 972                set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
 973        else
 974                clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
 975
 976        orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num);
 977        ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
 978                        orphan_blocks);
 979
 980        if (__remain_node_summaries(cpc->reason))
 981                ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
 982                                cp_payload_blks + data_sum_blocks +
 983                                orphan_blocks + NR_CURSEG_NODE_TYPE);
 984        else
 985                ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
 986                                cp_payload_blks + data_sum_blocks +
 987                                orphan_blocks);
 988
 989        if (cpc->reason == CP_UMOUNT)
 990                set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
 991        else
 992                clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
 993
 994        if (cpc->reason == CP_FASTBOOT)
 995                set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
 996        else
 997                clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
 998
 999        if (orphan_num)
1000                set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
1001        else
1002                clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
1003
1004        if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
1005                set_ckpt_flags(ckpt, CP_FSCK_FLAG);
1006
1007        /* update SIT/NAT bitmap */
1008        get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
1009        get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
1010
1011        crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
1012        *((__le32 *)((unsigned char *)ckpt +
1013                                le32_to_cpu(ckpt->checksum_offset)))
1014                                = cpu_to_le32(crc32);
1015
1016        start_blk = __start_cp_addr(sbi);
1017
1018        /* need to wait for end_io results */
1019        wait_on_all_pages_writeback(sbi);
1020        if (unlikely(f2fs_cp_error(sbi)))
1021                return -EIO;
1022
1023        /* write out checkpoint buffer at block 0 */
1024        update_meta_page(sbi, ckpt, start_blk++);
1025
1026        for (i = 1; i < 1 + cp_payload_blks; i++)
1027                update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE,
1028                                                        start_blk++);
1029
1030        if (orphan_num) {
1031                write_orphan_inodes(sbi, start_blk);
1032                start_blk += orphan_blocks;
1033        }
1034
1035        write_data_summaries(sbi, start_blk);
1036        start_blk += data_sum_blocks;
1037        if (__remain_node_summaries(cpc->reason)) {
1038                write_node_summaries(sbi, start_blk);
1039                start_blk += NR_CURSEG_NODE_TYPE;
1040        }
1041
 1042        /* write out checkpoint block */
1043        update_meta_page(sbi, ckpt, start_blk);
1044
 1045        /* wait for previously submitted node/meta pages writeback */
1046        wait_on_all_pages_writeback(sbi);
1047
1048        if (unlikely(f2fs_cp_error(sbi)))
1049                return -EIO;
1050
1051        filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
1052        filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
1053
1054        /* update user_block_counts */
1055        sbi->last_valid_block_count = sbi->total_valid_block_count;
1056        sbi->alloc_valid_block_count = 0;
1057
 1058        /* Here, we have only one bio carrying the CP pack */
1059        sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
1060
 1061        /* wait for previously submitted meta pages writeback */
1062        wait_on_all_pages_writeback(sbi);
1063
1064        /*
 1065         * invalidate the meta page that was used temporarily for zeroing
 1066         * out the block at the end of the warm node chain.
1067         */
1068        if (invalidate)
1069                invalidate_mapping_pages(META_MAPPING(sbi), discard_blk,
1070                                                                discard_blk);
1071
1072        release_ino_entry(sbi);
1073
1074        if (unlikely(f2fs_cp_error(sbi)))
1075                return -EIO;
1076
1077        clear_prefree_segments(sbi, cpc);
1078        clear_sbi_flag(sbi, SBI_IS_DIRTY);
1079
1080        return 0;
1081}
1082
1083/*
1084 * We guarantee that this checkpoint procedure will not fail.
1085 */
1086int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1087{
1088        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
1089        unsigned long long ckpt_ver;
1090        int err = 0;
1091
1092        mutex_lock(&sbi->cp_mutex);
1093
1094        if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
1095                (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC ||
1096                (cpc->reason == CP_DISCARD && !sbi->discard_blks)))
1097                goto out;
1098        if (unlikely(f2fs_cp_error(sbi))) {
1099                err = -EIO;
1100                goto out;
1101        }
1102        if (f2fs_readonly(sbi->sb)) {
1103                err = -EROFS;
1104                goto out;
1105        }
1106
1107        trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
1108
1109        err = block_operations(sbi);
1110        if (err)
1111                goto out;
1112
1113        trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
1114
1115        f2fs_submit_merged_bio(sbi, DATA, WRITE);
1116        f2fs_submit_merged_bio(sbi, NODE, WRITE);
1117        f2fs_submit_merged_bio(sbi, META, WRITE);
1118
1119        /*
 1120         * update checkpoint pack index:
 1121         * increase the version number so that SIT entries and
 1122         * seg summaries are written to the correct place
1123         */
1124        ckpt_ver = cur_cp_version(ckpt);
1125        ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
1126
1127        /* write cached NAT/SIT entries to NAT/SIT area */
1128        flush_nat_entries(sbi);
1129        flush_sit_entries(sbi, cpc);
1130
1131        /* unlock all the fs_lock[] in do_checkpoint() */
1132        err = do_checkpoint(sbi, cpc);
1133
1134        unblock_operations(sbi);
1135        stat_inc_cp_count(sbi->stat_info);
1136
1137        if (cpc->reason == CP_RECOVERY)
1138                f2fs_msg(sbi->sb, KERN_NOTICE,
1139                        "checkpoint: version = %llx", ckpt_ver);
1140
1141        /* do checkpoint periodically */
1142        f2fs_update_time(sbi, CP_TIME);
1143        trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
1144out:
1145        mutex_unlock(&sbi->cp_mutex);
1146        return err;
1147}
1148
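     /*
      * Initialize the per-type ino entry trees and lists, and derive the
      * maximum number of orphan inodes one checkpoint pack can hold.
      */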
1149void init_ino_entry_info(struct f2fs_sb_info *sbi)
1150{
1151        int i;
1152
1153        for (i = 0; i < MAX_INO_ENTRY; i++) {
1154                struct inode_management *im = &sbi->im[i];
1155
1156                INIT_RADIX_TREE(&im->ino_root, GFP_ATOMIC);
1157                spin_lock_init(&im->ino_lock);
1158                INIT_LIST_HEAD(&im->ino_list);
1159                im->ino_num = 0;
1160        }
1161
1162        sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
1163                        NR_CURSEG_TYPE - __cp_payload(sbi)) *
1164                                F2FS_ORPHANS_PER_BLOCK;
1165}
1166
1167int __init create_checkpoint_caches(void)
1168{
1169        ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry",
1170                        sizeof(struct ino_entry));
1171        if (!ino_entry_slab)
1172                return -ENOMEM;
1173        inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry",
1174                        sizeof(struct inode_entry));
1175        if (!inode_entry_slab) {
1176                kmem_cache_destroy(ino_entry_slab);
1177                return -ENOMEM;
1178        }
1179        return 0;
1180}
1181
1182void destroy_checkpoint_caches(void)
1183{
1184        kmem_cache_destroy(ino_entry_slab);
1185        kmem_cache_destroy(inode_entry_slab);
1186}
1187