linux/fs/f2fs/node.c
   1/*
   2 * fs/f2fs/node.c
   3 *
   4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
   5 *             http://www.samsung.com/
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License version 2 as
   9 * published by the Free Software Foundation.
  10 */
  11#include <linux/fs.h>
  12#include <linux/f2fs_fs.h>
  13#include <linux/mpage.h>
  14#include <linux/backing-dev.h>
  15#include <linux/blkdev.h>
  16#include <linux/pagevec.h>
  17#include <linux/swap.h>
  18
  19#include "f2fs.h"
  20#include "node.h"
  21#include "segment.h"
  22#include "xattr.h"
  23#include "trace.h"
  24#include <trace/events/f2fs.h>
  25
   26#define on_build_free_nids(nmi) mutex_is_locked(&(nmi)->build_lock)
  27
  28static struct kmem_cache *nat_entry_slab;
  29static struct kmem_cache *free_nid_slab;
  30static struct kmem_cache *nat_entry_set_slab;
  31
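     /*
      * Check whether the in-memory footprint of the given cache type still
      * fits within its share of available low memory, based on the per-type
      * thresholds applied below.
      */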
  32bool available_free_memory(struct f2fs_sb_info *sbi, int type)
  33{
  34        struct f2fs_nm_info *nm_i = NM_I(sbi);
  35        struct sysinfo val;
  36        unsigned long avail_ram;
  37        unsigned long mem_size = 0;
  38        bool res = false;
  39
  40        si_meminfo(&val);
  41
  42        /* only uses low memory */
  43        avail_ram = val.totalram - val.totalhigh;
  44
  45        /*
   46         * give 25%, 25%, 50%, 50%, 50% of memory to each component, respectively
  47         */
  48        if (type == FREE_NIDS) {
  49                mem_size = (nm_i->nid_cnt[FREE_NID] *
  50                                sizeof(struct free_nid)) >> PAGE_SHIFT;
  51                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
  52        } else if (type == NAT_ENTRIES) {
  53                mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
  54                                                        PAGE_SHIFT;
  55                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
  56                if (excess_cached_nats(sbi))
  57                        res = false;
  58        } else if (type == DIRTY_DENTS) {
  59                if (sbi->sb->s_bdi->wb.dirty_exceeded)
  60                        return false;
  61                mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
  62                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
  63        } else if (type == INO_ENTRIES) {
  64                int i;
  65
  66                for (i = 0; i < MAX_INO_ENTRY; i++)
  67                        mem_size += sbi->im[i].ino_num *
  68                                                sizeof(struct ino_entry);
  69                mem_size >>= PAGE_SHIFT;
  70                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
  71        } else if (type == EXTENT_CACHE) {
  72                mem_size = (atomic_read(&sbi->total_ext_tree) *
  73                                sizeof(struct extent_tree) +
  74                                atomic_read(&sbi->total_ext_node) *
  75                                sizeof(struct extent_node)) >> PAGE_SHIFT;
  76                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
  77        } else if (type == INMEM_PAGES) {
   78                /* allow up to 20% of total RAM for in-memory pages */
  79                mem_size = get_pages(sbi, F2FS_INMEM_PAGES);
  80                res = mem_size < (val.totalram / 5);
  81        } else {
  82                if (!sbi->sb->s_bdi->wb.dirty_exceeded)
  83                        return true;
  84        }
  85        return res;
  86}
  87
  88static void clear_node_page_dirty(struct page *page)
  89{
  90        struct address_space *mapping = page->mapping;
   91        unsigned long flags;
  92
  93        if (PageDirty(page)) {
  94                xa_lock_irqsave(&mapping->i_pages, flags);
  95                radix_tree_tag_clear(&mapping->i_pages,
  96                                page_index(page),
  97                                PAGECACHE_TAG_DIRTY);
  98                xa_unlock_irqrestore(&mapping->i_pages, flags);
  99
 100                clear_page_dirty_for_io(page);
 101                dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
 102        }
 103        ClearPageUptodate(page);
 104}
 105
 106static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
 107{
 108        pgoff_t index = current_nat_addr(sbi, nid);
 109        return get_meta_page(sbi, index);
 110}
 111
 112static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
 113{
 114        struct page *src_page;
 115        struct page *dst_page;
 116        pgoff_t src_off;
 117        pgoff_t dst_off;
 118        void *src_addr;
 119        void *dst_addr;
 120        struct f2fs_nm_info *nm_i = NM_I(sbi);
 121
 122        src_off = current_nat_addr(sbi, nid);
 123        dst_off = next_nat_addr(sbi, src_off);
 124
 125        /* get current nat block page with lock */
 126        src_page = get_meta_page(sbi, src_off);
 127        dst_page = grab_meta_page(sbi, dst_off);
 128        f2fs_bug_on(sbi, PageDirty(src_page));
 129
 130        src_addr = page_address(src_page);
 131        dst_addr = page_address(dst_page);
 132        memcpy(dst_addr, src_addr, PAGE_SIZE);
 133        set_page_dirty(dst_page);
 134        f2fs_put_page(src_page, 1);
 135
 136        set_to_next_nat(nm_i, nid);
 137
 138        return dst_page;
 139}
 140
 141static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail)
 142{
 143        struct nat_entry *new;
 144
 145        if (no_fail)
 146                new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO);
 147        else
 148                new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO);
 149        if (new) {
 150                nat_set_nid(new, nid);
 151                nat_reset_flag(new);
 152        }
 153        return new;
 154}
 155
 156static void __free_nat_entry(struct nat_entry *e)
 157{
 158        kmem_cache_free(nat_entry_slab, e);
 159}
 160
 161/* must be locked by nat_tree_lock */
 162static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
 163        struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail)
 164{
 165        if (no_fail)
 166                f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne);
 167        else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne))
 168                return NULL;
 169
 170        if (raw_ne)
 171                node_info_from_raw_nat(&ne->ni, raw_ne);
 172        list_add_tail(&ne->list, &nm_i->nat_entries);
 173        nm_i->nat_cnt++;
 174        return ne;
 175}
 176
 177static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
 178{
 179        return radix_tree_lookup(&nm_i->nat_root, n);
 180}
 181
 182static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
 183                nid_t start, unsigned int nr, struct nat_entry **ep)
 184{
 185        return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
 186}
 187
 188static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
 189{
 190        list_del(&e->list);
 191        radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
 192        nm_i->nat_cnt--;
 193        __free_nat_entry(e);
 194}
 195
 196static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
 197                                                        struct nat_entry *ne)
 198{
 199        nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
 200        struct nat_entry_set *head;
 201
 202        head = radix_tree_lookup(&nm_i->nat_set_root, set);
 203        if (!head) {
 204                head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);
 205
 206                INIT_LIST_HEAD(&head->entry_list);
 207                INIT_LIST_HEAD(&head->set_list);
 208                head->set = set;
 209                head->entry_cnt = 0;
 210                f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
 211        }
 212        return head;
 213}
 214
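     /*
      * Mark a nat entry dirty: attach it to the nat_entry_set of its NAT
      * block (unless it still points to NEW_ADDR) and keep the per-set and
      * global dirty counters consistent.
      */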
 215static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
 216                                                struct nat_entry *ne)
 217{
 218        struct nat_entry_set *head;
 219        bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR;
 220
 221        if (!new_ne)
 222                head = __grab_nat_entry_set(nm_i, ne);
 223
 224        /*
  225         * update entry_cnt in the following cases:
  226         * 1. a NEW_ADDR entry is updated to a valid block address;
  227         * 2. an old block address is updated to a new one;
 228         */
 229        if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
 230                                !get_nat_flag(ne, IS_DIRTY)))
 231                head->entry_cnt++;
 232
 233        set_nat_flag(ne, IS_PREALLOC, new_ne);
 234
 235        if (get_nat_flag(ne, IS_DIRTY))
 236                goto refresh_list;
 237
 238        nm_i->dirty_nat_cnt++;
 239        set_nat_flag(ne, IS_DIRTY, true);
 240refresh_list:
 241        if (new_ne)
 242                list_del_init(&ne->list);
 243        else
 244                list_move_tail(&ne->list, &head->entry_list);
 245}
 246
 247static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
 248                struct nat_entry_set *set, struct nat_entry *ne)
 249{
 250        list_move_tail(&ne->list, &nm_i->nat_entries);
 251        set_nat_flag(ne, IS_DIRTY, false);
 252        set->entry_cnt--;
 253        nm_i->dirty_nat_cnt--;
 254}
 255
 256static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
 257                nid_t start, unsigned int nr, struct nat_entry_set **ep)
 258{
 259        return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
 260                                                        start, nr);
 261}
 262
 263int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
 264{
 265        struct f2fs_nm_info *nm_i = NM_I(sbi);
 266        struct nat_entry *e;
 267        bool need = false;
 268
 269        down_read(&nm_i->nat_tree_lock);
 270        e = __lookup_nat_cache(nm_i, nid);
 271        if (e) {
 272                if (!get_nat_flag(e, IS_CHECKPOINTED) &&
 273                                !get_nat_flag(e, HAS_FSYNCED_INODE))
 274                        need = true;
 275        }
 276        up_read(&nm_i->nat_tree_lock);
 277        return need;
 278}
 279
 280bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
 281{
 282        struct f2fs_nm_info *nm_i = NM_I(sbi);
 283        struct nat_entry *e;
 284        bool is_cp = true;
 285
 286        down_read(&nm_i->nat_tree_lock);
 287        e = __lookup_nat_cache(nm_i, nid);
 288        if (e && !get_nat_flag(e, IS_CHECKPOINTED))
 289                is_cp = false;
 290        up_read(&nm_i->nat_tree_lock);
 291        return is_cp;
 292}
 293
 294bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
 295{
 296        struct f2fs_nm_info *nm_i = NM_I(sbi);
 297        struct nat_entry *e;
 298        bool need_update = true;
 299
 300        down_read(&nm_i->nat_tree_lock);
 301        e = __lookup_nat_cache(nm_i, ino);
 302        if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
 303                        (get_nat_flag(e, IS_CHECKPOINTED) ||
 304                         get_nat_flag(e, HAS_FSYNCED_INODE)))
 305                need_update = false;
 306        up_read(&nm_i->nat_tree_lock);
 307        return need_update;
 308}
 309
  310/* caller must not hold nat_tree_lock; it is taken here */
 311static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
 312                                                struct f2fs_nat_entry *ne)
 313{
 314        struct f2fs_nm_info *nm_i = NM_I(sbi);
 315        struct nat_entry *new, *e;
 316
 317        new = __alloc_nat_entry(nid, false);
 318        if (!new)
 319                return;
 320
 321        down_write(&nm_i->nat_tree_lock);
 322        e = __lookup_nat_cache(nm_i, nid);
 323        if (!e)
 324                e = __init_nat_entry(nm_i, new, ne, false);
 325        else
 326                f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) ||
 327                                nat_get_blkaddr(e) !=
 328                                        le32_to_cpu(ne->block_addr) ||
 329                                nat_get_version(e) != ne->version);
 330        up_write(&nm_i->nat_tree_lock);
 331        if (e != new)
 332                __free_nat_entry(new);
 333}
 334
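     /*
      * Record a new block address for the node described by @ni in the nat
      * cache, mark the entry dirty, and refresh the fsync flags on the owning
      * inode's nat entry according to @fsync_done.
      */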
 335static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
 336                        block_t new_blkaddr, bool fsync_done)
 337{
 338        struct f2fs_nm_info *nm_i = NM_I(sbi);
 339        struct nat_entry *e;
 340        struct nat_entry *new = __alloc_nat_entry(ni->nid, true);
 341
 342        down_write(&nm_i->nat_tree_lock);
 343        e = __lookup_nat_cache(nm_i, ni->nid);
 344        if (!e) {
 345                e = __init_nat_entry(nm_i, new, NULL, true);
 346                copy_node_info(&e->ni, ni);
 347                f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
 348        } else if (new_blkaddr == NEW_ADDR) {
 349                /*
  350                 * when a nid is reallocated, the previous nat entry
  351                 * may remain in the nat cache, so reinitialize it
  352                 * with the new information.
 353                 */
 354                copy_node_info(&e->ni, ni);
 355                f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
 356        }
 357        /* let's free early to reduce memory consumption */
 358        if (e != new)
 359                __free_nat_entry(new);
 360
 361        /* sanity check */
 362        f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
 363        f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
 364                        new_blkaddr == NULL_ADDR);
 365        f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
 366                        new_blkaddr == NEW_ADDR);
 367        f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR &&
 368                        nat_get_blkaddr(e) != NULL_ADDR &&
 369                        new_blkaddr == NEW_ADDR);
 370
 371        /* increment version no as node is removed */
 372        if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
 373                unsigned char version = nat_get_version(e);
 374                nat_set_version(e, inc_node_version(version));
 375        }
 376
 377        /* change address */
 378        nat_set_blkaddr(e, new_blkaddr);
 379        if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR)
 380                set_nat_flag(e, IS_CHECKPOINTED, false);
 381        __set_nat_cache_dirty(nm_i, e);
 382
 383        /* update fsync_mark if its inode nat entry is still alive */
 384        if (ni->nid != ni->ino)
 385                e = __lookup_nat_cache(nm_i, ni->ino);
 386        if (e) {
 387                if (fsync_done && ni->nid == ni->ino)
 388                        set_nat_flag(e, HAS_FSYNCED_INODE, true);
 389                set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
 390        }
 391        up_write(&nm_i->nat_tree_lock);
 392}
 393
 394int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
 395{
 396        struct f2fs_nm_info *nm_i = NM_I(sbi);
 397        int nr = nr_shrink;
 398
 399        if (!down_write_trylock(&nm_i->nat_tree_lock))
 400                return 0;
 401
 402        while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
 403                struct nat_entry *ne;
 404                ne = list_first_entry(&nm_i->nat_entries,
 405                                        struct nat_entry, list);
 406                __del_from_nat_cache(nm_i, ne);
 407                nr_shrink--;
 408        }
 409        up_write(&nm_i->nat_tree_lock);
 410        return nr - nr_shrink;
 411}
 412
 413/*
 414 * This function always returns success
 415 */
 416void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
 417{
 418        struct f2fs_nm_info *nm_i = NM_I(sbi);
 419        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
 420        struct f2fs_journal *journal = curseg->journal;
 421        nid_t start_nid = START_NID(nid);
 422        struct f2fs_nat_block *nat_blk;
 423        struct page *page = NULL;
 424        struct f2fs_nat_entry ne;
 425        struct nat_entry *e;
 426        pgoff_t index;
 427        int i;
 428
 429        ni->nid = nid;
 430
 431        /* Check nat cache */
 432        down_read(&nm_i->nat_tree_lock);
 433        e = __lookup_nat_cache(nm_i, nid);
 434        if (e) {
 435                ni->ino = nat_get_ino(e);
 436                ni->blk_addr = nat_get_blkaddr(e);
 437                ni->version = nat_get_version(e);
 438                up_read(&nm_i->nat_tree_lock);
 439                return;
 440        }
 441
 442        memset(&ne, 0, sizeof(struct f2fs_nat_entry));
 443
 444        /* Check current segment summary */
 445        down_read(&curseg->journal_rwsem);
 446        i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
 447        if (i >= 0) {
 448                ne = nat_in_journal(journal, i);
 449                node_info_from_raw_nat(ni, &ne);
 450        }
 451        up_read(&curseg->journal_rwsem);
 452        if (i >= 0) {
 453                up_read(&nm_i->nat_tree_lock);
 454                goto cache;
 455        }
 456
 457        /* Fill node_info from nat page */
 458        index = current_nat_addr(sbi, nid);
 459        up_read(&nm_i->nat_tree_lock);
 460
 461        page = get_meta_page(sbi, index);
 462        nat_blk = (struct f2fs_nat_block *)page_address(page);
 463        ne = nat_blk->entries[nid - start_nid];
 464        node_info_from_raw_nat(ni, &ne);
 465        f2fs_put_page(page, 1);
 466cache:
 467        /* cache nat entry */
 468        cache_nat_entry(sbi, nid, &ne);
 469}
 470
 471/*
  472 * readahead up to n node pages that are siblings under the given parent.
 473 */
 474static void ra_node_pages(struct page *parent, int start, int n)
 475{
 476        struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
 477        struct blk_plug plug;
 478        int i, end;
 479        nid_t nid;
 480
 481        blk_start_plug(&plug);
 482
  483        /* try readahead for siblings of the desired node */
 484        end = start + n;
 485        end = min(end, NIDS_PER_BLOCK);
 486        for (i = start; i < end; i++) {
 487                nid = get_nid(parent, i, false);
 488                ra_node_page(sbi, nid);
 489        }
 490
 491        blk_finish_plug(&plug);
 492}
 493
 494pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
 495{
 496        const long direct_index = ADDRS_PER_INODE(dn->inode);
 497        const long direct_blks = ADDRS_PER_BLOCK;
 498        const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
 499        unsigned int skipped_unit = ADDRS_PER_BLOCK;
 500        int cur_level = dn->cur_level;
 501        int max_level = dn->max_level;
 502        pgoff_t base = 0;
 503
 504        if (!dn->max_level)
 505                return pgofs + 1;
 506
 507        while (max_level-- > cur_level)
 508                skipped_unit *= NIDS_PER_BLOCK;
 509
 510        switch (dn->max_level) {
 511        case 3:
 512                base += 2 * indirect_blks;
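                     /* fall through */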
 513        case 2:
 514                base += 2 * direct_blks;
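                     /* fall through */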
 515        case 1:
 516                base += direct_index;
 517                break;
 518        default:
 519                f2fs_bug_on(F2FS_I_SB(dn->inode), 1);
 520        }
 521
 522        return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base;
 523}
 524
 525/*
 526 * The maximum depth is four.
 527 * Offset[0] will have raw inode offset.
 528 */
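     /*
      * For example, the block index right past the in-inode direct pointers
      * (block == ADDRS_PER_INODE(inode)) resolves to level 1 with
      * offset[0] == NODE_DIR1_BLOCK and offset[1] == 0, i.e. the first slot
      * of the first direct node block.
      */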
 529static int get_node_path(struct inode *inode, long block,
 530                                int offset[4], unsigned int noffset[4])
 531{
 532        const long direct_index = ADDRS_PER_INODE(inode);
 533        const long direct_blks = ADDRS_PER_BLOCK;
 534        const long dptrs_per_blk = NIDS_PER_BLOCK;
 535        const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
 536        const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
 537        int n = 0;
 538        int level = 0;
 539
 540        noffset[0] = 0;
 541
 542        if (block < direct_index) {
 543                offset[n] = block;
 544                goto got;
 545        }
 546        block -= direct_index;
 547        if (block < direct_blks) {
 548                offset[n++] = NODE_DIR1_BLOCK;
 549                noffset[n] = 1;
 550                offset[n] = block;
 551                level = 1;
 552                goto got;
 553        }
 554        block -= direct_blks;
 555        if (block < direct_blks) {
 556                offset[n++] = NODE_DIR2_BLOCK;
 557                noffset[n] = 2;
 558                offset[n] = block;
 559                level = 1;
 560                goto got;
 561        }
 562        block -= direct_blks;
 563        if (block < indirect_blks) {
 564                offset[n++] = NODE_IND1_BLOCK;
 565                noffset[n] = 3;
 566                offset[n++] = block / direct_blks;
 567                noffset[n] = 4 + offset[n - 1];
 568                offset[n] = block % direct_blks;
 569                level = 2;
 570                goto got;
 571        }
 572        block -= indirect_blks;
 573        if (block < indirect_blks) {
 574                offset[n++] = NODE_IND2_BLOCK;
 575                noffset[n] = 4 + dptrs_per_blk;
 576                offset[n++] = block / direct_blks;
 577                noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
 578                offset[n] = block % direct_blks;
 579                level = 2;
 580                goto got;
 581        }
 582        block -= indirect_blks;
 583        if (block < dindirect_blks) {
 584                offset[n++] = NODE_DIND_BLOCK;
 585                noffset[n] = 5 + (dptrs_per_blk * 2);
 586                offset[n++] = block / indirect_blks;
 587                noffset[n] = 6 + (dptrs_per_blk * 2) +
 588                              offset[n - 1] * (dptrs_per_blk + 1);
 589                offset[n++] = (block / direct_blks) % dptrs_per_blk;
 590                noffset[n] = 7 + (dptrs_per_blk * 2) +
 591                              offset[n - 2] * (dptrs_per_blk + 1) +
 592                              offset[n - 1];
 593                offset[n] = block % direct_blks;
 594                level = 3;
 595                goto got;
 596        } else {
 597                return -E2BIG;
 598        }
 599got:
 600        return level;
 601}
 602
 603/*
 604 * Caller should call f2fs_put_dnode(dn).
  605 * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
  606 * f2fs_unlock_op() only if mode is ALLOC_NODE, i.e. when new node blocks
  607 * may be allocated. For lookup-only modes the rwsem is not needed.
 608 */
 609int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
 610{
 611        struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 612        struct page *npage[4];
 613        struct page *parent = NULL;
 614        int offset[4];
 615        unsigned int noffset[4];
 616        nid_t nids[4];
 617        int level, i = 0;
 618        int err = 0;
 619
 620        level = get_node_path(dn->inode, index, offset, noffset);
 621        if (level < 0)
 622                return level;
 623
 624        nids[0] = dn->inode->i_ino;
 625        npage[0] = dn->inode_page;
 626
 627        if (!npage[0]) {
 628                npage[0] = get_node_page(sbi, nids[0]);
 629                if (IS_ERR(npage[0]))
 630                        return PTR_ERR(npage[0]);
 631        }
 632
 633        /* if inline_data is set, should not report any block indices */
 634        if (f2fs_has_inline_data(dn->inode) && index) {
 635                err = -ENOENT;
 636                f2fs_put_page(npage[0], 1);
 637                goto release_out;
 638        }
 639
 640        parent = npage[0];
 641        if (level != 0)
 642                nids[1] = get_nid(parent, offset[0], true);
 643        dn->inode_page = npage[0];
 644        dn->inode_page_locked = true;
 645
 646        /* get indirect or direct nodes */
 647        for (i = 1; i <= level; i++) {
 648                bool done = false;
 649
 650                if (!nids[i] && mode == ALLOC_NODE) {
 651                        /* alloc new node */
 652                        if (!alloc_nid(sbi, &(nids[i]))) {
 653                                err = -ENOSPC;
 654                                goto release_pages;
 655                        }
 656
 657                        dn->nid = nids[i];
 658                        npage[i] = new_node_page(dn, noffset[i]);
 659                        if (IS_ERR(npage[i])) {
 660                                alloc_nid_failed(sbi, nids[i]);
 661                                err = PTR_ERR(npage[i]);
 662                                goto release_pages;
 663                        }
 664
 665                        set_nid(parent, offset[i - 1], nids[i], i == 1);
 666                        alloc_nid_done(sbi, nids[i]);
 667                        done = true;
 668                } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) {
 669                        npage[i] = get_node_page_ra(parent, offset[i - 1]);
 670                        if (IS_ERR(npage[i])) {
 671                                err = PTR_ERR(npage[i]);
 672                                goto release_pages;
 673                        }
 674                        done = true;
 675                }
 676                if (i == 1) {
 677                        dn->inode_page_locked = false;
 678                        unlock_page(parent);
 679                } else {
 680                        f2fs_put_page(parent, 1);
 681                }
 682
 683                if (!done) {
 684                        npage[i] = get_node_page(sbi, nids[i]);
 685                        if (IS_ERR(npage[i])) {
 686                                err = PTR_ERR(npage[i]);
 687                                f2fs_put_page(npage[0], 0);
 688                                goto release_out;
 689                        }
 690                }
 691                if (i < level) {
 692                        parent = npage[i];
 693                        nids[i + 1] = get_nid(parent, offset[i], false);
 694                }
 695        }
 696        dn->nid = nids[level];
 697        dn->ofs_in_node = offset[level];
 698        dn->node_page = npage[level];
 699        dn->data_blkaddr = datablock_addr(dn->inode,
 700                                dn->node_page, dn->ofs_in_node);
 701        return 0;
 702
 703release_pages:
 704        f2fs_put_page(parent, 1);
 705        if (i > 1)
 706                f2fs_put_page(npage[0], 0);
 707release_out:
 708        dn->inode_page = NULL;
 709        dn->node_page = NULL;
 710        if (err == -ENOENT) {
 711                dn->cur_level = i;
 712                dn->max_level = level;
 713                dn->ofs_in_node = offset[level];
 714        }
 715        return err;
 716}
 717
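     /*
      * Release the on-disk block backing the node referenced by @dn:
      * invalidate its block address, drop the node (and, for an inode node,
      * the inode) counts, and throw away the cached node page.
      */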
 718static void truncate_node(struct dnode_of_data *dn)
 719{
 720        struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 721        struct node_info ni;
 722
 723        get_node_info(sbi, dn->nid, &ni);
 724
 725        /* Deallocate node address */
 726        invalidate_blocks(sbi, ni.blk_addr);
 727        dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino);
 728        set_node_addr(sbi, &ni, NULL_ADDR, false);
 729
 730        if (dn->nid == dn->inode->i_ino) {
 731                remove_orphan_inode(sbi, dn->nid);
 732                dec_valid_inode_count(sbi);
 733                f2fs_inode_synced(dn->inode);
 734        }
 735
 736        clear_node_page_dirty(dn->node_page);
 737        set_sbi_flag(sbi, SBI_IS_DIRTY);
 738
 739        f2fs_put_page(dn->node_page, 1);
 740
 741        invalidate_mapping_pages(NODE_MAPPING(sbi),
 742                        dn->node_page->index, dn->node_page->index);
 743
 744        dn->node_page = NULL;
 745        trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
 746}
 747
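     /*
      * Truncate one direct node: free every data block it addresses and then
      * the node itself.  Returns 1 when the node was freed or did not exist,
      * or a negative error.
      */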
 748static int truncate_dnode(struct dnode_of_data *dn)
 749{
 750        struct page *page;
 751
 752        if (dn->nid == 0)
 753                return 1;
 754
 755        /* get direct node */
 756        page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
 757        if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
 758                return 1;
 759        else if (IS_ERR(page))
 760                return PTR_ERR(page);
 761
 762        /* Make dnode_of_data for parameter */
 763        dn->node_page = page;
 764        dn->ofs_in_node = 0;
 765        truncate_data_blocks(dn);
 766        truncate_node(dn);
 767        return 1;
 768}
 769
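     /*
      * Recursively truncate the children of an indirect node starting at
      * child offset @ofs, then free the node itself when the whole block is
      * covered (ofs == 0).  Returns the number of node offsets freed, or a
      * negative error.
      */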
 770static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
 771                                                int ofs, int depth)
 772{
 773        struct dnode_of_data rdn = *dn;
 774        struct page *page;
 775        struct f2fs_node *rn;
 776        nid_t child_nid;
 777        unsigned int child_nofs;
 778        int freed = 0;
 779        int i, ret;
 780
 781        if (dn->nid == 0)
 782                return NIDS_PER_BLOCK + 1;
 783
 784        trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
 785
 786        page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
 787        if (IS_ERR(page)) {
 788                trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
 789                return PTR_ERR(page);
 790        }
 791
 792        ra_node_pages(page, ofs, NIDS_PER_BLOCK);
 793
 794        rn = F2FS_NODE(page);
 795        if (depth < 3) {
 796                for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
 797                        child_nid = le32_to_cpu(rn->in.nid[i]);
 798                        if (child_nid == 0)
 799                                continue;
 800                        rdn.nid = child_nid;
 801                        ret = truncate_dnode(&rdn);
 802                        if (ret < 0)
 803                                goto out_err;
 804                        if (set_nid(page, i, 0, false))
 805                                dn->node_changed = true;
 806                }
 807        } else {
 808                child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
 809                for (i = ofs; i < NIDS_PER_BLOCK; i++) {
 810                        child_nid = le32_to_cpu(rn->in.nid[i]);
 811                        if (child_nid == 0) {
 812                                child_nofs += NIDS_PER_BLOCK + 1;
 813                                continue;
 814                        }
 815                        rdn.nid = child_nid;
 816                        ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
 817                        if (ret == (NIDS_PER_BLOCK + 1)) {
 818                                if (set_nid(page, i, 0, false))
 819                                        dn->node_changed = true;
 820                                child_nofs += ret;
 821                        } else if (ret < 0 && ret != -ENOENT) {
 822                                goto out_err;
 823                        }
 824                }
 825                freed = child_nofs;
 826        }
 827
 828        if (!ofs) {
 829                /* remove current indirect node */
 830                dn->node_page = page;
 831                truncate_node(dn);
 832                freed++;
 833        } else {
 834                f2fs_put_page(page, 1);
 835        }
 836        trace_f2fs_truncate_nodes_exit(dn->inode, freed);
 837        return freed;
 838
 839out_err:
 840        f2fs_put_page(page, 1);
 841        trace_f2fs_truncate_nodes_exit(dn->inode, ret);
 842        return ret;
 843}
 844
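     /*
      * Handle the partially truncated indirect path: free the direct nodes
      * at or beyond the truncation offset under the last indirect node, and
      * free that indirect node too once the truncation starts at its first
      * slot.
      */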
 845static int truncate_partial_nodes(struct dnode_of_data *dn,
 846                        struct f2fs_inode *ri, int *offset, int depth)
 847{
 848        struct page *pages[2];
 849        nid_t nid[3];
 850        nid_t child_nid;
 851        int err = 0;
 852        int i;
 853        int idx = depth - 2;
 854
 855        nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
 856        if (!nid[0])
 857                return 0;
 858
 859        /* get indirect nodes in the path */
 860        for (i = 0; i < idx + 1; i++) {
  861                /* the page reference count will be increased */
 862                pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]);
 863                if (IS_ERR(pages[i])) {
 864                        err = PTR_ERR(pages[i]);
 865                        idx = i - 1;
 866                        goto fail;
 867                }
 868                nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
 869        }
 870
 871        ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK);
 872
 873        /* free direct nodes linked to a partial indirect node */
 874        for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
 875                child_nid = get_nid(pages[idx], i, false);
 876                if (!child_nid)
 877                        continue;
 878                dn->nid = child_nid;
 879                err = truncate_dnode(dn);
 880                if (err < 0)
 881                        goto fail;
 882                if (set_nid(pages[idx], i, 0, false))
 883                        dn->node_changed = true;
 884        }
 885
 886        if (offset[idx + 1] == 0) {
 887                dn->node_page = pages[idx];
 888                dn->nid = nid[idx];
 889                truncate_node(dn);
 890        } else {
 891                f2fs_put_page(pages[idx], 1);
 892        }
 893        offset[idx]++;
 894        offset[idx + 1] = 0;
 895        idx--;
 896fail:
 897        for (i = idx; i >= 0; i--)
 898                f2fs_put_page(pages[i], 1);
 899
 900        trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
 901
 902        return err;
 903}
 904
 905/*
 906 * All the block addresses of data and nodes should be nullified.
 907 */
 908int truncate_inode_blocks(struct inode *inode, pgoff_t from)
 909{
 910        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 911        int err = 0, cont = 1;
 912        int level, offset[4], noffset[4];
 913        unsigned int nofs = 0;
 914        struct f2fs_inode *ri;
 915        struct dnode_of_data dn;
 916        struct page *page;
 917
 918        trace_f2fs_truncate_inode_blocks_enter(inode, from);
 919
 920        level = get_node_path(inode, from, offset, noffset);
 921        if (level < 0)
 922                return level;
 923
 924        page = get_node_page(sbi, inode->i_ino);
 925        if (IS_ERR(page)) {
 926                trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
 927                return PTR_ERR(page);
 928        }
 929
 930        set_new_dnode(&dn, inode, page, NULL, 0);
 931        unlock_page(page);
 932
 933        ri = F2FS_INODE(page);
 934        switch (level) {
 935        case 0:
 936        case 1:
 937                nofs = noffset[1];
 938                break;
 939        case 2:
 940                nofs = noffset[1];
 941                if (!offset[level - 1])
 942                        goto skip_partial;
 943                err = truncate_partial_nodes(&dn, ri, offset, level);
 944                if (err < 0 && err != -ENOENT)
 945                        goto fail;
 946                nofs += 1 + NIDS_PER_BLOCK;
 947                break;
 948        case 3:
 949                nofs = 5 + 2 * NIDS_PER_BLOCK;
 950                if (!offset[level - 1])
 951                        goto skip_partial;
 952                err = truncate_partial_nodes(&dn, ri, offset, level);
 953                if (err < 0 && err != -ENOENT)
 954                        goto fail;
 955                break;
 956        default:
 957                BUG();
 958        }
 959
 960skip_partial:
 961        while (cont) {
 962                dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
 963                switch (offset[0]) {
 964                case NODE_DIR1_BLOCK:
 965                case NODE_DIR2_BLOCK:
 966                        err = truncate_dnode(&dn);
 967                        break;
 968
 969                case NODE_IND1_BLOCK:
 970                case NODE_IND2_BLOCK:
 971                        err = truncate_nodes(&dn, nofs, offset[1], 2);
 972                        break;
 973
 974                case NODE_DIND_BLOCK:
 975                        err = truncate_nodes(&dn, nofs, offset[1], 3);
 976                        cont = 0;
 977                        break;
 978
 979                default:
 980                        BUG();
 981                }
 982                if (err < 0 && err != -ENOENT)
 983                        goto fail;
 984                if (offset[1] == 0 &&
 985                                ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
 986                        lock_page(page);
 987                        BUG_ON(page->mapping != NODE_MAPPING(sbi));
 988                        f2fs_wait_on_page_writeback(page, NODE, true);
 989                        ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
 990                        set_page_dirty(page);
 991                        unlock_page(page);
 992                }
 993                offset[1] = 0;
 994                offset[0]++;
 995                nofs += err;
 996        }
 997fail:
 998        f2fs_put_page(page, 0);
 999        trace_f2fs_truncate_inode_blocks_exit(inode, err);
1000        return err > 0 ? 0 : err;
1001}
1002
1003/* caller must lock inode page */
1004int truncate_xattr_node(struct inode *inode)
1005{
1006        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
1007        nid_t nid = F2FS_I(inode)->i_xattr_nid;
1008        struct dnode_of_data dn;
1009        struct page *npage;
1010
1011        if (!nid)
1012                return 0;
1013
1014        npage = get_node_page(sbi, nid);
1015        if (IS_ERR(npage))
1016                return PTR_ERR(npage);
1017
1018        f2fs_i_xnid_write(inode, 0);
1019
1020        set_new_dnode(&dn, inode, NULL, npage, nid);
1021        truncate_node(&dn);
1022        return 0;
1023}
1024
1025/*
1026 * Caller should grab and release a rwsem by calling f2fs_lock_op() and
1027 * f2fs_unlock_op().
1028 */
1029int remove_inode_page(struct inode *inode)
1030{
1031        struct dnode_of_data dn;
1032        int err;
1033
1034        set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
1035        err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
1036        if (err)
1037                return err;
1038
1039        err = truncate_xattr_node(inode);
1040        if (err) {
1041                f2fs_put_dnode(&dn);
1042                return err;
1043        }
1044
1045        /* remove potential inline_data blocks */
1046        if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1047                                S_ISLNK(inode->i_mode))
1048                truncate_data_blocks_range(&dn, 1);
1049
1050        /* 0 is possible, after f2fs_new_inode() has failed */
1051        f2fs_bug_on(F2FS_I_SB(inode),
1052                        inode->i_blocks != 0 && inode->i_blocks != 8);
1053
1054        /* will put inode & node pages */
1055        truncate_node(&dn);
1056        return 0;
1057}
1058
1059struct page *new_inode_page(struct inode *inode)
1060{
1061        struct dnode_of_data dn;
1062
1063        /* allocate inode page for new inode */
1064        set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
1065
1066        /* caller should f2fs_put_page(page, 1); */
1067        return new_node_page(&dn, 0);
1068}
1069
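     /*
      * Allocate a new node page for dn->nid at node offset @ofs, charge it
      * to the inode's node count and mark it dirty.  The caller must release
      * the returned locked page with f2fs_put_page(page, 1).
      */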
1070struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
1071{
1072        struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
1073        struct node_info new_ni;
1074        struct page *page;
1075        int err;
1076
1077        if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
1078                return ERR_PTR(-EPERM);
1079
1080        page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false);
1081        if (!page)
1082                return ERR_PTR(-ENOMEM);
1083
1084        if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs))))
1085                goto fail;
1086
1087#ifdef CONFIG_F2FS_CHECK_FS
1088        get_node_info(sbi, dn->nid, &new_ni);
1089        f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR);
1090#endif
1091        new_ni.nid = dn->nid;
1092        new_ni.ino = dn->inode->i_ino;
1093        new_ni.blk_addr = NULL_ADDR;
1094        new_ni.flag = 0;
1095        new_ni.version = 0;
1096        set_node_addr(sbi, &new_ni, NEW_ADDR, false);
1097
1098        f2fs_wait_on_page_writeback(page, NODE, true);
1099        fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
1100        set_cold_node(page, S_ISDIR(dn->inode->i_mode));
1101        if (!PageUptodate(page))
1102                SetPageUptodate(page);
1103        if (set_page_dirty(page))
1104                dn->node_changed = true;
1105
1106        if (f2fs_has_xattr_block(ofs))
1107                f2fs_i_xnid_write(dn->inode, dn->nid);
1108
1109        if (ofs == 0)
1110                inc_valid_inode_count(sbi);
1111        return page;
1112
1113fail:
1114        clear_node_page_dirty(page);
1115        f2fs_put_page(page, 1);
1116        return ERR_PTR(err);
1117}
1118
1119/*
 1120 * The caller should act on the return value as follows:
1121 * 0: f2fs_put_page(page, 0)
1122 * LOCKED_PAGE or error: f2fs_put_page(page, 1)
1123 */
1124static int read_node_page(struct page *page, int op_flags)
1125{
1126        struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1127        struct node_info ni;
1128        struct f2fs_io_info fio = {
1129                .sbi = sbi,
1130                .type = NODE,
1131                .op = REQ_OP_READ,
1132                .op_flags = op_flags,
1133                .page = page,
1134                .encrypted_page = NULL,
1135        };
1136
1137        if (PageUptodate(page))
1138                return LOCKED_PAGE;
1139
1140        get_node_info(sbi, page->index, &ni);
1141
1142        if (unlikely(ni.blk_addr == NULL_ADDR)) {
1143                ClearPageUptodate(page);
1144                return -ENOENT;
1145        }
1146
1147        fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr;
1148        return f2fs_submit_page_bio(&fio);
1149}
1150
1151/*
1152 * Readahead a node page
1153 */
1154void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
1155{
1156        struct page *apage;
1157        int err;
1158
1159        if (!nid)
1160                return;
1161        f2fs_bug_on(sbi, check_nid_range(sbi, nid));
1162
1163        rcu_read_lock();
1164        apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid);
1165        rcu_read_unlock();
1166        if (apage)
1167                return;
1168
1169        apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
1170        if (!apage)
1171                return;
1172
1173        err = read_node_page(apage, REQ_RAHEAD);
1174        f2fs_put_page(apage, err ? 1 : 0);
1175}
1176
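     /*
      * Read the node page for @nid, optionally issuing readahead for its
      * siblings under @parent, and verify that the page footer really
      * belongs to @nid.  Returns a locked page or an ERR_PTR.
      */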
1177static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
1178                                        struct page *parent, int start)
1179{
1180        struct page *page;
1181        int err;
1182
1183        if (!nid)
1184                return ERR_PTR(-ENOENT);
1185        f2fs_bug_on(sbi, check_nid_range(sbi, nid));
1186repeat:
1187        page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
1188        if (!page)
1189                return ERR_PTR(-ENOMEM);
1190
1191        err = read_node_page(page, 0);
1192        if (err < 0) {
1193                f2fs_put_page(page, 1);
1194                return ERR_PTR(err);
1195        } else if (err == LOCKED_PAGE) {
1196                err = 0;
1197                goto page_hit;
1198        }
1199
1200        if (parent)
1201                ra_node_pages(parent, start + 1, MAX_RA_NODE);
1202
1203        lock_page(page);
1204
1205        if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1206                f2fs_put_page(page, 1);
1207                goto repeat;
1208        }
1209
1210        if (unlikely(!PageUptodate(page))) {
1211                err = -EIO;
1212                goto out_err;
1213        }
1214
1215        if (!f2fs_inode_chksum_verify(sbi, page)) {
1216                err = -EBADMSG;
1217                goto out_err;
1218        }
1219page_hit:
 1220        if (unlikely(nid != nid_of_node(page))) {
1221                f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, "
1222                        "nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
1223                        nid, nid_of_node(page), ino_of_node(page),
1224                        ofs_of_node(page), cpver_of_node(page),
1225                        next_blkaddr_of_node(page));
1226                err = -EINVAL;
1227out_err:
1228                ClearPageUptodate(page);
1229                f2fs_put_page(page, 1);
1230                return ERR_PTR(err);
1231        }
1232        return page;
1233}
1234
1235struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
1236{
1237        return __get_node_page(sbi, nid, NULL, 0);
1238}
1239
1240struct page *get_node_page_ra(struct page *parent, int start)
1241{
1242        struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
1243        nid_t nid = get_nid(parent, start, false);
1244
1245        return __get_node_page(sbi, nid, parent, start);
1246}
1247
1248static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
1249{
1250        struct inode *inode;
1251        struct page *page;
1252        int ret;
1253
1254        /* should flush inline_data before evict_inode */
1255        inode = ilookup(sbi->sb, ino);
1256        if (!inode)
1257                return;
1258
1259        page = f2fs_pagecache_get_page(inode->i_mapping, 0,
1260                                        FGP_LOCK|FGP_NOWAIT, 0);
1261        if (!page)
1262                goto iput_out;
1263
1264        if (!PageUptodate(page))
1265                goto page_out;
1266
1267        if (!PageDirty(page))
1268                goto page_out;
1269
1270        if (!clear_page_dirty_for_io(page))
1271                goto page_out;
1272
1273        ret = f2fs_write_inline_data(inode, page);
1274        inode_dec_dirty_pages(inode);
1275        remove_dirty_inode(inode);
1276        if (ret)
1277                set_page_dirty(page);
1278page_out:
1279        f2fs_put_page(page, 1);
1280iput_out:
1281        iput(inode);
1282}
1283
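     /*
      * Find the last dirty direct node page belonging to @ino and return it
      * with an extra reference; it marks the end of the range that an atomic
      * fsync has to write.
      */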
1284static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
1285{
1286        pgoff_t index;
1287        struct pagevec pvec;
1288        struct page *last_page = NULL;
1289        int nr_pages;
1290
1291        pagevec_init(&pvec);
1292        index = 0;
1293
1294        while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1295                                PAGECACHE_TAG_DIRTY))) {
1296                int i;
1297
1298                for (i = 0; i < nr_pages; i++) {
1299                        struct page *page = pvec.pages[i];
1300
1301                        if (unlikely(f2fs_cp_error(sbi))) {
1302                                f2fs_put_page(last_page, 0);
1303                                pagevec_release(&pvec);
1304                                return ERR_PTR(-EIO);
1305                        }
1306
1307                        if (!IS_DNODE(page) || !is_cold_node(page))
1308                                continue;
1309                        if (ino_of_node(page) != ino)
1310                                continue;
1311
1312                        lock_page(page);
1313
1314                        if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1315continue_unlock:
1316                                unlock_page(page);
1317                                continue;
1318                        }
1319                        if (ino_of_node(page) != ino)
1320                                goto continue_unlock;
1321
1322                        if (!PageDirty(page)) {
1323                                /* someone wrote it for us */
1324                                goto continue_unlock;
1325                        }
1326
1327                        if (last_page)
1328                                f2fs_put_page(last_page, 0);
1329
1330                        get_page(page);
1331                        last_page = page;
1332                        unlock_page(page);
1333                }
1334                pagevec_release(&pvec);
1335                cond_resched();
1336        }
1337        return last_page;
1338}
1339
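     /*
      * Write back a single dirty node page.  The page is dropped without I/O
      * if a checkpoint error is pending or the node has been truncated;
      * otherwise it is submitted and its new block address is recorded in
      * the nat cache.
      */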
1340static int __write_node_page(struct page *page, bool atomic, bool *submitted,
1341                                struct writeback_control *wbc, bool do_balance,
1342                                enum iostat_type io_type)
1343{
1344        struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1345        nid_t nid;
1346        struct node_info ni;
1347        struct f2fs_io_info fio = {
1348                .sbi = sbi,
1349                .ino = ino_of_node(page),
1350                .type = NODE,
1351                .op = REQ_OP_WRITE,
1352                .op_flags = wbc_to_write_flags(wbc),
1353                .page = page,
1354                .encrypted_page = NULL,
1355                .submitted = false,
1356                .io_type = io_type,
1357                .io_wbc = wbc,
1358        };
1359
1360        trace_f2fs_writepage(page, NODE);
1361
1362        if (unlikely(f2fs_cp_error(sbi))) {
1363                dec_page_count(sbi, F2FS_DIRTY_NODES);
1364                unlock_page(page);
1365                return 0;
1366        }
1367
1368        if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
1369                goto redirty_out;
1370
1371        /* get old block addr of this node page */
1372        nid = nid_of_node(page);
1373        f2fs_bug_on(sbi, page->index != nid);
1374
1375        if (wbc->for_reclaim) {
1376                if (!down_read_trylock(&sbi->node_write))
1377                        goto redirty_out;
1378        } else {
1379                down_read(&sbi->node_write);
1380        }
1381
1382        get_node_info(sbi, nid, &ni);
1383
1384        /* This page is already truncated */
1385        if (unlikely(ni.blk_addr == NULL_ADDR)) {
1386                ClearPageUptodate(page);
1387                dec_page_count(sbi, F2FS_DIRTY_NODES);
1388                up_read(&sbi->node_write);
1389                unlock_page(page);
1390                return 0;
1391        }
1392
1393        if (atomic && !test_opt(sbi, NOBARRIER))
1394                fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
1395
1396        set_page_writeback(page);
1397        fio.old_blkaddr = ni.blk_addr;
1398        write_node_page(nid, &fio);
1399        set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
1400        dec_page_count(sbi, F2FS_DIRTY_NODES);
1401        up_read(&sbi->node_write);
1402
1403        if (wbc->for_reclaim) {
1404                f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0,
1405                                                page->index, NODE);
1406                submitted = NULL;
1407        }
1408
1409        unlock_page(page);
1410
1411        if (unlikely(f2fs_cp_error(sbi))) {
1412                f2fs_submit_merged_write(sbi, NODE);
1413                submitted = NULL;
1414        }
1415        if (submitted)
1416                *submitted = fio.submitted;
1417
1418        if (do_balance)
1419                f2fs_balance_fs(sbi, false);
1420        return 0;
1421
1422redirty_out:
1423        redirty_page_for_writepage(wbc, page);
1424        return AOP_WRITEPAGE_ACTIVATE;
1425}
1426
1427void move_node_page(struct page *node_page, int gc_type)
1428{
1429        if (gc_type == FG_GC) {
1430                struct writeback_control wbc = {
1431                        .sync_mode = WB_SYNC_ALL,
1432                        .nr_to_write = 1,
1433                        .for_reclaim = 0,
1434                };
1435
1436                set_page_dirty(node_page);
1437                f2fs_wait_on_page_writeback(node_page, NODE, true);
1438
1439                f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page));
1440                if (!clear_page_dirty_for_io(node_page))
1441                        goto out_page;
1442
1443                if (__write_node_page(node_page, false, NULL,
1444                                        &wbc, false, FS_GC_NODE_IO))
1445                        unlock_page(node_page);
1446                goto release_page;
1447        } else {
1448                /* set page dirty and write it */
1449                if (!PageWriteback(node_page))
1450                        set_page_dirty(node_page);
1451        }
1452out_page:
1453        unlock_page(node_page);
1454release_page:
1455        f2fs_put_page(node_page, 0);
1456}
1457
1458static int f2fs_write_node_page(struct page *page,
1459                                struct writeback_control *wbc)
1460{
1461        return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO);
1462}
1463
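     /*
      * Write back the dirty node pages of @inode for fsync.  In atomic mode
      * only the last dirty dnode carries the fsync mark, and the scan is
      * retried until that marked page has actually been written out.
      */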
1464int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
1465                        struct writeback_control *wbc, bool atomic)
1466{
1467        pgoff_t index;
1468        pgoff_t last_idx = ULONG_MAX;
1469        struct pagevec pvec;
1470        int ret = 0;
1471        struct page *last_page = NULL;
1472        bool marked = false;
1473        nid_t ino = inode->i_ino;
1474        int nr_pages;
1475
1476        if (atomic) {
1477                last_page = last_fsync_dnode(sbi, ino);
1478                if (IS_ERR_OR_NULL(last_page))
1479                        return PTR_ERR_OR_ZERO(last_page);
1480        }
1481retry:
1482        pagevec_init(&pvec);
1483        index = 0;
1484
1485        while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1486                                PAGECACHE_TAG_DIRTY))) {
1487                int i;
1488
1489                for (i = 0; i < nr_pages; i++) {
1490                        struct page *page = pvec.pages[i];
1491                        bool submitted = false;
1492
1493                        if (unlikely(f2fs_cp_error(sbi))) {
1494                                f2fs_put_page(last_page, 0);
1495                                pagevec_release(&pvec);
1496                                ret = -EIO;
1497                                goto out;
1498                        }
1499
1500                        if (!IS_DNODE(page) || !is_cold_node(page))
1501                                continue;
1502                        if (ino_of_node(page) != ino)
1503                                continue;
1504
1505                        lock_page(page);
1506
1507                        if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1508continue_unlock:
1509                                unlock_page(page);
1510                                continue;
1511                        }
1512                        if (ino_of_node(page) != ino)
1513                                goto continue_unlock;
1514
1515                        if (!PageDirty(page) && page != last_page) {
1516                                /* someone wrote it for us */
1517                                goto continue_unlock;
1518                        }
1519
1520                        f2fs_wait_on_page_writeback(page, NODE, true);
1521                        BUG_ON(PageWriteback(page));
1522
1523                        set_fsync_mark(page, 0);
1524                        set_dentry_mark(page, 0);
1525
1526                        if (!atomic || page == last_page) {
1527                                set_fsync_mark(page, 1);
1528                                if (IS_INODE(page)) {
1529                                        if (is_inode_flag_set(inode,
1530                                                                FI_DIRTY_INODE))
1531                                                update_inode(inode, page);
1532                                        set_dentry_mark(page,
1533                                                need_dentry_mark(sbi, ino));
1534                                }
 1535                                /* may be written by another thread */
1536                                if (!PageDirty(page))
1537                                        set_page_dirty(page);
1538                        }
1539
1540                        if (!clear_page_dirty_for_io(page))
1541                                goto continue_unlock;
1542
1543                        ret = __write_node_page(page, atomic &&
1544                                                page == last_page,
1545                                                &submitted, wbc, true,
1546                                                FS_NODE_IO);
1547                        if (ret) {
1548                                unlock_page(page);
1549                                f2fs_put_page(last_page, 0);
1550                                break;
1551                        } else if (submitted) {
1552                                last_idx = page->index;
1553                        }
1554
1555                        if (page == last_page) {
1556                                f2fs_put_page(page, 0);
1557                                marked = true;
1558                                break;
1559                        }
1560                }
1561                pagevec_release(&pvec);
1562                cond_resched();
1563
1564                if (ret || marked)
1565                        break;
1566        }
1567        if (!ret && atomic && !marked) {
1568                f2fs_msg(sbi->sb, KERN_DEBUG,
1569                        "Retry to write fsync mark: ino=%u, idx=%lx",
1570                                        ino, last_page->index);
1571                lock_page(last_page);
1572                f2fs_wait_on_page_writeback(last_page, NODE, true);
1573                set_page_dirty(last_page);
1574                unlock_page(last_page);
1575                goto retry;
1576        }
1577out:
1578        if (last_idx != ULONG_MAX)
1579                f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE);
1580        return ret ? -EIO : 0;
1581}
1582
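    /*
     * Flush dirty node pages in three passes: indirect nodes first, then
     * dentry dnodes, then file dnodes.  Stops once wbc->nr_to_write is
     * exhausted and submits the merged node writes at the end.
     */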
1583int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc,
1584                                bool do_balance, enum iostat_type io_type)
1585{
1586        pgoff_t index;
1587        struct pagevec pvec;
1588        int step = 0;
1589        int nwritten = 0;
1590        int ret = 0;
1591        int nr_pages;
1592
1593        pagevec_init(&pvec);
1594
1595next_step:
1596        index = 0;
1597
1598        while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1599                                PAGECACHE_TAG_DIRTY))) {
1600                int i;
1601
1602                for (i = 0; i < nr_pages; i++) {
1603                        struct page *page = pvec.pages[i];
1604                        bool submitted = false;
1605
1606                        /*
1607                         * flushing sequence with step:
1608                         * 0. indirect nodes
1609                         * 1. dentry dnodes
1610                         * 2. file dnodes
1611                         */
1612                        if (step == 0 && IS_DNODE(page))
1613                                continue;
1614                        if (step == 1 && (!IS_DNODE(page) ||
1615                                                is_cold_node(page)))
1616                                continue;
1617                        if (step == 2 && (!IS_DNODE(page) ||
1618                                                !is_cold_node(page)))
1619                                continue;
1620lock_node:
1621                        if (!trylock_page(page))
1622                                continue;
1623
1624                        if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1625continue_unlock:
1626                                unlock_page(page);
1627                                continue;
1628                        }
1629
1630                        if (!PageDirty(page)) {
1631                                /* someone wrote it for us */
1632                                goto continue_unlock;
1633                        }
1634
1635                        /* flush inline_data */
1636                        if (is_inline_node(page)) {
1637                                clear_inline_node(page);
1638                                unlock_page(page);
1639                                flush_inline_data(sbi, ino_of_node(page));
1640                                goto lock_node;
1641                        }
1642
1643                        f2fs_wait_on_page_writeback(page, NODE, true);
1644
1645                        BUG_ON(PageWriteback(page));
1646                        if (!clear_page_dirty_for_io(page))
1647                                goto continue_unlock;
1648
1649                        set_fsync_mark(page, 0);
1650                        set_dentry_mark(page, 0);
1651
1652                        ret = __write_node_page(page, false, &submitted,
1653                                                wbc, do_balance, io_type);
1654                        if (ret)
1655                                unlock_page(page);
1656                        else if (submitted)
1657                                nwritten++;
1658
1659                        if (--wbc->nr_to_write == 0)
1660                                break;
1661                }
1662                pagevec_release(&pvec);
1663                cond_resched();
1664
1665                if (wbc->nr_to_write == 0) {
1666                        step = 2;
1667                        break;
1668                }
1669        }
1670
1671        if (step < 2) {
1672                step++;
1673                goto next_step;
1674        }
1675
1676        if (nwritten)
1677                f2fs_submit_merged_write(sbi, NODE);
1678
1679        if (unlikely(f2fs_cp_error(sbi)))
1680                return -EIO;
1681        return ret;
1682}
1683
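    /*
     * Wait for writeback to finish on every node page that belongs to @ino
     * and report any write error seen on those pages or on the node mapping.
     */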
1684int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1685{
1686        pgoff_t index = 0;
1687        struct pagevec pvec;
1688        int ret2, ret = 0;
1689        int nr_pages;
1690
1691        pagevec_init(&pvec);
1692
1693        while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1694                                PAGECACHE_TAG_WRITEBACK))) {
1695                int i;
1696
1697                for (i = 0; i < nr_pages; i++) {
1698                        struct page *page = pvec.pages[i];
1699
1700                        if (ino && ino_of_node(page) == ino) {
1701                                f2fs_wait_on_page_writeback(page, NODE, true);
1702                                if (TestClearPageError(page))
1703                                        ret = -EIO;
1704                        }
1705                }
1706                pagevec_release(&pvec);
1707                cond_resched();
1708        }
1709
1710        ret2 = filemap_check_errors(NODE_MAPPING(sbi));
1711        if (!ret)
1712                ret = ret2;
1713        return ret;
1714}
1715
1716static int f2fs_write_node_pages(struct address_space *mapping,
1717                            struct writeback_control *wbc)
1718{
1719        struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
1720        struct blk_plug plug;
1721        long diff;
1722
1723        if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
1724                goto skip_write;
1725
1726        /* balancing f2fs's metadata in background */
1727        f2fs_balance_fs_bg(sbi);
1728
1729        /* collect a number of dirty node pages and write together */
1730        if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
1731                goto skip_write;
1732
1733        trace_f2fs_writepages(mapping->host, wbc, NODE);
1734
1735        diff = nr_pages_to_write(sbi, NODE, wbc);
1736        wbc->sync_mode = WB_SYNC_NONE;
1737        blk_start_plug(&plug);
1738        sync_node_pages(sbi, wbc, true, FS_NODE_IO);
1739        blk_finish_plug(&plug);
1740        wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
1741        return 0;
1742
1743skip_write:
1744        wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
1745        trace_f2fs_writepages(mapping->host, wbc, NODE);
1746        return 0;
1747}
1748
1749static int f2fs_set_node_page_dirty(struct page *page)
1750{
1751        trace_f2fs_set_page_dirty(page, NODE);
1752
1753        if (!PageUptodate(page))
1754                SetPageUptodate(page);
1755        if (!PageDirty(page)) {
1756                f2fs_set_page_dirty_nobuffers(page);
1757                inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
1758                SetPagePrivate(page);
1759                f2fs_trace_pid(page);
1760                return 1;
1761        }
1762        return 0;
1763}
1764
1765/*
1766 * Structure of the f2fs node operations
1767 */
1768const struct address_space_operations f2fs_node_aops = {
1769        .writepage      = f2fs_write_node_page,
1770        .writepages     = f2fs_write_node_pages,
1771        .set_page_dirty = f2fs_set_node_page_dirty,
1772        .invalidatepage = f2fs_invalidate_page,
1773        .releasepage    = f2fs_release_page,
1774#ifdef CONFIG_MIGRATION
1775        .migratepage    = f2fs_migrate_page,
1776#endif
1777};
1778
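    /*
     * The free nid helpers below operate on nm_i->free_nid_root and
     * nm_i->free_nid_list; callers must hold nm_i->nid_list_lock.
     */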
1779static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
1780                                                nid_t n)
1781{
1782        return radix_tree_lookup(&nm_i->free_nid_root, n);
1783}
1784
1785static int __insert_free_nid(struct f2fs_sb_info *sbi,
1786                        struct free_nid *i, enum nid_state state)
1787{
1788        struct f2fs_nm_info *nm_i = NM_I(sbi);
1789
1790        int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
1791        if (err)
1792                return err;
1793
1794        f2fs_bug_on(sbi, state != i->state);
1795        nm_i->nid_cnt[state]++;
1796        if (state == FREE_NID)
1797                list_add_tail(&i->list, &nm_i->free_nid_list);
1798        return 0;
1799}
1800
1801static void __remove_free_nid(struct f2fs_sb_info *sbi,
1802                        struct free_nid *i, enum nid_state state)
1803{
1804        struct f2fs_nm_info *nm_i = NM_I(sbi);
1805
1806        f2fs_bug_on(sbi, state != i->state);
1807        nm_i->nid_cnt[state]--;
1808        if (state == FREE_NID)
1809                list_del(&i->list);
1810        radix_tree_delete(&nm_i->free_nid_root, i->nid);
1811}
1812
1813static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i,
1814                        enum nid_state org_state, enum nid_state dst_state)
1815{
1816        struct f2fs_nm_info *nm_i = NM_I(sbi);
1817
1818        f2fs_bug_on(sbi, org_state != i->state);
1819        i->state = dst_state;
1820        nm_i->nid_cnt[org_state]--;
1821        nm_i->nid_cnt[dst_state]++;
1822
1823        switch (dst_state) {
1824        case PREALLOC_NID:
1825                list_del(&i->list);
1826                break;
1827        case FREE_NID:
1828                list_add_tail(&i->list, &nm_i->free_nid_list);
1829                break;
1830        default:
1831                BUG_ON(1);
1832        }
1833}
1834
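    /*
     * Set or clear the free bit of @nid in the per-NAT-block free nid bitmap
     * and adjust free_nid_count.  Called with nm_i->nid_list_lock held; it is
     * a no-op for NAT blocks not yet marked in nat_block_bitmap.
     */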
1835static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
1836                                                        bool set, bool build)
1837{
1838        struct f2fs_nm_info *nm_i = NM_I(sbi);
1839        unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid);
1840        unsigned int nid_ofs = nid - START_NID(nid);
1841
1842        if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap))
1843                return;
1844
1845        if (set) {
1846                if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
1847                        return;
1848                __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
1849                nm_i->free_nid_count[nat_ofs]++;
1850        } else {
1851                if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
1852                        return;
1853                __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
1854                if (!build)
1855                        nm_i->free_nid_count[nat_ofs]--;
1856        }
1857}
1858
1859/* return true if the nid is recognized as free */
1860static bool add_free_nid(struct f2fs_sb_info *sbi,
1861                                nid_t nid, bool build, bool update)
1862{
1863        struct f2fs_nm_info *nm_i = NM_I(sbi);
1864        struct free_nid *i, *e;
1865        struct nat_entry *ne;
1866        int err = -EINVAL;
1867        bool ret = false;
1868
1869        /* 0 nid should not be used */
1870        if (unlikely(nid == 0))
1871                return false;
1872
1873        i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
1874        i->nid = nid;
1875        i->state = FREE_NID;
1876
1877        radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
1878
1879        spin_lock(&nm_i->nid_list_lock);
1880
1881        if (build) {
1882                /*
1883                 *   Thread A             Thread B
1884                 *  - f2fs_create
1885                 *   - f2fs_new_inode
1886                 *    - alloc_nid
1887                 *     - __insert_nid_to_list(PREALLOC_NID)
1888                 *                     - f2fs_balance_fs_bg
1889                 *                      - build_free_nids
1890                 *                       - __build_free_nids
1891                 *                        - scan_nat_page
1892                 *                         - add_free_nid
1893                 *                          - __lookup_nat_cache
1894                 *  - f2fs_add_link
1895                 *   - init_inode_metadata
1896                 *    - new_inode_page
1897                 *     - new_node_page
1898                 *      - set_node_addr
1899                 *  - alloc_nid_done
1900                 *   - __remove_nid_from_list(PREALLOC_NID)
1901                 *                         - __insert_nid_to_list(FREE_NID)
1902                 */
1903                ne = __lookup_nat_cache(nm_i, nid);
1904                if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
1905                                nat_get_blkaddr(ne) != NULL_ADDR))
1906                        goto err_out;
1907
1908                e = __lookup_free_nid_list(nm_i, nid);
1909                if (e) {
1910                        if (e->state == FREE_NID)
1911                                ret = true;
1912                        goto err_out;
1913                }
1914        }
1915        ret = true;
1916        err = __insert_free_nid(sbi, i, FREE_NID);
1917err_out:
1918        if (update) {
1919                update_free_nid_bitmap(sbi, nid, ret, build);
1920                if (!build)
1921                        nm_i->available_nids++;
1922        }
1923        spin_unlock(&nm_i->nid_list_lock);
1924        radix_tree_preload_end();
1925
1926        if (err)
1927                kmem_cache_free(free_nid_slab, i);
1928        return ret;
1929}
1930
1931static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
1932{
1933        struct f2fs_nm_info *nm_i = NM_I(sbi);
1934        struct free_nid *i;
1935        bool need_free = false;
1936
1937        spin_lock(&nm_i->nid_list_lock);
1938        i = __lookup_free_nid_list(nm_i, nid);
1939        if (i && i->state == FREE_NID) {
1940                __remove_free_nid(sbi, i, FREE_NID);
1941                need_free = true;
1942        }
1943        spin_unlock(&nm_i->nid_list_lock);
1944
1945        if (need_free)
1946                kmem_cache_free(free_nid_slab, i);
1947}
1948
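    /*
     * Scan one NAT block page: mark the block as scanned in nat_block_bitmap
     * and register every nid whose block address is NULL_ADDR as a free nid
     * candidate.
     */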
1949static void scan_nat_page(struct f2fs_sb_info *sbi,
1950                        struct page *nat_page, nid_t start_nid)
1951{
1952        struct f2fs_nm_info *nm_i = NM_I(sbi);
1953        struct f2fs_nat_block *nat_blk = page_address(nat_page);
1954        block_t blk_addr;
1955        unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid);
1956        int i;
1957
1958        __set_bit_le(nat_ofs, nm_i->nat_block_bitmap);
1959
1960        i = start_nid % NAT_ENTRY_PER_BLOCK;
1961
1962        for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
1963                if (unlikely(start_nid >= nm_i->max_nid))
1964                        break;
1965
1966                blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
1967                f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
1968                if (blk_addr == NULL_ADDR) {
1969                        add_free_nid(sbi, start_nid, true, true);
1970                } else {
1971                        spin_lock(&NM_I(sbi)->nid_list_lock);
1972                        update_free_nid_bitmap(sbi, start_nid, false, true);
1973                        spin_unlock(&NM_I(sbi)->nid_list_lock);
1974                }
1975        }
1976}
1977
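    /*
     * Scan NAT entries cached in the current hot data summary journal: nids
     * with a NULL block address become free nid candidates, the others are
     * dropped from the free nid list.
     */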
1978static void scan_curseg_cache(struct f2fs_sb_info *sbi)
1979{
1980        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1981        struct f2fs_journal *journal = curseg->journal;
1982        int i;
1983
1984        down_read(&curseg->journal_rwsem);
1985        for (i = 0; i < nats_in_cursum(journal); i++) {
1986                block_t addr;
1987                nid_t nid;
1988
1989                addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
1990                nid = le32_to_cpu(nid_in_journal(journal, i));
1991                if (addr == NULL_ADDR)
1992                        add_free_nid(sbi, nid, true, false);
1993                else
1994                        remove_free_nid(sbi, nid);
1995        }
1996        up_read(&curseg->journal_rwsem);
1997}
1998
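    /*
     * Refill the free nid list from the in-memory free nid bitmaps of already
     * scanned NAT blocks (no NAT page I/O), stopping once MAX_FREE_NIDS
     * candidates are cached, and finish with the curseg journal.
     */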
1999static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
2000{
2001        struct f2fs_nm_info *nm_i = NM_I(sbi);
2002        unsigned int i, idx;
2003        nid_t nid;
2004
2005        down_read(&nm_i->nat_tree_lock);
2006
2007        for (i = 0; i < nm_i->nat_blocks; i++) {
2008                if (!test_bit_le(i, nm_i->nat_block_bitmap))
2009                        continue;
2010                if (!nm_i->free_nid_count[i])
2011                        continue;
2012                for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) {
2013                        idx = find_next_bit_le(nm_i->free_nid_bitmap[i],
2014                                                NAT_ENTRY_PER_BLOCK, idx);
2015                        if (idx >= NAT_ENTRY_PER_BLOCK)
2016                                break;
2017
2018                        nid = i * NAT_ENTRY_PER_BLOCK + idx;
2019                        add_free_nid(sbi, nid, true, false);
2020
2021                        if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS)
2022                                goto out;
2023                }
2024        }
2025out:
2026        scan_curseg_cache(sbi);
2027
2028        up_read(&nm_i->nat_tree_lock);
2029}
2030
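    /*
     * Build the free nid list: unless mounting, first try the free nid
     * bitmaps; if that is not enough, read ahead and scan up to
     * FREE_NID_PAGES NAT pages starting at next_scan_nid, then pick up nids
     * from the curseg journal.
     */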
2031static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
2032{
2033        struct f2fs_nm_info *nm_i = NM_I(sbi);
2034        int i = 0;
2035        nid_t nid = nm_i->next_scan_nid;
2036
2037        if (unlikely(nid >= nm_i->max_nid))
2038                nid = 0;
2039
2040        /* Enough entries */
2041        if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
2042                return;
2043
2044        if (!sync && !available_free_memory(sbi, FREE_NIDS))
2045                return;
2046
2047        if (!mount) {
2048                /* try to find free nids in free_nid_bitmap */
2049                scan_free_nid_bits(sbi);
2050
2051                if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
2052                        return;
2053        }
2054
2055        /* readahead nat pages to be scanned */
2056        ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
2057                                                        META_NAT, true);
2058
2059        down_read(&nm_i->nat_tree_lock);
2060
2061        while (1) {
2062                if (!test_bit_le(NAT_BLOCK_OFFSET(nid),
2063                                                nm_i->nat_block_bitmap)) {
2064                        struct page *page = get_current_nat_page(sbi, nid);
2065
2066                        scan_nat_page(sbi, page, nid);
2067                        f2fs_put_page(page, 1);
2068                }
2069
2070                nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
2071                if (unlikely(nid >= nm_i->max_nid))
2072                        nid = 0;
2073
2074                if (++i >= FREE_NID_PAGES)
2075                        break;
2076        }
2077
2078        /* go to the next free nat pages to find more free nids */
2079        nm_i->next_scan_nid = nid;
2080
2081        /* find free nids from current sum_pages */
2082        scan_curseg_cache(sbi);
2083
2084        up_read(&nm_i->nat_tree_lock);
2085
2086        ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
2087                                        nm_i->ra_nid_pages, META_NAT, false);
2088}
2089
2090void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
2091{
2092        mutex_lock(&NM_I(sbi)->build_lock);
2093        __build_free_nids(sbi, sync, mount);
2094        mutex_unlock(&NM_I(sbi)->build_lock);
2095}
2096
2097/*
2098 * If this function returns success, the caller can obtain a new nid
2099 * from the second parameter of this function.
2100 * The returned nid can be used as an ino as well as a nid when an inode is created.
2101 */
2102bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
2103{
2104        struct f2fs_nm_info *nm_i = NM_I(sbi);
2105        struct free_nid *i = NULL;
2106retry:
2107#ifdef CONFIG_F2FS_FAULT_INJECTION
2108        if (time_to_inject(sbi, FAULT_ALLOC_NID)) {
2109                f2fs_show_injection_info(FAULT_ALLOC_NID);
2110                return false;
2111        }
2112#endif
2113        spin_lock(&nm_i->nid_list_lock);
2114
2115        if (unlikely(nm_i->available_nids == 0)) {
2116                spin_unlock(&nm_i->nid_list_lock);
2117                return false;
2118        }
2119
2120        /* We should not use stale free nids created by build_free_nids */
2121        if (nm_i->nid_cnt[FREE_NID] && !on_build_free_nids(nm_i)) {
2122                f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
2123                i = list_first_entry(&nm_i->free_nid_list,
2124                                        struct free_nid, list);
2125                *nid = i->nid;
2126
2127                __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID);
2128                nm_i->available_nids--;
2129
2130                update_free_nid_bitmap(sbi, *nid, false, false);
2131
2132                spin_unlock(&nm_i->nid_list_lock);
2133                return true;
2134        }
2135        spin_unlock(&nm_i->nid_list_lock);
2136
2137        /* Let's scan nat pages and their caches to get free nids */
2138        build_free_nids(sbi, true, false);
2139        goto retry;
2140}
2141
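    /*
     * A minimal usage sketch for the nid allocation API (assumed caller code,
     * not taken from this file): every successful alloc_nid() must be paired
     * with alloc_nid_done() once the new node page is in place, or with
     * alloc_nid_failed() on the error path.  For example, with
     * make_new_node() standing in for whatever actually builds the node page:
     *
     *         nid_t nid;
     *
     *         if (!alloc_nid(sbi, &nid))
     *                 return -ENOSPC;
     *         err = make_new_node(sbi, nid);
     *         if (err)
     *                 alloc_nid_failed(sbi, nid);
     *         else
     *                 alloc_nid_done(sbi, nid);
     */
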
2142/*
2143 * alloc_nid() should be called prior to this function.
2144 */
2145void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
2146{
2147        struct f2fs_nm_info *nm_i = NM_I(sbi);
2148        struct free_nid *i;
2149
2150        spin_lock(&nm_i->nid_list_lock);
2151        i = __lookup_free_nid_list(nm_i, nid);
2152        f2fs_bug_on(sbi, !i);
2153        __remove_free_nid(sbi, i, PREALLOC_NID);
2154        spin_unlock(&nm_i->nid_list_lock);
2155
2156        kmem_cache_free(free_nid_slab, i);
2157}
2158
2159/*
2160 * alloc_nid() should be called prior to this function.
2161 */
2162void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
2163{
2164        struct f2fs_nm_info *nm_i = NM_I(sbi);
2165        struct free_nid *i;
2166        bool need_free = false;
2167
2168        if (!nid)
2169                return;
2170
2171        spin_lock(&nm_i->nid_list_lock);
2172        i = __lookup_free_nid_list(nm_i, nid);
2173        f2fs_bug_on(sbi, !i);
2174
2175        if (!available_free_memory(sbi, FREE_NIDS)) {
2176                __remove_free_nid(sbi, i, PREALLOC_NID);
2177                need_free = true;
2178        } else {
2179                __move_free_nid(sbi, i, PREALLOC_NID, FREE_NID);
2180        }
2181
2182        nm_i->available_nids++;
2183
2184        update_free_nid_bitmap(sbi, nid, true, false);
2185
2186        spin_unlock(&nm_i->nid_list_lock);
2187
2188        if (need_free)
2189                kmem_cache_free(free_nid_slab, i);
2190}
2191
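    /*
     * Shrinker helper: drop up to @nr_shrink cached free nids while keeping
     * at least MAX_FREE_NIDS of them, and return how many were freed.
     */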
2192int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
2193{
2194        struct f2fs_nm_info *nm_i = NM_I(sbi);
2195        struct free_nid *i, *next;
2196        int nr = nr_shrink;
2197
2198        if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
2199                return 0;
2200
2201        if (!mutex_trylock(&nm_i->build_lock))
2202                return 0;
2203
2204        spin_lock(&nm_i->nid_list_lock);
2205        list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
2206                if (nr_shrink <= 0 ||
2207                                nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
2208                        break;
2209
2210                __remove_free_nid(sbi, i, FREE_NID);
2211                kmem_cache_free(free_nid_slab, i);
2212                nr_shrink--;
2213        }
2214        spin_unlock(&nm_i->nid_list_lock);
2215        mutex_unlock(&nm_i->build_lock);
2216
2217        return nr - nr_shrink;
2218}
2219
2220void recover_inline_xattr(struct inode *inode, struct page *page)
2221{
2222        void *src_addr, *dst_addr;
2223        size_t inline_size;
2224        struct page *ipage;
2225        struct f2fs_inode *ri;
2226
2227        ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
2228        f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
2229
2230        ri = F2FS_INODE(page);
2231        if (ri->i_inline & F2FS_INLINE_XATTR) {
2232                set_inode_flag(inode, FI_INLINE_XATTR);
2233        } else {
2234                clear_inode_flag(inode, FI_INLINE_XATTR);
2235                goto update_inode;
2236        }
2237
2238        dst_addr = inline_xattr_addr(inode, ipage);
2239        src_addr = inline_xattr_addr(inode, page);
2240        inline_size = inline_xattr_size(inode);
2241
2242        f2fs_wait_on_page_writeback(ipage, NODE, true);
2243        memcpy(dst_addr, src_addr, inline_size);
2244update_inode:
2245        update_inode(inode, ipage);
2246        f2fs_put_page(ipage, 1);
2247}
2248
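    /*
     * Called during roll-forward recovery: invalidate the inode's previous
     * xattr node (if any), allocate a fresh nid for it, and copy the xattr
     * payload from the fsync'd node page @page.
     */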
2249int recover_xattr_data(struct inode *inode, struct page *page)
2250{
2251        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
2252        nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
2253        nid_t new_xnid;
2254        struct dnode_of_data dn;
2255        struct node_info ni;
2256        struct page *xpage;
2257
2258        if (!prev_xnid)
2259                goto recover_xnid;
2260
2261        /* 1: invalidate the previous xattr nid */
2262        get_node_info(sbi, prev_xnid, &ni);
2263        invalidate_blocks(sbi, ni.blk_addr);
2264        dec_valid_node_count(sbi, inode, false);
2265        set_node_addr(sbi, &ni, NULL_ADDR, false);
2266
2267recover_xnid:
2268        /* 2: update xattr nid in inode */
2269        if (!alloc_nid(sbi, &new_xnid))
2270                return -ENOSPC;
2271
2272        set_new_dnode(&dn, inode, NULL, NULL, new_xnid);
2273        xpage = new_node_page(&dn, XATTR_NODE_OFFSET);
2274        if (IS_ERR(xpage)) {
2275                alloc_nid_failed(sbi, new_xnid);
2276                return PTR_ERR(xpage);
2277        }
2278
2279        alloc_nid_done(sbi, new_xnid);
2280        update_inode_page(inode);
2281
2282        /* 3: update and set xattr node page dirty */
2283        memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE);
2284
2285        set_page_dirty(xpage);
2286        f2fs_put_page(xpage, 1);
2287
2288        return 0;
2289}
2290
2291int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
2292{
2293        struct f2fs_inode *src, *dst;
2294        nid_t ino = ino_of_node(page);
2295        struct node_info old_ni, new_ni;
2296        struct page *ipage;
2297
2298        get_node_info(sbi, ino, &old_ni);
2299
2300        if (unlikely(old_ni.blk_addr != NULL_ADDR))
2301                return -EINVAL;
2302retry:
2303        ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
2304        if (!ipage) {
2305                congestion_wait(BLK_RW_ASYNC, HZ/50);
2306                goto retry;
2307        }
2308
2309        /* Should not use this inode from free nid list */
2310        remove_free_nid(sbi, ino);
2311
2312        if (!PageUptodate(ipage))
2313                SetPageUptodate(ipage);
2314        fill_node_footer(ipage, ino, ino, 0, true);
2315        set_cold_node(page, false);
2316
2317        src = F2FS_INODE(page);
2318        dst = F2FS_INODE(ipage);
2319
2320        memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
2321        dst->i_size = 0;
2322        dst->i_blocks = cpu_to_le64(1);
2323        dst->i_links = cpu_to_le32(1);
2324        dst->i_xattr_nid = 0;
2325        dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR);
2326        if (dst->i_inline & F2FS_EXTRA_ATTR) {
2327                dst->i_extra_isize = src->i_extra_isize;
2328
2329                if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) &&
2330                        F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
2331                                                        i_inline_xattr_size))
2332                        dst->i_inline_xattr_size = src->i_inline_xattr_size;
2333
2334                if (f2fs_sb_has_project_quota(sbi->sb) &&
2335                        F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
2336                                                                i_projid))
2337                        dst->i_projid = src->i_projid;
2338        }
2339
2340        new_ni = old_ni;
2341        new_ni.ino = ino;
2342
2343        if (unlikely(inc_valid_node_count(sbi, NULL, true)))
2344                WARN_ON(1);
2345        set_node_addr(sbi, &new_ni, NEW_ADDR, false);
2346        inc_valid_inode_count(sbi);
2347        set_page_dirty(ipage);
2348        f2fs_put_page(ipage, 1);
2349        return 0;
2350}
2351
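    /*
     * Rebuild the node summary entries for segment @segno by reading every
     * node block in the segment and recording the nid from its footer.
     */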
2352void restore_node_summary(struct f2fs_sb_info *sbi,
2353                        unsigned int segno, struct f2fs_summary_block *sum)
2354{
2355        struct f2fs_node *rn;
2356        struct f2fs_summary *sum_entry;
2357        block_t addr;
2358        int i, idx, last_offset, nrpages;
2359
2360        /* scan the node segment */
2361        last_offset = sbi->blocks_per_seg;
2362        addr = START_BLOCK(sbi, segno);
2363        sum_entry = &sum->entries[0];
2364
2365        for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
2366                nrpages = min(last_offset - i, BIO_MAX_PAGES);
2367
2368                /* readahead node pages */
2369                ra_meta_pages(sbi, addr, nrpages, META_POR, true);
2370
2371                for (idx = addr; idx < addr + nrpages; idx++) {
2372                        struct page *page = get_tmp_page(sbi, idx);
2373
2374                        rn = F2FS_NODE(page);
2375                        sum_entry->nid = rn->footer.nid;
2376                        sum_entry->version = 0;
2377                        sum_entry->ofs_in_node = 0;
2378                        sum_entry++;
2379                        f2fs_put_page(page, 1);
2380                }
2381
2382                invalidate_mapping_pages(META_MAPPING(sbi), addr,
2383                                                        addr + nrpages);
2384        }
2385}
2386
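    /*
     * Move every NAT entry cached in the curseg journal into the in-memory
     * dirty NAT sets, so that the upcoming checkpoint writes them through
     * NAT pages instead of the journal.
     */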
2387static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
2388{
2389        struct f2fs_nm_info *nm_i = NM_I(sbi);
2390        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2391        struct f2fs_journal *journal = curseg->journal;
2392        int i;
2393
2394        down_write(&curseg->journal_rwsem);
2395        for (i = 0; i < nats_in_cursum(journal); i++) {
2396                struct nat_entry *ne;
2397                struct f2fs_nat_entry raw_ne;
2398                nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
2399
2400                raw_ne = nat_in_journal(journal, i);
2401
2402                ne = __lookup_nat_cache(nm_i, nid);
2403                if (!ne) {
2404                        ne = __alloc_nat_entry(nid, true);
2405                        __init_nat_entry(nm_i, ne, &raw_ne, true);
2406                }
2407
2408                /*
2409                 * if a free nat in the journal has not been used since the
2410                 * last checkpoint, remove it from available_nids for now,
2411                 * since it will be added back again later.
2412                 */
2413                if (!get_nat_flag(ne, IS_DIRTY) &&
2414                                le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) {
2415                        spin_lock(&nm_i->nid_list_lock);
2416                        nm_i->available_nids--;
2417                        spin_unlock(&nm_i->nid_list_lock);
2418                }
2419
2420                __set_nat_cache_dirty(nm_i, ne);
2421        }
2422        update_nats_in_cursum(journal, -i);
2423        up_write(&curseg->journal_rwsem);
2424}
2425
2426static void __adjust_nat_entry_set(struct nat_entry_set *nes,
2427                                                struct list_head *head, int max)
2428{
2429        struct nat_entry_set *cur;
2430
2431        if (nes->entry_cnt >= max)
2432                goto add_out;
2433
2434        list_for_each_entry(cur, head, set_list) {
2435                if (cur->entry_cnt >= nes->entry_cnt) {
2436                        list_add(&nes->set_list, cur->set_list.prev);
2437                        return;
2438                }
2439        }
2440add_out:
2441        list_add_tail(&nes->set_list, head);
2442}
2443
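    /*
     * Recompute the full/empty nat_bits of the NAT block starting at
     * @start_nid after it has been written.  NAT block 0 always counts one
     * valid entry for the reserved nid 0, so it is never marked empty.
     */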
2444static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
2445                                                struct page *page)
2446{
2447        struct f2fs_nm_info *nm_i = NM_I(sbi);
2448        unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK;
2449        struct f2fs_nat_block *nat_blk = page_address(page);
2450        int valid = 0;
2451        int i = 0;
2452
2453        if (!enabled_nat_bits(sbi, NULL))
2454                return;
2455
2456        if (nat_index == 0) {
2457                valid = 1;
2458                i = 1;
2459        }
2460        for (; i < NAT_ENTRY_PER_BLOCK; i++) {
2461                if (nat_blk->entries[i].block_addr != NULL_ADDR)
2462                        valid++;
2463        }
2464        if (valid == 0) {
2465                __set_bit_le(nat_index, nm_i->empty_nat_bits);
2466                __clear_bit_le(nat_index, nm_i->full_nat_bits);
2467                return;
2468        }
2469
2470        __clear_bit_le(nat_index, nm_i->empty_nat_bits);
2471        if (valid == NAT_ENTRY_PER_BLOCK)
2472                __set_bit_le(nat_index, nm_i->full_nat_bits);
2473        else
2474                __clear_bit_le(nat_index, nm_i->full_nat_bits);
2475}
2476
2477static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
2478                struct nat_entry_set *set, struct cp_control *cpc)
2479{
2480        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2481        struct f2fs_journal *journal = curseg->journal;
2482        nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
2483        bool to_journal = true;
2484        struct f2fs_nat_block *nat_blk;
2485        struct nat_entry *ne, *cur;
2486        struct page *page = NULL;
2487
2488        /*
2489         * there are two ways to flush nat entries:
2490         * #1, flush nat entries to the journal in the current hot data summary block.
2491         * #2, flush nat entries to the nat page.
2492         */
2493        if (enabled_nat_bits(sbi, cpc) ||
2494                !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
2495                to_journal = false;
2496
2497        if (to_journal) {
2498                down_write(&curseg->journal_rwsem);
2499        } else {
2500                page = get_next_nat_page(sbi, start_nid);
2501                nat_blk = page_address(page);
2502                f2fs_bug_on(sbi, !nat_blk);
2503        }
2504
2505        /* flush dirty nats in nat entry set */
2506        list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
2507                struct f2fs_nat_entry *raw_ne;
2508                nid_t nid = nat_get_nid(ne);
2509                int offset;
2510
2511                f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR);
2512
2513                if (to_journal) {
2514                        offset = lookup_journal_in_cursum(journal,
2515                                                        NAT_JOURNAL, nid, 1);
2516                        f2fs_bug_on(sbi, offset < 0);
2517                        raw_ne = &nat_in_journal(journal, offset);
2518                        nid_in_journal(journal, offset) = cpu_to_le32(nid);
2519                } else {
2520                        raw_ne = &nat_blk->entries[nid - start_nid];
2521                }
2522                raw_nat_from_node_info(raw_ne, &ne->ni);
2523                nat_reset_flag(ne);
2524                __clear_nat_cache_dirty(NM_I(sbi), set, ne);
2525                if (nat_get_blkaddr(ne) == NULL_ADDR) {
2526                        add_free_nid(sbi, nid, false, true);
2527                } else {
2528                        spin_lock(&NM_I(sbi)->nid_list_lock);
2529                        update_free_nid_bitmap(sbi, nid, false, false);
2530                        spin_unlock(&NM_I(sbi)->nid_list_lock);
2531                }
2532        }
2533
2534        if (to_journal) {
2535                up_write(&curseg->journal_rwsem);
2536        } else {
2537                __update_nat_bits(sbi, start_nid, page);
2538                f2fs_put_page(page, 1);
2539        }
2540
2541        /* Allow dirty nats by node block allocation in write_begin */
2542        if (!set->entry_cnt) {
2543                radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
2544                kmem_cache_free(nat_entry_set_slab, set);
2545        }
2546}
2547
2548/*
2549 * This function is called during the checkpointing process.
2550 */
2551void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
2552{
2553        struct f2fs_nm_info *nm_i = NM_I(sbi);
2554        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2555        struct f2fs_journal *journal = curseg->journal;
2556        struct nat_entry_set *setvec[SETVEC_SIZE];
2557        struct nat_entry_set *set, *tmp;
2558        unsigned int found;
2559        nid_t set_idx = 0;
2560        LIST_HEAD(sets);
2561
2562        if (!nm_i->dirty_nat_cnt)
2563                return;
2564
2565        down_write(&nm_i->nat_tree_lock);
2566
2567        /*
2568         * if there is not enough space in the journal to store dirty nat
2569         * entries, remove all entries from the journal and merge them
2570         * into the nat entry sets.
2571         */
2572        if (enabled_nat_bits(sbi, cpc) ||
2573                !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
2574                remove_nats_in_journal(sbi);
2575
2576        while ((found = __gang_lookup_nat_set(nm_i,
2577                                        set_idx, SETVEC_SIZE, setvec))) {
2578                unsigned idx;
2579                set_idx = setvec[found - 1]->set + 1;
2580                for (idx = 0; idx < found; idx++)
2581                        __adjust_nat_entry_set(setvec[idx], &sets,
2582                                                MAX_NAT_JENTRIES(journal));
2583        }
2584
2585        /* flush dirty nats in nat entry set */
2586        list_for_each_entry_safe(set, tmp, &sets, set_list)
2587                __flush_nat_entry_set(sbi, set, cpc);
2588
2589        up_write(&nm_i->nat_tree_lock);
2590        /* Allow dirty nats by node block allocation in write_begin */
2591}
2592
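    /*
     * Read the nat_bits blocks stored at the tail of the checkpoint pack and
     * verify them against the current checkpoint version; on mismatch the
     * nat_bits feature is disabled for this mount.
     */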
2593static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
2594{
2595        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
2596        struct f2fs_nm_info *nm_i = NM_I(sbi);
2597        unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE;
2598        unsigned int i;
2599        __u64 cp_ver = cur_cp_version(ckpt);
2600        block_t nat_bits_addr;
2601
2602        if (!enabled_nat_bits(sbi, NULL))
2603                return 0;
2604
2605        nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
2606        nm_i->nat_bits = f2fs_kzalloc(sbi,
2607                        nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL);
2608        if (!nm_i->nat_bits)
2609                return -ENOMEM;
2610
2611        nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
2612                                                nm_i->nat_bits_blocks;
2613        for (i = 0; i < nm_i->nat_bits_blocks; i++) {
2614                struct page *page = get_meta_page(sbi, nat_bits_addr++);
2615
2616                memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS),
2617                                        page_address(page), F2FS_BLKSIZE);
2618                f2fs_put_page(page, 1);
2619        }
2620
2621        cp_ver |= (cur_cp_crc(ckpt) << 32);
2622        if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
2623                disable_nat_bits(sbi, true);
2624                return 0;
2625        }
2626
2627        nm_i->full_nat_bits = nm_i->nat_bits + 8;
2628        nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
2629
2630        f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint");
2631        return 0;
2632}
2633
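    /*
     * Prime the free nid bitmaps from nat_bits: NAT blocks flagged as empty
     * get all of their nids marked free, and both empty and full blocks are
     * marked as already scanned in nat_block_bitmap.
     */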
2634static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
2635{
2636        struct f2fs_nm_info *nm_i = NM_I(sbi);
2637        unsigned int i = 0;
2638        nid_t nid, last_nid;
2639
2640        if (!enabled_nat_bits(sbi, NULL))
2641                return;
2642
2643        for (i = 0; i < nm_i->nat_blocks; i++) {
2644                i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i);
2645                if (i >= nm_i->nat_blocks)
2646                        break;
2647
2648                __set_bit_le(i, nm_i->nat_block_bitmap);
2649
2650                nid = i * NAT_ENTRY_PER_BLOCK;
2651                last_nid = nid + NAT_ENTRY_PER_BLOCK;
2652
2653                spin_lock(&NM_I(sbi)->nid_list_lock);
2654                for (; nid < last_nid; nid++)
2655                        update_free_nid_bitmap(sbi, nid, true, true);
2656                spin_unlock(&NM_I(sbi)->nid_list_lock);
2657        }
2658
2659        for (i = 0; i < nm_i->nat_blocks; i++) {
2660                i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i);
2661                if (i >= nm_i->nat_blocks)
2662                        break;
2663
2664                __set_bit_le(i, nm_i->nat_block_bitmap);
2665        }
2666}
2667
2668static int init_node_manager(struct f2fs_sb_info *sbi)
2669{
2670        struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
2671        struct f2fs_nm_info *nm_i = NM_I(sbi);
2672        unsigned char *version_bitmap;
2673        unsigned int nat_segs;
2674        int err;
2675
2676        nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
2677
2678        /* segment_count_nat includes pair segment so divide to 2. */
2679        nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
2680        nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
2681        nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks;
2682
2683        /* not used nids: 0, node, meta, (and root counted as valid node) */
2684        nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
2685                                sbi->nquota_files - F2FS_RESERVED_NODE_NUM;
2686        nm_i->nid_cnt[FREE_NID] = 0;
2687        nm_i->nid_cnt[PREALLOC_NID] = 0;
2688        nm_i->nat_cnt = 0;
2689        nm_i->ram_thresh = DEF_RAM_THRESHOLD;
2690        nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
2691        nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
2692
2693        INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
2694        INIT_LIST_HEAD(&nm_i->free_nid_list);
2695        INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
2696        INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
2697        INIT_LIST_HEAD(&nm_i->nat_entries);
2698
2699        mutex_init(&nm_i->build_lock);
2700        spin_lock_init(&nm_i->nid_list_lock);
2701        init_rwsem(&nm_i->nat_tree_lock);
2702
2703        nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
2704        nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
2705        version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
2706        if (!version_bitmap)
2707                return -EFAULT;
2708
2709        nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
2710                                        GFP_KERNEL);
2711        if (!nm_i->nat_bitmap)
2712                return -ENOMEM;
2713
2714        err = __get_nat_bitmaps(sbi);
2715        if (err)
2716                return err;
2717
2718#ifdef CONFIG_F2FS_CHECK_FS
2719        nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size,
2720                                        GFP_KERNEL);
2721        if (!nm_i->nat_bitmap_mir)
2722                return -ENOMEM;
2723#endif
2724
2725        return 0;
2726}
2727
2728static int init_free_nid_cache(struct f2fs_sb_info *sbi)
2729{
2730        struct f2fs_nm_info *nm_i = NM_I(sbi);
2731        int i;
2732
2733        nm_i->free_nid_bitmap = f2fs_kzalloc(sbi, nm_i->nat_blocks *
2734                                sizeof(unsigned char *), GFP_KERNEL);
2735        if (!nm_i->free_nid_bitmap)
2736                return -ENOMEM;
2737
2738        for (i = 0; i < nm_i->nat_blocks; i++) {
2739                nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi,
2740                                NAT_ENTRY_BITMAP_SIZE_ALIGNED, GFP_KERNEL);
2741                if (!nm_i->free_nid_bitmap[i])
2742                        return -ENOMEM;
2743        }
2744
2745        nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8,
2746                                                                GFP_KERNEL);
2747        if (!nm_i->nat_block_bitmap)
2748                return -ENOMEM;
2749
2750        nm_i->free_nid_count = f2fs_kvzalloc(sbi, nm_i->nat_blocks *
2751                                        sizeof(unsigned short), GFP_KERNEL);
2752        if (!nm_i->free_nid_count)
2753                return -ENOMEM;
2754        return 0;
2755}
2756
2757int build_node_manager(struct f2fs_sb_info *sbi)
2758{
2759        int err;
2760
2761        sbi->nm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_nm_info),
2762                                                        GFP_KERNEL);
2763        if (!sbi->nm_info)
2764                return -ENOMEM;
2765
2766        err = init_node_manager(sbi);
2767        if (err)
2768                return err;
2769
2770        err = init_free_nid_cache(sbi);
2771        if (err)
2772                return err;
2773
2774        /* load free nid status from nat_bits table */
2775        load_free_nid_bitmap(sbi);
2776
2777        build_free_nids(sbi, true, true);
2778        return 0;
2779}
2780
2781void destroy_node_manager(struct f2fs_sb_info *sbi)
2782{
2783        struct f2fs_nm_info *nm_i = NM_I(sbi);
2784        struct free_nid *i, *next_i;
2785        struct nat_entry *natvec[NATVEC_SIZE];
2786        struct nat_entry_set *setvec[SETVEC_SIZE];
2787        nid_t nid = 0;
2788        unsigned int found;
2789
2790        if (!nm_i)
2791                return;
2792
2793        /* destroy free nid list */
2794        spin_lock(&nm_i->nid_list_lock);
2795        list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
2796                __remove_free_nid(sbi, i, FREE_NID);
2797                spin_unlock(&nm_i->nid_list_lock);
2798                kmem_cache_free(free_nid_slab, i);
2799                spin_lock(&nm_i->nid_list_lock);
2800        }
2801        f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]);
2802        f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]);
2803        f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list));
2804        spin_unlock(&nm_i->nid_list_lock);
2805
2806        /* destroy nat cache */
2807        down_write(&nm_i->nat_tree_lock);
2808        while ((found = __gang_lookup_nat_cache(nm_i,
2809                                        nid, NATVEC_SIZE, natvec))) {
2810                unsigned idx;
2811
2812                nid = nat_get_nid(natvec[found - 1]) + 1;
2813                for (idx = 0; idx < found; idx++)
2814                        __del_from_nat_cache(nm_i, natvec[idx]);
2815        }
2816        f2fs_bug_on(sbi, nm_i->nat_cnt);
2817
2818        /* destroy nat set cache */
2819        nid = 0;
2820        while ((found = __gang_lookup_nat_set(nm_i,
2821                                        nid, SETVEC_SIZE, setvec))) {
2822                unsigned idx;
2823
2824                nid = setvec[found - 1]->set + 1;
2825                for (idx = 0; idx < found; idx++) {
2826                        /* entry_cnt may be nonzero when a cp_error has occurred */
2827                        f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list));
2828                        radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set);
2829                        kmem_cache_free(nat_entry_set_slab, setvec[idx]);
2830                }
2831        }
2832        up_write(&nm_i->nat_tree_lock);
2833
2834        kvfree(nm_i->nat_block_bitmap);
2835        if (nm_i->free_nid_bitmap) {
2836                int i;
2837
2838                for (i = 0; i < nm_i->nat_blocks; i++)
2839                        kvfree(nm_i->free_nid_bitmap[i]);
2840                kfree(nm_i->free_nid_bitmap);
2841        }
2842        kvfree(nm_i->free_nid_count);
2843
2844        kfree(nm_i->nat_bitmap);
2845        kfree(nm_i->nat_bits);
2846#ifdef CONFIG_F2FS_CHECK_FS
2847        kfree(nm_i->nat_bitmap_mir);
2848#endif
2849        sbi->nm_info = NULL;
2850        kfree(nm_i);
2851}
2852
2853int __init create_node_manager_caches(void)
2854{
2855        nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
2856                        sizeof(struct nat_entry));
2857        if (!nat_entry_slab)
2858                goto fail;
2859
2860        free_nid_slab = f2fs_kmem_cache_create("free_nid",
2861                        sizeof(struct free_nid));
2862        if (!free_nid_slab)
2863                goto destroy_nat_entry;
2864
2865        nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
2866                        sizeof(struct nat_entry_set));
2867        if (!nat_entry_set_slab)
2868                goto destroy_free_nid;
2869        return 0;
2870
2871destroy_free_nid:
2872        kmem_cache_destroy(free_nid_slab);
2873destroy_nat_entry:
2874        kmem_cache_destroy(nat_entry_slab);
2875fail:
2876        return -ENOMEM;
2877}
2878
2879void destroy_node_manager_caches(void)
2880{
2881        kmem_cache_destroy(nat_entry_set_slab);
2882        kmem_cache_destroy(free_nid_slab);
2883        kmem_cache_destroy(nat_entry_slab);
2884}
2885