linux/fs/f2fs/node.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * fs/f2fs/node.c
   4 *
   5 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
   6 *             http://www.samsung.com/
   7 */
   8#include <linux/fs.h>
   9#include <linux/f2fs_fs.h>
  10#include <linux/mpage.h>
  11#include <linux/sched/mm.h>
  12#include <linux/blkdev.h>
  13#include <linux/pagevec.h>
  14#include <linux/swap.h>
  15
  16#include "f2fs.h"
  17#include "node.h"
  18#include "segment.h"
  19#include "xattr.h"
  20#include "iostat.h"
  21#include <trace/events/f2fs.h>
  22
   23#define on_f2fs_build_free_nids(nm_i) mutex_is_locked(&(nm_i)->build_lock)
  24
  25static struct kmem_cache *nat_entry_slab;
  26static struct kmem_cache *free_nid_slab;
  27static struct kmem_cache *nat_entry_set_slab;
  28static struct kmem_cache *fsync_node_entry_slab;
  29
  30/*
  31 * Check whether the given nid is within node id range.
  32 */
  33int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
  34{
  35        if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) {
  36                set_sbi_flag(sbi, SBI_NEED_FSCK);
  37                f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.",
  38                          __func__, nid);
  39                return -EFSCORRUPTED;
  40        }
  41        return 0;
  42}
  43
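/*
 * Return true if the cache of the given @type is still allowed to grow.
 * Each cache type is measured against a share of available low memory,
 * scaled by nm_i->ram_thresh (note the per-type shifts below); the
 * COMPRESS_PAGE case is gated by free memory and the cached page count
 * instead.
 */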
  44bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
  45{
  46        struct f2fs_nm_info *nm_i = NM_I(sbi);
  47        struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
  48        struct sysinfo val;
  49        unsigned long avail_ram;
  50        unsigned long mem_size = 0;
  51        bool res = false;
  52
  53        if (!nm_i)
  54                return true;
  55
  56        si_meminfo(&val);
  57
  58        /* only uses low memory */
  59        avail_ram = val.totalram - val.totalhigh;
  60
  61        /*
   62         * give 25%, 25%, 50%, 50%, 50% of memory to each component, respectively
  63         */
  64        if (type == FREE_NIDS) {
  65                mem_size = (nm_i->nid_cnt[FREE_NID] *
  66                                sizeof(struct free_nid)) >> PAGE_SHIFT;
  67                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
  68        } else if (type == NAT_ENTRIES) {
  69                mem_size = (nm_i->nat_cnt[TOTAL_NAT] *
  70                                sizeof(struct nat_entry)) >> PAGE_SHIFT;
  71                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
  72                if (excess_cached_nats(sbi))
  73                        res = false;
  74        } else if (type == DIRTY_DENTS) {
  75                if (sbi->sb->s_bdi->wb.dirty_exceeded)
  76                        return false;
  77                mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
  78                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
  79        } else if (type == INO_ENTRIES) {
  80                int i;
  81
  82                for (i = 0; i < MAX_INO_ENTRY; i++)
  83                        mem_size += sbi->im[i].ino_num *
  84                                                sizeof(struct ino_entry);
  85                mem_size >>= PAGE_SHIFT;
  86                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
  87        } else if (type == EXTENT_CACHE) {
  88                mem_size = (atomic_read(&sbi->total_ext_tree) *
  89                                sizeof(struct extent_tree) +
  90                                atomic_read(&sbi->total_ext_node) *
  91                                sizeof(struct extent_node)) >> PAGE_SHIFT;
  92                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
  93        } else if (type == DISCARD_CACHE) {
  94                mem_size = (atomic_read(&dcc->discard_cmd_cnt) *
  95                                sizeof(struct discard_cmd)) >> PAGE_SHIFT;
  96                res = mem_size < (avail_ram * nm_i->ram_thresh / 100);
  97        } else if (type == COMPRESS_PAGE) {
  98#ifdef CONFIG_F2FS_FS_COMPRESSION
  99                unsigned long free_ram = val.freeram;
 100
 101                /*
  102                 * If free memory is lower than the watermark, or the cached page
  103                 * count exceeds the threshold, deny caching the compressed page.
 104                 */
 105                res = (free_ram > avail_ram * sbi->compress_watermark / 100) &&
 106                        (COMPRESS_MAPPING(sbi)->nrpages <
 107                         free_ram * sbi->compress_percent / 100);
 108#else
 109                res = false;
 110#endif
 111        } else {
 112                if (!sbi->sb->s_bdi->wb.dirty_exceeded)
 113                        return true;
 114        }
 115        return res;
 116}
 117
 118static void clear_node_page_dirty(struct page *page)
 119{
 120        if (PageDirty(page)) {
 121                f2fs_clear_page_cache_dirty_tag(page);
 122                clear_page_dirty_for_io(page);
 123                dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
 124        }
 125        ClearPageUptodate(page);
 126}
 127
 128static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
 129{
 130        return f2fs_get_meta_page_retry(sbi, current_nat_addr(sbi, nid));
 131}
 132
 133static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
 134{
 135        struct page *src_page;
 136        struct page *dst_page;
 137        pgoff_t dst_off;
 138        void *src_addr;
 139        void *dst_addr;
 140        struct f2fs_nm_info *nm_i = NM_I(sbi);
 141
 142        dst_off = next_nat_addr(sbi, current_nat_addr(sbi, nid));
 143
 144        /* get current nat block page with lock */
 145        src_page = get_current_nat_page(sbi, nid);
 146        if (IS_ERR(src_page))
 147                return src_page;
 148        dst_page = f2fs_grab_meta_page(sbi, dst_off);
 149        f2fs_bug_on(sbi, PageDirty(src_page));
 150
 151        src_addr = page_address(src_page);
 152        dst_addr = page_address(dst_page);
 153        memcpy(dst_addr, src_addr, PAGE_SIZE);
 154        set_page_dirty(dst_page);
 155        f2fs_put_page(src_page, 1);
 156
 157        set_to_next_nat(nm_i, nid);
 158
 159        return dst_page;
 160}
 161
 162static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi,
 163                                                nid_t nid, bool no_fail)
 164{
 165        struct nat_entry *new;
 166
 167        new = f2fs_kmem_cache_alloc(nat_entry_slab,
 168                                        GFP_F2FS_ZERO, no_fail, sbi);
 169        if (new) {
 170                nat_set_nid(new, nid);
 171                nat_reset_flag(new);
 172        }
 173        return new;
 174}
 175
 176static void __free_nat_entry(struct nat_entry *e)
 177{
 178        kmem_cache_free(nat_entry_slab, e);
 179}
 180
 181/* must be locked by nat_tree_lock */
 182static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
 183        struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail)
 184{
 185        if (no_fail)
 186                f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne);
 187        else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne))
 188                return NULL;
 189
 190        if (raw_ne)
 191                node_info_from_raw_nat(&ne->ni, raw_ne);
 192
 193        spin_lock(&nm_i->nat_list_lock);
 194        list_add_tail(&ne->list, &nm_i->nat_entries);
 195        spin_unlock(&nm_i->nat_list_lock);
 196
 197        nm_i->nat_cnt[TOTAL_NAT]++;
 198        nm_i->nat_cnt[RECLAIMABLE_NAT]++;
 199        return ne;
 200}
 201
 202static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
 203{
 204        struct nat_entry *ne;
 205
 206        ne = radix_tree_lookup(&nm_i->nat_root, n);
 207
  208        /* for a recently accessed nat entry, move it to the tail of the lru list */
 209        if (ne && !get_nat_flag(ne, IS_DIRTY)) {
 210                spin_lock(&nm_i->nat_list_lock);
 211                if (!list_empty(&ne->list))
 212                        list_move_tail(&ne->list, &nm_i->nat_entries);
 213                spin_unlock(&nm_i->nat_list_lock);
 214        }
 215
 216        return ne;
 217}
 218
 219static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
 220                nid_t start, unsigned int nr, struct nat_entry **ep)
 221{
 222        return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
 223}
 224
 225static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
 226{
 227        radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
 228        nm_i->nat_cnt[TOTAL_NAT]--;
 229        nm_i->nat_cnt[RECLAIMABLE_NAT]--;
 230        __free_nat_entry(e);
 231}
 232
 233static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
 234                                                        struct nat_entry *ne)
 235{
 236        nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
 237        struct nat_entry_set *head;
 238
 239        head = radix_tree_lookup(&nm_i->nat_set_root, set);
 240        if (!head) {
 241                head = f2fs_kmem_cache_alloc(nat_entry_set_slab,
 242                                                GFP_NOFS, true, NULL);
 243
 244                INIT_LIST_HEAD(&head->entry_list);
 245                INIT_LIST_HEAD(&head->set_list);
 246                head->set = set;
 247                head->entry_cnt = 0;
 248                f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
 249        }
 250        return head;
 251}
 252
 253static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
 254                                                struct nat_entry *ne)
 255{
 256        struct nat_entry_set *head;
 257        bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR;
 258
 259        if (!new_ne)
 260                head = __grab_nat_entry_set(nm_i, ne);
 261
 262        /*
  263         * update entry_cnt in the following conditions:
  264         * 1. update NEW_ADDR to a valid block address;
  265         * 2. update an old block address to a new one;
 266         */
 267        if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
 268                                !get_nat_flag(ne, IS_DIRTY)))
 269                head->entry_cnt++;
 270
 271        set_nat_flag(ne, IS_PREALLOC, new_ne);
 272
 273        if (get_nat_flag(ne, IS_DIRTY))
 274                goto refresh_list;
 275
 276        nm_i->nat_cnt[DIRTY_NAT]++;
 277        nm_i->nat_cnt[RECLAIMABLE_NAT]--;
 278        set_nat_flag(ne, IS_DIRTY, true);
 279refresh_list:
 280        spin_lock(&nm_i->nat_list_lock);
 281        if (new_ne)
 282                list_del_init(&ne->list);
 283        else
 284                list_move_tail(&ne->list, &head->entry_list);
 285        spin_unlock(&nm_i->nat_list_lock);
 286}
 287
 288static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
 289                struct nat_entry_set *set, struct nat_entry *ne)
 290{
 291        spin_lock(&nm_i->nat_list_lock);
 292        list_move_tail(&ne->list, &nm_i->nat_entries);
 293        spin_unlock(&nm_i->nat_list_lock);
 294
 295        set_nat_flag(ne, IS_DIRTY, false);
 296        set->entry_cnt--;
 297        nm_i->nat_cnt[DIRTY_NAT]--;
 298        nm_i->nat_cnt[RECLAIMABLE_NAT]++;
 299}
 300
 301static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
 302                nid_t start, unsigned int nr, struct nat_entry_set **ep)
 303{
 304        return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
 305                                                        start, nr);
 306}
 307
 308bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page)
 309{
 310        return NODE_MAPPING(sbi) == page->mapping &&
 311                        IS_DNODE(page) && is_cold_node(page);
 312}
 313
 314void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
 315{
 316        spin_lock_init(&sbi->fsync_node_lock);
 317        INIT_LIST_HEAD(&sbi->fsync_node_list);
 318        sbi->fsync_seg_id = 0;
 319        sbi->fsync_node_num = 0;
 320}
 321
 322static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
 323                                                        struct page *page)
 324{
 325        struct fsync_node_entry *fn;
 326        unsigned long flags;
 327        unsigned int seq_id;
 328
 329        fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab,
 330                                        GFP_NOFS, true, NULL);
 331
 332        get_page(page);
 333        fn->page = page;
 334        INIT_LIST_HEAD(&fn->list);
 335
 336        spin_lock_irqsave(&sbi->fsync_node_lock, flags);
 337        list_add_tail(&fn->list, &sbi->fsync_node_list);
 338        fn->seq_id = sbi->fsync_seg_id++;
 339        seq_id = fn->seq_id;
 340        sbi->fsync_node_num++;
 341        spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
 342
 343        return seq_id;
 344}
 345
 346void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page)
 347{
 348        struct fsync_node_entry *fn;
 349        unsigned long flags;
 350
 351        spin_lock_irqsave(&sbi->fsync_node_lock, flags);
 352        list_for_each_entry(fn, &sbi->fsync_node_list, list) {
 353                if (fn->page == page) {
 354                        list_del(&fn->list);
 355                        sbi->fsync_node_num--;
 356                        spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
 357                        kmem_cache_free(fsync_node_entry_slab, fn);
 358                        put_page(page);
 359                        return;
 360                }
 361        }
 362        spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
 363        f2fs_bug_on(sbi, 1);
 364}
 365
 366void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi)
 367{
 368        unsigned long flags;
 369
 370        spin_lock_irqsave(&sbi->fsync_node_lock, flags);
 371        sbi->fsync_seg_id = 0;
 372        spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
 373}
 374
 375int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
 376{
 377        struct f2fs_nm_info *nm_i = NM_I(sbi);
 378        struct nat_entry *e;
 379        bool need = false;
 380
 381        f2fs_down_read(&nm_i->nat_tree_lock);
 382        e = __lookup_nat_cache(nm_i, nid);
 383        if (e) {
 384                if (!get_nat_flag(e, IS_CHECKPOINTED) &&
 385                                !get_nat_flag(e, HAS_FSYNCED_INODE))
 386                        need = true;
 387        }
 388        f2fs_up_read(&nm_i->nat_tree_lock);
 389        return need;
 390}
 391
 392bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
 393{
 394        struct f2fs_nm_info *nm_i = NM_I(sbi);
 395        struct nat_entry *e;
 396        bool is_cp = true;
 397
 398        f2fs_down_read(&nm_i->nat_tree_lock);
 399        e = __lookup_nat_cache(nm_i, nid);
 400        if (e && !get_nat_flag(e, IS_CHECKPOINTED))
 401                is_cp = false;
 402        f2fs_up_read(&nm_i->nat_tree_lock);
 403        return is_cp;
 404}
 405
 406bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
 407{
 408        struct f2fs_nm_info *nm_i = NM_I(sbi);
 409        struct nat_entry *e;
 410        bool need_update = true;
 411
 412        f2fs_down_read(&nm_i->nat_tree_lock);
 413        e = __lookup_nat_cache(nm_i, ino);
 414        if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
 415                        (get_nat_flag(e, IS_CHECKPOINTED) ||
 416                         get_nat_flag(e, HAS_FSYNCED_INODE)))
 417                need_update = false;
 418        f2fs_up_read(&nm_i->nat_tree_lock);
 419        return need_update;
 420}
 421
 422/* must be locked by nat_tree_lock */
 423static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
 424                                                struct f2fs_nat_entry *ne)
 425{
 426        struct f2fs_nm_info *nm_i = NM_I(sbi);
 427        struct nat_entry *new, *e;
 428
 429        /* Let's mitigate lock contention of nat_tree_lock during checkpoint */
 430        if (f2fs_rwsem_is_locked(&sbi->cp_global_sem))
 431                return;
 432
 433        new = __alloc_nat_entry(sbi, nid, false);
 434        if (!new)
 435                return;
 436
 437        f2fs_down_write(&nm_i->nat_tree_lock);
 438        e = __lookup_nat_cache(nm_i, nid);
 439        if (!e)
 440                e = __init_nat_entry(nm_i, new, ne, false);
 441        else
 442                f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) ||
 443                                nat_get_blkaddr(e) !=
 444                                        le32_to_cpu(ne->block_addr) ||
 445                                nat_get_version(e) != ne->version);
 446        f2fs_up_write(&nm_i->nat_tree_lock);
 447        if (e != new)
 448                __free_nat_entry(new);
 449}
 450
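/*
 * Update (or create) the cached nat entry of @ni->nid with @new_blkaddr and
 * mark it dirty, so the change gets written out by the next checkpoint.  The
 * fsync flags on the owning inode's nat entry are refreshed as well.
 */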
 451static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
 452                        block_t new_blkaddr, bool fsync_done)
 453{
 454        struct f2fs_nm_info *nm_i = NM_I(sbi);
 455        struct nat_entry *e;
 456        struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true);
 457
 458        f2fs_down_write(&nm_i->nat_tree_lock);
 459        e = __lookup_nat_cache(nm_i, ni->nid);
 460        if (!e) {
 461                e = __init_nat_entry(nm_i, new, NULL, true);
 462                copy_node_info(&e->ni, ni);
 463                f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
 464        } else if (new_blkaddr == NEW_ADDR) {
 465                /*
  466                 * when a nid is reallocated,
  467                 * the previous nat entry can remain in the nat cache.
 468                 * So, reinitialize it with new information.
 469                 */
 470                copy_node_info(&e->ni, ni);
 471                f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
 472        }
 473        /* let's free early to reduce memory consumption */
 474        if (e != new)
 475                __free_nat_entry(new);
 476
 477        /* sanity check */
 478        f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
 479        f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
 480                        new_blkaddr == NULL_ADDR);
 481        f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
 482                        new_blkaddr == NEW_ADDR);
 483        f2fs_bug_on(sbi, __is_valid_data_blkaddr(nat_get_blkaddr(e)) &&
 484                        new_blkaddr == NEW_ADDR);
 485
 486        /* increment version no as node is removed */
 487        if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
 488                unsigned char version = nat_get_version(e);
 489
 490                nat_set_version(e, inc_node_version(version));
 491        }
 492
 493        /* change address */
 494        nat_set_blkaddr(e, new_blkaddr);
 495        if (!__is_valid_data_blkaddr(new_blkaddr))
 496                set_nat_flag(e, IS_CHECKPOINTED, false);
 497        __set_nat_cache_dirty(nm_i, e);
 498
 499        /* update fsync_mark if its inode nat entry is still alive */
 500        if (ni->nid != ni->ino)
 501                e = __lookup_nat_cache(nm_i, ni->ino);
 502        if (e) {
 503                if (fsync_done && ni->nid == ni->ino)
 504                        set_nat_flag(e, HAS_FSYNCED_INODE, true);
 505                set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
 506        }
 507        f2fs_up_write(&nm_i->nat_tree_lock);
 508}
 509
 510int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
 511{
 512        struct f2fs_nm_info *nm_i = NM_I(sbi);
 513        int nr = nr_shrink;
 514
 515        if (!f2fs_down_write_trylock(&nm_i->nat_tree_lock))
 516                return 0;
 517
 518        spin_lock(&nm_i->nat_list_lock);
 519        while (nr_shrink) {
 520                struct nat_entry *ne;
 521
 522                if (list_empty(&nm_i->nat_entries))
 523                        break;
 524
 525                ne = list_first_entry(&nm_i->nat_entries,
 526                                        struct nat_entry, list);
 527                list_del(&ne->list);
 528                spin_unlock(&nm_i->nat_list_lock);
 529
 530                __del_from_nat_cache(nm_i, ne);
 531                nr_shrink--;
 532
 533                spin_lock(&nm_i->nat_list_lock);
 534        }
 535        spin_unlock(&nm_i->nat_list_lock);
 536
 537        f2fs_up_write(&nm_i->nat_tree_lock);
 538        return nr - nr_shrink;
 539}
 540
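/*
 * Fill @ni for @nid by checking, in order: the in-memory nat cache, the NAT
 * journal kept in the hot-data current segment summary, and finally the
 * on-disk NAT block.  Entries found in the journal or on disk are cached
 * for later lookups.
 */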
 541int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
 542                                struct node_info *ni, bool checkpoint_context)
 543{
 544        struct f2fs_nm_info *nm_i = NM_I(sbi);
 545        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
 546        struct f2fs_journal *journal = curseg->journal;
 547        nid_t start_nid = START_NID(nid);
 548        struct f2fs_nat_block *nat_blk;
 549        struct page *page = NULL;
 550        struct f2fs_nat_entry ne;
 551        struct nat_entry *e;
 552        pgoff_t index;
 553        block_t blkaddr;
 554        int i;
 555
 556        ni->nid = nid;
 557retry:
 558        /* Check nat cache */
 559        f2fs_down_read(&nm_i->nat_tree_lock);
 560        e = __lookup_nat_cache(nm_i, nid);
 561        if (e) {
 562                ni->ino = nat_get_ino(e);
 563                ni->blk_addr = nat_get_blkaddr(e);
 564                ni->version = nat_get_version(e);
 565                f2fs_up_read(&nm_i->nat_tree_lock);
 566                return 0;
 567        }
 568
 569        /*
  570         * Check the current segment summary by trying to grab journal_rwsem first.
  571         * This sem is on the critical path of checkpoint, which requires the above
  572         * nat_tree_lock. Therefore, if we fail to grab it here, retry instead of
  573         * making the checkpoint wait.
 574         */
 575        if (!f2fs_rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) {
 576                down_read(&curseg->journal_rwsem);
 577        } else if (f2fs_rwsem_is_contended(&nm_i->nat_tree_lock) ||
 578                                !down_read_trylock(&curseg->journal_rwsem)) {
 579                f2fs_up_read(&nm_i->nat_tree_lock);
 580                goto retry;
 581        }
 582
 583        i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
 584        if (i >= 0) {
 585                ne = nat_in_journal(journal, i);
 586                node_info_from_raw_nat(ni, &ne);
 587        }
 588        up_read(&curseg->journal_rwsem);
 589        if (i >= 0) {
 590                f2fs_up_read(&nm_i->nat_tree_lock);
 591                goto cache;
 592        }
 593
 594        /* Fill node_info from nat page */
 595        index = current_nat_addr(sbi, nid);
 596        f2fs_up_read(&nm_i->nat_tree_lock);
 597
 598        page = f2fs_get_meta_page(sbi, index);
 599        if (IS_ERR(page))
 600                return PTR_ERR(page);
 601
 602        nat_blk = (struct f2fs_nat_block *)page_address(page);
 603        ne = nat_blk->entries[nid - start_nid];
 604        node_info_from_raw_nat(ni, &ne);
 605        f2fs_put_page(page, 1);
 606cache:
 607        blkaddr = le32_to_cpu(ne.block_addr);
 608        if (__is_valid_data_blkaddr(blkaddr) &&
 609                !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE))
 610                return -EFAULT;
 611
 612        /* cache nat entry */
 613        cache_nat_entry(sbi, nid, &ne);
 614        return 0;
 615}
 616
 617/*
  618 * Readahead up to @n sibling node pages of @parent, starting at slot @start.
 619 */
 620static void f2fs_ra_node_pages(struct page *parent, int start, int n)
 621{
 622        struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
 623        struct blk_plug plug;
 624        int i, end;
 625        nid_t nid;
 626
 627        blk_start_plug(&plug);
 628
 629        /* Then, try readahead for siblings of the desired node */
 630        end = start + n;
 631        end = min(end, NIDS_PER_BLOCK);
 632        for (i = start; i < end; i++) {
 633                nid = get_nid(parent, i, false);
 634                f2fs_ra_node_page(sbi, nid);
 635        }
 636
 637        blk_finish_plug(&plug);
 638}
 639
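/*
 * After f2fs_get_dnode_of_data() stopped at a missing node (dn->cur_level),
 * return the first file offset lying beyond the hole covered by that missing
 * subtree, so the caller can skip the whole hole at once.
 */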
 640pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
 641{
 642        const long direct_index = ADDRS_PER_INODE(dn->inode);
 643        const long direct_blks = ADDRS_PER_BLOCK(dn->inode);
 644        const long indirect_blks = ADDRS_PER_BLOCK(dn->inode) * NIDS_PER_BLOCK;
 645        unsigned int skipped_unit = ADDRS_PER_BLOCK(dn->inode);
 646        int cur_level = dn->cur_level;
 647        int max_level = dn->max_level;
 648        pgoff_t base = 0;
 649
 650        if (!dn->max_level)
 651                return pgofs + 1;
 652
 653        while (max_level-- > cur_level)
 654                skipped_unit *= NIDS_PER_BLOCK;
 655
 656        switch (dn->max_level) {
 657        case 3:
 658                base += 2 * indirect_blks;
 659                fallthrough;
 660        case 2:
 661                base += 2 * direct_blks;
 662                fallthrough;
 663        case 1:
 664                base += direct_index;
 665                break;
 666        default:
 667                f2fs_bug_on(F2FS_I_SB(dn->inode), 1);
 668        }
 669
 670        return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base;
 671}
 672
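/*
 * A file's logical blocks are addressed in this order:
 *   - direct pointers in the inode (ADDRS_PER_INODE),
 *   - two direct node blocks (NODE_DIR1_BLOCK / NODE_DIR2_BLOCK),
 *   - two indirect node blocks (NODE_IND1_BLOCK / NODE_IND2_BLOCK),
 *   - one double-indirect node block (NODE_DIND_BLOCK).
 * get_node_path() translates @block into the slot to follow at each level
 * (offset[]) and the logical node offsets (noffset[]), returning the depth.
 */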
 673/*
 674 * The maximum depth is four.
 675 * Offset[0] will have raw inode offset.
 676 */
 677static int get_node_path(struct inode *inode, long block,
 678                                int offset[4], unsigned int noffset[4])
 679{
 680        const long direct_index = ADDRS_PER_INODE(inode);
 681        const long direct_blks = ADDRS_PER_BLOCK(inode);
 682        const long dptrs_per_blk = NIDS_PER_BLOCK;
 683        const long indirect_blks = ADDRS_PER_BLOCK(inode) * NIDS_PER_BLOCK;
 684        const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
 685        int n = 0;
 686        int level = 0;
 687
 688        noffset[0] = 0;
 689
 690        if (block < direct_index) {
 691                offset[n] = block;
 692                goto got;
 693        }
 694        block -= direct_index;
 695        if (block < direct_blks) {
 696                offset[n++] = NODE_DIR1_BLOCK;
 697                noffset[n] = 1;
 698                offset[n] = block;
 699                level = 1;
 700                goto got;
 701        }
 702        block -= direct_blks;
 703        if (block < direct_blks) {
 704                offset[n++] = NODE_DIR2_BLOCK;
 705                noffset[n] = 2;
 706                offset[n] = block;
 707                level = 1;
 708                goto got;
 709        }
 710        block -= direct_blks;
 711        if (block < indirect_blks) {
 712                offset[n++] = NODE_IND1_BLOCK;
 713                noffset[n] = 3;
 714                offset[n++] = block / direct_blks;
 715                noffset[n] = 4 + offset[n - 1];
 716                offset[n] = block % direct_blks;
 717                level = 2;
 718                goto got;
 719        }
 720        block -= indirect_blks;
 721        if (block < indirect_blks) {
 722                offset[n++] = NODE_IND2_BLOCK;
 723                noffset[n] = 4 + dptrs_per_blk;
 724                offset[n++] = block / direct_blks;
 725                noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
 726                offset[n] = block % direct_blks;
 727                level = 2;
 728                goto got;
 729        }
 730        block -= indirect_blks;
 731        if (block < dindirect_blks) {
 732                offset[n++] = NODE_DIND_BLOCK;
 733                noffset[n] = 5 + (dptrs_per_blk * 2);
 734                offset[n++] = block / indirect_blks;
 735                noffset[n] = 6 + (dptrs_per_blk * 2) +
 736                              offset[n - 1] * (dptrs_per_blk + 1);
 737                offset[n++] = (block / direct_blks) % dptrs_per_blk;
 738                noffset[n] = 7 + (dptrs_per_blk * 2) +
 739                              offset[n - 2] * (dptrs_per_blk + 1) +
 740                              offset[n - 1];
 741                offset[n] = block % direct_blks;
 742                level = 3;
 743                goto got;
 744        } else {
 745                return -E2BIG;
 746        }
 747got:
 748        return level;
 749}
 750
 751/*
 752 * Caller should call f2fs_put_dnode(dn).
 753 * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
 754 * f2fs_unlock_op() only if mode is set with ALLOC_NODE.
 755 */
 756int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
 757{
 758        struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 759        struct page *npage[4];
 760        struct page *parent = NULL;
 761        int offset[4];
 762        unsigned int noffset[4];
 763        nid_t nids[4];
 764        int level, i = 0;
 765        int err = 0;
 766
 767        level = get_node_path(dn->inode, index, offset, noffset);
 768        if (level < 0)
 769                return level;
 770
 771        nids[0] = dn->inode->i_ino;
 772        npage[0] = dn->inode_page;
 773
 774        if (!npage[0]) {
 775                npage[0] = f2fs_get_node_page(sbi, nids[0]);
 776                if (IS_ERR(npage[0]))
 777                        return PTR_ERR(npage[0]);
 778        }
 779
 780        /* if inline_data is set, should not report any block indices */
 781        if (f2fs_has_inline_data(dn->inode) && index) {
 782                err = -ENOENT;
 783                f2fs_put_page(npage[0], 1);
 784                goto release_out;
 785        }
 786
 787        parent = npage[0];
 788        if (level != 0)
 789                nids[1] = get_nid(parent, offset[0], true);
 790        dn->inode_page = npage[0];
 791        dn->inode_page_locked = true;
 792
 793        /* get indirect or direct nodes */
 794        for (i = 1; i <= level; i++) {
 795                bool done = false;
 796
 797                if (!nids[i] && mode == ALLOC_NODE) {
 798                        /* alloc new node */
 799                        if (!f2fs_alloc_nid(sbi, &(nids[i]))) {
 800                                err = -ENOSPC;
 801                                goto release_pages;
 802                        }
 803
 804                        dn->nid = nids[i];
 805                        npage[i] = f2fs_new_node_page(dn, noffset[i]);
 806                        if (IS_ERR(npage[i])) {
 807                                f2fs_alloc_nid_failed(sbi, nids[i]);
 808                                err = PTR_ERR(npage[i]);
 809                                goto release_pages;
 810                        }
 811
 812                        set_nid(parent, offset[i - 1], nids[i], i == 1);
 813                        f2fs_alloc_nid_done(sbi, nids[i]);
 814                        done = true;
 815                } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) {
 816                        npage[i] = f2fs_get_node_page_ra(parent, offset[i - 1]);
 817                        if (IS_ERR(npage[i])) {
 818                                err = PTR_ERR(npage[i]);
 819                                goto release_pages;
 820                        }
 821                        done = true;
 822                }
 823                if (i == 1) {
 824                        dn->inode_page_locked = false;
 825                        unlock_page(parent);
 826                } else {
 827                        f2fs_put_page(parent, 1);
 828                }
 829
 830                if (!done) {
 831                        npage[i] = f2fs_get_node_page(sbi, nids[i]);
 832                        if (IS_ERR(npage[i])) {
 833                                err = PTR_ERR(npage[i]);
 834                                f2fs_put_page(npage[0], 0);
 835                                goto release_out;
 836                        }
 837                }
 838                if (i < level) {
 839                        parent = npage[i];
 840                        nids[i + 1] = get_nid(parent, offset[i], false);
 841                }
 842        }
 843        dn->nid = nids[level];
 844        dn->ofs_in_node = offset[level];
 845        dn->node_page = npage[level];
 846        dn->data_blkaddr = f2fs_data_blkaddr(dn);
 847
 848        if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) &&
 849                                        f2fs_sb_has_readonly(sbi)) {
 850                unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn);
 851                block_t blkaddr;
 852
 853                if (!c_len)
 854                        goto out;
 855
 856                blkaddr = f2fs_data_blkaddr(dn);
 857                if (blkaddr == COMPRESS_ADDR)
 858                        blkaddr = data_blkaddr(dn->inode, dn->node_page,
 859                                                dn->ofs_in_node + 1);
 860
 861                f2fs_update_extent_tree_range_compressed(dn->inode,
 862                                        index, blkaddr,
 863                                        F2FS_I(dn->inode)->i_cluster_size,
 864                                        c_len);
 865        }
 866out:
 867        return 0;
 868
 869release_pages:
 870        f2fs_put_page(parent, 1);
 871        if (i > 1)
 872                f2fs_put_page(npage[0], 0);
 873release_out:
 874        dn->inode_page = NULL;
 875        dn->node_page = NULL;
 876        if (err == -ENOENT) {
 877                dn->cur_level = i;
 878                dn->max_level = level;
 879                dn->ofs_in_node = offset[level];
 880        }
 881        return err;
 882}
 883
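/*
 * Release the node held in @dn->node_page: invalidate its block address,
 * drop the valid node count, clear the page and evict it from the node
 * mapping.  An inode node additionally drops its orphan entry and the valid
 * inode count.
 */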
 884static int truncate_node(struct dnode_of_data *dn)
 885{
 886        struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 887        struct node_info ni;
 888        int err;
 889        pgoff_t index;
 890
 891        err = f2fs_get_node_info(sbi, dn->nid, &ni, false);
 892        if (err)
 893                return err;
 894
 895        /* Deallocate node address */
 896        f2fs_invalidate_blocks(sbi, ni.blk_addr);
 897        dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino);
 898        set_node_addr(sbi, &ni, NULL_ADDR, false);
 899
 900        if (dn->nid == dn->inode->i_ino) {
 901                f2fs_remove_orphan_inode(sbi, dn->nid);
 902                dec_valid_inode_count(sbi);
 903                f2fs_inode_synced(dn->inode);
 904        }
 905
 906        clear_node_page_dirty(dn->node_page);
 907        set_sbi_flag(sbi, SBI_IS_DIRTY);
 908
 909        index = dn->node_page->index;
 910        f2fs_put_page(dn->node_page, 1);
 911
 912        invalidate_mapping_pages(NODE_MAPPING(sbi),
 913                        index, index);
 914
 915        dn->node_page = NULL;
 916        trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
 917
 918        return 0;
 919}
 920
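/*
 * Free one direct node: truncate all data blocks it addresses, then the node
 * itself.  Returns 1 on success (also when the node is already gone), or a
 * negative errno.
 */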
 921static int truncate_dnode(struct dnode_of_data *dn)
 922{
 923        struct page *page;
 924        int err;
 925
 926        if (dn->nid == 0)
 927                return 1;
 928
 929        /* get direct node */
 930        page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid);
 931        if (PTR_ERR(page) == -ENOENT)
 932                return 1;
 933        else if (IS_ERR(page))
 934                return PTR_ERR(page);
 935
 936        /* Make dnode_of_data for parameter */
 937        dn->node_page = page;
 938        dn->ofs_in_node = 0;
 939        f2fs_truncate_data_blocks(dn);
 940        err = truncate_node(dn);
 941        if (err)
 942                return err;
 943
 944        return 1;
 945}
 946
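/*
 * Recursively free the children of an indirect (depth == 2) or
 * double-indirect (depth == 3) node, starting from child slot @ofs, and the
 * node itself when @ofs is 0.  Returns the number of node offsets covered,
 * which callers use to advance their node offset, or a negative errno.
 */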
 947static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
 948                                                int ofs, int depth)
 949{
 950        struct dnode_of_data rdn = *dn;
 951        struct page *page;
 952        struct f2fs_node *rn;
 953        nid_t child_nid;
 954        unsigned int child_nofs;
 955        int freed = 0;
 956        int i, ret;
 957
 958        if (dn->nid == 0)
 959                return NIDS_PER_BLOCK + 1;
 960
 961        trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
 962
 963        page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid);
 964        if (IS_ERR(page)) {
 965                trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
 966                return PTR_ERR(page);
 967        }
 968
 969        f2fs_ra_node_pages(page, ofs, NIDS_PER_BLOCK);
 970
 971        rn = F2FS_NODE(page);
 972        if (depth < 3) {
 973                for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
 974                        child_nid = le32_to_cpu(rn->in.nid[i]);
 975                        if (child_nid == 0)
 976                                continue;
 977                        rdn.nid = child_nid;
 978                        ret = truncate_dnode(&rdn);
 979                        if (ret < 0)
 980                                goto out_err;
 981                        if (set_nid(page, i, 0, false))
 982                                dn->node_changed = true;
 983                }
 984        } else {
 985                child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
 986                for (i = ofs; i < NIDS_PER_BLOCK; i++) {
 987                        child_nid = le32_to_cpu(rn->in.nid[i]);
 988                        if (child_nid == 0) {
 989                                child_nofs += NIDS_PER_BLOCK + 1;
 990                                continue;
 991                        }
 992                        rdn.nid = child_nid;
 993                        ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
 994                        if (ret == (NIDS_PER_BLOCK + 1)) {
 995                                if (set_nid(page, i, 0, false))
 996                                        dn->node_changed = true;
 997                                child_nofs += ret;
 998                        } else if (ret < 0 && ret != -ENOENT) {
 999                                goto out_err;
1000                        }
1001                }
1002                freed = child_nofs;
1003        }
1004
1005        if (!ofs) {
1006                /* remove current indirect node */
1007                dn->node_page = page;
1008                ret = truncate_node(dn);
1009                if (ret)
1010                        goto out_err;
1011                freed++;
1012        } else {
1013                f2fs_put_page(page, 1);
1014        }
1015        trace_f2fs_truncate_nodes_exit(dn->inode, freed);
1016        return freed;
1017
1018out_err:
1019        f2fs_put_page(page, 1);
1020        trace_f2fs_truncate_nodes_exit(dn->inode, ret);
1021        return ret;
1022}
1023
1024static int truncate_partial_nodes(struct dnode_of_data *dn,
1025                        struct f2fs_inode *ri, int *offset, int depth)
1026{
1027        struct page *pages[2];
1028        nid_t nid[3];
1029        nid_t child_nid;
1030        int err = 0;
1031        int i;
1032        int idx = depth - 2;
1033
1034        nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
1035        if (!nid[0])
1036                return 0;
1037
1038        /* get indirect nodes in the path */
1039        for (i = 0; i < idx + 1; i++) {
 1040                /* the reference count will be increased */
1041                pages[i] = f2fs_get_node_page(F2FS_I_SB(dn->inode), nid[i]);
1042                if (IS_ERR(pages[i])) {
1043                        err = PTR_ERR(pages[i]);
1044                        idx = i - 1;
1045                        goto fail;
1046                }
1047                nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
1048        }
1049
1050        f2fs_ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK);
1051
1052        /* free direct nodes linked to a partial indirect node */
1053        for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
1054                child_nid = get_nid(pages[idx], i, false);
1055                if (!child_nid)
1056                        continue;
1057                dn->nid = child_nid;
1058                err = truncate_dnode(dn);
1059                if (err < 0)
1060                        goto fail;
1061                if (set_nid(pages[idx], i, 0, false))
1062                        dn->node_changed = true;
1063        }
1064
1065        if (offset[idx + 1] == 0) {
1066                dn->node_page = pages[idx];
1067                dn->nid = nid[idx];
1068                err = truncate_node(dn);
1069                if (err)
1070                        goto fail;
1071        } else {
1072                f2fs_put_page(pages[idx], 1);
1073        }
1074        offset[idx]++;
1075        offset[idx + 1] = 0;
1076        idx--;
1077fail:
1078        for (i = idx; i >= 0; i--)
1079                f2fs_put_page(pages[i], 1);
1080
1081        trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
1082
1083        return err;
1084}
1085
1086/*
1087 * All the block addresses of data and nodes should be nullified.
1088 */
1089int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from)
1090{
1091        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
1092        int err = 0, cont = 1;
1093        int level, offset[4], noffset[4];
1094        unsigned int nofs = 0;
1095        struct f2fs_inode *ri;
1096        struct dnode_of_data dn;
1097        struct page *page;
1098
1099        trace_f2fs_truncate_inode_blocks_enter(inode, from);
1100
1101        level = get_node_path(inode, from, offset, noffset);
1102        if (level < 0) {
1103                trace_f2fs_truncate_inode_blocks_exit(inode, level);
1104                return level;
1105        }
1106
1107        page = f2fs_get_node_page(sbi, inode->i_ino);
1108        if (IS_ERR(page)) {
1109                trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
1110                return PTR_ERR(page);
1111        }
1112
1113        set_new_dnode(&dn, inode, page, NULL, 0);
1114        unlock_page(page);
1115
1116        ri = F2FS_INODE(page);
1117        switch (level) {
1118        case 0:
1119        case 1:
1120                nofs = noffset[1];
1121                break;
1122        case 2:
1123                nofs = noffset[1];
1124                if (!offset[level - 1])
1125                        goto skip_partial;
1126                err = truncate_partial_nodes(&dn, ri, offset, level);
1127                if (err < 0 && err != -ENOENT)
1128                        goto fail;
1129                nofs += 1 + NIDS_PER_BLOCK;
1130                break;
1131        case 3:
1132                nofs = 5 + 2 * NIDS_PER_BLOCK;
1133                if (!offset[level - 1])
1134                        goto skip_partial;
1135                err = truncate_partial_nodes(&dn, ri, offset, level);
1136                if (err < 0 && err != -ENOENT)
1137                        goto fail;
1138                break;
1139        default:
1140                BUG();
1141        }
1142
1143skip_partial:
1144        while (cont) {
1145                dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
1146                switch (offset[0]) {
1147                case NODE_DIR1_BLOCK:
1148                case NODE_DIR2_BLOCK:
1149                        err = truncate_dnode(&dn);
1150                        break;
1151
1152                case NODE_IND1_BLOCK:
1153                case NODE_IND2_BLOCK:
1154                        err = truncate_nodes(&dn, nofs, offset[1], 2);
1155                        break;
1156
1157                case NODE_DIND_BLOCK:
1158                        err = truncate_nodes(&dn, nofs, offset[1], 3);
1159                        cont = 0;
1160                        break;
1161
1162                default:
1163                        BUG();
1164                }
1165                if (err < 0 && err != -ENOENT)
1166                        goto fail;
1167                if (offset[1] == 0 &&
1168                                ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
1169                        lock_page(page);
1170                        BUG_ON(page->mapping != NODE_MAPPING(sbi));
1171                        f2fs_wait_on_page_writeback(page, NODE, true, true);
1172                        ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
1173                        set_page_dirty(page);
1174                        unlock_page(page);
1175                }
1176                offset[1] = 0;
1177                offset[0]++;
1178                nofs += err;
1179        }
1180fail:
1181        f2fs_put_page(page, 0);
1182        trace_f2fs_truncate_inode_blocks_exit(inode, err);
1183        return err > 0 ? 0 : err;
1184}
1185
1186/* caller must lock inode page */
1187int f2fs_truncate_xattr_node(struct inode *inode)
1188{
1189        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
1190        nid_t nid = F2FS_I(inode)->i_xattr_nid;
1191        struct dnode_of_data dn;
1192        struct page *npage;
1193        int err;
1194
1195        if (!nid)
1196                return 0;
1197
1198        npage = f2fs_get_node_page(sbi, nid);
1199        if (IS_ERR(npage))
1200                return PTR_ERR(npage);
1201
1202        set_new_dnode(&dn, inode, NULL, npage, nid);
1203        err = truncate_node(&dn);
1204        if (err) {
1205                f2fs_put_page(npage, 1);
1206                return err;
1207        }
1208
1209        f2fs_i_xnid_write(inode, 0);
1210
1211        return 0;
1212}
1213
1214/*
1215 * Caller should grab and release a rwsem by calling f2fs_lock_op() and
1216 * f2fs_unlock_op().
1217 */
1218int f2fs_remove_inode_page(struct inode *inode)
1219{
1220        struct dnode_of_data dn;
1221        int err;
1222
1223        set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
1224        err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE);
1225        if (err)
1226                return err;
1227
1228        err = f2fs_truncate_xattr_node(inode);
1229        if (err) {
1230                f2fs_put_dnode(&dn);
1231                return err;
1232        }
1233
1234        /* remove potential inline_data blocks */
1235        if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1236                                S_ISLNK(inode->i_mode))
1237                f2fs_truncate_data_blocks_range(&dn, 1);
1238
 1239        /* i_blocks == 0 is possible, after f2fs_new_inode() has failed */
1240        if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) {
1241                f2fs_put_dnode(&dn);
1242                return -EIO;
1243        }
1244
1245        if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) {
1246                f2fs_warn(F2FS_I_SB(inode),
1247                        "f2fs_remove_inode_page: inconsistent i_blocks, ino:%lu, iblocks:%llu",
1248                        inode->i_ino, (unsigned long long)inode->i_blocks);
1249                set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
1250        }
1251
1252        /* will put inode & node pages */
1253        err = truncate_node(&dn);
1254        if (err) {
1255                f2fs_put_dnode(&dn);
1256                return err;
1257        }
1258        return 0;
1259}
1260
1261struct page *f2fs_new_inode_page(struct inode *inode)
1262{
1263        struct dnode_of_data dn;
1264
1265        /* allocate inode page for new inode */
1266        set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
1267
1268        /* caller should f2fs_put_page(page, 1); */
1269        return f2fs_new_node_page(&dn, 0);
1270}
1271
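/*
 * Grab a page for a brand-new node at node offset @ofs of dn->inode:
 * reserve a valid node count, point the cached nat entry at NEW_ADDR and
 * return the locked, dirty page.  ofs == 0 creates the inode page itself.
 */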
1272struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
1273{
1274        struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
1275        struct node_info new_ni;
1276        struct page *page;
1277        int err;
1278
1279        if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
1280                return ERR_PTR(-EPERM);
1281
1282        page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false);
1283        if (!page)
1284                return ERR_PTR(-ENOMEM);
1285
1286        if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs))))
1287                goto fail;
1288
1289#ifdef CONFIG_F2FS_CHECK_FS
1290        err = f2fs_get_node_info(sbi, dn->nid, &new_ni, false);
1291        if (err) {
1292                dec_valid_node_count(sbi, dn->inode, !ofs);
1293                goto fail;
1294        }
1295        f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR);
1296#endif
1297        new_ni.nid = dn->nid;
1298        new_ni.ino = dn->inode->i_ino;
1299        new_ni.blk_addr = NULL_ADDR;
1300        new_ni.flag = 0;
1301        new_ni.version = 0;
1302        set_node_addr(sbi, &new_ni, NEW_ADDR, false);
1303
1304        f2fs_wait_on_page_writeback(page, NODE, true, true);
1305        fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
1306        set_cold_node(page, S_ISDIR(dn->inode->i_mode));
1307        if (!PageUptodate(page))
1308                SetPageUptodate(page);
1309        if (set_page_dirty(page))
1310                dn->node_changed = true;
1311
1312        if (f2fs_has_xattr_block(ofs))
1313                f2fs_i_xnid_write(dn->inode, dn->nid);
1314
1315        if (ofs == 0)
1316                inc_valid_inode_count(sbi);
1317        return page;
1318
1319fail:
1320        clear_node_page_dirty(page);
1321        f2fs_put_page(page, 1);
1322        return ERR_PTR(err);
1323}
1324
1325/*
 1326 * Depending on the return value, the caller should do the following:
1327 * 0: f2fs_put_page(page, 0)
1328 * LOCKED_PAGE or error: f2fs_put_page(page, 1)
1329 */
1330static int read_node_page(struct page *page, int op_flags)
1331{
1332        struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1333        struct node_info ni;
1334        struct f2fs_io_info fio = {
1335                .sbi = sbi,
1336                .type = NODE,
1337                .op = REQ_OP_READ,
1338                .op_flags = op_flags,
1339                .page = page,
1340                .encrypted_page = NULL,
1341        };
1342        int err;
1343
1344        if (PageUptodate(page)) {
1345                if (!f2fs_inode_chksum_verify(sbi, page)) {
1346                        ClearPageUptodate(page);
1347                        return -EFSBADCRC;
1348                }
1349                return LOCKED_PAGE;
1350        }
1351
1352        err = f2fs_get_node_info(sbi, page->index, &ni, false);
1353        if (err)
1354                return err;
1355
1356        /* NEW_ADDR can be seen, after cp_error drops some dirty node pages */
1357        if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) ||
1358                        is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) {
1359                ClearPageUptodate(page);
1360                return -ENOENT;
1361        }
1362
1363        fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr;
1364
1365        err = f2fs_submit_page_bio(&fio);
1366
1367        if (!err)
1368                f2fs_update_iostat(sbi, FS_NODE_READ_IO, F2FS_BLKSIZE);
1369
1370        return err;
1371}
1372
1373/*
1374 * Readahead a node page
1375 */
1376void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
1377{
1378        struct page *apage;
1379        int err;
1380
1381        if (!nid)
1382                return;
1383        if (f2fs_check_nid_range(sbi, nid))
1384                return;
1385
1386        apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid);
1387        if (apage)
1388                return;
1389
1390        apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
1391        if (!apage)
1392                return;
1393
1394        err = read_node_page(apage, REQ_RAHEAD);
1395        f2fs_put_page(apage, err ? 1 : 0);
1396}
1397
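/*
 * Read the node page of @nid into the node mapping (optionally reading ahead
 * siblings when @parent is given), then check the inode checksum and that
 * the footer nid matches before returning the locked page.
 */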
1398static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
1399                                        struct page *parent, int start)
1400{
1401        struct page *page;
1402        int err;
1403
1404        if (!nid)
1405                return ERR_PTR(-ENOENT);
1406        if (f2fs_check_nid_range(sbi, nid))
1407                return ERR_PTR(-EINVAL);
1408repeat:
1409        page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
1410        if (!page)
1411                return ERR_PTR(-ENOMEM);
1412
1413        err = read_node_page(page, 0);
1414        if (err < 0) {
1415                goto out_put_err;
1416        } else if (err == LOCKED_PAGE) {
1417                err = 0;
1418                goto page_hit;
1419        }
1420
1421        if (parent)
1422                f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE);
1423
1424        lock_page(page);
1425
1426        if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1427                f2fs_put_page(page, 1);
1428                goto repeat;
1429        }
1430
1431        if (unlikely(!PageUptodate(page))) {
1432                err = -EIO;
1433                goto out_err;
1434        }
1435
1436        if (!f2fs_inode_chksum_verify(sbi, page)) {
1437                err = -EFSBADCRC;
1438                goto out_err;
1439        }
1440page_hit:
1441        if (likely(nid == nid_of_node(page)))
1442                return page;
1443
1444        f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
1445                          nid, nid_of_node(page), ino_of_node(page),
1446                          ofs_of_node(page), cpver_of_node(page),
1447                          next_blkaddr_of_node(page));
1448        set_sbi_flag(sbi, SBI_NEED_FSCK);
1449        err = -EINVAL;
1450out_err:
1451        ClearPageUptodate(page);
1452out_put_err:
 1453        /* -ENOENT from read_node_page is not an error. */
1454        if (err != -ENOENT)
1455                f2fs_handle_page_eio(sbi, page->index, NODE);
1456        f2fs_put_page(page, 1);
1457        return ERR_PTR(err);
1458}
1459
1460struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
1461{
1462        return __get_node_page(sbi, nid, NULL, 0);
1463}
1464
1465struct page *f2fs_get_node_page_ra(struct page *parent, int start)
1466{
1467        struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
1468        nid_t nid = get_nid(parent, start, false);
1469
1470        return __get_node_page(sbi, nid, parent, start);
1471}
1472
1473static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
1474{
1475        struct inode *inode;
1476        struct page *page;
1477        int ret;
1478
1479        /* should flush inline_data before evict_inode */
1480        inode = ilookup(sbi->sb, ino);
1481        if (!inode)
1482                return;
1483
1484        page = f2fs_pagecache_get_page(inode->i_mapping, 0,
1485                                        FGP_LOCK|FGP_NOWAIT, 0);
1486        if (!page)
1487                goto iput_out;
1488
1489        if (!PageUptodate(page))
1490                goto page_out;
1491
1492        if (!PageDirty(page))
1493                goto page_out;
1494
1495        if (!clear_page_dirty_for_io(page))
1496                goto page_out;
1497
1498        ret = f2fs_write_inline_data(inode, page);
1499        inode_dec_dirty_pages(inode);
1500        f2fs_remove_dirty_inode(inode);
1501        if (ret)
1502                set_page_dirty(page);
1503page_out:
1504        f2fs_put_page(page, 1);
1505iput_out:
1506        iput(inode);
1507}
1508
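/*
 * Scan the dirty node pages of the node mapping and return a reference to
 * the last dirty direct node page belonging to @ino, NULL if there is none,
 * or an ERR_PTR on checkpoint error.
 */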
1509static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
1510{
1511        pgoff_t index;
1512        struct pagevec pvec;
1513        struct page *last_page = NULL;
1514        int nr_pages;
1515
1516        pagevec_init(&pvec);
1517        index = 0;
1518
1519        while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1520                                PAGECACHE_TAG_DIRTY))) {
1521                int i;
1522
1523                for (i = 0; i < nr_pages; i++) {
1524                        struct page *page = pvec.pages[i];
1525
1526                        if (unlikely(f2fs_cp_error(sbi))) {
1527                                f2fs_put_page(last_page, 0);
1528                                pagevec_release(&pvec);
1529                                return ERR_PTR(-EIO);
1530                        }
1531
1532                        if (!IS_DNODE(page) || !is_cold_node(page))
1533                                continue;
1534                        if (ino_of_node(page) != ino)
1535                                continue;
1536
1537                        lock_page(page);
1538
1539                        if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1540continue_unlock:
1541                                unlock_page(page);
1542                                continue;
1543                        }
1544                        if (ino_of_node(page) != ino)
1545                                goto continue_unlock;
1546
1547                        if (!PageDirty(page)) {
1548                                /* someone wrote it for us */
1549                                goto continue_unlock;
1550                        }
1551
1552                        if (last_page)
1553                                f2fs_put_page(last_page, 0);
1554
1555                        get_page(page);
1556                        last_page = page;
1557                        unlock_page(page);
1558                }
1559                pagevec_release(&pvec);
1560                cond_resched();
1561        }
1562        return last_page;
1563}
1564
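/*
 * Write one dirty node page under sbi->node_write: look up the old block
 * address in the NAT, bail out if the page was truncated (NULL_ADDR), issue
 * the write via f2fs_do_write_node_page() and record the new block address
 * with set_node_addr().  Returns 0, or AOP_WRITEPAGE_ACTIVATE after the page
 * has been redirtied.
 */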
1565static int __write_node_page(struct page *page, bool atomic, bool *submitted,
1566                                struct writeback_control *wbc, bool do_balance,
1567                                enum iostat_type io_type, unsigned int *seq_id)
1568{
1569        struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1570        nid_t nid;
1571        struct node_info ni;
1572        struct f2fs_io_info fio = {
1573                .sbi = sbi,
1574                .ino = ino_of_node(page),
1575                .type = NODE,
1576                .op = REQ_OP_WRITE,
1577                .op_flags = wbc_to_write_flags(wbc),
1578                .page = page,
1579                .encrypted_page = NULL,
1580                .submitted = false,
1581                .io_type = io_type,
1582                .io_wbc = wbc,
1583        };
1584        unsigned int seq;
1585
1586        trace_f2fs_writepage(page, NODE);
1587
1588        if (unlikely(f2fs_cp_error(sbi))) {
1589                ClearPageUptodate(page);
1590                dec_page_count(sbi, F2FS_DIRTY_NODES);
1591                unlock_page(page);
1592                return 0;
1593        }
1594
1595        if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
1596                goto redirty_out;
1597
1598        if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
1599                        wbc->sync_mode == WB_SYNC_NONE &&
1600                        IS_DNODE(page) && is_cold_node(page))
1601                goto redirty_out;
1602
1603        /* get old block addr of this node page */
1604        nid = nid_of_node(page);
1605        f2fs_bug_on(sbi, page->index != nid);
1606
1607        if (f2fs_get_node_info(sbi, nid, &ni, !do_balance))
1608                goto redirty_out;
1609
1610        if (wbc->for_reclaim) {
1611                if (!f2fs_down_read_trylock(&sbi->node_write))
1612                        goto redirty_out;
1613        } else {
1614                f2fs_down_read(&sbi->node_write);
1615        }
1616
1617        /* This page is already truncated */
1618        if (unlikely(ni.blk_addr == NULL_ADDR)) {
1619                ClearPageUptodate(page);
1620                dec_page_count(sbi, F2FS_DIRTY_NODES);
1621                f2fs_up_read(&sbi->node_write);
1622                unlock_page(page);
1623                return 0;
1624        }
1625
1626        if (__is_valid_data_blkaddr(ni.blk_addr) &&
1627                !f2fs_is_valid_blkaddr(sbi, ni.blk_addr,
1628                                        DATA_GENERIC_ENHANCE)) {
1629                f2fs_up_read(&sbi->node_write);
1630                goto redirty_out;
1631        }
1632
1633        if (atomic && !test_opt(sbi, NOBARRIER) && !f2fs_sb_has_blkzoned(sbi))
1634                fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
1635
1636        /* should add to global list before clearing PAGECACHE status */
1637        if (f2fs_in_warm_node_list(sbi, page)) {
1638                seq = f2fs_add_fsync_node_entry(sbi, page);
1639                if (seq_id)
1640                        *seq_id = seq;
1641        }
1642
1643        set_page_writeback(page);
1644        ClearPageError(page);
1645
1646        fio.old_blkaddr = ni.blk_addr;
1647        f2fs_do_write_node_page(nid, &fio);
1648        set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
1649        dec_page_count(sbi, F2FS_DIRTY_NODES);
1650        f2fs_up_read(&sbi->node_write);
1651
1652        if (wbc->for_reclaim) {
1653                f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE);
1654                submitted = NULL;
1655        }
1656
1657        unlock_page(page);
1658
1659        if (unlikely(f2fs_cp_error(sbi))) {
1660                f2fs_submit_merged_write(sbi, NODE);
1661                submitted = NULL;
1662        }
1663        if (submitted)
1664                *submitted = fio.submitted;
1665
1666        if (do_balance)
1667                f2fs_balance_fs(sbi, false);
1668        return 0;
1669
1670redirty_out:
1671        redirty_page_for_writepage(wbc, page);
1672        return AOP_WRITEPAGE_ACTIVATE;
1673}
1674
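/*
 * Migrate a node page for garbage collection.  Foreground GC writes the page
 * out synchronously (WB_SYNC_ALL, nr_to_write = 1); background GC only marks
 * it dirty and lets regular writeback move it later.
 */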
1675int f2fs_move_node_page(struct page *node_page, int gc_type)
1676{
1677        int err = 0;
1678
1679        if (gc_type == FG_GC) {
1680                struct writeback_control wbc = {
1681                        .sync_mode = WB_SYNC_ALL,
1682                        .nr_to_write = 1,
1683                        .for_reclaim = 0,
1684                };
1685
1686                f2fs_wait_on_page_writeback(node_page, NODE, true, true);
1687
1688                set_page_dirty(node_page);
1689
1690                if (!clear_page_dirty_for_io(node_page)) {
1691                        err = -EAGAIN;
1692                        goto out_page;
1693                }
1694
1695                if (__write_node_page(node_page, false, NULL,
1696                                        &wbc, false, FS_GC_NODE_IO, NULL)) {
1697                        err = -EAGAIN;
1698                        unlock_page(node_page);
1699                }
1700                goto release_page;
1701        } else {
1702                /* set page dirty and write it */
1703                if (!PageWriteback(node_page))
1704                        set_page_dirty(node_page);
1705        }
1706out_page:
1707        unlock_page(node_page);
1708release_page:
1709        f2fs_put_page(node_page, 0);
1710        return err;
1711}
1712
1713static int f2fs_write_node_page(struct page *page,
1714                                struct writeback_control *wbc)
1715{
1716        return __write_node_page(page, false, NULL, wbc, false,
1717                                                FS_NODE_IO, NULL);
1718}
1719
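/*
 * Write out the dirty dnode pages of @inode for fsync.  In atomic mode the
 * last dnode found by last_fsync_dnode() carries the fsync mark, and the scan
 * is retried until that mark has really been submitted.
 *
 * Rough caller-side sequence (illustrative sketch, not the exact fsync path):
 *
 *	ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id);
 *	if (!ret)
 *		ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id);
 */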
1720int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
1721                        struct writeback_control *wbc, bool atomic,
1722                        unsigned int *seq_id)
1723{
1724        pgoff_t index;
1725        struct pagevec pvec;
1726        int ret = 0;
1727        struct page *last_page = NULL;
1728        bool marked = false;
1729        nid_t ino = inode->i_ino;
1730        int nr_pages;
1731        int nwritten = 0;
1732
1733        if (atomic) {
1734                last_page = last_fsync_dnode(sbi, ino);
1735                if (IS_ERR_OR_NULL(last_page))
1736                        return PTR_ERR_OR_ZERO(last_page);
1737        }
1738retry:
1739        pagevec_init(&pvec);
1740        index = 0;
1741
1742        while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1743                                PAGECACHE_TAG_DIRTY))) {
1744                int i;
1745
1746                for (i = 0; i < nr_pages; i++) {
1747                        struct page *page = pvec.pages[i];
1748                        bool submitted = false;
1749
1750                        if (unlikely(f2fs_cp_error(sbi))) {
1751                                f2fs_put_page(last_page, 0);
1752                                pagevec_release(&pvec);
1753                                ret = -EIO;
1754                                goto out;
1755                        }
1756
1757                        if (!IS_DNODE(page) || !is_cold_node(page))
1758                                continue;
1759                        if (ino_of_node(page) != ino)
1760                                continue;
1761
1762                        lock_page(page);
1763
1764                        if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1765continue_unlock:
1766                                unlock_page(page);
1767                                continue;
1768                        }
1769                        if (ino_of_node(page) != ino)
1770                                goto continue_unlock;
1771
1772                        if (!PageDirty(page) && page != last_page) {
1773                                /* someone wrote it for us */
1774                                goto continue_unlock;
1775                        }
1776
1777                        f2fs_wait_on_page_writeback(page, NODE, true, true);
1778
1779                        set_fsync_mark(page, 0);
1780                        set_dentry_mark(page, 0);
1781
1782                        if (!atomic || page == last_page) {
1783                                set_fsync_mark(page, 1);
1784                                percpu_counter_inc(&sbi->rf_node_block_count);
1785                                if (IS_INODE(page)) {
1786                                        if (is_inode_flag_set(inode,
1787                                                                FI_DIRTY_INODE))
1788                                                f2fs_update_inode(inode, page);
1789                                        set_dentry_mark(page,
1790                                                f2fs_need_dentry_mark(sbi, ino));
1791                                }
1792                        /* may be written by another thread */
1793                                if (!PageDirty(page))
1794                                        set_page_dirty(page);
1795                        }
1796
1797                        if (!clear_page_dirty_for_io(page))
1798                                goto continue_unlock;
1799
1800                        ret = __write_node_page(page, atomic &&
1801                                                page == last_page,
1802                                                &submitted, wbc, true,
1803                                                FS_NODE_IO, seq_id);
1804                        if (ret) {
1805                                unlock_page(page);
1806                                f2fs_put_page(last_page, 0);
1807                                break;
1808                        } else if (submitted) {
1809                                nwritten++;
1810                        }
1811
1812                        if (page == last_page) {
1813                                f2fs_put_page(page, 0);
1814                                marked = true;
1815                                break;
1816                        }
1817                }
1818                pagevec_release(&pvec);
1819                cond_resched();
1820
1821                if (ret || marked)
1822                        break;
1823        }
1824        if (!ret && atomic && !marked) {
1825                f2fs_debug(sbi, "Retry to write fsync mark: ino=%u, idx=%lx",
1826                           ino, last_page->index);
1827                lock_page(last_page);
1828                f2fs_wait_on_page_writeback(last_page, NODE, true, true);
1829                set_page_dirty(last_page);
1830                unlock_page(last_page);
1831                goto retry;
1832        }
1833out:
1834        if (nwritten)
1835                f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE);
1836        return ret ? -EIO : 0;
1837}
1838
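/*
 * Callback for find_inode_nowait(): match @ino, but only report inodes that
 * are dirty (FI_DIRTY_INODE) and still linked on the dirty list; a matching
 * inode is pinned with igrab() before returning 1.
 */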
1839static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data)
1840{
1841        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
1842        bool clean;
1843
1844        if (inode->i_ino != ino)
1845                return 0;
1846
1847        if (!is_inode_flag_set(inode, FI_DIRTY_INODE))
1848                return 0;
1849
1850        spin_lock(&sbi->inode_lock[DIRTY_META]);
1851        clean = list_empty(&F2FS_I(inode)->gdirty_list);
1852        spin_unlock(&sbi->inode_lock[DIRTY_META]);
1853
1854        if (clean)
1855                return 0;
1856
1857        inode = igrab(inode);
1858        if (!inode)
1859                return 0;
1860        return 1;
1861}
1862
1863static bool flush_dirty_inode(struct page *page)
1864{
1865        struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1866        struct inode *inode;
1867        nid_t ino = ino_of_node(page);
1868
1869        inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL);
1870        if (!inode)
1871                return false;
1872
1873        f2fs_update_inode(inode, page);
1874        unlock_page(page);
1875
1876        iput(inode);
1877        return true;
1878}
1879
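/*
 * Scan dirty node pages and, for every dnode that still carries the inline
 * flag, clear the flag and flush the pending inline data of its inode via
 * flush_inline_data().
 */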
1880void f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
1881{
1882        pgoff_t index = 0;
1883        struct pagevec pvec;
1884        int nr_pages;
1885
1886        pagevec_init(&pvec);
1887
1888        while ((nr_pages = pagevec_lookup_tag(&pvec,
1889                        NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) {
1890                int i;
1891
1892                for (i = 0; i < nr_pages; i++) {
1893                        struct page *page = pvec.pages[i];
1894
1895                        if (!IS_DNODE(page))
1896                                continue;
1897
1898                        lock_page(page);
1899
1900                        if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1901continue_unlock:
1902                                unlock_page(page);
1903                                continue;
1904                        }
1905
1906                        if (!PageDirty(page)) {
1907                                /* someone wrote it for us */
1908                                goto continue_unlock;
1909                        }
1910
1911                        /* flush inline_data, if it's async context. */
1912                        if (page_private_inline(page)) {
1913                                clear_page_private_inline(page);
1914                                unlock_page(page);
1915                                flush_inline_data(sbi, ino_of_node(page));
1916                                continue;
1917                        }
1918                        unlock_page(page);
1919                }
1920                pagevec_release(&pvec);
1921                cond_resched();
1922        }
1923}
1924
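/*
 * Flush dirty node pages in three passes (indirect nodes, dentry dnodes,
 * file dnodes).  When @do_balance is set, pending inline data and dirty
 * inodes are folded into their node pages before those pages are written.
 */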
1925int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
1926                                struct writeback_control *wbc,
1927                                bool do_balance, enum iostat_type io_type)
1928{
1929        pgoff_t index;
1930        struct pagevec pvec;
1931        int step = 0;
1932        int nwritten = 0;
1933        int ret = 0;
1934        int nr_pages, done = 0;
1935
1936        pagevec_init(&pvec);
1937
1938next_step:
1939        index = 0;
1940
1941        while (!done && (nr_pages = pagevec_lookup_tag(&pvec,
1942                        NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) {
1943                int i;
1944
1945                for (i = 0; i < nr_pages; i++) {
1946                        struct page *page = pvec.pages[i];
1947                        bool submitted = false;
1948                        bool may_dirty = true;
1949
1950                        /* give priority to WB_SYNC threads */
1951                        if (atomic_read(&sbi->wb_sync_req[NODE]) &&
1952                                        wbc->sync_mode == WB_SYNC_NONE) {
1953                                done = 1;
1954                                break;
1955                        }
1956
1957                        /*
1958                         * flushing sequence with step:
1959                         * 0. indirect nodes
1960                         * 1. dentry dnodes
1961                         * 2. file dnodes
1962                         */
1963                        if (step == 0 && IS_DNODE(page))
1964                                continue;
1965                        if (step == 1 && (!IS_DNODE(page) ||
1966                                                is_cold_node(page)))
1967                                continue;
1968                        if (step == 2 && (!IS_DNODE(page) ||
1969                                                !is_cold_node(page)))
1970                                continue;
1971lock_node:
1972                        if (wbc->sync_mode == WB_SYNC_ALL)
1973                                lock_page(page);
1974                        else if (!trylock_page(page))
1975                                continue;
1976
1977                        if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1978continue_unlock:
1979                                unlock_page(page);
1980                                continue;
1981                        }
1982
1983                        if (!PageDirty(page)) {
1984                                /* someone wrote it for us */
1985                                goto continue_unlock;
1986                        }
1987
1988                        /* flush inline_data/inode, if it's async context. */
1989                        if (!do_balance)
1990                                goto write_node;
1991
1992                        /* flush inline_data */
1993                        if (page_private_inline(page)) {
1994                                clear_page_private_inline(page);
1995                                unlock_page(page);
1996                                flush_inline_data(sbi, ino_of_node(page));
1997                                goto lock_node;
1998                        }
1999
2000                        /* flush dirty inode */
2001                        if (IS_INODE(page) && may_dirty) {
2002                                may_dirty = false;
2003                                if (flush_dirty_inode(page))
2004                                        goto lock_node;
2005                        }
2006write_node:
2007                        f2fs_wait_on_page_writeback(page, NODE, true, true);
2008
2009                        if (!clear_page_dirty_for_io(page))
2010                                goto continue_unlock;
2011
2012                        set_fsync_mark(page, 0);
2013                        set_dentry_mark(page, 0);
2014
2015                        ret = __write_node_page(page, false, &submitted,
2016                                                wbc, do_balance, io_type, NULL);
2017                        if (ret)
2018                                unlock_page(page);
2019                        else if (submitted)
2020                                nwritten++;
2021
2022                        if (--wbc->nr_to_write == 0)
2023                                break;
2024                }
2025                pagevec_release(&pvec);
2026                cond_resched();
2027
2028                if (wbc->nr_to_write == 0) {
2029                        step = 2;
2030                        break;
2031                }
2032        }
2033
2034        if (step < 2) {
2035                if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
2036                                wbc->sync_mode == WB_SYNC_NONE && step == 1)
2037                        goto out;
2038                step++;
2039                goto next_step;
2040        }
2041out:
2042        if (nwritten)
2043                f2fs_submit_merged_write(sbi, NODE);
2044
2045        if (unlikely(f2fs_cp_error(sbi)))
2046                return -EIO;
2047        return ret;
2048}
2049
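/*
 * Wait for the pages queued on sbi->fsync_node_list up to @seq_id to finish
 * writeback; returns -EIO if any of them hit a write error, or the mapping
 * error recorded on the node address space.
 */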
2050int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
2051                                                unsigned int seq_id)
2052{
2053        struct fsync_node_entry *fn;
2054        struct page *page;
2055        struct list_head *head = &sbi->fsync_node_list;
2056        unsigned long flags;
2057        unsigned int cur_seq_id = 0;
2058        int ret2, ret = 0;
2059
2060        while (seq_id && cur_seq_id < seq_id) {
2061                spin_lock_irqsave(&sbi->fsync_node_lock, flags);
2062                if (list_empty(head)) {
2063                        spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
2064                        break;
2065                }
2066                fn = list_first_entry(head, struct fsync_node_entry, list);
2067                if (fn->seq_id > seq_id) {
2068                        spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
2069                        break;
2070                }
2071                cur_seq_id = fn->seq_id;
2072                page = fn->page;
2073                get_page(page);
2074                spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
2075
2076                f2fs_wait_on_page_writeback(page, NODE, true, false);
2077                if (TestClearPageError(page))
2078                        ret = -EIO;
2079
2080                put_page(page);
2081
2082                if (ret)
2083                        break;
2084        }
2085
2086        ret2 = filemap_check_errors(NODE_MAPPING(sbi));
2087        if (!ret)
2088                ret = ret2;
2089
2090        return ret;
2091}
2092
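/*
 * ->writepages for the node mapping: kick background balancing, skip small
 * batches of dirty node pages for async writeback, and back off while a
 * WB_SYNC_ALL writer is active (wb_sync_req[NODE]).
 */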
2093static int f2fs_write_node_pages(struct address_space *mapping,
2094                            struct writeback_control *wbc)
2095{
2096        struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
2097        struct blk_plug plug;
2098        long diff;
2099
2100        if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
2101                goto skip_write;
2102
2103        /* balancing f2fs's metadata in background */
2104        f2fs_balance_fs_bg(sbi, true);
2105
2106        /* collect a number of dirty node pages and write them together */
2107        if (wbc->sync_mode != WB_SYNC_ALL &&
2108                        get_pages(sbi, F2FS_DIRTY_NODES) <
2109                                        nr_pages_to_skip(sbi, NODE))
2110                goto skip_write;
2111
2112        if (wbc->sync_mode == WB_SYNC_ALL)
2113                atomic_inc(&sbi->wb_sync_req[NODE]);
2114        else if (atomic_read(&sbi->wb_sync_req[NODE])) {
2115                /* to avoid potential deadlock */
2116                if (current->plug)
2117                        blk_finish_plug(current->plug);
2118                goto skip_write;
2119        }
2120
2121        trace_f2fs_writepages(mapping->host, wbc, NODE);
2122
2123        diff = nr_pages_to_write(sbi, NODE, wbc);
2124        blk_start_plug(&plug);
2125        f2fs_sync_node_pages(sbi, wbc, true, FS_NODE_IO);
2126        blk_finish_plug(&plug);
2127        wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
2128
2129        if (wbc->sync_mode == WB_SYNC_ALL)
2130                atomic_dec(&sbi->wb_sync_req[NODE]);
2131        return 0;
2132
2133skip_write:
2134        wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
2135        trace_f2fs_writepages(mapping->host, wbc, NODE);
2136        return 0;
2137}
2138
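/*
 * ->dirty_folio for the node mapping: mark the folio uptodate, refresh the
 * inode checksum under CONFIG_F2FS_CHECK_FS, and account the folio in
 * F2FS_DIRTY_NODES the first time it becomes dirty.
 */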
2139static bool f2fs_dirty_node_folio(struct address_space *mapping,
2140                struct folio *folio)
2141{
2142        trace_f2fs_set_page_dirty(&folio->page, NODE);
2143
2144        if (!folio_test_uptodate(folio))
2145                folio_mark_uptodate(folio);
2146#ifdef CONFIG_F2FS_CHECK_FS
2147        if (IS_INODE(&folio->page))
2148                f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page);
2149#endif
2150        if (!folio_test_dirty(folio)) {
2151                filemap_dirty_folio(mapping, folio);
2152                inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
2153                set_page_private_reference(&folio->page);
2154                return true;
2155        }
2156        return false;
2157}
2158
2159/*
2160 * Structure of the f2fs node operations
2161 */
2162const struct address_space_operations f2fs_node_aops = {
2163        .writepage      = f2fs_write_node_page,
2164        .writepages     = f2fs_write_node_pages,
2165        .dirty_folio    = f2fs_dirty_node_folio,
2166        .invalidate_folio = f2fs_invalidate_folio,
2167        .release_folio  = f2fs_release_folio,
2168#ifdef CONFIG_MIGRATION
2169        .migratepage    = f2fs_migrate_page,
2170#endif
2171};
2172
2173static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
2174                                                nid_t n)
2175{
2176        return radix_tree_lookup(&nm_i->free_nid_root, n);
2177}
2178
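/*
 * Free nid cache: a nid enters the radix tree as FREE_NID, moves to
 * PREALLOC_NID when handed out by f2fs_alloc_nid(), and is finally released
 * (or put back to FREE_NID) by f2fs_alloc_nid_done()/f2fs_alloc_nid_failed().
 */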
2179static int __insert_free_nid(struct f2fs_sb_info *sbi,
2180                                struct free_nid *i)
2181{
2182        struct f2fs_nm_info *nm_i = NM_I(sbi);
2183        int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
2184
2185        if (err)
2186                return err;
2187
2188        nm_i->nid_cnt[FREE_NID]++;
2189        list_add_tail(&i->list, &nm_i->free_nid_list);
2190        return 0;
2191}
2192
2193static void __remove_free_nid(struct f2fs_sb_info *sbi,
2194                        struct free_nid *i, enum nid_state state)
2195{
2196        struct f2fs_nm_info *nm_i = NM_I(sbi);
2197
2198        f2fs_bug_on(sbi, state != i->state);
2199        nm_i->nid_cnt[state]--;
2200        if (state == FREE_NID)
2201                list_del(&i->list);
2202        radix_tree_delete(&nm_i->free_nid_root, i->nid);
2203}
2204
2205static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i,
2206                        enum nid_state org_state, enum nid_state dst_state)
2207{
2208        struct f2fs_nm_info *nm_i = NM_I(sbi);
2209
2210        f2fs_bug_on(sbi, org_state != i->state);
2211        i->state = dst_state;
2212        nm_i->nid_cnt[org_state]--;
2213        nm_i->nid_cnt[dst_state]++;
2214
2215        switch (dst_state) {
2216        case PREALLOC_NID:
2217                list_del(&i->list);
2218                break;
2219        case FREE_NID:
2220                list_add_tail(&i->list, &nm_i->free_nid_list);
2221                break;
2222        default:
2223                BUG_ON(1);
2224        }
2225}
2226
2227bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi)
2228{
2229        struct f2fs_nm_info *nm_i = NM_I(sbi);
2230        unsigned int i;
2231        bool ret = true;
2232
2233        f2fs_down_read(&nm_i->nat_tree_lock);
2234        for (i = 0; i < nm_i->nat_blocks; i++) {
2235                if (!test_bit_le(i, nm_i->nat_block_bitmap)) {
2236                        ret = false;
2237                        break;
2238                }
2239        }
2240        f2fs_up_read(&nm_i->nat_tree_lock);
2241
2242        return ret;
2243}
2244
2245static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
2246                                                        bool set, bool build)
2247{
2248        struct f2fs_nm_info *nm_i = NM_I(sbi);
2249        unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid);
2250        unsigned int nid_ofs = nid - START_NID(nid);
2251
2252        if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap))
2253                return;
2254
2255        if (set) {
2256                if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
2257                        return;
2258                __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
2259                nm_i->free_nid_count[nat_ofs]++;
2260        } else {
2261                if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
2262                        return;
2263                __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
2264                if (!build)
2265                        nm_i->free_nid_count[nat_ofs]--;
2266        }
2267}
2268
2269/* return whether the nid is recognized as free */
2270static bool add_free_nid(struct f2fs_sb_info *sbi,
2271                                nid_t nid, bool build, bool update)
2272{
2273        struct f2fs_nm_info *nm_i = NM_I(sbi);
2274        struct free_nid *i, *e;
2275        struct nat_entry *ne;
2276        int err = -EINVAL;
2277        bool ret = false;
2278
2279        /* 0 nid should not be used */
2280        if (unlikely(nid == 0))
2281                return false;
2282
2283        if (unlikely(f2fs_check_nid_range(sbi, nid)))
2284                return false;
2285
2286        i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS, true, NULL);
2287        i->nid = nid;
2288        i->state = FREE_NID;
2289
2290        radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
2291
2292        spin_lock(&nm_i->nid_list_lock);
2293
2294        if (build) {
2295                /*
2296                 *   Thread A             Thread B
2297                 *  - f2fs_create
2298                 *   - f2fs_new_inode
2299                 *    - f2fs_alloc_nid
2300                 *     - __insert_nid_to_list(PREALLOC_NID)
2301                 *                     - f2fs_balance_fs_bg
2302                 *                      - f2fs_build_free_nids
2303                 *                       - __f2fs_build_free_nids
2304                 *                        - scan_nat_page
2305                 *                         - add_free_nid
2306                 *                          - __lookup_nat_cache
2307                 *  - f2fs_add_link
2308                 *   - f2fs_init_inode_metadata
2309                 *    - f2fs_new_inode_page
2310                 *     - f2fs_new_node_page
2311                 *      - set_node_addr
2312                 *  - f2fs_alloc_nid_done
2313                 *   - __remove_nid_from_list(PREALLOC_NID)
2314                 *                         - __insert_nid_to_list(FREE_NID)
2315                 */
2316                ne = __lookup_nat_cache(nm_i, nid);
2317                if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
2318                                nat_get_blkaddr(ne) != NULL_ADDR))
2319                        goto err_out;
2320
2321                e = __lookup_free_nid_list(nm_i, nid);
2322                if (e) {
2323                        if (e->state == FREE_NID)
2324                                ret = true;
2325                        goto err_out;
2326                }
2327        }
2328        ret = true;
2329        err = __insert_free_nid(sbi, i);
2330err_out:
2331        if (update) {
2332                update_free_nid_bitmap(sbi, nid, ret, build);
2333                if (!build)
2334                        nm_i->available_nids++;
2335        }
2336        spin_unlock(&nm_i->nid_list_lock);
2337        radix_tree_preload_end();
2338
2339        if (err)
2340                kmem_cache_free(free_nid_slab, i);
2341        return ret;
2342}
2343
2344static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
2345{
2346        struct f2fs_nm_info *nm_i = NM_I(sbi);
2347        struct free_nid *i;
2348        bool need_free = false;
2349
2350        spin_lock(&nm_i->nid_list_lock);
2351        i = __lookup_free_nid_list(nm_i, nid);
2352        if (i && i->state == FREE_NID) {
2353                __remove_free_nid(sbi, i, FREE_NID);
2354                need_free = true;
2355        }
2356        spin_unlock(&nm_i->nid_list_lock);
2357
2358        if (need_free)
2359                kmem_cache_free(free_nid_slab, i);
2360}
2361
2362static int scan_nat_page(struct f2fs_sb_info *sbi,
2363                        struct page *nat_page, nid_t start_nid)
2364{
2365        struct f2fs_nm_info *nm_i = NM_I(sbi);
2366        struct f2fs_nat_block *nat_blk = page_address(nat_page);
2367        block_t blk_addr;
2368        unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid);
2369        int i;
2370
2371        __set_bit_le(nat_ofs, nm_i->nat_block_bitmap);
2372
2373        i = start_nid % NAT_ENTRY_PER_BLOCK;
2374
2375        for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
2376                if (unlikely(start_nid >= nm_i->max_nid))
2377                        break;
2378
2379                blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
2380
2381                if (blk_addr == NEW_ADDR)
2382                        return -EINVAL;
2383
2384                if (blk_addr == NULL_ADDR) {
2385                        add_free_nid(sbi, start_nid, true, true);
2386                } else {
2387                        spin_lock(&NM_I(sbi)->nid_list_lock);
2388                        update_free_nid_bitmap(sbi, start_nid, false, true);
2389                        spin_unlock(&NM_I(sbi)->nid_list_lock);
2390                }
2391        }
2392
2393        return 0;
2394}
2395
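/*
 * Scan the NAT entries journalled in the hot data curseg: nids whose block
 * address is NULL_ADDR are added as free nids, all others are removed from
 * the free nid cache.
 */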
2396static void scan_curseg_cache(struct f2fs_sb_info *sbi)
2397{
2398        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2399        struct f2fs_journal *journal = curseg->journal;
2400        int i;
2401
2402        down_read(&curseg->journal_rwsem);
2403        for (i = 0; i < nats_in_cursum(journal); i++) {
2404                block_t addr;
2405                nid_t nid;
2406
2407                addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
2408                nid = le32_to_cpu(nid_in_journal(journal, i));
2409                if (addr == NULL_ADDR)
2410                        add_free_nid(sbi, nid, true, false);
2411                else
2412                        remove_free_nid(sbi, nid);
2413        }
2414        up_read(&curseg->journal_rwsem);
2415}
2416
2417static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
2418{
2419        struct f2fs_nm_info *nm_i = NM_I(sbi);
2420        unsigned int i, idx;
2421        nid_t nid;
2422
2423        f2fs_down_read(&nm_i->nat_tree_lock);
2424
2425        for (i = 0; i < nm_i->nat_blocks; i++) {
2426                if (!test_bit_le(i, nm_i->nat_block_bitmap))
2427                        continue;
2428                if (!nm_i->free_nid_count[i])
2429                        continue;
2430                for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) {
2431                        idx = find_next_bit_le(nm_i->free_nid_bitmap[i],
2432                                                NAT_ENTRY_PER_BLOCK, idx);
2433                        if (idx >= NAT_ENTRY_PER_BLOCK)
2434                                break;
2435
2436                        nid = i * NAT_ENTRY_PER_BLOCK + idx;
2437                        add_free_nid(sbi, nid, true, false);
2438
2439                        if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS)
2440                                goto out;
2441                }
2442        }
2443out:
2444        scan_curseg_cache(sbi);
2445
2446        f2fs_up_read(&nm_i->nat_tree_lock);
2447}
2448
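/*
 * Refill the free nid cache (skipped when enough nids are already cached):
 * first consume the in-memory free nid bitmap, then read ahead and scan up
 * to FREE_NID_PAGES NAT blocks starting from next_scan_nid, and finally pick
 * up nids from the curseg NAT journal.
 */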
2449static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
2450                                                bool sync, bool mount)
2451{
2452        struct f2fs_nm_info *nm_i = NM_I(sbi);
2453        int i = 0, ret;
2454        nid_t nid = nm_i->next_scan_nid;
2455
2456        if (unlikely(nid >= nm_i->max_nid))
2457                nid = 0;
2458
2459        if (unlikely(nid % NAT_ENTRY_PER_BLOCK))
2460                nid = NAT_BLOCK_OFFSET(nid) * NAT_ENTRY_PER_BLOCK;
2461
2462        /* Enough entries */
2463        if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
2464                return 0;
2465
2466        if (!sync && !f2fs_available_free_memory(sbi, FREE_NIDS))
2467                return 0;
2468
2469        if (!mount) {
2470                /* try to find free nids in free_nid_bitmap */
2471                scan_free_nid_bits(sbi);
2472
2473                if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
2474                        return 0;
2475        }
2476
2477        /* readahead nat pages to be scanned */
2478        f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
2479                                                        META_NAT, true);
2480
2481        f2fs_down_read(&nm_i->nat_tree_lock);
2482
2483        while (1) {
2484                if (!test_bit_le(NAT_BLOCK_OFFSET(nid),
2485                                                nm_i->nat_block_bitmap)) {
2486                        struct page *page = get_current_nat_page(sbi, nid);
2487
2488                        if (IS_ERR(page)) {
2489                                ret = PTR_ERR(page);
2490                        } else {
2491                                ret = scan_nat_page(sbi, page, nid);
2492                                f2fs_put_page(page, 1);
2493                        }
2494
2495                        if (ret) {
2496                                f2fs_up_read(&nm_i->nat_tree_lock);
2497                                f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
2498                                return ret;
2499                        }
2500                }
2501
2502                nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
2503                if (unlikely(nid >= nm_i->max_nid))
2504                        nid = 0;
2505
2506                if (++i >= FREE_NID_PAGES)
2507                        break;
2508        }
2509
2510        /* start the next scan from here to find more free nids */
2511        nm_i->next_scan_nid = nid;
2512
2513        /* find free nids from current sum_pages */
2514        scan_curseg_cache(sbi);
2515
2516        f2fs_up_read(&nm_i->nat_tree_lock);
2517
2518        f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
2519                                        nm_i->ra_nid_pages, META_NAT, false);
2520
2521        return 0;
2522}
2523
2524int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
2525{
2526        int ret;
2527
2528        mutex_lock(&NM_I(sbi)->build_lock);
2529        ret = __f2fs_build_free_nids(sbi, sync, mount);
2530        mutex_unlock(&NM_I(sbi)->build_lock);
2531
2532        return ret;
2533}
2534
2535/*
2536 * If this function returns true, the caller can obtain a new nid from
2537 * the second parameter of this function.
2538 * The returned nid can be used as an ino as well as a nid when an inode is created.
2539 */
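/*
 * Typical allocation sequence (illustrative sketch; see
 * f2fs_recover_xattr_data() below for a complete user):
 *
 *	if (!f2fs_alloc_nid(sbi, &nid))
 *		return -ENOSPC;
 *	set_new_dnode(&dn, inode, NULL, NULL, nid);
 *	page = f2fs_new_node_page(&dn, ofs);
 *	if (IS_ERR(page))
 *		f2fs_alloc_nid_failed(sbi, nid);
 *	else
 *		f2fs_alloc_nid_done(sbi, nid);
 */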
2540bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
2541{
2542        struct f2fs_nm_info *nm_i = NM_I(sbi);
2543        struct free_nid *i = NULL;
2544retry:
2545        if (time_to_inject(sbi, FAULT_ALLOC_NID)) {
2546                f2fs_show_injection_info(sbi, FAULT_ALLOC_NID);
2547                return false;
2548        }
2549
2550        spin_lock(&nm_i->nid_list_lock);
2551
2552        if (unlikely(nm_i->available_nids == 0)) {
2553                spin_unlock(&nm_i->nid_list_lock);
2554                return false;
2555        }
2556
2557        /* We should not use stale free nids created by f2fs_build_free_nids */
2558        if (nm_i->nid_cnt[FREE_NID] && !on_f2fs_build_free_nids(nm_i)) {
2559                f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
2560                i = list_first_entry(&nm_i->free_nid_list,
2561                                        struct free_nid, list);
2562                *nid = i->nid;
2563
2564                __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID);
2565                nm_i->available_nids--;
2566
2567                update_free_nid_bitmap(sbi, *nid, false, false);
2568
2569                spin_unlock(&nm_i->nid_list_lock);
2570                return true;
2571        }
2572        spin_unlock(&nm_i->nid_list_lock);
2573
2574        /* Let's scan nat pages and their caches to get free nids */
2575        if (!f2fs_build_free_nids(sbi, true, false))
2576                goto retry;
2577        return false;
2578}
2579
2580/*
2581 * f2fs_alloc_nid() should be called prior to this function.
2582 */
2583void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
2584{
2585        struct f2fs_nm_info *nm_i = NM_I(sbi);
2586        struct free_nid *i;
2587
2588        spin_lock(&nm_i->nid_list_lock);
2589        i = __lookup_free_nid_list(nm_i, nid);
2590        f2fs_bug_on(sbi, !i);
2591        __remove_free_nid(sbi, i, PREALLOC_NID);
2592        spin_unlock(&nm_i->nid_list_lock);
2593
2594        kmem_cache_free(free_nid_slab, i);
2595}
2596
2597/*
2598 * f2fs_alloc_nid() should be called prior to this function.
2599 */
2600void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
2601{
2602        struct f2fs_nm_info *nm_i = NM_I(sbi);
2603        struct free_nid *i;
2604        bool need_free = false;
2605
2606        if (!nid)
2607                return;
2608
2609        spin_lock(&nm_i->nid_list_lock);
2610        i = __lookup_free_nid_list(nm_i, nid);
2611        f2fs_bug_on(sbi, !i);
2612
2613        if (!f2fs_available_free_memory(sbi, FREE_NIDS)) {
2614                __remove_free_nid(sbi, i, PREALLOC_NID);
2615                need_free = true;
2616        } else {
2617                __move_free_nid(sbi, i, PREALLOC_NID, FREE_NID);
2618        }
2619
2620        nm_i->available_nids++;
2621
2622        update_free_nid_bitmap(sbi, nid, true, false);
2623
2624        spin_unlock(&nm_i->nid_list_lock);
2625
2626        if (need_free)
2627                kmem_cache_free(free_nid_slab, i);
2628}
2629
2630int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
2631{
2632        struct f2fs_nm_info *nm_i = NM_I(sbi);
2633        int nr = nr_shrink;
2634
2635        if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
2636                return 0;
2637
2638        if (!mutex_trylock(&nm_i->build_lock))
2639                return 0;
2640
2641        while (nr_shrink && nm_i->nid_cnt[FREE_NID] > MAX_FREE_NIDS) {
2642                struct free_nid *i, *next;
2643                unsigned int batch = SHRINK_NID_BATCH_SIZE;
2644
2645                spin_lock(&nm_i->nid_list_lock);
2646                list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
2647                        if (!nr_shrink || !batch ||
2648                                nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
2649                                break;
2650                        __remove_free_nid(sbi, i, FREE_NID);
2651                        kmem_cache_free(free_nid_slab, i);
2652                        nr_shrink--;
2653                        batch--;
2654                }
2655                spin_unlock(&nm_i->nid_list_lock);
2656        }
2657
2658        mutex_unlock(&nm_i->build_lock);
2659
2660        return nr - nr_shrink;
2661}
2662
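/*
 * During roll-forward recovery, bring the inline xattr state of @inode in
 * sync with the fsync'd node @page and, if present, copy the inline xattr
 * payload into the cached inode page.
 */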
2663int f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
2664{
2665        void *src_addr, *dst_addr;
2666        size_t inline_size;
2667        struct page *ipage;
2668        struct f2fs_inode *ri;
2669
2670        ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
2671        if (IS_ERR(ipage))
2672                return PTR_ERR(ipage);
2673
2674        ri = F2FS_INODE(page);
2675        if (ri->i_inline & F2FS_INLINE_XATTR) {
2676                if (!f2fs_has_inline_xattr(inode)) {
2677                        set_inode_flag(inode, FI_INLINE_XATTR);
2678                        stat_inc_inline_xattr(inode);
2679                }
2680        } else {
2681                if (f2fs_has_inline_xattr(inode)) {
2682                        stat_dec_inline_xattr(inode);
2683                        clear_inode_flag(inode, FI_INLINE_XATTR);
2684                }
2685                goto update_inode;
2686        }
2687
2688        dst_addr = inline_xattr_addr(inode, ipage);
2689        src_addr = inline_xattr_addr(inode, page);
2690        inline_size = inline_xattr_size(inode);
2691
2692        f2fs_wait_on_page_writeback(ipage, NODE, true, true);
2693        memcpy(dst_addr, src_addr, inline_size);
2694update_inode:
2695        f2fs_update_inode(inode, ipage);
2696        f2fs_put_page(ipage, 1);
2697        return 0;
2698}
2699
2700int f2fs_recover_xattr_data(struct inode *inode, struct page *page)
2701{
2702        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
2703        nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
2704        nid_t new_xnid;
2705        struct dnode_of_data dn;
2706        struct node_info ni;
2707        struct page *xpage;
2708        int err;
2709
2710        if (!prev_xnid)
2711                goto recover_xnid;
2712
2713        /* 1: invalidate the previous xattr nid */
2714        err = f2fs_get_node_info(sbi, prev_xnid, &ni, false);
2715        if (err)
2716                return err;
2717
2718        f2fs_invalidate_blocks(sbi, ni.blk_addr);
2719        dec_valid_node_count(sbi, inode, false);
2720        set_node_addr(sbi, &ni, NULL_ADDR, false);
2721
2722recover_xnid:
2723        /* 2: update xattr nid in inode */
2724        if (!f2fs_alloc_nid(sbi, &new_xnid))
2725                return -ENOSPC;
2726
2727        set_new_dnode(&dn, inode, NULL, NULL, new_xnid);
2728        xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET);
2729        if (IS_ERR(xpage)) {
2730                f2fs_alloc_nid_failed(sbi, new_xnid);
2731                return PTR_ERR(xpage);
2732        }
2733
2734        f2fs_alloc_nid_done(sbi, new_xnid);
2735        f2fs_update_inode_page(inode);
2736
2737        /* 3: update and set xattr node page dirty */
2738        memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE);
2739
2740        set_page_dirty(xpage);
2741        f2fs_put_page(xpage, 1);
2742
2743        return 0;
2744}
2745
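/*
 * Rebuild an inode's node page from the fsync'd image found during recovery:
 * only the stable inode fields are copied, while i_size, i_blocks, i_links
 * and the xattr nid are reset to their initial values.
 */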
2746int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
2747{
2748        struct f2fs_inode *src, *dst;
2749        nid_t ino = ino_of_node(page);
2750        struct node_info old_ni, new_ni;
2751        struct page *ipage;
2752        int err;
2753
2754        err = f2fs_get_node_info(sbi, ino, &old_ni, false);
2755        if (err)
2756                return err;
2757
2758        if (unlikely(old_ni.blk_addr != NULL_ADDR))
2759                return -EINVAL;
2760retry:
2761        ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
2762        if (!ipage) {
2763                memalloc_retry_wait(GFP_NOFS);
2764                goto retry;
2765        }
2766
2767        /* Should not use this inode from free nid list */
2768        remove_free_nid(sbi, ino);
2769
2770        if (!PageUptodate(ipage))
2771                SetPageUptodate(ipage);
2772        fill_node_footer(ipage, ino, ino, 0, true);
2773        set_cold_node(ipage, false);
2774
2775        src = F2FS_INODE(page);
2776        dst = F2FS_INODE(ipage);
2777
2778        memcpy(dst, src, offsetof(struct f2fs_inode, i_ext));
2779        dst->i_size = 0;
2780        dst->i_blocks = cpu_to_le64(1);
2781        dst->i_links = cpu_to_le32(1);
2782        dst->i_xattr_nid = 0;
2783        dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR);
2784        if (dst->i_inline & F2FS_EXTRA_ATTR) {
2785                dst->i_extra_isize = src->i_extra_isize;
2786
2787                if (f2fs_sb_has_flexible_inline_xattr(sbi) &&
2788                        F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
2789                                                        i_inline_xattr_size))
2790                        dst->i_inline_xattr_size = src->i_inline_xattr_size;
2791
2792                if (f2fs_sb_has_project_quota(sbi) &&
2793                        F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
2794                                                                i_projid))
2795                        dst->i_projid = src->i_projid;
2796
2797                if (f2fs_sb_has_inode_crtime(sbi) &&
2798                        F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
2799                                                        i_crtime_nsec)) {
2800                        dst->i_crtime = src->i_crtime;
2801                        dst->i_crtime_nsec = src->i_crtime_nsec;
2802                }
2803        }
2804
2805        new_ni = old_ni;
2806        new_ni.ino = ino;
2807
2808        if (unlikely(inc_valid_node_count(sbi, NULL, true)))
2809                WARN_ON(1);
2810        set_node_addr(sbi, &new_ni, NEW_ADDR, false);
2811        inc_valid_inode_count(sbi);
2812        set_page_dirty(ipage);
2813        f2fs_put_page(ipage, 1);
2814        return 0;
2815}
2816
2817int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
2818                        unsigned int segno, struct f2fs_summary_block *sum)
2819{
2820        struct f2fs_node *rn;
2821        struct f2fs_summary *sum_entry;
2822        block_t addr;
2823        int i, idx, last_offset, nrpages;
2824
2825        /* scan the node segment */
2826        last_offset = sbi->blocks_per_seg;
2827        addr = START_BLOCK(sbi, segno);
2828        sum_entry = &sum->entries[0];
2829
2830        for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
2831                nrpages = bio_max_segs(last_offset - i);
2832
2833                /* readahead node pages */
2834                f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true);
2835
2836                for (idx = addr; idx < addr + nrpages; idx++) {
2837                        struct page *page = f2fs_get_tmp_page(sbi, idx);
2838
2839                        if (IS_ERR(page))
2840                                return PTR_ERR(page);
2841
2842                        rn = F2FS_NODE(page);
2843                        sum_entry->nid = rn->footer.nid;
2844                        sum_entry->version = 0;
2845                        sum_entry->ofs_in_node = 0;
2846                        sum_entry++;
2847                        f2fs_put_page(page, 1);
2848                }
2849
2850                invalidate_mapping_pages(META_MAPPING(sbi), addr,
2851                                                        addr + nrpages);
2852        }
2853        return 0;
2854}
2855
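/*
 * Drain the NAT journal in the hot data curseg: every journalled entry is
 * merged into the in-memory NAT cache and marked dirty, so it will be
 * written back through the regular nat entry sets.
 */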
2856static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
2857{
2858        struct f2fs_nm_info *nm_i = NM_I(sbi);
2859        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2860        struct f2fs_journal *journal = curseg->journal;
2861        int i;
2862
2863        down_write(&curseg->journal_rwsem);
2864        for (i = 0; i < nats_in_cursum(journal); i++) {
2865                struct nat_entry *ne;
2866                struct f2fs_nat_entry raw_ne;
2867                nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
2868
2869                if (f2fs_check_nid_range(sbi, nid))
2870                        continue;
2871
2872                raw_ne = nat_in_journal(journal, i);
2873
2874                ne = __lookup_nat_cache(nm_i, nid);
2875                if (!ne) {
2876                        ne = __alloc_nat_entry(sbi, nid, true);
2877                        __init_nat_entry(nm_i, ne, &raw_ne, true);
2878                }
2879
2880                /*
2881                 * if a free nat in the journal has not been used since the last
2882                 * checkpoint, we should remove it from the available nids,
2883                 * since we will add it again later.
2884                 */
2885                if (!get_nat_flag(ne, IS_DIRTY) &&
2886                                le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) {
2887                        spin_lock(&nm_i->nid_list_lock);
2888                        nm_i->available_nids--;
2889                        spin_unlock(&nm_i->nid_list_lock);
2890                }
2891
2892                __set_nat_cache_dirty(nm_i, ne);
2893        }
2894        update_nats_in_cursum(journal, -i);
2895        up_write(&curseg->journal_rwsem);
2896}
2897
2898static void __adjust_nat_entry_set(struct nat_entry_set *nes,
2899                                                struct list_head *head, int max)
2900{
2901        struct nat_entry_set *cur;
2902
2903        if (nes->entry_cnt >= max)
2904                goto add_out;
2905
2906        list_for_each_entry(cur, head, set_list) {
2907                if (cur->entry_cnt >= nes->entry_cnt) {
2908                        list_add(&nes->set_list, cur->set_list.prev);
2909                        return;
2910                }
2911        }
2912add_out:
2913        list_add_tail(&nes->set_list, head);
2914}
2915
2916static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs,
2917                                                        unsigned int valid)
2918{
2919        if (valid == 0) {
2920                __set_bit_le(nat_ofs, nm_i->empty_nat_bits);
2921                __clear_bit_le(nat_ofs, nm_i->full_nat_bits);
2922                return;
2923        }
2924
2925        __clear_bit_le(nat_ofs, nm_i->empty_nat_bits);
2926        if (valid == NAT_ENTRY_PER_BLOCK)
2927                __set_bit_le(nat_ofs, nm_i->full_nat_bits);
2928        else
2929                __clear_bit_le(nat_ofs, nm_i->full_nat_bits);
2930}
2931
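/*
 * Keep the full/empty nat_bits of the NAT block starting at @start_nid in
 * sync with the number of valid entries actually written to that block
 * (only when CP_NAT_BITS_FLAG is set).
 */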
2932static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
2933                                                struct page *page)
2934{
2935        struct f2fs_nm_info *nm_i = NM_I(sbi);
2936        unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK;
2937        struct f2fs_nat_block *nat_blk = page_address(page);
2938        int valid = 0;
2939        int i = 0;
2940
2941        if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
2942                return;
2943
2944        if (nat_index == 0) {
2945                valid = 1;
2946                i = 1;
2947        }
2948        for (; i < NAT_ENTRY_PER_BLOCK; i++) {
2949                if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR)
2950                        valid++;
2951        }
2952
2953        __update_nat_bits(nm_i, nat_index, valid);
2954}
2955
2956void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
2957{
2958        struct f2fs_nm_info *nm_i = NM_I(sbi);
2959        unsigned int nat_ofs;
2960
2961        f2fs_down_read(&nm_i->nat_tree_lock);
2962
2963        for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) {
2964                unsigned int valid = 0, nid_ofs = 0;
2965
2966                /* handle nid zero specially, since it should never be used */
2967                if (unlikely(nat_ofs == 0)) {
2968                        valid = 1;
2969                        nid_ofs = 1;
2970                }
2971
2972                for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) {
2973                        if (!test_bit_le(nid_ofs,
2974                                        nm_i->free_nid_bitmap[nat_ofs]))
2975                                valid++;
2976                }
2977
2978                __update_nat_bits(nm_i, nat_ofs, valid);
2979        }
2980
2981        f2fs_up_read(&nm_i->nat_tree_lock);
2982}
2983
2984static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
2985                struct nat_entry_set *set, struct cp_control *cpc)
2986{
2987        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2988        struct f2fs_journal *journal = curseg->journal;
2989        nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
2990        bool to_journal = true;
2991        struct f2fs_nat_block *nat_blk;
2992        struct nat_entry *ne, *cur;
2993        struct page *page = NULL;
2994
2995        /*
2996         * there are two steps to flush nat entries:
2997         * #1, flush nat entries to journal in current hot data summary block.
2998         * #2, flush nat entries to nat page.
2999         */
3000        if ((cpc->reason & CP_UMOUNT) ||
3001                !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
3002                to_journal = false;
3003
3004        if (to_journal) {
3005                down_write(&curseg->journal_rwsem);
3006        } else {
3007                page = get_next_nat_page(sbi, start_nid);
3008                if (IS_ERR(page))
3009                        return PTR_ERR(page);
3010
3011                nat_blk = page_address(page);
3012                f2fs_bug_on(sbi, !nat_blk);
3013        }
3014
3015        /* flush dirty nats in nat entry set */
3016        list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
3017                struct f2fs_nat_entry *raw_ne;
3018                nid_t nid = nat_get_nid(ne);
3019                int offset;
3020
3021                f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR);
3022
3023                if (to_journal) {
3024                        offset = f2fs_lookup_journal_in_cursum(journal,
3025                                                        NAT_JOURNAL, nid, 1);
3026                        f2fs_bug_on(sbi, offset < 0);
3027                        raw_ne = &nat_in_journal(journal, offset);
3028                        nid_in_journal(journal, offset) = cpu_to_le32(nid);
3029                } else {
3030                        raw_ne = &nat_blk->entries[nid - start_nid];
3031                }
3032                raw_nat_from_node_info(raw_ne, &ne->ni);
3033                nat_reset_flag(ne);
3034                __clear_nat_cache_dirty(NM_I(sbi), set, ne);
3035                if (nat_get_blkaddr(ne) == NULL_ADDR) {
3036                        add_free_nid(sbi, nid, false, true);
3037                } else {
3038                        spin_lock(&NM_I(sbi)->nid_list_lock);
3039                        update_free_nid_bitmap(sbi, nid, false, false);
3040                        spin_unlock(&NM_I(sbi)->nid_list_lock);
3041                }
3042        }
3043
3044        if (to_journal) {
3045                up_write(&curseg->journal_rwsem);
3046        } else {
3047                update_nat_bits(sbi, start_nid, page);
3048                f2fs_put_page(page, 1);
3049        }
3050
3051        /* node block allocation in write_begin may add dirty nats, so only free the set once it is empty */
3052        if (!set->entry_cnt) {
3053                radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
3054                kmem_cache_free(nat_entry_set_slab, set);
3055        }
3056        return 0;
3057}
3058
3059/*
3060 * Called during checkpoint to flush every dirty NAT entry to the journal or NAT blocks.
3061 */
3062int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
3063{
3064        struct f2fs_nm_info *nm_i = NM_I(sbi);
3065        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
3066        struct f2fs_journal *journal = curseg->journal;
3067        struct nat_entry_set *setvec[SETVEC_SIZE];
3068        struct nat_entry_set *set, *tmp;
3069        unsigned int found;
3070        nid_t set_idx = 0;
3071        LIST_HEAD(sets);
3072        int err = 0;
3073
3074        /*
3075         * during unmount, let's flush nat_bits before checking
3076         * nat_cnt[DIRTY_NAT].
3077         */
3078        if (cpc->reason & CP_UMOUNT) {
3079                f2fs_down_write(&nm_i->nat_tree_lock);
3080                remove_nats_in_journal(sbi);
3081                f2fs_up_write(&nm_i->nat_tree_lock);
3082        }
3083
3084        if (!nm_i->nat_cnt[DIRTY_NAT])
3085                return 0;
3086
3087        f2fs_down_write(&nm_i->nat_tree_lock);
3088
3089        /*
3090         * if there is not enough space in the journal to store all dirty nat
3091         * entries, remove all entries from journal and merge them
3092         * into nat entry set.
3093         */
3094        if (cpc->reason & CP_UMOUNT ||
3095                !__has_cursum_space(journal,
3096                        nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
3097                remove_nats_in_journal(sbi);
3098
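        /* gather every dirty nat set and order them for flushing */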
3099        while ((found = __gang_lookup_nat_set(nm_i,
3100                                        set_idx, SETVEC_SIZE, setvec))) {
3101                unsigned idx;
3102
3103                set_idx = setvec[found - 1]->set + 1;
3104                for (idx = 0; idx < found; idx++)
3105                        __adjust_nat_entry_set(setvec[idx], &sets,
3106                                                MAX_NAT_JENTRIES(journal));
3107        }
3108
3109        /* flush dirty nats in nat entry set */
3110        list_for_each_entry_safe(set, tmp, &sets, set_list) {
3111                err = __flush_nat_entry_set(sbi, set, cpc);
3112                if (err)
3113                        break;
3114        }
3115
3116        f2fs_up_write(&nm_i->nat_tree_lock);
3117        /* from here, node block allocation in write_begin may dirty nats again */
3118
3119        return err;
3120}
3121
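/*
 * Allocate the in-memory nat_bits buffer and, when the checkpoint carries
 * CP_NAT_BITS_FLAG, read the full/empty NAT block bitmaps from the last
 * blocks of the checkpoint area.
 */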
3122static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
3123{
3124        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
3125        struct f2fs_nm_info *nm_i = NM_I(sbi);
3126        unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE;
3127        unsigned int i;
3128        __u64 cp_ver = cur_cp_version(ckpt);
3129        block_t nat_bits_addr;
3130
3131        nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
3132        nm_i->nat_bits = f2fs_kvzalloc(sbi,
3133                        nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL);
3134        if (!nm_i->nat_bits)
3135                return -ENOMEM;
3136
3137        nm_i->full_nat_bits = nm_i->nat_bits + 8;
3138        nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
3139
3140        if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
3141                return 0;
3142
3143        nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
3144                                                nm_i->nat_bits_blocks;
3145        for (i = 0; i < nm_i->nat_bits_blocks; i++) {
3146                struct page *page;
3147
3148                page = f2fs_get_meta_page(sbi, nat_bits_addr++);
3149                if (IS_ERR(page))
3150                        return PTR_ERR(page);
3151
3152                memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS),
3153                                        page_address(page), F2FS_BLKSIZE);
3154                f2fs_put_page(page, 1);
3155        }
3156
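        /*
         * The first 8 bytes of nat_bits are expected to hold the cp_ver/CRC
         * of the checkpoint that wrote them; on mismatch the bits are stale
         * and get disabled below.
         */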
3157        cp_ver |= (cur_cp_crc(ckpt) << 32);
3158        if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
3159                clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
3160                f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)",
3161                        cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits));
3162                return 0;
3163        }
3164
3165        f2fs_notice(sbi, "Found nat_bits in checkpoint");
3166        return 0;
3167}
3168
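/*
 * Seed the free nid bitmaps from nat_bits: every nid of a NAT block recorded
 * as empty is marked free, and both empty and full NAT blocks are flagged in
 * nat_block_bitmap so later free nid scans can use the cached bitmaps instead
 * of re-reading those NAT blocks.
 */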
3169static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
3170{
3171        struct f2fs_nm_info *nm_i = NM_I(sbi);
3172        unsigned int i = 0;
3173        nid_t nid, last_nid;
3174
3175        if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
3176                return;
3177
3178        for (i = 0; i < nm_i->nat_blocks; i++) {
3179                i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i);
3180                if (i >= nm_i->nat_blocks)
3181                        break;
3182
3183                __set_bit_le(i, nm_i->nat_block_bitmap);
3184
3185                nid = i * NAT_ENTRY_PER_BLOCK;
3186                last_nid = nid + NAT_ENTRY_PER_BLOCK;
3187
3188                spin_lock(&NM_I(sbi)->nid_list_lock);
3189                for (; nid < last_nid; nid++)
3190                        update_free_nid_bitmap(sbi, nid, true, true);
3191                spin_unlock(&NM_I(sbi)->nid_list_lock);
3192        }
3193
3194        for (i = 0; i < nm_i->nat_blocks; i++) {
3195                i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i);
3196                if (i >= nm_i->nat_blocks)
3197                        break;
3198
3199                __set_bit_le(i, nm_i->nat_block_bitmap);
3200        }
3201}
3202
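/*
 * Set up the core of the node manager: derive the NAT geometry and maximum
 * nid from the superblock, initialize counters, thresholds, radix trees,
 * lists and locks, and copy the NAT version bitmap out of the checkpoint.
 */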
3203static int init_node_manager(struct f2fs_sb_info *sbi)
3204{
3205        struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
3206        struct f2fs_nm_info *nm_i = NM_I(sbi);
3207        unsigned char *version_bitmap;
3208        unsigned int nat_segs;
3209        int err;
3210
3211        nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
3212
3213        /* segment_count_nat includes the pair segment, so divide by 2. */
3214        nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
3215        nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
3216        nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks;
3217
3218        /* unusable nids: 0, node, meta (root is already counted as a valid node) */
3219        nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
3220                                                F2FS_RESERVED_NODE_NUM;
3221        nm_i->nid_cnt[FREE_NID] = 0;
3222        nm_i->nid_cnt[PREALLOC_NID] = 0;
3223        nm_i->ram_thresh = DEF_RAM_THRESHOLD;
3224        nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
3225        nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
3226        nm_i->max_rf_node_blocks = DEF_RF_NODE_BLOCKS;
3227
3228        INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
3229        INIT_LIST_HEAD(&nm_i->free_nid_list);
3230        INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
3231        INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
3232        INIT_LIST_HEAD(&nm_i->nat_entries);
3233        spin_lock_init(&nm_i->nat_list_lock);
3234
3235        mutex_init(&nm_i->build_lock);
3236        spin_lock_init(&nm_i->nid_list_lock);
3237        init_f2fs_rwsem(&nm_i->nat_tree_lock);
3238
3239        nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
3240        nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
3241        version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
3242        nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
3243                                        GFP_KERNEL);
3244        if (!nm_i->nat_bitmap)
3245                return -ENOMEM;
3246
3247        err = __get_nat_bitmaps(sbi);
3248        if (err)
3249                return err;
3250
3251#ifdef CONFIG_F2FS_CHECK_FS
3252        nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size,
3253                                        GFP_KERNEL);
3254        if (!nm_i->nat_bitmap_mir)
3255                return -ENOMEM;
3256#endif
3257
3258        return 0;
3259}
3260
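/*
 * Allocate the free nid tracking structures: one free nid bitmap per NAT
 * block, the bitmap of NAT blocks whose state is known, and the per-block
 * free nid counters.
 */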
3261static int init_free_nid_cache(struct f2fs_sb_info *sbi)
3262{
3263        struct f2fs_nm_info *nm_i = NM_I(sbi);
3264        int i;
3265
3266        nm_i->free_nid_bitmap =
3267                f2fs_kvzalloc(sbi, array_size(sizeof(unsigned char *),
3268                                              nm_i->nat_blocks),
3269                              GFP_KERNEL);
3270        if (!nm_i->free_nid_bitmap)
3271                return -ENOMEM;
3272
3273        for (i = 0; i < nm_i->nat_blocks; i++) {
3274                nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi,
3275                        f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK), GFP_KERNEL);
3276                if (!nm_i->free_nid_bitmap[i])
3277                        return -ENOMEM;
3278        }
3279
3280        nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8,
3281                                                                GFP_KERNEL);
3282        if (!nm_i->nat_block_bitmap)
3283                return -ENOMEM;
3284
3285        nm_i->free_nid_count =
3286                f2fs_kvzalloc(sbi, array_size(sizeof(unsigned short),
3287                                              nm_i->nat_blocks),
3288                              GFP_KERNEL);
3289        if (!nm_i->free_nid_count)
3290                return -ENOMEM;
3291        return 0;
3292}
3293
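/*
 * Mount-time entry point: allocate and initialize nm_info, set up the free
 * nid caches, preload free nid state from nat_bits, and scan for free nids.
 */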
3294int f2fs_build_node_manager(struct f2fs_sb_info *sbi)
3295{
3296        int err;
3297
3298        sbi->nm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_nm_info),
3299                                                        GFP_KERNEL);
3300        if (!sbi->nm_info)
3301                return -ENOMEM;
3302
3303        err = init_node_manager(sbi);
3304        if (err)
3305                return err;
3306
3307        err = init_free_nid_cache(sbi);
3308        if (err)
3309                return err;
3310
3311        /* load free nid status from nat_bits table */
3312        load_free_nid_bitmap(sbi);
3313
3314        return f2fs_build_free_nids(sbi, true, true);
3315}
3316
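/*
 * Tear down all node manager state at unmount: the free nid list, the NAT
 * entry and NAT set caches, and every bitmap allocated at mount time.
 */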
3317void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
3318{
3319        struct f2fs_nm_info *nm_i = NM_I(sbi);
3320        struct free_nid *i, *next_i;
3321        struct nat_entry *natvec[NATVEC_SIZE];
3322        struct nat_entry_set *setvec[SETVEC_SIZE];
3323        nid_t nid = 0;
3324        unsigned int found;
3325
3326        if (!nm_i)
3327                return;
3328
3329        /* destroy free nid list */
3330        spin_lock(&nm_i->nid_list_lock);
3331        list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
3332                __remove_free_nid(sbi, i, FREE_NID);
3333                spin_unlock(&nm_i->nid_list_lock);
3334                kmem_cache_free(free_nid_slab, i);
3335                spin_lock(&nm_i->nid_list_lock);
3336        }
3337        f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]);
3338        f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]);
3339        f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list));
3340        spin_unlock(&nm_i->nid_list_lock);
3341
3342        /* destroy nat cache */
3343        f2fs_down_write(&nm_i->nat_tree_lock);
3344        while ((found = __gang_lookup_nat_cache(nm_i,
3345                                        nid, NATVEC_SIZE, natvec))) {
3346                unsigned idx;
3347
3348                nid = nat_get_nid(natvec[found - 1]) + 1;
3349                for (idx = 0; idx < found; idx++) {
3350                        spin_lock(&nm_i->nat_list_lock);
3351                        list_del(&natvec[idx]->list);
3352                        spin_unlock(&nm_i->nat_list_lock);
3353
3354                        __del_from_nat_cache(nm_i, natvec[idx]);
3355                }
3356        }
3357        f2fs_bug_on(sbi, nm_i->nat_cnt[TOTAL_NAT]);
3358
3359        /* destroy nat set cache */
3360        nid = 0;
3361        while ((found = __gang_lookup_nat_set(nm_i,
3362                                        nid, SETVEC_SIZE, setvec))) {
3363                unsigned idx;
3364
3365                nid = setvec[found - 1]->set + 1;
3366                for (idx = 0; idx < found; idx++) {
3367                        /* entry_cnt may be non-zero if a cp_error occurred */
3368                        f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list));
3369                        radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set);
3370                        kmem_cache_free(nat_entry_set_slab, setvec[idx]);
3371                }
3372        }
3373        f2fs_up_write(&nm_i->nat_tree_lock);
3374
3375        kvfree(nm_i->nat_block_bitmap);
3376        if (nm_i->free_nid_bitmap) {
3377                int i;
3378
3379                for (i = 0; i < nm_i->nat_blocks; i++)
3380                        kvfree(nm_i->free_nid_bitmap[i]);
3381                kvfree(nm_i->free_nid_bitmap);
3382        }
3383        kvfree(nm_i->free_nid_count);
3384
3385        kvfree(nm_i->nat_bitmap);
3386        kvfree(nm_i->nat_bits);
3387#ifdef CONFIG_F2FS_CHECK_FS
3388        kvfree(nm_i->nat_bitmap_mir);
3389#endif
3390        sbi->nm_info = NULL;
3391        kfree(nm_i);
3392}
3393
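/*
 * Create the slab caches used by the node manager; on failure, destroy the
 * caches created so far and return -ENOMEM.
 */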
3394int __init f2fs_create_node_manager_caches(void)
3395{
3396        nat_entry_slab = f2fs_kmem_cache_create("f2fs_nat_entry",
3397                        sizeof(struct nat_entry));
3398        if (!nat_entry_slab)
3399                goto fail;
3400
3401        free_nid_slab = f2fs_kmem_cache_create("f2fs_free_nid",
3402                        sizeof(struct free_nid));
3403        if (!free_nid_slab)
3404                goto destroy_nat_entry;
3405
3406        nat_entry_set_slab = f2fs_kmem_cache_create("f2fs_nat_entry_set",
3407                        sizeof(struct nat_entry_set));
3408        if (!nat_entry_set_slab)
3409                goto destroy_free_nid;
3410
3411        fsync_node_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_node_entry",
3412                        sizeof(struct fsync_node_entry));
3413        if (!fsync_node_entry_slab)
3414                goto destroy_nat_entry_set;
3415        return 0;
3416
3417destroy_nat_entry_set:
3418        kmem_cache_destroy(nat_entry_set_slab);
3419destroy_free_nid:
3420        kmem_cache_destroy(free_nid_slab);
3421destroy_nat_entry:
3422        kmem_cache_destroy(nat_entry_slab);
3423fail:
3424        return -ENOMEM;
3425}
3426
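/* Destroy the node manager slab caches in reverse order of creation. */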
3427void f2fs_destroy_node_manager_caches(void)
3428{
3429        kmem_cache_destroy(fsync_node_entry_slab);
3430        kmem_cache_destroy(nat_entry_set_slab);
3431        kmem_cache_destroy(free_nid_slab);
3432        kmem_cache_destroy(nat_entry_slab);
3433}
3434