// SPDX-License-Identifier: GPL-2.0
/*
 * fs/f2fs/gc.c
 *
 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
 *             http://www.samsung.com/
 */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/f2fs_fs.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/sched/signal.h>
#include <linux/random.h>
#include <linux/sched/mm.h>

#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "gc.h"
#include "iostat.h"
#include <trace/events/f2fs.h>

static struct kmem_cache *victim_entry_slab;

static unsigned int count_bits(const unsigned long *addr,
				unsigned int offset, unsigned int len);

static int gc_thread_func(void *data)
{
	struct f2fs_sb_info *sbi = data;
	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
	wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
	wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq;
	unsigned int wait_ms;

	wait_ms = gc_th->min_sleep_time;

	set_freezable();
	do {
		bool sync_mode, foreground = false;

		wait_event_interruptible_timeout(*wq,
				kthread_should_stop() || freezing(current) ||
				waitqueue_active(fggc_wq) ||
				gc_th->gc_wake,
				msecs_to_jiffies(wait_ms));

		if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq))
			foreground = true;

		/* give it a try one time */
		if (gc_th->gc_wake)
			gc_th->gc_wake = 0;

		if (try_to_freeze()) {
			stat_other_skip_bggc_count(sbi);
			continue;
		}
		if (kthread_should_stop())
			break;

		if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
			increase_sleep_time(gc_th, &wait_ms);
			stat_other_skip_bggc_count(sbi);
			continue;
		}

		if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
			f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
			f2fs_stop_checkpoint(sbi, false);
		}

		if (!sb_start_write_trylock(sbi->sb)) {
			stat_other_skip_bggc_count(sbi);
			continue;
		}

		/*
		 * [GC triggering conditions]
		 * 0. GC is not being conducted currently.
		 * 1. There are enough dirty segments.
		 * 2. The IO subsystem is idle, judged by the number of
		 *    writeback pages.
		 * 3. The IO subsystem is idle, judged by the number of
		 *    requests in the bdev's request list.
		 *
		 * Note: we have to avoid triggering GC too frequently,
		 * because segments may be invalidated soon afterwards by
		 * user updates or deletions, so it is better to wait a
		 * while and let dirty segments accumulate.
		 */
		if (sbi->gc_mode == GC_URGENT_HIGH) {
			spin_lock(&sbi->gc_urgent_high_lock);
			if (sbi->gc_urgent_high_limited) {
				if (!sbi->gc_urgent_high_remaining) {
					sbi->gc_urgent_high_limited = false;
					spin_unlock(&sbi->gc_urgent_high_lock);
					sbi->gc_mode = GC_NORMAL;
					continue;
				}
				sbi->gc_urgent_high_remaining--;
			}
			spin_unlock(&sbi->gc_urgent_high_lock);

			wait_ms = gc_th->urgent_sleep_time;
			down_write(&sbi->gc_lock);
			goto do_gc;
		}

		if (foreground) {
			down_write(&sbi->gc_lock);
			goto do_gc;
		} else if (!down_write_trylock(&sbi->gc_lock)) {
			stat_other_skip_bggc_count(sbi);
			goto next;
		}

		if (!is_idle(sbi, GC_TIME)) {
			increase_sleep_time(gc_th, &wait_ms);
			up_write(&sbi->gc_lock);
			stat_io_skip_bggc_count(sbi);
			goto next;
		}

		if (has_enough_invalid_blocks(sbi))
			decrease_sleep_time(gc_th, &wait_ms);
		else
			increase_sleep_time(gc_th, &wait_ms);
do_gc:
		if (!foreground)
			stat_inc_bggc_count(sbi->stat_info);

		sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC;

		/* foreground GC was triggered via f2fs_balance_fs() */
		if (foreground)
			sync_mode = false;

		/* if return value is not zero, no victim was selected */
		if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO))
			wait_ms = gc_th->no_gc_sleep_time;

		if (foreground)
			wake_up_all(&gc_th->fggc_wq);

		trace_f2fs_background_gc(sbi->sb, wait_ms,
				prefree_segments(sbi), free_segments(sbi));

		/* balance f2fs's metadata periodically */
		f2fs_balance_fs_bg(sbi, true);
next:
		sb_end_write(sbi->sb);

	} while (!kthread_should_stop());
	return 0;
}
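
/*
 * Note (illustrative, not part of the original file): the four sleep
 * times consumed by the loop above are runtime tunables; upstream f2fs
 * exposes them via sysfs as gc_urgent_sleep_time, gc_min_sleep_time,
 * gc_max_sleep_time and gc_no_gc_sleep_time under /sys/fs/f2fs/<disk>/,
 * so the thread's pacing can be adjusted without remounting.
 */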

int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
{
	struct f2fs_gc_kthread *gc_th;
	dev_t dev = sbi->sb->s_bdev->bd_dev;
	int err = 0;

	gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
	if (!gc_th) {
		err = -ENOMEM;
		goto out;
	}

	gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
	gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
	gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
	gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;

	gc_th->gc_wake = 0;

	sbi->gc_thread = gc_th;
	init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
	init_waitqueue_head(&sbi->gc_thread->fggc_wq);
	sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
			"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
	if (IS_ERR(gc_th->f2fs_gc_task)) {
		err = PTR_ERR(gc_th->f2fs_gc_task);
		kfree(gc_th);
		sbi->gc_thread = NULL;
	}
out:
	return err;
}

void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
{
	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;

	if (!gc_th)
		return;
	kthread_stop(gc_th->f2fs_gc_task);
	wake_up_all(&gc_th->fggc_wq);
	kfree(gc_th);
	sbi->gc_thread = NULL;
}

static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
{
	int gc_mode;

	if (gc_type == BG_GC) {
		if (sbi->am.atgc_enabled)
			gc_mode = GC_AT;
		else
			gc_mode = GC_CB;
	} else {
		gc_mode = GC_GREEDY;
	}

	switch (sbi->gc_mode) {
	case GC_IDLE_CB:
		gc_mode = GC_CB;
		break;
	case GC_IDLE_GREEDY:
	case GC_URGENT_HIGH:
		gc_mode = GC_GREEDY;
		break;
	case GC_IDLE_AT:
		gc_mode = GC_AT;
		break;
	}

	return gc_mode;
}

static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
			int type, struct victim_sel_policy *p)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);

	if (p->alloc_mode == SSR) {
		p->gc_mode = GC_GREEDY;
		p->dirty_bitmap = dirty_i->dirty_segmap[type];
		p->max_search = dirty_i->nr_dirty[type];
		p->ofs_unit = 1;
	} else if (p->alloc_mode == AT_SSR) {
		p->gc_mode = GC_GREEDY;
		p->dirty_bitmap = dirty_i->dirty_segmap[type];
		p->max_search = dirty_i->nr_dirty[type];
		p->ofs_unit = 1;
	} else {
		p->gc_mode = select_gc_type(sbi, gc_type);
		p->ofs_unit = sbi->segs_per_sec;
		if (__is_large_section(sbi)) {
			p->dirty_bitmap = dirty_i->dirty_secmap;
			p->max_search = count_bits(p->dirty_bitmap,
						0, MAIN_SECS(sbi));
		} else {
			p->dirty_bitmap = dirty_i->dirty_segmap[DIRTY];
			p->max_search = dirty_i->nr_dirty[DIRTY];
		}
	}

	/*
	 * adjust the candidate range; all dirty segments should be
	 * searched in the foreground GC and urgent GC cases.
	 */
	if (gc_type != FG_GC &&
			(sbi->gc_mode != GC_URGENT_HIGH) &&
			(p->gc_mode != GC_AT && p->alloc_mode != AT_SSR) &&
			p->max_search > sbi->max_victim_search)
		p->max_search = sbi->max_victim_search;

	/* let's select the beginning hot/small space first in no_heap mode */
	if (f2fs_need_rand_seg(sbi))
		p->offset = prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec);
	else if (test_opt(sbi, NOHEAP) &&
		(type == CURSEG_HOT_DATA || IS_NODESEG(type)))
		p->offset = 0;
	else
		p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
}

static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
				struct victim_sel_policy *p)
{
	/* SSR allocates in a segment unit */
	if (p->alloc_mode == SSR)
		return sbi->blocks_per_seg;
	else if (p->alloc_mode == AT_SSR)
		return UINT_MAX;

	/* LFS */
	if (p->gc_mode == GC_GREEDY)
		return 2 * sbi->blocks_per_seg * p->ofs_unit;
	else if (p->gc_mode == GC_CB)
		return UINT_MAX;
	else if (p->gc_mode == GC_AT)
		return UINT_MAX;
	else /* No other gc_mode */
		return 0;
}
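
/*
 * Worked example (illustrative, not part of the original file): with 512
 * blocks per segment and one segment per section (p->ofs_unit == 1),
 * GC_GREEDY's max cost is 2 * 512 * 1 == 1024. Since a candidate's greedy
 * cost is its valid-block count (at most 512), every dirty candidate beats
 * this initial bound; it only acts as a "no victim found yet" sentinel.
 */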

static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
	unsigned int secno;

	/*
	 * If the gc_type is FG_GC, we can reuse victim sections that were
	 * selected by background GC earlier; those sections are guaranteed
	 * to contain few valid blocks.
	 */
	for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
		if (sec_usage_check(sbi, secno))
			continue;
		clear_bit(secno, dirty_i->victim_secmap);
		return GET_SEG_FROM_SEC(sbi, secno);
	}
	return NULL_SEGNO;
}

static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
{
	struct sit_info *sit_i = SIT_I(sbi);
	unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
	unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
	unsigned long long mtime = 0;
	unsigned int vblocks;
	unsigned char age = 0;
	unsigned char u;
	unsigned int i;
	unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi, segno);

	for (i = 0; i < usable_segs_per_sec; i++)
		mtime += get_seg_entry(sbi, start + i)->mtime;
	vblocks = get_valid_blocks(sbi, segno, true);

	mtime = div_u64(mtime, usable_segs_per_sec);
	vblocks = div_u64(vblocks, usable_segs_per_sec);

	u = (vblocks * 100) >> sbi->log_blocks_per_seg;

	/* Handle the case where the system time was changed by the user */
	if (mtime < sit_i->min_mtime)
		sit_i->min_mtime = mtime;
	if (mtime > sit_i->max_mtime)
		sit_i->max_mtime = mtime;
	if (sit_i->max_mtime != sit_i->min_mtime)
		age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
				sit_i->max_mtime - sit_i->min_mtime);

	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
}
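
/*
 * Worked example (illustrative, not part of the original file, assuming
 * 512 blocks per segment, i.e. log_blocks_per_seg == 9): a one-segment
 * section with vblocks == 128 gives u = (128 * 100) >> 9 == 25, about 25%
 * utilized. If its mtime sits halfway between min_mtime and max_mtime,
 * age == 100 - 50 == 50, and the returned cost is
 *
 *	UINT_MAX - (100 * (100 - 25) * 50) / (100 + 25) == UINT_MAX - 3000
 *
 * so older, emptier sections yield smaller costs and win the cost-benefit
 * selection.
 */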

static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
			unsigned int segno, struct victim_sel_policy *p)
{
	if (p->alloc_mode == SSR)
		return get_seg_entry(sbi, segno)->ckpt_valid_blocks;

	/* alloc_mode == LFS */
	if (p->gc_mode == GC_GREEDY)
		return get_valid_blocks(sbi, segno, true);
	else if (p->gc_mode == GC_CB)
		return get_cb_cost(sbi, segno);

	f2fs_bug_on(sbi, 1);
	return 0;
}

static unsigned int count_bits(const unsigned long *addr,
				unsigned int offset, unsigned int len)
{
	unsigned int end = offset + len, sum = 0;

	while (offset < end) {
		if (test_bit(offset++, addr))
			++sum;
	}
	return sum;
}

static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi,
				unsigned long long mtime, unsigned int segno,
				struct rb_node *parent, struct rb_node **p,
				bool left_most)
{
	struct atgc_management *am = &sbi->am;
	struct victim_entry *ve;

	ve = f2fs_kmem_cache_alloc(victim_entry_slab,
				GFP_NOFS, true, NULL);

	ve->mtime = mtime;
	ve->segno = segno;

	rb_link_node(&ve->rb_node, parent, p);
	rb_insert_color_cached(&ve->rb_node, &am->root, left_most);

	list_add_tail(&ve->list, &am->victim_list);

	am->victim_count++;

	return ve;
}

static void insert_victim_entry(struct f2fs_sb_info *sbi,
				unsigned long long mtime, unsigned int segno)
{
	struct atgc_management *am = &sbi->am;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	bool left_most = true;

	p = f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, mtime, &left_most);
	attach_victim_entry(sbi, mtime, segno, parent, p, left_most);
}

static void add_victim_entry(struct f2fs_sb_info *sbi,
				struct victim_sel_policy *p, unsigned int segno)
{
	struct sit_info *sit_i = SIT_I(sbi);
	unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
	unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
	unsigned long long mtime = 0;
	unsigned int i;

	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
		if (p->gc_mode == GC_AT &&
			get_valid_blocks(sbi, segno, true) == 0)
			return;
	}

	for (i = 0; i < sbi->segs_per_sec; i++)
		mtime += get_seg_entry(sbi, start + i)->mtime;
	mtime = div_u64(mtime, sbi->segs_per_sec);

	/* Handle the case where the system time was changed by the user */
	if (mtime < sit_i->min_mtime)
		sit_i->min_mtime = mtime;
	if (mtime > sit_i->max_mtime)
		sit_i->max_mtime = mtime;
	if (mtime < sit_i->dirty_min_mtime)
		sit_i->dirty_min_mtime = mtime;
	if (mtime > sit_i->dirty_max_mtime)
		sit_i->dirty_max_mtime = mtime;

	/* don't choose a young section as a candidate */
	if (sit_i->dirty_max_mtime - mtime < p->age_threshold)
		return;

	insert_victim_entry(sbi, mtime, segno);
}

static struct rb_node *lookup_central_victim(struct f2fs_sb_info *sbi,
						struct victim_sel_policy *p)
{
	struct atgc_management *am = &sbi->am;
	struct rb_node *parent = NULL;
	bool left_most;

	f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, p->age, &left_most);

	return parent;
}

static void atgc_lookup_victim(struct f2fs_sb_info *sbi,
						struct victim_sel_policy *p)
{
	struct sit_info *sit_i = SIT_I(sbi);
	struct atgc_management *am = &sbi->am;
	struct rb_root_cached *root = &am->root;
	struct rb_node *node;
	struct rb_entry *re;
	struct victim_entry *ve;
	unsigned long long total_time;
	unsigned long long age, u, accu;
	unsigned long long max_mtime = sit_i->dirty_max_mtime;
	unsigned long long min_mtime = sit_i->dirty_min_mtime;
	unsigned int sec_blocks = BLKS_PER_SEC(sbi);
	unsigned int vblocks;
	unsigned int dirty_threshold = max(am->max_candidate_count,
					am->candidate_ratio *
					am->victim_count / 100);
	unsigned int age_weight = am->age_weight;
	unsigned int cost;
	unsigned int iter = 0;

	if (max_mtime < min_mtime)
		return;

	max_mtime += 1;
	total_time = max_mtime - min_mtime;

	accu = div64_u64(ULLONG_MAX, total_time);
	accu = min_t(unsigned long long, div_u64(accu, 100),
					DEFAULT_ACCURACY_CLASS);

	node = rb_first_cached(root);
next:
	re = rb_entry_safe(node, struct rb_entry, rb_node);
	if (!re)
		return;

	ve = (struct victim_entry *)re;

	if (ve->mtime >= max_mtime || ve->mtime < min_mtime)
		goto skip;

	/* age = 10000 * x% * 60 */
	age = div64_u64(accu * (max_mtime - ve->mtime), total_time) *
								age_weight;

	vblocks = get_valid_blocks(sbi, ve->segno, true);
	f2fs_bug_on(sbi, !vblocks || vblocks == sec_blocks);

	/* u = 10000 * x% * 40 */
	u = div64_u64(accu * (sec_blocks - vblocks), sec_blocks) *
							(100 - age_weight);

	f2fs_bug_on(sbi, age + u >= UINT_MAX);

	cost = UINT_MAX - (age + u);
	iter++;

	if (cost < p->min_cost ||
			(cost == p->min_cost && age > p->oldest_age)) {
		p->min_cost = cost;
		p->oldest_age = age;
		p->min_segno = ve->segno;
	}
skip:
	if (iter < dirty_threshold) {
		node = rb_next(node);
		goto next;
	}
}
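
/*
 * Worked example (illustrative, not part of the original file): with the
 * 60/40 age weighting suggested by the comments above, a section sitting
 * 80% of the way back through the dirty mtime span and holding 25% valid
 * blocks scores roughly age : u = (80 * 60) : (75 * 40) in the common
 * fixed-point scale set by accu. Since cost = UINT_MAX - (age + u), the
 * oldest and emptiest sections produce the smallest costs and win.
 */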

/*
 * Select candidates around the source section, within the range
 * [target - dirty_threshold, target + dirty_threshold].
 */
static void atssr_lookup_victim(struct f2fs_sb_info *sbi,
						struct victim_sel_policy *p)
{
	struct sit_info *sit_i = SIT_I(sbi);
	struct atgc_management *am = &sbi->am;
	struct rb_node *node;
	struct rb_entry *re;
	struct victim_entry *ve;
	unsigned long long age;
	unsigned long long max_mtime = sit_i->dirty_max_mtime;
	unsigned long long min_mtime = sit_i->dirty_min_mtime;
	unsigned int seg_blocks = sbi->blocks_per_seg;
	unsigned int vblocks;
	unsigned int dirty_threshold = max(am->max_candidate_count,
					am->candidate_ratio *
					am->victim_count / 100);
	unsigned int cost;
	unsigned int iter = 0;
	int stage = 0;

	if (max_mtime < min_mtime)
		return;
	max_mtime += 1;
next_stage:
	node = lookup_central_victim(sbi, p);
next_node:
	re = rb_entry_safe(node, struct rb_entry, rb_node);
	if (!re) {
		if (stage == 0)
			goto skip_stage;
		return;
	}

	ve = (struct victim_entry *)re;

	if (ve->mtime >= max_mtime || ve->mtime < min_mtime)
		goto skip_node;

	age = max_mtime - ve->mtime;

	vblocks = get_seg_entry(sbi, ve->segno)->ckpt_valid_blocks;
	f2fs_bug_on(sbi, !vblocks);

	/* rare case */
	if (vblocks == seg_blocks)
		goto skip_node;

	iter++;

	age = max_mtime - abs(p->age - age);
	cost = UINT_MAX - vblocks;

	if (cost < p->min_cost ||
			(cost == p->min_cost && age > p->oldest_age)) {
		p->min_cost = cost;
		p->oldest_age = age;
		p->min_segno = ve->segno;
	}
skip_node:
	if (iter < dirty_threshold) {
		if (stage == 0)
			node = rb_prev(node);
		else if (stage == 1)
			node = rb_next(node);
		goto next_node;
	}
skip_stage:
	if (stage < 1) {
		stage++;
		iter = 0;
		goto next_stage;
	}
}
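
/*
 * Illustrative note (not part of the original file): both stages above
 * start from the tree node closest to the target age; stage 0 walks left
 * via rb_prev() and stage 1 walks right via rb_next(), so at most
 * 2 * dirty_threshold candidates around the central victim are examined.
 */
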
static void lookup_victim_by_age(struct f2fs_sb_info *sbi,
						struct victim_sel_policy *p)
{
	f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi,
						&sbi->am.root, true));

	if (p->gc_mode == GC_AT)
		atgc_lookup_victim(sbi, p);
	else if (p->alloc_mode == AT_SSR)
		atssr_lookup_victim(sbi, p);
	else
		f2fs_bug_on(sbi, 1);
}

static void release_victim_entry(struct f2fs_sb_info *sbi)
{
	struct atgc_management *am = &sbi->am;
	struct victim_entry *ve, *tmp;

	list_for_each_entry_safe(ve, tmp, &am->victim_list, list) {
		list_del(&ve->list);
		kmem_cache_free(victim_entry_slab, ve);
		am->victim_count--;
	}

	am->root = RB_ROOT_CACHED;

	f2fs_bug_on(sbi, am->victim_count);
	f2fs_bug_on(sbi, !list_empty(&am->victim_list));
}

/*
 * This function is called from two paths.
 * One is garbage collection and the other is SSR segment selection.
 * When called during GC, it just picks a victim segment and does not
 * remove it from the dirty seglist.
 * When called for SSR segment selection, it finds the segment with the
 * minimum number of valid blocks and removes it from the dirty seglist.
 */
static int get_victim_by_default(struct f2fs_sb_info *sbi,
			unsigned int *result, int gc_type, int type,
			char alloc_mode, unsigned long long age)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
	struct sit_info *sm = SIT_I(sbi);
	struct victim_sel_policy p;
	unsigned int secno, last_victim;
	unsigned int last_segment;
	unsigned int nsearched;
	bool is_atgc;
	int ret = 0;

	mutex_lock(&dirty_i->seglist_lock);
	last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec;

	p.alloc_mode = alloc_mode;
	p.age = age;
	p.age_threshold = sbi->am.age_threshold;

retry:
	select_policy(sbi, gc_type, type, &p);
	p.min_segno = NULL_SEGNO;
	p.oldest_age = 0;
	p.min_cost = get_max_cost(sbi, &p);

	is_atgc = (p.gc_mode == GC_AT || p.alloc_mode == AT_SSR);
	nsearched = 0;

	if (is_atgc)
		SIT_I(sbi)->dirty_min_mtime = ULLONG_MAX;

	if (*result != NULL_SEGNO) {
		if (!get_valid_blocks(sbi, *result, false)) {
			ret = -ENODATA;
			goto out;
		}

		if (sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
			ret = -EBUSY;
		else
			p.min_segno = *result;
		goto out;
	}

	ret = -ENODATA;
	if (p.max_search == 0)
		goto out;

	if (__is_large_section(sbi) && p.alloc_mode == LFS) {
		if (sbi->next_victim_seg[BG_GC] != NULL_SEGNO) {
			p.min_segno = sbi->next_victim_seg[BG_GC];
			*result = p.min_segno;
			sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
			goto got_result;
		}
		if (gc_type == FG_GC &&
				sbi->next_victim_seg[FG_GC] != NULL_SEGNO) {
			p.min_segno = sbi->next_victim_seg[FG_GC];
			*result = p.min_segno;
			sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
			goto got_result;
		}
	}

	last_victim = sm->last_victim[p.gc_mode];
	if (p.alloc_mode == LFS && gc_type == FG_GC) {
		p.min_segno = check_bg_victims(sbi);
		if (p.min_segno != NULL_SEGNO)
			goto got_it;
	}

	while (1) {
		unsigned long cost, *dirty_bitmap;
		unsigned int unit_no, segno;

		dirty_bitmap = p.dirty_bitmap;
		unit_no = find_next_bit(dirty_bitmap,
				last_segment / p.ofs_unit,
				p.offset / p.ofs_unit);
		segno = unit_no * p.ofs_unit;
		if (segno >= last_segment) {
			if (sm->last_victim[p.gc_mode]) {
				last_segment =
					sm->last_victim[p.gc_mode];
				sm->last_victim[p.gc_mode] = 0;
				p.offset = 0;
				continue;
			}
			break;
		}

		p.offset = segno + p.ofs_unit;
		nsearched++;

#ifdef CONFIG_F2FS_CHECK_FS
		/*
		 * skip selecting an invalid segno (i.e. one that failed the
		 * block validity check during GC) to avoid an endless GC
		 * loop in such cases.
		 */
		if (test_bit(segno, sm->invalid_segmap))
			goto next;
#endif

		secno = GET_SEC_FROM_SEG(sbi, segno);

		if (sec_usage_check(sbi, secno))
			goto next;

		/* Don't touch checkpointed data */
		if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
			if (p.alloc_mode == LFS) {
				/*
				 * LFS is set to find a source section
				 * during GC; the victim should have no
				 * checkpointed data.
				 */
				if (get_ckpt_valid_blocks(sbi, segno, true))
					goto next;
			} else {
				/*
				 * SSR | AT_SSR are set to find a target
				 * segment for writes, which may be filled
				 * by both checkpointed and newly written
				 * blocks.
				 */
				if (!f2fs_segment_has_free_slot(sbi, segno))
					goto next;
			}
		}

		if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
			goto next;

		if (is_atgc) {
			add_victim_entry(sbi, &p, segno);
			goto next;
		}

		cost = get_gc_cost(sbi, segno, &p);

		if (p.min_cost > cost) {
			p.min_segno = segno;
			p.min_cost = cost;
		}
next:
		if (nsearched >= p.max_search) {
			if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
				sm->last_victim[p.gc_mode] =
					last_victim + p.ofs_unit;
			else
				sm->last_victim[p.gc_mode] = segno + p.ofs_unit;
			sm->last_victim[p.gc_mode] %=
				(MAIN_SECS(sbi) * sbi->segs_per_sec);
			break;
		}
	}

	/* get victim for GC_AT/AT_SSR */
	if (is_atgc) {
		lookup_victim_by_age(sbi, &p);
		release_victim_entry(sbi);
	}

	if (is_atgc && p.min_segno == NULL_SEGNO &&
			sm->elapsed_time < p.age_threshold) {
		p.age_threshold = 0;
		goto retry;
	}

	if (p.min_segno != NULL_SEGNO) {
got_it:
		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
got_result:
		if (p.alloc_mode == LFS) {
			secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
			if (gc_type == FG_GC)
				sbi->cur_victim_sec = secno;
			else
				set_bit(secno, dirty_i->victim_secmap);
		}
		ret = 0;
	}
out:
	if (p.min_segno != NULL_SEGNO)
		trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
				sbi->cur_victim_sec,
				prefree_segments(sbi), free_segments(sbi));
	mutex_unlock(&dirty_i->seglist_lock);

	return ret;
}

static const struct victim_selection default_v_ops = {
	.get_victim = get_victim_by_default,
};
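
/*
 * Usage sketch (illustrative, not part of the original file): callers are
 * expected to reach get_victim_by_default() indirectly through
 * DIRTY_I(sbi)->v_ops->get_victim(), both from the GC path and from SSR
 * destination-segment selection; *result carries a pre-selected segment in
 * and the chosen victim out.
 */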

static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
{
	struct inode_entry *ie;

	ie = radix_tree_lookup(&gc_list->iroot, ino);
	if (ie)
		return ie->inode;
	return NULL;
}

static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
{
	struct inode_entry *new_ie;

	if (inode == find_gc_inode(gc_list, inode->i_ino)) {
		iput(inode);
		return;
	}
	new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab,
					GFP_NOFS, true, NULL);
	new_ie->inode = inode;

	f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
	list_add_tail(&new_ie->list, &gc_list->ilist);
}

static void put_gc_inode(struct gc_inode_list *gc_list)
{
	struct inode_entry *ie, *next_ie;

	list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
		radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
		iput(ie->inode);
		list_del(&ie->list);
		kmem_cache_free(f2fs_inode_entry_slab, ie);
	}
}

static int check_valid_map(struct f2fs_sb_info *sbi,
				unsigned int segno, int offset)
{
	struct sit_info *sit_i = SIT_I(sbi);
	struct seg_entry *sentry;
	int ret;

	down_read(&sit_i->sentry_lock);
	sentry = get_seg_entry(sbi, segno);
	ret = f2fs_test_bit(offset, sentry->cur_valid_map);
	up_read(&sit_i->sentry_lock);
	return ret;
}

/*
 * This function compares the node address recorded in the summary with
 * the one in the NAT. If the address is valid, the node is migrated with
 * cold status; otherwise the invalid node is ignored.
 */
static int gc_node_segment(struct f2fs_sb_info *sbi,
		struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
	struct f2fs_summary *entry;
	block_t start_addr;
	int off;
	int phase = 0;
	bool fggc = (gc_type == FG_GC);
	int submitted = 0;
	unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno);

	start_addr = START_BLOCK(sbi, segno);

next_step:
	entry = sum;

	if (fggc && phase == 2)
		atomic_inc(&sbi->wb_sync_req[NODE]);

	for (off = 0; off < usable_blks_in_seg; off++, entry++) {
		nid_t nid = le32_to_cpu(entry->nid);
		struct page *node_page;
		struct node_info ni;
		int err;

		/* stop BG_GC if there are not enough free sections */
		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
			return submitted;

		if (check_valid_map(sbi, segno, off) == 0)
			continue;

		if (phase == 0) {
			f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
							META_NAT, true);
			continue;
		}

		if (phase == 1) {
			f2fs_ra_node_page(sbi, nid);
			continue;
		}

		/* phase == 2 */
		node_page = f2fs_get_node_page(sbi, nid);
		if (IS_ERR(node_page))
			continue;

		/* block may become invalid during f2fs_get_node_page */
		if (check_valid_map(sbi, segno, off) == 0) {
			f2fs_put_page(node_page, 1);
			continue;
		}

		if (f2fs_get_node_info(sbi, nid, &ni, false)) {
			f2fs_put_page(node_page, 1);
			continue;
		}

		if (ni.blk_addr != start_addr + off) {
			f2fs_put_page(node_page, 1);
			continue;
		}

		err = f2fs_move_node_page(node_page, gc_type);
		if (!err && gc_type == FG_GC)
			submitted++;
		stat_inc_node_blk_count(sbi, 1, gc_type);
	}

	if (++phase < 3)
		goto next_step;

	if (fggc)
		atomic_dec(&sbi->wb_sync_req[NODE]);
	return submitted;
}

/*
 * Calculate the start block index that the given node offset indicates.
 * Be careful: the caller must pass a node offset that refers only to
 * direct node blocks. Passing an offset that points to any other type of
 * node block, such as an indirect or double indirect node block, is a
 * caller's bug.
 */
block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
{
	unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
	unsigned int bidx;

	if (node_ofs == 0)
		return 0;

	if (node_ofs <= 2) {
		bidx = node_ofs - 1;
	} else if (node_ofs <= indirect_blks) {
		int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);

		bidx = node_ofs - 2 - dec;
	} else {
		int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);

		bidx = node_ofs - 5 - dec;
	}
	return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode);
}
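
/*
 * Worked example (illustrative, not part of the original file, assuming
 * NIDS_PER_BLOCK == 1018): node_ofs == 1 and 2 are the two direct nodes,
 * giving bidx == 0 and 1. The first direct node under the first indirect
 * node sits at node_ofs == 4, so dec == (4 - 4) / 1019 == 0 and
 * bidx == 4 - 2 - 0 == 2, i.e. the third direct node block of the file.
 * The returned block index is then bidx * ADDRS_PER_BLOCK(inode) +
 * ADDRS_PER_INODE(inode), skipping the addresses stored in the inode
 * itself.
 */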

static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
		struct node_info *dni, block_t blkaddr, unsigned int *nofs)
{
	struct page *node_page;
	nid_t nid;
	unsigned int ofs_in_node;
	block_t source_blkaddr;

	nid = le32_to_cpu(sum->nid);
	ofs_in_node = le16_to_cpu(sum->ofs_in_node);

	node_page = f2fs_get_node_page(sbi, nid);
	if (IS_ERR(node_page))
		return false;

	if (f2fs_get_node_info(sbi, nid, dni, false)) {
		f2fs_put_page(node_page, 1);
		return false;
	}

	if (sum->version != dni->version) {
		f2fs_warn(sbi, "%s: valid data with mismatched node version.",
			  __func__);
		set_sbi_flag(sbi, SBI_NEED_FSCK);
	}

	if (f2fs_check_nid_range(sbi, dni->ino))
		return false;

	*nofs = ofs_of_node(node_page);
	source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
	f2fs_put_page(node_page, 1);

	if (source_blkaddr != blkaddr) {
#ifdef CONFIG_F2FS_CHECK_FS
		unsigned int segno = GET_SEGNO(sbi, blkaddr);
		unsigned long offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);

		if (unlikely(check_valid_map(sbi, segno, offset))) {
			if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) {
				f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u",
					 blkaddr, source_blkaddr, segno);
				set_sbi_flag(sbi, SBI_NEED_FSCK);
			}
		}
#endif
		return false;
	}
	return true;
}

static int ra_data_block(struct inode *inode, pgoff_t index)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	struct address_space *mapping = inode->i_mapping;
	struct dnode_of_data dn;
	struct page *page;
	struct extent_info ei = {0, 0, 0};
	struct f2fs_io_info fio = {
		.sbi = sbi,
		.ino = inode->i_ino,
		.type = DATA,
		.temp = COLD,
		.op = REQ_OP_READ,
		.op_flags = 0,
		.encrypted_page = NULL,
		.in_list = false,
		.retry = false,
	};
	int err;

	page = f2fs_grab_cache_page(mapping, index, true);
	if (!page)
		return -ENOMEM;

	if (f2fs_lookup_extent_cache(inode, index, &ei)) {
		dn.data_blkaddr = ei.blk + index - ei.fofs;
		if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
						DATA_GENERIC_ENHANCE_READ))) {
			err = -EFSCORRUPTED;
			goto put_page;
		}
		goto got_it;
	}

	set_new_dnode(&dn, inode, NULL, NULL, 0);
	err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
	if (err)
		goto put_page;
	f2fs_put_dnode(&dn);

	if (!__is_valid_data_blkaddr(dn.data_blkaddr)) {
		err = -ENOENT;
		goto put_page;
	}
	if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
						DATA_GENERIC_ENHANCE))) {
		err = -EFSCORRUPTED;
		goto put_page;
	}
got_it:
	/* read page */
	fio.page = page;
	fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;

	/*
	 * don't cache encrypted data in the meta inode until the previous
	 * dirty data has been written back, to avoid a race between GC
	 * and flush.
	 */
	f2fs_wait_on_page_writeback(page, DATA, true, true);

	f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);

	fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi),
					dn.data_blkaddr,
					FGP_LOCK | FGP_CREAT, GFP_NOFS);
	if (!fio.encrypted_page) {
		err = -ENOMEM;
		goto put_page;
	}

	err = f2fs_submit_page_bio(&fio);
	if (err)
		goto put_encrypted_page;
	f2fs_put_page(fio.encrypted_page, 0);
	f2fs_put_page(page, 1);

	f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
	f2fs_update_iostat(sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE);

	return 0;
put_encrypted_page:
	f2fs_put_page(fio.encrypted_page, 1);
put_page:
	f2fs_put_page(page, 1);
	return err;
}

/*
 * Move a data block via META_MAPPING while keeping the data page locked.
 * This can be used to move blocks, aka LBAs, directly on disk.
 */
static int move_data_block(struct inode *inode, block_t bidx,
				int gc_type, unsigned int segno, int off)
{
	struct f2fs_io_info fio = {
		.sbi = F2FS_I_SB(inode),
		.ino = inode->i_ino,
		.type = DATA,
		.temp = COLD,
		.op = REQ_OP_READ,
		.op_flags = 0,
		.encrypted_page = NULL,
		.in_list = false,
		.retry = false,
	};
	struct dnode_of_data dn;
	struct f2fs_summary sum;
	struct node_info ni;
	struct page *page, *mpage;
	block_t newaddr;
	int err = 0;
	bool lfs_mode = f2fs_lfs_mode(fio.sbi);
	int type = fio.sbi->am.atgc_enabled && (gc_type == BG_GC) &&
				(fio.sbi->gc_mode != GC_URGENT_HIGH) ?
				CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA;

	/* do not read out */
	page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
	if (!page)
		return -ENOMEM;

	if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
		err = -ENOENT;
		goto out;
	}

	if (f2fs_is_atomic_file(inode)) {
		F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
		F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
		err = -EAGAIN;
		goto out;
	}

	if (f2fs_is_pinned_file(inode)) {
		f2fs_pin_file_control(inode, true);
		err = -EAGAIN;
		goto out;
	}

	set_new_dnode(&dn, inode, NULL, NULL, 0);
	err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
	if (err)
		goto out;

	if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
		ClearPageUptodate(page);
		err = -ENOENT;
		goto put_out;
	}

	/*
	 * don't cache encrypted data in the meta inode until the previous
	 * dirty data has been written back, to avoid a race between GC
	 * and flush.
	 */
	f2fs_wait_on_page_writeback(page, DATA, true, true);

	f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);

	err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false);
	if (err)
		goto put_out;

	/* read page */
	fio.page = page;
	fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;

	if (lfs_mode)
		down_write(&fio.sbi->io_order_lock);

	mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi),
					fio.old_blkaddr, false);
	if (!mpage) {
		err = -ENOMEM;
		goto up_out;
	}

	fio.encrypted_page = mpage;

	/* read source block in mpage */
	if (!PageUptodate(mpage)) {
		err = f2fs_submit_page_bio(&fio);
		if (err) {
			f2fs_put_page(mpage, 1);
			goto up_out;
		}

		f2fs_update_iostat(fio.sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
		f2fs_update_iostat(fio.sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE);

		lock_page(mpage);
		if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) ||
						!PageUptodate(mpage))) {
			err = -EIO;
			f2fs_put_page(mpage, 1);
			goto up_out;
		}
	}

	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);

	/* allocate block address */
	f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
				&sum, type, NULL);

	fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
				newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
	if (!fio.encrypted_page) {
		err = -ENOMEM;
		f2fs_put_page(mpage, 1);
		goto recover_block;
	}

	/* write target block */
	f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true);
	memcpy(page_address(fio.encrypted_page),
				page_address(mpage), PAGE_SIZE);
	f2fs_put_page(mpage, 1);
	invalidate_mapping_pages(META_MAPPING(fio.sbi),
				fio.old_blkaddr, fio.old_blkaddr);
	f2fs_invalidate_compress_page(fio.sbi, fio.old_blkaddr);

	set_page_dirty(fio.encrypted_page);
	if (clear_page_dirty_for_io(fio.encrypted_page))
		dec_page_count(fio.sbi, F2FS_DIRTY_META);

	set_page_writeback(fio.encrypted_page);
	ClearPageError(page);

	fio.op = REQ_OP_WRITE;
	fio.op_flags = REQ_SYNC;
	fio.new_blkaddr = newaddr;
	f2fs_submit_page_write(&fio);
	if (fio.retry) {
		err = -EAGAIN;
		if (PageWriteback(fio.encrypted_page))
			end_page_writeback(fio.encrypted_page);
		goto put_page_out;
	}

	f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE);

	f2fs_update_data_blkaddr(&dn, newaddr);
	set_inode_flag(inode, FI_APPEND_WRITE);
	if (page->index == 0)
		set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
put_page_out:
	f2fs_put_page(fio.encrypted_page, 1);
recover_block:
	if (err)
		f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
							true, true, true);
up_out:
	if (lfs_mode)
		up_write(&fio.sbi->io_order_lock);
put_out:
	f2fs_put_dnode(&dn);
out:
	f2fs_put_page(page, 1);
	return err;
}
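
/*
 * Flow summary (illustrative, not part of the original file): the victim
 * block is read into an encrypted page in META_MAPPING, a new block
 * address is allocated, the bytes are memcpy'd into a META_MAPPING page
 * at the new address, and that page is written back synchronously; on
 * failure the new allocation is reverted via f2fs_do_replace_block().
 */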
1326
1327static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
1328                                                        unsigned int segno, int off)
1329{
1330        struct page *page;
1331        int err = 0;
1332
1333        page = f2fs_get_lock_data_page(inode, bidx, true);
1334        if (IS_ERR(page))
1335                return PTR_ERR(page);
1336
1337        if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
1338                err = -ENOENT;
1339                goto out;
1340        }
1341
1342        if (f2fs_is_atomic_file(inode)) {
1343                F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
1344                F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
1345                err = -EAGAIN;
1346                goto out;
1347        }
1348        if (f2fs_is_pinned_file(inode)) {
1349                if (gc_type == FG_GC)
1350                        f2fs_pin_file_control(inode, true);
1351                err = -EAGAIN;
1352                goto out;
1353        }
1354
1355        if (gc_type == BG_GC) {
1356                if (PageWriteback(page)) {
1357                        err = -EAGAIN;
1358                        goto out;
1359                }
1360                set_page_dirty(page);
1361                set_page_private_gcing(page);
1362        } else {
1363                struct f2fs_io_info fio = {
1364                        .sbi = F2FS_I_SB(inode),
1365                        .ino = inode->i_ino,
1366                        .type = DATA,
1367                        .temp = COLD,
1368                        .op = REQ_OP_WRITE,
1369                        .op_flags = REQ_SYNC,
1370                        .old_blkaddr = NULL_ADDR,
1371                        .page = page,
1372                        .encrypted_page = NULL,
1373                        .need_lock = LOCK_REQ,
1374                        .io_type = FS_GC_DATA_IO,
1375                };
1376                bool is_dirty = PageDirty(page);
1377
1378retry:
1379                f2fs_wait_on_page_writeback(page, DATA, true, true);
1380
1381                set_page_dirty(page);
1382                if (clear_page_dirty_for_io(page)) {
1383                        inode_dec_dirty_pages(inode);
1384                        f2fs_remove_dirty_inode(inode);
1385                }
1386
1387                set_page_private_gcing(page);
1388
1389                err = f2fs_do_write_data_page(&fio);
1390                if (err) {
1391                        clear_page_private_gcing(page);
1392                        if (err == -ENOMEM) {
1393                                memalloc_retry_wait(GFP_NOFS);
1394                                goto retry;
1395                        }
1396                        if (is_dirty)
1397                                set_page_dirty(page);
1398                }
1399        }
1400out:
1401        f2fs_put_page(page, 1);
1402        return err;
1403}
1404
1405/*
1406 * This function tries to get parent node of victim data block, and identifies
1407 * data block validity. If the block is valid, copy that with cold status and
1408 * modify parent node.
1409 * If the parent node is not valid or the data block address is different,
1410 * the victim data block is ignored.
1411 */
1412static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
1413                struct gc_inode_list *gc_list, unsigned int segno, int gc_type,
1414                bool force_migrate)
1415{
1416        struct super_block *sb = sbi->sb;
1417        struct f2fs_summary *entry;
1418        block_t start_addr;
1419        int off;
1420        int phase = 0;
1421        int submitted = 0;
1422        unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno);
1423
1424        start_addr = START_BLOCK(sbi, segno);
1425
1426next_step:
1427        entry = sum;
1428
1429        for (off = 0; off < usable_blks_in_seg; off++, entry++) {
1430                struct page *data_page;
1431                struct inode *inode;
1432                struct node_info dni; /* dnode info for the data */
1433                unsigned int ofs_in_node, nofs;
1434                block_t start_bidx;
1435                nid_t nid = le32_to_cpu(entry->nid);
1436
1437                /*
1438                 * stop BG_GC if there is not enough free sections.
1439                 * Or, stop GC if the segment becomes fully valid caused by
1440                 * race condition along with SSR block allocation.
1441                 */
1442                if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
1443                        (!force_migrate && get_valid_blocks(sbi, segno, true) ==
1444                                                        BLKS_PER_SEC(sbi)))
1445                        return submitted;
1446
1447                if (check_valid_map(sbi, segno, off) == 0)
1448                        continue;
1449
1450                if (phase == 0) {
1451                        f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
1452                                                        META_NAT, true);
1453                        continue;
1454                }
1455
1456                if (phase == 1) {
1457                        f2fs_ra_node_page(sbi, nid);
1458                        continue;
1459                }
1460
1461                /* Get an inode by ino with checking validity */
1462                if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
1463                        continue;
1464
1465                if (phase == 2) {
1466                        f2fs_ra_node_page(sbi, dni.ino);
1467                        continue;
1468                }
1469
1470                ofs_in_node = le16_to_cpu(entry->ofs_in_node);
1471
1472                if (phase == 3) {
1473                        inode = f2fs_iget(sb, dni.ino);
1474                        if (IS_ERR(inode) || is_bad_inode(inode) ||
1475                                        special_file(inode->i_mode))
1476                                continue;
1477
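                            /*
                             * Don't block GC on a contended inode: try the
                             * lock, skip the block on failure, and record
                             * the miss so f2fs_gc() can decide to retry.
                             */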
1478                        if (!down_write_trylock(
1479                                &F2FS_I(inode)->i_gc_rwsem[WRITE])) {
1480                                iput(inode);
1481                                sbi->skipped_gc_rwsem++;
1482                                continue;
1483                        }
1484
1485                        start_bidx = f2fs_start_bidx_of_node(nofs, inode) +
1486                                                                ofs_in_node;
1487
1488                        if (f2fs_post_read_required(inode)) {
1489                                int err = ra_data_block(inode, start_bidx);
1490
1491                                up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
1492                                if (err) {
1493                                        iput(inode);
1494                                        continue;
1495                                }
1496                                add_gc_inode(gc_list, inode);
1497                                continue;
1498                        }
1499
1500                        data_page = f2fs_get_read_data_page(inode,
1501                                                start_bidx, REQ_RAHEAD, true);
1502                        up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
1503                        if (IS_ERR(data_page)) {
1504                                iput(inode);
1505                                continue;
1506                        }
1507
1508                        f2fs_put_page(data_page, 0);
1509                        add_gc_inode(gc_list, inode);
1510                        continue;
1511                }
1512
1513                /* phase 4 */
1514                inode = find_gc_inode(gc_list, dni.ino);
1515                if (inode) {
1516                        struct f2fs_inode_info *fi = F2FS_I(inode);
1517                        bool locked = false;
1518                        int err;
1519
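                            /*
                             * For regular files, take both gc_rwsems to
                             * fence off new direct I/O, then drain what is
                             * already in flight before moving blocks.
                             */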
1520                        if (S_ISREG(inode->i_mode)) {
1521                                if (!down_write_trylock(&fi->i_gc_rwsem[READ])) {
1522                                        sbi->skipped_gc_rwsem++;
1523                                        continue;
1524                                }
1525                                if (!down_write_trylock(
1526                                                &fi->i_gc_rwsem[WRITE])) {
1527                                        sbi->skipped_gc_rwsem++;
1528                                        up_write(&fi->i_gc_rwsem[READ]);
1529                                        continue;
1530                                }
1531                                locked = true;
1532
1533                                /* wait for in-flight direct I/O to drain */
1534                                inode_dio_wait(inode);
1535                        }
1536
1537                        start_bidx = f2fs_start_bidx_of_node(nofs, inode)
1538                                                                + ofs_in_node;
1539                        if (f2fs_post_read_required(inode))
1540                                err = move_data_block(inode, start_bidx,
1541                                                        gc_type, segno, off);
1542                        else
1543                                err = move_data_page(inode, start_bidx, gc_type,
1544                                                                segno, off);
1545
1546                        if (!err && (gc_type == FG_GC ||
1547                                        f2fs_post_read_required(inode)))
1548                                submitted++;
1549
1550                        if (locked) {
1551                                up_write(&fi->i_gc_rwsem[WRITE]);
1552                                up_write(&fi->i_gc_rwsem[READ]);
1553                        }
1554
1555                        stat_inc_data_blk_count(sbi, 1, gc_type);
1556                }
1557        }
1558
1559        if (++phase < 5)
1560                goto next_step;
1561
1562        return submitted;
1563}
1564
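    /*
     * Pick the next victim segment under sentry_lock using the configured
     * victim selection policy.
     */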
1565static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
1566                        int gc_type)
1567{
1568        struct sit_info *sit_i = SIT_I(sbi);
1569        int ret;
1570
1571        down_write(&sit_i->sentry_lock);
1572        ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
1573                                              NO_CHECK_TYPE, LFS, 0);
1574        up_write(&sit_i->sentry_lock);
1575        return ret;
1576}
1577
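    /*
     * Collect one section: pin the summary page of every segment in the
     * section first, then migrate each segment according to its summary
     * type (node vs. data). Returns the number of freed segments, which
     * is only counted for foreground GC, or a negative errno if the
     * summary pages could not be read.
     */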
1578static int do_garbage_collect(struct f2fs_sb_info *sbi,
1579                                unsigned int start_segno,
1580                                struct gc_inode_list *gc_list, int gc_type,
1581                                bool force_migrate)
1582{
1583        struct page *sum_page;
1584        struct f2fs_summary_block *sum;
1585        struct blk_plug plug;
1586        unsigned int segno = start_segno;
1587        unsigned int end_segno = start_segno + sbi->segs_per_sec;
1588        int seg_freed = 0, migrated = 0;
1589        unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
1590                                                SUM_TYPE_DATA : SUM_TYPE_NODE;
1591        int submitted = 0;
1592
1593        if (__is_large_section(sbi))
1594                end_segno = rounddown(end_segno, sbi->segs_per_sec);
1595
1596        /*
1597         * zone-capacity can be less than zone-size on zoned devices,
1598         * resulting in fewer usable segments in the zone than expected;
1599         * calculate the last segno in the zone that can be garbage collected
1600         */
1601        if (f2fs_sb_has_blkzoned(sbi))
1602                end_segno -= sbi->segs_per_sec -
1603                                        f2fs_usable_segs_in_sec(sbi, segno);
1604
1605        sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type);
1606
1607        /* read ahead multiple SSA blocks that have contiguous addresses */
1608        if (__is_large_section(sbi))
1609                f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
1610                                        end_segno - segno, META_SSA, true);
1611
1612        /* reference all summary pages */
1613        while (segno < end_segno) {
1614                sum_page = f2fs_get_sum_page(sbi, segno++);
1615                if (IS_ERR(sum_page)) {
1616                        int err = PTR_ERR(sum_page);
1617
1618                        end_segno = segno - 1;
1619                        for (segno = start_segno; segno < end_segno; segno++) {
1620                                sum_page = find_get_page(META_MAPPING(sbi),
1621                                                GET_SUM_BLOCK(sbi, segno));
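                                    /*
                                     * put twice: once for the reference
                                     * taken by find_get_page() above, once
                                     * for the one left over from
                                     * f2fs_get_sum_page().
                                     */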
1622                                f2fs_put_page(sum_page, 0);
1623                                f2fs_put_page(sum_page, 0);
1624                        }
1625                        return err;
1626                }
1627                unlock_page(sum_page);
1628        }
1629
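    /* plug the queue so writes issued per segment can be merged */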
1630        blk_start_plug(&plug);
1631
1632        for (segno = start_segno; segno < end_segno; segno++) {
1633
1634                /* find segment summary of victim */
1635                sum_page = find_get_page(META_MAPPING(sbi),
1636                                        GET_SUM_BLOCK(sbi, segno));
1637                f2fs_put_page(sum_page, 0);
1638
1639                if (get_valid_blocks(sbi, segno, false) == 0)
1640                        goto freed;
1641                if (gc_type == BG_GC && __is_large_section(sbi) &&
1642                                migrated >= sbi->migration_granularity)
1643                        goto skip;
1644                if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))
1645                        goto skip;
1646
1647                sum = page_address(sum_page);
1648                if (type != GET_SUM_TYPE((&sum->footer))) {
1649                        f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT",
1650                                 segno, type, GET_SUM_TYPE((&sum->footer)));
1651                        set_sbi_flag(sbi, SBI_NEED_FSCK);
1652                        f2fs_stop_checkpoint(sbi, false);
1653                        goto skip;
1654                }
1655
1656                /*
1657                 * this is to avoid deadlock:
1658                 * - lock_page(sum_page)         - f2fs_replace_block
1659                 *  - check_valid_map()            - down_write(sentry_lock)
1660                 *   - down_read(sentry_lock)     - change_curseg()
1661                 *                                  - lock_page(sum_page)
1662                 */
1663                if (type == SUM_TYPE_NODE)
1664                        submitted += gc_node_segment(sbi, sum->entries, segno,
1665                                                                gc_type);
1666                else
1667                        submitted += gc_data_segment(sbi, sum->entries, gc_list,
1668                                                        segno, gc_type,
1669                                                        force_migrate);
1670
1671                stat_inc_seg_count(sbi, type, gc_type);
1672                sbi->gc_reclaimed_segs[sbi->gc_mode]++;
1673                migrated++;
1674
1675freed:
1676                if (gc_type == FG_GC &&
1677                                get_valid_blocks(sbi, segno, false) == 0)
1678                        seg_freed++;
1679
1680                if (__is_large_section(sbi) && segno + 1 < end_segno)
1681                        sbi->next_victim_seg[gc_type] = segno + 1;
1682skip:
1683                f2fs_put_page(sum_page, 0);
1684        }
1685
1686        if (submitted)
1687                f2fs_submit_merged_write(sbi,
1688                                (type == SUM_TYPE_NODE) ? NODE : DATA);
1689
1690        blk_finish_plug(&plug);
1691
1692        stat_inc_call_count(sbi->stat_info);
1693
1694        return seg_freed;
1695}
1696
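    /*
     * Main GC entry point: pick a victim section, collect it, and, unless
     * @sync is set, repeat until there are enough free sections. Called
     * with sbi->gc_lock held; the lock is released before returning.
     */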
1697int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
1698                        bool background, bool force, unsigned int segno)
1699{
1700        int gc_type = sync ? FG_GC : BG_GC;
1701        int sec_freed = 0, seg_freed = 0, total_freed = 0;
1702        int ret = 0;
1703        struct cp_control cpc;
1704        unsigned int init_segno = segno;
1705        struct gc_inode_list gc_list = {
1706                .ilist = LIST_HEAD_INIT(gc_list.ilist),
1707                .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
1708        };
1709        unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC];
1710        unsigned long long first_skipped;
1711        unsigned int skipped_round = 0, round = 0;
1712
1713        trace_f2fs_gc_begin(sbi->sb, sync, background,
1714                                get_pages(sbi, F2FS_DIRTY_NODES),
1715                                get_pages(sbi, F2FS_DIRTY_DENTS),
1716                                get_pages(sbi, F2FS_DIRTY_IMETA),
1717                                free_sections(sbi),
1718                                free_segments(sbi),
1719                                reserved_segments(sbi),
1720                                prefree_segments(sbi));
1721
1722        cpc.reason = __get_cp_reason(sbi);
1723        sbi->skipped_gc_rwsem = 0;
1724        first_skipped = last_skipped;
1725gc_more:
1726        if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
1727                ret = -EINVAL;
1728                goto stop;
1729        }
1730        if (unlikely(f2fs_cp_error(sbi))) {
1731                ret = -EIO;
1732                goto stop;
1733        }
1734
1735        if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) {
1736                /*
1737                 * For example, if there are many prefree_segments below the
1738                 * given threshold, we can free them by checkpoint. Then, we
1739                 * secure free segments which don't need FG_GC any more.
1740                 */
1741                if (prefree_segments(sbi) &&
1742                                !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
1743                        ret = f2fs_write_checkpoint(sbi, &cpc);
1744                        if (ret)
1745                                goto stop;
1746                }
1747                if (has_not_enough_free_secs(sbi, 0, 0))
1748                        gc_type = FG_GC;
1749        }
1750
1751        /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
1752        if (gc_type == BG_GC && !background) {
1753                ret = -EINVAL;
1754                goto stop;
1755        }
1756        ret = __get_victim(sbi, &segno, gc_type);
1757        if (ret)
1758                goto stop;
1759
1760        seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force);
1761        if (gc_type == FG_GC &&
1762                seg_freed == f2fs_usable_segs_in_sec(sbi, segno))
1763                sec_freed++;
1764        total_freed += seg_freed;
1765
1766        if (gc_type == FG_GC) {
1767                if (sbi->skipped_atomic_files[FG_GC] > last_skipped ||
1768                                                sbi->skipped_gc_rwsem)
1769                        skipped_round++;
1770                last_skipped = sbi->skipped_atomic_files[FG_GC];
1771                round++;
1772        }
1773
1774        if (gc_type == FG_GC)
1775                sbi->cur_victim_sec = NULL_SEGNO;
1776
1777        if (sync)
1778                goto stop;
1779
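            /*
             * Still not enough free sections: retry while skipped rounds
             * stay rare; once atomic-file skips dominate, drop all
             * in-memory pages and retry; otherwise fall back to writing a
             * checkpoint.
             */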
1780        if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
1781                if (skipped_round <= MAX_SKIP_GC_COUNT ||
1782                                        skipped_round * 2 < round) {
1783                        segno = NULL_SEGNO;
1784                        goto gc_more;
1785                }
1786
1787                if (first_skipped < last_skipped &&
1788                                (last_skipped - first_skipped) >
1789                                                sbi->skipped_gc_rwsem) {
1790                        f2fs_drop_inmem_pages_all(sbi, true);
1791                        segno = NULL_SEGNO;
1792                        goto gc_more;
1793                }
1794                if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))
1795                        ret = f2fs_write_checkpoint(sbi, &cpc);
1796        }
1797stop:
1798        SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
1799        SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
1800
1801        trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed,
1802                                get_pages(sbi, F2FS_DIRTY_NODES),
1803                                get_pages(sbi, F2FS_DIRTY_DENTS),
1804                                get_pages(sbi, F2FS_DIRTY_IMETA),
1805                                free_sections(sbi),
1806                                free_segments(sbi),
1807                                reserved_segments(sbi),
1808                                prefree_segments(sbi));
1809
1810        up_write(&sbi->gc_lock);
1811
1812        put_gc_inode(&gc_list);
1813
1814        if (sync && !ret)
1815                ret = sec_freed ? 0 : -EAGAIN;
1816        return ret;
1817}
1818
1819int __init f2fs_create_garbage_collection_cache(void)
1820{
1821        victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry",
1822                                        sizeof(struct victim_entry));
1823        if (!victim_entry_slab)
1824                return -ENOMEM;
1825        return 0;
1826}
1827
1828void f2fs_destroy_garbage_collection_cache(void)
1829{
1830        kmem_cache_destroy(victim_entry_slab);
1831}
1832
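    /*
     * ATGC victim selection is enabled only when the ATGC mount option is
     * set and the filesystem has aged past the threshold; the rb-tree and
     * tunables are initialized either way.
     */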
1833static void init_atgc_management(struct f2fs_sb_info *sbi)
1834{
1835        struct atgc_management *am = &sbi->am;
1836
1837        if (test_opt(sbi, ATGC) &&
1838                SIT_I(sbi)->elapsed_time >= DEF_GC_THREAD_AGE_THRESHOLD)
1839                am->atgc_enabled = true;
1840
1841        am->root = RB_ROOT_CACHED;
1842        INIT_LIST_HEAD(&am->victim_list);
1843        am->victim_count = 0;
1844
1845        am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO;
1846        am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT;
1847        am->age_weight = DEF_GC_THREAD_AGE_WEIGHT;
1848        am->age_threshold = DEF_GC_THREAD_AGE_THRESHOLD;
1849}
1850
1851void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
1852{
1853        DIRTY_I(sbi)->v_ops = &default_v_ops;
1854
1855        sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES;
1856
1857        /* serve the warm/cold data area from the slower device */
1858        if (f2fs_is_multi_device(sbi) && !__is_large_section(sbi))
1859                SIT_I(sbi)->last_victim[ALLOC_NEXT] =
1860                                GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
1861
1862        init_atgc_management(sbi);
1863}
1864
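    /*
     * Vacate the last @secs sections of the main area for shrinking:
     * steer victim-selection hints and the current segments out of the
     * range, then run FG_GC on every section in it. With @gc_only set the
     * range is merely cleaned; otherwise it must end up completely free.
     */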
1865static int free_segment_range(struct f2fs_sb_info *sbi,
1866                                unsigned int secs, bool gc_only)
1867{
1868        unsigned int segno, next_inuse, start, end;
1869        struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
1870        int gc_mode, gc_type;
1871        int err = 0;
1872        int type;
1873
1874        /* Force block allocation for GC */
1875        MAIN_SECS(sbi) -= secs;
1876        start = MAIN_SECS(sbi) * sbi->segs_per_sec;
1877        end = MAIN_SEGS(sbi) - 1;
1878
1879        mutex_lock(&DIRTY_I(sbi)->seglist_lock);
1880        for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++)
1881                if (SIT_I(sbi)->last_victim[gc_mode] >= start)
1882                        SIT_I(sbi)->last_victim[gc_mode] = 0;
1883
1884        for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++)
1885                if (sbi->next_victim_seg[gc_type] >= start)
1886                        sbi->next_victim_seg[gc_type] = NULL_SEGNO;
1887        mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
1888
1889        /* Move out cursegs from the target range */
1890        for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++)
1891                f2fs_allocate_segment_for_resize(sbi, type, start, end);
1892
1893        /* do GC to move out valid blocks in the range */
1894        for (segno = start; segno <= end; segno += sbi->segs_per_sec) {
1895                struct gc_inode_list gc_list = {
1896                        .ilist = LIST_HEAD_INIT(gc_list.ilist),
1897                        .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
1898                };
1899
1900                do_garbage_collect(sbi, segno, &gc_list, FG_GC, true);
1901                put_gc_inode(&gc_list);
1902
1903                if (!gc_only && get_valid_blocks(sbi, segno, true)) {
1904                        err = -EAGAIN;
1905                        goto out;
1906                }
1907                if (fatal_signal_pending(current)) {
1908                        err = -ERESTARTSYS;
1909                        goto out;
1910                }
1911        }
1912        if (gc_only)
1913                goto out;
1914
1915        err = f2fs_write_checkpoint(sbi, &cpc);
1916        if (err)
1917                goto out;
1918
1919        next_inuse = find_next_inuse(FREE_I(sbi), end + 1, start);
1920        if (next_inuse <= end) {
1921                f2fs_err(sbi, "segno %u should be free but still inuse!",
1922                         next_inuse);
1923                f2fs_bug_on(sbi, 1);
1924        }
1925out:
1926        MAIN_SECS(sbi) += secs;
1927        return err;
1928}
1929
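    /*
     * Apply a resize of @secs sections (negative when shrinking) to the
     * raw superblock counters; on multi-device setups only the last
     * device changes size.
     */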
1930static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
1931{
1932        struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi);
1933        int section_count;
1934        int segment_count;
1935        int segment_count_main;
1936        long long block_count;
1937        int segs = secs * sbi->segs_per_sec;
1938
1939        down_write(&sbi->sb_lock);
1940
1941        section_count = le32_to_cpu(raw_sb->section_count);
1942        segment_count = le32_to_cpu(raw_sb->segment_count);
1943        segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
1944        block_count = le64_to_cpu(raw_sb->block_count);
1945
1946        raw_sb->section_count = cpu_to_le32(section_count + secs);
1947        raw_sb->segment_count = cpu_to_le32(segment_count + segs);
1948        raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
1949        raw_sb->block_count = cpu_to_le64(block_count +
1950                                        (long long)segs * sbi->blocks_per_seg);
1951        if (f2fs_is_multi_device(sbi)) {
1952                int last_dev = sbi->s_ndevs - 1;
1953                int dev_segs =
1954                        le32_to_cpu(raw_sb->devs[last_dev].total_segments);
1955
1956                raw_sb->devs[last_dev].total_segments =
1957                                                cpu_to_le32(dev_segs + segs);
1958        }
1959
1960        up_write(&sbi->sb_lock);
1961}
1962
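    /*
     * Mirror the superblock resize into the in-memory counters: segment
     * and section totals, free counters, the checkpoint's
     * user_block_count and, on multi-device setups, the last device's
     * geometry.
     */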
1963static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
1964{
1965        int segs = secs * sbi->segs_per_sec;
1966        long long blks = (long long)segs * sbi->blocks_per_seg;
1967        long long user_block_count =
1968                                le64_to_cpu(F2FS_CKPT(sbi)->user_block_count);
1969
1970        SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs;
1971        MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs;
1972        MAIN_SECS(sbi) += secs;
1973        FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs;
1974        FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs;
1975        F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks);
1976
1977        if (f2fs_is_multi_device(sbi)) {
1978                int last_dev = sbi->s_ndevs - 1;
1979
1980                FDEV(last_dev).total_segments =
1981                                (int)FDEV(last_dev).total_segments + segs;
1982                FDEV(last_dev).end_blk =
1983                                (long long)FDEV(last_dev).end_blk + blks;
1984#ifdef CONFIG_BLK_DEV_ZONED
1985                FDEV(last_dev).nr_blkz = (int)FDEV(last_dev).nr_blkz +
1986                                        (int)(blks >> sbi->log_blocks_per_blkz);
1987#endif
1988        }
1989}
1990
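    /*
     * Shrink the filesystem to @block_count blocks in two passes: a
     * best-effort GC pass under gc_lock, then, with the filesystem
     * frozen, a pass that must empty the tail sections before the
     * superblock and checkpoint are rewritten. Any failure rolls the
     * metadata back and flags the fs for fsck.
     */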
1991int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
1992{
1993        __u64 old_block_count, shrunk_blocks;
1994        struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
1995        unsigned int secs;
1996        int err = 0;
1997        __u32 rem;
1998
1999        old_block_count = le64_to_cpu(F2FS_RAW_SUPER(sbi)->block_count);
2000        if (block_count > old_block_count)
2001                return -EINVAL;
2002
2003        if (f2fs_is_multi_device(sbi)) {
2004                int last_dev = sbi->s_ndevs - 1;
2005                __u64 last_segs = FDEV(last_dev).total_segments;
2006
2007                if (block_count + last_segs * sbi->blocks_per_seg <=
2008                                                                old_block_count)
2009                        return -EINVAL;
2010        }
2011
2012        /* the new fs size must be aligned to the section size */
2013        div_u64_rem(block_count, BLKS_PER_SEC(sbi), &rem);
2014        if (rem)
2015                return -EINVAL;
2016
2017        if (block_count == old_block_count)
2018                return 0;
2019
2020        if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
2021                f2fs_err(sbi, "Should run fsck to repair first.");
2022                return -EFSCORRUPTED;
2023        }
2024
2025        if (test_opt(sbi, DISABLE_CHECKPOINT)) {
2026                f2fs_err(sbi, "Checkpoint should be enabled.");
2027                return -EINVAL;
2028        }
2029
2030        shrunk_blocks = old_block_count - block_count;
2031        secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi));
2032
2033        /* stop other GC */
2034        if (!down_write_trylock(&sbi->gc_lock))
2035                return -EAGAIN;
2036
2037        /* stop CP to protect MAIN_SEC in free_segment_range */
2038        f2fs_lock_op(sbi);
2039
2040        spin_lock(&sbi->stat_lock);
2041        if (shrunk_blocks + valid_user_blocks(sbi) +
2042                sbi->current_reserved_blocks + sbi->unusable_block_count +
2043                F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count)
2044                err = -ENOSPC;
2045        spin_unlock(&sbi->stat_lock);
2046
2047        if (err)
2048                goto out_unlock;
2049
2050        err = free_segment_range(sbi, secs, true);
2051
2052out_unlock:
2053        f2fs_unlock_op(sbi);
2054        up_write(&sbi->gc_lock);
2055        if (err)
2056                return err;
2057
2058        set_sbi_flag(sbi, SBI_IS_RESIZEFS);
2059
2060        freeze_super(sbi->sb);
2061        down_write(&sbi->gc_lock);
2062        down_write(&sbi->cp_global_sem);
2063
2064        spin_lock(&sbi->stat_lock);
2065        if (shrunk_blocks + valid_user_blocks(sbi) +
2066                sbi->current_reserved_blocks + sbi->unusable_block_count +
2067                F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count)
2068                err = -ENOSPC;
2069        else
2070                sbi->user_block_count -= shrunk_blocks;
2071        spin_unlock(&sbi->stat_lock);
2072        if (err)
2073                goto out_err;
2074
2075        err = free_segment_range(sbi, secs, false);
2076        if (err)
2077                goto recover_out;
2078
2079        update_sb_metadata(sbi, -secs);
2080
2081        err = f2fs_commit_super(sbi, false);
2082        if (err) {
2083                update_sb_metadata(sbi, secs);
2084                goto recover_out;
2085        }
2086
2087        update_fs_metadata(sbi, -secs);
2088        clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
2089        set_sbi_flag(sbi, SBI_IS_DIRTY);
2090
2091        err = f2fs_write_checkpoint(sbi, &cpc);
2092        if (err) {
2093                update_fs_metadata(sbi, secs);
2094                update_sb_metadata(sbi, secs);
2095                f2fs_commit_super(sbi, false);
2096        }
2097recover_out:
2098        if (err) {
2099                set_sbi_flag(sbi, SBI_NEED_FSCK);
2100                f2fs_err(sbi, "resize_fs failed, should run fsck to repair!");
2101
2102                spin_lock(&sbi->stat_lock);
2103                sbi->user_block_count += shrunk_blocks;
2104                spin_unlock(&sbi->stat_lock);
2105        }
2106out_err:
2107        up_write(&sbi->cp_global_sem);
2108        up_write(&sbi->gc_lock);
2109        thaw_super(sbi->sb);
2110        clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
2111        return err;
2112}
2113