// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_iwalk.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_health.h"
#include "xfs_trans.h"
#include "xfs_pwork.h"

/*
 * Walking Inodes in the Filesystem
 * ================================
 *
 * This iterator function walks a subset of filesystem inodes in increasing
 * order from @startino until there are no more inodes.  For each allocated
 * inode it finds, it calls a walk function with the relevant inode number and
 * a pointer to caller-provided data.  The walk function can return the usual
 * negative error code, 0 to continue the iteration, or -ECANCELED to stop
 * the iteration early.  Whatever the walk function returns is passed back to
 * the caller.
 *
 * Internally, we allow the walk function to do anything, which means that we
 * cannot maintain the inobt cursor or our lock on the AGI buffer.  We
 * therefore cache the inobt records in kernel memory and only call the walk
 * function when our memory buffer is full.  @nr_recs is the number of records
 * that we've cached, and @sz_recs is the size of our cache.
 *
 * It is the responsibility of the walk function to ensure it accesses
 * allocated inodes, as the inobt records may be stale by the time they are
 * acted upon.
 */
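
/*
 * As a usage sketch (illustrative only; the function and variable names
 * here are hypothetical), a walk function that counts allocated inodes
 * might look like this, following the xfs_iwalk_fn calling convention
 * used by xfs_iwalk_ag_recs() below:
 *
 *	static int
 *	xfs_example_count_inode(
 *		struct xfs_mount	*mp,
 *		struct xfs_trans	*tp,
 *		xfs_ino_t		ino,
 *		void			*data)
 *	{
 *		unsigned long long	*count = data;
 *
 *		(*count)++;
 *		return 0;
 *	}
 *
 * Returning -ECANCELED (or any negative errno) from the walk function
 * stops the walk, and xfs_iwalk() passes that value back to its caller:
 *
 *	unsigned long long	count = 0;
 *	int			error;
 *
 *	error = xfs_iwalk(mp, NULL, 0, 0, xfs_example_count_inode, 0,
 *			&count);
 */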

struct xfs_iwalk_ag {
	/* parallel work control data; will be null if single threaded */
	struct xfs_pwork		pwork;

	struct xfs_mount		*mp;
	struct xfs_trans		*tp;

	/* Where do we start the traversal? */
	xfs_ino_t			startino;

	/* Array of inobt records we cache. */
	struct xfs_inobt_rec_incore	*recs;

	/* Number of entries allocated for the @recs array. */
	unsigned int			sz_recs;

	/* Number of entries in the @recs array that are in use. */
	unsigned int			nr_recs;

	/* Inode walk function and data pointer. */
	xfs_iwalk_fn			iwalk_fn;
	xfs_inobt_walk_fn		inobt_walk_fn;
	void				*data;

	/*
	 * Make it look like the inodes up to startino are free so that
	 * bulkstat can start its inode iteration at the correct place without
	 * needing to special case everywhere.
	 */
	unsigned int			trim_start:1;

	/* Skip empty inobt records? */
	unsigned int			skip_empty:1;
};

/*
 * Loop over all clusters in a chunk for a given incore inode allocation btree
 * record.  Do a readahead if there are any allocated inodes in that cluster.
 */
STATIC void
xfs_iwalk_ichunk_ra(
	struct xfs_mount		*mp,
	xfs_agnumber_t			agno,
	struct xfs_inobt_rec_incore	*irec)
{
	struct xfs_ino_geometry		*igeo = M_IGEO(mp);
	xfs_agblock_t			agbno;
	struct blk_plug			plug;
	int				i;	/* inode chunk index */

	agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);

	blk_start_plug(&plug);
	for (i = 0; i < XFS_INODES_PER_CHUNK; i += igeo->inodes_per_cluster) {
		xfs_inofree_t	imask;

		imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
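		/*
		 * Issue readahead only if some inode in this cluster is
		 * allocated, i.e. at least one bit covered by @imask is
		 * clear in the free mask.  For example, with hypothetical
		 * geometry of 32 inodes per cluster, the first pass checks
		 * bits 0-31 of the 64-bit chunk mask and the second pass
		 * checks bits 32-63.
		 */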
		if (imask & ~irec->ir_free) {
			xfs_btree_reada_bufs(mp, agno, agbno,
					igeo->blocks_per_cluster,
					&xfs_inode_buf_ops);
		}
		agbno += igeo->blocks_per_cluster;
	}
	blk_finish_plug(&plug);
}

/*
 * Set the bits in @irec's free mask that correspond to the inodes before
 * @agino so that we skip them.  This is how we restart an inode walk that was
 * interrupted in the middle of an inode record.
 */
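/*
 * Worked example with hypothetical numbers: if @irec starts at agino 128
 * and the walk restarts at agino 133, then idx below is 5, so bits 0-4
 * are ORed into ir_free, and ir_freecount is bumped once for each of
 * those five inodes that had been allocated.
 */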
STATIC void
xfs_iwalk_adjust_start(
	xfs_agino_t			agino,	/* starting inode of chunk */
	struct xfs_inobt_rec_incore	*irec)	/* btree record */
{
	int				idx;	/* index into inode chunk */
	int				i;

	idx = agino - irec->ir_startino;

	/*
	 * The record covers inodes both before and after our starting point.
	 * Mark all the inodes before our start point free, bumping the free
	 * count for each allocated inode we mark, so that the walk does not
	 * visit them.
	 */
	for (i = 0; i < idx; i++) {
		if (XFS_INOBT_MASK(i) & ~irec->ir_free)
			irec->ir_freecount++;
	}

	irec->ir_free |= xfs_inobt_maskn(0, idx);
}

/* Allocate memory for a walk. */
STATIC int
xfs_iwalk_alloc(
	struct xfs_iwalk_ag	*iwag)
{
	size_t			size;

	ASSERT(iwag->recs == NULL);
	iwag->nr_recs = 0;

	/* Allocate a prefetch buffer for inobt records. */
	size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
	iwag->recs = kmem_alloc(size, KM_MAYFAIL);
	if (iwag->recs == NULL)
		return -ENOMEM;

	return 0;
}

/* Free memory we allocated for a walk. */
STATIC void
xfs_iwalk_free(
	struct xfs_iwalk_ag	*iwag)
{
	kmem_free(iwag->recs);
	iwag->recs = NULL;
}

/* For each inuse inode in each cached inobt record, call our function. */
STATIC int
xfs_iwalk_ag_recs(
	struct xfs_iwalk_ag		*iwag)
{
	struct xfs_mount		*mp = iwag->mp;
	struct xfs_trans		*tp = iwag->tp;
	xfs_ino_t			ino;
	unsigned int			i, j;
	xfs_agnumber_t			agno;
	int				error;

	agno = XFS_INO_TO_AGNO(mp, iwag->startino);
	for (i = 0; i < iwag->nr_recs; i++) {
		struct xfs_inobt_rec_incore	*irec = &iwag->recs[i];

		trace_xfs_iwalk_ag_rec(mp, agno, irec);

		if (xfs_pwork_want_abort(&iwag->pwork))
			return 0;

		if (iwag->inobt_walk_fn) {
			error = iwag->inobt_walk_fn(mp, tp, agno, irec,
					iwag->data);
			if (error)
				return error;
		}

		if (!iwag->iwalk_fn)
			continue;

		for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
			if (xfs_pwork_want_abort(&iwag->pwork))
				return 0;

			/* Skip if this inode is free */
			if (XFS_INOBT_MASK(j) & irec->ir_free)
				continue;

			/* Otherwise call our function. */
			ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j);
			error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
			if (error)
				return error;
		}
	}

	return 0;
}

/* Delete cursor and let go of AGI. */
static inline void
xfs_iwalk_del_inobt(
	struct xfs_trans	*tp,
	struct xfs_btree_cur	**curpp,
	struct xfs_buf		**agi_bpp,
	int			error)
{
	if (*curpp) {
		xfs_btree_del_cursor(*curpp, error);
		*curpp = NULL;
	}
	if (*agi_bpp) {
		xfs_trans_brelse(tp, *agi_bpp);
		*agi_bpp = NULL;
	}
}

/*
 * Set ourselves up for walking inobt records starting from a given point in
 * the filesystem.
 *
 * If caller passed in a nonzero start inode number, load the record from the
 * inobt and make the record look like all the inodes before agino are free so
 * that we skip them, and then move the cursor to the next inobt record.  This
 * is how we support starting an iwalk in the middle of an inode chunk.
 *
 * If the caller passed in a start number of zero, move the cursor to the first
 * inobt record.
 *
 * The caller is responsible for cleaning up the cursor and buffer pointer
 * regardless of the error status.
 */
STATIC int
xfs_iwalk_ag_start(
	struct xfs_iwalk_ag	*iwag,
	xfs_agnumber_t		agno,
	xfs_agino_t		agino,
	struct xfs_btree_cur	**curpp,
	struct xfs_buf		**agi_bpp,
	int			*has_more)
{
	struct xfs_mount	*mp = iwag->mp;
	struct xfs_trans	*tp = iwag->tp;
	struct xfs_inobt_rec_incore *irec;
	int			error;

	/* Set up a fresh cursor and empty the inobt cache. */
	iwag->nr_recs = 0;
	error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
	if (error)
		return error;

	/* Starting at the beginning of the AG?  That's easy! */
	if (agino == 0)
		return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more);

	/*
	 * Otherwise, we have to grab the inobt record where we left off, stuff
	 * the record into our cache, and then see if there are more records.
	 * We require a lookup cache of at least two elements so that the
	 * caller doesn't have to deal with tearing down the cursor to walk the
	 * records.
	 */
	error = xfs_inobt_lookup(*curpp, agino, XFS_LOOKUP_LE, has_more);
	if (error)
		return error;

	/*
	 * If the LE lookup at @agino yields no records, jump ahead to the
	 * inobt cursor increment to see if there are more records to process.
	 */
	if (!*has_more)
		goto out_advance;

	/* Get the record, should always work */
	irec = &iwag->recs[iwag->nr_recs];
	error = xfs_inobt_get_rec(*curpp, irec, has_more);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(mp, *has_more != 1))
		return -EFSCORRUPTED;

	/*
	 * If the LE lookup yielded an inobt record before the cursor position,
	 * skip it and see if there's another one after it.
	 */
	if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
		goto out_advance;

	/*
	 * If agino fell in the middle of the inode record, make it look like
	 * the inodes up to agino are free so that we don't return them again.
	 */
	if (iwag->trim_start)
		xfs_iwalk_adjust_start(agino, irec);

	/*
	 * The prefetch calculation is supposed to give us a large enough inobt
	 * record cache that this function can stage a partial first record
	 * and the loop body can cache a record without having to check for
	 * cache space until after it reads an inobt record.
	 */
	iwag->nr_recs++;
	ASSERT(iwag->nr_recs < iwag->sz_recs);

out_advance:
	return xfs_btree_increment(*curpp, 0, has_more);
}

/*
 * The inobt record cache is full, so preserve the inobt cursor state and
 * run callbacks on the cached inobt records.  When we're done, restore the
 * cursor state to wherever the cursor would have been had the cache not been
 * full (and therefore we could've just incremented the cursor) if *@has_more
 * is true.  On exit, *@has_more will indicate whether or not the caller should
 * try for more inode records.
 */
STATIC int
xfs_iwalk_run_callbacks(
	struct xfs_iwalk_ag		*iwag,
	xfs_agnumber_t			agno,
	struct xfs_btree_cur		**curpp,
	struct xfs_buf			**agi_bpp,
	int				*has_more)
{
	struct xfs_mount		*mp = iwag->mp;
	struct xfs_trans		*tp = iwag->tp;
	struct xfs_inobt_rec_incore	*irec;
	xfs_agino_t			restart;
	int				error;

	ASSERT(iwag->nr_recs > 0);

	/* Delete cursor but remember the last record we cached... */
	xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
	irec = &iwag->recs[iwag->nr_recs - 1];
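	/*
	 * restart is the number of the last inode covered by the last
	 * cached record.  Every later record has a strictly greater
	 * ir_startino, so the _GE lookup below positions the rebuilt
	 * cursor at the first record we have not yet cached.
	 */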
	restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;

	error = xfs_iwalk_ag_recs(iwag);
	if (error)
		return error;

	/* ...empty the cache... */
	iwag->nr_recs = 0;

	if (!has_more)
		return 0;

	/* ...and recreate the cursor just past where we left off. */
	error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
	if (error)
		return error;

	return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
}

/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
STATIC int
xfs_iwalk_ag(
	struct xfs_iwalk_ag		*iwag)
{
	struct xfs_mount		*mp = iwag->mp;
	struct xfs_trans		*tp = iwag->tp;
	struct xfs_buf			*agi_bp = NULL;
	struct xfs_btree_cur		*cur = NULL;
	xfs_agnumber_t			agno;
	xfs_agino_t			agino;
	int				has_more;
	int				error = 0;

	/* Set up our cursor at the right place in the inode btree. */
	agno = XFS_INO_TO_AGNO(mp, iwag->startino);
	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
	error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);

	while (!error && has_more) {
		struct xfs_inobt_rec_incore	*irec;

		cond_resched();
		if (xfs_pwork_want_abort(&iwag->pwork))
			goto out;

		/* Fetch the inobt record. */
		irec = &iwag->recs[iwag->nr_recs];
		error = xfs_inobt_get_rec(cur, irec, &has_more);
		if (error || !has_more)
			break;

		/* No allocated inodes in this chunk; skip it. */
		if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
			error = xfs_btree_increment(cur, 0, &has_more);
			if (error)
				break;
			continue;
		}

		/*
		 * Start readahead for this inode chunk in anticipation of
		 * walking the inodes.
		 */
		if (iwag->iwalk_fn)
			xfs_iwalk_ichunk_ra(mp, agno, irec);

		/*
		 * If there's space in the buffer for more records, increment
		 * the btree cursor and grab more.
		 */
		if (++iwag->nr_recs < iwag->sz_recs) {
			error = xfs_btree_increment(cur, 0, &has_more);
			if (error || !has_more)
				break;
			continue;
		}

		/*
		 * Otherwise, we need to save cursor state and run the callback
		 * function on the cached records.  The run_callbacks function
		 * is supposed to return a cursor pointing to the record where
		 * we would be if we had been able to increment like above.
		 */
		ASSERT(has_more);
		error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp,
				&has_more);
	}

	if (iwag->nr_recs == 0 || error)
		goto out;

	/* Walk the unprocessed records in the cache. */
	error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more);

out:
	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
	return error;
}

/*
 * We experimentally determined that the reduction in ioctl call overhead
 * diminishes when userspace asks for more than 2048 inodes, so we'll cap
 * prefetch at this point.
 */
#define IWALK_MAX_INODE_PREFETCH	(2048U)

/*
 * Given the number of inodes to prefetch, set the number of inobt records that
 * we cache in memory, which controls the number of inodes we try to read
 * ahead.  Set the maximum if @inodes == 0.
 */
static inline unsigned int
xfs_iwalk_prefetch(
	unsigned int		inodes)
{
	unsigned int		inobt_records;

	/*
	 * If the caller didn't tell us the number of inodes they wanted,
	 * assume the maximum prefetch possible for best performance.
	 * Otherwise, cap prefetch at that maximum so that we don't start an
	 * absurd amount of prefetch.
	 */
	if (inodes == 0)
		inodes = IWALK_MAX_INODE_PREFETCH;
	inodes = min(inodes, IWALK_MAX_INODE_PREFETCH);

	/* Round the inode count up to a full chunk. */
	inodes = round_up(inodes, XFS_INODES_PER_CHUNK);

	/*
	 * In order to convert the number of inodes to prefetch into an
	 * estimate of the number of inobt records to cache, we require a
	 * conversion factor that reflects our expectations of the average
	 * loading factor of an inode chunk.  Based on data gathered, most
	 * (but not all) filesystems manage to keep the inode chunks totally
	 * full, so we'll underestimate slightly so that our readahead will
	 * still deliver the performance we want on aging filesystems:
	 *
	 * inobt = inodes / (INODES_PER_CHUNK * (4 / 5));
	 *
	 * The funny math is to avoid integer division.
	 */
	inobt_records = (inodes * 5) / (4 * XFS_INODES_PER_CHUNK);
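
	/*
	 * Worked example: for the default 2048 inodes and 64-inode chunks,
	 * this is 2048 * 5 / (4 * 64) = 40 records, i.e. we assume chunks
	 * average about 80% full.
	 */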

	/*
	 * Allocate enough space to prefetch at least two inobt records so that
	 * we can cache both the record where the iwalk started and the next
	 * record.  This simplifies the AG inode walk loop setup code.
	 */
	return max(inobt_records, 2U);
}

/*
 * Walk all inodes in the filesystem starting from @startino.  The @iwalk_fn
 * will be called for each allocated inode, being passed the inode's number and
 * @data.  @inode_records controls how many inobt records' worth of inodes we
 * try to readahead.
 */
int
xfs_iwalk(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		startino,
	unsigned int		flags,
	xfs_iwalk_fn		iwalk_fn,
	unsigned int		inode_records,
	void			*data)
{
	struct xfs_iwalk_ag	iwag = {
		.mp		= mp,
		.tp		= tp,
		.iwalk_fn	= iwalk_fn,
		.data		= data,
		.startino	= startino,
		.sz_recs	= xfs_iwalk_prefetch(inode_records),
		.trim_start	= 1,
		.skip_empty	= 1,
		.pwork		= XFS_PWORK_SINGLE_THREADED,
	};
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
	int			error;

	ASSERT(agno < mp->m_sb.sb_agcount);
	ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));

	error = xfs_iwalk_alloc(&iwag);
	if (error)
		return error;

	for (; agno < mp->m_sb.sb_agcount; agno++) {
		error = xfs_iwalk_ag(&iwag);
		if (error)
			break;
		iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
		if (flags & XFS_IWALK_SAME_AG)
			break;
	}

	xfs_iwalk_free(&iwag);
	return error;
}

/* Run per-thread iwalk work. */
static int
xfs_iwalk_ag_work(
	struct xfs_mount	*mp,
	struct xfs_pwork	*pwork)
{
	struct xfs_iwalk_ag	*iwag;
	int			error = 0;

	iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
	if (xfs_pwork_want_abort(pwork))
		goto out;

	error = xfs_iwalk_alloc(iwag);
	if (error)
		goto out;

	error = xfs_iwalk_ag(iwag);
	xfs_iwalk_free(iwag);
out:
	kmem_free(iwag);
	return error;
}

/*
 * Walk all the inodes in the filesystem using multiple threads to process each
 * AG.
 */
int
xfs_iwalk_threaded(
	struct xfs_mount	*mp,
	xfs_ino_t		startino,
	unsigned int		flags,
	xfs_iwalk_fn		iwalk_fn,
	unsigned int		inode_records,
	bool			polled,
	void			*data)
{
	struct xfs_pwork_ctl	pctl;
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
	unsigned int		nr_threads;
	int			error;

	ASSERT(agno < mp->m_sb.sb_agcount);
	ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));

	nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
	error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
			nr_threads);
	if (error)
		return error;

	for (; agno < mp->m_sb.sb_agcount; agno++) {
		struct xfs_iwalk_ag	*iwag;

		if (xfs_pwork_ctl_want_abort(&pctl))
			break;

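		/*
		 * A zeroing allocation without KM_MAYFAIL retries until it
		 * succeeds, so the result needs no null check.  Ownership
		 * of @iwag passes to the worker; xfs_iwalk_ag_work() frees
		 * it when it's done.
		 */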
		iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
		iwag->mp = mp;
		iwag->iwalk_fn = iwalk_fn;
		iwag->data = data;
		iwag->startino = startino;
		iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
		xfs_pwork_queue(&pctl, &iwag->pwork);
		startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
		if (flags & XFS_IWALK_SAME_AG)
			break;
	}

	if (polled)
		xfs_pwork_poll(&pctl);
	return xfs_pwork_destroy(&pctl);
}

/*
 * Allow callers to cache up to a page's worth of inobt records.  This reflects
 * the existing inumbers prefetching behavior.  Since the inobt walk does not
 * itself do anything with the inobt records, we can set a fairly high limit
 * here.
 */
#define MAX_INOBT_WALK_PREFETCH \
	(PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore))
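
/*
 * With 4096-byte pages and the usual 16-byte incore record layout, this
 * works out to 256 records; treat those numbers as illustrative, since
 * the page size is architecture-dependent.
 */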

/*
 * Given the number of records that the user wanted, set the number of inobt
 * records that we buffer in memory.  Set the maximum if @inobt_records == 0.
 */
static inline unsigned int
xfs_inobt_walk_prefetch(
	unsigned int		inobt_records)
{
	/*
	 * If the caller didn't tell us the number of inobt records they
	 * wanted, assume the maximum prefetch possible for best performance.
	 */
	if (inobt_records == 0)
		inobt_records = MAX_INOBT_WALK_PREFETCH;

	/*
	 * Allocate enough space to prefetch at least two inobt records so that
	 * we can cache both the record where the iwalk started and the next
	 * record.  This simplifies the AG inode walk loop setup code.
	 */
	inobt_records = max(inobt_records, 2U);

	/*
	 * Cap prefetch at that maximum so that we don't use an absurd amount
	 * of memory.
	 */
	return min_t(unsigned int, inobt_records, MAX_INOBT_WALK_PREFETCH);
}
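
/*
 * Usage sketch (illustrative only; the names are hypothetical): an inobt
 * walk function that tallies free inodes, following the xfs_inobt_walk_fn
 * calling convention used by xfs_iwalk_ag_recs():
 *
 *	static int
 *	xfs_example_count_free(
 *		struct xfs_mount		*mp,
 *		struct xfs_trans		*tp,
 *		xfs_agnumber_t			agno,
 *		const struct xfs_inobt_rec_incore *irec,
 *		void				*data)
 *	{
 *		unsigned long long	*nfree = data;
 *
 *		*nfree += irec->ir_freecount;
 *		return 0;
 *	}
 *
 *	unsigned long long	nfree = 0;
 *	int			error;
 *
 *	error = xfs_inobt_walk(mp, NULL, 0, 0, xfs_example_count_free, 0,
 *			&nfree);
 */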

/*
 * Walk all inode btree records in the filesystem starting from @startino.  The
 * @inobt_walk_fn will be called for each btree record, being passed the incore
 * record and @data.  @inobt_records controls how many inobt records we try to
 * cache ahead of time.
 */
int
xfs_inobt_walk(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		startino,
	unsigned int		flags,
	xfs_inobt_walk_fn	inobt_walk_fn,
	unsigned int		inobt_records,
	void			*data)
{
	struct xfs_iwalk_ag	iwag = {
		.mp		= mp,
		.tp		= tp,
		.inobt_walk_fn	= inobt_walk_fn,
		.data		= data,
		.startino	= startino,
		.sz_recs	= xfs_inobt_walk_prefetch(inobt_records),
		.pwork		= XFS_PWORK_SINGLE_THREADED,
	};
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
	int			error;

	ASSERT(agno < mp->m_sb.sb_agcount);
	ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL));

	error = xfs_iwalk_alloc(&iwag);
	if (error)
		return error;

	for (; agno < mp->m_sb.sb_agcount; agno++) {
		error = xfs_iwalk_ag(&iwag);
		if (error)
			break;
		iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
		if (flags & XFS_INOBT_WALK_SAME_AG)
			break;
	}

	xfs_iwalk_free(&iwag);
	return error;
}