linux/fs/xfs/libxfs/xfs_btree_staging.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Copyright (C) 2020 Oracle.  All Rights Reserved.
   4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_log_format.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_bit.h"
  13#include "xfs_mount.h"
  14#include "xfs_inode.h"
  15#include "xfs_trans.h"
  16#include "xfs_btree.h"
  17#include "xfs_trace.h"
  18#include "xfs_btree_staging.h"
  19
  20/*
  21 * Staging Cursors and Fake Roots for Btrees
  22 * =========================================
  23 *
  24 * A staging btree cursor is a special type of btree cursor that callers must
  25 * use to construct a new btree index using the btree bulk loader code.  The
  26 * bulk loading code uses the staging btree cursor to abstract the details of
  27 * initializing new btree blocks and filling them with records or key/ptr
  28 * pairs.  Regular btree operations (e.g. queries and modifications) are not
  29 * supported with staging cursors, and callers must not invoke them.
  30 *
  31 * Fake root structures contain all the information about a btree that is under
  32 * construction by the bulk loading code.  Staging btree cursors point to fake
  33 * root structures instead of the usual AG header or inode structure.
  34 *
  35 * Callers are expected to initialize a fake root structure and pass it into
  36 * the _stage_cursor function for a specific btree type.  When bulk loading is
  37 * complete, callers should call the _commit_staged_btree function for that
  38 * specific btree type to commit the new btree into the filesystem.
  39 */
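/*
 * As a rough sketch of that lifecycle (a hypothetical "foobt" btree type
 * stands in for a real one; the staging and commit helpers named below are
 * illustrative, not actual functions), an AG btree rebuild might look like:
 *
 *	struct xbtree_afakeroot	afake = { 0 };
 *	struct xfs_btree_cur	*cur;
 *
 *	cur = xfs_foobt_stage_cursor(mp, &afake, ...);
 *	(bulk load records through the staging cursor)
 *	xfs_foobt_commit_staged_btree(cur, tp, agbp);
 *	xfs_btree_del_cursor(cur, 0);
 */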
  40
  41/*
  42 * Don't allow staging cursors to be duplicated because they're supposed to be
  43 * kept private to a single thread.
  44 */
  45STATIC struct xfs_btree_cur *
  46xfs_btree_fakeroot_dup_cursor(
  47        struct xfs_btree_cur    *cur)
  48{
  49        ASSERT(0);
  50        return NULL;
  51}
  52
  53/*
  54 * Don't allow block allocation for a staging cursor, because staging cursors
  55 * do not support regular btree modifications.
  56 *
  57 * Bulk loading uses a separate callback to obtain new blocks from a
  58 * preallocated list, which prevents ENOSPC failures during loading.
  59 */
  60STATIC int
  61xfs_btree_fakeroot_alloc_block(
  62        struct xfs_btree_cur            *cur,
  63        const union xfs_btree_ptr       *start_bno,
  64        union xfs_btree_ptr             *new_bno,
  65        int                             *stat)
  66{
  67        ASSERT(0);
  68        return -EFSCORRUPTED;
  69}
  70
  71/*
  72 * Don't allow block freeing for a staging cursor, because staging cursors
  73 * do not support regular btree modifications.
  74 */
  75STATIC int
  76xfs_btree_fakeroot_free_block(
  77        struct xfs_btree_cur    *cur,
  78        struct xfs_buf          *bp)
  79{
  80        ASSERT(0);
  81        return -EFSCORRUPTED;
  82}
  83
  84/* Initialize a pointer to the root block from the fakeroot. */
  85STATIC void
  86xfs_btree_fakeroot_init_ptr_from_cur(
  87        struct xfs_btree_cur    *cur,
  88        union xfs_btree_ptr     *ptr)
  89{
  90        struct xbtree_afakeroot *afake;
  91
  92        ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
  93
  94        afake = cur->bc_ag.afake;
  95        ptr->s = cpu_to_be32(afake->af_root);
  96}
  97
  98/*
  99 * Bulk Loading for AG Btrees
 100 * ==========================
 101 *
  102 * For a btree rooted in an AG header, pass an xbtree_afakeroot structure to the
 103 * staging cursor.  Callers should initialize this to zero.
 104 *
 105 * The _stage_cursor() function for a specific btree type should call
 106 * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
 107 * cursor.  The corresponding _commit_staged_btree() function should log the
 108 * new root and call xfs_btree_commit_afakeroot() to transform the staging
 109 * cursor into a regular btree cursor.
 110 */
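/*
 * A minimal sketch of the two per-type hooks described above, modeled on what
 * the allocation btree does but using a hypothetical "foobt" whose cursor
 * constructor, ops table, and AGF root/level fields are made up for
 * illustration:
 *
 *	struct xfs_btree_cur *
 *	xfs_foobt_stage_cursor(
 *		struct xfs_mount	*mp,
 *		struct xbtree_afakeroot	*afake)
 *	{
 *		struct xfs_btree_cur	*cur;
 *
 *		cur = xfs_foobt_init_common(mp, NULL);
 *		xfs_btree_stage_afakeroot(cur, afake);
 *		return cur;
 *	}
 *
 *	void
 *	xfs_foobt_commit_staged_btree(
 *		struct xfs_btree_cur	*cur,
 *		struct xfs_trans	*tp,
 *		struct xfs_buf		*agbp)
 *	{
 *		struct xfs_agf		*agf = agbp->b_addr;
 *		struct xbtree_afakeroot	*afake = cur->bc_ag.afake;
 *
 *		agf->agf_foo_root = cpu_to_be32(afake->af_root);
 *		agf->agf_foo_level = cpu_to_be32(afake->af_levels);
 *		xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
 *		xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_foobt_ops);
 *	}
 */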
 111
 112/* Update the btree root information for a per-AG fake root. */
 113STATIC void
 114xfs_btree_afakeroot_set_root(
 115        struct xfs_btree_cur            *cur,
 116        const union xfs_btree_ptr       *ptr,
 117        int                             inc)
 118{
 119        struct xbtree_afakeroot *afake = cur->bc_ag.afake;
 120
 121        ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 122        afake->af_root = be32_to_cpu(ptr->s);
 123        afake->af_levels += inc;
 124}
 125
 126/*
  127 * Initialize an AG-rooted btree cursor with the given AG btree fake root.
 128 * The btree cursor's bc_ops will be overridden as needed to make the staging
 129 * functionality work.
 130 */
 131void
 132xfs_btree_stage_afakeroot(
 133        struct xfs_btree_cur            *cur,
 134        struct xbtree_afakeroot         *afake)
 135{
 136        struct xfs_btree_ops            *nops;
 137
 138        ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
 139        ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
 140        ASSERT(cur->bc_tp == NULL);
 141
 142        nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
 143        memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
 144        nops->alloc_block = xfs_btree_fakeroot_alloc_block;
 145        nops->free_block = xfs_btree_fakeroot_free_block;
 146        nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
 147        nops->set_root = xfs_btree_afakeroot_set_root;
 148        nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
 149
 150        cur->bc_ag.afake = afake;
 151        cur->bc_nlevels = afake->af_levels;
 152        cur->bc_ops = nops;
 153        cur->bc_flags |= XFS_BTREE_STAGING;
 154}
 155
 156/*
 157 * Transform an AG-rooted staging btree cursor back into a regular cursor by
 158 * substituting a real btree root for the fake one and restoring normal btree
 159 * cursor ops.  The caller must log the btree root change prior to calling
 160 * this.
 161 */
 162void
 163xfs_btree_commit_afakeroot(
 164        struct xfs_btree_cur            *cur,
 165        struct xfs_trans                *tp,
 166        struct xfs_buf                  *agbp,
 167        const struct xfs_btree_ops      *ops)
 168{
 169        ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 170        ASSERT(cur->bc_tp == NULL);
 171
 172        trace_xfs_btree_commit_afakeroot(cur);
 173
 174        kmem_free((void *)cur->bc_ops);
 175        cur->bc_ag.agbp = agbp;
 176        cur->bc_ops = ops;
 177        cur->bc_flags &= ~XFS_BTREE_STAGING;
 178        cur->bc_tp = tp;
 179}
 180
 181/*
 182 * Bulk Loading for Inode-Rooted Btrees
 183 * ====================================
 184 *
  185 * For a btree rooted in an inode fork, pass an xbtree_ifakeroot structure to
 186 * the staging cursor.  This structure should be initialized as follows:
 187 *
  188 * - if_fork_size should be set to the number of bytes available to the fork
  189 *   in the inode.
 190 *
 191 * - if_fork should point to a freshly allocated struct xfs_ifork.
 192 *
 193 * - if_format should be set to the appropriate fork type (e.g.
 194 *   XFS_DINODE_FMT_BTREE).
 195 *
 196 * All other fields must be zero.
 197 *
 198 * The _stage_cursor() function for a specific btree type should call
 199 * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
 200 * cursor.  The corresponding _commit_staged_btree() function should log the
 201 * new root and call xfs_btree_commit_ifakeroot() to transform the staging
 202 * cursor into a regular btree cursor.
 203 */
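/*
 * A sketch of the ifakeroot setup described above.  The fork size macro and
 * the "foobt" staging/commit helpers are assumptions made for illustration:
 *
 *	struct xbtree_ifakeroot	ifake = { 0 };
 *
 *	ifake.if_fork = kmem_zalloc(sizeof(struct xfs_ifork), KM_NOFS);
 *	ifake.if_fork_size = XFS_IFORK_SIZE(ip, whichfork);
 *	ifake.if_format = XFS_DINODE_FMT_BTREE;
 *
 *	cur = xfs_foobt_stage_cursor(mp, &ifake, ip, whichfork);
 *	(bulk load records through the staging cursor)
 *	xfs_foobt_commit_staged_btree(cur, tp, whichfork);
 *	xfs_btree_del_cursor(cur, 0);
 */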
 204
 205/*
 206 * Initialize an inode-rooted btree cursor with the given inode btree fake
 207 * root.  The btree cursor's bc_ops will be overridden as needed to make the
 208 * staging functionality work.  If new_ops is not NULL, these new ops will be
 209 * passed out to the caller for further overriding.
 210 */
 211void
 212xfs_btree_stage_ifakeroot(
 213        struct xfs_btree_cur            *cur,
 214        struct xbtree_ifakeroot         *ifake,
 215        struct xfs_btree_ops            **new_ops)
 216{
 217        struct xfs_btree_ops            *nops;
 218
 219        ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
 220        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
 221        ASSERT(cur->bc_tp == NULL);
 222
 223        nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
 224        memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
 225        nops->alloc_block = xfs_btree_fakeroot_alloc_block;
 226        nops->free_block = xfs_btree_fakeroot_free_block;
 227        nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
 228        nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
 229
 230        cur->bc_ino.ifake = ifake;
 231        cur->bc_nlevels = ifake->if_levels;
 232        cur->bc_ops = nops;
 233        cur->bc_flags |= XFS_BTREE_STAGING;
 234
 235        if (new_ops)
 236                *new_ops = nops;
 237}
 238
 239/*
 240 * Transform an inode-rooted staging btree cursor back into a regular cursor by
 241 * substituting a real btree root for the fake one and restoring normal btree
 242 * cursor ops.  The caller must log the btree root change prior to calling
 243 * this.
 244 */
 245void
 246xfs_btree_commit_ifakeroot(
 247        struct xfs_btree_cur            *cur,
 248        struct xfs_trans                *tp,
 249        int                             whichfork,
 250        const struct xfs_btree_ops      *ops)
 251{
 252        ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 253        ASSERT(cur->bc_tp == NULL);
 254
 255        trace_xfs_btree_commit_ifakeroot(cur);
 256
 257        kmem_free((void *)cur->bc_ops);
 258        cur->bc_ino.ifake = NULL;
 259        cur->bc_ino.whichfork = whichfork;
 260        cur->bc_ops = ops;
 261        cur->bc_flags &= ~XFS_BTREE_STAGING;
 262        cur->bc_tp = tp;
 263}
 264
 265/*
 266 * Bulk Loading of Staged Btrees
 267 * =============================
 268 *
 269 * This interface is used with a staged btree cursor to create a totally new
 270 * btree with a large number of records (i.e. more than what would fit in a
 271 * single root block).  When the creation is complete, the new root can be
 272 * linked atomically into the filesystem by committing the staged cursor.
 273 *
 274 * Creation of a new btree proceeds roughly as follows:
 275 *
 276 * The first step is to initialize an appropriate fake btree root structure and
 277 * then construct a staged btree cursor.  Refer to the block comments about
 278 * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
 279 * more information about how to do this.
 280 *
 281 * The second step is to initialize a struct xfs_btree_bload context as
 282 * documented in the structure definition.
 283 *
  284 * The third step is to call xfs_btree_bload_compute_geometry to compute the
  285 * height of the new btree and the number of blocks needed to construct it.
  286 * See the section "Computing the Geometry of the New Btree" for details
  287 * about this computation.
 288 *
 289 * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
 290 * save them for later use by ->claim_block().  Bulk loading requires all
 291 * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
 292 * rebuild, and to minimize seek distances of the new btree.
 293 *
 294 * Step five is to call xfs_btree_bload() to start constructing the btree.
 295 *
 296 * The final step is to commit the staging btree cursor, which logs the new
 297 * btree root and turns the staging cursor into a regular cursor.  The caller
 298 * is responsible for cleaning up the previous btree blocks, if any.
 299 *
 300 * Computing the Geometry of the New Btree
 301 * =======================================
 302 *
 303 * The number of items placed in each btree block is computed via the following
 304 * algorithm: For leaf levels, the number of items for the level is nr_records
 305 * in the bload structure.  For node levels, the number of items for the level
 306 * is the number of blocks in the next lower level of the tree.  For each
 307 * level, the desired number of items per block is defined as:
 308 *
 309 * desired = max(minrecs, maxrecs - slack factor)
 310 *
 311 * The number of blocks for the level is defined to be:
 312 *
 313 * blocks = floor(nr_items / desired)
 314 *
 315 * Note this is rounded down so that the npb calculation below will never fall
 316 * below minrecs.  The number of items that will actually be loaded into each
 317 * btree block is defined as:
 318 *
  319 * npb = nr_items / blocks
 320 *
 321 * Some of the leftmost blocks in the level will contain one extra record as
 322 * needed to handle uneven division.  If the number of records in any block
 323 * would exceed maxrecs for that level, blocks is incremented and npb is
 324 * recalculated.
 325 *
 326 * In other words, we compute the number of blocks needed to satisfy a given
 327 * loading level, then spread the items as evenly as possible.
 328 *
 329 * The height and number of fs blocks required to create the btree are computed
 330 * and returned via btree_height and nr_blocks.
 331 */
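/*
 * A worked example of the sizing algorithm above, with made-up numbers:
 * suppose a leaf level must hold nr_items = 100 records, with maxrecs = 10,
 * minrecs = 4, and leaf_slack = 2.  Then:
 *
 *	desired = max(4, 10 - 2)  = 8
 *	blocks  = floor(100 / 8)  = 12
 *	npb     = 100 / 12        = 8, remainder 4
 *
 * so four blocks are loaded with 9 records and the other eight with 8
 * records (4 * 9 + 8 * 8 = 100).  Since 9 does not exceed maxrecs, no extra
 * block is needed.  The node level above then repeats the computation with
 * nr_items = 12 (the number of leaf blocks just laid out).
 */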
 332
 333/*
 334 * Put a btree block that we're loading onto the ordered list and release it.
 335 * The btree blocks will be written to disk when bulk loading is finished.
 336 */
 337static void
 338xfs_btree_bload_drop_buf(
 339        struct list_head        *buffers_list,
 340        struct xfs_buf          **bpp)
 341{
 342        if (*bpp == NULL)
 343                return;
 344
 345        if (!xfs_buf_delwri_queue(*bpp, buffers_list))
 346                ASSERT(0);
 347
 348        xfs_buf_relse(*bpp);
 349        *bpp = NULL;
 350}
 351
 352/*
 353 * Allocate and initialize one btree block for bulk loading.
 354 *
 355 * The new btree block will have its level and numrecs fields set to the values
 356 * of the level and nr_this_block parameters, respectively.
 357 *
 358 * The caller should ensure that ptrp, bpp, and blockp refer to the left
 359 * sibling of the new block, if there is any.  On exit, ptrp, bpp, and blockp
 360 * will all point to the new block.
 361 */
 362STATIC int
 363xfs_btree_bload_prep_block(
 364        struct xfs_btree_cur            *cur,
 365        struct xfs_btree_bload          *bbl,
 366        struct list_head                *buffers_list,
 367        unsigned int                    level,
 368        unsigned int                    nr_this_block,
 369        union xfs_btree_ptr             *ptrp, /* in/out */
 370        struct xfs_buf                  **bpp, /* in/out */
 371        struct xfs_btree_block          **blockp, /* in/out */
 372        void                            *priv)
 373{
 374        union xfs_btree_ptr             new_ptr;
 375        struct xfs_buf                  *new_bp;
 376        struct xfs_btree_block          *new_block;
 377        int                             ret;
 378
 379        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 380            level == cur->bc_nlevels - 1) {
 381                struct xfs_ifork        *ifp = xfs_btree_ifork_ptr(cur);
 382                size_t                  new_size;
 383
 384                ASSERT(*bpp == NULL);
 385
 386                /* Allocate a new incore btree root block. */
 387                new_size = bbl->iroot_size(cur, nr_this_block, priv);
 388                ifp->if_broot = kmem_zalloc(new_size, 0);
 389                ifp->if_broot_bytes = (int)new_size;
 390
 391                /* Initialize it and send it out. */
 392                xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
 393                                XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
 394                                nr_this_block, cur->bc_ino.ip->i_ino,
 395                                cur->bc_flags);
 396
 397                *bpp = NULL;
 398                *blockp = ifp->if_broot;
 399                xfs_btree_set_ptr_null(cur, ptrp);
 400                return 0;
 401        }
 402
 403        /* Claim one of the caller's preallocated blocks. */
 404        xfs_btree_set_ptr_null(cur, &new_ptr);
 405        ret = bbl->claim_block(cur, &new_ptr, priv);
 406        if (ret)
 407                return ret;
 408
 409        ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));
 410
 411        ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
 412        if (ret)
 413                return ret;
 414
 415        /*
 416         * The previous block (if any) is the left sibling of the new block,
 417         * so set its right sibling pointer to the new block and drop it.
 418         */
 419        if (*blockp)
 420                xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
 421        xfs_btree_bload_drop_buf(buffers_list, bpp);
 422
 423        /* Initialize the new btree block. */
 424        xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
 425        xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);
 426
 427        /* Set the out parameters. */
 428        *bpp = new_bp;
 429        *blockp = new_block;
 430        xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
 431        return 0;
 432}
 433
 434/* Load one leaf block. */
 435STATIC int
 436xfs_btree_bload_leaf(
 437        struct xfs_btree_cur            *cur,
 438        unsigned int                    recs_this_block,
 439        xfs_btree_bload_get_record_fn   get_record,
 440        struct xfs_btree_block          *block,
 441        void                            *priv)
 442{
 443        unsigned int                    j;
 444        int                             ret;
 445
 446        /* Fill the leaf block with records. */
 447        for (j = 1; j <= recs_this_block; j++) {
 448                union xfs_btree_rec     *block_rec;
 449
 450                ret = get_record(cur, priv);
 451                if (ret)
 452                        return ret;
 453                block_rec = xfs_btree_rec_addr(cur, j, block);
 454                cur->bc_ops->init_rec_from_cur(cur, block_rec);
 455        }
 456
 457        return 0;
 458}
 459
 460/*
 461 * Load one node block with key/ptr pairs.
 462 *
  463 * child_ptr must point to a block within the next level down in the tree.  A
  464 * key/ptr entry for the block pointed to by child_ptr will be added to the new
  465 * node block.  On exit, child_ptr points to the next block on the child
 466 * level that needs processing.
 467 */
 468STATIC int
 469xfs_btree_bload_node(
 470        struct xfs_btree_cur    *cur,
 471        unsigned int            recs_this_block,
 472        union xfs_btree_ptr     *child_ptr,
 473        struct xfs_btree_block  *block)
 474{
 475        unsigned int            j;
 476        int                     ret;
 477
 478        /* Fill the node block with keys and pointers. */
 479        for (j = 1; j <= recs_this_block; j++) {
 480                union xfs_btree_key     child_key;
 481                union xfs_btree_ptr     *block_ptr;
 482                union xfs_btree_key     *block_key;
 483                struct xfs_btree_block  *child_block;
 484                struct xfs_buf          *child_bp;
 485
 486                ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));
 487
 488                ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block,
 489                                &child_bp);
 490                if (ret)
 491                        return ret;
 492
 493                block_ptr = xfs_btree_ptr_addr(cur, j, block);
 494                xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);
 495
 496                block_key = xfs_btree_key_addr(cur, j, block);
 497                xfs_btree_get_keys(cur, child_block, &child_key);
 498                xfs_btree_copy_keys(cur, block_key, &child_key, 1);
 499
 500                xfs_btree_get_sibling(cur, child_block, child_ptr,
 501                                XFS_BB_RIGHTSIB);
 502                xfs_buf_relse(child_bp);
 503        }
 504
 505        return 0;
 506}
 507
 508/*
 509 * Compute the maximum number of records (or keyptrs) per block that we want to
 510 * install at this level in the btree.  Caller is responsible for having set
 511 * @cur->bc_ino.forksize to the desired fork size, if appropriate.
 512 */
 513STATIC unsigned int
 514xfs_btree_bload_max_npb(
 515        struct xfs_btree_cur    *cur,
 516        struct xfs_btree_bload  *bbl,
 517        unsigned int            level)
 518{
 519        unsigned int            ret;
 520
 521        if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
 522                return cur->bc_ops->get_dmaxrecs(cur, level);
 523
 524        ret = cur->bc_ops->get_maxrecs(cur, level);
 525        if (level == 0)
 526                ret -= bbl->leaf_slack;
 527        else
 528                ret -= bbl->node_slack;
 529        return ret;
 530}
 531
 532/*
 533 * Compute the desired number of records (or keyptrs) per block that we want to
 534 * install at this level in the btree, which must be somewhere between minrecs
 535 * and max_npb.  The caller is free to install fewer records per block.
 536 */
 537STATIC unsigned int
 538xfs_btree_bload_desired_npb(
 539        struct xfs_btree_cur    *cur,
 540        struct xfs_btree_bload  *bbl,
 541        unsigned int            level)
 542{
 543        unsigned int            npb = xfs_btree_bload_max_npb(cur, bbl, level);
 544
 545        /* Root blocks are not subject to minrecs rules. */
 546        if (level == cur->bc_nlevels - 1)
 547                return max(1U, npb);
 548
 549        return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
 550}
 551
 552/*
 553 * Compute the number of records to be stored in each block at this level and
 554 * the number of blocks for this level.  For leaf levels, we must populate an
  555 * empty root block even if there are no records, so there must always be at
  556 * least one block.
 557 */
 558STATIC void
 559xfs_btree_bload_level_geometry(
 560        struct xfs_btree_cur    *cur,
 561        struct xfs_btree_bload  *bbl,
 562        unsigned int            level,
 563        uint64_t                nr_this_level,
 564        unsigned int            *avg_per_block,
 565        uint64_t                *blocks,
 566        uint64_t                *blocks_with_extra)
 567{
 568        uint64_t                npb;
 569        uint64_t                dontcare;
 570        unsigned int            desired_npb;
 571        unsigned int            maxnr;
 572
 573        maxnr = cur->bc_ops->get_maxrecs(cur, level);
 574
 575        /*
 576         * Compute the number of blocks we need to fill each block with the
 577         * desired number of records/keyptrs per block.  Because desired_npb
 578         * could be minrecs, we use regular integer division (which rounds
 579         * the block count down) so that in the next step the effective # of
 580         * items per block will never be less than desired_npb.
 581         */
 582        desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
 583        *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
 584        *blocks = max(1ULL, *blocks);
 585
 586        /*
 587         * Compute the number of records that we will actually put in each
 588         * block, assuming that we want to spread the records evenly between
 589         * the blocks.  Take care that the effective # of items per block (npb)
 590         * won't exceed maxrecs even for the blocks that get an extra record,
 591         * since desired_npb could be maxrecs, and in the previous step we
 592         * rounded the block count down.
 593         */
 594        npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
 595        if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
 596                (*blocks)++;
 597                npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
 598        }
 599
 600        *avg_per_block = min_t(uint64_t, npb, nr_this_level);
 601
 602        trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
 603                        *avg_per_block, desired_npb, *blocks,
 604                        *blocks_with_extra);
 605}
 606
 607/*
 608 * Ensure a slack value is appropriate for the btree.
 609 *
 610 * If the slack value is negative, set slack so that we fill the block to
 611 * halfway between minrecs and maxrecs.  Make sure the slack is never so large
 612 * that we can underflow minrecs.
 613 */
 614static void
 615xfs_btree_bload_ensure_slack(
 616        struct xfs_btree_cur    *cur,
 617        int                     *slack,
 618        int                     level)
 619{
 620        int                     maxr;
 621        int                     minr;
 622
 623        maxr = cur->bc_ops->get_maxrecs(cur, level);
 624        minr = cur->bc_ops->get_minrecs(cur, level);
 625
 626        /*
 627         * If slack is negative, automatically set slack so that we load the
 628         * btree block approximately halfway between minrecs and maxrecs.
 629         * Generally, this will net us 75% loading.
 630         */
 631        if (*slack < 0)
 632                *slack = maxr - ((maxr + minr) >> 1);
 633
 634        *slack = min(*slack, maxr - minr);
 635}
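/*
 * For example, with made-up numbers: maxrecs = 16 and minrecs = 8 turn a
 * negative slack into 16 - ((16 + 8) >> 1) = 4, so blocks are loaded with
 * roughly 16 - 4 = 12 records each, i.e. the ~75% fill mentioned above.
 */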
 636
 637/*
 638 * Prepare a btree cursor for a bulk load operation by computing the geometry
 639 * fields in bbl.  Caller must ensure that the btree cursor is a staging
 640 * cursor.  This function can be called multiple times.
 641 */
 642int
 643xfs_btree_bload_compute_geometry(
 644        struct xfs_btree_cur    *cur,
 645        struct xfs_btree_bload  *bbl,
 646        uint64_t                nr_records)
 647{
 648        uint64_t                nr_blocks = 0;
 649        uint64_t                nr_this_level;
 650
 651        ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 652
 653        /*
 654         * Make sure that the slack values make sense for traditional leaf and
 655         * node blocks.  Inode-rooted btrees will return different minrecs and
  656         * maxrecs values for the root block (level == bc_nlevels - 1).  We're
 657         * checking levels 0 and 1 here, so set bc_nlevels such that the btree
 658         * code doesn't interpret either as the root level.
 659         */
 660        cur->bc_nlevels = cur->bc_maxlevels - 1;
 661        xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
 662        xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
 663
 664        bbl->nr_records = nr_this_level = nr_records;
 665        for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
 666                uint64_t        level_blocks;
 667                uint64_t        dontcare64;
 668                unsigned int    level = cur->bc_nlevels - 1;
 669                unsigned int    avg_per_block;
 670
 671                xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
 672                                &avg_per_block, &level_blocks, &dontcare64);
 673
 674                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
 675                        /*
 676                         * If all the items we want to store at this level
 677                         * would fit in the inode root block, then we have our
 678                         * btree root and are done.
 679                         *
 680                         * Note that bmap btrees forbid records in the root.
 681                         */
 682                        if (level != 0 && nr_this_level <= avg_per_block) {
 683                                nr_blocks++;
 684                                break;
 685                        }
 686
 687                        /*
 688                         * Otherwise, we have to store all the items for this
 689                         * level in traditional btree blocks and therefore need
 690                         * another level of btree to point to those blocks.
 691                         *
 692                         * We have to re-compute the geometry for each level of
 693                         * an inode-rooted btree because the geometry differs
 694                         * between a btree root in an inode fork and a
 695                         * traditional btree block.
 696                         *
 697                         * This distinction is made in the btree code based on
 698                         * whether level == bc_nlevels - 1.  Based on the
 699                         * previous root block size check against the root
 700                         * block geometry, we know that we aren't yet ready to
  701                         * populate the root.  Increment bc_nlevels and
 702                         * recalculate the geometry for a traditional
 703                         * block-based btree level.
 704                         */
 705                        cur->bc_nlevels++;
 706                        ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
 707                        xfs_btree_bload_level_geometry(cur, bbl, level,
 708                                        nr_this_level, &avg_per_block,
 709                                        &level_blocks, &dontcare64);
 710                } else {
 711                        /*
 712                         * If all the items we want to store at this level
 713                         * would fit in a single root block, we're done.
 714                         */
 715                        if (nr_this_level <= avg_per_block) {
 716                                nr_blocks++;
 717                                break;
 718                        }
 719
 720                        /* Otherwise, we need another level of btree. */
 721                        cur->bc_nlevels++;
 722                        ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
 723                }
 724
 725                nr_blocks += level_blocks;
 726                nr_this_level = level_blocks;
 727        }
 728
 729        if (cur->bc_nlevels > cur->bc_maxlevels)
 730                return -EOVERFLOW;
 731
 732        bbl->btree_height = cur->bc_nlevels;
 733        if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
 734                bbl->nr_blocks = nr_blocks - 1;
 735        else
 736                bbl->nr_blocks = nr_blocks;
 737        return 0;
 738}
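/*
 * Putting the pieces together, a caller would typically drive steps three
 * through five of the bulk load like this; the block reservation in the
 * middle is whatever mechanism will feed ->claim_block(), and is only
 * sketched here:
 *
 *	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
 *	if (error)
 *		return error;
 *
 *	(reserve bbl.nr_blocks blocks for ->claim_block to hand out)
 *
 *	error = xfs_btree_bload(cur, &bbl, priv);
 */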
 739
 740/* Bulk load a btree given the parameters and geometry established in bbl. */
 741int
 742xfs_btree_bload(
 743        struct xfs_btree_cur            *cur,
 744        struct xfs_btree_bload          *bbl,
 745        void                            *priv)
 746{
 747        struct list_head                buffers_list;
 748        union xfs_btree_ptr             child_ptr;
 749        union xfs_btree_ptr             ptr;
 750        struct xfs_buf                  *bp = NULL;
 751        struct xfs_btree_block          *block = NULL;
 752        uint64_t                        nr_this_level = bbl->nr_records;
 753        uint64_t                        blocks;
 754        uint64_t                        i;
 755        uint64_t                        blocks_with_extra;
 756        uint64_t                        total_blocks = 0;
 757        unsigned int                    avg_per_block;
 758        unsigned int                    level = 0;
 759        int                             ret;
 760
 761        ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 762
 763        INIT_LIST_HEAD(&buffers_list);
 764        cur->bc_nlevels = bbl->btree_height;
 765        xfs_btree_set_ptr_null(cur, &child_ptr);
 766        xfs_btree_set_ptr_null(cur, &ptr);
 767
 768        xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
 769                        &avg_per_block, &blocks, &blocks_with_extra);
 770
 771        /* Load each leaf block. */
 772        for (i = 0; i < blocks; i++) {
 773                unsigned int            nr_this_block = avg_per_block;
 774
 775                /*
 776                 * Due to rounding, btree blocks will not be evenly populated
 777                 * in most cases.  blocks_with_extra tells us how many blocks
 778                 * will receive an extra record to distribute the excess across
 779                 * the current level as evenly as possible.
 780                 */
 781                if (i < blocks_with_extra)
 782                        nr_this_block++;
 783
 784                ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
 785                                nr_this_block, &ptr, &bp, &block, priv);
 786                if (ret)
 787                        goto out;
 788
 789                trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
 790                                nr_this_block);
 791
 792                ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record,
 793                                block, priv);
 794                if (ret)
 795                        goto out;
 796
 797                /*
 798                 * Record the leftmost leaf pointer so we know where to start
 799                 * with the first node level.
 800                 */
 801                if (i == 0)
 802                        xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
 803        }
 804        total_blocks += blocks;
 805        xfs_btree_bload_drop_buf(&buffers_list, &bp);
 806
 807        /* Populate the internal btree nodes. */
 808        for (level = 1; level < cur->bc_nlevels; level++) {
 809                union xfs_btree_ptr     first_ptr;
 810
 811                nr_this_level = blocks;
 812                block = NULL;
 813                xfs_btree_set_ptr_null(cur, &ptr);
 814
 815                xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
 816                                &avg_per_block, &blocks, &blocks_with_extra);
 817
 818                /* Load each node block. */
 819                for (i = 0; i < blocks; i++) {
 820                        unsigned int    nr_this_block = avg_per_block;
 821
 822                        if (i < blocks_with_extra)
 823                                nr_this_block++;
 824
 825                        ret = xfs_btree_bload_prep_block(cur, bbl,
 826                                        &buffers_list, level, nr_this_block,
 827                                        &ptr, &bp, &block, priv);
 828                        if (ret)
 829                                goto out;
 830
 831                        trace_xfs_btree_bload_block(cur, level, i, blocks,
 832                                        &ptr, nr_this_block);
 833
 834                        ret = xfs_btree_bload_node(cur, nr_this_block,
 835                                        &child_ptr, block);
 836                        if (ret)
 837                                goto out;
 838
 839                        /*
 840                         * Record the leftmost node pointer so that we know
 841                         * where to start the next node level above this one.
 842                         */
 843                        if (i == 0)
 844                                xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
 845                }
 846                total_blocks += blocks;
 847                xfs_btree_bload_drop_buf(&buffers_list, &bp);
 848                xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
 849        }
 850
 851        /* Initialize the new root. */
 852        if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
 853                ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
 854                cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
 855                cur->bc_ino.ifake->if_blocks = total_blocks - 1;
 856        } else {
 857                cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
 858                cur->bc_ag.afake->af_levels = cur->bc_nlevels;
 859                cur->bc_ag.afake->af_blocks = total_blocks;
 860        }
 861
 862        /*
 863         * Write the new blocks to disk.  If the ordered list isn't empty after
 864         * that, then something went wrong and we have to fail.  This should
 865         * never happen, but we'll check anyway.
 866         */
 867        ret = xfs_buf_delwri_submit(&buffers_list);
 868        if (ret)
 869                goto out;
 870        if (!list_empty(&buffers_list)) {
 871                ASSERT(list_empty(&buffers_list));
 872                ret = -EIO;
 873        }
 874
 875out:
 876        xfs_buf_delwri_cancel(&buffers_list);
 877        if (bp)
 878                xfs_buf_relse(bp);
 879        return ret;
 880}
 881