linux/fs/xfs/libxfs/xfs_btree.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   4 * All Rights Reserved.
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_log_format.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_bit.h"
  13#include "xfs_mount.h"
  14#include "xfs_inode.h"
  15#include "xfs_trans.h"
  16#include "xfs_buf_item.h"
  17#include "xfs_btree.h"
  18#include "xfs_errortag.h"
  19#include "xfs_error.h"
  20#include "xfs_trace.h"
  21#include "xfs_alloc.h"
  22#include "xfs_log.h"
  23#include "xfs_btree_staging.h"
  24#include "xfs_ag.h"
  25#include "xfs_alloc_btree.h"
  26#include "xfs_ialloc_btree.h"
  27#include "xfs_bmap_btree.h"
  28#include "xfs_rmap_btree.h"
  29#include "xfs_refcount_btree.h"
  30
  31/*
  32 * Btree magic numbers.
  33 */
  34static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
  35        { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
  36          XFS_FIBT_MAGIC, 0 },
  37        { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
  38          XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
  39          XFS_REFC_CRC_MAGIC }
  40};
  41
  42uint32_t
  43xfs_btree_magic(
  44        int                     crc,
  45        xfs_btnum_t             btnum)
  46{
  47        uint32_t                magic = xfs_magics[crc][btnum];
  48
  49        /* Ensure we asked for crc for crc-only magics. */
  50        ASSERT(magic != 0);
  51        return magic;
  52}
  53
  54/*
  55 * Check a long btree block header.  Return the address of the failing check,
  56 * or NULL if everything is ok.
  57 */
  58xfs_failaddr_t
  59__xfs_btree_check_lblock(
  60        struct xfs_btree_cur    *cur,
  61        struct xfs_btree_block  *block,
  62        int                     level,
  63        struct xfs_buf          *bp)
  64{
  65        struct xfs_mount        *mp = cur->bc_mp;
  66        xfs_btnum_t             btnum = cur->bc_btnum;
  67        int                     crc = xfs_has_crc(mp);
  68
  69        if (crc) {
  70                if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
  71                        return __this_address;
  72                if (block->bb_u.l.bb_blkno !=
  73                    cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
  74                        return __this_address;
  75                if (block->bb_u.l.bb_pad != cpu_to_be32(0))
  76                        return __this_address;
  77        }
  78
  79        if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
  80                return __this_address;
  81        if (be16_to_cpu(block->bb_level) != level)
  82                return __this_address;
  83        if (be16_to_cpu(block->bb_numrecs) >
  84            cur->bc_ops->get_maxrecs(cur, level))
  85                return __this_address;
  86        if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
  87            !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
  88                        level + 1))
  89                return __this_address;
  90        if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
  91            !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
  92                        level + 1))
  93                return __this_address;
  94
  95        return NULL;
  96}
  97
  98/* Check a long btree block header. */
  99static int
 100xfs_btree_check_lblock(
 101        struct xfs_btree_cur    *cur,
 102        struct xfs_btree_block  *block,
 103        int                     level,
 104        struct xfs_buf          *bp)
 105{
 106        struct xfs_mount        *mp = cur->bc_mp;
 107        xfs_failaddr_t          fa;
 108
 109        fa = __xfs_btree_check_lblock(cur, block, level, bp);
 110        if (XFS_IS_CORRUPT(mp, fa != NULL) ||
 111            XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
 112                if (bp)
 113                        trace_xfs_btree_corrupt(bp, _RET_IP_);
 114                return -EFSCORRUPTED;
 115        }
 116        return 0;
 117}
 118
 119/*
 120 * Check a short btree block header.  Return the address of the failing check,
 121 * or NULL if everything is ok.
 122 */
 123xfs_failaddr_t
 124__xfs_btree_check_sblock(
 125        struct xfs_btree_cur    *cur,
 126        struct xfs_btree_block  *block,
 127        int                     level,
 128        struct xfs_buf          *bp)
 129{
 130        struct xfs_mount        *mp = cur->bc_mp;
 131        xfs_btnum_t             btnum = cur->bc_btnum;
 132        int                     crc = xfs_has_crc(mp);
 133
 134        if (crc) {
 135                if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
 136                        return __this_address;
 137                if (block->bb_u.s.bb_blkno !=
 138                    cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
 139                        return __this_address;
 140        }
 141
 142        if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
 143                return __this_address;
 144        if (be16_to_cpu(block->bb_level) != level)
 145                return __this_address;
 146        if (be16_to_cpu(block->bb_numrecs) >
 147            cur->bc_ops->get_maxrecs(cur, level))
 148                return __this_address;
 149        if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
 150            !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
 151                        level + 1))
 152                return __this_address;
 153        if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
 154            !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
 155                        level + 1))
 156                return __this_address;
 157
 158        return NULL;
 159}
 160
 161/* Check a short btree block header. */
 162STATIC int
 163xfs_btree_check_sblock(
 164        struct xfs_btree_cur    *cur,
 165        struct xfs_btree_block  *block,
 166        int                     level,
 167        struct xfs_buf          *bp)
 168{
 169        struct xfs_mount        *mp = cur->bc_mp;
 170        xfs_failaddr_t          fa;
 171
 172        fa = __xfs_btree_check_sblock(cur, block, level, bp);
 173        if (XFS_IS_CORRUPT(mp, fa != NULL) ||
 174            XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
 175                if (bp)
 176                        trace_xfs_btree_corrupt(bp, _RET_IP_);
 177                return -EFSCORRUPTED;
 178        }
 179        return 0;
 180}
 181
 182/*
 183 * Debug routine: check that block header is ok.
 184 */
 185int
 186xfs_btree_check_block(
 187        struct xfs_btree_cur    *cur,   /* btree cursor */
 188        struct xfs_btree_block  *block, /* generic btree block pointer */
 189        int                     level,  /* level of the btree block */
 190        struct xfs_buf          *bp)    /* buffer containing block, if any */
 191{
 192        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 193                return xfs_btree_check_lblock(cur, block, level, bp);
 194        else
 195                return xfs_btree_check_sblock(cur, block, level, bp);
 196}
 197
 198/* Check that this long pointer is valid and points within the fs. */
 199bool
 200xfs_btree_check_lptr(
 201        struct xfs_btree_cur    *cur,
 202        xfs_fsblock_t           fsbno,
 203        int                     level)
 204{
 205        if (level <= 0)
 206                return false;
 207        return xfs_verify_fsbno(cur->bc_mp, fsbno);
 208}
 209
 210/* Check that this short pointer is valid and points within the AG. */
 211bool
 212xfs_btree_check_sptr(
 213        struct xfs_btree_cur    *cur,
 214        xfs_agblock_t           agbno,
 215        int                     level)
 216{
 217        if (level <= 0)
 218                return false;
 219        return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno);
 220}
 221
 222/*
 223 * Check that a given (indexed) btree pointer at a certain level of a
 224 * btree is valid and doesn't point past where it should.
 225 */
 226static int
 227xfs_btree_check_ptr(
 228        struct xfs_btree_cur            *cur,
 229        const union xfs_btree_ptr       *ptr,
 230        int                             index,
 231        int                             level)
 232{
 233        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 234                if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
 235                                level))
 236                        return 0;
 237                xfs_err(cur->bc_mp,
 238"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
 239                                cur->bc_ino.ip->i_ino,
 240                                cur->bc_ino.whichfork, cur->bc_btnum,
 241                                level, index);
 242        } else {
 243                if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
 244                                level))
 245                        return 0;
 246                xfs_err(cur->bc_mp,
 247"AG %u: Corrupt btree %d pointer at level %d index %d.",
 248                                cur->bc_ag.pag->pag_agno, cur->bc_btnum,
 249                                level, index);
 250        }
 251
 252        return -EFSCORRUPTED;
 253}
 254
 255#ifdef DEBUG
 256# define xfs_btree_debug_check_ptr      xfs_btree_check_ptr
 257#else
 258# define xfs_btree_debug_check_ptr(...) (0)
 259#endif
 260
 261/*
 262 * Calculate CRC on the whole btree block and stuff it into the
 263 * long-form btree header.
 264 *
 265 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 266 * it into the buffer so recovery knows what the last modification was that made
 267 * it to disk.
 268 */
 269void
 270xfs_btree_lblock_calc_crc(
 271        struct xfs_buf          *bp)
 272{
 273        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 274        struct xfs_buf_log_item *bip = bp->b_log_item;
 275
 276        if (!xfs_has_crc(bp->b_mount))
 277                return;
 278        if (bip)
 279                block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 280        xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 281}
 282
 283bool
 284xfs_btree_lblock_verify_crc(
 285        struct xfs_buf          *bp)
 286{
 287        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 288        struct xfs_mount        *mp = bp->b_mount;
 289
 290        if (xfs_has_crc(mp)) {
 291                if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
 292                        return false;
 293                return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 294        }
 295
 296        return true;
 297}
 298
 299/*
 300 * Calculate CRC on the whole btree block and stuff it into the
 301 * short-form btree header.
 302 *
 303 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 304 * it into the buffer so recovery knows what the last modification was that made
 305 * it to disk.
 306 */
 307void
 308xfs_btree_sblock_calc_crc(
 309        struct xfs_buf          *bp)
 310{
 311        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 312        struct xfs_buf_log_item *bip = bp->b_log_item;
 313
 314        if (!xfs_has_crc(bp->b_mount))
 315                return;
 316        if (bip)
 317                block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 318        xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 319}
 320
 321bool
 322xfs_btree_sblock_verify_crc(
 323        struct xfs_buf          *bp)
 324{
 325        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 326        struct xfs_mount        *mp = bp->b_mount;
 327
 328        if (xfs_has_crc(mp)) {
 329                if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
 330                        return false;
 331                return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 332        }
 333
 334        return true;
 335}
 336
 337static int
 338xfs_btree_free_block(
 339        struct xfs_btree_cur    *cur,
 340        struct xfs_buf          *bp)
 341{
 342        int                     error;
 343
 344        error = cur->bc_ops->free_block(cur, bp);
 345        if (!error) {
 346                xfs_trans_binval(cur->bc_tp, bp);
 347                XFS_BTREE_STATS_INC(cur, free);
 348        }
 349        return error;
 350}
 351
 352/*
 353 * Delete the btree cursor.
 354 */
 355void
 356xfs_btree_del_cursor(
 357        struct xfs_btree_cur    *cur,           /* btree cursor */
 358        int                     error)          /* del because of error */
 359{
 360        int                     i;              /* btree level */
 361
 362        /*
 363         * Clear the buffer pointers and release the buffers. If we're doing
 364         * this because of an error, inspect all of the entries in the bc_bufs
 365         * array for buffers to be unlocked. This is because some of the btree
 366         * code works from level n down to 0, and if we get an error along the
 367         * way we won't have initialized all the entries down to 0.
 368         */
 369        for (i = 0; i < cur->bc_nlevels; i++) {
 370                if (cur->bc_levels[i].bp)
 371                        xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
 372                else if (!error)
 373                        break;
 374        }
 375
 376        ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
 377               xfs_is_shutdown(cur->bc_mp));
 378        if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
 379                kmem_free(cur->bc_ops);
 380        if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
 381                xfs_perag_put(cur->bc_ag.pag);
 382        kmem_cache_free(cur->bc_cache, cur);
 383}
 384
 385/*
 386 * Duplicate the btree cursor.
 387 * Allocate a new one, copy the record, re-get the buffers.
 388 */
 389int                                     /* error */
 390xfs_btree_dup_cursor(
 391        struct xfs_btree_cur *cur,              /* input cursor */
 392        struct xfs_btree_cur **ncur)            /* output cursor */
 393{
 394        struct xfs_buf  *bp;            /* btree block's buffer pointer */
 395        int             error;          /* error return value */
 396        int             i;              /* level number of btree block */
 397        xfs_mount_t     *mp;            /* mount structure for filesystem */
 398        struct xfs_btree_cur *new;              /* new cursor value */
 399        xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
 400
 401        tp = cur->bc_tp;
 402        mp = cur->bc_mp;
 403
 404        /*
 405         * Allocate a new cursor like the old one.
 406         */
 407        new = cur->bc_ops->dup_cursor(cur);
 408
 409        /*
 410         * Copy the record currently in the cursor.
 411         */
 412        new->bc_rec = cur->bc_rec;
 413
 414        /*
 415         * For each level current, re-get the buffer and copy the ptr value.
 416         */
 417        for (i = 0; i < new->bc_nlevels; i++) {
 418                new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
 419                new->bc_levels[i].ra = cur->bc_levels[i].ra;
 420                bp = cur->bc_levels[i].bp;
 421                if (bp) {
 422                        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 423                                                   xfs_buf_daddr(bp), mp->m_bsize,
 424                                                   0, &bp,
 425                                                   cur->bc_ops->buf_ops);
 426                        if (error) {
 427                                xfs_btree_del_cursor(new, error);
 428                                *ncur = NULL;
 429                                return error;
 430                        }
 431                }
 432                new->bc_levels[i].bp = bp;
 433        }
 434        *ncur = new;
 435        return 0;
 436}
 437
 438/*
 439 * XFS btree block layout and addressing:
 440 *
 441 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 442 *
 443 * The leaf record start with a header then followed by records containing
 444 * the values.  A non-leaf block also starts with the same header, and
 445 * then first contains lookup keys followed by an equal number of pointers
 446 * to the btree blocks at the previous level.
 447 *
 448 *              +--------+-------+-------+-------+-------+-------+-------+
 449 * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 450 *              +--------+-------+-------+-------+-------+-------+-------+
 451 *
 452 *              +--------+-------+-------+-------+-------+-------+-------+
 453 * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 454 *              +--------+-------+-------+-------+-------+-------+-------+
 455 *
 456 * The header is called struct xfs_btree_block for reasons better left unknown
 457 * and comes in different versions for short (32bit) and long (64bit) block
 458 * pointers.  The record and key structures are defined by the btree instances
 459 * and opaque to the btree core.  The block pointers are simple disk endian
 460 * integers, available in a short (32bit) and long (64bit) variant.
 461 *
 462 * The helpers below calculate the offset of a given record, key or pointer
 463 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 464 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
 465 * inside the btree block is done using indices starting at one, not zero!
 466 *
 467 * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
 468 * overlapping intervals.  In such a tree, records are still sorted lowest to
 469 * highest and indexed by the smallest key value that refers to the record.
 470 * However, nodes are different: each pointer has two associated keys -- one
 471 * indexing the lowest key available in the block(s) below (the same behavior
 472 * as the key in a regular btree) and another indexing the highest key
 473 * available in the block(s) below.  Because records are /not/ sorted by the
 474 * highest key, all leaf block updates require us to compute the highest key
 475 * that matches any record in the leaf and to recursively update the high keys
 476 * in the nodes going further up in the tree, if necessary.  Nodes look like
 477 * this:
 478 *
 479 *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
 480 * Non-Leaf:    | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
 481 *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
 482 *
 483 * To perform an interval query on an overlapped tree, perform the usual
 484 * depth-first search and use the low and high keys to decide if we can skip
 485 * that particular node.  If a leaf node is reached, return the records that
 486 * intersect the interval.  Note that an interval query may return numerous
 487 * entries.  For a non-overlapped tree, simply search for the record associated
 488 * with the lowest key and iterate forward until a non-matching record is
 489 * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
 490 * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
 491 * more detail.
 492 *
 493 * Why do we care about overlapping intervals?  Let's say you have a bunch of
 494 * reverse mapping records on a reflink filesystem:
 495 *
 496 * 1: +- file A startblock B offset C length D -----------+
 497 * 2:      +- file E startblock F offset G length H --------------+
 498 * 3:      +- file I startblock F offset J length K --+
 499 * 4:                                                        +- file L... --+
 500 *
 501 * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
 502 * we'd simply increment the length of record 1.  But how do we find the record
 503 * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
 504 * record 3 because the keys are ordered first by startblock.  An interval
 505 * query would return records 1 and 2 because they both overlap (B+D-1), and
 506 * from that we can pick out record 1 as the appropriate left neighbor.
 507 *
 508 * In the non-overlapped case you can do a LE lookup and decrement the cursor
 509 * because a record's interval must end before the next record.
 510 */
 511
 512/*
 513 * Return size of the btree block header for this btree instance.
 514 */
 515static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
 516{
 517        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 518                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 519                        return XFS_BTREE_LBLOCK_CRC_LEN;
 520                return XFS_BTREE_LBLOCK_LEN;
 521        }
 522        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 523                return XFS_BTREE_SBLOCK_CRC_LEN;
 524        return XFS_BTREE_SBLOCK_LEN;
 525}
 526
 527/*
 528 * Return size of btree block pointers for this btree instance.
 529 */
 530static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
 531{
 532        return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
 533                sizeof(__be64) : sizeof(__be32);
 534}
 535
 536/*
 537 * Calculate offset of the n-th record in a btree block.
 538 */
 539STATIC size_t
 540xfs_btree_rec_offset(
 541        struct xfs_btree_cur    *cur,
 542        int                     n)
 543{
 544        return xfs_btree_block_len(cur) +
 545                (n - 1) * cur->bc_ops->rec_len;
 546}
 547
 548/*
 549 * Calculate offset of the n-th key in a btree block.
 550 */
 551STATIC size_t
 552xfs_btree_key_offset(
 553        struct xfs_btree_cur    *cur,
 554        int                     n)
 555{
 556        return xfs_btree_block_len(cur) +
 557                (n - 1) * cur->bc_ops->key_len;
 558}
 559
 560/*
 561 * Calculate offset of the n-th high key in a btree block.
 562 */
 563STATIC size_t
 564xfs_btree_high_key_offset(
 565        struct xfs_btree_cur    *cur,
 566        int                     n)
 567{
 568        return xfs_btree_block_len(cur) +
 569                (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
 570}
 571
 572/*
 573 * Calculate offset of the n-th block pointer in a btree block.
 574 */
 575STATIC size_t
 576xfs_btree_ptr_offset(
 577        struct xfs_btree_cur    *cur,
 578        int                     n,
 579        int                     level)
 580{
 581        return xfs_btree_block_len(cur) +
 582                cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
 583                (n - 1) * xfs_btree_ptr_len(cur);
 584}
 585
 586/*
 587 * Return a pointer to the n-th record in the btree block.
 588 */
 589union xfs_btree_rec *
 590xfs_btree_rec_addr(
 591        struct xfs_btree_cur    *cur,
 592        int                     n,
 593        struct xfs_btree_block  *block)
 594{
 595        return (union xfs_btree_rec *)
 596                ((char *)block + xfs_btree_rec_offset(cur, n));
 597}
 598
 599/*
 600 * Return a pointer to the n-th key in the btree block.
 601 */
 602union xfs_btree_key *
 603xfs_btree_key_addr(
 604        struct xfs_btree_cur    *cur,
 605        int                     n,
 606        struct xfs_btree_block  *block)
 607{
 608        return (union xfs_btree_key *)
 609                ((char *)block + xfs_btree_key_offset(cur, n));
 610}
 611
 612/*
 613 * Return a pointer to the n-th high key in the btree block.
 614 */
 615union xfs_btree_key *
 616xfs_btree_high_key_addr(
 617        struct xfs_btree_cur    *cur,
 618        int                     n,
 619        struct xfs_btree_block  *block)
 620{
 621        return (union xfs_btree_key *)
 622                ((char *)block + xfs_btree_high_key_offset(cur, n));
 623}
 624
 625/*
 626 * Return a pointer to the n-th block pointer in the btree block.
 627 */
 628union xfs_btree_ptr *
 629xfs_btree_ptr_addr(
 630        struct xfs_btree_cur    *cur,
 631        int                     n,
 632        struct xfs_btree_block  *block)
 633{
 634        int                     level = xfs_btree_get_level(block);
 635
 636        ASSERT(block->bb_level != 0);
 637
 638        return (union xfs_btree_ptr *)
 639                ((char *)block + xfs_btree_ptr_offset(cur, n, level));
 640}
 641
 642struct xfs_ifork *
 643xfs_btree_ifork_ptr(
 644        struct xfs_btree_cur    *cur)
 645{
 646        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
 647
 648        if (cur->bc_flags & XFS_BTREE_STAGING)
 649                return cur->bc_ino.ifake->if_fork;
 650        return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork);
 651}
 652
 653/*
 654 * Get the root block which is stored in the inode.
 655 *
 656 * For now this btree implementation assumes the btree root is always
 657 * stored in the if_broot field of an inode fork.
 658 */
 659STATIC struct xfs_btree_block *
 660xfs_btree_get_iroot(
 661        struct xfs_btree_cur    *cur)
 662{
 663        struct xfs_ifork        *ifp = xfs_btree_ifork_ptr(cur);
 664
 665        return (struct xfs_btree_block *)ifp->if_broot;
 666}
 667
 668/*
 669 * Retrieve the block pointer from the cursor at the given level.
 670 * This may be an inode btree root or from a buffer.
 671 */
 672struct xfs_btree_block *                /* generic btree block pointer */
 673xfs_btree_get_block(
 674        struct xfs_btree_cur    *cur,   /* btree cursor */
 675        int                     level,  /* level in btree */
 676        struct xfs_buf          **bpp)  /* buffer containing the block */
 677{
 678        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 679            (level == cur->bc_nlevels - 1)) {
 680                *bpp = NULL;
 681                return xfs_btree_get_iroot(cur);
 682        }
 683
 684        *bpp = cur->bc_levels[level].bp;
 685        return XFS_BUF_TO_BLOCK(*bpp);
 686}
 687
 688/*
 689 * Change the cursor to point to the first record at the given level.
 690 * Other levels are unaffected.
 691 */
 692STATIC int                              /* success=1, failure=0 */
 693xfs_btree_firstrec(
 694        struct xfs_btree_cur    *cur,   /* btree cursor */
 695        int                     level)  /* level to change */
 696{
 697        struct xfs_btree_block  *block; /* generic btree block pointer */
 698        struct xfs_buf          *bp;    /* buffer containing block */
 699
 700        /*
 701         * Get the block pointer for this level.
 702         */
 703        block = xfs_btree_get_block(cur, level, &bp);
 704        if (xfs_btree_check_block(cur, block, level, bp))
 705                return 0;
 706        /*
 707         * It's empty, there is no such record.
 708         */
 709        if (!block->bb_numrecs)
 710                return 0;
 711        /*
 712         * Set the ptr value to 1, that's the first record/key.
 713         */
 714        cur->bc_levels[level].ptr = 1;
 715        return 1;
 716}
 717
 718/*
 719 * Change the cursor to point to the last record in the current block
 720 * at the given level.  Other levels are unaffected.
 721 */
 722STATIC int                              /* success=1, failure=0 */
 723xfs_btree_lastrec(
 724        struct xfs_btree_cur    *cur,   /* btree cursor */
 725        int                     level)  /* level to change */
 726{
 727        struct xfs_btree_block  *block; /* generic btree block pointer */
 728        struct xfs_buf          *bp;    /* buffer containing block */
 729
 730        /*
 731         * Get the block pointer for this level.
 732         */
 733        block = xfs_btree_get_block(cur, level, &bp);
 734        if (xfs_btree_check_block(cur, block, level, bp))
 735                return 0;
 736        /*
 737         * It's empty, there is no such record.
 738         */
 739        if (!block->bb_numrecs)
 740                return 0;
 741        /*
 742         * Set the ptr value to numrecs, that's the last record/key.
 743         */
 744        cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
 745        return 1;
 746}
 747
 748/*
 749 * Compute first and last byte offsets for the fields given.
 750 * Interprets the offsets table, which contains struct field offsets.
 751 */
 752void
 753xfs_btree_offsets(
 754        int64_t         fields,         /* bitmask of fields */
 755        const short     *offsets,       /* table of field offsets */
 756        int             nbits,          /* number of bits to inspect */
 757        int             *first,         /* output: first byte offset */
 758        int             *last)          /* output: last byte offset */
 759{
 760        int             i;              /* current bit number */
 761        int64_t         imask;          /* mask for current bit number */
 762
 763        ASSERT(fields != 0);
 764        /*
 765         * Find the lowest bit, so the first byte offset.
 766         */
 767        for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
 768                if (imask & fields) {
 769                        *first = offsets[i];
 770                        break;
 771                }
 772        }
 773        /*
 774         * Find the highest bit, so the last byte offset.
 775         */
 776        for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
 777                if (imask & fields) {
 778                        *last = offsets[i + 1] - 1;
 779                        break;
 780                }
 781        }
 782}
 783
 784/*
 785 * Get a buffer for the block, return it read in.
 786 * Long-form addressing.
 787 */
 788int
 789xfs_btree_read_bufl(
 790        struct xfs_mount        *mp,            /* file system mount point */
 791        struct xfs_trans        *tp,            /* transaction pointer */
 792        xfs_fsblock_t           fsbno,          /* file system block number */
 793        struct xfs_buf          **bpp,          /* buffer for fsbno */
 794        int                     refval,         /* ref count value for buffer */
 795        const struct xfs_buf_ops *ops)
 796{
 797        struct xfs_buf          *bp;            /* return value */
 798        xfs_daddr_t             d;              /* real disk block address */
 799        int                     error;
 800
 801        if (!xfs_verify_fsbno(mp, fsbno))
 802                return -EFSCORRUPTED;
 803        d = XFS_FSB_TO_DADDR(mp, fsbno);
 804        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
 805                                   mp->m_bsize, 0, &bp, ops);
 806        if (error)
 807                return error;
 808        if (bp)
 809                xfs_buf_set_ref(bp, refval);
 810        *bpp = bp;
 811        return 0;
 812}
 813
 814/*
 815 * Read-ahead the block, don't wait for it, don't return a buffer.
 816 * Long-form addressing.
 817 */
 818/* ARGSUSED */
 819void
 820xfs_btree_reada_bufl(
 821        struct xfs_mount        *mp,            /* file system mount point */
 822        xfs_fsblock_t           fsbno,          /* file system block number */
 823        xfs_extlen_t            count,          /* count of filesystem blocks */
 824        const struct xfs_buf_ops *ops)
 825{
 826        xfs_daddr_t             d;
 827
 828        ASSERT(fsbno != NULLFSBLOCK);
 829        d = XFS_FSB_TO_DADDR(mp, fsbno);
 830        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 831}
 832
 833/*
 834 * Read-ahead the block, don't wait for it, don't return a buffer.
 835 * Short-form addressing.
 836 */
 837/* ARGSUSED */
 838void
 839xfs_btree_reada_bufs(
 840        struct xfs_mount        *mp,            /* file system mount point */
 841        xfs_agnumber_t          agno,           /* allocation group number */
 842        xfs_agblock_t           agbno,          /* allocation group block number */
 843        xfs_extlen_t            count,          /* count of filesystem blocks */
 844        const struct xfs_buf_ops *ops)
 845{
 846        xfs_daddr_t             d;
 847
 848        ASSERT(agno != NULLAGNUMBER);
 849        ASSERT(agbno != NULLAGBLOCK);
 850        d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 851        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 852}
 853
 854STATIC int
 855xfs_btree_readahead_lblock(
 856        struct xfs_btree_cur    *cur,
 857        int                     lr,
 858        struct xfs_btree_block  *block)
 859{
 860        int                     rval = 0;
 861        xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
 862        xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 863
 864        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
 865                xfs_btree_reada_bufl(cur->bc_mp, left, 1,
 866                                     cur->bc_ops->buf_ops);
 867                rval++;
 868        }
 869
 870        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
 871                xfs_btree_reada_bufl(cur->bc_mp, right, 1,
 872                                     cur->bc_ops->buf_ops);
 873                rval++;
 874        }
 875
 876        return rval;
 877}
 878
 879STATIC int
 880xfs_btree_readahead_sblock(
 881        struct xfs_btree_cur    *cur,
 882        int                     lr,
 883        struct xfs_btree_block *block)
 884{
 885        int                     rval = 0;
 886        xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
 887        xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
 888
 889
 890        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
 891                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
 892                                     left, 1, cur->bc_ops->buf_ops);
 893                rval++;
 894        }
 895
 896        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
 897                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
 898                                     right, 1, cur->bc_ops->buf_ops);
 899                rval++;
 900        }
 901
 902        return rval;
 903}
 904
 905/*
 906 * Read-ahead btree blocks, at the given level.
 907 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 908 */
 909STATIC int
 910xfs_btree_readahead(
 911        struct xfs_btree_cur    *cur,           /* btree cursor */
 912        int                     lev,            /* level in btree */
 913        int                     lr)             /* left/right bits */
 914{
 915        struct xfs_btree_block  *block;
 916
 917        /*
 918         * No readahead needed if we are at the root level and the
 919         * btree root is stored in the inode.
 920         */
 921        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 922            (lev == cur->bc_nlevels - 1))
 923                return 0;
 924
 925        if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
 926                return 0;
 927
 928        cur->bc_levels[lev].ra |= lr;
 929        block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
 930
 931        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 932                return xfs_btree_readahead_lblock(cur, lr, block);
 933        return xfs_btree_readahead_sblock(cur, lr, block);
 934}
 935
 936STATIC int
 937xfs_btree_ptr_to_daddr(
 938        struct xfs_btree_cur            *cur,
 939        const union xfs_btree_ptr       *ptr,
 940        xfs_daddr_t                     *daddr)
 941{
 942        xfs_fsblock_t           fsbno;
 943        xfs_agblock_t           agbno;
 944        int                     error;
 945
 946        error = xfs_btree_check_ptr(cur, ptr, 0, 1);
 947        if (error)
 948                return error;
 949
 950        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 951                fsbno = be64_to_cpu(ptr->l);
 952                *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
 953        } else {
 954                agbno = be32_to_cpu(ptr->s);
 955                *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
 956                                agbno);
 957        }
 958
 959        return 0;
 960}
 961
 962/*
 963 * Readahead @count btree blocks at the given @ptr location.
 964 *
 965 * We don't need to care about long or short form btrees here as we have a
 966 * method of converting the ptr directly to a daddr available to us.
 967 */
 968STATIC void
 969xfs_btree_readahead_ptr(
 970        struct xfs_btree_cur    *cur,
 971        union xfs_btree_ptr     *ptr,
 972        xfs_extlen_t            count)
 973{
 974        xfs_daddr_t             daddr;
 975
 976        if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
 977                return;
 978        xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
 979                          cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
 980}
 981
 982/*
 983 * Set the buffer for level "lev" in the cursor to bp, releasing
 984 * any previous buffer.
 985 */
 986STATIC void
 987xfs_btree_setbuf(
 988        struct xfs_btree_cur    *cur,   /* btree cursor */
 989        int                     lev,    /* level in btree */
 990        struct xfs_buf          *bp)    /* new buffer to set */
 991{
 992        struct xfs_btree_block  *b;     /* btree block */
 993
 994        if (cur->bc_levels[lev].bp)
 995                xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
 996        cur->bc_levels[lev].bp = bp;
 997        cur->bc_levels[lev].ra = 0;
 998
 999        b = XFS_BUF_TO_BLOCK(bp);
1000        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1001                if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1002                        cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
1003                if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1004                        cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
1005        } else {
1006                if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1007                        cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
1008                if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1009                        cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
1010        }
1011}
1012
1013bool
1014xfs_btree_ptr_is_null(
1015        struct xfs_btree_cur            *cur,
1016        const union xfs_btree_ptr       *ptr)
1017{
1018        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1019                return ptr->l == cpu_to_be64(NULLFSBLOCK);
1020        else
1021                return ptr->s == cpu_to_be32(NULLAGBLOCK);
1022}
1023
1024void
1025xfs_btree_set_ptr_null(
1026        struct xfs_btree_cur    *cur,
1027        union xfs_btree_ptr     *ptr)
1028{
1029        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1030                ptr->l = cpu_to_be64(NULLFSBLOCK);
1031        else
1032                ptr->s = cpu_to_be32(NULLAGBLOCK);
1033}
1034
1035/*
1036 * Get/set/init sibling pointers
1037 */
1038void
1039xfs_btree_get_sibling(
1040        struct xfs_btree_cur    *cur,
1041        struct xfs_btree_block  *block,
1042        union xfs_btree_ptr     *ptr,
1043        int                     lr)
1044{
1045        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1046
1047        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1048                if (lr == XFS_BB_RIGHTSIB)
1049                        ptr->l = block->bb_u.l.bb_rightsib;
1050                else
1051                        ptr->l = block->bb_u.l.bb_leftsib;
1052        } else {
1053                if (lr == XFS_BB_RIGHTSIB)
1054                        ptr->s = block->bb_u.s.bb_rightsib;
1055                else
1056                        ptr->s = block->bb_u.s.bb_leftsib;
1057        }
1058}
1059
1060void
1061xfs_btree_set_sibling(
1062        struct xfs_btree_cur            *cur,
1063        struct xfs_btree_block          *block,
1064        const union xfs_btree_ptr       *ptr,
1065        int                             lr)
1066{
1067        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1068
1069        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1070                if (lr == XFS_BB_RIGHTSIB)
1071                        block->bb_u.l.bb_rightsib = ptr->l;
1072                else
1073                        block->bb_u.l.bb_leftsib = ptr->l;
1074        } else {
1075                if (lr == XFS_BB_RIGHTSIB)
1076                        block->bb_u.s.bb_rightsib = ptr->s;
1077                else
1078                        block->bb_u.s.bb_leftsib = ptr->s;
1079        }
1080}
1081
1082void
1083xfs_btree_init_block_int(
1084        struct xfs_mount        *mp,
1085        struct xfs_btree_block  *buf,
1086        xfs_daddr_t             blkno,
1087        xfs_btnum_t             btnum,
1088        __u16                   level,
1089        __u16                   numrecs,
1090        __u64                   owner,
1091        unsigned int            flags)
1092{
1093        int                     crc = xfs_has_crc(mp);
1094        __u32                   magic = xfs_btree_magic(crc, btnum);
1095
1096        buf->bb_magic = cpu_to_be32(magic);
1097        buf->bb_level = cpu_to_be16(level);
1098        buf->bb_numrecs = cpu_to_be16(numrecs);
1099
1100        if (flags & XFS_BTREE_LONG_PTRS) {
1101                buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1102                buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1103                if (crc) {
1104                        buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1105                        buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1106                        uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1107                        buf->bb_u.l.bb_pad = 0;
1108                        buf->bb_u.l.bb_lsn = 0;
1109                }
1110        } else {
1111                /* owner is a 32 bit value on short blocks */
1112                __u32 __owner = (__u32)owner;
1113
1114                buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1115                buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1116                if (crc) {
1117                        buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1118                        buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1119                        uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1120                        buf->bb_u.s.bb_lsn = 0;
1121                }
1122        }
1123}
1124
1125void
1126xfs_btree_init_block(
1127        struct xfs_mount *mp,
1128        struct xfs_buf  *bp,
1129        xfs_btnum_t     btnum,
1130        __u16           level,
1131        __u16           numrecs,
1132        __u64           owner)
1133{
1134        xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp),
1135                                 btnum, level, numrecs, owner, 0);
1136}
1137
1138void
1139xfs_btree_init_block_cur(
1140        struct xfs_btree_cur    *cur,
1141        struct xfs_buf          *bp,
1142        int                     level,
1143        int                     numrecs)
1144{
1145        __u64                   owner;
1146
1147        /*
1148         * we can pull the owner from the cursor right now as the different
1149         * owners align directly with the pointer size of the btree. This may
1150         * change in future, but is safe for current users of the generic btree
1151         * code.
1152         */
1153        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1154                owner = cur->bc_ino.ip->i_ino;
1155        else
1156                owner = cur->bc_ag.pag->pag_agno;
1157
1158        xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp),
1159                                xfs_buf_daddr(bp), cur->bc_btnum, level,
1160                                numrecs, owner, cur->bc_flags);
1161}
1162
1163/*
1164 * Return true if ptr is the last record in the btree and
1165 * we need to track updates to this record.  The decision
1166 * will be further refined in the update_lastrec method.
1167 */
1168STATIC int
1169xfs_btree_is_lastrec(
1170        struct xfs_btree_cur    *cur,
1171        struct xfs_btree_block  *block,
1172        int                     level)
1173{
1174        union xfs_btree_ptr     ptr;
1175
1176        if (level > 0)
1177                return 0;
1178        if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1179                return 0;
1180
1181        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1182        if (!xfs_btree_ptr_is_null(cur, &ptr))
1183                return 0;
1184        return 1;
1185}
1186
1187STATIC void
1188xfs_btree_buf_to_ptr(
1189        struct xfs_btree_cur    *cur,
1190        struct xfs_buf          *bp,
1191        union xfs_btree_ptr     *ptr)
1192{
1193        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1194                ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1195                                        xfs_buf_daddr(bp)));
1196        else {
1197                ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1198                                        xfs_buf_daddr(bp)));
1199        }
1200}
1201
1202STATIC void
1203xfs_btree_set_refs(
1204        struct xfs_btree_cur    *cur,
1205        struct xfs_buf          *bp)
1206{
1207        switch (cur->bc_btnum) {
1208        case XFS_BTNUM_BNO:
1209        case XFS_BTNUM_CNT:
1210                xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1211                break;
1212        case XFS_BTNUM_INO:
1213        case XFS_BTNUM_FINO:
1214                xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1215                break;
1216        case XFS_BTNUM_BMAP:
1217                xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1218                break;
1219        case XFS_BTNUM_RMAP:
1220                xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
1221                break;
1222        case XFS_BTNUM_REFC:
1223                xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
1224                break;
1225        default:
1226                ASSERT(0);
1227        }
1228}
1229
1230int
1231xfs_btree_get_buf_block(
1232        struct xfs_btree_cur            *cur,
1233        const union xfs_btree_ptr       *ptr,
1234        struct xfs_btree_block          **block,
1235        struct xfs_buf                  **bpp)
1236{
1237        struct xfs_mount        *mp = cur->bc_mp;
1238        xfs_daddr_t             d;
1239        int                     error;
1240
1241        error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1242        if (error)
1243                return error;
1244        error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
1245                        0, bpp);
1246        if (error)
1247                return error;
1248
1249        (*bpp)->b_ops = cur->bc_ops->buf_ops;
1250        *block = XFS_BUF_TO_BLOCK(*bpp);
1251        return 0;
1252}
1253
1254/*
1255 * Read in the buffer at the given ptr and return the buffer and
1256 * the block pointer within the buffer.
1257 */
1258STATIC int
1259xfs_btree_read_buf_block(
1260        struct xfs_btree_cur            *cur,
1261        const union xfs_btree_ptr       *ptr,
1262        int                             flags,
1263        struct xfs_btree_block          **block,
1264        struct xfs_buf                  **bpp)
1265{
1266        struct xfs_mount        *mp = cur->bc_mp;
1267        xfs_daddr_t             d;
1268        int                     error;
1269
1270        /* need to sort out how callers deal with failures first */
1271        ASSERT(!(flags & XBF_TRYLOCK));
1272
1273        error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1274        if (error)
1275                return error;
1276        error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1277                                   mp->m_bsize, flags, bpp,
1278                                   cur->bc_ops->buf_ops);
1279        if (error)
1280                return error;
1281
1282        xfs_btree_set_refs(cur, *bpp);
1283        *block = XFS_BUF_TO_BLOCK(*bpp);
1284        return 0;
1285}
1286
1287/*
1288 * Copy keys from one btree block to another.
1289 */
1290void
1291xfs_btree_copy_keys(
1292        struct xfs_btree_cur            *cur,
1293        union xfs_btree_key             *dst_key,
1294        const union xfs_btree_key       *src_key,
1295        int                             numkeys)
1296{
1297        ASSERT(numkeys >= 0);
1298        memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1299}
1300
1301/*
1302 * Copy records from one btree block to another.
1303 */
1304STATIC void
1305xfs_btree_copy_recs(
1306        struct xfs_btree_cur    *cur,
1307        union xfs_btree_rec     *dst_rec,
1308        union xfs_btree_rec     *src_rec,
1309        int                     numrecs)
1310{
1311        ASSERT(numrecs >= 0);
1312        memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1313}
1314
1315/*
1316 * Copy block pointers from one btree block to another.
1317 */
1318void
1319xfs_btree_copy_ptrs(
1320        struct xfs_btree_cur    *cur,
1321        union xfs_btree_ptr     *dst_ptr,
1322        const union xfs_btree_ptr *src_ptr,
1323        int                     numptrs)
1324{
1325        ASSERT(numptrs >= 0);
1326        memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1327}
1328
1329/*
1330 * Shift keys one index left/right inside a single btree block.
1331 */
1332STATIC void
1333xfs_btree_shift_keys(
1334        struct xfs_btree_cur    *cur,
1335        union xfs_btree_key     *key,
1336        int                     dir,
1337        int                     numkeys)
1338{
1339        char                    *dst_key;
1340
1341        ASSERT(numkeys >= 0);
1342        ASSERT(dir == 1 || dir == -1);
1343
1344        dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1345        memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1346}
1347
1348/*
1349 * Shift records one index left/right inside a single btree block.
1350 */
1351STATIC void
1352xfs_btree_shift_recs(
1353        struct xfs_btree_cur    *cur,
1354        union xfs_btree_rec     *rec,
1355        int                     dir,
1356        int                     numrecs)
1357{
1358        char                    *dst_rec;
1359
1360        ASSERT(numrecs >= 0);
1361        ASSERT(dir == 1 || dir == -1);
1362
1363        dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1364        memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1365}
1366
1367/*
1368 * Shift block pointers one index left/right inside a single btree block.
1369 */
1370STATIC void
1371xfs_btree_shift_ptrs(
1372        struct xfs_btree_cur    *cur,
1373        union xfs_btree_ptr     *ptr,
1374        int                     dir,
1375        int                     numptrs)
1376{
1377        char                    *dst_ptr;
1378
1379        ASSERT(numptrs >= 0);
1380        ASSERT(dir == 1 || dir == -1);
1381
1382        dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1383        memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1384}
1385
1386/*
1387 * Log key values from the btree block.
1388 */
1389STATIC void
1390xfs_btree_log_keys(
1391        struct xfs_btree_cur    *cur,
1392        struct xfs_buf          *bp,
1393        int                     first,
1394        int                     last)
1395{
1396
1397        if (bp) {
1398                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1399                xfs_trans_log_buf(cur->bc_tp, bp,
1400                                  xfs_btree_key_offset(cur, first),
1401                                  xfs_btree_key_offset(cur, last + 1) - 1);
1402        } else {
1403                xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1404                                xfs_ilog_fbroot(cur->bc_ino.whichfork));
1405        }
1406}
1407
1408/*
1409 * Log record values from the btree block.
1410 */
1411void
1412xfs_btree_log_recs(
1413        struct xfs_btree_cur    *cur,
1414        struct xfs_buf          *bp,
1415        int                     first,
1416        int                     last)
1417{
1418
1419        xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1420        xfs_trans_log_buf(cur->bc_tp, bp,
1421                          xfs_btree_rec_offset(cur, first),
1422                          xfs_btree_rec_offset(cur, last + 1) - 1);
1423
1424}
1425
1426/*
1427 * Log block pointer fields from a btree block (nonleaf).
1428 */
1429STATIC void
1430xfs_btree_log_ptrs(
1431        struct xfs_btree_cur    *cur,   /* btree cursor */
1432        struct xfs_buf          *bp,    /* buffer containing btree block */
1433        int                     first,  /* index of first pointer to log */
1434        int                     last)   /* index of last pointer to log */
1435{
1436
1437        if (bp) {
1438                struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1439                int                     level = xfs_btree_get_level(block);
1440
1441                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1442                xfs_trans_log_buf(cur->bc_tp, bp,
1443                                xfs_btree_ptr_offset(cur, first, level),
1444                                xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1445        } else {
1446                xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1447                        xfs_ilog_fbroot(cur->bc_ino.whichfork));
1448        }
1449
1450}
1451
1452/*
1453 * Log fields from a btree block header.
1454 */
1455void
1456xfs_btree_log_block(
1457        struct xfs_btree_cur    *cur,   /* btree cursor */
1458        struct xfs_buf          *bp,    /* buffer containing btree block */
1459        int                     fields) /* mask of fields: XFS_BB_... */
1460{
1461        int                     first;  /* first byte offset logged */
1462        int                     last;   /* last byte offset logged */
1463        static const short      soffsets[] = {  /* table of offsets (short) */
1464                offsetof(struct xfs_btree_block, bb_magic),
1465                offsetof(struct xfs_btree_block, bb_level),
1466                offsetof(struct xfs_btree_block, bb_numrecs),
1467                offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1468                offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1469                offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1470                offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1471                offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1472                offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1473                offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1474                XFS_BTREE_SBLOCK_CRC_LEN
1475        };
1476        static const short      loffsets[] = {  /* table of offsets (long) */
1477                offsetof(struct xfs_btree_block, bb_magic),
1478                offsetof(struct xfs_btree_block, bb_level),
1479                offsetof(struct xfs_btree_block, bb_numrecs),
1480                offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1481                offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1482                offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1483                offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1484                offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1485                offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1486                offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1487                offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1488                XFS_BTREE_LBLOCK_CRC_LEN
1489        };
1490
1491        if (bp) {
1492                int nbits;
1493
1494                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1495                        /*
1496                         * We don't log the CRC when updating a btree
1497                         * block but instead recreate it during log
1498                         * recovery.  As the log buffers have checksums
1499                         * of their own this is safe and avoids logging a crc
1500                         * update in a lot of places.
1501                         */
1502                        if (fields == XFS_BB_ALL_BITS)
1503                                fields = XFS_BB_ALL_BITS_CRC;
1504                        nbits = XFS_BB_NUM_BITS_CRC;
1505                } else {
1506                        nbits = XFS_BB_NUM_BITS;
1507                }
1508                xfs_btree_offsets(fields,
1509                                  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1510                                        loffsets : soffsets,
1511                                  nbits, &first, &last);
1512                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1513                xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1514        } else {
1515                xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1516                        xfs_ilog_fbroot(cur->bc_ino.whichfork));
1517        }
1518}
1519
1520/*
1521 * Increment cursor by one record at the level.
1522 * For nonzero levels the leaf-ward information is untouched.
1523 */
1524int                                             /* error */
1525xfs_btree_increment(
1526        struct xfs_btree_cur    *cur,
1527        int                     level,
1528        int                     *stat)          /* success/failure */
1529{
1530        struct xfs_btree_block  *block;
1531        union xfs_btree_ptr     ptr;
1532        struct xfs_buf          *bp;
1533        int                     error;          /* error return value */
1534        int                     lev;
1535
1536        ASSERT(level < cur->bc_nlevels);
1537
1538        /* Read-ahead to the right at this level. */
1539        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1540
1541        /* Get a pointer to the btree block. */
1542        block = xfs_btree_get_block(cur, level, &bp);
1543
1544#ifdef DEBUG
1545        error = xfs_btree_check_block(cur, block, level, bp);
1546        if (error)
1547                goto error0;
1548#endif
1549
1550        /* We're done if we remain in the block after the increment. */
1551        if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block))
1552                goto out1;
1553
1554        /* Fail if we just went off the right edge of the tree. */
1555        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1556        if (xfs_btree_ptr_is_null(cur, &ptr))
1557                goto out0;
1558
1559        XFS_BTREE_STATS_INC(cur, increment);
1560
1561        /*
1562         * March up the tree incrementing pointers.
1563         * Stop when we don't go off the right edge of a block.
1564         */
1565        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1566                block = xfs_btree_get_block(cur, lev, &bp);
1567
1568#ifdef DEBUG
1569                error = xfs_btree_check_block(cur, block, lev, bp);
1570                if (error)
1571                        goto error0;
1572#endif
1573
1574                if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block))
1575                        break;
1576
1577                /* Read-ahead the right block for the next loop. */
1578                xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1579        }
1580
1581        /*
1582         * If we went off the root then we are either seriously
1583         * confused or have the tree root in an inode.
1584         */
1585        if (lev == cur->bc_nlevels) {
1586                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1587                        goto out0;
1588                ASSERT(0);
1589                error = -EFSCORRUPTED;
1590                goto error0;
1591        }
1592        ASSERT(lev < cur->bc_nlevels);
1593
1594        /*
1595         * Now walk back down the tree, fixing up the cursor's buffer
1596         * pointers and key numbers.
1597         */
1598        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1599                union xfs_btree_ptr     *ptrp;
1600
1601                ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
1602                --lev;
1603                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1604                if (error)
1605                        goto error0;
1606
1607                xfs_btree_setbuf(cur, lev, bp);
1608                cur->bc_levels[lev].ptr = 1;
1609        }
1610out1:
1611        *stat = 1;
1612        return 0;
1613
1614out0:
1615        *stat = 0;
1616        return 0;
1617
1618error0:
1619        return error;
1620}
1621
1622/*
1623 * Decrement cursor by one record at the level.
1624 * For nonzero levels the leaf-ward information is untouched.
1625 */
1626int                                             /* error */
1627xfs_btree_decrement(
1628        struct xfs_btree_cur    *cur,
1629        int                     level,
1630        int                     *stat)          /* success/failure */
1631{
1632        struct xfs_btree_block  *block;
1633        struct xfs_buf          *bp;
1634        int                     error;          /* error return value */
1635        int                     lev;
1636        union xfs_btree_ptr     ptr;
1637
1638        ASSERT(level < cur->bc_nlevels);
1639
1640        /* Read-ahead to the left at this level. */
1641        xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1642
1643        /* We're done if we remain in the block after the decrement. */
1644        if (--cur->bc_levels[level].ptr > 0)
1645                goto out1;
1646
1647        /* Get a pointer to the btree block. */
1648        block = xfs_btree_get_block(cur, level, &bp);
1649
1650#ifdef DEBUG
1651        error = xfs_btree_check_block(cur, block, level, bp);
1652        if (error)
1653                goto error0;
1654#endif
1655
1656        /* Fail if we just went off the left edge of the tree. */
1657        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1658        if (xfs_btree_ptr_is_null(cur, &ptr))
1659                goto out0;
1660
1661        XFS_BTREE_STATS_INC(cur, decrement);
1662
1663        /*
1664         * March up the tree decrementing pointers.
1665         * Stop when we don't go off the left edge of a block.
1666         */
1667        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1668                if (--cur->bc_levels[lev].ptr > 0)
1669                        break;
1670                /* Read-ahead the left block for the next loop. */
1671                xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1672        }
1673
1674        /*
1675         * If we went off the root then we are seriously confused.
1676         * or the root of the tree is in an inode.
1677         */
1678        if (lev == cur->bc_nlevels) {
1679                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1680                        goto out0;
1681                ASSERT(0);
1682                error = -EFSCORRUPTED;
1683                goto error0;
1684        }
1685        ASSERT(lev < cur->bc_nlevels);
1686
1687        /*
1688         * Now walk back down the tree, fixing up the cursor's buffer
1689         * pointers and key numbers.
1690         */
1691        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1692                union xfs_btree_ptr     *ptrp;
1693
1694                ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
1695                --lev;
1696                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1697                if (error)
1698                        goto error0;
1699                xfs_btree_setbuf(cur, lev, bp);
1700                cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block);
1701        }
1702out1:
1703        *stat = 1;
1704        return 0;
1705
1706out0:
1707        *stat = 0;
1708        return 0;
1709
1710error0:
1711        return error;
1712}
1713
1714int
1715xfs_btree_lookup_get_block(
1716        struct xfs_btree_cur            *cur,   /* btree cursor */
1717        int                             level,  /* level in the btree */
1718        const union xfs_btree_ptr       *pp,    /* ptr to btree block */
1719        struct xfs_btree_block          **blkp) /* return btree block */
1720{
1721        struct xfs_buf          *bp;    /* buffer pointer for btree block */
1722        xfs_daddr_t             daddr;
1723        int                     error = 0;
1724
1725        /* special case the root block if in an inode */
1726        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1727            (level == cur->bc_nlevels - 1)) {
1728                *blkp = xfs_btree_get_iroot(cur);
1729                return 0;
1730        }
1731
1732        /*
1733         * If the old buffer at this level for the disk address we are
1734         * looking for re-use it.
1735         *
1736         * Otherwise throw it away and get a new one.
1737         */
1738        bp = cur->bc_levels[level].bp;
1739        error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
1740        if (error)
1741                return error;
1742        if (bp && xfs_buf_daddr(bp) == daddr) {
1743                *blkp = XFS_BUF_TO_BLOCK(bp);
1744                return 0;
1745        }
1746
1747        error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1748        if (error)
1749                return error;
1750
1751        /* Check the inode owner since the verifiers don't. */
1752        if (xfs_has_crc(cur->bc_mp) &&
1753            !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
1754            (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
1755            be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
1756                        cur->bc_ino.ip->i_ino)
1757                goto out_bad;
1758
1759        /* Did we get the level we were looking for? */
1760        if (be16_to_cpu((*blkp)->bb_level) != level)
1761                goto out_bad;
1762
1763        /* Check that internal nodes have at least one record. */
1764        if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
1765                goto out_bad;
1766
1767        xfs_btree_setbuf(cur, level, bp);
1768        return 0;
1769
1770out_bad:
1771        *blkp = NULL;
1772        xfs_buf_mark_corrupt(bp);
1773        xfs_trans_brelse(cur->bc_tp, bp);
1774        return -EFSCORRUPTED;
1775}
1776
1777/*
1778 * Get current search key.  For level 0 we don't actually have a key
1779 * structure so we make one up from the record.  For all other levels
1780 * we just return the right key.
1781 */
1782STATIC union xfs_btree_key *
1783xfs_lookup_get_search_key(
1784        struct xfs_btree_cur    *cur,
1785        int                     level,
1786        int                     keyno,
1787        struct xfs_btree_block  *block,
1788        union xfs_btree_key     *kp)
1789{
1790        if (level == 0) {
1791                cur->bc_ops->init_key_from_rec(kp,
1792                                xfs_btree_rec_addr(cur, keyno, block));
1793                return kp;
1794        }
1795
1796        return xfs_btree_key_addr(cur, keyno, block);
1797}
1798
1799/*
1800 * Lookup the record.  The cursor is made to point to it, based on dir.
1801 * stat is set to 0 if can't find any such record, 1 for success.
1802 */
1803int                                     /* error */
1804xfs_btree_lookup(
1805        struct xfs_btree_cur    *cur,   /* btree cursor */
1806        xfs_lookup_t            dir,    /* <=, ==, or >= */
1807        int                     *stat)  /* success/failure */
1808{
1809        struct xfs_btree_block  *block; /* current btree block */
1810        int64_t                 diff;   /* difference for the current key */
1811        int                     error;  /* error return value */
1812        int                     keyno;  /* current key number */
1813        int                     level;  /* level in the btree */
1814        union xfs_btree_ptr     *pp;    /* ptr to btree block */
1815        union xfs_btree_ptr     ptr;    /* ptr to btree block */
1816
1817        XFS_BTREE_STATS_INC(cur, lookup);
1818
1819        /* No such thing as a zero-level tree. */
1820        if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0))
1821                return -EFSCORRUPTED;
1822
1823        block = NULL;
1824        keyno = 0;
1825
1826        /* initialise start pointer from cursor */
1827        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1828        pp = &ptr;
1829
1830        /*
1831         * Iterate over each level in the btree, starting at the root.
1832         * For each level above the leaves, find the key we need, based
1833         * on the lookup record, then follow the corresponding block
1834         * pointer down to the next level.
1835         */
1836        for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1837                /* Get the block we need to do the lookup on. */
1838                error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1839                if (error)
1840                        goto error0;
1841
1842                if (diff == 0) {
1843                        /*
1844                         * If we already had a key match at a higher level, we
1845                         * know we need to use the first entry in this block.
1846                         */
1847                        keyno = 1;
1848                } else {
1849                        /* Otherwise search this block. Do a binary search. */
1850
1851                        int     high;   /* high entry number */
1852                        int     low;    /* low entry number */
1853
1854                        /* Set low and high entry numbers, 1-based. */
1855                        low = 1;
1856                        high = xfs_btree_get_numrecs(block);
1857                        if (!high) {
1858                                /* Block is empty, must be an empty leaf. */
1859                                if (level != 0 || cur->bc_nlevels != 1) {
1860                                        XFS_CORRUPTION_ERROR(__func__,
1861                                                        XFS_ERRLEVEL_LOW,
1862                                                        cur->bc_mp, block,
1863                                                        sizeof(*block));
1864                                        return -EFSCORRUPTED;
1865                                }
1866
1867                                cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE;
1868                                *stat = 0;
1869                                return 0;
1870                        }
1871
1872                        /* Binary search the block. */
1873                        while (low <= high) {
1874                                union xfs_btree_key     key;
1875                                union xfs_btree_key     *kp;
1876
1877                                XFS_BTREE_STATS_INC(cur, compare);
1878
1879                                /* keyno is average of low and high. */
1880                                keyno = (low + high) >> 1;
1881
1882                                /* Get current search key */
1883                                kp = xfs_lookup_get_search_key(cur, level,
1884                                                keyno, block, &key);
1885
1886                                /*
1887                                 * Compute difference to get next direction:
1888                                 *  - less than, move right
1889                                 *  - greater than, move left
1890                                 *  - equal, we're done
1891                                 */
1892                                diff = cur->bc_ops->key_diff(cur, kp);
1893                                if (diff < 0)
1894                                        low = keyno + 1;
1895                                else if (diff > 0)
1896                                        high = keyno - 1;
1897                                else
1898                                        break;
1899                        }
1900                }
1901
1902                /*
1903                 * If there are more levels, set up for the next level
1904                 * by getting the block number and filling in the cursor.
1905                 */
1906                if (level > 0) {
1907                        /*
1908                         * If we moved left, need the previous key number,
1909                         * unless there isn't one.
1910                         */
1911                        if (diff > 0 && --keyno < 1)
1912                                keyno = 1;
1913                        pp = xfs_btree_ptr_addr(cur, keyno, block);
1914
1915                        error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
1916                        if (error)
1917                                goto error0;
1918
1919                        cur->bc_levels[level].ptr = keyno;
1920                }
1921        }
1922
1923        /* Done with the search. See if we need to adjust the results. */
1924        if (dir != XFS_LOOKUP_LE && diff < 0) {
1925                keyno++;
1926                /*
1927                 * If ge search and we went off the end of the block, but it's
1928                 * not the last block, we're in the wrong block.
1929                 */
1930                xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1931                if (dir == XFS_LOOKUP_GE &&
1932                    keyno > xfs_btree_get_numrecs(block) &&
1933                    !xfs_btree_ptr_is_null(cur, &ptr)) {
1934                        int     i;
1935
1936                        cur->bc_levels[0].ptr = keyno;
1937                        error = xfs_btree_increment(cur, 0, &i);
1938                        if (error)
1939                                goto error0;
1940                        if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
1941                                return -EFSCORRUPTED;
1942                        *stat = 1;
1943                        return 0;
1944                }
1945        } else if (dir == XFS_LOOKUP_LE && diff > 0)
1946                keyno--;
1947        cur->bc_levels[0].ptr = keyno;
1948
1949        /* Return if we succeeded or not. */
1950        if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1951                *stat = 0;
1952        else if (dir != XFS_LOOKUP_EQ || diff == 0)
1953                *stat = 1;
1954        else
1955                *stat = 0;
1956        return 0;
1957
1958error0:
1959        return error;
1960}
1961
1962/* Find the high key storage area from a regular key. */
1963union xfs_btree_key *
1964xfs_btree_high_key_from_key(
1965        struct xfs_btree_cur    *cur,
1966        union xfs_btree_key     *key)
1967{
1968        ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
1969        return (union xfs_btree_key *)((char *)key +
1970                        (cur->bc_ops->key_len / 2));
1971}
1972
1973/* Determine the low (and high if overlapped) keys of a leaf block */
1974STATIC void
1975xfs_btree_get_leaf_keys(
1976        struct xfs_btree_cur    *cur,
1977        struct xfs_btree_block  *block,
1978        union xfs_btree_key     *key)
1979{
1980        union xfs_btree_key     max_hkey;
1981        union xfs_btree_key     hkey;
1982        union xfs_btree_rec     *rec;
1983        union xfs_btree_key     *high;
1984        int                     n;
1985
1986        rec = xfs_btree_rec_addr(cur, 1, block);
1987        cur->bc_ops->init_key_from_rec(key, rec);
1988
1989        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
1990
1991                cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
1992                for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
1993                        rec = xfs_btree_rec_addr(cur, n, block);
1994                        cur->bc_ops->init_high_key_from_rec(&hkey, rec);
1995                        if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
1996                                        > 0)
1997                                max_hkey = hkey;
1998                }
1999
2000                high = xfs_btree_high_key_from_key(cur, key);
2001                memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2002        }
2003}
2004
2005/* Determine the low (and high if overlapped) keys of a node block */
2006STATIC void
2007xfs_btree_get_node_keys(
2008        struct xfs_btree_cur    *cur,
2009        struct xfs_btree_block  *block,
2010        union xfs_btree_key     *key)
2011{
2012        union xfs_btree_key     *hkey;
2013        union xfs_btree_key     *max_hkey;
2014        union xfs_btree_key     *high;
2015        int                     n;
2016
2017        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2018                memcpy(key, xfs_btree_key_addr(cur, 1, block),
2019                                cur->bc_ops->key_len / 2);
2020
2021                max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2022                for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2023                        hkey = xfs_btree_high_key_addr(cur, n, block);
2024                        if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
2025                                max_hkey = hkey;
2026                }
2027
2028                high = xfs_btree_high_key_from_key(cur, key);
2029                memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2030        } else {
2031                memcpy(key, xfs_btree_key_addr(cur, 1, block),
2032                                cur->bc_ops->key_len);
2033        }
2034}
2035
2036/* Derive the keys for any btree block. */
2037void
2038xfs_btree_get_keys(
2039        struct xfs_btree_cur    *cur,
2040        struct xfs_btree_block  *block,
2041        union xfs_btree_key     *key)
2042{
2043        if (be16_to_cpu(block->bb_level) == 0)
2044                xfs_btree_get_leaf_keys(cur, block, key);
2045        else
2046                xfs_btree_get_node_keys(cur, block, key);
2047}
2048
2049/*
2050 * Decide if we need to update the parent keys of a btree block.  For
2051 * a standard btree this is only necessary if we're updating the first
2052 * record/key.  For an overlapping btree, we must always update the
2053 * keys because the highest key can be in any of the records or keys
2054 * in the block.
2055 */
2056static inline bool
2057xfs_btree_needs_key_update(
2058        struct xfs_btree_cur    *cur,
2059        int                     ptr)
2060{
2061        return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
2062}
2063
2064/*
2065 * Update the low and high parent keys of the given level, progressing
2066 * towards the root.  If force_all is false, stop if the keys for a given
2067 * level do not need updating.
2068 */
2069STATIC int
2070__xfs_btree_updkeys(
2071        struct xfs_btree_cur    *cur,
2072        int                     level,
2073        struct xfs_btree_block  *block,
2074        struct xfs_buf          *bp0,
2075        bool                    force_all)
2076{
2077        union xfs_btree_key     key;    /* keys from current level */
2078        union xfs_btree_key     *lkey;  /* keys from the next level up */
2079        union xfs_btree_key     *hkey;
2080        union xfs_btree_key     *nlkey; /* keys from the next level up */
2081        union xfs_btree_key     *nhkey;
2082        struct xfs_buf          *bp;
2083        int                     ptr;
2084
2085        ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2086
2087        /* Exit if there aren't any parent levels to update. */
2088        if (level + 1 >= cur->bc_nlevels)
2089                return 0;
2090
2091        trace_xfs_btree_updkeys(cur, level, bp0);
2092
2093        lkey = &key;
2094        hkey = xfs_btree_high_key_from_key(cur, lkey);
2095        xfs_btree_get_keys(cur, block, lkey);
2096        for (level++; level < cur->bc_nlevels; level++) {
2097#ifdef DEBUG
2098                int             error;
2099#endif
2100                block = xfs_btree_get_block(cur, level, &bp);
2101                trace_xfs_btree_updkeys(cur, level, bp);
2102#ifdef DEBUG
2103                error = xfs_btree_check_block(cur, block, level, bp);
2104                if (error)
2105                        return error;
2106#endif
2107                ptr = cur->bc_levels[level].ptr;
2108                nlkey = xfs_btree_key_addr(cur, ptr, block);
2109                nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2110                if (!force_all &&
2111                    !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
2112                      cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
2113                        break;
2114                xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2115                xfs_btree_log_keys(cur, bp, ptr, ptr);
2116                if (level + 1 >= cur->bc_nlevels)
2117                        break;
2118                xfs_btree_get_node_keys(cur, block, lkey);
2119        }
2120
2121        return 0;
2122}
2123
2124/* Update all the keys from some level in cursor back to the root. */
2125STATIC int
2126xfs_btree_updkeys_force(
2127        struct xfs_btree_cur    *cur,
2128        int                     level)
2129{
2130        struct xfs_buf          *bp;
2131        struct xfs_btree_block  *block;
2132
2133        block = xfs_btree_get_block(cur, level, &bp);
2134        return __xfs_btree_updkeys(cur, level, block, bp, true);
2135}
2136
2137/*
2138 * Update the parent keys of the given level, progressing towards the root.
2139 */
2140STATIC int
2141xfs_btree_update_keys(
2142        struct xfs_btree_cur    *cur,
2143        int                     level)
2144{
2145        struct xfs_btree_block  *block;
2146        struct xfs_buf          *bp;
2147        union xfs_btree_key     *kp;
2148        union xfs_btree_key     key;
2149        int                     ptr;
2150
2151        ASSERT(level >= 0);
2152
2153        block = xfs_btree_get_block(cur, level, &bp);
2154        if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
2155                return __xfs_btree_updkeys(cur, level, block, bp, false);
2156
2157        /*
2158         * Go up the tree from this level toward the root.
2159         * At each level, update the key value to the value input.
2160         * Stop when we reach a level where the cursor isn't pointing
2161         * at the first entry in the block.
2162         */
2163        xfs_btree_get_keys(cur, block, &key);
2164        for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2165#ifdef DEBUG
2166                int             error;
2167#endif
2168                block = xfs_btree_get_block(cur, level, &bp);
2169#ifdef DEBUG
2170                error = xfs_btree_check_block(cur, block, level, bp);
2171                if (error)
2172                        return error;
2173#endif
2174                ptr = cur->bc_levels[level].ptr;
2175                kp = xfs_btree_key_addr(cur, ptr, block);
2176                xfs_btree_copy_keys(cur, kp, &key, 1);
2177                xfs_btree_log_keys(cur, bp, ptr, ptr);
2178        }
2179
2180        return 0;
2181}
2182
2183/*
2184 * Update the record referred to by cur to the value in the
2185 * given record. This either works (return 0) or gets an
2186 * EFSCORRUPTED error.
2187 */
2188int
2189xfs_btree_update(
2190        struct xfs_btree_cur    *cur,
2191        union xfs_btree_rec     *rec)
2192{
2193        struct xfs_btree_block  *block;
2194        struct xfs_buf          *bp;
2195        int                     error;
2196        int                     ptr;
2197        union xfs_btree_rec     *rp;
2198
2199        /* Pick up the current block. */
2200        block = xfs_btree_get_block(cur, 0, &bp);
2201
2202#ifdef DEBUG
2203        error = xfs_btree_check_block(cur, block, 0, bp);
2204        if (error)
2205                goto error0;
2206#endif
2207        /* Get the address of the rec to be updated. */
2208        ptr = cur->bc_levels[0].ptr;
2209        rp = xfs_btree_rec_addr(cur, ptr, block);
2210
2211        /* Fill in the new contents and log them. */
2212        xfs_btree_copy_recs(cur, rp, rec, 1);
2213        xfs_btree_log_recs(cur, bp, ptr, ptr);
2214
2215        /*
2216         * If we are tracking the last record in the tree and
2217         * we are at the far right edge of the tree, update it.
2218         */
2219        if (xfs_btree_is_lastrec(cur, block, 0)) {
2220                cur->bc_ops->update_lastrec(cur, block, rec,
2221                                            ptr, LASTREC_UPDATE);
2222        }
2223
2224        /* Pass new key value up to our parent. */
2225        if (xfs_btree_needs_key_update(cur, ptr)) {
2226                error = xfs_btree_update_keys(cur, 0);
2227                if (error)
2228                        goto error0;
2229        }
2230
2231        return 0;
2232
2233error0:
2234        return error;
2235}
2236
2237/*
2238 * Move 1 record left from cur/level if possible.
2239 * Update cur to reflect the new path.
2240 */
2241STATIC int                                      /* error */
2242xfs_btree_lshift(
2243        struct xfs_btree_cur    *cur,
2244        int                     level,
2245        int                     *stat)          /* success/failure */
2246{
2247        struct xfs_buf          *lbp;           /* left buffer pointer */
2248        struct xfs_btree_block  *left;          /* left btree block */
2249        int                     lrecs;          /* left record count */
2250        struct xfs_buf          *rbp;           /* right buffer pointer */
2251        struct xfs_btree_block  *right;         /* right btree block */
2252        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2253        int                     rrecs;          /* right record count */
2254        union xfs_btree_ptr     lptr;           /* left btree pointer */
2255        union xfs_btree_key     *rkp = NULL;    /* right btree key */
2256        union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
2257        union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
2258        int                     error;          /* error return value */
2259        int                     i;
2260
2261        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2262            level == cur->bc_nlevels - 1)
2263                goto out0;
2264
2265        /* Set up variables for this block as "right". */
2266        right = xfs_btree_get_block(cur, level, &rbp);
2267
2268#ifdef DEBUG
2269        error = xfs_btree_check_block(cur, right, level, rbp);
2270        if (error)
2271                goto error0;
2272#endif
2273
2274        /* If we've got no left sibling then we can't shift an entry left. */
2275        xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2276        if (xfs_btree_ptr_is_null(cur, &lptr))
2277                goto out0;
2278
2279        /*
2280         * If the cursor entry is the one that would be moved, don't
2281         * do it... it's too complicated.
2282         */
2283        if (cur->bc_levels[level].ptr <= 1)
2284                goto out0;
2285
2286        /* Set up the left neighbor as "left". */
2287        error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2288        if (error)
2289                goto error0;
2290
2291        /* If it's full, it can't take another entry. */
2292        lrecs = xfs_btree_get_numrecs(left);
2293        if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2294                goto out0;
2295
2296        rrecs = xfs_btree_get_numrecs(right);
2297
2298        /*
2299         * We add one entry to the left side and remove one for the right side.
2300         * Account for it here, the changes will be updated on disk and logged
2301         * later.
2302         */
2303        lrecs++;
2304        rrecs--;
2305
2306        XFS_BTREE_STATS_INC(cur, lshift);
2307        XFS_BTREE_STATS_ADD(cur, moves, 1);
2308
2309        /*
2310         * If non-leaf, copy a key and a ptr to the left block.
2311         * Log the changes to the left block.
2312         */
2313        if (level > 0) {
2314                /* It's a non-leaf.  Move keys and pointers. */
2315                union xfs_btree_key     *lkp;   /* left btree key */
2316                union xfs_btree_ptr     *lpp;   /* left address pointer */
2317
2318                lkp = xfs_btree_key_addr(cur, lrecs, left);
2319                rkp = xfs_btree_key_addr(cur, 1, right);
2320
2321                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2322                rpp = xfs_btree_ptr_addr(cur, 1, right);
2323
2324                error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
2325                if (error)
2326                        goto error0;
2327
2328                xfs_btree_copy_keys(cur, lkp, rkp, 1);
2329                xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2330
2331                xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2332                xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2333
2334                ASSERT(cur->bc_ops->keys_inorder(cur,
2335                        xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2336        } else {
2337                /* It's a leaf.  Move records.  */
2338                union xfs_btree_rec     *lrp;   /* left record pointer */
2339
2340                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2341                rrp = xfs_btree_rec_addr(cur, 1, right);
2342
2343                xfs_btree_copy_recs(cur, lrp, rrp, 1);
2344                xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2345
2346                ASSERT(cur->bc_ops->recs_inorder(cur,
2347                        xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2348        }
2349
2350        xfs_btree_set_numrecs(left, lrecs);
2351        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2352
2353        xfs_btree_set_numrecs(right, rrecs);
2354        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2355
2356        /*
2357         * Slide the contents of right down one entry.
2358         */
2359        XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2360        if (level > 0) {
2361                /* It's a nonleaf. operate on keys and ptrs */
2362                for (i = 0; i < rrecs; i++) {
2363                        error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
2364                        if (error)
2365                                goto error0;
2366                }
2367
2368                xfs_btree_shift_keys(cur,
2369                                xfs_btree_key_addr(cur, 2, right),
2370                                -1, rrecs);
2371                xfs_btree_shift_ptrs(cur,
2372                                xfs_btree_ptr_addr(cur, 2, right),
2373                                -1, rrecs);
2374
2375                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2376                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2377        } else {
2378                /* It's a leaf. operate on records */
2379                xfs_btree_shift_recs(cur,
2380                        xfs_btree_rec_addr(cur, 2, right),
2381                        -1, rrecs);
2382                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2383        }
2384
2385        /*
2386         * Using a temporary cursor, update the parent key values of the
2387         * block on the left.
2388         */
2389        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2390                error = xfs_btree_dup_cursor(cur, &tcur);
2391                if (error)
2392                        goto error0;
2393                i = xfs_btree_firstrec(tcur, level);
2394                if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2395                        error = -EFSCORRUPTED;
2396                        goto error0;
2397                }
2398
2399                error = xfs_btree_decrement(tcur, level, &i);
2400                if (error)
2401                        goto error1;
2402
2403                /* Update the parent high keys of the left block, if needed. */
2404                error = xfs_btree_update_keys(tcur, level);
2405                if (error)
2406                        goto error1;
2407
2408                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2409        }
2410
2411        /* Update the parent keys of the right block. */
2412        error = xfs_btree_update_keys(cur, level);
2413        if (error)
2414                goto error0;
2415
2416        /* Slide the cursor value left one. */
2417        cur->bc_levels[level].ptr--;
2418
2419        *stat = 1;
2420        return 0;
2421
2422out0:
2423        *stat = 0;
2424        return 0;
2425
2426error0:
2427        return error;
2428
2429error1:
2430        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2431        return error;
2432}
2433
2434/*
2435 * Move 1 record right from cur/level if possible.
2436 * Update cur to reflect the new path.
2437 */
2438STATIC int                                      /* error */
2439xfs_btree_rshift(
2440        struct xfs_btree_cur    *cur,
2441        int                     level,
2442        int                     *stat)          /* success/failure */
2443{
2444        struct xfs_buf          *lbp;           /* left buffer pointer */
2445        struct xfs_btree_block  *left;          /* left btree block */
2446        struct xfs_buf          *rbp;           /* right buffer pointer */
2447        struct xfs_btree_block  *right;         /* right btree block */
2448        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2449        union xfs_btree_ptr     rptr;           /* right block pointer */
2450        union xfs_btree_key     *rkp;           /* right btree key */
2451        int                     rrecs;          /* right record count */
2452        int                     lrecs;          /* left record count */
2453        int                     error;          /* error return value */
2454        int                     i;              /* loop counter */
2455
2456        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2457            (level == cur->bc_nlevels - 1))
2458                goto out0;
2459
2460        /* Set up variables for this block as "left". */
2461        left = xfs_btree_get_block(cur, level, &lbp);
2462
2463#ifdef DEBUG
2464        error = xfs_btree_check_block(cur, left, level, lbp);
2465        if (error)
2466                goto error0;
2467#endif
2468
2469        /* If we've got no right sibling then we can't shift an entry right. */
2470        xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2471        if (xfs_btree_ptr_is_null(cur, &rptr))
2472                goto out0;
2473
2474        /*
2475         * If the cursor entry is the one that would be moved, don't
2476         * do it... it's too complicated.
2477         */
2478        lrecs = xfs_btree_get_numrecs(left);
2479        if (cur->bc_levels[level].ptr >= lrecs)
2480                goto out0;
2481
2482        /* Set up the right neighbor as "right". */
2483        error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2484        if (error)
2485                goto error0;
2486
2487        /* If it's full, it can't take another entry. */
2488        rrecs = xfs_btree_get_numrecs(right);
2489        if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2490                goto out0;
2491
2492        XFS_BTREE_STATS_INC(cur, rshift);
2493        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2494
2495        /*
2496         * Make a hole at the start of the right neighbor block, then
2497         * copy the last left block entry to the hole.
2498         */
2499        if (level > 0) {
2500                /* It's a nonleaf. make a hole in the keys and ptrs */
2501                union xfs_btree_key     *lkp;
2502                union xfs_btree_ptr     *lpp;
2503                union xfs_btree_ptr     *rpp;
2504
2505                lkp = xfs_btree_key_addr(cur, lrecs, left);
2506                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2507                rkp = xfs_btree_key_addr(cur, 1, right);
2508                rpp = xfs_btree_ptr_addr(cur, 1, right);
2509
2510                for (i = rrecs - 1; i >= 0; i--) {
2511                        error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
2512                        if (error)
2513                                goto error0;
2514                }
2515
2516                xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2517                xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2518
2519                error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
2520                if (error)
2521                        goto error0;
2522
2523                /* Now put the new data in, and log it. */
2524                xfs_btree_copy_keys(cur, rkp, lkp, 1);
2525                xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2526
2527                xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2528                xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2529
2530                ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2531                        xfs_btree_key_addr(cur, 2, right)));
2532        } else {
2533                /* It's a leaf. make a hole in the records */
2534                union xfs_btree_rec     *lrp;
2535                union xfs_btree_rec     *rrp;
2536
2537                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2538                rrp = xfs_btree_rec_addr(cur, 1, right);
2539
2540                xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2541
2542                /* Now put the new data in, and log it. */
2543                xfs_btree_copy_recs(cur, rrp, lrp, 1);
2544                xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2545        }
2546
2547        /*
2548         * Decrement and log left's numrecs, bump and log right's numrecs.
2549         */
2550        xfs_btree_set_numrecs(left, --lrecs);
2551        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2552
2553        xfs_btree_set_numrecs(right, ++rrecs);
2554        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2555
2556        /*
2557         * Using a temporary cursor, update the parent key values of the
2558         * block on the right.
2559         */
2560        error = xfs_btree_dup_cursor(cur, &tcur);
2561        if (error)
2562                goto error0;
2563        i = xfs_btree_lastrec(tcur, level);
2564        if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2565                error = -EFSCORRUPTED;
2566                goto error0;
2567        }
2568
2569        error = xfs_btree_increment(tcur, level, &i);
2570        if (error)
2571                goto error1;
2572
2573        /* Update the parent high keys of the left block, if needed. */
2574        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2575                error = xfs_btree_update_keys(cur, level);
2576                if (error)
2577                        goto error1;
2578        }
2579
2580        /* Update the parent keys of the right block. */
2581        error = xfs_btree_update_keys(tcur, level);
2582        if (error)
2583                goto error1;
2584
2585        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2586
2587        *stat = 1;
2588        return 0;
2589
2590out0:
2591        *stat = 0;
2592        return 0;
2593
2594error0:
2595        return error;
2596
2597error1:
2598        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2599        return error;
2600}
2601
2602/*
2603 * Split cur/level block in half.
2604 * Return new block number and the key to its first
2605 * record (to be inserted into parent).
2606 */
2607STATIC int                                      /* error */
2608__xfs_btree_split(
2609        struct xfs_btree_cur    *cur,
2610        int                     level,
2611        union xfs_btree_ptr     *ptrp,
2612        union xfs_btree_key     *key,
2613        struct xfs_btree_cur    **curp,
2614        int                     *stat)          /* success/failure */
2615{
2616        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2617        struct xfs_buf          *lbp;           /* left buffer pointer */
2618        struct xfs_btree_block  *left;          /* left btree block */
2619        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2620        struct xfs_buf          *rbp;           /* right buffer pointer */
2621        struct xfs_btree_block  *right;         /* right btree block */
2622        union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2623        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2624        struct xfs_btree_block  *rrblock;       /* right-right btree block */
2625        int                     lrecs;
2626        int                     rrecs;
2627        int                     src_index;
2628        int                     error;          /* error return value */
2629        int                     i;
2630
2631        XFS_BTREE_STATS_INC(cur, split);
2632
2633        /* Set up left block (current one). */
2634        left = xfs_btree_get_block(cur, level, &lbp);
2635
2636#ifdef DEBUG
2637        error = xfs_btree_check_block(cur, left, level, lbp);
2638        if (error)
2639                goto error0;
2640#endif
2641
2642        xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2643
2644        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2645        error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2646        if (error)
2647                goto error0;
2648        if (*stat == 0)
2649                goto out0;
2650        XFS_BTREE_STATS_INC(cur, alloc);
2651
2652        /* Set up the new block as "right". */
2653        error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
2654        if (error)
2655                goto error0;
2656
2657        /* Fill in the btree header for the new right block. */
2658        xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2659
2660        /*
2661         * Split the entries between the old and the new block evenly.
2662         * Make sure that if there's an odd number of entries now, that
2663         * each new block will have the same number of entries.
2664         */
2665        lrecs = xfs_btree_get_numrecs(left);
2666        rrecs = lrecs / 2;
2667        if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1)
2668                rrecs++;
2669        src_index = (lrecs - rrecs + 1);
2670
2671        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2672
2673        /* Adjust numrecs for the later get_*_keys() calls. */
2674        lrecs -= rrecs;
2675        xfs_btree_set_numrecs(left, lrecs);
2676        xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2677
2678        /*
2679         * Copy btree block entries from the left block over to the
2680         * new block, the right. Update the right block and log the
2681         * changes.
2682         */
2683        if (level > 0) {
2684                /* It's a non-leaf.  Move keys and pointers. */
2685                union xfs_btree_key     *lkp;   /* left btree key */
2686                union xfs_btree_ptr     *lpp;   /* left address pointer */
2687                union xfs_btree_key     *rkp;   /* right btree key */
2688                union xfs_btree_ptr     *rpp;   /* right address pointer */
2689
2690                lkp = xfs_btree_key_addr(cur, src_index, left);
2691                lpp = xfs_btree_ptr_addr(cur, src_index, left);
2692                rkp = xfs_btree_key_addr(cur, 1, right);
2693                rpp = xfs_btree_ptr_addr(cur, 1, right);
2694
2695                for (i = src_index; i < rrecs; i++) {
2696                        error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
2697                        if (error)
2698                                goto error0;
2699                }
2700
2701                /* Copy the keys & pointers to the new block. */
2702                xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2703                xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2704
2705                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2706                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2707
2708                /* Stash the keys of the new block for later insertion. */
2709                xfs_btree_get_node_keys(cur, right, key);
2710        } else {
2711                /* It's a leaf.  Move records.  */
2712                union xfs_btree_rec     *lrp;   /* left record pointer */
2713                union xfs_btree_rec     *rrp;   /* right record pointer */
2714
2715                lrp = xfs_btree_rec_addr(cur, src_index, left);
2716                rrp = xfs_btree_rec_addr(cur, 1, right);
2717
2718                /* Copy records to the new block. */
2719                xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2720                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2721
2722                /* Stash the keys of the new block for later insertion. */
2723                xfs_btree_get_leaf_keys(cur, right, key);
2724        }
2725
2726        /*
2727         * Find the left block number by looking in the buffer.
2728         * Adjust sibling pointers.
2729         */
2730        xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2731        xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2732        xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2733        xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2734
2735        xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2736        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2737
2738        /*
2739         * If there's a block to the new block's right, make that block
2740         * point back to right instead of to left.
2741         */
2742        if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2743                error = xfs_btree_read_buf_block(cur, &rrptr,
2744                                                        0, &rrblock, &rrbp);
2745                if (error)
2746                        goto error0;
2747                xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2748                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2749        }
2750
2751        /* Update the parent high keys of the left block, if needed. */
2752        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2753                error = xfs_btree_update_keys(cur, level);
2754                if (error)
2755                        goto error0;
2756        }
2757
2758        /*
2759         * If the cursor is really in the right block, move it there.
2760         * If it's just pointing past the last entry in left, then we'll
2761         * insert there, so don't change anything in that case.
2762         */
2763        if (cur->bc_levels[level].ptr > lrecs + 1) {
2764                xfs_btree_setbuf(cur, level, rbp);
2765                cur->bc_levels[level].ptr -= lrecs;
2766        }
2767        /*
2768         * If there are more levels, we'll need another cursor which refers
2769         * the right block, no matter where this cursor was.
2770         */
2771        if (level + 1 < cur->bc_nlevels) {
2772                error = xfs_btree_dup_cursor(cur, curp);
2773                if (error)
2774                        goto error0;
2775                (*curp)->bc_levels[level + 1].ptr++;
2776        }
2777        *ptrp = rptr;
2778        *stat = 1;
2779        return 0;
2780out0:
2781        *stat = 0;
2782        return 0;
2783
2784error0:
2785        return error;
2786}
2787
2788#ifdef __KERNEL__
2789struct xfs_btree_split_args {
2790        struct xfs_btree_cur    *cur;
2791        int                     level;
2792        union xfs_btree_ptr     *ptrp;
2793        union xfs_btree_key     *key;
2794        struct xfs_btree_cur    **curp;
2795        int                     *stat;          /* success/failure */
2796        int                     result;
2797        bool                    kswapd; /* allocation in kswapd context */
2798        struct completion       *done;
2799        struct work_struct      work;
2800};
2801
2802/*
2803 * Stack switching interfaces for allocation
2804 */
2805static void
2806xfs_btree_split_worker(
2807        struct work_struct      *work)
2808{
2809        struct xfs_btree_split_args     *args = container_of(work,
2810                                                struct xfs_btree_split_args, work);
2811        unsigned long           pflags;
2812        unsigned long           new_pflags = 0;
2813
2814        /*
2815         * we are in a transaction context here, but may also be doing work
2816         * in kswapd context, and hence we may need to inherit that state
2817         * temporarily to ensure that we don't block waiting for memory reclaim
2818         * in any way.
2819         */
2820        if (args->kswapd)
2821                new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2822
2823        current_set_flags_nested(&pflags, new_pflags);
2824        xfs_trans_set_context(args->cur->bc_tp);
2825
2826        args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2827                                         args->key, args->curp, args->stat);
2828
2829        xfs_trans_clear_context(args->cur->bc_tp);
2830        current_restore_flags_nested(&pflags, new_pflags);
2831
2832        /*
2833         * Do not access args after complete() has run here. We don't own args
2834         * and the owner may run and free args before we return here.
2835         */
2836        complete(args->done);
2837
2838}
2839
2840/*
2841 * BMBT split requests often come in with little stack to work on. Push
2842 * them off to a worker thread so there is lots of stack to use. For the other
2843 * btree types, just call directly to avoid the context switch overhead here.
2844 */
2845STATIC int                                      /* error */
2846xfs_btree_split(
2847        struct xfs_btree_cur    *cur,
2848        int                     level,
2849        union xfs_btree_ptr     *ptrp,
2850        union xfs_btree_key     *key,
2851        struct xfs_btree_cur    **curp,
2852        int                     *stat)          /* success/failure */
2853{
2854        struct xfs_btree_split_args     args;
2855        DECLARE_COMPLETION_ONSTACK(done);
2856
2857        if (cur->bc_btnum != XFS_BTNUM_BMAP)
2858                return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2859
2860        args.cur = cur;
2861        args.level = level;
2862        args.ptrp = ptrp;
2863        args.key = key;
2864        args.curp = curp;
2865        args.stat = stat;
2866        args.done = &done;
2867        args.kswapd = current_is_kswapd();
2868        INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2869        queue_work(xfs_alloc_wq, &args.work);
2870        wait_for_completion(&done);
2871        destroy_work_on_stack(&args.work);
2872        return args.result;
2873}
2874#else
2875#define xfs_btree_split __xfs_btree_split
2876#endif /* __KERNEL__ */
2877
2878
2879/*
2880 * Copy the old inode root contents into a real block and make the
2881 * broot point to it.
2882 */
2883int                                             /* error */
2884xfs_btree_new_iroot(
2885        struct xfs_btree_cur    *cur,           /* btree cursor */
2886        int                     *logflags,      /* logging flags for inode */
2887        int                     *stat)          /* return status - 0 fail */
2888{
2889        struct xfs_buf          *cbp;           /* buffer for cblock */
2890        struct xfs_btree_block  *block;         /* btree block */
2891        struct xfs_btree_block  *cblock;        /* child btree block */
2892        union xfs_btree_key     *ckp;           /* child key pointer */
2893        union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2894        union xfs_btree_key     *kp;            /* pointer to btree key */
2895        union xfs_btree_ptr     *pp;            /* pointer to block addr */
2896        union xfs_btree_ptr     nptr;           /* new block addr */
2897        int                     level;          /* btree level */
2898        int                     error;          /* error return code */
2899        int                     i;              /* loop counter */
2900
2901        XFS_BTREE_STATS_INC(cur, newroot);
2902
2903        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2904
2905        level = cur->bc_nlevels - 1;
2906
2907        block = xfs_btree_get_iroot(cur);
2908        pp = xfs_btree_ptr_addr(cur, 1, block);
2909
2910        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2911        error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2912        if (error)
2913                goto error0;
2914        if (*stat == 0)
2915                return 0;
2916
2917        XFS_BTREE_STATS_INC(cur, alloc);
2918
2919        /* Copy the root into a real block. */
2920        error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
2921        if (error)
2922                goto error0;
2923
2924        /*
2925         * we can't just memcpy() the root in for CRC enabled btree blocks.
2926         * In that case have to also ensure the blkno remains correct
2927         */
2928        memcpy(cblock, block, xfs_btree_block_len(cur));
2929        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2930                __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
2931                if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2932                        cblock->bb_u.l.bb_blkno = bno;
2933                else
2934                        cblock->bb_u.s.bb_blkno = bno;
2935        }
2936
2937        be16_add_cpu(&block->bb_level, 1);
2938        xfs_btree_set_numrecs(block, 1);
2939        cur->bc_nlevels++;
2940        ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
2941        cur->bc_levels[level + 1].ptr = 1;
2942
2943        kp = xfs_btree_key_addr(cur, 1, block);
2944        ckp = xfs_btree_key_addr(cur, 1, cblock);
2945        xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2946
2947        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2948        for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2949                error = xfs_btree_debug_check_ptr(cur, pp, i, level);
2950                if (error)
2951                        goto error0;
2952        }
2953
2954        xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2955
2956        error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
2957        if (error)
2958                goto error0;
2959
2960        xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2961
2962        xfs_iroot_realloc(cur->bc_ino.ip,
2963                          1 - xfs_btree_get_numrecs(cblock),
2964                          cur->bc_ino.whichfork);
2965
2966        xfs_btree_setbuf(cur, level, cbp);
2967
2968        /*
2969         * Do all this logging at the end so that
2970         * the root is at the right level.
2971         */
2972        xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2973        xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2974        xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2975
2976        *logflags |=
2977                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
2978        *stat = 1;
2979        return 0;
2980error0:
2981        return error;
2982}
2983
2984/*
2985 * Allocate a new root block, fill it in.
2986 */
2987STATIC int                              /* error */
2988xfs_btree_new_root(
2989        struct xfs_btree_cur    *cur,   /* btree cursor */
2990        int                     *stat)  /* success/failure */
2991{
2992        struct xfs_btree_block  *block; /* one half of the old root block */
2993        struct xfs_buf          *bp;    /* buffer containing block */
2994        int                     error;  /* error return value */
2995        struct xfs_buf          *lbp;   /* left buffer pointer */
2996        struct xfs_btree_block  *left;  /* left btree block */
2997        struct xfs_buf          *nbp;   /* new (root) buffer */
2998        struct xfs_btree_block  *new;   /* new (root) btree block */
2999        int                     nptr;   /* new value for key index, 1 or 2 */
3000        struct xfs_buf          *rbp;   /* right buffer pointer */
3001        struct xfs_btree_block  *right; /* right btree block */
3002        union xfs_btree_ptr     rptr;
3003        union xfs_btree_ptr     lptr;
3004
3005        XFS_BTREE_STATS_INC(cur, newroot);
3006
3007        /* initialise our start point from the cursor */
3008        cur->bc_ops->init_ptr_from_cur(cur, &rptr);
3009
3010        /* Allocate the new block. If we can't do it, we're toast. Give up. */
3011        error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
3012        if (error)
3013                goto error0;
3014        if (*stat == 0)
3015                goto out0;
3016        XFS_BTREE_STATS_INC(cur, alloc);
3017
3018        /* Set up the new block. */
3019        error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
3020        if (error)
3021                goto error0;
3022
3023        /* Set the root in the holding structure  increasing the level by 1. */
3024        cur->bc_ops->set_root(cur, &lptr, 1);
3025
3026        /*
3027         * At the previous root level there are now two blocks: the old root,
3028         * and the new block generated when it was split.  We don't know which
3029         * one the cursor is pointing at, so we set up variables "left" and
3030         * "right" for each case.
3031         */
3032        block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
3033
3034#ifdef DEBUG
3035        error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
3036        if (error)
3037                goto error0;
3038#endif
3039
3040        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3041        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3042                /* Our block is left, pick up the right block. */
3043                lbp = bp;
3044                xfs_btree_buf_to_ptr(cur, lbp, &lptr);
3045                left = block;
3046                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3047                if (error)
3048                        goto error0;
3049                bp = rbp;
3050                nptr = 1;
3051        } else {
3052                /* Our block is right, pick up the left block. */
3053                rbp = bp;
3054                xfs_btree_buf_to_ptr(cur, rbp, &rptr);
3055                right = block;
3056                xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
3057                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3058                if (error)
3059                        goto error0;
3060                bp = lbp;
3061                nptr = 2;
3062        }
3063
3064        /* Fill in the new block's btree header and log it. */
3065        xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
3066        xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
3067        ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
3068                        !xfs_btree_ptr_is_null(cur, &rptr));
3069
3070        /* Fill in the key data in the new root. */
3071        if (xfs_btree_get_level(left) > 0) {
3072                /*
3073                 * Get the keys for the left block's keys and put them directly
3074                 * in the parent block.  Do the same for the right block.
3075                 */
3076                xfs_btree_get_node_keys(cur, left,
3077                                xfs_btree_key_addr(cur, 1, new));
3078                xfs_btree_get_node_keys(cur, right,
3079                                xfs_btree_key_addr(cur, 2, new));
3080        } else {
3081                /*
3082                 * Get the keys for the left block's records and put them
3083                 * directly in the parent block.  Do the same for the right
3084                 * block.
3085                 */
3086                xfs_btree_get_leaf_keys(cur, left,
3087                        xfs_btree_key_addr(cur, 1, new));
3088                xfs_btree_get_leaf_keys(cur, right,
3089                        xfs_btree_key_addr(cur, 2, new));
3090        }
3091        xfs_btree_log_keys(cur, nbp, 1, 2);
3092
3093        /* Fill in the pointer data in the new root. */
3094        xfs_btree_copy_ptrs(cur,
3095                xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
3096        xfs_btree_copy_ptrs(cur,
3097                xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
3098        xfs_btree_log_ptrs(cur, nbp, 1, 2);
3099
3100        /* Fix up the cursor. */
3101        xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
3102        cur->bc_levels[cur->bc_nlevels].ptr = nptr;
3103        cur->bc_nlevels++;
3104        ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
3105        *stat = 1;
3106        return 0;
3107error0:
3108        return error;
3109out0:
3110        *stat = 0;
3111        return 0;
3112}
3113
3114STATIC int
3115xfs_btree_make_block_unfull(
3116        struct xfs_btree_cur    *cur,   /* btree cursor */
3117        int                     level,  /* btree level */
3118        int                     numrecs,/* # of recs in block */
3119        int                     *oindex,/* old tree index */
3120        int                     *index, /* new tree index */
3121        union xfs_btree_ptr     *nptr,  /* new btree ptr */
3122        struct xfs_btree_cur    **ncur, /* new btree cursor */
3123        union xfs_btree_key     *key,   /* key of new block */
3124        int                     *stat)
3125{
3126        int                     error = 0;
3127
3128        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3129            level == cur->bc_nlevels - 1) {
3130                struct xfs_inode *ip = cur->bc_ino.ip;
3131
3132                if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3133                        /* A root block that can be made bigger. */
3134                        xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
3135                        *stat = 1;
3136                } else {
3137                        /* A root block that needs replacing */
3138                        int     logflags = 0;
3139
3140                        error = xfs_btree_new_iroot(cur, &logflags, stat);
3141                        if (error || *stat == 0)
3142                                return error;
3143
3144                        xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3145                }
3146
3147                return 0;
3148        }
3149
3150        /* First, try shifting an entry to the right neighbor. */
3151        error = xfs_btree_rshift(cur, level, stat);
3152        if (error || *stat)
3153                return error;
3154
3155        /* Next, try shifting an entry to the left neighbor. */
3156        error = xfs_btree_lshift(cur, level, stat);
3157        if (error)
3158                return error;
3159
3160        if (*stat) {
3161                *oindex = *index = cur->bc_levels[level].ptr;
3162                return 0;
3163        }
3164
3165        /*
3166         * Next, try splitting the current block in half.
3167         *
3168         * If this works we have to re-set our variables because we
3169         * could be in a different block now.
3170         */
3171        error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3172        if (error || *stat == 0)
3173                return error;
3174
3175
3176        *index = cur->bc_levels[level].ptr;
3177        return 0;
3178}
3179
3180/*
3181 * Insert one record/level.  Return information to the caller
3182 * allowing the next level up to proceed if necessary.
3183 */
3184STATIC int
3185xfs_btree_insrec(
3186        struct xfs_btree_cur    *cur,   /* btree cursor */
3187        int                     level,  /* level to insert record at */
3188        union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
3189        union xfs_btree_rec     *rec,   /* record to insert */
3190        union xfs_btree_key     *key,   /* i/o: block key for ptrp */
3191        struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
3192        int                     *stat)  /* success/failure */
3193{
3194        struct xfs_btree_block  *block; /* btree block */
3195        struct xfs_buf          *bp;    /* buffer for block */
3196        union xfs_btree_ptr     nptr;   /* new block ptr */
3197        struct xfs_btree_cur    *ncur;  /* new btree cursor */
3198        union xfs_btree_key     nkey;   /* new block key */
3199        union xfs_btree_key     *lkey;
3200        int                     optr;   /* old key/record index */
3201        int                     ptr;    /* key/record index */
3202        int                     numrecs;/* number of records */
3203        int                     error;  /* error return value */
3204        int                     i;
3205        xfs_daddr_t             old_bn;
3206
3207        ncur = NULL;
3208        lkey = &nkey;
3209
3210        /*
3211         * If we have an external root pointer, and we've made it to the
3212         * root level, allocate a new root block and we're done.
3213         */
3214        if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3215            (level >= cur->bc_nlevels)) {
3216                error = xfs_btree_new_root(cur, stat);
3217                xfs_btree_set_ptr_null(cur, ptrp);
3218
3219                return error;
3220        }
3221
3222        /* If we're off the left edge, return failure. */
3223        ptr = cur->bc_levels[level].ptr;
3224        if (ptr == 0) {
3225                *stat = 0;
3226                return 0;
3227        }
3228
3229        optr = ptr;
3230
3231        XFS_BTREE_STATS_INC(cur, insrec);
3232
3233        /* Get pointers to the btree buffer and block. */
3234        block = xfs_btree_get_block(cur, level, &bp);
3235        old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL;
3236        numrecs = xfs_btree_get_numrecs(block);
3237
3238#ifdef DEBUG
3239        error = xfs_btree_check_block(cur, block, level, bp);
3240        if (error)
3241                goto error0;
3242
3243        /* Check that the new entry is being inserted in the right place. */
3244        if (ptr <= numrecs) {
3245                if (level == 0) {
3246                        ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3247                                xfs_btree_rec_addr(cur, ptr, block)));
3248                } else {
3249                        ASSERT(cur->bc_ops->keys_inorder(cur, key,
3250                                xfs_btree_key_addr(cur, ptr, block)));
3251                }
3252        }
3253#endif
3254
3255        /*
3256         * If the block is full, we can't insert the new entry until we
3257         * make the block un-full.
3258         */
3259        xfs_btree_set_ptr_null(cur, &nptr);
3260        if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3261                error = xfs_btree_make_block_unfull(cur, level, numrecs,
3262                                        &optr, &ptr, &nptr, &ncur, lkey, stat);
3263                if (error || *stat == 0)
3264                        goto error0;
3265        }
3266
3267        /*
3268         * The current block may have changed if the block was
3269         * previously full and we have just made space in it.
3270         */
3271        block = xfs_btree_get_block(cur, level, &bp);
3272        numrecs = xfs_btree_get_numrecs(block);
3273
3274#ifdef DEBUG
3275        error = xfs_btree_check_block(cur, block, level, bp);
3276        if (error)
3277                return error;
3278#endif
3279
3280        /*
3281         * At this point we know there's room for our new entry in the block
3282         * we're pointing at.
3283         */
3284        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3285
3286        if (level > 0) {
3287                /* It's a nonleaf. make a hole in the keys and ptrs */
3288                union xfs_btree_key     *kp;
3289                union xfs_btree_ptr     *pp;
3290
3291                kp = xfs_btree_key_addr(cur, ptr, block);
3292                pp = xfs_btree_ptr_addr(cur, ptr, block);
3293
3294                for (i = numrecs - ptr; i >= 0; i--) {
3295                        error = xfs_btree_debug_check_ptr(cur, pp, i, level);
3296                        if (error)
3297                                return error;
3298                }
3299
3300                xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3301                xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3302
3303                error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
3304                if (error)
3305                        goto error0;
3306
3307                /* Now put the new data in, bump numrecs and log it. */
3308                xfs_btree_copy_keys(cur, kp, key, 1);
3309                xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3310                numrecs++;
3311                xfs_btree_set_numrecs(block, numrecs);
3312                xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3313                xfs_btree_log_keys(cur, bp, ptr, numrecs);
3314#ifdef DEBUG
3315                if (ptr < numrecs) {
3316                        ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3317                                xfs_btree_key_addr(cur, ptr + 1, block)));
3318                }
3319#endif
3320        } else {
3321                /* It's a leaf. make a hole in the records */
3322                union xfs_btree_rec             *rp;
3323
3324                rp = xfs_btree_rec_addr(cur, ptr, block);
3325
3326                xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3327
3328                /* Now put the new data in, bump numrecs and log it. */
3329                xfs_btree_copy_recs(cur, rp, rec, 1);
3330                xfs_btree_set_numrecs(block, ++numrecs);
3331                xfs_btree_log_recs(cur, bp, ptr, numrecs);
3332#ifdef DEBUG
3333                if (ptr < numrecs) {
3334                        ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3335                                xfs_btree_rec_addr(cur, ptr + 1, block)));
3336                }
3337#endif
3338        }
3339
3340        /* Log the new number of records in the btree header. */
3341        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3342
3343        /*
3344         * If we just inserted into a new tree block, we have to
3345         * recalculate nkey here because nkey is out of date.
3346         *
3347         * Otherwise we're just updating an existing block (having shoved
3348         * some records into the new tree block), so use the regular key
3349         * update mechanism.
3350         */
3351        if (bp && xfs_buf_daddr(bp) != old_bn) {
3352                xfs_btree_get_keys(cur, block, lkey);
3353        } else if (xfs_btree_needs_key_update(cur, optr)) {
3354                error = xfs_btree_update_keys(cur, level);
3355                if (error)
3356                        goto error0;
3357        }
3358
3359        /*
3360         * If we are tracking the last record in the tree and
3361         * we are at the far right edge of the tree, update it.
3362         */
3363        if (xfs_btree_is_lastrec(cur, block, level)) {
3364                cur->bc_ops->update_lastrec(cur, block, rec,
3365                                            ptr, LASTREC_INSREC);
3366        }
3367
3368        /*
3369         * Return the new block number, if any.
3370         * If there is one, give back a record value and a cursor too.
3371         */
3372        *ptrp = nptr;
3373        if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3374                xfs_btree_copy_keys(cur, key, lkey, 1);
3375                *curp = ncur;
3376        }
3377
3378        *stat = 1;
3379        return 0;
3380
3381error0:
3382        return error;
3383}
3384
3385/*
3386 * Insert the record at the point referenced by cur.
3387 *
3388 * A multi-level split of the tree on insert will invalidate the original
3389 * cursor.  All callers of this function should assume that the cursor is
3390 * no longer valid and revalidate it.
3391 */
3392int
3393xfs_btree_insert(
3394        struct xfs_btree_cur    *cur,
3395        int                     *stat)
3396{
3397        int                     error;  /* error return value */
3398        int                     i;      /* result value, 0 for failure */
3399        int                     level;  /* current level number in btree */
3400        union xfs_btree_ptr     nptr;   /* new block number (split result) */
3401        struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3402        struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3403        union xfs_btree_key     bkey;   /* key of block to insert */
3404        union xfs_btree_key     *key;
3405        union xfs_btree_rec     rec;    /* record to insert */
3406
3407        level = 0;
3408        ncur = NULL;
3409        pcur = cur;
3410        key = &bkey;
3411
3412        xfs_btree_set_ptr_null(cur, &nptr);
3413
3414        /* Make a key out of the record data to be inserted, and save it. */
3415        cur->bc_ops->init_rec_from_cur(cur, &rec);
3416        cur->bc_ops->init_key_from_rec(key, &rec);
3417
3418        /*
3419         * Loop going up the tree, starting at the leaf level.
3420         * Stop when we don't get a split block, that must mean that
3421         * the insert is finished with this level.
3422         */
3423        do {
3424                /*
3425                 * Insert nrec/nptr into this level of the tree.
3426                 * Note if we fail, nptr will be null.
3427                 */
3428                error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3429                                &ncur, &i);
3430                if (error) {
3431                        if (pcur != cur)
3432                                xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3433                        goto error0;
3434                }
3435
3436                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3437                        error = -EFSCORRUPTED;
3438                        goto error0;
3439                }
3440                level++;
3441
3442                /*
3443                 * See if the cursor we just used is trash.
3444                 * Can't trash the caller's cursor, but otherwise we should
3445                 * if ncur is a new cursor or we're about to be done.
3446                 */
3447                if (pcur != cur &&
3448                    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3449                        /* Save the state from the cursor before we trash it */
3450                        if (cur->bc_ops->update_cursor)
3451                                cur->bc_ops->update_cursor(pcur, cur);
3452                        cur->bc_nlevels = pcur->bc_nlevels;
3453                        xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3454                }
3455                /* If we got a new cursor, switch to it. */
3456                if (ncur) {
3457                        pcur = ncur;
3458                        ncur = NULL;
3459                }
3460        } while (!xfs_btree_ptr_is_null(cur, &nptr));
3461
3462        *stat = i;
3463        return 0;
3464error0:
3465        return error;
3466}
3467
3468/*
3469 * Try to merge a non-leaf block back into the inode root.
3470 *
3471 * Note: the killroot names comes from the fact that we're effectively
3472 * killing the old root block.  But because we can't just delete the
3473 * inode we have to copy the single block it was pointing to into the
3474 * inode.
3475 */
3476STATIC int
3477xfs_btree_kill_iroot(
3478        struct xfs_btree_cur    *cur)
3479{
3480        int                     whichfork = cur->bc_ino.whichfork;
3481        struct xfs_inode        *ip = cur->bc_ino.ip;
3482        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3483        struct xfs_btree_block  *block;
3484        struct xfs_btree_block  *cblock;
3485        union xfs_btree_key     *kp;
3486        union xfs_btree_key     *ckp;
3487        union xfs_btree_ptr     *pp;
3488        union xfs_btree_ptr     *cpp;
3489        struct xfs_buf          *cbp;
3490        int                     level;
3491        int                     index;
3492        int                     numrecs;
3493        int                     error;
3494#ifdef DEBUG
3495        union xfs_btree_ptr     ptr;
3496#endif
3497        int                     i;
3498
3499        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3500        ASSERT(cur->bc_nlevels > 1);
3501
3502        /*
3503         * Don't deal with the root block needs to be a leaf case.
3504         * We're just going to turn the thing back into extents anyway.
3505         */
3506        level = cur->bc_nlevels - 1;
3507        if (level == 1)
3508                goto out0;
3509
3510        /*
3511         * Give up if the root has multiple children.
3512         */
3513        block = xfs_btree_get_iroot(cur);
3514        if (xfs_btree_get_numrecs(block) != 1)
3515                goto out0;
3516
3517        cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3518        numrecs = xfs_btree_get_numrecs(cblock);
3519
3520        /*
3521         * Only do this if the next level will fit.
3522         * Then the data must be copied up to the inode,
3523         * instead of freeing the root you free the next level.
3524         */
3525        if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3526                goto out0;
3527
3528        XFS_BTREE_STATS_INC(cur, killroot);
3529
3530#ifdef DEBUG
3531        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3532        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3533        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3534        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3535#endif
3536
3537        index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3538        if (index) {
3539                xfs_iroot_realloc(cur->bc_ino.ip, index,
3540                                  cur->bc_ino.whichfork);
3541                block = ifp->if_broot;
3542        }
3543
3544        be16_add_cpu(&block->bb_numrecs, index);
3545        ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3546
3547        kp = xfs_btree_key_addr(cur, 1, block);
3548        ckp = xfs_btree_key_addr(cur, 1, cblock);
3549        xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3550
3551        pp = xfs_btree_ptr_addr(cur, 1, block);
3552        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3553
3554        for (i = 0; i < numrecs; i++) {
3555                error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
3556                if (error)
3557                        return error;
3558        }
3559
3560        xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3561
3562        error = xfs_btree_free_block(cur, cbp);
3563        if (error)
3564                return error;
3565
3566        cur->bc_levels[level - 1].bp = NULL;
3567        be16_add_cpu(&block->bb_level, -1);
3568        xfs_trans_log_inode(cur->bc_tp, ip,
3569                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
3570        cur->bc_nlevels--;
3571out0:
3572        return 0;
3573}
3574
3575/*
3576 * Kill the current root node, and replace it with it's only child node.
3577 */
3578STATIC int
3579xfs_btree_kill_root(
3580        struct xfs_btree_cur    *cur,
3581        struct xfs_buf          *bp,
3582        int                     level,
3583        union xfs_btree_ptr     *newroot)
3584{
3585        int                     error;
3586
3587        XFS_BTREE_STATS_INC(cur, killroot);
3588
3589        /*
3590         * Update the root pointer, decreasing the level by 1 and then
3591         * free the old root.
3592         */
3593        cur->bc_ops->set_root(cur, newroot, -1);
3594
3595        error = xfs_btree_free_block(cur, bp);
3596        if (error)
3597                return error;
3598
3599        cur->bc_levels[level].bp = NULL;
3600        cur->bc_levels[level].ra = 0;
3601        cur->bc_nlevels--;
3602
3603        return 0;
3604}
3605
3606STATIC int
3607xfs_btree_dec_cursor(
3608        struct xfs_btree_cur    *cur,
3609        int                     level,
3610        int                     *stat)
3611{
3612        int                     error;
3613        int                     i;
3614
3615        if (level > 0) {
3616                error = xfs_btree_decrement(cur, level, &i);
3617                if (error)
3618                        return error;
3619        }
3620
3621        *stat = 1;
3622        return 0;
3623}
3624
3625/*
3626 * Single level of the btree record deletion routine.
3627 * Delete record pointed to by cur/level.
3628 * Remove the record from its block then rebalance the tree.
3629 * Return 0 for error, 1 for done, 2 to go on to the next level.
3630 */
3631STATIC int                                      /* error */
3632xfs_btree_delrec(
3633        struct xfs_btree_cur    *cur,           /* btree cursor */
3634        int                     level,          /* level removing record from */
3635        int                     *stat)          /* fail/done/go-on */
3636{
3637        struct xfs_btree_block  *block;         /* btree block */
3638        union xfs_btree_ptr     cptr;           /* current block ptr */
3639        struct xfs_buf          *bp;            /* buffer for block */
3640        int                     error;          /* error return value */
3641        int                     i;              /* loop counter */
3642        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3643        struct xfs_buf          *lbp;           /* left buffer pointer */
3644        struct xfs_btree_block  *left;          /* left btree block */
3645        int                     lrecs = 0;      /* left record count */
3646        int                     ptr;            /* key/record index */
3647        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3648        struct xfs_buf          *rbp;           /* right buffer pointer */
3649        struct xfs_btree_block  *right;         /* right btree block */
3650        struct xfs_btree_block  *rrblock;       /* right-right btree block */
3651        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3652        int                     rrecs = 0;      /* right record count */
3653        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3654        int                     numrecs;        /* temporary numrec count */
3655
3656        tcur = NULL;
3657
3658        /* Get the index of the entry being deleted, check for nothing there. */
3659        ptr = cur->bc_levels[level].ptr;
3660        if (ptr == 0) {
3661                *stat = 0;
3662                return 0;
3663        }
3664
3665        /* Get the buffer & block containing the record or key/ptr. */
3666        block = xfs_btree_get_block(cur, level, &bp);
3667        numrecs = xfs_btree_get_numrecs(block);
3668
3669#ifdef DEBUG
3670        error = xfs_btree_check_block(cur, block, level, bp);
3671        if (error)
3672                goto error0;
3673#endif
3674
3675        /* Fail if we're off the end of the block. */
3676        if (ptr > numrecs) {
3677                *stat = 0;
3678                return 0;
3679        }
3680
3681        XFS_BTREE_STATS_INC(cur, delrec);
3682        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3683
3684        /* Excise the entries being deleted. */
3685        if (level > 0) {
3686                /* It's a nonleaf. operate on keys and ptrs */
3687                union xfs_btree_key     *lkp;
3688                union xfs_btree_ptr     *lpp;
3689
3690                lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3691                lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3692
3693                for (i = 0; i < numrecs - ptr; i++) {
3694                        error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
3695                        if (error)
3696                                goto error0;
3697                }
3698
3699                if (ptr < numrecs) {
3700                        xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3701                        xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3702                        xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3703                        xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3704                }
3705        } else {
3706                /* It's a leaf. operate on records */
3707                if (ptr < numrecs) {
3708                        xfs_btree_shift_recs(cur,
3709                                xfs_btree_rec_addr(cur, ptr + 1, block),
3710                                -1, numrecs - ptr);
3711                        xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3712                }
3713        }
3714
3715        /*
3716         * Decrement and log the number of entries in the block.
3717         */
3718        xfs_btree_set_numrecs(block, --numrecs);
3719        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3720
3721        /*
3722         * If we are tracking the last record in the tree and
3723         * we are at the far right edge of the tree, update it.
3724         */
3725        if (xfs_btree_is_lastrec(cur, block, level)) {
3726                cur->bc_ops->update_lastrec(cur, block, NULL,
3727                                            ptr, LASTREC_DELREC);
3728        }
3729
3730        /*
3731         * We're at the root level.  First, shrink the root block in-memory.
3732         * Try to get rid of the next level down.  If we can't then there's
3733         * nothing left to do.
3734         */
3735        if (level == cur->bc_nlevels - 1) {
3736                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3737                        xfs_iroot_realloc(cur->bc_ino.ip, -1,
3738                                          cur->bc_ino.whichfork);
3739
3740                        error = xfs_btree_kill_iroot(cur);
3741                        if (error)
3742                                goto error0;
3743
3744                        error = xfs_btree_dec_cursor(cur, level, stat);
3745                        if (error)
3746                                goto error0;
3747                        *stat = 1;
3748                        return 0;
3749                }
3750
3751                /*
3752                 * If this is the root level, and there's only one entry left,
3753                 * and it's NOT the leaf level, then we can get rid of this
3754                 * level.
3755                 */
3756                if (numrecs == 1 && level > 0) {
3757                        union xfs_btree_ptr     *pp;
3758                        /*
3759                         * pp is still set to the first pointer in the block.
3760                         * Make it the new root of the btree.
3761                         */
3762                        pp = xfs_btree_ptr_addr(cur, 1, block);
3763                        error = xfs_btree_kill_root(cur, bp, level, pp);
3764                        if (error)
3765                                goto error0;
3766                } else if (level > 0) {
3767                        error = xfs_btree_dec_cursor(cur, level, stat);
3768                        if (error)
3769                                goto error0;
3770                }
3771                *stat = 1;
3772                return 0;
3773        }
3774
3775        /*
3776         * If we deleted the leftmost entry in the block, update the
3777         * key values above us in the tree.
3778         */
3779        if (xfs_btree_needs_key_update(cur, ptr)) {
3780                error = xfs_btree_update_keys(cur, level);
3781                if (error)
3782                        goto error0;
3783        }
3784
3785        /*
3786         * If the number of records remaining in the block is at least
3787         * the minimum, we're done.
3788         */
3789        if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3790                error = xfs_btree_dec_cursor(cur, level, stat);
3791                if (error)
3792                        goto error0;
3793                return 0;
3794        }
3795
3796        /*
3797         * Otherwise, we have to move some records around to keep the
3798         * tree balanced.  Look at the left and right sibling blocks to
3799         * see if we can re-balance by moving only one record.
3800         */
3801        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3802        xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3803
3804        if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3805                /*
3806                 * One child of root, need to get a chance to copy its contents
3807                 * into the root and delete it. Can't go up to next level,
3808                 * there's nothing to delete there.
3809                 */
3810                if (xfs_btree_ptr_is_null(cur, &rptr) &&
3811                    xfs_btree_ptr_is_null(cur, &lptr) &&
3812                    level == cur->bc_nlevels - 2) {
3813                        error = xfs_btree_kill_iroot(cur);
3814                        if (!error)
3815                                error = xfs_btree_dec_cursor(cur, level, stat);
3816                        if (error)
3817                                goto error0;
3818                        return 0;
3819                }
3820        }
3821
3822        ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3823               !xfs_btree_ptr_is_null(cur, &lptr));
3824
3825        /*
3826         * Duplicate the cursor so our btree manipulations here won't
3827         * disrupt the next level up.
3828         */
3829        error = xfs_btree_dup_cursor(cur, &tcur);
3830        if (error)
3831                goto error0;
3832
3833        /*
3834         * If there's a right sibling, see if it's ok to shift an entry
3835         * out of it.
3836         */
3837        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3838                /*
3839                 * Move the temp cursor to the last entry in the next block.
3840                 * Actually any entry but the first would suffice.
3841                 */
3842                i = xfs_btree_lastrec(tcur, level);
3843                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3844                        error = -EFSCORRUPTED;
3845                        goto error0;
3846                }
3847
3848                error = xfs_btree_increment(tcur, level, &i);
3849                if (error)
3850                        goto error0;
3851                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3852                        error = -EFSCORRUPTED;
3853                        goto error0;
3854                }
3855
3856                i = xfs_btree_lastrec(tcur, level);
3857                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3858                        error = -EFSCORRUPTED;
3859                        goto error0;
3860                }
3861
3862                /* Grab a pointer to the block. */
3863                right = xfs_btree_get_block(tcur, level, &rbp);
3864#ifdef DEBUG
3865                error = xfs_btree_check_block(tcur, right, level, rbp);
3866                if (error)
3867                        goto error0;
3868#endif
3869                /* Grab the current block number, for future use. */
3870                xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3871
3872                /*
3873                 * If right block is full enough so that removing one entry
3874                 * won't make it too empty, and left-shifting an entry out
3875                 * of right to us works, we're done.
3876                 */
3877                if (xfs_btree_get_numrecs(right) - 1 >=
3878                    cur->bc_ops->get_minrecs(tcur, level)) {
3879                        error = xfs_btree_lshift(tcur, level, &i);
3880                        if (error)
3881                                goto error0;
3882                        if (i) {
3883                                ASSERT(xfs_btree_get_numrecs(block) >=
3884                                       cur->bc_ops->get_minrecs(tcur, level));
3885
3886                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3887                                tcur = NULL;
3888
3889                                error = xfs_btree_dec_cursor(cur, level, stat);
3890                                if (error)
3891                                        goto error0;
3892                                return 0;
3893                        }
3894                }
3895
3896                /*
3897                 * Otherwise, grab the number of records in right for
3898                 * future reference, and fix up the temp cursor to point
3899                 * to our block again (last record).
3900                 */
3901                rrecs = xfs_btree_get_numrecs(right);
3902                if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3903                        i = xfs_btree_firstrec(tcur, level);
3904                        if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3905                                error = -EFSCORRUPTED;
3906                                goto error0;
3907                        }
3908
3909                        error = xfs_btree_decrement(tcur, level, &i);
3910                        if (error)
3911                                goto error0;
3912                        if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3913                                error = -EFSCORRUPTED;
3914                                goto error0;
3915                        }
3916                }
3917        }
3918
3919        /*
3920         * If there's a left sibling, see if it's ok to shift an entry
3921         * out of it.
3922         */
3923        if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3924                /*
3925                 * Move the temp cursor to the first entry in the
3926                 * previous block.
3927                 */
3928                i = xfs_btree_firstrec(tcur, level);
3929                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3930                        error = -EFSCORRUPTED;
3931                        goto error0;
3932                }
3933
3934                error = xfs_btree_decrement(tcur, level, &i);
3935                if (error)
3936                        goto error0;
3937                i = xfs_btree_firstrec(tcur, level);
3938                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3939                        error = -EFSCORRUPTED;
3940                        goto error0;
3941                }
3942
3943                /* Grab a pointer to the block. */
3944                left = xfs_btree_get_block(tcur, level, &lbp);
3945#ifdef DEBUG
3946                error = xfs_btree_check_block(cur, left, level, lbp);
3947                if (error)
3948                        goto error0;
3949#endif
3950                /* Grab the current block number, for future use. */
3951                xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3952
3953                /*
3954                 * If left block is full enough so that removing one entry
3955                 * won't make it too empty, and right-shifting an entry out
3956                 * of left to us works, we're done.
3957                 */
3958                if (xfs_btree_get_numrecs(left) - 1 >=
3959                    cur->bc_ops->get_minrecs(tcur, level)) {
3960                        error = xfs_btree_rshift(tcur, level, &i);
3961                        if (error)
3962                                goto error0;
3963                        if (i) {
3964                                ASSERT(xfs_btree_get_numrecs(block) >=
3965                                       cur->bc_ops->get_minrecs(tcur, level));
3966                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3967                                tcur = NULL;
3968                                if (level == 0)
3969                                        cur->bc_levels[0].ptr++;
3970
3971                                *stat = 1;
3972                                return 0;
3973                        }
3974                }
3975
3976                /*
3977                 * Otherwise, grab the number of records in right for
3978                 * future reference.
3979                 */
3980                lrecs = xfs_btree_get_numrecs(left);
3981        }
3982
3983        /* Delete the temp cursor, we're done with it. */
3984        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3985        tcur = NULL;
3986
3987        /* If here, we need to do a join to keep the tree balanced. */
3988        ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3989
3990        if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3991            lrecs + xfs_btree_get_numrecs(block) <=
3992                        cur->bc_ops->get_maxrecs(cur, level)) {
3993                /*
3994                 * Set "right" to be the starting block,
3995                 * "left" to be the left neighbor.
3996                 */
3997                rptr = cptr;
3998                right = block;
3999                rbp = bp;
4000                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
4001                if (error)
4002                        goto error0;
4003
4004        /*
4005         * If that won't work, see if we can join with the right neighbor block.
4006         */
4007        } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4008                   rrecs + xfs_btree_get_numrecs(block) <=
4009                        cur->bc_ops->get_maxrecs(cur, level)) {
4010                /*
4011                 * Set "left" to be the starting block,
4012                 * "right" to be the right neighbor.
4013                 */
4014                lptr = cptr;
4015                left = block;
4016                lbp = bp;
4017                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4018                if (error)
4019                        goto error0;
4020
4021        /*
4022         * Otherwise, we can't fix the imbalance.
4023         * Just return.  This is probably a logic error, but it's not fatal.
4024         */
4025        } else {
4026                error = xfs_btree_dec_cursor(cur, level, stat);
4027                if (error)
4028                        goto error0;
4029                return 0;
4030        }
4031
4032        rrecs = xfs_btree_get_numrecs(right);
4033        lrecs = xfs_btree_get_numrecs(left);
4034
4035        /*
4036         * We're now going to join "left" and "right" by moving all the stuff
4037         * in "right" to "left" and deleting "right".
4038         */
4039        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4040        if (level > 0) {
4041                /* It's a non-leaf.  Move keys and pointers. */
4042                union xfs_btree_key     *lkp;   /* left btree key */
4043                union xfs_btree_ptr     *lpp;   /* left address pointer */
4044                union xfs_btree_key     *rkp;   /* right btree key */
4045                union xfs_btree_ptr     *rpp;   /* right address pointer */
4046
4047                lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4048                lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4049                rkp = xfs_btree_key_addr(cur, 1, right);
4050                rpp = xfs_btree_ptr_addr(cur, 1, right);
4051
4052                for (i = 1; i < rrecs; i++) {
4053                        error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
4054                        if (error)
4055                                goto error0;
4056                }
4057
4058                xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4059                xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4060
4061                xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4062                xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4063        } else {
4064                /* It's a leaf.  Move records.  */
4065                union xfs_btree_rec     *lrp;   /* left record pointer */
4066                union xfs_btree_rec     *rrp;   /* right record pointer */
4067
4068                lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4069                rrp = xfs_btree_rec_addr(cur, 1, right);
4070
4071                xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4072                xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4073        }
4074
4075        XFS_BTREE_STATS_INC(cur, join);
4076
4077        /*
4078         * Fix up the number of records and right block pointer in the
4079         * surviving block, and log it.
4080         */
4081        xfs_btree_set_numrecs(left, lrecs + rrecs);
4082        xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
4083        xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4084        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4085
4086        /* If there is a right sibling, point it to the remaining block. */
4087        xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4088        if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4089                error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4090                if (error)
4091                        goto error0;
4092                xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4093                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4094        }
4095
4096        /* Free the deleted block. */
4097        error = xfs_btree_free_block(cur, rbp);
4098        if (error)
4099                goto error0;
4100
4101        /*
4102         * If we joined with the left neighbor, set the buffer in the
4103         * cursor to the left block, and fix up the index.
4104         */
4105        if (bp != lbp) {
4106                cur->bc_levels[level].bp = lbp;
4107                cur->bc_levels[level].ptr += lrecs;
4108                cur->bc_levels[level].ra = 0;
4109        }
4110        /*
4111         * If we joined with the right neighbor and there's a level above
4112         * us, increment the cursor at that level.
4113         */
4114        else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
4115                   (level + 1 < cur->bc_nlevels)) {
4116                error = xfs_btree_increment(cur, level + 1, &i);
4117                if (error)
4118                        goto error0;
4119        }
4120
4121        /*
4122         * Readjust the ptr at this level if it's not a leaf, since it's
4123         * still pointing at the deletion point, which makes the cursor
4124         * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4125         * We can't use decrement because it would change the next level up.
4126         */
4127        if (level > 0)
4128                cur->bc_levels[level].ptr--;
4129
4130        /*
4131         * We combined blocks, so we have to update the parent keys if the
4132         * btree supports overlapped intervals.  However,
4133         * bc_levels[level + 1].ptr points to the old block so that the caller
4134         * knows which record to delete.  Therefore, the caller must be savvy
4135         * enough to call updkeys for us if we return stat == 2.  The other
4136         * exit points from this function don't require deletions further up
4137         * the tree, so they can call updkeys directly.
4138         */
4139
4140        /* Return value means the next level up has something to do. */
4141        *stat = 2;
4142        return 0;
4143
4144error0:
4145        if (tcur)
4146                xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4147        return error;
4148}
4149
4150/*
4151 * Delete the record pointed to by cur.
4152 * The cursor refers to the place where the record was (could be inserted)
4153 * when the operation returns.
4154 */
4155int                                     /* error */
4156xfs_btree_delete(
4157        struct xfs_btree_cur    *cur,
4158        int                     *stat)  /* success/failure */
4159{
4160        int                     error;  /* error return value */
4161        int                     level;
4162        int                     i;
4163        bool                    joined = false;
4164
4165        /*
4166         * Go up the tree, starting at leaf level.
4167         *
4168         * If 2 is returned then a join was done; go to the next level.
4169         * Otherwise we are done.
4170         */
4171        for (level = 0, i = 2; i == 2; level++) {
4172                error = xfs_btree_delrec(cur, level, &i);
4173                if (error)
4174                        goto error0;
4175                if (i == 2)
4176                        joined = true;
4177        }
4178
4179        /*
4180         * If we combined blocks as part of deleting the record, delrec won't
4181         * have updated the parent high keys so we have to do that here.
4182         */
4183        if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
4184                error = xfs_btree_updkeys_force(cur, 0);
4185                if (error)
4186                        goto error0;
4187        }
4188
4189        if (i == 0) {
4190                for (level = 1; level < cur->bc_nlevels; level++) {
4191                        if (cur->bc_levels[level].ptr == 0) {
4192                                error = xfs_btree_decrement(cur, level, &i);
4193                                if (error)
4194                                        goto error0;
4195                                break;
4196                        }
4197                }
4198        }
4199
4200        *stat = i;
4201        return 0;
4202error0:
4203        return error;
4204}
4205
4206/*
4207 * Get the data from the pointed-to record.
4208 */
4209int                                     /* error */
4210xfs_btree_get_rec(
4211        struct xfs_btree_cur    *cur,   /* btree cursor */
4212        union xfs_btree_rec     **recp, /* output: btree record */
4213        int                     *stat)  /* output: success/failure */
4214{
4215        struct xfs_btree_block  *block; /* btree block */
4216        struct xfs_buf          *bp;    /* buffer pointer */
4217        int                     ptr;    /* record number */
4218#ifdef DEBUG
4219        int                     error;  /* error return value */
4220#endif
4221
4222        ptr = cur->bc_levels[0].ptr;
4223        block = xfs_btree_get_block(cur, 0, &bp);
4224
4225#ifdef DEBUG
4226        error = xfs_btree_check_block(cur, block, 0, bp);
4227        if (error)
4228                return error;
4229#endif
4230
4231        /*
4232         * Off the right end or left end, return failure.
4233         */
4234        if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4235                *stat = 0;
4236                return 0;
4237        }
4238
4239        /*
4240         * Point to the record and extract its data.
4241         */
4242        *recp = xfs_btree_rec_addr(cur, ptr, block);
4243        *stat = 1;
4244        return 0;
4245}
4246
4247/* Visit a block in a btree. */
4248STATIC int
4249xfs_btree_visit_block(
4250        struct xfs_btree_cur            *cur,
4251        int                             level,
4252        xfs_btree_visit_blocks_fn       fn,
4253        void                            *data)
4254{
4255        struct xfs_btree_block          *block;
4256        struct xfs_buf                  *bp;
4257        union xfs_btree_ptr             rptr;
4258        int                             error;
4259
4260        /* do right sibling readahead */
4261        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4262        block = xfs_btree_get_block(cur, level, &bp);
4263
4264        /* process the block */
4265        error = fn(cur, level, data);
4266        if (error)
4267                return error;
4268
4269        /* now read rh sibling block for next iteration */
4270        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4271        if (xfs_btree_ptr_is_null(cur, &rptr))
4272                return -ENOENT;
4273
4274        return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4275}
4276
4277
4278/* Visit every block in a btree. */
4279int
4280xfs_btree_visit_blocks(
4281        struct xfs_btree_cur            *cur,
4282        xfs_btree_visit_blocks_fn       fn,
4283        unsigned int                    flags,
4284        void                            *data)
4285{
4286        union xfs_btree_ptr             lptr;
4287        int                             level;
4288        struct xfs_btree_block          *block = NULL;
4289        int                             error = 0;
4290
4291        cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4292
4293        /* for each level */
4294        for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4295                /* grab the left hand block */
4296                error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4297                if (error)
4298                        return error;
4299
4300                /* readahead the left most block for the next level down */
4301                if (level > 0) {
4302                        union xfs_btree_ptr     *ptr;
4303
4304                        ptr = xfs_btree_ptr_addr(cur, 1, block);
4305                        xfs_btree_readahead_ptr(cur, ptr, 1);
4306
4307                        /* save for the next iteration of the loop */
4308                        xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4309
4310                        if (!(flags & XFS_BTREE_VISIT_LEAVES))
4311                                continue;
4312                } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
4313                        continue;
4314                }
4315
4316                /* for each buffer in the level */
4317                do {
4318                        error = xfs_btree_visit_block(cur, level, fn, data);
4319                } while (!error);
4320
4321                if (error != -ENOENT)
4322                        return error;
4323        }
4324
4325        return 0;
4326}
4327
4328/*
4329 * Change the owner of a btree.
4330 *
4331 * The mechanism we use here is ordered buffer logging. Because we don't know
4332 * how many buffers were are going to need to modify, we don't really want to
4333 * have to make transaction reservations for the worst case of every buffer in a
4334 * full size btree as that may be more space that we can fit in the log....
4335 *
4336 * We do the btree walk in the most optimal manner possible - we have sibling
4337 * pointers so we can just walk all the blocks on each level from left to right
4338 * in a single pass, and then move to the next level and do the same. We can
4339 * also do readahead on the sibling pointers to get IO moving more quickly,
4340 * though for slow disks this is unlikely to make much difference to performance
4341 * as the amount of CPU work we have to do before moving to the next block is
4342 * relatively small.
4343 *
4344 * For each btree block that we load, modify the owner appropriately, set the
4345 * buffer as an ordered buffer and log it appropriately. We need to ensure that
4346 * we mark the region we change dirty so that if the buffer is relogged in
4347 * a subsequent transaction the changes we make here as an ordered buffer are
4348 * correctly relogged in that transaction.  If we are in recovery context, then
4349 * just queue the modified buffer as delayed write buffer so the transaction
4350 * recovery completion writes the changes to disk.
4351 */
4352struct xfs_btree_block_change_owner_info {
4353        uint64_t                new_owner;
4354        struct list_head        *buffer_list;
4355};
4356
4357static int
4358xfs_btree_block_change_owner(
4359        struct xfs_btree_cur    *cur,
4360        int                     level,
4361        void                    *data)
4362{
4363        struct xfs_btree_block_change_owner_info        *bbcoi = data;
4364        struct xfs_btree_block  *block;
4365        struct xfs_buf          *bp;
4366
4367        /* modify the owner */
4368        block = xfs_btree_get_block(cur, level, &bp);
4369        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
4370                if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
4371                        return 0;
4372                block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4373        } else {
4374                if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
4375                        return 0;
4376                block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4377        }
4378
4379        /*
4380         * If the block is a root block hosted in an inode, we might not have a
4381         * buffer pointer here and we shouldn't attempt to log the change as the
4382         * information is already held in the inode and discarded when the root
4383         * block is formatted into the on-disk inode fork. We still change it,
4384         * though, so everything is consistent in memory.
4385         */
4386        if (!bp) {
4387                ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4388                ASSERT(level == cur->bc_nlevels - 1);
4389                return 0;
4390        }
4391
4392        if (cur->bc_tp) {
4393                if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
4394                        xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4395                        return -EAGAIN;
4396                }
4397        } else {
4398                xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4399        }
4400
4401        return 0;
4402}
4403
4404int
4405xfs_btree_change_owner(
4406        struct xfs_btree_cur    *cur,
4407        uint64_t                new_owner,
4408        struct list_head        *buffer_list)
4409{
4410        struct xfs_btree_block_change_owner_info        bbcoi;
4411
4412        bbcoi.new_owner = new_owner;
4413        bbcoi.buffer_list = buffer_list;
4414
4415        return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4416                        XFS_BTREE_VISIT_ALL, &bbcoi);
4417}
4418
4419/* Verify the v5 fields of a long-format btree block. */
4420xfs_failaddr_t
4421xfs_btree_lblock_v5hdr_verify(
4422        struct xfs_buf          *bp,
4423        uint64_t                owner)
4424{
4425        struct xfs_mount        *mp = bp->b_mount;
4426        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4427
4428        if (!xfs_has_crc(mp))
4429                return __this_address;
4430        if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
4431                return __this_address;
4432        if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
4433                return __this_address;
4434        if (owner != XFS_RMAP_OWN_UNKNOWN &&
4435            be64_to_cpu(block->bb_u.l.bb_owner) != owner)
4436                return __this_address;
4437        return NULL;
4438}
4439
4440/* Verify a long-format btree block. */
4441xfs_failaddr_t
4442xfs_btree_lblock_verify(
4443        struct xfs_buf          *bp,
4444        unsigned int            max_recs)
4445{
4446        struct xfs_mount        *mp = bp->b_mount;
4447        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4448
4449        /* numrecs verification */
4450        if (be16_to_cpu(block->bb_numrecs) > max_recs)
4451                return __this_address;
4452
4453        /* sibling pointer verification */
4454        if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
4455            !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
4456                return __this_address;
4457        if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
4458            !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
4459                return __this_address;
4460
4461        return NULL;
4462}
4463
4464/**
4465 * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4466 *                                    btree block
4467 *
4468 * @bp: buffer containing the btree block
4469 */
4470xfs_failaddr_t
4471xfs_btree_sblock_v5hdr_verify(
4472        struct xfs_buf          *bp)
4473{
4474        struct xfs_mount        *mp = bp->b_mount;
4475        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4476        struct xfs_perag        *pag = bp->b_pag;
4477
4478        if (!xfs_has_crc(mp))
4479                return __this_address;
4480        if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4481                return __this_address;
4482        if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
4483                return __this_address;
4484        if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4485                return __this_address;
4486        return NULL;
4487}
4488
4489/**
4490 * xfs_btree_sblock_verify() -- verify a short-format btree block
4491 *
4492 * @bp: buffer containing the btree block
4493 * @max_recs: maximum records allowed in this btree node
4494 */
4495xfs_failaddr_t
4496xfs_btree_sblock_verify(
4497        struct xfs_buf          *bp,
4498        unsigned int            max_recs)
4499{
4500        struct xfs_mount        *mp = bp->b_mount;
4501        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4502        xfs_agblock_t           agno;
4503
4504        /* numrecs verification */
4505        if (be16_to_cpu(block->bb_numrecs) > max_recs)
4506                return __this_address;
4507
4508        /* sibling pointer verification */
4509        agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
4510        if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
4511            !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
4512                return __this_address;
4513        if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
4514            !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
4515                return __this_address;
4516
4517        return NULL;
4518}
4519
4520/*
4521 * For the given limits on leaf and keyptr records per block, calculate the
4522 * height of the tree needed to index the number of leaf records.
4523 */
4524unsigned int
4525xfs_btree_compute_maxlevels(
4526        const unsigned int      *limits,
4527        unsigned long long      records)
4528{
4529        unsigned long long      level_blocks = howmany_64(records, limits[0]);
4530        unsigned int            height = 1;
4531
4532        while (level_blocks > 1) {
4533                level_blocks = howmany_64(level_blocks, limits[1]);
4534                height++;
4535        }
4536
4537        return height;
4538}
4539
4540/*
4541 * For the given limits on leaf and keyptr records per block, calculate the
4542 * number of blocks needed to index the given number of leaf records.
4543 */
4544unsigned long long
4545xfs_btree_calc_size(
4546        const unsigned int      *limits,
4547        unsigned long long      records)
4548{
4549        unsigned long long      level_blocks = howmany_64(records, limits[0]);
4550        unsigned long long      blocks = level_blocks;
4551
4552        while (level_blocks > 1) {
4553                level_blocks = howmany_64(level_blocks, limits[1]);
4554                blocks += level_blocks;
4555        }
4556
4557        return blocks;
4558}
4559
4560/*
4561 * Given a number of available blocks for the btree to consume with records and
4562 * pointers, calculate the height of the tree needed to index all the records
4563 * that space can hold based on the number of pointers each interior node
4564 * holds.
4565 *
4566 * We start by assuming a single level tree consumes a single block, then track
4567 * the number of blocks each node level consumes until we no longer have space
4568 * to store the next node level. At this point, we are indexing all the leaf
4569 * blocks in the space, and there's no more free space to split the tree any
4570 * further. That's our maximum btree height.
4571 */
4572unsigned int
4573xfs_btree_space_to_height(
4574        const unsigned int      *limits,
4575        unsigned long long      leaf_blocks)
4576{
4577        unsigned long long      node_blocks = limits[1];
4578        unsigned long long      blocks_left = leaf_blocks - 1;
4579        unsigned int            height = 1;
4580
4581        if (leaf_blocks < 1)
4582                return 0;
4583
4584        while (node_blocks < blocks_left) {
4585                blocks_left -= node_blocks;
4586                node_blocks *= limits[1];
4587                height++;
4588        }
4589
4590        return height;
4591}
4592
4593/*
4594 * Query a regular btree for all records overlapping a given interval.
4595 * Start with a LE lookup of the key of low_rec and return all records
4596 * until we find a record with a key greater than the key of high_rec.
4597 */
4598STATIC int
4599xfs_btree_simple_query_range(
4600        struct xfs_btree_cur            *cur,
4601        const union xfs_btree_key       *low_key,
4602        const union xfs_btree_key       *high_key,
4603        xfs_btree_query_range_fn        fn,
4604        void                            *priv)
4605{
4606        union xfs_btree_rec             *recp;
4607        union xfs_btree_key             rec_key;
4608        int64_t                         diff;
4609        int                             stat;
4610        bool                            firstrec = true;
4611        int                             error;
4612
4613        ASSERT(cur->bc_ops->init_high_key_from_rec);
4614        ASSERT(cur->bc_ops->diff_two_keys);
4615
4616        /*
4617         * Find the leftmost record.  The btree cursor must be set
4618         * to the low record used to generate low_key.
4619         */
4620        stat = 0;
4621        error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
4622        if (error)
4623                goto out;
4624
4625        /* Nothing?  See if there's anything to the right. */
4626        if (!stat) {
4627                error = xfs_btree_increment(cur, 0, &stat);
4628                if (error)
4629                        goto out;
4630        }
4631
4632        while (stat) {
4633                /* Find the record. */
4634                error = xfs_btree_get_rec(cur, &recp, &stat);
4635                if (error || !stat)
4636                        break;
4637
4638                /* Skip if high_key(rec) < low_key. */
4639                if (firstrec) {
4640                        cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4641                        firstrec = false;
4642                        diff = cur->bc_ops->diff_two_keys(cur, low_key,
4643                                        &rec_key);
4644                        if (diff > 0)
4645                                goto advloop;
4646                }
4647
4648                /* Stop if high_key < low_key(rec). */
4649                cur->bc_ops->init_key_from_rec(&rec_key, recp);
4650                diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
4651                if (diff > 0)
4652                        break;
4653
4654                /* Callback */
4655                error = fn(cur, recp, priv);
4656                if (error)
4657                        break;
4658
4659advloop:
4660                /* Move on to the next record. */
4661                error = xfs_btree_increment(cur, 0, &stat);
4662                if (error)
4663                        break;
4664        }
4665
4666out:
4667        return error;
4668}
4669
4670/*
4671 * Query an overlapped interval btree for all records overlapping a given
4672 * interval.  This function roughly follows the algorithm given in
4673 * "Interval Trees" of _Introduction to Algorithms_, which is section
4674 * 14.3 in the 2nd and 3rd editions.
4675 *
4676 * First, generate keys for the low and high records passed in.
4677 *
4678 * For any leaf node, generate the high and low keys for the record.
4679 * If the record keys overlap with the query low/high keys, pass the
4680 * record to the function iterator.
4681 *
4682 * For any internal node, compare the low and high keys of each
4683 * pointer against the query low/high keys.  If there's an overlap,
4684 * follow the pointer.
4685 *
4686 * As an optimization, we stop scanning a block when we find a low key
4687 * that is greater than the query's high key.
4688 */
4689STATIC int
4690xfs_btree_overlapped_query_range(
4691        struct xfs_btree_cur            *cur,
4692        const union xfs_btree_key       *low_key,
4693        const union xfs_btree_key       *high_key,
4694        xfs_btree_query_range_fn        fn,
4695        void                            *priv)
4696{
4697        union xfs_btree_ptr             ptr;
4698        union xfs_btree_ptr             *pp;
4699        union xfs_btree_key             rec_key;
4700        union xfs_btree_key             rec_hkey;
4701        union xfs_btree_key             *lkp;
4702        union xfs_btree_key             *hkp;
4703        union xfs_btree_rec             *recp;
4704        struct xfs_btree_block          *block;
4705        int64_t                         ldiff;
4706        int64_t                         hdiff;
4707        int                             level;
4708        struct xfs_buf                  *bp;
4709        int                             i;
4710        int                             error;
4711
4712        /* Load the root of the btree. */
4713        level = cur->bc_nlevels - 1;
4714        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
4715        error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
4716        if (error)
4717                return error;
4718        xfs_btree_get_block(cur, level, &bp);
4719        trace_xfs_btree_overlapped_query_range(cur, level, bp);
4720#ifdef DEBUG
4721        error = xfs_btree_check_block(cur, block, level, bp);
4722        if (error)
4723                goto out;
4724#endif
4725        cur->bc_levels[level].ptr = 1;
4726
4727        while (level < cur->bc_nlevels) {
4728                block = xfs_btree_get_block(cur, level, &bp);
4729
4730                /* End of node, pop back towards the root. */
4731                if (cur->bc_levels[level].ptr >
4732                                        be16_to_cpu(block->bb_numrecs)) {
4733pop_up:
4734                        if (level < cur->bc_nlevels - 1)
4735                                cur->bc_levels[level + 1].ptr++;
4736                        level++;
4737                        continue;
4738                }
4739
4740                if (level == 0) {
4741                        /* Handle a leaf node. */
4742                        recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
4743                                        block);
4744
4745                        cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
4746                        ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
4747                                        low_key);
4748
4749                        cur->bc_ops->init_key_from_rec(&rec_key, recp);
4750                        hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
4751                                        &rec_key);
4752
4753                        /*
4754                         * If (record's high key >= query's low key) and
4755                         *    (query's high key >= record's low key), then
4756                         * this record overlaps the query range; callback.
4757                         */
4758                        if (ldiff >= 0 && hdiff >= 0) {
4759                                error = fn(cur, recp, priv);
4760                                if (error)
4761                                        break;
4762                        } else if (hdiff < 0) {
4763                                /* Record is larger than high key; pop. */
4764                                goto pop_up;
4765                        }
4766                        cur->bc_levels[level].ptr++;
4767                        continue;
4768                }
4769
4770                /* Handle an internal node. */
4771                lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
4772                hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr,
4773                                block);
4774                pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
4775
4776                ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
4777                hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
4778
4779                /*
4780                 * If (pointer's high key >= query's low key) and
4781                 *    (query's high key >= pointer's low key), then
4782                 * this record overlaps the query range; follow pointer.
4783                 */
4784                if (ldiff >= 0 && hdiff >= 0) {
4785                        level--;
4786                        error = xfs_btree_lookup_get_block(cur, level, pp,
4787                                        &block);
4788                        if (error)
4789                                goto out;
4790                        xfs_btree_get_block(cur, level, &bp);
4791                        trace_xfs_btree_overlapped_query_range(cur, level, bp);
4792#ifdef DEBUG
4793                        error = xfs_btree_check_block(cur, block, level, bp);
4794                        if (error)
4795                                goto out;
4796#endif
4797                        cur->bc_levels[level].ptr = 1;
4798                        continue;
4799                } else if (hdiff < 0) {
4800                        /* The low key is larger than the upper range; pop. */
4801                        goto pop_up;
4802                }
4803                cur->bc_levels[level].ptr++;
4804        }
4805
4806out:
4807        /*
4808         * If we don't end this function with the cursor pointing at a record
4809         * block, a subsequent non-error cursor deletion will not release
4810         * node-level buffers, causing a buffer leak.  This is quite possible
4811         * with a zero-results range query, so release the buffers if we
4812         * failed to return any results.
4813         */
4814        if (cur->bc_levels[0].bp == NULL) {
4815                for (i = 0; i < cur->bc_nlevels; i++) {
4816                        if (cur->bc_levels[i].bp) {
4817                                xfs_trans_brelse(cur->bc_tp,
4818                                                cur->bc_levels[i].bp);
4819                                cur->bc_levels[i].bp = NULL;
4820                                cur->bc_levels[i].ptr = 0;
4821                                cur->bc_levels[i].ra = 0;
4822                        }
4823                }
4824        }
4825
4826        return error;
4827}
4828
4829/*
4830 * Query a btree for all records overlapping a given interval of keys.  The
4831 * supplied function will be called with each record found; return one of the
4832 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4833 * code.  This function returns -ECANCELED, zero, or a negative error code.
4834 */
4835int
4836xfs_btree_query_range(
4837        struct xfs_btree_cur            *cur,
4838        const union xfs_btree_irec      *low_rec,
4839        const union xfs_btree_irec      *high_rec,
4840        xfs_btree_query_range_fn        fn,
4841        void                            *priv)
4842{
4843        union xfs_btree_rec             rec;
4844        union xfs_btree_key             low_key;
4845        union xfs_btree_key             high_key;
4846
4847        /* Find the keys of both ends of the interval. */
4848        cur->bc_rec = *high_rec;
4849        cur->bc_ops->init_rec_from_cur(cur, &rec);
4850        cur->bc_ops->init_key_from_rec(&high_key, &rec);
4851
4852        cur->bc_rec = *low_rec;
4853        cur->bc_ops->init_rec_from_cur(cur, &rec);
4854        cur->bc_ops->init_key_from_rec(&low_key, &rec);
4855
4856        /* Enforce low key < high key. */
4857        if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
4858                return -EINVAL;
4859
4860        if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
4861                return xfs_btree_simple_query_range(cur, &low_key,
4862                                &high_key, fn, priv);
4863        return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4864                        fn, priv);
4865}
4866
4867/* Query a btree for all records. */
4868int
4869xfs_btree_query_all(
4870        struct xfs_btree_cur            *cur,
4871        xfs_btree_query_range_fn        fn,
4872        void                            *priv)
4873{
4874        union xfs_btree_key             low_key;
4875        union xfs_btree_key             high_key;
4876
4877        memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
4878        memset(&low_key, 0, sizeof(low_key));
4879        memset(&high_key, 0xFF, sizeof(high_key));
4880
4881        return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
4882}
4883
4884static int
4885xfs_btree_count_blocks_helper(
4886        struct xfs_btree_cur    *cur,
4887        int                     level,
4888        void                    *data)
4889{
4890        xfs_extlen_t            *blocks = data;
4891        (*blocks)++;
4892
4893        return 0;
4894}
4895
4896/* Count the blocks in a btree and return the result in *blocks. */
4897int
4898xfs_btree_count_blocks(
4899        struct xfs_btree_cur    *cur,
4900        xfs_extlen_t            *blocks)
4901{
4902        *blocks = 0;
4903        return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4904                        XFS_BTREE_VISIT_ALL, blocks);
4905}
4906
4907/* Compare two btree pointers. */
4908int64_t
4909xfs_btree_diff_two_ptrs(
4910        struct xfs_btree_cur            *cur,
4911        const union xfs_btree_ptr       *a,
4912        const union xfs_btree_ptr       *b)
4913{
4914        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4915                return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
4916        return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
4917}
4918
4919/* If there's an extent, we're done. */
4920STATIC int
4921xfs_btree_has_record_helper(
4922        struct xfs_btree_cur            *cur,
4923        const union xfs_btree_rec       *rec,
4924        void                            *priv)
4925{
4926        return -ECANCELED;
4927}
4928
4929/* Is there a record covering a given range of keys? */
4930int
4931xfs_btree_has_record(
4932        struct xfs_btree_cur            *cur,
4933        const union xfs_btree_irec      *low,
4934        const union xfs_btree_irec      *high,
4935        bool                            *exists)
4936{
4937        int                             error;
4938
4939        error = xfs_btree_query_range(cur, low, high,
4940                        &xfs_btree_has_record_helper, NULL);
4941        if (error == -ECANCELED) {
4942                *exists = true;
4943                return 0;
4944        }
4945        *exists = false;
4946        return error;
4947}
4948
4949/* Are there more records in this btree? */
4950bool
4951xfs_btree_has_more_records(
4952        struct xfs_btree_cur    *cur)
4953{
4954        struct xfs_btree_block  *block;
4955        struct xfs_buf          *bp;
4956
4957        block = xfs_btree_get_block(cur, 0, &bp);
4958
4959        /* There are still records in this block. */
4960        if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block))
4961                return true;
4962
4963        /* There are more record blocks. */
4964        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4965                return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
4966        else
4967                return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
4968}
4969
4970/* Set up all the btree cursor caches. */
4971int __init
4972xfs_btree_init_cur_caches(void)
4973{
4974        int             error;
4975
4976        error = xfs_allocbt_init_cur_cache();
4977        if (error)
4978                return error;
4979        error = xfs_inobt_init_cur_cache();
4980        if (error)
4981                goto err;
4982        error = xfs_bmbt_init_cur_cache();
4983        if (error)
4984                goto err;
4985        error = xfs_rmapbt_init_cur_cache();
4986        if (error)
4987                goto err;
4988        error = xfs_refcountbt_init_cur_cache();
4989        if (error)
4990                goto err;
4991
4992        return 0;
4993err:
4994        xfs_btree_destroy_cur_caches();
4995        return error;
4996}
4997
4998/* Destroy all the btree cursor caches, if they've been allocated. */
4999void
5000xfs_btree_destroy_cur_caches(void)
5001{
5002        xfs_allocbt_destroy_cur_cache();
5003        xfs_inobt_destroy_cur_cache();
5004        xfs_bmbt_destroy_cur_cache();
5005        xfs_rmapbt_destroy_cur_cache();
5006        xfs_refcountbt_destroy_cur_cache();
5007}
5008