linux/fs/xfs/libxfs/xfs_btree.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   4 * All Rights Reserved.
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_log_format.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_bit.h"
  13#include "xfs_mount.h"
  14#include "xfs_inode.h"
  15#include "xfs_trans.h"
  16#include "xfs_buf_item.h"
  17#include "xfs_btree.h"
  18#include "xfs_errortag.h"
  19#include "xfs_error.h"
  20#include "xfs_trace.h"
  21#include "xfs_alloc.h"
  22#include "xfs_log.h"
  23#include "xfs_btree_staging.h"
  24
  25/*
  26 * Cursor allocation zone.
  27 */
  28kmem_zone_t     *xfs_btree_cur_zone;
  29
  30/*
  31 * Btree magic numbers.
  32 */
  33static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
  34        { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
  35          XFS_FIBT_MAGIC, 0 },
  36        { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
  37          XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
  38          XFS_REFC_CRC_MAGIC }
  39};
  40
  41uint32_t
  42xfs_btree_magic(
  43        int                     crc,
  44        xfs_btnum_t             btnum)
  45{
  46        uint32_t                magic = xfs_magics[crc][btnum];
  47
  48        /* Ensure we asked for crc for crc-only magics. */
  49        ASSERT(magic != 0);
  50        return magic;
  51}
  52
  53/*
  54 * Check a long btree block header.  Return the address of the failing check,
  55 * or NULL if everything is ok.
  56 */
  57xfs_failaddr_t
  58__xfs_btree_check_lblock(
  59        struct xfs_btree_cur    *cur,
  60        struct xfs_btree_block  *block,
  61        int                     level,
  62        struct xfs_buf          *bp)
  63{
  64        struct xfs_mount        *mp = cur->bc_mp;
  65        xfs_btnum_t             btnum = cur->bc_btnum;
  66        int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
  67
  68        if (crc) {
  69                if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
  70                        return __this_address;
  71                if (block->bb_u.l.bb_blkno !=
  72                    cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
  73                        return __this_address;
  74                if (block->bb_u.l.bb_pad != cpu_to_be32(0))
  75                        return __this_address;
  76        }
  77
  78        if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
  79                return __this_address;
  80        if (be16_to_cpu(block->bb_level) != level)
  81                return __this_address;
  82        if (be16_to_cpu(block->bb_numrecs) >
  83            cur->bc_ops->get_maxrecs(cur, level))
  84                return __this_address;
  85        if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
  86            !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
  87                        level + 1))
  88                return __this_address;
  89        if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
  90            !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
  91                        level + 1))
  92                return __this_address;
  93
  94        return NULL;
  95}
  96
  97/* Check a long btree block header. */
  98static int
  99xfs_btree_check_lblock(
 100        struct xfs_btree_cur    *cur,
 101        struct xfs_btree_block  *block,
 102        int                     level,
 103        struct xfs_buf          *bp)
 104{
 105        struct xfs_mount        *mp = cur->bc_mp;
 106        xfs_failaddr_t          fa;
 107
 108        fa = __xfs_btree_check_lblock(cur, block, level, bp);
 109        if (XFS_IS_CORRUPT(mp, fa != NULL) ||
 110            XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
 111                if (bp)
 112                        trace_xfs_btree_corrupt(bp, _RET_IP_);
 113                return -EFSCORRUPTED;
 114        }
 115        return 0;
 116}
 117
 118/*
 119 * Check a short btree block header.  Return the address of the failing check,
 120 * or NULL if everything is ok.
 121 */
 122xfs_failaddr_t
 123__xfs_btree_check_sblock(
 124        struct xfs_btree_cur    *cur,
 125        struct xfs_btree_block  *block,
 126        int                     level,
 127        struct xfs_buf          *bp)
 128{
 129        struct xfs_mount        *mp = cur->bc_mp;
 130        xfs_btnum_t             btnum = cur->bc_btnum;
 131        int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
 132
 133        if (crc) {
 134                if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
 135                        return __this_address;
 136                if (block->bb_u.s.bb_blkno !=
 137                    cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
 138                        return __this_address;
 139        }
 140
 141        if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
 142                return __this_address;
 143        if (be16_to_cpu(block->bb_level) != level)
 144                return __this_address;
 145        if (be16_to_cpu(block->bb_numrecs) >
 146            cur->bc_ops->get_maxrecs(cur, level))
 147                return __this_address;
 148        if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
 149            !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
 150                        level + 1))
 151                return __this_address;
 152        if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
 153            !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
 154                        level + 1))
 155                return __this_address;
 156
 157        return NULL;
 158}
 159
 160/* Check a short btree block header. */
 161STATIC int
 162xfs_btree_check_sblock(
 163        struct xfs_btree_cur    *cur,
 164        struct xfs_btree_block  *block,
 165        int                     level,
 166        struct xfs_buf          *bp)
 167{
 168        struct xfs_mount        *mp = cur->bc_mp;
 169        xfs_failaddr_t          fa;
 170
 171        fa = __xfs_btree_check_sblock(cur, block, level, bp);
 172        if (XFS_IS_CORRUPT(mp, fa != NULL) ||
 173            XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
 174                if (bp)
 175                        trace_xfs_btree_corrupt(bp, _RET_IP_);
 176                return -EFSCORRUPTED;
 177        }
 178        return 0;
 179}
 180
 181/*
 182 * Debug routine: check that block header is ok.
 183 */
 184int
 185xfs_btree_check_block(
 186        struct xfs_btree_cur    *cur,   /* btree cursor */
 187        struct xfs_btree_block  *block, /* generic btree block pointer */
 188        int                     level,  /* level of the btree block */
 189        struct xfs_buf          *bp)    /* buffer containing block, if any */
 190{
 191        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 192                return xfs_btree_check_lblock(cur, block, level, bp);
 193        else
 194                return xfs_btree_check_sblock(cur, block, level, bp);
 195}
 196
 197/* Check that this long pointer is valid and points within the fs. */
 198bool
 199xfs_btree_check_lptr(
 200        struct xfs_btree_cur    *cur,
 201        xfs_fsblock_t           fsbno,
 202        int                     level)
 203{
 204        if (level <= 0)
 205                return false;
 206        return xfs_verify_fsbno(cur->bc_mp, fsbno);
 207}
 208
 209/* Check that this short pointer is valid and points within the AG. */
 210bool
 211xfs_btree_check_sptr(
 212        struct xfs_btree_cur    *cur,
 213        xfs_agblock_t           agbno,
 214        int                     level)
 215{
 216        if (level <= 0)
 217                return false;
 218        return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.agno, agbno);
 219}
 220
 221/*
 222 * Check that a given (indexed) btree pointer at a certain level of a
 223 * btree is valid and doesn't point past where it should.
 224 */
 225static int
 226xfs_btree_check_ptr(
 227        struct xfs_btree_cur    *cur,
 228        union xfs_btree_ptr     *ptr,
 229        int                     index,
 230        int                     level)
 231{
 232        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 233                if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
 234                                level))
 235                        return 0;
 236                xfs_err(cur->bc_mp,
 237"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
 238                                cur->bc_ino.ip->i_ino,
 239                                cur->bc_ino.whichfork, cur->bc_btnum,
 240                                level, index);
 241        } else {
 242                if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
 243                                level))
 244                        return 0;
 245                xfs_err(cur->bc_mp,
 246"AG %u: Corrupt btree %d pointer at level %d index %d.",
 247                                cur->bc_ag.agno, cur->bc_btnum,
 248                                level, index);
 249        }
 250
 251        return -EFSCORRUPTED;
 252}
 253
 254#ifdef DEBUG
 255# define xfs_btree_debug_check_ptr      xfs_btree_check_ptr
 256#else
 257# define xfs_btree_debug_check_ptr(...) (0)
 258#endif
 259
 260/*
 261 * Calculate CRC on the whole btree block and stuff it into the
 262 * long-form btree header.
 263 *
 264 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 265 * it into the buffer so recovery knows what the last modification was that made
 266 * it to disk.
 267 */
 268void
 269xfs_btree_lblock_calc_crc(
 270        struct xfs_buf          *bp)
 271{
 272        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 273        struct xfs_buf_log_item *bip = bp->b_log_item;
 274
 275        if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
 276                return;
 277        if (bip)
 278                block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 279        xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 280}
 281
 282bool
 283xfs_btree_lblock_verify_crc(
 284        struct xfs_buf          *bp)
 285{
 286        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 287        struct xfs_mount        *mp = bp->b_mount;
 288
 289        if (xfs_sb_version_hascrc(&mp->m_sb)) {
 290                if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
 291                        return false;
 292                return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 293        }
 294
 295        return true;
 296}
 297
 298/*
 299 * Calculate CRC on the whole btree block and stuff it into the
 300 * short-form btree header.
 301 *
 302 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 303 * it into the buffer so recovery knows what the last modification was that made
 304 * it to disk.
 305 */
 306void
 307xfs_btree_sblock_calc_crc(
 308        struct xfs_buf          *bp)
 309{
 310        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 311        struct xfs_buf_log_item *bip = bp->b_log_item;
 312
 313        if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
 314                return;
 315        if (bip)
 316                block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 317        xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 318}
 319
 320bool
 321xfs_btree_sblock_verify_crc(
 322        struct xfs_buf          *bp)
 323{
 324        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 325        struct xfs_mount        *mp = bp->b_mount;
 326
 327        if (xfs_sb_version_hascrc(&mp->m_sb)) {
 328                if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
 329                        return false;
 330                return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 331        }
 332
 333        return true;
 334}
 335
 336static int
 337xfs_btree_free_block(
 338        struct xfs_btree_cur    *cur,
 339        struct xfs_buf          *bp)
 340{
 341        int                     error;
 342
 343        error = cur->bc_ops->free_block(cur, bp);
 344        if (!error) {
 345                xfs_trans_binval(cur->bc_tp, bp);
 346                XFS_BTREE_STATS_INC(cur, free);
 347        }
 348        return error;
 349}
 350
 351/*
 352 * Delete the btree cursor.
 353 */
 354void
 355xfs_btree_del_cursor(
 356        xfs_btree_cur_t *cur,           /* btree cursor */
 357        int             error)          /* del because of error */
 358{
 359        int             i;              /* btree level */
 360
 361        /*
 362         * Clear the buffer pointers, and release the buffers.
 363         * If we're doing this in the face of an error, we
 364         * need to make sure to inspect all of the entries
 365         * in the bc_bufs array for buffers to be unlocked.
 366         * This is because some of the btree code works from
 367         * level n down to 0, and if we get an error along
 368         * the way we won't have initialized all the entries
 369         * down to 0.
 370         */
 371        for (i = 0; i < cur->bc_nlevels; i++) {
 372                if (cur->bc_bufs[i])
 373                        xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
 374                else if (!error)
 375                        break;
 376        }
 377        /*
 378         * Can't free a bmap cursor without having dealt with the
 379         * allocated indirect blocks' accounting.
 380         */
 381        ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
 382               cur->bc_ino.allocated == 0);
 383        /*
 384         * Free the cursor.
 385         */
 386        if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
 387                kmem_free((void *)cur->bc_ops);
 388        kmem_cache_free(xfs_btree_cur_zone, cur);
 389}
 390
 391/*
 392 * Duplicate the btree cursor.
 393 * Allocate a new one, copy the record, re-get the buffers.
 394 */
 395int                                     /* error */
 396xfs_btree_dup_cursor(
 397        xfs_btree_cur_t *cur,           /* input cursor */
 398        xfs_btree_cur_t **ncur)         /* output cursor */
 399{
 400        struct xfs_buf  *bp;            /* btree block's buffer pointer */
 401        int             error;          /* error return value */
 402        int             i;              /* level number of btree block */
 403        xfs_mount_t     *mp;            /* mount structure for filesystem */
 404        xfs_btree_cur_t *new;           /* new cursor value */
 405        xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
 406
 407        tp = cur->bc_tp;
 408        mp = cur->bc_mp;
 409
 410        /*
 411         * Allocate a new cursor like the old one.
 412         */
 413        new = cur->bc_ops->dup_cursor(cur);
 414
 415        /*
 416         * Copy the record currently in the cursor.
 417         */
 418        new->bc_rec = cur->bc_rec;
 419
 420        /*
 421         * For each level current, re-get the buffer and copy the ptr value.
 422         */
 423        for (i = 0; i < new->bc_nlevels; i++) {
 424                new->bc_ptrs[i] = cur->bc_ptrs[i];
 425                new->bc_ra[i] = cur->bc_ra[i];
 426                bp = cur->bc_bufs[i];
 427                if (bp) {
 428                        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 429                                                   XFS_BUF_ADDR(bp), mp->m_bsize,
 430                                                   0, &bp,
 431                                                   cur->bc_ops->buf_ops);
 432                        if (error) {
 433                                xfs_btree_del_cursor(new, error);
 434                                *ncur = NULL;
 435                                return error;
 436                        }
 437                }
 438                new->bc_bufs[i] = bp;
 439        }
 440        *ncur = new;
 441        return 0;
 442}
 443
 444/*
 445 * XFS btree block layout and addressing:
 446 *
 447 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 448 *
 449 * The leaf record start with a header then followed by records containing
 450 * the values.  A non-leaf block also starts with the same header, and
 451 * then first contains lookup keys followed by an equal number of pointers
 452 * to the btree blocks at the previous level.
 453 *
 454 *              +--------+-------+-------+-------+-------+-------+-------+
 455 * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 456 *              +--------+-------+-------+-------+-------+-------+-------+
 457 *
 458 *              +--------+-------+-------+-------+-------+-------+-------+
 459 * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 460 *              +--------+-------+-------+-------+-------+-------+-------+
 461 *
 462 * The header is called struct xfs_btree_block for reasons better left unknown
 463 * and comes in different versions for short (32bit) and long (64bit) block
 464 * pointers.  The record and key structures are defined by the btree instances
 465 * and opaque to the btree core.  The block pointers are simple disk endian
 466 * integers, available in a short (32bit) and long (64bit) variant.
 467 *
 468 * The helpers below calculate the offset of a given record, key or pointer
 469 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 470 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
 471 * inside the btree block is done using indices starting at one, not zero!
 472 *
 473 * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
 474 * overlapping intervals.  In such a tree, records are still sorted lowest to
 475 * highest and indexed by the smallest key value that refers to the record.
 476 * However, nodes are different: each pointer has two associated keys -- one
 477 * indexing the lowest key available in the block(s) below (the same behavior
 478 * as the key in a regular btree) and another indexing the highest key
 479 * available in the block(s) below.  Because records are /not/ sorted by the
 480 * highest key, all leaf block updates require us to compute the highest key
 481 * that matches any record in the leaf and to recursively update the high keys
 482 * in the nodes going further up in the tree, if necessary.  Nodes look like
 483 * this:
 484 *
 485 *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
 486 * Non-Leaf:    | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
 487 *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
 488 *
 489 * To perform an interval query on an overlapped tree, perform the usual
 490 * depth-first search and use the low and high keys to decide if we can skip
 491 * that particular node.  If a leaf node is reached, return the records that
 492 * intersect the interval.  Note that an interval query may return numerous
 493 * entries.  For a non-overlapped tree, simply search for the record associated
 494 * with the lowest key and iterate forward until a non-matching record is
 495 * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
 496 * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
 497 * more detail.
 498 *
 499 * Why do we care about overlapping intervals?  Let's say you have a bunch of
 500 * reverse mapping records on a reflink filesystem:
 501 *
 502 * 1: +- file A startblock B offset C length D -----------+
 503 * 2:      +- file E startblock F offset G length H --------------+
 504 * 3:      +- file I startblock F offset J length K --+
 505 * 4:                                                        +- file L... --+
 506 *
 507 * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
 508 * we'd simply increment the length of record 1.  But how do we find the record
 509 * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
 510 * record 3 because the keys are ordered first by startblock.  An interval
 511 * query would return records 1 and 2 because they both overlap (B+D-1), and
 512 * from that we can pick out record 1 as the appropriate left neighbor.
 513 *
 514 * In the non-overlapped case you can do a LE lookup and decrement the cursor
 515 * because a record's interval must end before the next record.
 516 */
 517
 518/*
 519 * Return size of the btree block header for this btree instance.
 520 */
 521static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
 522{
 523        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 524                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 525                        return XFS_BTREE_LBLOCK_CRC_LEN;
 526                return XFS_BTREE_LBLOCK_LEN;
 527        }
 528        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 529                return XFS_BTREE_SBLOCK_CRC_LEN;
 530        return XFS_BTREE_SBLOCK_LEN;
 531}
 532
 533/*
 534 * Return size of btree block pointers for this btree instance.
 535 */
 536static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
 537{
 538        return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
 539                sizeof(__be64) : sizeof(__be32);
 540}
 541
 542/*
 543 * Calculate offset of the n-th record in a btree block.
 544 */
 545STATIC size_t
 546xfs_btree_rec_offset(
 547        struct xfs_btree_cur    *cur,
 548        int                     n)
 549{
 550        return xfs_btree_block_len(cur) +
 551                (n - 1) * cur->bc_ops->rec_len;
 552}
 553
 554/*
 555 * Calculate offset of the n-th key in a btree block.
 556 */
 557STATIC size_t
 558xfs_btree_key_offset(
 559        struct xfs_btree_cur    *cur,
 560        int                     n)
 561{
 562        return xfs_btree_block_len(cur) +
 563                (n - 1) * cur->bc_ops->key_len;
 564}
 565
 566/*
 567 * Calculate offset of the n-th high key in a btree block.
 568 */
 569STATIC size_t
 570xfs_btree_high_key_offset(
 571        struct xfs_btree_cur    *cur,
 572        int                     n)
 573{
 574        return xfs_btree_block_len(cur) +
 575                (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
 576}
 577
 578/*
 579 * Calculate offset of the n-th block pointer in a btree block.
 580 */
 581STATIC size_t
 582xfs_btree_ptr_offset(
 583        struct xfs_btree_cur    *cur,
 584        int                     n,
 585        int                     level)
 586{
 587        return xfs_btree_block_len(cur) +
 588                cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
 589                (n - 1) * xfs_btree_ptr_len(cur);
 590}
 591
 592/*
 593 * Return a pointer to the n-th record in the btree block.
 594 */
 595union xfs_btree_rec *
 596xfs_btree_rec_addr(
 597        struct xfs_btree_cur    *cur,
 598        int                     n,
 599        struct xfs_btree_block  *block)
 600{
 601        return (union xfs_btree_rec *)
 602                ((char *)block + xfs_btree_rec_offset(cur, n));
 603}
 604
 605/*
 606 * Return a pointer to the n-th key in the btree block.
 607 */
 608union xfs_btree_key *
 609xfs_btree_key_addr(
 610        struct xfs_btree_cur    *cur,
 611        int                     n,
 612        struct xfs_btree_block  *block)
 613{
 614        return (union xfs_btree_key *)
 615                ((char *)block + xfs_btree_key_offset(cur, n));
 616}
 617
 618/*
 619 * Return a pointer to the n-th high key in the btree block.
 620 */
 621union xfs_btree_key *
 622xfs_btree_high_key_addr(
 623        struct xfs_btree_cur    *cur,
 624        int                     n,
 625        struct xfs_btree_block  *block)
 626{
 627        return (union xfs_btree_key *)
 628                ((char *)block + xfs_btree_high_key_offset(cur, n));
 629}
 630
 631/*
 632 * Return a pointer to the n-th block pointer in the btree block.
 633 */
 634union xfs_btree_ptr *
 635xfs_btree_ptr_addr(
 636        struct xfs_btree_cur    *cur,
 637        int                     n,
 638        struct xfs_btree_block  *block)
 639{
 640        int                     level = xfs_btree_get_level(block);
 641
 642        ASSERT(block->bb_level != 0);
 643
 644        return (union xfs_btree_ptr *)
 645                ((char *)block + xfs_btree_ptr_offset(cur, n, level));
 646}
 647
 648struct xfs_ifork *
 649xfs_btree_ifork_ptr(
 650        struct xfs_btree_cur    *cur)
 651{
 652        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
 653
 654        if (cur->bc_flags & XFS_BTREE_STAGING)
 655                return cur->bc_ino.ifake->if_fork;
 656        return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork);
 657}
 658
 659/*
 660 * Get the root block which is stored in the inode.
 661 *
 662 * For now this btree implementation assumes the btree root is always
 663 * stored in the if_broot field of an inode fork.
 664 */
 665STATIC struct xfs_btree_block *
 666xfs_btree_get_iroot(
 667        struct xfs_btree_cur    *cur)
 668{
 669        struct xfs_ifork        *ifp = xfs_btree_ifork_ptr(cur);
 670
 671        return (struct xfs_btree_block *)ifp->if_broot;
 672}
 673
 674/*
 675 * Retrieve the block pointer from the cursor at the given level.
 676 * This may be an inode btree root or from a buffer.
 677 */
 678struct xfs_btree_block *                /* generic btree block pointer */
 679xfs_btree_get_block(
 680        struct xfs_btree_cur    *cur,   /* btree cursor */
 681        int                     level,  /* level in btree */
 682        struct xfs_buf          **bpp)  /* buffer containing the block */
 683{
 684        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 685            (level == cur->bc_nlevels - 1)) {
 686                *bpp = NULL;
 687                return xfs_btree_get_iroot(cur);
 688        }
 689
 690        *bpp = cur->bc_bufs[level];
 691        return XFS_BUF_TO_BLOCK(*bpp);
 692}
 693
 694/*
 695 * Change the cursor to point to the first record at the given level.
 696 * Other levels are unaffected.
 697 */
 698STATIC int                              /* success=1, failure=0 */
 699xfs_btree_firstrec(
 700        xfs_btree_cur_t         *cur,   /* btree cursor */
 701        int                     level)  /* level to change */
 702{
 703        struct xfs_btree_block  *block; /* generic btree block pointer */
 704        struct xfs_buf          *bp;    /* buffer containing block */
 705
 706        /*
 707         * Get the block pointer for this level.
 708         */
 709        block = xfs_btree_get_block(cur, level, &bp);
 710        if (xfs_btree_check_block(cur, block, level, bp))
 711                return 0;
 712        /*
 713         * It's empty, there is no such record.
 714         */
 715        if (!block->bb_numrecs)
 716                return 0;
 717        /*
 718         * Set the ptr value to 1, that's the first record/key.
 719         */
 720        cur->bc_ptrs[level] = 1;
 721        return 1;
 722}
 723
 724/*
 725 * Change the cursor to point to the last record in the current block
 726 * at the given level.  Other levels are unaffected.
 727 */
 728STATIC int                              /* success=1, failure=0 */
 729xfs_btree_lastrec(
 730        xfs_btree_cur_t         *cur,   /* btree cursor */
 731        int                     level)  /* level to change */
 732{
 733        struct xfs_btree_block  *block; /* generic btree block pointer */
 734        struct xfs_buf          *bp;    /* buffer containing block */
 735
 736        /*
 737         * Get the block pointer for this level.
 738         */
 739        block = xfs_btree_get_block(cur, level, &bp);
 740        if (xfs_btree_check_block(cur, block, level, bp))
 741                return 0;
 742        /*
 743         * It's empty, there is no such record.
 744         */
 745        if (!block->bb_numrecs)
 746                return 0;
 747        /*
 748         * Set the ptr value to numrecs, that's the last record/key.
 749         */
 750        cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
 751        return 1;
 752}
 753
 754/*
 755 * Compute first and last byte offsets for the fields given.
 756 * Interprets the offsets table, which contains struct field offsets.
 757 */
 758void
 759xfs_btree_offsets(
 760        int64_t         fields,         /* bitmask of fields */
 761        const short     *offsets,       /* table of field offsets */
 762        int             nbits,          /* number of bits to inspect */
 763        int             *first,         /* output: first byte offset */
 764        int             *last)          /* output: last byte offset */
 765{
 766        int             i;              /* current bit number */
 767        int64_t         imask;          /* mask for current bit number */
 768
 769        ASSERT(fields != 0);
 770        /*
 771         * Find the lowest bit, so the first byte offset.
 772         */
 773        for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
 774                if (imask & fields) {
 775                        *first = offsets[i];
 776                        break;
 777                }
 778        }
 779        /*
 780         * Find the highest bit, so the last byte offset.
 781         */
 782        for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
 783                if (imask & fields) {
 784                        *last = offsets[i + 1] - 1;
 785                        break;
 786                }
 787        }
 788}
 789
 790/*
 791 * Get a buffer for the block, return it read in.
 792 * Long-form addressing.
 793 */
 794int
 795xfs_btree_read_bufl(
 796        struct xfs_mount        *mp,            /* file system mount point */
 797        struct xfs_trans        *tp,            /* transaction pointer */
 798        xfs_fsblock_t           fsbno,          /* file system block number */
 799        struct xfs_buf          **bpp,          /* buffer for fsbno */
 800        int                     refval,         /* ref count value for buffer */
 801        const struct xfs_buf_ops *ops)
 802{
 803        struct xfs_buf          *bp;            /* return value */
 804        xfs_daddr_t             d;              /* real disk block address */
 805        int                     error;
 806
 807        if (!xfs_verify_fsbno(mp, fsbno))
 808                return -EFSCORRUPTED;
 809        d = XFS_FSB_TO_DADDR(mp, fsbno);
 810        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
 811                                   mp->m_bsize, 0, &bp, ops);
 812        if (error)
 813                return error;
 814        if (bp)
 815                xfs_buf_set_ref(bp, refval);
 816        *bpp = bp;
 817        return 0;
 818}
 819
 820/*
 821 * Read-ahead the block, don't wait for it, don't return a buffer.
 822 * Long-form addressing.
 823 */
 824/* ARGSUSED */
 825void
 826xfs_btree_reada_bufl(
 827        struct xfs_mount        *mp,            /* file system mount point */
 828        xfs_fsblock_t           fsbno,          /* file system block number */
 829        xfs_extlen_t            count,          /* count of filesystem blocks */
 830        const struct xfs_buf_ops *ops)
 831{
 832        xfs_daddr_t             d;
 833
 834        ASSERT(fsbno != NULLFSBLOCK);
 835        d = XFS_FSB_TO_DADDR(mp, fsbno);
 836        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 837}
 838
 839/*
 840 * Read-ahead the block, don't wait for it, don't return a buffer.
 841 * Short-form addressing.
 842 */
 843/* ARGSUSED */
 844void
 845xfs_btree_reada_bufs(
 846        struct xfs_mount        *mp,            /* file system mount point */
 847        xfs_agnumber_t          agno,           /* allocation group number */
 848        xfs_agblock_t           agbno,          /* allocation group block number */
 849        xfs_extlen_t            count,          /* count of filesystem blocks */
 850        const struct xfs_buf_ops *ops)
 851{
 852        xfs_daddr_t             d;
 853
 854        ASSERT(agno != NULLAGNUMBER);
 855        ASSERT(agbno != NULLAGBLOCK);
 856        d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 857        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 858}
 859
 860STATIC int
 861xfs_btree_readahead_lblock(
 862        struct xfs_btree_cur    *cur,
 863        int                     lr,
 864        struct xfs_btree_block  *block)
 865{
 866        int                     rval = 0;
 867        xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
 868        xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 869
 870        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
 871                xfs_btree_reada_bufl(cur->bc_mp, left, 1,
 872                                     cur->bc_ops->buf_ops);
 873                rval++;
 874        }
 875
 876        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
 877                xfs_btree_reada_bufl(cur->bc_mp, right, 1,
 878                                     cur->bc_ops->buf_ops);
 879                rval++;
 880        }
 881
 882        return rval;
 883}
 884
 885STATIC int
 886xfs_btree_readahead_sblock(
 887        struct xfs_btree_cur    *cur,
 888        int                     lr,
 889        struct xfs_btree_block *block)
 890{
 891        int                     rval = 0;
 892        xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
 893        xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
 894
 895
 896        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
 897                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno,
 898                                     left, 1, cur->bc_ops->buf_ops);
 899                rval++;
 900        }
 901
 902        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
 903                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno,
 904                                     right, 1, cur->bc_ops->buf_ops);
 905                rval++;
 906        }
 907
 908        return rval;
 909}
 910
 911/*
 912 * Read-ahead btree blocks, at the given level.
 913 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 914 */
 915STATIC int
 916xfs_btree_readahead(
 917        struct xfs_btree_cur    *cur,           /* btree cursor */
 918        int                     lev,            /* level in btree */
 919        int                     lr)             /* left/right bits */
 920{
 921        struct xfs_btree_block  *block;
 922
 923        /*
 924         * No readahead needed if we are at the root level and the
 925         * btree root is stored in the inode.
 926         */
 927        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 928            (lev == cur->bc_nlevels - 1))
 929                return 0;
 930
 931        if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
 932                return 0;
 933
 934        cur->bc_ra[lev] |= lr;
 935        block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
 936
 937        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 938                return xfs_btree_readahead_lblock(cur, lr, block);
 939        return xfs_btree_readahead_sblock(cur, lr, block);
 940}
 941
 942STATIC int
 943xfs_btree_ptr_to_daddr(
 944        struct xfs_btree_cur    *cur,
 945        union xfs_btree_ptr     *ptr,
 946        xfs_daddr_t             *daddr)
 947{
 948        xfs_fsblock_t           fsbno;
 949        xfs_agblock_t           agbno;
 950        int                     error;
 951
 952        error = xfs_btree_check_ptr(cur, ptr, 0, 1);
 953        if (error)
 954                return error;
 955
 956        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 957                fsbno = be64_to_cpu(ptr->l);
 958                *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
 959        } else {
 960                agbno = be32_to_cpu(ptr->s);
 961                *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.agno,
 962                                agbno);
 963        }
 964
 965        return 0;
 966}
 967
 968/*
 969 * Readahead @count btree blocks at the given @ptr location.
 970 *
 971 * We don't need to care about long or short form btrees here as we have a
 972 * method of converting the ptr directly to a daddr available to us.
 973 */
 974STATIC void
 975xfs_btree_readahead_ptr(
 976        struct xfs_btree_cur    *cur,
 977        union xfs_btree_ptr     *ptr,
 978        xfs_extlen_t            count)
 979{
 980        xfs_daddr_t             daddr;
 981
 982        if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
 983                return;
 984        xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
 985                          cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
 986}
 987
 988/*
 989 * Set the buffer for level "lev" in the cursor to bp, releasing
 990 * any previous buffer.
 991 */
 992STATIC void
 993xfs_btree_setbuf(
 994        xfs_btree_cur_t         *cur,   /* btree cursor */
 995        int                     lev,    /* level in btree */
 996        struct xfs_buf          *bp)    /* new buffer to set */
 997{
 998        struct xfs_btree_block  *b;     /* btree block */
 999
1000        if (cur->bc_bufs[lev])
1001                xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
1002        cur->bc_bufs[lev] = bp;
1003        cur->bc_ra[lev] = 0;
1004
1005        b = XFS_BUF_TO_BLOCK(bp);
1006        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1007                if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1008                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1009                if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1010                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1011        } else {
1012                if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1013                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1014                if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1015                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1016        }
1017}
1018
1019bool
1020xfs_btree_ptr_is_null(
1021        struct xfs_btree_cur    *cur,
1022        union xfs_btree_ptr     *ptr)
1023{
1024        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1025                return ptr->l == cpu_to_be64(NULLFSBLOCK);
1026        else
1027                return ptr->s == cpu_to_be32(NULLAGBLOCK);
1028}
1029
1030void
1031xfs_btree_set_ptr_null(
1032        struct xfs_btree_cur    *cur,
1033        union xfs_btree_ptr     *ptr)
1034{
1035        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1036                ptr->l = cpu_to_be64(NULLFSBLOCK);
1037        else
1038                ptr->s = cpu_to_be32(NULLAGBLOCK);
1039}
1040
1041/*
1042 * Get/set/init sibling pointers
1043 */
1044void
1045xfs_btree_get_sibling(
1046        struct xfs_btree_cur    *cur,
1047        struct xfs_btree_block  *block,
1048        union xfs_btree_ptr     *ptr,
1049        int                     lr)
1050{
1051        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1052
1053        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1054                if (lr == XFS_BB_RIGHTSIB)
1055                        ptr->l = block->bb_u.l.bb_rightsib;
1056                else
1057                        ptr->l = block->bb_u.l.bb_leftsib;
1058        } else {
1059                if (lr == XFS_BB_RIGHTSIB)
1060                        ptr->s = block->bb_u.s.bb_rightsib;
1061                else
1062                        ptr->s = block->bb_u.s.bb_leftsib;
1063        }
1064}
1065
1066void
1067xfs_btree_set_sibling(
1068        struct xfs_btree_cur    *cur,
1069        struct xfs_btree_block  *block,
1070        union xfs_btree_ptr     *ptr,
1071        int                     lr)
1072{
1073        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1074
1075        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1076                if (lr == XFS_BB_RIGHTSIB)
1077                        block->bb_u.l.bb_rightsib = ptr->l;
1078                else
1079                        block->bb_u.l.bb_leftsib = ptr->l;
1080        } else {
1081                if (lr == XFS_BB_RIGHTSIB)
1082                        block->bb_u.s.bb_rightsib = ptr->s;
1083                else
1084                        block->bb_u.s.bb_leftsib = ptr->s;
1085        }
1086}
1087
1088void
1089xfs_btree_init_block_int(
1090        struct xfs_mount        *mp,
1091        struct xfs_btree_block  *buf,
1092        xfs_daddr_t             blkno,
1093        xfs_btnum_t             btnum,
1094        __u16                   level,
1095        __u16                   numrecs,
1096        __u64                   owner,
1097        unsigned int            flags)
1098{
1099        int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
1100        __u32                   magic = xfs_btree_magic(crc, btnum);
1101
1102        buf->bb_magic = cpu_to_be32(magic);
1103        buf->bb_level = cpu_to_be16(level);
1104        buf->bb_numrecs = cpu_to_be16(numrecs);
1105
1106        if (flags & XFS_BTREE_LONG_PTRS) {
1107                buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1108                buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1109                if (crc) {
1110                        buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1111                        buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1112                        uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1113                        buf->bb_u.l.bb_pad = 0;
1114                        buf->bb_u.l.bb_lsn = 0;
1115                }
1116        } else {
1117                /* owner is a 32 bit value on short blocks */
1118                __u32 __owner = (__u32)owner;
1119
1120                buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1121                buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1122                if (crc) {
1123                        buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1124                        buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1125                        uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1126                        buf->bb_u.s.bb_lsn = 0;
1127                }
1128        }
1129}
1130
1131void
1132xfs_btree_init_block(
1133        struct xfs_mount *mp,
1134        struct xfs_buf  *bp,
1135        xfs_btnum_t     btnum,
1136        __u16           level,
1137        __u16           numrecs,
1138        __u64           owner)
1139{
1140        xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1141                                 btnum, level, numrecs, owner, 0);
1142}
1143
1144void
1145xfs_btree_init_block_cur(
1146        struct xfs_btree_cur    *cur,
1147        struct xfs_buf          *bp,
1148        int                     level,
1149        int                     numrecs)
1150{
1151        __u64                   owner;
1152
1153        /*
1154         * we can pull the owner from the cursor right now as the different
1155         * owners align directly with the pointer size of the btree. This may
1156         * change in future, but is safe for current users of the generic btree
1157         * code.
1158         */
1159        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1160                owner = cur->bc_ino.ip->i_ino;
1161        else
1162                owner = cur->bc_ag.agno;
1163
1164        xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1165                                 cur->bc_btnum, level, numrecs,
1166                                 owner, cur->bc_flags);
1167}
1168
1169/*
1170 * Return true if ptr is the last record in the btree and
1171 * we need to track updates to this record.  The decision
1172 * will be further refined in the update_lastrec method.
1173 */
1174STATIC int
1175xfs_btree_is_lastrec(
1176        struct xfs_btree_cur    *cur,
1177        struct xfs_btree_block  *block,
1178        int                     level)
1179{
1180        union xfs_btree_ptr     ptr;
1181
1182        if (level > 0)
1183                return 0;
1184        if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1185                return 0;
1186
1187        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1188        if (!xfs_btree_ptr_is_null(cur, &ptr))
1189                return 0;
1190        return 1;
1191}
1192
1193STATIC void
1194xfs_btree_buf_to_ptr(
1195        struct xfs_btree_cur    *cur,
1196        struct xfs_buf          *bp,
1197        union xfs_btree_ptr     *ptr)
1198{
1199        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1200                ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1201                                        XFS_BUF_ADDR(bp)));
1202        else {
1203                ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1204                                        XFS_BUF_ADDR(bp)));
1205        }
1206}
1207
1208STATIC void
1209xfs_btree_set_refs(
1210        struct xfs_btree_cur    *cur,
1211        struct xfs_buf          *bp)
1212{
1213        switch (cur->bc_btnum) {
1214        case XFS_BTNUM_BNO:
1215        case XFS_BTNUM_CNT:
1216                xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1217                break;
1218        case XFS_BTNUM_INO:
1219        case XFS_BTNUM_FINO:
1220                xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1221                break;
1222        case XFS_BTNUM_BMAP:
1223                xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1224                break;
1225        case XFS_BTNUM_RMAP:
1226                xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
1227                break;
1228        case XFS_BTNUM_REFC:
1229                xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
1230                break;
1231        default:
1232                ASSERT(0);
1233        }
1234}
1235
1236int
1237xfs_btree_get_buf_block(
1238        struct xfs_btree_cur    *cur,
1239        union xfs_btree_ptr     *ptr,
1240        struct xfs_btree_block  **block,
1241        struct xfs_buf          **bpp)
1242{
1243        struct xfs_mount        *mp = cur->bc_mp;
1244        xfs_daddr_t             d;
1245        int                     error;
1246
1247        error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1248        if (error)
1249                return error;
1250        error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
1251                        0, bpp);
1252        if (error)
1253                return error;
1254
1255        (*bpp)->b_ops = cur->bc_ops->buf_ops;
1256        *block = XFS_BUF_TO_BLOCK(*bpp);
1257        return 0;
1258}
1259
1260/*
1261 * Read in the buffer at the given ptr and return the buffer and
1262 * the block pointer within the buffer.
1263 */
1264STATIC int
1265xfs_btree_read_buf_block(
1266        struct xfs_btree_cur    *cur,
1267        union xfs_btree_ptr     *ptr,
1268        int                     flags,
1269        struct xfs_btree_block  **block,
1270        struct xfs_buf          **bpp)
1271{
1272        struct xfs_mount        *mp = cur->bc_mp;
1273        xfs_daddr_t             d;
1274        int                     error;
1275
1276        /* need to sort out how callers deal with failures first */
1277        ASSERT(!(flags & XBF_TRYLOCK));
1278
1279        error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1280        if (error)
1281                return error;
1282        error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1283                                   mp->m_bsize, flags, bpp,
1284                                   cur->bc_ops->buf_ops);
1285        if (error)
1286                return error;
1287
1288        xfs_btree_set_refs(cur, *bpp);
1289        *block = XFS_BUF_TO_BLOCK(*bpp);
1290        return 0;
1291}
1292
1293/*
1294 * Copy keys from one btree block to another.
1295 */
1296void
1297xfs_btree_copy_keys(
1298        struct xfs_btree_cur    *cur,
1299        union xfs_btree_key     *dst_key,
1300        union xfs_btree_key     *src_key,
1301        int                     numkeys)
1302{
1303        ASSERT(numkeys >= 0);
1304        memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1305}
1306
1307/*
1308 * Copy records from one btree block to another.
1309 */
1310STATIC void
1311xfs_btree_copy_recs(
1312        struct xfs_btree_cur    *cur,
1313        union xfs_btree_rec     *dst_rec,
1314        union xfs_btree_rec     *src_rec,
1315        int                     numrecs)
1316{
1317        ASSERT(numrecs >= 0);
1318        memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1319}
1320
1321/*
1322 * Copy block pointers from one btree block to another.
1323 */
1324void
1325xfs_btree_copy_ptrs(
1326        struct xfs_btree_cur    *cur,
1327        union xfs_btree_ptr     *dst_ptr,
1328        const union xfs_btree_ptr *src_ptr,
1329        int                     numptrs)
1330{
1331        ASSERT(numptrs >= 0);
1332        memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1333}
1334
1335/*
1336 * Shift keys one index left/right inside a single btree block.
1337 */
1338STATIC void
1339xfs_btree_shift_keys(
1340        struct xfs_btree_cur    *cur,
1341        union xfs_btree_key     *key,
1342        int                     dir,
1343        int                     numkeys)
1344{
1345        char                    *dst_key;
1346
1347        ASSERT(numkeys >= 0);
1348        ASSERT(dir == 1 || dir == -1);
1349
1350        dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1351        memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1352}
1353
1354/*
1355 * Shift records one index left/right inside a single btree block.
1356 */
1357STATIC void
1358xfs_btree_shift_recs(
1359        struct xfs_btree_cur    *cur,
1360        union xfs_btree_rec     *rec,
1361        int                     dir,
1362        int                     numrecs)
1363{
1364        char                    *dst_rec;
1365
1366        ASSERT(numrecs >= 0);
1367        ASSERT(dir == 1 || dir == -1);
1368
1369        dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1370        memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1371}
1372
1373/*
1374 * Shift block pointers one index left/right inside a single btree block.
1375 */
1376STATIC void
1377xfs_btree_shift_ptrs(
1378        struct xfs_btree_cur    *cur,
1379        union xfs_btree_ptr     *ptr,
1380        int                     dir,
1381        int                     numptrs)
1382{
1383        char                    *dst_ptr;
1384
1385        ASSERT(numptrs >= 0);
1386        ASSERT(dir == 1 || dir == -1);
1387
1388        dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1389        memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1390}
1391
1392/*
1393 * Log key values from the btree block.
1394 */
1395STATIC void
1396xfs_btree_log_keys(
1397        struct xfs_btree_cur    *cur,
1398        struct xfs_buf          *bp,
1399        int                     first,
1400        int                     last)
1401{
1402
1403        if (bp) {
1404                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1405                xfs_trans_log_buf(cur->bc_tp, bp,
1406                                  xfs_btree_key_offset(cur, first),
1407                                  xfs_btree_key_offset(cur, last + 1) - 1);
1408        } else {
1409                xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1410                                xfs_ilog_fbroot(cur->bc_ino.whichfork));
1411        }
1412}
1413
1414/*
1415 * Log record values from the btree block.
1416 */
1417void
1418xfs_btree_log_recs(
1419        struct xfs_btree_cur    *cur,
1420        struct xfs_buf          *bp,
1421        int                     first,
1422        int                     last)
1423{
1424
1425        xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1426        xfs_trans_log_buf(cur->bc_tp, bp,
1427                          xfs_btree_rec_offset(cur, first),
1428                          xfs_btree_rec_offset(cur, last + 1) - 1);
1429
1430}
1431
1432/*
1433 * Log block pointer fields from a btree block (nonleaf).
1434 */
1435STATIC void
1436xfs_btree_log_ptrs(
1437        struct xfs_btree_cur    *cur,   /* btree cursor */
1438        struct xfs_buf          *bp,    /* buffer containing btree block */
1439        int                     first,  /* index of first pointer to log */
1440        int                     last)   /* index of last pointer to log */
1441{
1442
1443        if (bp) {
1444                struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1445                int                     level = xfs_btree_get_level(block);
1446
1447                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1448                xfs_trans_log_buf(cur->bc_tp, bp,
1449                                xfs_btree_ptr_offset(cur, first, level),
1450                                xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1451        } else {
1452                xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1453                        xfs_ilog_fbroot(cur->bc_ino.whichfork));
1454        }
1455
1456}
1457
1458/*
1459 * Log fields from a btree block header.
1460 */
1461void
1462xfs_btree_log_block(
1463        struct xfs_btree_cur    *cur,   /* btree cursor */
1464        struct xfs_buf          *bp,    /* buffer containing btree block */
1465        int                     fields) /* mask of fields: XFS_BB_... */
1466{
1467        int                     first;  /* first byte offset logged */
1468        int                     last;   /* last byte offset logged */
1469        static const short      soffsets[] = {  /* table of offsets (short) */
1470                offsetof(struct xfs_btree_block, bb_magic),
1471                offsetof(struct xfs_btree_block, bb_level),
1472                offsetof(struct xfs_btree_block, bb_numrecs),
1473                offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1474                offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1475                offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1476                offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1477                offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1478                offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1479                offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1480                XFS_BTREE_SBLOCK_CRC_LEN
1481        };
1482        static const short      loffsets[] = {  /* table of offsets (long) */
1483                offsetof(struct xfs_btree_block, bb_magic),
1484                offsetof(struct xfs_btree_block, bb_level),
1485                offsetof(struct xfs_btree_block, bb_numrecs),
1486                offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1487                offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1488                offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1489                offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1490                offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1491                offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1492                offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1493                offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1494                XFS_BTREE_LBLOCK_CRC_LEN
1495        };
1496
1497        if (bp) {
1498                int nbits;
1499
1500                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1501                        /*
1502                         * We don't log the CRC when updating a btree
1503                         * block but instead recreate it during log
1504                         * recovery.  As the log buffers have checksums
1505                         * of their own this is safe and avoids logging a crc
1506                         * update in a lot of places.
1507                         */
1508                        if (fields == XFS_BB_ALL_BITS)
1509                                fields = XFS_BB_ALL_BITS_CRC;
1510                        nbits = XFS_BB_NUM_BITS_CRC;
1511                } else {
1512                        nbits = XFS_BB_NUM_BITS;
1513                }
1514                xfs_btree_offsets(fields,
1515                                  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1516                                        loffsets : soffsets,
1517                                  nbits, &first, &last);
1518                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1519                xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1520        } else {
1521                xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1522                        xfs_ilog_fbroot(cur->bc_ino.whichfork));
1523        }
1524}
1525
1526/*
1527 * Increment cursor by one record at the level.
1528 * For nonzero levels the leaf-ward information is untouched.
1529 */
1530int                                             /* error */
1531xfs_btree_increment(
1532        struct xfs_btree_cur    *cur,
1533        int                     level,
1534        int                     *stat)          /* success/failure */
1535{
1536        struct xfs_btree_block  *block;
1537        union xfs_btree_ptr     ptr;
1538        struct xfs_buf          *bp;
1539        int                     error;          /* error return value */
1540        int                     lev;
1541
1542        ASSERT(level < cur->bc_nlevels);
1543
1544        /* Read-ahead to the right at this level. */
1545        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1546
1547        /* Get a pointer to the btree block. */
1548        block = xfs_btree_get_block(cur, level, &bp);
1549
1550#ifdef DEBUG
1551        error = xfs_btree_check_block(cur, block, level, bp);
1552        if (error)
1553                goto error0;
1554#endif
1555
1556        /* We're done if we remain in the block after the increment. */
1557        if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1558                goto out1;
1559
1560        /* Fail if we just went off the right edge of the tree. */
1561        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1562        if (xfs_btree_ptr_is_null(cur, &ptr))
1563                goto out0;
1564
1565        XFS_BTREE_STATS_INC(cur, increment);
1566
1567        /*
1568         * March up the tree incrementing pointers.
1569         * Stop when we don't go off the right edge of a block.
1570         */
1571        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1572                block = xfs_btree_get_block(cur, lev, &bp);
1573
1574#ifdef DEBUG
1575                error = xfs_btree_check_block(cur, block, lev, bp);
1576                if (error)
1577                        goto error0;
1578#endif
1579
1580                if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1581                        break;
1582
1583                /* Read-ahead the right block for the next loop. */
1584                xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1585        }
1586
1587        /*
1588         * If we went off the root then we are either seriously
1589         * confused or have the tree root in an inode.
1590         */
1591        if (lev == cur->bc_nlevels) {
1592                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1593                        goto out0;
1594                ASSERT(0);
1595                error = -EFSCORRUPTED;
1596                goto error0;
1597        }
1598        ASSERT(lev < cur->bc_nlevels);
1599
1600        /*
1601         * Now walk back down the tree, fixing up the cursor's buffer
1602         * pointers and key numbers.
1603         */
1604        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1605                union xfs_btree_ptr     *ptrp;
1606
1607                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1608                --lev;
1609                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1610                if (error)
1611                        goto error0;
1612
1613                xfs_btree_setbuf(cur, lev, bp);
1614                cur->bc_ptrs[lev] = 1;
1615        }
1616out1:
1617        *stat = 1;
1618        return 0;
1619
1620out0:
1621        *stat = 0;
1622        return 0;
1623
1624error0:
1625        return error;
1626}
1627
1628/*
1629 * Decrement cursor by one record at the level.
1630 * For nonzero levels the leaf-ward information is untouched.
1631 */
1632int                                             /* error */
1633xfs_btree_decrement(
1634        struct xfs_btree_cur    *cur,
1635        int                     level,
1636        int                     *stat)          /* success/failure */
1637{
1638        struct xfs_btree_block  *block;
1639        struct xfs_buf          *bp;
1640        int                     error;          /* error return value */
1641        int                     lev;
1642        union xfs_btree_ptr     ptr;
1643
1644        ASSERT(level < cur->bc_nlevels);
1645
1646        /* Read-ahead to the left at this level. */
1647        xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1648
1649        /* We're done if we remain in the block after the decrement. */
1650        if (--cur->bc_ptrs[level] > 0)
1651                goto out1;
1652
1653        /* Get a pointer to the btree block. */
1654        block = xfs_btree_get_block(cur, level, &bp);
1655
1656#ifdef DEBUG
1657        error = xfs_btree_check_block(cur, block, level, bp);
1658        if (error)
1659                goto error0;
1660#endif
1661
1662        /* Fail if we just went off the left edge of the tree. */
1663        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1664        if (xfs_btree_ptr_is_null(cur, &ptr))
1665                goto out0;
1666
1667        XFS_BTREE_STATS_INC(cur, decrement);
1668
1669        /*
1670         * March up the tree decrementing pointers.
1671         * Stop when we don't go off the left edge of a block.
1672         */
1673        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1674                if (--cur->bc_ptrs[lev] > 0)
1675                        break;
1676                /* Read-ahead the left block for the next loop. */
1677                xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1678        }
1679
1680        /*
1681         * If we went off the root then we are seriously confused.
1682         * or the root of the tree is in an inode.
1683         */
1684        if (lev == cur->bc_nlevels) {
1685                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1686                        goto out0;
1687                ASSERT(0);
1688                error = -EFSCORRUPTED;
1689                goto error0;
1690        }
1691        ASSERT(lev < cur->bc_nlevels);
1692
1693        /*
1694         * Now walk back down the tree, fixing up the cursor's buffer
1695         * pointers and key numbers.
1696         */
1697        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1698                union xfs_btree_ptr     *ptrp;
1699
1700                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1701                --lev;
1702                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1703                if (error)
1704                        goto error0;
1705                xfs_btree_setbuf(cur, lev, bp);
1706                cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1707        }
1708out1:
1709        *stat = 1;
1710        return 0;
1711
1712out0:
1713        *stat = 0;
1714        return 0;
1715
1716error0:
1717        return error;
1718}
1719
1720int
1721xfs_btree_lookup_get_block(
1722        struct xfs_btree_cur    *cur,   /* btree cursor */
1723        int                     level,  /* level in the btree */
1724        union xfs_btree_ptr     *pp,    /* ptr to btree block */
1725        struct xfs_btree_block  **blkp) /* return btree block */
1726{
1727        struct xfs_buf          *bp;    /* buffer pointer for btree block */
1728        xfs_daddr_t             daddr;
1729        int                     error = 0;
1730
1731        /* special case the root block if in an inode */
1732        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1733            (level == cur->bc_nlevels - 1)) {
1734                *blkp = xfs_btree_get_iroot(cur);
1735                return 0;
1736        }
1737
1738        /*
1739         * If the old buffer at this level for the disk address we are
1740         * looking for re-use it.
1741         *
1742         * Otherwise throw it away and get a new one.
1743         */
1744        bp = cur->bc_bufs[level];
1745        error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
1746        if (error)
1747                return error;
1748        if (bp && XFS_BUF_ADDR(bp) == daddr) {
1749                *blkp = XFS_BUF_TO_BLOCK(bp);
1750                return 0;
1751        }
1752
1753        error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1754        if (error)
1755                return error;
1756
1757        /* Check the inode owner since the verifiers don't. */
1758        if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
1759            !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
1760            (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
1761            be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
1762                        cur->bc_ino.ip->i_ino)
1763                goto out_bad;
1764
1765        /* Did we get the level we were looking for? */
1766        if (be16_to_cpu((*blkp)->bb_level) != level)
1767                goto out_bad;
1768
1769        /* Check that internal nodes have at least one record. */
1770        if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
1771                goto out_bad;
1772
1773        xfs_btree_setbuf(cur, level, bp);
1774        return 0;
1775
1776out_bad:
1777        *blkp = NULL;
1778        xfs_buf_mark_corrupt(bp);
1779        xfs_trans_brelse(cur->bc_tp, bp);
1780        return -EFSCORRUPTED;
1781}
1782
1783/*
1784 * Get current search key.  For level 0 we don't actually have a key
1785 * structure so we make one up from the record.  For all other levels
1786 * we just return the right key.
1787 */
1788STATIC union xfs_btree_key *
1789xfs_lookup_get_search_key(
1790        struct xfs_btree_cur    *cur,
1791        int                     level,
1792        int                     keyno,
1793        struct xfs_btree_block  *block,
1794        union xfs_btree_key     *kp)
1795{
1796        if (level == 0) {
1797                cur->bc_ops->init_key_from_rec(kp,
1798                                xfs_btree_rec_addr(cur, keyno, block));
1799                return kp;
1800        }
1801
1802        return xfs_btree_key_addr(cur, keyno, block);
1803}
1804
1805/*
1806 * Lookup the record.  The cursor is made to point to it, based on dir.
1807 * stat is set to 0 if can't find any such record, 1 for success.
1808 */
1809int                                     /* error */
1810xfs_btree_lookup(
1811        struct xfs_btree_cur    *cur,   /* btree cursor */
1812        xfs_lookup_t            dir,    /* <=, ==, or >= */
1813        int                     *stat)  /* success/failure */
1814{
1815        struct xfs_btree_block  *block; /* current btree block */
1816        int64_t                 diff;   /* difference for the current key */
1817        int                     error;  /* error return value */
1818        int                     keyno;  /* current key number */
1819        int                     level;  /* level in the btree */
1820        union xfs_btree_ptr     *pp;    /* ptr to btree block */
1821        union xfs_btree_ptr     ptr;    /* ptr to btree block */
1822
1823        XFS_BTREE_STATS_INC(cur, lookup);
1824
1825        /* No such thing as a zero-level tree. */
1826        if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0))
1827                return -EFSCORRUPTED;
1828
1829        block = NULL;
1830        keyno = 0;
1831
1832        /* initialise start pointer from cursor */
1833        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1834        pp = &ptr;
1835
1836        /*
1837         * Iterate over each level in the btree, starting at the root.
1838         * For each level above the leaves, find the key we need, based
1839         * on the lookup record, then follow the corresponding block
1840         * pointer down to the next level.
1841         */
1842        for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1843                /* Get the block we need to do the lookup on. */
1844                error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1845                if (error)
1846                        goto error0;
1847
1848                if (diff == 0) {
1849                        /*
1850                         * If we already had a key match at a higher level, we
1851                         * know we need to use the first entry in this block.
1852                         */
1853                        keyno = 1;
1854                } else {
1855                        /* Otherwise search this block. Do a binary search. */
1856
1857                        int     high;   /* high entry number */
1858                        int     low;    /* low entry number */
1859
1860                        /* Set low and high entry numbers, 1-based. */
1861                        low = 1;
1862                        high = xfs_btree_get_numrecs(block);
1863                        if (!high) {
1864                                /* Block is empty, must be an empty leaf. */
1865                                if (level != 0 || cur->bc_nlevels != 1) {
1866                                        XFS_CORRUPTION_ERROR(__func__,
1867                                                        XFS_ERRLEVEL_LOW,
1868                                                        cur->bc_mp, block,
1869                                                        sizeof(*block));
1870                                        return -EFSCORRUPTED;
1871                                }
1872
1873                                cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1874                                *stat = 0;
1875                                return 0;
1876                        }
1877
1878                        /* Binary search the block. */
1879                        while (low <= high) {
1880                                union xfs_btree_key     key;
1881                                union xfs_btree_key     *kp;
1882
1883                                XFS_BTREE_STATS_INC(cur, compare);
1884
1885                                /* keyno is average of low and high. */
1886                                keyno = (low + high) >> 1;
1887
1888                                /* Get current search key */
1889                                kp = xfs_lookup_get_search_key(cur, level,
1890                                                keyno, block, &key);
1891
1892                                /*
1893                                 * Compute difference to get next direction:
1894                                 *  - less than, move right
1895                                 *  - greater than, move left
1896                                 *  - equal, we're done
1897                                 */
1898                                diff = cur->bc_ops->key_diff(cur, kp);
1899                                if (diff < 0)
1900                                        low = keyno + 1;
1901                                else if (diff > 0)
1902                                        high = keyno - 1;
1903                                else
1904                                        break;
1905                        }
1906                }
1907
1908                /*
1909                 * If there are more levels, set up for the next level
1910                 * by getting the block number and filling in the cursor.
1911                 */
1912                if (level > 0) {
1913                        /*
1914                         * If we moved left, need the previous key number,
1915                         * unless there isn't one.
1916                         */
1917                        if (diff > 0 && --keyno < 1)
1918                                keyno = 1;
1919                        pp = xfs_btree_ptr_addr(cur, keyno, block);
1920
1921                        error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
1922                        if (error)
1923                                goto error0;
1924
1925                        cur->bc_ptrs[level] = keyno;
1926                }
1927        }
1928
1929        /* Done with the search. See if we need to adjust the results. */
1930        if (dir != XFS_LOOKUP_LE && diff < 0) {
1931                keyno++;
1932                /*
1933                 * If ge search and we went off the end of the block, but it's
1934                 * not the last block, we're in the wrong block.
1935                 */
1936                xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1937                if (dir == XFS_LOOKUP_GE &&
1938                    keyno > xfs_btree_get_numrecs(block) &&
1939                    !xfs_btree_ptr_is_null(cur, &ptr)) {
1940                        int     i;
1941
1942                        cur->bc_ptrs[0] = keyno;
1943                        error = xfs_btree_increment(cur, 0, &i);
1944                        if (error)
1945                                goto error0;
1946                        if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
1947                                return -EFSCORRUPTED;
1948                        *stat = 1;
1949                        return 0;
1950                }
1951        } else if (dir == XFS_LOOKUP_LE && diff > 0)
1952                keyno--;
1953        cur->bc_ptrs[0] = keyno;
1954
1955        /* Return if we succeeded or not. */
1956        if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1957                *stat = 0;
1958        else if (dir != XFS_LOOKUP_EQ || diff == 0)
1959                *stat = 1;
1960        else
1961                *stat = 0;
1962        return 0;
1963
1964error0:
1965        return error;
1966}
1967
1968/* Find the high key storage area from a regular key. */
1969union xfs_btree_key *
1970xfs_btree_high_key_from_key(
1971        struct xfs_btree_cur    *cur,
1972        union xfs_btree_key     *key)
1973{
1974        ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
1975        return (union xfs_btree_key *)((char *)key +
1976                        (cur->bc_ops->key_len / 2));
1977}
1978
1979/* Determine the low (and high if overlapped) keys of a leaf block */
1980STATIC void
1981xfs_btree_get_leaf_keys(
1982        struct xfs_btree_cur    *cur,
1983        struct xfs_btree_block  *block,
1984        union xfs_btree_key     *key)
1985{
1986        union xfs_btree_key     max_hkey;
1987        union xfs_btree_key     hkey;
1988        union xfs_btree_rec     *rec;
1989        union xfs_btree_key     *high;
1990        int                     n;
1991
1992        rec = xfs_btree_rec_addr(cur, 1, block);
1993        cur->bc_ops->init_key_from_rec(key, rec);
1994
1995        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
1996
1997                cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
1998                for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
1999                        rec = xfs_btree_rec_addr(cur, n, block);
2000                        cur->bc_ops->init_high_key_from_rec(&hkey, rec);
2001                        if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
2002                                        > 0)
2003                                max_hkey = hkey;
2004                }
2005
2006                high = xfs_btree_high_key_from_key(cur, key);
2007                memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2008        }
2009}
2010
2011/* Determine the low (and high if overlapped) keys of a node block */
2012STATIC void
2013xfs_btree_get_node_keys(
2014        struct xfs_btree_cur    *cur,
2015        struct xfs_btree_block  *block,
2016        union xfs_btree_key     *key)
2017{
2018        union xfs_btree_key     *hkey;
2019        union xfs_btree_key     *max_hkey;
2020        union xfs_btree_key     *high;
2021        int                     n;
2022
2023        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2024                memcpy(key, xfs_btree_key_addr(cur, 1, block),
2025                                cur->bc_ops->key_len / 2);
2026
2027                max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2028                for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2029                        hkey = xfs_btree_high_key_addr(cur, n, block);
2030                        if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
2031                                max_hkey = hkey;
2032                }
2033
2034                high = xfs_btree_high_key_from_key(cur, key);
2035                memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2036        } else {
2037                memcpy(key, xfs_btree_key_addr(cur, 1, block),
2038                                cur->bc_ops->key_len);
2039        }
2040}
2041
2042/* Derive the keys for any btree block. */
2043void
2044xfs_btree_get_keys(
2045        struct xfs_btree_cur    *cur,
2046        struct xfs_btree_block  *block,
2047        union xfs_btree_key     *key)
2048{
2049        if (be16_to_cpu(block->bb_level) == 0)
2050                xfs_btree_get_leaf_keys(cur, block, key);
2051        else
2052                xfs_btree_get_node_keys(cur, block, key);
2053}
2054
2055/*
2056 * Decide if we need to update the parent keys of a btree block.  For
2057 * a standard btree this is only necessary if we're updating the first
2058 * record/key.  For an overlapping btree, we must always update the
2059 * keys because the highest key can be in any of the records or keys
2060 * in the block.
2061 */
2062static inline bool
2063xfs_btree_needs_key_update(
2064        struct xfs_btree_cur    *cur,
2065        int                     ptr)
2066{
2067        return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
2068}
2069
2070/*
2071 * Update the low and high parent keys of the given level, progressing
2072 * towards the root.  If force_all is false, stop if the keys for a given
2073 * level do not need updating.
2074 */
2075STATIC int
2076__xfs_btree_updkeys(
2077        struct xfs_btree_cur    *cur,
2078        int                     level,
2079        struct xfs_btree_block  *block,
2080        struct xfs_buf          *bp0,
2081        bool                    force_all)
2082{
2083        union xfs_btree_key     key;    /* keys from current level */
2084        union xfs_btree_key     *lkey;  /* keys from the next level up */
2085        union xfs_btree_key     *hkey;
2086        union xfs_btree_key     *nlkey; /* keys from the next level up */
2087        union xfs_btree_key     *nhkey;
2088        struct xfs_buf          *bp;
2089        int                     ptr;
2090
2091        ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2092
2093        /* Exit if there aren't any parent levels to update. */
2094        if (level + 1 >= cur->bc_nlevels)
2095                return 0;
2096
2097        trace_xfs_btree_updkeys(cur, level, bp0);
2098
2099        lkey = &key;
2100        hkey = xfs_btree_high_key_from_key(cur, lkey);
2101        xfs_btree_get_keys(cur, block, lkey);
2102        for (level++; level < cur->bc_nlevels; level++) {
2103#ifdef DEBUG
2104                int             error;
2105#endif
2106                block = xfs_btree_get_block(cur, level, &bp);
2107                trace_xfs_btree_updkeys(cur, level, bp);
2108#ifdef DEBUG
2109                error = xfs_btree_check_block(cur, block, level, bp);
2110                if (error)
2111                        return error;
2112#endif
2113                ptr = cur->bc_ptrs[level];
2114                nlkey = xfs_btree_key_addr(cur, ptr, block);
2115                nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2116                if (!force_all &&
2117                    !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
2118                      cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
2119                        break;
2120                xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2121                xfs_btree_log_keys(cur, bp, ptr, ptr);
2122                if (level + 1 >= cur->bc_nlevels)
2123                        break;
2124                xfs_btree_get_node_keys(cur, block, lkey);
2125        }
2126
2127        return 0;
2128}
2129
2130/* Update all the keys from some level in cursor back to the root. */
2131STATIC int
2132xfs_btree_updkeys_force(
2133        struct xfs_btree_cur    *cur,
2134        int                     level)
2135{
2136        struct xfs_buf          *bp;
2137        struct xfs_btree_block  *block;
2138
2139        block = xfs_btree_get_block(cur, level, &bp);
2140        return __xfs_btree_updkeys(cur, level, block, bp, true);
2141}
2142
2143/*
2144 * Update the parent keys of the given level, progressing towards the root.
2145 */
2146STATIC int
2147xfs_btree_update_keys(
2148        struct xfs_btree_cur    *cur,
2149        int                     level)
2150{
2151        struct xfs_btree_block  *block;
2152        struct xfs_buf          *bp;
2153        union xfs_btree_key     *kp;
2154        union xfs_btree_key     key;
2155        int                     ptr;
2156
2157        ASSERT(level >= 0);
2158
2159        block = xfs_btree_get_block(cur, level, &bp);
2160        if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
2161                return __xfs_btree_updkeys(cur, level, block, bp, false);
2162
2163        /*
2164         * Go up the tree from this level toward the root.
2165         * At each level, update the key value to the value input.
2166         * Stop when we reach a level where the cursor isn't pointing
2167         * at the first entry in the block.
2168         */
2169        xfs_btree_get_keys(cur, block, &key);
2170        for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2171#ifdef DEBUG
2172                int             error;
2173#endif
2174                block = xfs_btree_get_block(cur, level, &bp);
2175#ifdef DEBUG
2176                error = xfs_btree_check_block(cur, block, level, bp);
2177                if (error)
2178                        return error;
2179#endif
2180                ptr = cur->bc_ptrs[level];
2181                kp = xfs_btree_key_addr(cur, ptr, block);
2182                xfs_btree_copy_keys(cur, kp, &key, 1);
2183                xfs_btree_log_keys(cur, bp, ptr, ptr);
2184        }
2185
2186        return 0;
2187}
2188
2189/*
2190 * Update the record referred to by cur to the value in the
2191 * given record. This either works (return 0) or gets an
2192 * EFSCORRUPTED error.
2193 */
2194int
2195xfs_btree_update(
2196        struct xfs_btree_cur    *cur,
2197        union xfs_btree_rec     *rec)
2198{
2199        struct xfs_btree_block  *block;
2200        struct xfs_buf          *bp;
2201        int                     error;
2202        int                     ptr;
2203        union xfs_btree_rec     *rp;
2204
2205        /* Pick up the current block. */
2206        block = xfs_btree_get_block(cur, 0, &bp);
2207
2208#ifdef DEBUG
2209        error = xfs_btree_check_block(cur, block, 0, bp);
2210        if (error)
2211                goto error0;
2212#endif
2213        /* Get the address of the rec to be updated. */
2214        ptr = cur->bc_ptrs[0];
2215        rp = xfs_btree_rec_addr(cur, ptr, block);
2216
2217        /* Fill in the new contents and log them. */
2218        xfs_btree_copy_recs(cur, rp, rec, 1);
2219        xfs_btree_log_recs(cur, bp, ptr, ptr);
2220
2221        /*
2222         * If we are tracking the last record in the tree and
2223         * we are at the far right edge of the tree, update it.
2224         */
2225        if (xfs_btree_is_lastrec(cur, block, 0)) {
2226                cur->bc_ops->update_lastrec(cur, block, rec,
2227                                            ptr, LASTREC_UPDATE);
2228        }
2229
2230        /* Pass new key value up to our parent. */
2231        if (xfs_btree_needs_key_update(cur, ptr)) {
2232                error = xfs_btree_update_keys(cur, 0);
2233                if (error)
2234                        goto error0;
2235        }
2236
2237        return 0;
2238
2239error0:
2240        return error;
2241}
2242
2243/*
2244 * Move 1 record left from cur/level if possible.
2245 * Update cur to reflect the new path.
2246 */
2247STATIC int                                      /* error */
2248xfs_btree_lshift(
2249        struct xfs_btree_cur    *cur,
2250        int                     level,
2251        int                     *stat)          /* success/failure */
2252{
2253        struct xfs_buf          *lbp;           /* left buffer pointer */
2254        struct xfs_btree_block  *left;          /* left btree block */
2255        int                     lrecs;          /* left record count */
2256        struct xfs_buf          *rbp;           /* right buffer pointer */
2257        struct xfs_btree_block  *right;         /* right btree block */
2258        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2259        int                     rrecs;          /* right record count */
2260        union xfs_btree_ptr     lptr;           /* left btree pointer */
2261        union xfs_btree_key     *rkp = NULL;    /* right btree key */
2262        union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
2263        union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
2264        int                     error;          /* error return value */
2265        int                     i;
2266
2267        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2268            level == cur->bc_nlevels - 1)
2269                goto out0;
2270
2271        /* Set up variables for this block as "right". */
2272        right = xfs_btree_get_block(cur, level, &rbp);
2273
2274#ifdef DEBUG
2275        error = xfs_btree_check_block(cur, right, level, rbp);
2276        if (error)
2277                goto error0;
2278#endif
2279
2280        /* If we've got no left sibling then we can't shift an entry left. */
2281        xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2282        if (xfs_btree_ptr_is_null(cur, &lptr))
2283                goto out0;
2284
2285        /*
2286         * If the cursor entry is the one that would be moved, don't
2287         * do it... it's too complicated.
2288         */
2289        if (cur->bc_ptrs[level] <= 1)
2290                goto out0;
2291
2292        /* Set up the left neighbor as "left". */
2293        error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2294        if (error)
2295                goto error0;
2296
2297        /* If it's full, it can't take another entry. */
2298        lrecs = xfs_btree_get_numrecs(left);
2299        if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2300                goto out0;
2301
2302        rrecs = xfs_btree_get_numrecs(right);
2303
2304        /*
2305         * We add one entry to the left side and remove one for the right side.
2306         * Account for it here, the changes will be updated on disk and logged
2307         * later.
2308         */
2309        lrecs++;
2310        rrecs--;
2311
2312        XFS_BTREE_STATS_INC(cur, lshift);
2313        XFS_BTREE_STATS_ADD(cur, moves, 1);
2314
2315        /*
2316         * If non-leaf, copy a key and a ptr to the left block.
2317         * Log the changes to the left block.
2318         */
2319        if (level > 0) {
2320                /* It's a non-leaf.  Move keys and pointers. */
2321                union xfs_btree_key     *lkp;   /* left btree key */
2322                union xfs_btree_ptr     *lpp;   /* left address pointer */
2323
2324                lkp = xfs_btree_key_addr(cur, lrecs, left);
2325                rkp = xfs_btree_key_addr(cur, 1, right);
2326
2327                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2328                rpp = xfs_btree_ptr_addr(cur, 1, right);
2329
2330                error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
2331                if (error)
2332                        goto error0;
2333
2334                xfs_btree_copy_keys(cur, lkp, rkp, 1);
2335                xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2336
2337                xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2338                xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2339
2340                ASSERT(cur->bc_ops->keys_inorder(cur,
2341                        xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2342        } else {
2343                /* It's a leaf.  Move records.  */
2344                union xfs_btree_rec     *lrp;   /* left record pointer */
2345
2346                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2347                rrp = xfs_btree_rec_addr(cur, 1, right);
2348
2349                xfs_btree_copy_recs(cur, lrp, rrp, 1);
2350                xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2351
2352                ASSERT(cur->bc_ops->recs_inorder(cur,
2353                        xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2354        }
2355
2356        xfs_btree_set_numrecs(left, lrecs);
2357        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2358
2359        xfs_btree_set_numrecs(right, rrecs);
2360        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2361
2362        /*
2363         * Slide the contents of right down one entry.
2364         */
2365        XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2366        if (level > 0) {
2367                /* It's a nonleaf. operate on keys and ptrs */
2368                for (i = 0; i < rrecs; i++) {
2369                        error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
2370                        if (error)
2371                                goto error0;
2372                }
2373
2374                xfs_btree_shift_keys(cur,
2375                                xfs_btree_key_addr(cur, 2, right),
2376                                -1, rrecs);
2377                xfs_btree_shift_ptrs(cur,
2378                                xfs_btree_ptr_addr(cur, 2, right),
2379                                -1, rrecs);
2380
2381                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2382                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2383        } else {
2384                /* It's a leaf. operate on records */
2385                xfs_btree_shift_recs(cur,
2386                        xfs_btree_rec_addr(cur, 2, right),
2387                        -1, rrecs);
2388                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2389        }
2390
2391        /*
2392         * Using a temporary cursor, update the parent key values of the
2393         * block on the left.
2394         */
2395        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2396                error = xfs_btree_dup_cursor(cur, &tcur);
2397                if (error)
2398                        goto error0;
2399                i = xfs_btree_firstrec(tcur, level);
2400                if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2401                        error = -EFSCORRUPTED;
2402                        goto error0;
2403                }
2404
2405                error = xfs_btree_decrement(tcur, level, &i);
2406                if (error)
2407                        goto error1;
2408
2409                /* Update the parent high keys of the left block, if needed. */
2410                error = xfs_btree_update_keys(tcur, level);
2411                if (error)
2412                        goto error1;
2413
2414                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2415        }
2416
2417        /* Update the parent keys of the right block. */
2418        error = xfs_btree_update_keys(cur, level);
2419        if (error)
2420                goto error0;
2421
2422        /* Slide the cursor value left one. */
2423        cur->bc_ptrs[level]--;
2424
2425        *stat = 1;
2426        return 0;
2427
2428out0:
2429        *stat = 0;
2430        return 0;
2431
2432error0:
2433        return error;
2434
2435error1:
2436        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2437        return error;
2438}
2439
2440/*
2441 * Move 1 record right from cur/level if possible.
2442 * Update cur to reflect the new path.
2443 */
2444STATIC int                                      /* error */
2445xfs_btree_rshift(
2446        struct xfs_btree_cur    *cur,
2447        int                     level,
2448        int                     *stat)          /* success/failure */
2449{
2450        struct xfs_buf          *lbp;           /* left buffer pointer */
2451        struct xfs_btree_block  *left;          /* left btree block */
2452        struct xfs_buf          *rbp;           /* right buffer pointer */
2453        struct xfs_btree_block  *right;         /* right btree block */
2454        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2455        union xfs_btree_ptr     rptr;           /* right block pointer */
2456        union xfs_btree_key     *rkp;           /* right btree key */
2457        int                     rrecs;          /* right record count */
2458        int                     lrecs;          /* left record count */
2459        int                     error;          /* error return value */
2460        int                     i;              /* loop counter */
2461
2462        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2463            (level == cur->bc_nlevels - 1))
2464                goto out0;
2465
2466        /* Set up variables for this block as "left". */
2467        left = xfs_btree_get_block(cur, level, &lbp);
2468
2469#ifdef DEBUG
2470        error = xfs_btree_check_block(cur, left, level, lbp);
2471        if (error)
2472                goto error0;
2473#endif
2474
2475        /* If we've got no right sibling then we can't shift an entry right. */
2476        xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2477        if (xfs_btree_ptr_is_null(cur, &rptr))
2478                goto out0;
2479
2480        /*
2481         * If the cursor entry is the one that would be moved, don't
2482         * do it... it's too complicated.
2483         */
2484        lrecs = xfs_btree_get_numrecs(left);
2485        if (cur->bc_ptrs[level] >= lrecs)
2486                goto out0;
2487
2488        /* Set up the right neighbor as "right". */
2489        error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2490        if (error)
2491                goto error0;
2492
2493        /* If it's full, it can't take another entry. */
2494        rrecs = xfs_btree_get_numrecs(right);
2495        if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2496                goto out0;
2497
2498        XFS_BTREE_STATS_INC(cur, rshift);
2499        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2500
2501        /*
2502         * Make a hole at the start of the right neighbor block, then
2503         * copy the last left block entry to the hole.
2504         */
2505        if (level > 0) {
2506                /* It's a nonleaf. make a hole in the keys and ptrs */
2507                union xfs_btree_key     *lkp;
2508                union xfs_btree_ptr     *lpp;
2509                union xfs_btree_ptr     *rpp;
2510
2511                lkp = xfs_btree_key_addr(cur, lrecs, left);
2512                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2513                rkp = xfs_btree_key_addr(cur, 1, right);
2514                rpp = xfs_btree_ptr_addr(cur, 1, right);
2515
2516                for (i = rrecs - 1; i >= 0; i--) {
2517                        error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
2518                        if (error)
2519                                goto error0;
2520                }
2521
2522                xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2523                xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2524
2525                error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
2526                if (error)
2527                        goto error0;
2528
2529                /* Now put the new data in, and log it. */
2530                xfs_btree_copy_keys(cur, rkp, lkp, 1);
2531                xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2532
2533                xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2534                xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2535
2536                ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2537                        xfs_btree_key_addr(cur, 2, right)));
2538        } else {
2539                /* It's a leaf. make a hole in the records */
2540                union xfs_btree_rec     *lrp;
2541                union xfs_btree_rec     *rrp;
2542
2543                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2544                rrp = xfs_btree_rec_addr(cur, 1, right);
2545
2546                xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2547
2548                /* Now put the new data in, and log it. */
2549                xfs_btree_copy_recs(cur, rrp, lrp, 1);
2550                xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2551        }
2552
2553        /*
2554         * Decrement and log left's numrecs, bump and log right's numrecs.
2555         */
2556        xfs_btree_set_numrecs(left, --lrecs);
2557        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2558
2559        xfs_btree_set_numrecs(right, ++rrecs);
2560        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2561
2562        /*
2563         * Using a temporary cursor, update the parent key values of the
2564         * block on the right.
2565         */
2566        error = xfs_btree_dup_cursor(cur, &tcur);
2567        if (error)
2568                goto error0;
2569        i = xfs_btree_lastrec(tcur, level);
2570        if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2571                error = -EFSCORRUPTED;
2572                goto error0;
2573        }
2574
2575        error = xfs_btree_increment(tcur, level, &i);
2576        if (error)
2577                goto error1;
2578
2579        /* Update the parent high keys of the left block, if needed. */
2580        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2581                error = xfs_btree_update_keys(cur, level);
2582                if (error)
2583                        goto error1;
2584        }
2585
2586        /* Update the parent keys of the right block. */
2587        error = xfs_btree_update_keys(tcur, level);
2588        if (error)
2589                goto error1;
2590
2591        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2592
2593        *stat = 1;
2594        return 0;
2595
2596out0:
2597        *stat = 0;
2598        return 0;
2599
2600error0:
2601        return error;
2602
2603error1:
2604        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2605        return error;
2606}
2607
2608/*
2609 * Split cur/level block in half.
2610 * Return new block number and the key to its first
2611 * record (to be inserted into parent).
2612 */
2613STATIC int                                      /* error */
2614__xfs_btree_split(
2615        struct xfs_btree_cur    *cur,
2616        int                     level,
2617        union xfs_btree_ptr     *ptrp,
2618        union xfs_btree_key     *key,
2619        struct xfs_btree_cur    **curp,
2620        int                     *stat)          /* success/failure */
2621{
2622        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2623        struct xfs_buf          *lbp;           /* left buffer pointer */
2624        struct xfs_btree_block  *left;          /* left btree block */
2625        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2626        struct xfs_buf          *rbp;           /* right buffer pointer */
2627        struct xfs_btree_block  *right;         /* right btree block */
2628        union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2629        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2630        struct xfs_btree_block  *rrblock;       /* right-right btree block */
2631        int                     lrecs;
2632        int                     rrecs;
2633        int                     src_index;
2634        int                     error;          /* error return value */
2635        int                     i;
2636
2637        XFS_BTREE_STATS_INC(cur, split);
2638
2639        /* Set up left block (current one). */
2640        left = xfs_btree_get_block(cur, level, &lbp);
2641
2642#ifdef DEBUG
2643        error = xfs_btree_check_block(cur, left, level, lbp);
2644        if (error)
2645                goto error0;
2646#endif
2647
2648        xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2649
2650        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2651        error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2652        if (error)
2653                goto error0;
2654        if (*stat == 0)
2655                goto out0;
2656        XFS_BTREE_STATS_INC(cur, alloc);
2657
2658        /* Set up the new block as "right". */
2659        error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
2660        if (error)
2661                goto error0;
2662
2663        /* Fill in the btree header for the new right block. */
2664        xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2665
2666        /*
2667         * Split the entries between the old and the new block evenly.
2668         * Make sure that if there's an odd number of entries now, that
2669         * each new block will have the same number of entries.
2670         */
2671        lrecs = xfs_btree_get_numrecs(left);
2672        rrecs = lrecs / 2;
2673        if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2674                rrecs++;
2675        src_index = (lrecs - rrecs + 1);
2676
2677        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2678
2679        /* Adjust numrecs for the later get_*_keys() calls. */
2680        lrecs -= rrecs;
2681        xfs_btree_set_numrecs(left, lrecs);
2682        xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2683
2684        /*
2685         * Copy btree block entries from the left block over to the
2686         * new block, the right. Update the right block and log the
2687         * changes.
2688         */
2689        if (level > 0) {
2690                /* It's a non-leaf.  Move keys and pointers. */
2691                union xfs_btree_key     *lkp;   /* left btree key */
2692                union xfs_btree_ptr     *lpp;   /* left address pointer */
2693                union xfs_btree_key     *rkp;   /* right btree key */
2694                union xfs_btree_ptr     *rpp;   /* right address pointer */
2695
2696                lkp = xfs_btree_key_addr(cur, src_index, left);
2697                lpp = xfs_btree_ptr_addr(cur, src_index, left);
2698                rkp = xfs_btree_key_addr(cur, 1, right);
2699                rpp = xfs_btree_ptr_addr(cur, 1, right);
2700
2701                for (i = src_index; i < rrecs; i++) {
2702                        error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
2703                        if (error)
2704                                goto error0;
2705                }
2706
2707                /* Copy the keys & pointers to the new block. */
2708                xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2709                xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2710
2711                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2712                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2713
2714                /* Stash the keys of the new block for later insertion. */
2715                xfs_btree_get_node_keys(cur, right, key);
2716        } else {
2717                /* It's a leaf.  Move records.  */
2718                union xfs_btree_rec     *lrp;   /* left record pointer */
2719                union xfs_btree_rec     *rrp;   /* right record pointer */
2720
2721                lrp = xfs_btree_rec_addr(cur, src_index, left);
2722                rrp = xfs_btree_rec_addr(cur, 1, right);
2723
2724                /* Copy records to the new block. */
2725                xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2726                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2727
2728                /* Stash the keys of the new block for later insertion. */
2729                xfs_btree_get_leaf_keys(cur, right, key);
2730        }
2731
2732        /*
2733         * Find the left block number by looking in the buffer.
2734         * Adjust sibling pointers.
2735         */
2736        xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2737        xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2738        xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2739        xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2740
2741        xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2742        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2743
2744        /*
2745         * If there's a block to the new block's right, make that block
2746         * point back to right instead of to left.
2747         */
2748        if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2749                error = xfs_btree_read_buf_block(cur, &rrptr,
2750                                                        0, &rrblock, &rrbp);
2751                if (error)
2752                        goto error0;
2753                xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2754                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2755        }
2756
2757        /* Update the parent high keys of the left block, if needed. */
2758        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2759                error = xfs_btree_update_keys(cur, level);
2760                if (error)
2761                        goto error0;
2762        }
2763
2764        /*
2765         * If the cursor is really in the right block, move it there.
2766         * If it's just pointing past the last entry in left, then we'll
2767         * insert there, so don't change anything in that case.
2768         */
2769        if (cur->bc_ptrs[level] > lrecs + 1) {
2770                xfs_btree_setbuf(cur, level, rbp);
2771                cur->bc_ptrs[level] -= lrecs;
2772        }
2773        /*
2774         * If there are more levels, we'll need another cursor which refers
2775         * the right block, no matter where this cursor was.
2776         */
2777        if (level + 1 < cur->bc_nlevels) {
2778                error = xfs_btree_dup_cursor(cur, curp);
2779                if (error)
2780                        goto error0;
2781                (*curp)->bc_ptrs[level + 1]++;
2782        }
2783        *ptrp = rptr;
2784        *stat = 1;
2785        return 0;
2786out0:
2787        *stat = 0;
2788        return 0;
2789
2790error0:
2791        return error;
2792}
2793
2794struct xfs_btree_split_args {
2795        struct xfs_btree_cur    *cur;
2796        int                     level;
2797        union xfs_btree_ptr     *ptrp;
2798        union xfs_btree_key     *key;
2799        struct xfs_btree_cur    **curp;
2800        int                     *stat;          /* success/failure */
2801        int                     result;
2802        bool                    kswapd; /* allocation in kswapd context */
2803        struct completion       *done;
2804        struct work_struct      work;
2805};
2806
2807/*
2808 * Stack switching interfaces for allocation
2809 */
2810static void
2811xfs_btree_split_worker(
2812        struct work_struct      *work)
2813{
2814        struct xfs_btree_split_args     *args = container_of(work,
2815                                                struct xfs_btree_split_args, work);
2816        unsigned long           pflags;
2817        unsigned long           new_pflags = PF_MEMALLOC_NOFS;
2818
2819        /*
2820         * we are in a transaction context here, but may also be doing work
2821         * in kswapd context, and hence we may need to inherit that state
2822         * temporarily to ensure that we don't block waiting for memory reclaim
2823         * in any way.
2824         */
2825        if (args->kswapd)
2826                new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2827
2828        current_set_flags_nested(&pflags, new_pflags);
2829
2830        args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2831                                         args->key, args->curp, args->stat);
2832        complete(args->done);
2833
2834        current_restore_flags_nested(&pflags, new_pflags);
2835}
2836
2837/*
2838 * BMBT split requests often come in with little stack to work on. Push
2839 * them off to a worker thread so there is lots of stack to use. For the other
2840 * btree types, just call directly to avoid the context switch overhead here.
2841 */
2842STATIC int                                      /* error */
2843xfs_btree_split(
2844        struct xfs_btree_cur    *cur,
2845        int                     level,
2846        union xfs_btree_ptr     *ptrp,
2847        union xfs_btree_key     *key,
2848        struct xfs_btree_cur    **curp,
2849        int                     *stat)          /* success/failure */
2850{
2851        struct xfs_btree_split_args     args;
2852        DECLARE_COMPLETION_ONSTACK(done);
2853
2854        if (cur->bc_btnum != XFS_BTNUM_BMAP)
2855                return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2856
2857        args.cur = cur;
2858        args.level = level;
2859        args.ptrp = ptrp;
2860        args.key = key;
2861        args.curp = curp;
2862        args.stat = stat;
2863        args.done = &done;
2864        args.kswapd = current_is_kswapd();
2865        INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2866        queue_work(xfs_alloc_wq, &args.work);
2867        wait_for_completion(&done);
2868        destroy_work_on_stack(&args.work);
2869        return args.result;
2870}
2871
2872
2873/*
2874 * Copy the old inode root contents into a real block and make the
2875 * broot point to it.
2876 */
2877int                                             /* error */
2878xfs_btree_new_iroot(
2879        struct xfs_btree_cur    *cur,           /* btree cursor */
2880        int                     *logflags,      /* logging flags for inode */
2881        int                     *stat)          /* return status - 0 fail */
2882{
2883        struct xfs_buf          *cbp;           /* buffer for cblock */
2884        struct xfs_btree_block  *block;         /* btree block */
2885        struct xfs_btree_block  *cblock;        /* child btree block */
2886        union xfs_btree_key     *ckp;           /* child key pointer */
2887        union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2888        union xfs_btree_key     *kp;            /* pointer to btree key */
2889        union xfs_btree_ptr     *pp;            /* pointer to block addr */
2890        union xfs_btree_ptr     nptr;           /* new block addr */
2891        int                     level;          /* btree level */
2892        int                     error;          /* error return code */
2893        int                     i;              /* loop counter */
2894
2895        XFS_BTREE_STATS_INC(cur, newroot);
2896
2897        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2898
2899        level = cur->bc_nlevels - 1;
2900
2901        block = xfs_btree_get_iroot(cur);
2902        pp = xfs_btree_ptr_addr(cur, 1, block);
2903
2904        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2905        error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2906        if (error)
2907                goto error0;
2908        if (*stat == 0)
2909                return 0;
2910
2911        XFS_BTREE_STATS_INC(cur, alloc);
2912
2913        /* Copy the root into a real block. */
2914        error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
2915        if (error)
2916                goto error0;
2917
2918        /*
2919         * we can't just memcpy() the root in for CRC enabled btree blocks.
2920         * In that case have to also ensure the blkno remains correct
2921         */
2922        memcpy(cblock, block, xfs_btree_block_len(cur));
2923        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2924                if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2925                        cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2926                else
2927                        cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2928        }
2929
2930        be16_add_cpu(&block->bb_level, 1);
2931        xfs_btree_set_numrecs(block, 1);
2932        cur->bc_nlevels++;
2933        cur->bc_ptrs[level + 1] = 1;
2934
2935        kp = xfs_btree_key_addr(cur, 1, block);
2936        ckp = xfs_btree_key_addr(cur, 1, cblock);
2937        xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2938
2939        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2940        for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2941                error = xfs_btree_debug_check_ptr(cur, pp, i, level);
2942                if (error)
2943                        goto error0;
2944        }
2945
2946        xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2947
2948        error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
2949        if (error)
2950                goto error0;
2951
2952        xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2953
2954        xfs_iroot_realloc(cur->bc_ino.ip,
2955                          1 - xfs_btree_get_numrecs(cblock),
2956                          cur->bc_ino.whichfork);
2957
2958        xfs_btree_setbuf(cur, level, cbp);
2959
2960        /*
2961         * Do all this logging at the end so that
2962         * the root is at the right level.
2963         */
2964        xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2965        xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2966        xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2967
2968        *logflags |=
2969                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
2970        *stat = 1;
2971        return 0;
2972error0:
2973        return error;
2974}
2975
2976/*
2977 * Allocate a new root block, fill it in.
2978 */
2979STATIC int                              /* error */
2980xfs_btree_new_root(
2981        struct xfs_btree_cur    *cur,   /* btree cursor */
2982        int                     *stat)  /* success/failure */
2983{
2984        struct xfs_btree_block  *block; /* one half of the old root block */
2985        struct xfs_buf          *bp;    /* buffer containing block */
2986        int                     error;  /* error return value */
2987        struct xfs_buf          *lbp;   /* left buffer pointer */
2988        struct xfs_btree_block  *left;  /* left btree block */
2989        struct xfs_buf          *nbp;   /* new (root) buffer */
2990        struct xfs_btree_block  *new;   /* new (root) btree block */
2991        int                     nptr;   /* new value for key index, 1 or 2 */
2992        struct xfs_buf          *rbp;   /* right buffer pointer */
2993        struct xfs_btree_block  *right; /* right btree block */
2994        union xfs_btree_ptr     rptr;
2995        union xfs_btree_ptr     lptr;
2996
2997        XFS_BTREE_STATS_INC(cur, newroot);
2998
2999        /* initialise our start point from the cursor */
3000        cur->bc_ops->init_ptr_from_cur(cur, &rptr);
3001
3002        /* Allocate the new block. If we can't do it, we're toast. Give up. */
3003        error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
3004        if (error)
3005                goto error0;
3006        if (*stat == 0)
3007                goto out0;
3008        XFS_BTREE_STATS_INC(cur, alloc);
3009
3010        /* Set up the new block. */
3011        error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
3012        if (error)
3013                goto error0;
3014
3015        /* Set the root in the holding structure  increasing the level by 1. */
3016        cur->bc_ops->set_root(cur, &lptr, 1);
3017
3018        /*
3019         * At the previous root level there are now two blocks: the old root,
3020         * and the new block generated when it was split.  We don't know which
3021         * one the cursor is pointing at, so we set up variables "left" and
3022         * "right" for each case.
3023         */
3024        block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
3025
3026#ifdef DEBUG
3027        error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
3028        if (error)
3029                goto error0;
3030#endif
3031
3032        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3033        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3034                /* Our block is left, pick up the right block. */
3035                lbp = bp;
3036                xfs_btree_buf_to_ptr(cur, lbp, &lptr);
3037                left = block;
3038                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3039                if (error)
3040                        goto error0;
3041                bp = rbp;
3042                nptr = 1;
3043        } else {
3044                /* Our block is right, pick up the left block. */
3045                rbp = bp;
3046                xfs_btree_buf_to_ptr(cur, rbp, &rptr);
3047                right = block;
3048                xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
3049                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3050                if (error)
3051                        goto error0;
3052                bp = lbp;
3053                nptr = 2;
3054        }
3055
3056        /* Fill in the new block's btree header and log it. */
3057        xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
3058        xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
3059        ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
3060                        !xfs_btree_ptr_is_null(cur, &rptr));
3061
3062        /* Fill in the key data in the new root. */
3063        if (xfs_btree_get_level(left) > 0) {
3064                /*
3065                 * Get the keys for the left block's keys and put them directly
3066                 * in the parent block.  Do the same for the right block.
3067                 */
3068                xfs_btree_get_node_keys(cur, left,
3069                                xfs_btree_key_addr(cur, 1, new));
3070                xfs_btree_get_node_keys(cur, right,
3071                                xfs_btree_key_addr(cur, 2, new));
3072        } else {
3073                /*
3074                 * Get the keys for the left block's records and put them
3075                 * directly in the parent block.  Do the same for the right
3076                 * block.
3077                 */
3078                xfs_btree_get_leaf_keys(cur, left,
3079                        xfs_btree_key_addr(cur, 1, new));
3080                xfs_btree_get_leaf_keys(cur, right,
3081                        xfs_btree_key_addr(cur, 2, new));
3082        }
3083        xfs_btree_log_keys(cur, nbp, 1, 2);
3084
3085        /* Fill in the pointer data in the new root. */
3086        xfs_btree_copy_ptrs(cur,
3087                xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
3088        xfs_btree_copy_ptrs(cur,
3089                xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
3090        xfs_btree_log_ptrs(cur, nbp, 1, 2);
3091
3092        /* Fix up the cursor. */
3093        xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
3094        cur->bc_ptrs[cur->bc_nlevels] = nptr;
3095        cur->bc_nlevels++;
3096        *stat = 1;
3097        return 0;
3098error0:
3099        return error;
3100out0:
3101        *stat = 0;
3102        return 0;
3103}
3104
3105STATIC int
3106xfs_btree_make_block_unfull(
3107        struct xfs_btree_cur    *cur,   /* btree cursor */
3108        int                     level,  /* btree level */
3109        int                     numrecs,/* # of recs in block */
3110        int                     *oindex,/* old tree index */
3111        int                     *index, /* new tree index */
3112        union xfs_btree_ptr     *nptr,  /* new btree ptr */
3113        struct xfs_btree_cur    **ncur, /* new btree cursor */
3114        union xfs_btree_key     *key,   /* key of new block */
3115        int                     *stat)
3116{
3117        int                     error = 0;
3118
3119        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3120            level == cur->bc_nlevels - 1) {
3121                struct xfs_inode *ip = cur->bc_ino.ip;
3122
3123                if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3124                        /* A root block that can be made bigger. */
3125                        xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
3126                        *stat = 1;
3127                } else {
3128                        /* A root block that needs replacing */
3129                        int     logflags = 0;
3130
3131                        error = xfs_btree_new_iroot(cur, &logflags, stat);
3132                        if (error || *stat == 0)
3133                                return error;
3134
3135                        xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3136                }
3137
3138                return 0;
3139        }
3140
3141        /* First, try shifting an entry to the right neighbor. */
3142        error = xfs_btree_rshift(cur, level, stat);
3143        if (error || *stat)
3144                return error;
3145
3146        /* Next, try shifting an entry to the left neighbor. */
3147        error = xfs_btree_lshift(cur, level, stat);
3148        if (error)
3149                return error;
3150
3151        if (*stat) {
3152                *oindex = *index = cur->bc_ptrs[level];
3153                return 0;
3154        }
3155
3156        /*
3157         * Next, try splitting the current block in half.
3158         *
3159         * If this works we have to re-set our variables because we
3160         * could be in a different block now.
3161         */
3162        error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3163        if (error || *stat == 0)
3164                return error;
3165
3166
3167        *index = cur->bc_ptrs[level];
3168        return 0;
3169}
3170
3171/*
3172 * Insert one record/level.  Return information to the caller
3173 * allowing the next level up to proceed if necessary.
3174 */
3175STATIC int
3176xfs_btree_insrec(
3177        struct xfs_btree_cur    *cur,   /* btree cursor */
3178        int                     level,  /* level to insert record at */
3179        union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
3180        union xfs_btree_rec     *rec,   /* record to insert */
3181        union xfs_btree_key     *key,   /* i/o: block key for ptrp */
3182        struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
3183        int                     *stat)  /* success/failure */
3184{
3185        struct xfs_btree_block  *block; /* btree block */
3186        struct xfs_buf          *bp;    /* buffer for block */
3187        union xfs_btree_ptr     nptr;   /* new block ptr */
3188        struct xfs_btree_cur    *ncur;  /* new btree cursor */
3189        union xfs_btree_key     nkey;   /* new block key */
3190        union xfs_btree_key     *lkey;
3191        int                     optr;   /* old key/record index */
3192        int                     ptr;    /* key/record index */
3193        int                     numrecs;/* number of records */
3194        int                     error;  /* error return value */
3195        int                     i;
3196        xfs_daddr_t             old_bn;
3197
3198        ncur = NULL;
3199        lkey = &nkey;
3200
3201        /*
3202         * If we have an external root pointer, and we've made it to the
3203         * root level, allocate a new root block and we're done.
3204         */
3205        if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3206            (level >= cur->bc_nlevels)) {
3207                error = xfs_btree_new_root(cur, stat);
3208                xfs_btree_set_ptr_null(cur, ptrp);
3209
3210                return error;
3211        }
3212
3213        /* If we're off the left edge, return failure. */
3214        ptr = cur->bc_ptrs[level];
3215        if (ptr == 0) {
3216                *stat = 0;
3217                return 0;
3218        }
3219
3220        optr = ptr;
3221
3222        XFS_BTREE_STATS_INC(cur, insrec);
3223
3224        /* Get pointers to the btree buffer and block. */
3225        block = xfs_btree_get_block(cur, level, &bp);
3226        old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
3227        numrecs = xfs_btree_get_numrecs(block);
3228
3229#ifdef DEBUG
3230        error = xfs_btree_check_block(cur, block, level, bp);
3231        if (error)
3232                goto error0;
3233
3234        /* Check that the new entry is being inserted in the right place. */
3235        if (ptr <= numrecs) {
3236                if (level == 0) {
3237                        ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3238                                xfs_btree_rec_addr(cur, ptr, block)));
3239                } else {
3240                        ASSERT(cur->bc_ops->keys_inorder(cur, key,
3241                                xfs_btree_key_addr(cur, ptr, block)));
3242                }
3243        }
3244#endif
3245
3246        /*
3247         * If the block is full, we can't insert the new entry until we
3248         * make the block un-full.
3249         */
3250        xfs_btree_set_ptr_null(cur, &nptr);
3251        if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3252                error = xfs_btree_make_block_unfull(cur, level, numrecs,
3253                                        &optr, &ptr, &nptr, &ncur, lkey, stat);
3254                if (error || *stat == 0)
3255                        goto error0;
3256        }
3257
3258        /*
3259         * The current block may have changed if the block was
3260         * previously full and we have just made space in it.
3261         */
3262        block = xfs_btree_get_block(cur, level, &bp);
3263        numrecs = xfs_btree_get_numrecs(block);
3264
3265#ifdef DEBUG
3266        error = xfs_btree_check_block(cur, block, level, bp);
3267        if (error)
3268                return error;
3269#endif
3270
3271        /*
3272         * At this point we know there's room for our new entry in the block
3273         * we're pointing at.
3274         */
3275        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3276
3277        if (level > 0) {
3278                /* It's a nonleaf. make a hole in the keys and ptrs */
3279                union xfs_btree_key     *kp;
3280                union xfs_btree_ptr     *pp;
3281
3282                kp = xfs_btree_key_addr(cur, ptr, block);
3283                pp = xfs_btree_ptr_addr(cur, ptr, block);
3284
3285                for (i = numrecs - ptr; i >= 0; i--) {
3286                        error = xfs_btree_debug_check_ptr(cur, pp, i, level);
3287                        if (error)
3288                                return error;
3289                }
3290
3291                xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3292                xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3293
3294                error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
3295                if (error)
3296                        goto error0;
3297
3298                /* Now put the new data in, bump numrecs and log it. */
3299                xfs_btree_copy_keys(cur, kp, key, 1);
3300                xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3301                numrecs++;
3302                xfs_btree_set_numrecs(block, numrecs);
3303                xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3304                xfs_btree_log_keys(cur, bp, ptr, numrecs);
3305#ifdef DEBUG
3306                if (ptr < numrecs) {
3307                        ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3308                                xfs_btree_key_addr(cur, ptr + 1, block)));
3309                }
3310#endif
3311        } else {
3312                /* It's a leaf. make a hole in the records */
3313                union xfs_btree_rec             *rp;
3314
3315                rp = xfs_btree_rec_addr(cur, ptr, block);
3316
3317                xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3318
3319                /* Now put the new data in, bump numrecs and log it. */
3320                xfs_btree_copy_recs(cur, rp, rec, 1);
3321                xfs_btree_set_numrecs(block, ++numrecs);
3322                xfs_btree_log_recs(cur, bp, ptr, numrecs);
3323#ifdef DEBUG
3324                if (ptr < numrecs) {
3325                        ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3326                                xfs_btree_rec_addr(cur, ptr + 1, block)));
3327                }
3328#endif
3329        }
3330
3331        /* Log the new number of records in the btree header. */
3332        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3333
3334        /*
3335         * If we just inserted into a new tree block, we have to
3336         * recalculate nkey here because nkey is out of date.
3337         *
3338         * Otherwise we're just updating an existing block (having shoved
3339         * some records into the new tree block), so use the regular key
3340         * update mechanism.
3341         */
3342        if (bp && bp->b_bn != old_bn) {
3343                xfs_btree_get_keys(cur, block, lkey);
3344        } else if (xfs_btree_needs_key_update(cur, optr)) {
3345                error = xfs_btree_update_keys(cur, level);
3346                if (error)
3347                        goto error0;
3348        }
3349
3350        /*
3351         * If we are tracking the last record in the tree and
3352         * we are at the far right edge of the tree, update it.
3353         */
3354        if (xfs_btree_is_lastrec(cur, block, level)) {
3355                cur->bc_ops->update_lastrec(cur, block, rec,
3356                                            ptr, LASTREC_INSREC);
3357        }
3358
3359        /*
3360         * Return the new block number, if any.
3361         * If there is one, give back a record value and a cursor too.
3362         */
3363        *ptrp = nptr;
3364        if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3365                xfs_btree_copy_keys(cur, key, lkey, 1);
3366                *curp = ncur;
3367        }
3368
3369        *stat = 1;
3370        return 0;
3371
3372error0:
3373        return error;
3374}
3375
3376/*
3377 * Insert the record at the point referenced by cur.
3378 *
3379 * A multi-level split of the tree on insert will invalidate the original
3380 * cursor.  All callers of this function should assume that the cursor is
3381 * no longer valid and revalidate it.
3382 */
3383int
3384xfs_btree_insert(
3385        struct xfs_btree_cur    *cur,
3386        int                     *stat)
3387{
3388        int                     error;  /* error return value */
3389        int                     i;      /* result value, 0 for failure */
3390        int                     level;  /* current level number in btree */
3391        union xfs_btree_ptr     nptr;   /* new block number (split result) */
3392        struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3393        struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3394        union xfs_btree_key     bkey;   /* key of block to insert */
3395        union xfs_btree_key     *key;
3396        union xfs_btree_rec     rec;    /* record to insert */
3397
3398        level = 0;
3399        ncur = NULL;
3400        pcur = cur;
3401        key = &bkey;
3402
3403        xfs_btree_set_ptr_null(cur, &nptr);
3404
3405        /* Make a key out of the record data to be inserted, and save it. */
3406        cur->bc_ops->init_rec_from_cur(cur, &rec);
3407        cur->bc_ops->init_key_from_rec(key, &rec);
3408
3409        /*
3410         * Loop going up the tree, starting at the leaf level.
3411         * Stop when we don't get a split block, that must mean that
3412         * the insert is finished with this level.
3413         */
3414        do {
3415                /*
3416                 * Insert nrec/nptr into this level of the tree.
3417                 * Note if we fail, nptr will be null.
3418                 */
3419                error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3420                                &ncur, &i);
3421                if (error) {
3422                        if (pcur != cur)
3423                                xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3424                        goto error0;
3425                }
3426
3427                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3428                        error = -EFSCORRUPTED;
3429                        goto error0;
3430                }
3431                level++;
3432
3433                /*
3434                 * See if the cursor we just used is trash.
3435                 * Can't trash the caller's cursor, but otherwise we should
3436                 * if ncur is a new cursor or we're about to be done.
3437                 */
3438                if (pcur != cur &&
3439                    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3440                        /* Save the state from the cursor before we trash it */
3441                        if (cur->bc_ops->update_cursor)
3442                                cur->bc_ops->update_cursor(pcur, cur);
3443                        cur->bc_nlevels = pcur->bc_nlevels;
3444                        xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3445                }
3446                /* If we got a new cursor, switch to it. */
3447                if (ncur) {
3448                        pcur = ncur;
3449                        ncur = NULL;
3450                }
3451        } while (!xfs_btree_ptr_is_null(cur, &nptr));
3452
3453        *stat = i;
3454        return 0;
3455error0:
3456        return error;
3457}
3458
3459/*
3460 * Try to merge a non-leaf block back into the inode root.
3461 *
3462 * Note: the killroot names comes from the fact that we're effectively
3463 * killing the old root block.  But because we can't just delete the
3464 * inode we have to copy the single block it was pointing to into the
3465 * inode.
3466 */
3467STATIC int
3468xfs_btree_kill_iroot(
3469        struct xfs_btree_cur    *cur)
3470{
3471        int                     whichfork = cur->bc_ino.whichfork;
3472        struct xfs_inode        *ip = cur->bc_ino.ip;
3473        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3474        struct xfs_btree_block  *block;
3475        struct xfs_btree_block  *cblock;
3476        union xfs_btree_key     *kp;
3477        union xfs_btree_key     *ckp;
3478        union xfs_btree_ptr     *pp;
3479        union xfs_btree_ptr     *cpp;
3480        struct xfs_buf          *cbp;
3481        int                     level;
3482        int                     index;
3483        int                     numrecs;
3484        int                     error;
3485#ifdef DEBUG
3486        union xfs_btree_ptr     ptr;
3487#endif
3488        int                     i;
3489
3490        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3491        ASSERT(cur->bc_nlevels > 1);
3492
3493        /*
3494         * Don't deal with the root block needs to be a leaf case.
3495         * We're just going to turn the thing back into extents anyway.
3496         */
3497        level = cur->bc_nlevels - 1;
3498        if (level == 1)
3499                goto out0;
3500
3501        /*
3502         * Give up if the root has multiple children.
3503         */
3504        block = xfs_btree_get_iroot(cur);
3505        if (xfs_btree_get_numrecs(block) != 1)
3506                goto out0;
3507
3508        cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3509        numrecs = xfs_btree_get_numrecs(cblock);
3510
3511        /*
3512         * Only do this if the next level will fit.
3513         * Then the data must be copied up to the inode,
3514         * instead of freeing the root you free the next level.
3515         */
3516        if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3517                goto out0;
3518
3519        XFS_BTREE_STATS_INC(cur, killroot);
3520
3521#ifdef DEBUG
3522        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3523        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3524        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3525        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3526#endif
3527
3528        index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3529        if (index) {
3530                xfs_iroot_realloc(cur->bc_ino.ip, index,
3531                                  cur->bc_ino.whichfork);
3532                block = ifp->if_broot;
3533        }
3534
3535        be16_add_cpu(&block->bb_numrecs, index);
3536        ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3537
3538        kp = xfs_btree_key_addr(cur, 1, block);
3539        ckp = xfs_btree_key_addr(cur, 1, cblock);
3540        xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3541
3542        pp = xfs_btree_ptr_addr(cur, 1, block);
3543        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3544
3545        for (i = 0; i < numrecs; i++) {
3546                error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
3547                if (error)
3548                        return error;
3549        }
3550
3551        xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3552
3553        error = xfs_btree_free_block(cur, cbp);
3554        if (error)
3555                return error;
3556
3557        cur->bc_bufs[level - 1] = NULL;
3558        be16_add_cpu(&block->bb_level, -1);
3559        xfs_trans_log_inode(cur->bc_tp, ip,
3560                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
3561        cur->bc_nlevels--;
3562out0:
3563        return 0;
3564}
3565
3566/*
3567 * Kill the current root node, and replace it with it's only child node.
3568 */
3569STATIC int
3570xfs_btree_kill_root(
3571        struct xfs_btree_cur    *cur,
3572        struct xfs_buf          *bp,
3573        int                     level,
3574        union xfs_btree_ptr     *newroot)
3575{
3576        int                     error;
3577
3578        XFS_BTREE_STATS_INC(cur, killroot);
3579
3580        /*
3581         * Update the root pointer, decreasing the level by 1 and then
3582         * free the old root.
3583         */
3584        cur->bc_ops->set_root(cur, newroot, -1);
3585
3586        error = xfs_btree_free_block(cur, bp);
3587        if (error)
3588                return error;
3589
3590        cur->bc_bufs[level] = NULL;
3591        cur->bc_ra[level] = 0;
3592        cur->bc_nlevels--;
3593
3594        return 0;
3595}
3596
3597STATIC int
3598xfs_btree_dec_cursor(
3599        struct xfs_btree_cur    *cur,
3600        int                     level,
3601        int                     *stat)
3602{
3603        int                     error;
3604        int                     i;
3605
3606        if (level > 0) {
3607                error = xfs_btree_decrement(cur, level, &i);
3608                if (error)
3609                        return error;
3610        }
3611
3612        *stat = 1;
3613        return 0;
3614}
3615
3616/*
3617 * Single level of the btree record deletion routine.
3618 * Delete record pointed to by cur/level.
3619 * Remove the record from its block then rebalance the tree.
3620 * Return 0 for error, 1 for done, 2 to go on to the next level.
3621 */
3622STATIC int                                      /* error */
3623xfs_btree_delrec(
3624        struct xfs_btree_cur    *cur,           /* btree cursor */
3625        int                     level,          /* level removing record from */
3626        int                     *stat)          /* fail/done/go-on */
3627{
3628        struct xfs_btree_block  *block;         /* btree block */
3629        union xfs_btree_ptr     cptr;           /* current block ptr */
3630        struct xfs_buf          *bp;            /* buffer for block */
3631        int                     error;          /* error return value */
3632        int                     i;              /* loop counter */
3633        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3634        struct xfs_buf          *lbp;           /* left buffer pointer */
3635        struct xfs_btree_block  *left;          /* left btree block */
3636        int                     lrecs = 0;      /* left record count */
3637        int                     ptr;            /* key/record index */
3638        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3639        struct xfs_buf          *rbp;           /* right buffer pointer */
3640        struct xfs_btree_block  *right;         /* right btree block */
3641        struct xfs_btree_block  *rrblock;       /* right-right btree block */
3642        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3643        int                     rrecs = 0;      /* right record count */
3644        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3645        int                     numrecs;        /* temporary numrec count */
3646
3647        tcur = NULL;
3648
3649        /* Get the index of the entry being deleted, check for nothing there. */
3650        ptr = cur->bc_ptrs[level];
3651        if (ptr == 0) {
3652                *stat = 0;
3653                return 0;
3654        }
3655
3656        /* Get the buffer & block containing the record or key/ptr. */
3657        block = xfs_btree_get_block(cur, level, &bp);
3658        numrecs = xfs_btree_get_numrecs(block);
3659
3660#ifdef DEBUG
3661        error = xfs_btree_check_block(cur, block, level, bp);
3662        if (error)
3663                goto error0;
3664#endif
3665
3666        /* Fail if we're off the end of the block. */
3667        if (ptr > numrecs) {
3668                *stat = 0;
3669                return 0;
3670        }
3671
3672        XFS_BTREE_STATS_INC(cur, delrec);
3673        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3674
3675        /* Excise the entries being deleted. */
3676        if (level > 0) {
3677                /* It's a nonleaf. operate on keys and ptrs */
3678                union xfs_btree_key     *lkp;
3679                union xfs_btree_ptr     *lpp;
3680
3681                lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3682                lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3683
3684                for (i = 0; i < numrecs - ptr; i++) {
3685                        error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
3686                        if (error)
3687                                goto error0;
3688                }
3689
3690                if (ptr < numrecs) {
3691                        xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3692                        xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3693                        xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3694                        xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3695                }
3696        } else {
3697                /* It's a leaf. operate on records */
3698                if (ptr < numrecs) {
3699                        xfs_btree_shift_recs(cur,
3700                                xfs_btree_rec_addr(cur, ptr + 1, block),
3701                                -1, numrecs - ptr);
3702                        xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3703                }
3704        }
3705
3706        /*
3707         * Decrement and log the number of entries in the block.
3708         */
3709        xfs_btree_set_numrecs(block, --numrecs);
3710        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3711
3712        /*
3713         * If we are tracking the last record in the tree and
3714         * we are at the far right edge of the tree, update it.
3715         */
3716        if (xfs_btree_is_lastrec(cur, block, level)) {
3717                cur->bc_ops->update_lastrec(cur, block, NULL,
3718                                            ptr, LASTREC_DELREC);
3719        }
3720
3721        /*
3722         * We're at the root level.  First, shrink the root block in-memory.
3723         * Try to get rid of the next level down.  If we can't then there's
3724         * nothing left to do.
3725         */
3726        if (level == cur->bc_nlevels - 1) {
3727                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3728                        xfs_iroot_realloc(cur->bc_ino.ip, -1,
3729                                          cur->bc_ino.whichfork);
3730
3731                        error = xfs_btree_kill_iroot(cur);
3732                        if (error)
3733                                goto error0;
3734
3735                        error = xfs_btree_dec_cursor(cur, level, stat);
3736                        if (error)
3737                                goto error0;
3738                        *stat = 1;
3739                        return 0;
3740                }
3741
3742                /*
3743                 * If this is the root level, and there's only one entry left,
3744                 * and it's NOT the leaf level, then we can get rid of this
3745                 * level.
3746                 */
3747                if (numrecs == 1 && level > 0) {
3748                        union xfs_btree_ptr     *pp;
3749                        /*
3750                         * pp is still set to the first pointer in the block.
3751                         * Make it the new root of the btree.
3752                         */
3753                        pp = xfs_btree_ptr_addr(cur, 1, block);
3754                        error = xfs_btree_kill_root(cur, bp, level, pp);
3755                        if (error)
3756                                goto error0;
3757                } else if (level > 0) {
3758                        error = xfs_btree_dec_cursor(cur, level, stat);
3759                        if (error)
3760                                goto error0;
3761                }
3762                *stat = 1;
3763                return 0;
3764        }
3765
3766        /*
3767         * If we deleted the leftmost entry in the block, update the
3768         * key values above us in the tree.
3769         */
3770        if (xfs_btree_needs_key_update(cur, ptr)) {
3771                error = xfs_btree_update_keys(cur, level);
3772                if (error)
3773                        goto error0;
3774        }
3775
3776        /*
3777         * If the number of records remaining in the block is at least
3778         * the minimum, we're done.
3779         */
3780        if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3781                error = xfs_btree_dec_cursor(cur, level, stat);
3782                if (error)
3783                        goto error0;
3784                return 0;
3785        }
3786
3787        /*
3788         * Otherwise, we have to move some records around to keep the
3789         * tree balanced.  Look at the left and right sibling blocks to
3790         * see if we can re-balance by moving only one record.
3791         */
3792        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3793        xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3794
3795        if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3796                /*
3797                 * One child of root, need to get a chance to copy its contents
3798                 * into the root and delete it. Can't go up to next level,
3799                 * there's nothing to delete there.
3800                 */
3801                if (xfs_btree_ptr_is_null(cur, &rptr) &&
3802                    xfs_btree_ptr_is_null(cur, &lptr) &&
3803                    level == cur->bc_nlevels - 2) {
3804                        error = xfs_btree_kill_iroot(cur);
3805                        if (!error)
3806                                error = xfs_btree_dec_cursor(cur, level, stat);
3807                        if (error)
3808                                goto error0;
3809                        return 0;
3810                }
3811        }
3812
3813        ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3814               !xfs_btree_ptr_is_null(cur, &lptr));
3815
3816        /*
3817         * Duplicate the cursor so our btree manipulations here won't
3818         * disrupt the next level up.
3819         */
3820        error = xfs_btree_dup_cursor(cur, &tcur);
3821        if (error)
3822                goto error0;
3823
3824        /*
3825         * If there's a right sibling, see if it's ok to shift an entry
3826         * out of it.
3827         */
3828        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3829                /*
3830                 * Move the temp cursor to the last entry in the next block.
3831                 * Actually any entry but the first would suffice.
3832                 */
3833                i = xfs_btree_lastrec(tcur, level);
3834                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3835                        error = -EFSCORRUPTED;
3836                        goto error0;
3837                }
3838
3839                error = xfs_btree_increment(tcur, level, &i);
3840                if (error)
3841                        goto error0;
3842                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3843                        error = -EFSCORRUPTED;
3844                        goto error0;
3845                }
3846
3847                i = xfs_btree_lastrec(tcur, level);
3848                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3849                        error = -EFSCORRUPTED;
3850                        goto error0;
3851                }
3852
3853                /* Grab a pointer to the block. */
3854                right = xfs_btree_get_block(tcur, level, &rbp);
3855#ifdef DEBUG
3856                error = xfs_btree_check_block(tcur, right, level, rbp);
3857                if (error)
3858                        goto error0;
3859#endif
3860                /* Grab the current block number, for future use. */
3861                xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3862
3863                /*
3864                 * If right block is full enough so that removing one entry
3865                 * won't make it too empty, and left-shifting an entry out
3866                 * of right to us works, we're done.
3867                 */
3868                if (xfs_btree_get_numrecs(right) - 1 >=
3869                    cur->bc_ops->get_minrecs(tcur, level)) {
3870                        error = xfs_btree_lshift(tcur, level, &i);
3871                        if (error)
3872                                goto error0;
3873                        if (i) {
3874                                ASSERT(xfs_btree_get_numrecs(block) >=
3875                                       cur->bc_ops->get_minrecs(tcur, level));
3876
3877                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3878                                tcur = NULL;
3879
3880                                error = xfs_btree_dec_cursor(cur, level, stat);
3881                                if (error)
3882                                        goto error0;
3883                                return 0;
3884                        }
3885                }
3886
3887                /*
3888                 * Otherwise, grab the number of records in right for
3889                 * future reference, and fix up the temp cursor to point
3890                 * to our block again (last record).
3891                 */
3892                rrecs = xfs_btree_get_numrecs(right);
3893                if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3894                        i = xfs_btree_firstrec(tcur, level);
3895                        if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3896                                error = -EFSCORRUPTED;
3897                                goto error0;
3898                        }
3899
3900                        error = xfs_btree_decrement(tcur, level, &i);
3901                        if (error)
3902                                goto error0;
3903                        if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3904                                error = -EFSCORRUPTED;
3905                                goto error0;
3906                        }
3907                }
3908        }
3909
3910        /*
3911         * If there's a left sibling, see if it's ok to shift an entry
3912         * out of it.
3913         */
3914        if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3915                /*
3916                 * Move the temp cursor to the first entry in the
3917                 * previous block.
3918                 */
3919                i = xfs_btree_firstrec(tcur, level);
3920                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3921                        error = -EFSCORRUPTED;
3922                        goto error0;
3923                }
3924
3925                error = xfs_btree_decrement(tcur, level, &i);
3926                if (error)
3927                        goto error0;
3928                i = xfs_btree_firstrec(tcur, level);
3929                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3930                        error = -EFSCORRUPTED;
3931                        goto error0;
3932                }
3933
3934                /* Grab a pointer to the block. */
3935                left = xfs_btree_get_block(tcur, level, &lbp);
3936#ifdef DEBUG
3937                error = xfs_btree_check_block(cur, left, level, lbp);
3938                if (error)
3939                        goto error0;
3940#endif
3941                /* Grab the current block number, for future use. */
3942                xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3943
3944                /*
3945                 * If left block is full enough so that removing one entry
3946                 * won't make it too empty, and right-shifting an entry out
3947                 * of left to us works, we're done.
3948                 */
3949                if (xfs_btree_get_numrecs(left) - 1 >=
3950                    cur->bc_ops->get_minrecs(tcur, level)) {
3951                        error = xfs_btree_rshift(tcur, level, &i);
3952                        if (error)
3953                                goto error0;
3954                        if (i) {
3955                                ASSERT(xfs_btree_get_numrecs(block) >=
3956                                       cur->bc_ops->get_minrecs(tcur, level));
3957                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3958                                tcur = NULL;
3959                                if (level == 0)
3960                                        cur->bc_ptrs[0]++;
3961
3962                                *stat = 1;
3963                                return 0;
3964                        }
3965                }
3966
3967                /*
3968                 * Otherwise, grab the number of records in right for
3969                 * future reference.
3970                 */
3971                lrecs = xfs_btree_get_numrecs(left);
3972        }
3973
3974        /* Delete the temp cursor, we're done with it. */
3975        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3976        tcur = NULL;
3977
3978        /* If here, we need to do a join to keep the tree balanced. */
3979        ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3980
3981        if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3982            lrecs + xfs_btree_get_numrecs(block) <=
3983                        cur->bc_ops->get_maxrecs(cur, level)) {
3984                /*
3985                 * Set "right" to be the starting block,
3986                 * "left" to be the left neighbor.
3987                 */
3988                rptr = cptr;
3989                right = block;
3990                rbp = bp;
3991                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3992                if (error)
3993                        goto error0;
3994
3995        /*
3996         * If that won't work, see if we can join with the right neighbor block.
3997         */
3998        } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
3999                   rrecs + xfs_btree_get_numrecs(block) <=
4000                        cur->bc_ops->get_maxrecs(cur, level)) {
4001                /*
4002                 * Set "left" to be the starting block,
4003                 * "right" to be the right neighbor.
4004                 */
4005                lptr = cptr;
4006                left = block;
4007                lbp = bp;
4008                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4009                if (error)
4010                        goto error0;
4011
4012        /*
4013         * Otherwise, we can't fix the imbalance.
4014         * Just return.  This is probably a logic error, but it's not fatal.
4015         */
4016        } else {
4017                error = xfs_btree_dec_cursor(cur, level, stat);
4018                if (error)
4019                        goto error0;
4020                return 0;
4021        }
4022
4023        rrecs = xfs_btree_get_numrecs(right);
4024        lrecs = xfs_btree_get_numrecs(left);
4025
4026        /*
4027         * We're now going to join "left" and "right" by moving all the stuff
4028         * in "right" to "left" and deleting "right".
4029         */
4030        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4031        if (level > 0) {
4032                /* It's a non-leaf.  Move keys and pointers. */
4033                union xfs_btree_key     *lkp;   /* left btree key */
4034                union xfs_btree_ptr     *lpp;   /* left address pointer */
4035                union xfs_btree_key     *rkp;   /* right btree key */
4036                union xfs_btree_ptr     *rpp;   /* right address pointer */
4037
4038                lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4039                lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4040                rkp = xfs_btree_key_addr(cur, 1, right);
4041                rpp = xfs_btree_ptr_addr(cur, 1, right);
4042
4043                for (i = 1; i < rrecs; i++) {
4044                        error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
4045                        if (error)
4046                                goto error0;
4047                }
4048
4049                xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4050                xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4051
4052                xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4053                xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4054        } else {
4055                /* It's a leaf.  Move records.  */
4056                union xfs_btree_rec     *lrp;   /* left record pointer */
4057                union xfs_btree_rec     *rrp;   /* right record pointer */
4058
4059                lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4060                rrp = xfs_btree_rec_addr(cur, 1, right);
4061
4062                xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4063                xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4064        }
4065
4066        XFS_BTREE_STATS_INC(cur, join);
4067
4068        /*
4069         * Fix up the number of records and right block pointer in the
4070         * surviving block, and log it.
4071         */
4072        xfs_btree_set_numrecs(left, lrecs + rrecs);
4073        xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
4074        xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4075        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4076
4077        /* If there is a right sibling, point it to the remaining block. */
4078        xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4079        if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4080                error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4081                if (error)
4082                        goto error0;
4083                xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4084                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4085        }
4086
4087        /* Free the deleted block. */
4088        error = xfs_btree_free_block(cur, rbp);
4089        if (error)
4090                goto error0;
4091
4092        /*
4093         * If we joined with the left neighbor, set the buffer in the
4094         * cursor to the left block, and fix up the index.
4095         */
4096        if (bp != lbp) {
4097                cur->bc_bufs[level] = lbp;
4098                cur->bc_ptrs[level] += lrecs;
4099                cur->bc_ra[level] = 0;
4100        }
4101        /*
4102         * If we joined with the right neighbor and there's a level above
4103         * us, increment the cursor at that level.
4104         */
4105        else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
4106                   (level + 1 < cur->bc_nlevels)) {
4107                error = xfs_btree_increment(cur, level + 1, &i);
4108                if (error)
4109                        goto error0;
4110        }
4111
4112        /*
4113         * Readjust the ptr at this level if it's not a leaf, since it's
4114         * still pointing at the deletion point, which makes the cursor
4115         * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4116         * We can't use decrement because it would change the next level up.
4117         */
4118        if (level > 0)
4119                cur->bc_ptrs[level]--;
4120
4121        /*
4122         * We combined blocks, so we have to update the parent keys if the
4123         * btree supports overlapped intervals.  However, bc_ptrs[level + 1]
4124         * points to the old block so that the caller knows which record to
4125         * delete.  Therefore, the caller must be savvy enough to call updkeys
4126         * for us if we return stat == 2.  The other exit points from this
4127         * function don't require deletions further up the tree, so they can
4128         * call updkeys directly.
4129         */
4130
4131        /* Return value means the next level up has something to do. */
4132        *stat = 2;
4133        return 0;
4134
4135error0:
4136        if (tcur)
4137                xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4138        return error;
4139}
4140
4141/*
4142 * Delete the record pointed to by cur.
4143 * The cursor refers to the place where the record was (could be inserted)
4144 * when the operation returns.
4145 */
4146int                                     /* error */
4147xfs_btree_delete(
4148        struct xfs_btree_cur    *cur,
4149        int                     *stat)  /* success/failure */
4150{
4151        int                     error;  /* error return value */
4152        int                     level;
4153        int                     i;
4154        bool                    joined = false;
4155
4156        /*
4157         * Go up the tree, starting at leaf level.
4158         *
4159         * If 2 is returned then a join was done; go to the next level.
4160         * Otherwise we are done.
4161         */
4162        for (level = 0, i = 2; i == 2; level++) {
4163                error = xfs_btree_delrec(cur, level, &i);
4164                if (error)
4165                        goto error0;
4166                if (i == 2)
4167                        joined = true;
4168        }
4169
4170        /*
4171         * If we combined blocks as part of deleting the record, delrec won't
4172         * have updated the parent high keys so we have to do that here.
4173         */
4174        if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
4175                error = xfs_btree_updkeys_force(cur, 0);
4176                if (error)
4177                        goto error0;
4178        }
4179
4180        if (i == 0) {
4181                for (level = 1; level < cur->bc_nlevels; level++) {
4182                        if (cur->bc_ptrs[level] == 0) {
4183                                error = xfs_btree_decrement(cur, level, &i);
4184                                if (error)
4185                                        goto error0;
4186                                break;
4187                        }
4188                }
4189        }
4190
4191        *stat = i;
4192        return 0;
4193error0:
4194        return error;
4195}
4196
4197/*
4198 * Get the data from the pointed-to record.
4199 */
4200int                                     /* error */
4201xfs_btree_get_rec(
4202        struct xfs_btree_cur    *cur,   /* btree cursor */
4203        union xfs_btree_rec     **recp, /* output: btree record */
4204        int                     *stat)  /* output: success/failure */
4205{
4206        struct xfs_btree_block  *block; /* btree block */
4207        struct xfs_buf          *bp;    /* buffer pointer */
4208        int                     ptr;    /* record number */
4209#ifdef DEBUG
4210        int                     error;  /* error return value */
4211#endif
4212
4213        ptr = cur->bc_ptrs[0];
4214        block = xfs_btree_get_block(cur, 0, &bp);
4215
4216#ifdef DEBUG
4217        error = xfs_btree_check_block(cur, block, 0, bp);
4218        if (error)
4219                return error;
4220#endif
4221
4222        /*
4223         * Off the right end or left end, return failure.
4224         */
4225        if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4226                *stat = 0;
4227                return 0;
4228        }
4229
4230        /*
4231         * Point to the record and extract its data.
4232         */
4233        *recp = xfs_btree_rec_addr(cur, ptr, block);
4234        *stat = 1;
4235        return 0;
4236}
4237
4238/* Visit a block in a btree. */
4239STATIC int
4240xfs_btree_visit_block(
4241        struct xfs_btree_cur            *cur,
4242        int                             level,
4243        xfs_btree_visit_blocks_fn       fn,
4244        void                            *data)
4245{
4246        struct xfs_btree_block          *block;
4247        struct xfs_buf                  *bp;
4248        union xfs_btree_ptr             rptr;
4249        int                             error;
4250
4251        /* do right sibling readahead */
4252        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4253        block = xfs_btree_get_block(cur, level, &bp);
4254
4255        /* process the block */
4256        error = fn(cur, level, data);
4257        if (error)
4258                return error;
4259
4260        /* now read rh sibling block for next iteration */
4261        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4262        if (xfs_btree_ptr_is_null(cur, &rptr))
4263                return -ENOENT;
4264
4265        return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4266}
4267
4268
4269/* Visit every block in a btree. */
4270int
4271xfs_btree_visit_blocks(
4272        struct xfs_btree_cur            *cur,
4273        xfs_btree_visit_blocks_fn       fn,
4274        unsigned int                    flags,
4275        void                            *data)
4276{
4277        union xfs_btree_ptr             lptr;
4278        int                             level;
4279        struct xfs_btree_block          *block = NULL;
4280        int                             error = 0;
4281
4282        cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4283
4284        /* for each level */
4285        for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4286                /* grab the left hand block */
4287                error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4288                if (error)
4289                        return error;
4290
4291                /* readahead the left most block for the next level down */
4292                if (level > 0) {
4293                        union xfs_btree_ptr     *ptr;
4294
4295                        ptr = xfs_btree_ptr_addr(cur, 1, block);
4296                        xfs_btree_readahead_ptr(cur, ptr, 1);
4297
4298                        /* save for the next iteration of the loop */
4299                        xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4300
4301                        if (!(flags & XFS_BTREE_VISIT_LEAVES))
4302                                continue;
4303                } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
4304                        continue;
4305                }
4306
4307                /* for each buffer in the level */
4308                do {
4309                        error = xfs_btree_visit_block(cur, level, fn, data);
4310                } while (!error);
4311
4312                if (error != -ENOENT)
4313                        return error;
4314        }
4315
4316        return 0;
4317}
4318
4319/*
4320 * Change the owner of a btree.
4321 *
4322 * The mechanism we use here is ordered buffer logging. Because we don't know
4323 * how many buffers were are going to need to modify, we don't really want to
4324 * have to make transaction reservations for the worst case of every buffer in a
4325 * full size btree as that may be more space that we can fit in the log....
4326 *
4327 * We do the btree walk in the most optimal manner possible - we have sibling
4328 * pointers so we can just walk all the blocks on each level from left to right
4329 * in a single pass, and then move to the next level and do the same. We can
4330 * also do readahead on the sibling pointers to get IO moving more quickly,
4331 * though for slow disks this is unlikely to make much difference to performance
4332 * as the amount of CPU work we have to do before moving to the next block is
4333 * relatively small.
4334 *
4335 * For each btree block that we load, modify the owner appropriately, set the
4336 * buffer as an ordered buffer and log it appropriately. We need to ensure that
4337 * we mark the region we change dirty so that if the buffer is relogged in
4338 * a subsequent transaction the changes we make here as an ordered buffer are
4339 * correctly relogged in that transaction.  If we are in recovery context, then
4340 * just queue the modified buffer as delayed write buffer so the transaction
4341 * recovery completion writes the changes to disk.
4342 */
4343struct xfs_btree_block_change_owner_info {
4344        uint64_t                new_owner;
4345        struct list_head        *buffer_list;
4346};
4347
4348static int
4349xfs_btree_block_change_owner(
4350        struct xfs_btree_cur    *cur,
4351        int                     level,
4352        void                    *data)
4353{
4354        struct xfs_btree_block_change_owner_info        *bbcoi = data;
4355        struct xfs_btree_block  *block;
4356        struct xfs_buf          *bp;
4357
4358        /* modify the owner */
4359        block = xfs_btree_get_block(cur, level, &bp);
4360        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
4361                if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
4362                        return 0;
4363                block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4364        } else {
4365                if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
4366                        return 0;
4367                block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4368        }
4369
4370        /*
4371         * If the block is a root block hosted in an inode, we might not have a
4372         * buffer pointer here and we shouldn't attempt to log the change as the
4373         * information is already held in the inode and discarded when the root
4374         * block is formatted into the on-disk inode fork. We still change it,
4375         * though, so everything is consistent in memory.
4376         */
4377        if (!bp) {
4378                ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4379                ASSERT(level == cur->bc_nlevels - 1);
4380                return 0;
4381        }
4382
4383        if (cur->bc_tp) {
4384                if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
4385                        xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4386                        return -EAGAIN;
4387                }
4388        } else {
4389                xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4390        }
4391
4392        return 0;
4393}
4394
4395int
4396xfs_btree_change_owner(
4397        struct xfs_btree_cur    *cur,
4398        uint64_t                new_owner,
4399        struct list_head        *buffer_list)
4400{
4401        struct xfs_btree_block_change_owner_info        bbcoi;
4402
4403        bbcoi.new_owner = new_owner;
4404        bbcoi.buffer_list = buffer_list;
4405
4406        return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4407                        XFS_BTREE_VISIT_ALL, &bbcoi);
4408}
4409
4410/* Verify the v5 fields of a long-format btree block. */
4411xfs_failaddr_t
4412xfs_btree_lblock_v5hdr_verify(
4413        struct xfs_buf          *bp,
4414        uint64_t                owner)
4415{
4416        struct xfs_mount        *mp = bp->b_mount;
4417        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4418
4419        if (!xfs_sb_version_hascrc(&mp->m_sb))
4420                return __this_address;
4421        if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
4422                return __this_address;
4423        if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn))
4424                return __this_address;
4425        if (owner != XFS_RMAP_OWN_UNKNOWN &&
4426            be64_to_cpu(block->bb_u.l.bb_owner) != owner)
4427                return __this_address;
4428        return NULL;
4429}
4430
4431/* Verify a long-format btree block. */
4432xfs_failaddr_t
4433xfs_btree_lblock_verify(
4434        struct xfs_buf          *bp,
4435        unsigned int            max_recs)
4436{
4437        struct xfs_mount        *mp = bp->b_mount;
4438        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4439
4440        /* numrecs verification */
4441        if (be16_to_cpu(block->bb_numrecs) > max_recs)
4442                return __this_address;
4443
4444        /* sibling pointer verification */
4445        if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
4446            !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
4447                return __this_address;
4448        if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
4449            !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
4450                return __this_address;
4451
4452        return NULL;
4453}
4454
4455/**
4456 * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4457 *                                    btree block
4458 *
4459 * @bp: buffer containing the btree block
4460 */
4461xfs_failaddr_t
4462xfs_btree_sblock_v5hdr_verify(
4463        struct xfs_buf          *bp)
4464{
4465        struct xfs_mount        *mp = bp->b_mount;
4466        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4467        struct xfs_perag        *pag = bp->b_pag;
4468
4469        if (!xfs_sb_version_hascrc(&mp->m_sb))
4470                return __this_address;
4471        if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4472                return __this_address;
4473        if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
4474                return __this_address;
4475        if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4476                return __this_address;
4477        return NULL;
4478}
4479
4480/**
4481 * xfs_btree_sblock_verify() -- verify a short-format btree block
4482 *
4483 * @bp: buffer containing the btree block
4484 * @max_recs: maximum records allowed in this btree node
4485 */
4486xfs_failaddr_t
4487xfs_btree_sblock_verify(
4488        struct xfs_buf          *bp,
4489        unsigned int            max_recs)
4490{
4491        struct xfs_mount        *mp = bp->b_mount;
4492        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4493        xfs_agblock_t           agno;
4494
4495        /* numrecs verification */
4496        if (be16_to_cpu(block->bb_numrecs) > max_recs)
4497                return __this_address;
4498
4499        /* sibling pointer verification */
4500        agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
4501        if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
4502            !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
4503                return __this_address;
4504        if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
4505            !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
4506                return __this_address;
4507
4508        return NULL;
4509}
4510
4511/*
4512 * Calculate the number of btree levels needed to store a given number of
4513 * records in a short-format btree.
4514 */
4515uint
4516xfs_btree_compute_maxlevels(
4517        uint                    *limits,
4518        unsigned long           len)
4519{
4520        uint                    level;
4521        unsigned long           maxblocks;
4522
4523        maxblocks = (len + limits[0] - 1) / limits[0];
4524        for (level = 1; maxblocks > 1; level++)
4525                maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4526        return level;
4527}
4528
4529/*
4530 * Query a regular btree for all records overlapping a given interval.
4531 * Start with a LE lookup of the key of low_rec and return all records
4532 * until we find a record with a key greater than the key of high_rec.
4533 */
4534STATIC int
4535xfs_btree_simple_query_range(
4536        struct xfs_btree_cur            *cur,
4537        union xfs_btree_key             *low_key,
4538        union xfs_btree_key             *high_key,
4539        xfs_btree_query_range_fn        fn,
4540        void                            *priv)
4541{
4542        union xfs_btree_rec             *recp;
4543        union xfs_btree_key             rec_key;
4544        int64_t                         diff;
4545        int                             stat;
4546        bool                            firstrec = true;
4547        int                             error;
4548
4549        ASSERT(cur->bc_ops->init_high_key_from_rec);
4550        ASSERT(cur->bc_ops->diff_two_keys);
4551
4552        /*
4553         * Find the leftmost record.  The btree cursor must be set
4554         * to the low record used to generate low_key.
4555         */
4556        stat = 0;
4557        error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
4558        if (error)
4559                goto out;
4560
4561        /* Nothing?  See if there's anything to the right. */
4562        if (!stat) {
4563                error = xfs_btree_increment(cur, 0, &stat);
4564                if (error)
4565                        goto out;
4566        }
4567
4568        while (stat) {
4569                /* Find the record. */
4570                error = xfs_btree_get_rec(cur, &recp, &stat);
4571                if (error || !stat)
4572                        break;
4573
4574                /* Skip if high_key(rec) < low_key. */
4575                if (firstrec) {
4576                        cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4577                        firstrec = false;
4578                        diff = cur->bc_ops->diff_two_keys(cur, low_key,
4579                                        &rec_key);
4580                        if (diff > 0)
4581                                goto advloop;
4582                }
4583
4584                /* Stop if high_key < low_key(rec). */
4585                cur->bc_ops->init_key_from_rec(&rec_key, recp);
4586                diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
4587                if (diff > 0)
4588                        break;
4589
4590                /* Callback */
4591                error = fn(cur, recp, priv);
4592                if (error)
4593                        break;
4594
4595advloop:
4596                /* Move on to the next record. */
4597                error = xfs_btree_increment(cur, 0, &stat);
4598                if (error)
4599                        break;
4600        }
4601
4602out:
4603        return error;
4604}
4605
4606/*
4607 * Query an overlapped interval btree for all records overlapping a given
4608 * interval.  This function roughly follows the algorithm given in
4609 * "Interval Trees" of _Introduction to Algorithms_, which is section
4610 * 14.3 in the 2nd and 3rd editions.
4611 *
4612 * First, generate keys for the low and high records passed in.
4613 *
4614 * For any leaf node, generate the high and low keys for the record.
4615 * If the record keys overlap with the query low/high keys, pass the
4616 * record to the function iterator.
4617 *
4618 * For any internal node, compare the low and high keys of each
4619 * pointer against the query low/high keys.  If there's an overlap,
4620 * follow the pointer.
4621 *
4622 * As an optimization, we stop scanning a block when we find a low key
4623 * that is greater than the query's high key.
4624 */
4625STATIC int
4626xfs_btree_overlapped_query_range(
4627        struct xfs_btree_cur            *cur,
4628        union xfs_btree_key             *low_key,
4629        union xfs_btree_key             *high_key,
4630        xfs_btree_query_range_fn        fn,
4631        void                            *priv)
4632{
4633        union xfs_btree_ptr             ptr;
4634        union xfs_btree_ptr             *pp;
4635        union xfs_btree_key             rec_key;
4636        union xfs_btree_key             rec_hkey;
4637        union xfs_btree_key             *lkp;
4638        union xfs_btree_key             *hkp;
4639        union xfs_btree_rec             *recp;
4640        struct xfs_btree_block          *block;
4641        int64_t                         ldiff;
4642        int64_t                         hdiff;
4643        int                             level;
4644        struct xfs_buf                  *bp;
4645        int                             i;
4646        int                             error;
4647
4648        /* Load the root of the btree. */
4649        level = cur->bc_nlevels - 1;
4650        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
4651        error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
4652        if (error)
4653                return error;
4654        xfs_btree_get_block(cur, level, &bp);
4655        trace_xfs_btree_overlapped_query_range(cur, level, bp);
4656#ifdef DEBUG
4657        error = xfs_btree_check_block(cur, block, level, bp);
4658        if (error)
4659                goto out;
4660#endif
4661        cur->bc_ptrs[level] = 1;
4662
4663        while (level < cur->bc_nlevels) {
4664                block = xfs_btree_get_block(cur, level, &bp);
4665
4666                /* End of node, pop back towards the root. */
4667                if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
4668pop_up:
4669                        if (level < cur->bc_nlevels - 1)
4670                                cur->bc_ptrs[level + 1]++;
4671                        level++;
4672                        continue;
4673                }
4674
4675                if (level == 0) {
4676                        /* Handle a leaf node. */
4677                        recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
4678
4679                        cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
4680                        ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
4681                                        low_key);
4682
4683                        cur->bc_ops->init_key_from_rec(&rec_key, recp);
4684                        hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
4685                                        &rec_key);
4686
4687                        /*
4688                         * If (record's high key >= query's low key) and
4689                         *    (query's high key >= record's low key), then
4690                         * this record overlaps the query range; callback.
4691                         */
4692                        if (ldiff >= 0 && hdiff >= 0) {
4693                                error = fn(cur, recp, priv);
4694                                if (error)
4695                                        break;
4696                        } else if (hdiff < 0) {
4697                                /* Record is larger than high key; pop. */
4698                                goto pop_up;
4699                        }
4700                        cur->bc_ptrs[level]++;
4701                        continue;
4702                }
4703
4704                /* Handle an internal node. */
4705                lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
4706                hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
4707                pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
4708
4709                ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
4710                hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
4711
4712                /*
4713                 * If (pointer's high key >= query's low key) and
4714                 *    (query's high key >= pointer's low key), then
4715                 * this record overlaps the query range; follow pointer.
4716                 */
4717                if (ldiff >= 0 && hdiff >= 0) {
4718                        level--;
4719                        error = xfs_btree_lookup_get_block(cur, level, pp,
4720                                        &block);
4721                        if (error)
4722                                goto out;
4723                        xfs_btree_get_block(cur, level, &bp);
4724                        trace_xfs_btree_overlapped_query_range(cur, level, bp);
4725#ifdef DEBUG
4726                        error = xfs_btree_check_block(cur, block, level, bp);
4727                        if (error)
4728                                goto out;
4729#endif
4730                        cur->bc_ptrs[level] = 1;
4731                        continue;
4732                } else if (hdiff < 0) {
4733                        /* The low key is larger than the upper range; pop. */
4734                        goto pop_up;
4735                }
4736                cur->bc_ptrs[level]++;
4737        }
4738
4739out:
4740        /*
4741         * If we don't end this function with the cursor pointing at a record
4742         * block, a subsequent non-error cursor deletion will not release
4743         * node-level buffers, causing a buffer leak.  This is quite possible
4744         * with a zero-results range query, so release the buffers if we
4745         * failed to return any results.
4746         */
4747        if (cur->bc_bufs[0] == NULL) {
4748                for (i = 0; i < cur->bc_nlevels; i++) {
4749                        if (cur->bc_bufs[i]) {
4750                                xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
4751                                cur->bc_bufs[i] = NULL;
4752                                cur->bc_ptrs[i] = 0;
4753                                cur->bc_ra[i] = 0;
4754                        }
4755                }
4756        }
4757
4758        return error;
4759}
4760
4761/*
4762 * Query a btree for all records overlapping a given interval of keys.  The
4763 * supplied function will be called with each record found; return one of the
4764 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4765 * code.  This function returns -ECANCELED, zero, or a negative error code.
4766 */
4767int
4768xfs_btree_query_range(
4769        struct xfs_btree_cur            *cur,
4770        union xfs_btree_irec            *low_rec,
4771        union xfs_btree_irec            *high_rec,
4772        xfs_btree_query_range_fn        fn,
4773        void                            *priv)
4774{
4775        union xfs_btree_rec             rec;
4776        union xfs_btree_key             low_key;
4777        union xfs_btree_key             high_key;
4778
4779        /* Find the keys of both ends of the interval. */
4780        cur->bc_rec = *high_rec;
4781        cur->bc_ops->init_rec_from_cur(cur, &rec);
4782        cur->bc_ops->init_key_from_rec(&high_key, &rec);
4783
4784        cur->bc_rec = *low_rec;
4785        cur->bc_ops->init_rec_from_cur(cur, &rec);
4786        cur->bc_ops->init_key_from_rec(&low_key, &rec);
4787
4788        /* Enforce low key < high key. */
4789        if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
4790                return -EINVAL;
4791
4792        if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
4793                return xfs_btree_simple_query_range(cur, &low_key,
4794                                &high_key, fn, priv);
4795        return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4796                        fn, priv);
4797}
4798
4799/* Query a btree for all records. */
4800int
4801xfs_btree_query_all(
4802        struct xfs_btree_cur            *cur,
4803        xfs_btree_query_range_fn        fn,
4804        void                            *priv)
4805{
4806        union xfs_btree_key             low_key;
4807        union xfs_btree_key             high_key;
4808
4809        memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
4810        memset(&low_key, 0, sizeof(low_key));
4811        memset(&high_key, 0xFF, sizeof(high_key));
4812
4813        return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
4814}
4815
4816/*
4817 * Calculate the number of blocks needed to store a given number of records
4818 * in a short-format (per-AG metadata) btree.
4819 */
4820unsigned long long
4821xfs_btree_calc_size(
4822        uint                    *limits,
4823        unsigned long long      len)
4824{
4825        int                     level;
4826        int                     maxrecs;
4827        unsigned long long      rval;
4828
4829        maxrecs = limits[0];
4830        for (level = 0, rval = 0; len > 1; level++) {
4831                len += maxrecs - 1;
4832                do_div(len, maxrecs);
4833                maxrecs = limits[1];
4834                rval += len;
4835        }
4836        return rval;
4837}
4838
4839static int
4840xfs_btree_count_blocks_helper(
4841        struct xfs_btree_cur    *cur,
4842        int                     level,
4843        void                    *data)
4844{
4845        xfs_extlen_t            *blocks = data;
4846        (*blocks)++;
4847
4848        return 0;
4849}
4850
4851/* Count the blocks in a btree and return the result in *blocks. */
4852int
4853xfs_btree_count_blocks(
4854        struct xfs_btree_cur    *cur,
4855        xfs_extlen_t            *blocks)
4856{
4857        *blocks = 0;
4858        return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4859                        XFS_BTREE_VISIT_ALL, blocks);
4860}
4861
4862/* Compare two btree pointers. */
4863int64_t
4864xfs_btree_diff_two_ptrs(
4865        struct xfs_btree_cur            *cur,
4866        const union xfs_btree_ptr       *a,
4867        const union xfs_btree_ptr       *b)
4868{
4869        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4870                return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
4871        return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
4872}
4873
4874/* If there's an extent, we're done. */
4875STATIC int
4876xfs_btree_has_record_helper(
4877        struct xfs_btree_cur            *cur,
4878        union xfs_btree_rec             *rec,
4879        void                            *priv)
4880{
4881        return -ECANCELED;
4882}
4883
4884/* Is there a record covering a given range of keys? */
4885int
4886xfs_btree_has_record(
4887        struct xfs_btree_cur    *cur,
4888        union xfs_btree_irec    *low,
4889        union xfs_btree_irec    *high,
4890        bool                    *exists)
4891{
4892        int                     error;
4893
4894        error = xfs_btree_query_range(cur, low, high,
4895                        &xfs_btree_has_record_helper, NULL);
4896        if (error == -ECANCELED) {
4897                *exists = true;
4898                return 0;
4899        }
4900        *exists = false;
4901        return error;
4902}
4903
4904/* Are there more records in this btree? */
4905bool
4906xfs_btree_has_more_records(
4907        struct xfs_btree_cur    *cur)
4908{
4909        struct xfs_btree_block  *block;
4910        struct xfs_buf          *bp;
4911
4912        block = xfs_btree_get_block(cur, 0, &bp);
4913
4914        /* There are still records in this block. */
4915        if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
4916                return true;
4917
4918        /* There are more record blocks. */
4919        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4920                return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
4921        else
4922                return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
4923}
4924