linux/fs/xfs/libxfs/xfs_btree.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   4 * All Rights Reserved.
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_log_format.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_bit.h"
  13#include "xfs_mount.h"
  14#include "xfs_inode.h"
  15#include "xfs_trans.h"
  16#include "xfs_buf_item.h"
  17#include "xfs_btree.h"
  18#include "xfs_errortag.h"
  19#include "xfs_error.h"
  20#include "xfs_trace.h"
  21#include "xfs_alloc.h"
  22#include "xfs_log.h"
  23#include "xfs_btree_staging.h"
  24#include "xfs_ag.h"
  25
  26/*
  27 * Cursor allocation zone.
  28 */
  29kmem_zone_t     *xfs_btree_cur_zone;
  30
  31/*
  32 * Btree magic numbers.
  33 */
  34static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
  35        { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
  36          XFS_FIBT_MAGIC, 0 },
  37        { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
  38          XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
  39          XFS_REFC_CRC_MAGIC }
  40};
  41
  42uint32_t
  43xfs_btree_magic(
  44        int                     crc,
  45        xfs_btnum_t             btnum)
  46{
  47        uint32_t                magic = xfs_magics[crc][btnum];
  48
  49        /* Ensure we asked for crc for crc-only magics. */
  50        ASSERT(magic != 0);
  51        return magic;
  52}
  53
  54/*
  55 * Check a long btree block header.  Return the address of the failing check,
  56 * or NULL if everything is ok.
  57 */
  58xfs_failaddr_t
  59__xfs_btree_check_lblock(
  60        struct xfs_btree_cur    *cur,
  61        struct xfs_btree_block  *block,
  62        int                     level,
  63        struct xfs_buf          *bp)
  64{
  65        struct xfs_mount        *mp = cur->bc_mp;
  66        xfs_btnum_t             btnum = cur->bc_btnum;
  67        int                     crc = xfs_has_crc(mp);
  68
  69        if (crc) {
  70                if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
  71                        return __this_address;
  72                if (block->bb_u.l.bb_blkno !=
  73                    cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
  74                        return __this_address;
  75                if (block->bb_u.l.bb_pad != cpu_to_be32(0))
  76                        return __this_address;
  77        }
  78
  79        if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
  80                return __this_address;
  81        if (be16_to_cpu(block->bb_level) != level)
  82                return __this_address;
  83        if (be16_to_cpu(block->bb_numrecs) >
  84            cur->bc_ops->get_maxrecs(cur, level))
  85                return __this_address;
  86        if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
  87            !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
  88                        level + 1))
  89                return __this_address;
  90        if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
  91            !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
  92                        level + 1))
  93                return __this_address;
  94
  95        return NULL;
  96}
  97
  98/* Check a long btree block header. */
  99static int
 100xfs_btree_check_lblock(
 101        struct xfs_btree_cur    *cur,
 102        struct xfs_btree_block  *block,
 103        int                     level,
 104        struct xfs_buf          *bp)
 105{
 106        struct xfs_mount        *mp = cur->bc_mp;
 107        xfs_failaddr_t          fa;
 108
 109        fa = __xfs_btree_check_lblock(cur, block, level, bp);
 110        if (XFS_IS_CORRUPT(mp, fa != NULL) ||
 111            XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
 112                if (bp)
 113                        trace_xfs_btree_corrupt(bp, _RET_IP_);
 114                return -EFSCORRUPTED;
 115        }
 116        return 0;
 117}
 118
 119/*
 120 * Check a short btree block header.  Return the address of the failing check,
 121 * or NULL if everything is ok.
 122 */
 123xfs_failaddr_t
 124__xfs_btree_check_sblock(
 125        struct xfs_btree_cur    *cur,
 126        struct xfs_btree_block  *block,
 127        int                     level,
 128        struct xfs_buf          *bp)
 129{
 130        struct xfs_mount        *mp = cur->bc_mp;
 131        xfs_btnum_t             btnum = cur->bc_btnum;
 132        int                     crc = xfs_has_crc(mp);
 133
 134        if (crc) {
 135                if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
 136                        return __this_address;
 137                if (block->bb_u.s.bb_blkno !=
 138                    cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
 139                        return __this_address;
 140        }
 141
 142        if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
 143                return __this_address;
 144        if (be16_to_cpu(block->bb_level) != level)
 145                return __this_address;
 146        if (be16_to_cpu(block->bb_numrecs) >
 147            cur->bc_ops->get_maxrecs(cur, level))
 148                return __this_address;
 149        if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
 150            !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
 151                        level + 1))
 152                return __this_address;
 153        if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
 154            !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
 155                        level + 1))
 156                return __this_address;
 157
 158        return NULL;
 159}
 160
 161/* Check a short btree block header. */
 162STATIC int
 163xfs_btree_check_sblock(
 164        struct xfs_btree_cur    *cur,
 165        struct xfs_btree_block  *block,
 166        int                     level,
 167        struct xfs_buf          *bp)
 168{
 169        struct xfs_mount        *mp = cur->bc_mp;
 170        xfs_failaddr_t          fa;
 171
 172        fa = __xfs_btree_check_sblock(cur, block, level, bp);
 173        if (XFS_IS_CORRUPT(mp, fa != NULL) ||
 174            XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
 175                if (bp)
 176                        trace_xfs_btree_corrupt(bp, _RET_IP_);
 177                return -EFSCORRUPTED;
 178        }
 179        return 0;
 180}
 181
 182/*
 183 * Debug routine: check that block header is ok.
 184 */
 185int
 186xfs_btree_check_block(
 187        struct xfs_btree_cur    *cur,   /* btree cursor */
 188        struct xfs_btree_block  *block, /* generic btree block pointer */
 189        int                     level,  /* level of the btree block */
 190        struct xfs_buf          *bp)    /* buffer containing block, if any */
 191{
 192        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 193                return xfs_btree_check_lblock(cur, block, level, bp);
 194        else
 195                return xfs_btree_check_sblock(cur, block, level, bp);
 196}
 197
 198/* Check that this long pointer is valid and points within the fs. */
 199bool
 200xfs_btree_check_lptr(
 201        struct xfs_btree_cur    *cur,
 202        xfs_fsblock_t           fsbno,
 203        int                     level)
 204{
 205        if (level <= 0)
 206                return false;
 207        return xfs_verify_fsbno(cur->bc_mp, fsbno);
 208}
 209
 210/* Check that this short pointer is valid and points within the AG. */
 211bool
 212xfs_btree_check_sptr(
 213        struct xfs_btree_cur    *cur,
 214        xfs_agblock_t           agbno,
 215        int                     level)
 216{
 217        if (level <= 0)
 218                return false;
 219        return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno);
 220}
 221
 222/*
 223 * Check that a given (indexed) btree pointer at a certain level of a
 224 * btree is valid and doesn't point past where it should.
 225 */
 226static int
 227xfs_btree_check_ptr(
 228        struct xfs_btree_cur            *cur,
 229        const union xfs_btree_ptr       *ptr,
 230        int                             index,
 231        int                             level)
 232{
 233        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 234                if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
 235                                level))
 236                        return 0;
 237                xfs_err(cur->bc_mp,
 238"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
 239                                cur->bc_ino.ip->i_ino,
 240                                cur->bc_ino.whichfork, cur->bc_btnum,
 241                                level, index);
 242        } else {
 243                if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
 244                                level))
 245                        return 0;
 246                xfs_err(cur->bc_mp,
 247"AG %u: Corrupt btree %d pointer at level %d index %d.",
 248                                cur->bc_ag.pag->pag_agno, cur->bc_btnum,
 249                                level, index);
 250        }
 251
 252        return -EFSCORRUPTED;
 253}
 254
 255#ifdef DEBUG
 256# define xfs_btree_debug_check_ptr      xfs_btree_check_ptr
 257#else
 258# define xfs_btree_debug_check_ptr(...) (0)
 259#endif
 260
 261/*
 262 * Calculate CRC on the whole btree block and stuff it into the
 263 * long-form btree header.
 264 *
 265 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 266 * it into the buffer so recovery knows what the last modification was that made
 267 * it to disk.
 268 */
 269void
 270xfs_btree_lblock_calc_crc(
 271        struct xfs_buf          *bp)
 272{
 273        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 274        struct xfs_buf_log_item *bip = bp->b_log_item;
 275
 276        if (!xfs_has_crc(bp->b_mount))
 277                return;
 278        if (bip)
 279                block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 280        xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 281}
 282
 283bool
 284xfs_btree_lblock_verify_crc(
 285        struct xfs_buf          *bp)
 286{
 287        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 288        struct xfs_mount        *mp = bp->b_mount;
 289
 290        if (xfs_has_crc(mp)) {
 291                if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
 292                        return false;
 293                return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 294        }
 295
 296        return true;
 297}
 298
 299/*
 300 * Calculate CRC on the whole btree block and stuff it into the
 301 * short-form btree header.
 302 *
 303 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 304 * it into the buffer so recovery knows what the last modification was that made
 305 * it to disk.
 306 */
 307void
 308xfs_btree_sblock_calc_crc(
 309        struct xfs_buf          *bp)
 310{
 311        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 312        struct xfs_buf_log_item *bip = bp->b_log_item;
 313
 314        if (!xfs_has_crc(bp->b_mount))
 315                return;
 316        if (bip)
 317                block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 318        xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 319}
 320
 321bool
 322xfs_btree_sblock_verify_crc(
 323        struct xfs_buf          *bp)
 324{
 325        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 326        struct xfs_mount        *mp = bp->b_mount;
 327
 328        if (xfs_has_crc(mp)) {
 329                if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
 330                        return false;
 331                return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 332        }
 333
 334        return true;
 335}
 336
 337static int
 338xfs_btree_free_block(
 339        struct xfs_btree_cur    *cur,
 340        struct xfs_buf          *bp)
 341{
 342        int                     error;
 343
 344        error = cur->bc_ops->free_block(cur, bp);
 345        if (!error) {
 346                xfs_trans_binval(cur->bc_tp, bp);
 347                XFS_BTREE_STATS_INC(cur, free);
 348        }
 349        return error;
 350}
 351
 352/*
 353 * Delete the btree cursor.
 354 */
 355void
 356xfs_btree_del_cursor(
 357        struct xfs_btree_cur    *cur,           /* btree cursor */
 358        int                     error)          /* del because of error */
 359{
 360        int                     i;              /* btree level */
 361
 362        /*
 363         * Clear the buffer pointers and release the buffers. If we're doing
 364         * this because of an error, inspect all of the entries in the bc_bufs
 365         * array for buffers to be unlocked. This is because some of the btree
 366         * code works from level n down to 0, and if we get an error along the
 367         * way we won't have initialized all the entries down to 0.
 368         */
 369        for (i = 0; i < cur->bc_nlevels; i++) {
 370                if (cur->bc_bufs[i])
 371                        xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
 372                else if (!error)
 373                        break;
 374        }
 375
 376        ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
 377               xfs_is_shutdown(cur->bc_mp));
 378        if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
 379                kmem_free(cur->bc_ops);
 380        if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
 381                xfs_perag_put(cur->bc_ag.pag);
 382        kmem_cache_free(xfs_btree_cur_zone, cur);
 383}
 384
 385/*
 386 * Duplicate the btree cursor.
 387 * Allocate a new one, copy the record, re-get the buffers.
 388 */
 389int                                     /* error */
 390xfs_btree_dup_cursor(
 391        xfs_btree_cur_t *cur,           /* input cursor */
 392        xfs_btree_cur_t **ncur)         /* output cursor */
 393{
 394        struct xfs_buf  *bp;            /* btree block's buffer pointer */
 395        int             error;          /* error return value */
 396        int             i;              /* level number of btree block */
 397        xfs_mount_t     *mp;            /* mount structure for filesystem */
 398        xfs_btree_cur_t *new;           /* new cursor value */
 399        xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
 400
 401        tp = cur->bc_tp;
 402        mp = cur->bc_mp;
 403
 404        /*
 405         * Allocate a new cursor like the old one.
 406         */
 407        new = cur->bc_ops->dup_cursor(cur);
 408
 409        /*
 410         * Copy the record currently in the cursor.
 411         */
 412        new->bc_rec = cur->bc_rec;
 413
 414        /*
 415         * For each level current, re-get the buffer and copy the ptr value.
 416         */
 417        for (i = 0; i < new->bc_nlevels; i++) {
 418                new->bc_ptrs[i] = cur->bc_ptrs[i];
 419                new->bc_ra[i] = cur->bc_ra[i];
 420                bp = cur->bc_bufs[i];
 421                if (bp) {
 422                        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 423                                                   xfs_buf_daddr(bp), mp->m_bsize,
 424                                                   0, &bp,
 425                                                   cur->bc_ops->buf_ops);
 426                        if (error) {
 427                                xfs_btree_del_cursor(new, error);
 428                                *ncur = NULL;
 429                                return error;
 430                        }
 431                }
 432                new->bc_bufs[i] = bp;
 433        }
 434        *ncur = new;
 435        return 0;
 436}
 437
 438/*
 439 * XFS btree block layout and addressing:
 440 *
 441 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 442 *
 443 * The leaf record start with a header then followed by records containing
 444 * the values.  A non-leaf block also starts with the same header, and
 445 * then first contains lookup keys followed by an equal number of pointers
 446 * to the btree blocks at the previous level.
 447 *
 448 *              +--------+-------+-------+-------+-------+-------+-------+
 449 * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 450 *              +--------+-------+-------+-------+-------+-------+-------+
 451 *
 452 *              +--------+-------+-------+-------+-------+-------+-------+
 453 * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 454 *              +--------+-------+-------+-------+-------+-------+-------+
 455 *
 456 * The header is called struct xfs_btree_block for reasons better left unknown
 457 * and comes in different versions for short (32bit) and long (64bit) block
 458 * pointers.  The record and key structures are defined by the btree instances
 459 * and opaque to the btree core.  The block pointers are simple disk endian
 460 * integers, available in a short (32bit) and long (64bit) variant.
 461 *
 462 * The helpers below calculate the offset of a given record, key or pointer
 463 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 464 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
 465 * inside the btree block is done using indices starting at one, not zero!
 466 *
 467 * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
 468 * overlapping intervals.  In such a tree, records are still sorted lowest to
 469 * highest and indexed by the smallest key value that refers to the record.
 470 * However, nodes are different: each pointer has two associated keys -- one
 471 * indexing the lowest key available in the block(s) below (the same behavior
 472 * as the key in a regular btree) and another indexing the highest key
 473 * available in the block(s) below.  Because records are /not/ sorted by the
 474 * highest key, all leaf block updates require us to compute the highest key
 475 * that matches any record in the leaf and to recursively update the high keys
 476 * in the nodes going further up in the tree, if necessary.  Nodes look like
 477 * this:
 478 *
 479 *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
 480 * Non-Leaf:    | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
 481 *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
 482 *
 483 * To perform an interval query on an overlapped tree, perform the usual
 484 * depth-first search and use the low and high keys to decide if we can skip
 485 * that particular node.  If a leaf node is reached, return the records that
 486 * intersect the interval.  Note that an interval query may return numerous
 487 * entries.  For a non-overlapped tree, simply search for the record associated
 488 * with the lowest key and iterate forward until a non-matching record is
 489 * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
 490 * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
 491 * more detail.
 492 *
 493 * Why do we care about overlapping intervals?  Let's say you have a bunch of
 494 * reverse mapping records on a reflink filesystem:
 495 *
 496 * 1: +- file A startblock B offset C length D -----------+
 497 * 2:      +- file E startblock F offset G length H --------------+
 498 * 3:      +- file I startblock F offset J length K --+
 499 * 4:                                                        +- file L... --+
 500 *
 501 * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
 502 * we'd simply increment the length of record 1.  But how do we find the record
 503 * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
 504 * record 3 because the keys are ordered first by startblock.  An interval
 505 * query would return records 1 and 2 because they both overlap (B+D-1), and
 506 * from that we can pick out record 1 as the appropriate left neighbor.
 507 *
 508 * In the non-overlapped case you can do a LE lookup and decrement the cursor
 509 * because a record's interval must end before the next record.
 510 */
 511
 512/*
 513 * Return size of the btree block header for this btree instance.
 514 */
 515static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
 516{
 517        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 518                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 519                        return XFS_BTREE_LBLOCK_CRC_LEN;
 520                return XFS_BTREE_LBLOCK_LEN;
 521        }
 522        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 523                return XFS_BTREE_SBLOCK_CRC_LEN;
 524        return XFS_BTREE_SBLOCK_LEN;
 525}
 526
 527/*
 528 * Return size of btree block pointers for this btree instance.
 529 */
 530static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
 531{
 532        return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
 533                sizeof(__be64) : sizeof(__be32);
 534}
 535
 536/*
 537 * Calculate offset of the n-th record in a btree block.
 538 */
 539STATIC size_t
 540xfs_btree_rec_offset(
 541        struct xfs_btree_cur    *cur,
 542        int                     n)
 543{
 544        return xfs_btree_block_len(cur) +
 545                (n - 1) * cur->bc_ops->rec_len;
 546}
 547
 548/*
 549 * Calculate offset of the n-th key in a btree block.
 550 */
 551STATIC size_t
 552xfs_btree_key_offset(
 553        struct xfs_btree_cur    *cur,
 554        int                     n)
 555{
 556        return xfs_btree_block_len(cur) +
 557                (n - 1) * cur->bc_ops->key_len;
 558}
 559
 560/*
 561 * Calculate offset of the n-th high key in a btree block.
 562 */
 563STATIC size_t
 564xfs_btree_high_key_offset(
 565        struct xfs_btree_cur    *cur,
 566        int                     n)
 567{
 568        return xfs_btree_block_len(cur) +
 569                (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
 570}
 571
 572/*
 573 * Calculate offset of the n-th block pointer in a btree block.
 574 */
 575STATIC size_t
 576xfs_btree_ptr_offset(
 577        struct xfs_btree_cur    *cur,
 578        int                     n,
 579        int                     level)
 580{
 581        return xfs_btree_block_len(cur) +
 582                cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
 583                (n - 1) * xfs_btree_ptr_len(cur);
 584}
 585
 586/*
 587 * Return a pointer to the n-th record in the btree block.
 588 */
 589union xfs_btree_rec *
 590xfs_btree_rec_addr(
 591        struct xfs_btree_cur    *cur,
 592        int                     n,
 593        struct xfs_btree_block  *block)
 594{
 595        return (union xfs_btree_rec *)
 596                ((char *)block + xfs_btree_rec_offset(cur, n));
 597}
 598
 599/*
 600 * Return a pointer to the n-th key in the btree block.
 601 */
 602union xfs_btree_key *
 603xfs_btree_key_addr(
 604        struct xfs_btree_cur    *cur,
 605        int                     n,
 606        struct xfs_btree_block  *block)
 607{
 608        return (union xfs_btree_key *)
 609                ((char *)block + xfs_btree_key_offset(cur, n));
 610}
 611
 612/*
 613 * Return a pointer to the n-th high key in the btree block.
 614 */
 615union xfs_btree_key *
 616xfs_btree_high_key_addr(
 617        struct xfs_btree_cur    *cur,
 618        int                     n,
 619        struct xfs_btree_block  *block)
 620{
 621        return (union xfs_btree_key *)
 622                ((char *)block + xfs_btree_high_key_offset(cur, n));
 623}
 624
 625/*
 626 * Return a pointer to the n-th block pointer in the btree block.
 627 */
 628union xfs_btree_ptr *
 629xfs_btree_ptr_addr(
 630        struct xfs_btree_cur    *cur,
 631        int                     n,
 632        struct xfs_btree_block  *block)
 633{
 634        int                     level = xfs_btree_get_level(block);
 635
 636        ASSERT(block->bb_level != 0);
 637
 638        return (union xfs_btree_ptr *)
 639                ((char *)block + xfs_btree_ptr_offset(cur, n, level));
 640}
 641
 642struct xfs_ifork *
 643xfs_btree_ifork_ptr(
 644        struct xfs_btree_cur    *cur)
 645{
 646        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
 647
 648        if (cur->bc_flags & XFS_BTREE_STAGING)
 649                return cur->bc_ino.ifake->if_fork;
 650        return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork);
 651}
 652
 653/*
 654 * Get the root block which is stored in the inode.
 655 *
 656 * For now this btree implementation assumes the btree root is always
 657 * stored in the if_broot field of an inode fork.
 658 */
 659STATIC struct xfs_btree_block *
 660xfs_btree_get_iroot(
 661        struct xfs_btree_cur    *cur)
 662{
 663        struct xfs_ifork        *ifp = xfs_btree_ifork_ptr(cur);
 664
 665        return (struct xfs_btree_block *)ifp->if_broot;
 666}
 667
 668/*
 669 * Retrieve the block pointer from the cursor at the given level.
 670 * This may be an inode btree root or from a buffer.
 671 */
 672struct xfs_btree_block *                /* generic btree block pointer */
 673xfs_btree_get_block(
 674        struct xfs_btree_cur    *cur,   /* btree cursor */
 675        int                     level,  /* level in btree */
 676        struct xfs_buf          **bpp)  /* buffer containing the block */
 677{
 678        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 679            (level == cur->bc_nlevels - 1)) {
 680                *bpp = NULL;
 681                return xfs_btree_get_iroot(cur);
 682        }
 683
 684        *bpp = cur->bc_bufs[level];
 685        return XFS_BUF_TO_BLOCK(*bpp);
 686}
 687
 688/*
 689 * Change the cursor to point to the first record at the given level.
 690 * Other levels are unaffected.
 691 */
 692STATIC int                              /* success=1, failure=0 */
 693xfs_btree_firstrec(
 694        xfs_btree_cur_t         *cur,   /* btree cursor */
 695        int                     level)  /* level to change */
 696{
 697        struct xfs_btree_block  *block; /* generic btree block pointer */
 698        struct xfs_buf          *bp;    /* buffer containing block */
 699
 700        /*
 701         * Get the block pointer for this level.
 702         */
 703        block = xfs_btree_get_block(cur, level, &bp);
 704        if (xfs_btree_check_block(cur, block, level, bp))
 705                return 0;
 706        /*
 707         * It's empty, there is no such record.
 708         */
 709        if (!block->bb_numrecs)
 710                return 0;
 711        /*
 712         * Set the ptr value to 1, that's the first record/key.
 713         */
 714        cur->bc_ptrs[level] = 1;
 715        return 1;
 716}
 717
 718/*
 719 * Change the cursor to point to the last record in the current block
 720 * at the given level.  Other levels are unaffected.
 721 */
 722STATIC int                              /* success=1, failure=0 */
 723xfs_btree_lastrec(
 724        xfs_btree_cur_t         *cur,   /* btree cursor */
 725        int                     level)  /* level to change */
 726{
 727        struct xfs_btree_block  *block; /* generic btree block pointer */
 728        struct xfs_buf          *bp;    /* buffer containing block */
 729
 730        /*
 731         * Get the block pointer for this level.
 732         */
 733        block = xfs_btree_get_block(cur, level, &bp);
 734        if (xfs_btree_check_block(cur, block, level, bp))
 735                return 0;
 736        /*
 737         * It's empty, there is no such record.
 738         */
 739        if (!block->bb_numrecs)
 740                return 0;
 741        /*
 742         * Set the ptr value to numrecs, that's the last record/key.
 743         */
 744        cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
 745        return 1;
 746}
 747
 748/*
 749 * Compute first and last byte offsets for the fields given.
 750 * Interprets the offsets table, which contains struct field offsets.
 751 */
 752void
 753xfs_btree_offsets(
 754        int64_t         fields,         /* bitmask of fields */
 755        const short     *offsets,       /* table of field offsets */
 756        int             nbits,          /* number of bits to inspect */
 757        int             *first,         /* output: first byte offset */
 758        int             *last)          /* output: last byte offset */
 759{
 760        int             i;              /* current bit number */
 761        int64_t         imask;          /* mask for current bit number */
 762
 763        ASSERT(fields != 0);
 764        /*
 765         * Find the lowest bit, so the first byte offset.
 766         */
 767        for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
 768                if (imask & fields) {
 769                        *first = offsets[i];
 770                        break;
 771                }
 772        }
 773        /*
 774         * Find the highest bit, so the last byte offset.
 775         */
 776        for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
 777                if (imask & fields) {
 778                        *last = offsets[i + 1] - 1;
 779                        break;
 780                }
 781        }
 782}
 783
 784/*
 785 * Get a buffer for the block, return it read in.
 786 * Long-form addressing.
 787 */
 788int
 789xfs_btree_read_bufl(
 790        struct xfs_mount        *mp,            /* file system mount point */
 791        struct xfs_trans        *tp,            /* transaction pointer */
 792        xfs_fsblock_t           fsbno,          /* file system block number */
 793        struct xfs_buf          **bpp,          /* buffer for fsbno */
 794        int                     refval,         /* ref count value for buffer */
 795        const struct xfs_buf_ops *ops)
 796{
 797        struct xfs_buf          *bp;            /* return value */
 798        xfs_daddr_t             d;              /* real disk block address */
 799        int                     error;
 800
 801        if (!xfs_verify_fsbno(mp, fsbno))
 802                return -EFSCORRUPTED;
 803        d = XFS_FSB_TO_DADDR(mp, fsbno);
 804        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
 805                                   mp->m_bsize, 0, &bp, ops);
 806        if (error)
 807                return error;
 808        if (bp)
 809                xfs_buf_set_ref(bp, refval);
 810        *bpp = bp;
 811        return 0;
 812}
 813
 814/*
 815 * Read-ahead the block, don't wait for it, don't return a buffer.
 816 * Long-form addressing.
 817 */
 818/* ARGSUSED */
 819void
 820xfs_btree_reada_bufl(
 821        struct xfs_mount        *mp,            /* file system mount point */
 822        xfs_fsblock_t           fsbno,          /* file system block number */
 823        xfs_extlen_t            count,          /* count of filesystem blocks */
 824        const struct xfs_buf_ops *ops)
 825{
 826        xfs_daddr_t             d;
 827
 828        ASSERT(fsbno != NULLFSBLOCK);
 829        d = XFS_FSB_TO_DADDR(mp, fsbno);
 830        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 831}
 832
 833/*
 834 * Read-ahead the block, don't wait for it, don't return a buffer.
 835 * Short-form addressing.
 836 */
 837/* ARGSUSED */
 838void
 839xfs_btree_reada_bufs(
 840        struct xfs_mount        *mp,            /* file system mount point */
 841        xfs_agnumber_t          agno,           /* allocation group number */
 842        xfs_agblock_t           agbno,          /* allocation group block number */
 843        xfs_extlen_t            count,          /* count of filesystem blocks */
 844        const struct xfs_buf_ops *ops)
 845{
 846        xfs_daddr_t             d;
 847
 848        ASSERT(agno != NULLAGNUMBER);
 849        ASSERT(agbno != NULLAGBLOCK);
 850        d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 851        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 852}
 853
 854STATIC int
 855xfs_btree_readahead_lblock(
 856        struct xfs_btree_cur    *cur,
 857        int                     lr,
 858        struct xfs_btree_block  *block)
 859{
 860        int                     rval = 0;
 861        xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
 862        xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 863
 864        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
 865                xfs_btree_reada_bufl(cur->bc_mp, left, 1,
 866                                     cur->bc_ops->buf_ops);
 867                rval++;
 868        }
 869
 870        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
 871                xfs_btree_reada_bufl(cur->bc_mp, right, 1,
 872                                     cur->bc_ops->buf_ops);
 873                rval++;
 874        }
 875
 876        return rval;
 877}
 878
 879STATIC int
 880xfs_btree_readahead_sblock(
 881        struct xfs_btree_cur    *cur,
 882        int                     lr,
 883        struct xfs_btree_block *block)
 884{
 885        int                     rval = 0;
 886        xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
 887        xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
 888
 889
 890        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
 891                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
 892                                     left, 1, cur->bc_ops->buf_ops);
 893                rval++;
 894        }
 895
 896        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
 897                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
 898                                     right, 1, cur->bc_ops->buf_ops);
 899                rval++;
 900        }
 901
 902        return rval;
 903}
 904
 905/*
 906 * Read-ahead btree blocks, at the given level.
 907 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 908 */
 909STATIC int
 910xfs_btree_readahead(
 911        struct xfs_btree_cur    *cur,           /* btree cursor */
 912        int                     lev,            /* level in btree */
 913        int                     lr)             /* left/right bits */
 914{
 915        struct xfs_btree_block  *block;
 916
 917        /*
 918         * No readahead needed if we are at the root level and the
 919         * btree root is stored in the inode.
 920         */
 921        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 922            (lev == cur->bc_nlevels - 1))
 923                return 0;
 924
 925        if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
 926                return 0;
 927
 928        cur->bc_ra[lev] |= lr;
 929        block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
 930
 931        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 932                return xfs_btree_readahead_lblock(cur, lr, block);
 933        return xfs_btree_readahead_sblock(cur, lr, block);
 934}
 935
 936STATIC int
 937xfs_btree_ptr_to_daddr(
 938        struct xfs_btree_cur            *cur,
 939        const union xfs_btree_ptr       *ptr,
 940        xfs_daddr_t                     *daddr)
 941{
 942        xfs_fsblock_t           fsbno;
 943        xfs_agblock_t           agbno;
 944        int                     error;
 945
 946        error = xfs_btree_check_ptr(cur, ptr, 0, 1);
 947        if (error)
 948                return error;
 949
 950        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 951                fsbno = be64_to_cpu(ptr->l);
 952                *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
 953        } else {
 954                agbno = be32_to_cpu(ptr->s);
 955                *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
 956                                agbno);
 957        }
 958
 959        return 0;
 960}
 961
 962/*
 963 * Readahead @count btree blocks at the given @ptr location.
 964 *
 965 * We don't need to care about long or short form btrees here as we have a
 966 * method of converting the ptr directly to a daddr available to us.
 967 */
 968STATIC void
 969xfs_btree_readahead_ptr(
 970        struct xfs_btree_cur    *cur,
 971        union xfs_btree_ptr     *ptr,
 972        xfs_extlen_t            count)
 973{
 974        xfs_daddr_t             daddr;
 975
 976        if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
 977                return;
 978        xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
 979                          cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
 980}
 981
 982/*
 983 * Set the buffer for level "lev" in the cursor to bp, releasing
 984 * any previous buffer.
 985 */
 986STATIC void
 987xfs_btree_setbuf(
 988        xfs_btree_cur_t         *cur,   /* btree cursor */
 989        int                     lev,    /* level in btree */
 990        struct xfs_buf          *bp)    /* new buffer to set */
 991{
 992        struct xfs_btree_block  *b;     /* btree block */
 993
 994        if (cur->bc_bufs[lev])
 995                xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
 996        cur->bc_bufs[lev] = bp;
 997        cur->bc_ra[lev] = 0;
 998
 999        b = XFS_BUF_TO_BLOCK(bp);
1000        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1001                if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1002                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1003                if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1004                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1005        } else {
1006                if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1007                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1008                if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1009                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1010        }
1011}
1012
1013bool
1014xfs_btree_ptr_is_null(
1015        struct xfs_btree_cur            *cur,
1016        const union xfs_btree_ptr       *ptr)
1017{
1018        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1019                return ptr->l == cpu_to_be64(NULLFSBLOCK);
1020        else
1021                return ptr->s == cpu_to_be32(NULLAGBLOCK);
1022}
1023
1024void
1025xfs_btree_set_ptr_null(
1026        struct xfs_btree_cur    *cur,
1027        union xfs_btree_ptr     *ptr)
1028{
1029        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1030                ptr->l = cpu_to_be64(NULLFSBLOCK);
1031        else
1032                ptr->s = cpu_to_be32(NULLAGBLOCK);
1033}
1034
1035/*
1036 * Get/set/init sibling pointers
1037 */
1038void
1039xfs_btree_get_sibling(
1040        struct xfs_btree_cur    *cur,
1041        struct xfs_btree_block  *block,
1042        union xfs_btree_ptr     *ptr,
1043        int                     lr)
1044{
1045        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1046
1047        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1048                if (lr == XFS_BB_RIGHTSIB)
1049                        ptr->l = block->bb_u.l.bb_rightsib;
1050                else
1051                        ptr->l = block->bb_u.l.bb_leftsib;
1052        } else {
1053                if (lr == XFS_BB_RIGHTSIB)
1054                        ptr->s = block->bb_u.s.bb_rightsib;
1055                else
1056                        ptr->s = block->bb_u.s.bb_leftsib;
1057        }
1058}
1059
1060void
1061xfs_btree_set_sibling(
1062        struct xfs_btree_cur            *cur,
1063        struct xfs_btree_block          *block,
1064        const union xfs_btree_ptr       *ptr,
1065        int                             lr)
1066{
1067        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1068
1069        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1070                if (lr == XFS_BB_RIGHTSIB)
1071                        block->bb_u.l.bb_rightsib = ptr->l;
1072                else
1073                        block->bb_u.l.bb_leftsib = ptr->l;
1074        } else {
1075                if (lr == XFS_BB_RIGHTSIB)
1076                        block->bb_u.s.bb_rightsib = ptr->s;
1077                else
1078                        block->bb_u.s.bb_leftsib = ptr->s;
1079        }
1080}
1081
1082void
1083xfs_btree_init_block_int(
1084        struct xfs_mount        *mp,
1085        struct xfs_btree_block  *buf,
1086        xfs_daddr_t             blkno,
1087        xfs_btnum_t             btnum,
1088        __u16                   level,
1089        __u16                   numrecs,
1090        __u64                   owner,
1091        unsigned int            flags)
1092{
1093        int                     crc = xfs_has_crc(mp);
1094        __u32                   magic = xfs_btree_magic(crc, btnum);
1095
1096        buf->bb_magic = cpu_to_be32(magic);
1097        buf->bb_level = cpu_to_be16(level);
1098        buf->bb_numrecs = cpu_to_be16(numrecs);
1099
1100        if (flags & XFS_BTREE_LONG_PTRS) {
1101                buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1102                buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1103                if (crc) {
1104                        buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1105                        buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1106                        uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1107                        buf->bb_u.l.bb_pad = 0;
1108                        buf->bb_u.l.bb_lsn = 0;
1109                }
1110        } else {
1111                /* owner is a 32 bit value on short blocks */
1112                __u32 __owner = (__u32)owner;
1113
1114                buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1115                buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1116                if (crc) {
1117                        buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1118                        buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1119                        uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1120                        buf->bb_u.s.bb_lsn = 0;
1121                }
1122        }
1123}
1124
1125void
1126xfs_btree_init_block(
1127        struct xfs_mount *mp,
1128        struct xfs_buf  *bp,
1129        xfs_btnum_t     btnum,
1130        __u16           level,
1131        __u16           numrecs,
1132        __u64           owner)
1133{
1134        xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp),
1135                                 btnum, level, numrecs, owner, 0);
1136}
1137
1138void
1139xfs_btree_init_block_cur(
1140        struct xfs_btree_cur    *cur,
1141        struct xfs_buf          *bp,
1142        int                     level,
1143        int                     numrecs)
1144{
1145        __u64                   owner;
1146
1147        /*
1148         * we can pull the owner from the cursor right now as the different
1149         * owners align directly with the pointer size of the btree. This may
1150         * change in future, but is safe for current users of the generic btree
1151         * code.
1152         */
1153        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1154                owner = cur->bc_ino.ip->i_ino;
1155        else
1156                owner = cur->bc_ag.pag->pag_agno;
1157
1158        xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp),
1159                                xfs_buf_daddr(bp), cur->bc_btnum, level,
1160                                numrecs, owner, cur->bc_flags);
1161}
1162
1163/*
1164 * Return true if ptr is the last record in the btree and
1165 * we need to track updates to this record.  The decision
1166 * will be further refined in the update_lastrec method.
1167 */
1168STATIC int
1169xfs_btree_is_lastrec(
1170        struct xfs_btree_cur    *cur,
1171        struct xfs_btree_block  *block,
1172        int                     level)
1173{
1174        union xfs_btree_ptr     ptr;
1175
1176        if (level > 0)
1177                return 0;
1178        if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1179                return 0;
1180
1181        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1182        if (!xfs_btree_ptr_is_null(cur, &ptr))
1183                return 0;
1184        return 1;
1185}
1186
1187STATIC void
1188xfs_btree_buf_to_ptr(
1189        struct xfs_btree_cur    *cur,
1190        struct xfs_buf          *bp,
1191        union xfs_btree_ptr     *ptr)
1192{
1193        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1194                ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1195                                        xfs_buf_daddr(bp)));
1196        else {
1197                ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1198                                        xfs_buf_daddr(bp)));
1199        }
1200}
1201
1202STATIC void
1203xfs_btree_set_refs(
1204        struct xfs_btree_cur    *cur,
1205        struct xfs_buf          *bp)
1206{
1207        switch (cur->bc_btnum) {
1208        case XFS_BTNUM_BNO:
1209        case XFS_BTNUM_CNT:
1210                xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1211                break;
1212        case XFS_BTNUM_INO:
1213        case XFS_BTNUM_FINO:
1214                xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1215                break;
1216        case XFS_BTNUM_BMAP:
1217                xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1218                break;
1219        case XFS_BTNUM_RMAP:
1220                xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
1221                break;
1222        case XFS_BTNUM_REFC:
1223                xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
1224                break;
1225        default:
1226                ASSERT(0);
1227        }
1228}
1229
1230int
1231xfs_btree_get_buf_block(
1232        struct xfs_btree_cur            *cur,
1233        const union xfs_btree_ptr       *ptr,
1234        struct xfs_btree_block          **block,
1235        struct xfs_buf                  **bpp)
1236{
1237        struct xfs_mount        *mp = cur->bc_mp;
1238        xfs_daddr_t             d;
1239        int                     error;
1240
1241        error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1242        if (error)
1243                return error;
1244        error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
1245                        0, bpp);
1246        if (error)
1247                return error;
1248
1249        (*bpp)->b_ops = cur->bc_ops->buf_ops;
1250        *block = XFS_BUF_TO_BLOCK(*bpp);
1251        return 0;
1252}
1253
1254/*
1255 * Read in the buffer at the given ptr and return the buffer and
1256 * the block pointer within the buffer.
1257 */
1258STATIC int
1259xfs_btree_read_buf_block(
1260        struct xfs_btree_cur            *cur,
1261        const union xfs_btree_ptr       *ptr,
1262        int                             flags,
1263        struct xfs_btree_block          **block,
1264        struct xfs_buf                  **bpp)
1265{
1266        struct xfs_mount        *mp = cur->bc_mp;
1267        xfs_daddr_t             d;
1268        int                     error;
1269
1270        /* need to sort out how callers deal with failures first */
1271        ASSERT(!(flags & XBF_TRYLOCK));
1272
1273        error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1274        if (error)
1275                return error;
1276        error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1277                                   mp->m_bsize, flags, bpp,
1278                                   cur->bc_ops->buf_ops);
1279        if (error)
1280                return error;
1281
1282        xfs_btree_set_refs(cur, *bpp);
1283        *block = XFS_BUF_TO_BLOCK(*bpp);
1284        return 0;
1285}
1286
1287/*
1288 * Copy keys from one btree block to another.
1289 */
1290void
1291xfs_btree_copy_keys(
1292        struct xfs_btree_cur            *cur,
1293        union xfs_btree_key             *dst_key,
1294        const union xfs_btree_key       *src_key,
1295        int                             numkeys)
1296{
1297        ASSERT(numkeys >= 0);
1298        memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1299}
1300
1301/*
1302 * Copy records from one btree block to another.
1303 */
1304STATIC void
1305xfs_btree_copy_recs(
1306        struct xfs_btree_cur    *cur,
1307        union xfs_btree_rec     *dst_rec,
1308        union xfs_btree_rec     *src_rec,
1309        int                     numrecs)
1310{
1311        ASSERT(numrecs >= 0);
1312        memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1313}
1314
1315/*
1316 * Copy block pointers from one btree block to another.
1317 */
1318void
1319xfs_btree_copy_ptrs(
1320        struct xfs_btree_cur    *cur,
1321        union xfs_btree_ptr     *dst_ptr,
1322        const union xfs_btree_ptr *src_ptr,
1323        int                     numptrs)
1324{
1325        ASSERT(numptrs >= 0);
1326        memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1327}
1328
1329/*
1330 * Shift keys one index left/right inside a single btree block.
1331 */
1332STATIC void
1333xfs_btree_shift_keys(
1334        struct xfs_btree_cur    *cur,
1335        union xfs_btree_key     *key,
1336        int                     dir,
1337        int                     numkeys)
1338{
1339        char                    *dst_key;
1340
1341        ASSERT(numkeys >= 0);
1342        ASSERT(dir == 1 || dir == -1);
1343
1344        dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1345        memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1346}
1347
1348/*
1349 * Shift records one index left/right inside a single btree block.
1350 */
1351STATIC void
1352xfs_btree_shift_recs(
1353        struct xfs_btree_cur    *cur,
1354        union xfs_btree_rec     *rec,
1355        int                     dir,
1356        int                     numrecs)
1357{
1358        char                    *dst_rec;
1359
1360        ASSERT(numrecs >= 0);
1361        ASSERT(dir == 1 || dir == -1);
1362
1363        dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1364        memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1365}
1366
1367/*
1368 * Shift block pointers one index left/right inside a single btree block.
1369 */
1370STATIC void
1371xfs_btree_shift_ptrs(
1372        struct xfs_btree_cur    *cur,
1373        union xfs_btree_ptr     *ptr,
1374        int                     dir,
1375        int                     numptrs)
1376{
1377        char                    *dst_ptr;
1378
1379        ASSERT(numptrs >= 0);
1380        ASSERT(dir == 1 || dir == -1);
1381
1382        dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1383        memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1384}
1385
1386/*
1387 * Log key values from the btree block.
1388 */
1389STATIC void
1390xfs_btree_log_keys(
1391        struct xfs_btree_cur    *cur,
1392        struct xfs_buf          *bp,
1393        int                     first,
1394        int                     last)
1395{
1396
1397        if (bp) {
1398                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1399                xfs_trans_log_buf(cur->bc_tp, bp,
1400                                  xfs_btree_key_offset(cur, first),
1401                                  xfs_btree_key_offset(cur, last + 1) - 1);
1402        } else {
1403                xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1404                                xfs_ilog_fbroot(cur->bc_ino.whichfork));
1405        }
1406}
1407
1408/*
1409 * Log record values from the btree block.
1410 */
1411void
1412xfs_btree_log_recs(
1413        struct xfs_btree_cur    *cur,
1414        struct xfs_buf          *bp,
1415        int                     first,
1416        int                     last)
1417{
1418
1419        xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1420        xfs_trans_log_buf(cur->bc_tp, bp,
1421                          xfs_btree_rec_offset(cur, first),
1422                          xfs_btree_rec_offset(cur, last + 1) - 1);
1423
1424}
1425
1426/*
1427 * Log block pointer fields from a btree block (nonleaf).
1428 */
1429STATIC void
1430xfs_btree_log_ptrs(
1431        struct xfs_btree_cur    *cur,   /* btree cursor */
1432        struct xfs_buf          *bp,    /* buffer containing btree block */
1433        int                     first,  /* index of first pointer to log */
1434        int                     last)   /* index of last pointer to log */
1435{
1436
1437        if (bp) {
1438                struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1439                int                     level = xfs_btree_get_level(block);
1440
1441                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1442                xfs_trans_log_buf(cur->bc_tp, bp,
1443                                xfs_btree_ptr_offset(cur, first, level),
1444                                xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1445        } else {
1446                xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1447                        xfs_ilog_fbroot(cur->bc_ino.whichfork));
1448        }
1449
1450}
1451
1452/*
1453 * Log fields from a btree block header.
1454 */
1455void
1456xfs_btree_log_block(
1457        struct xfs_btree_cur    *cur,   /* btree cursor */
1458        struct xfs_buf          *bp,    /* buffer containing btree block */
1459        int                     fields) /* mask of fields: XFS_BB_... */
1460{
1461        int                     first;  /* first byte offset logged */
1462        int                     last;   /* last byte offset logged */
1463        static const short      soffsets[] = {  /* table of offsets (short) */
1464                offsetof(struct xfs_btree_block, bb_magic),
1465                offsetof(struct xfs_btree_block, bb_level),
1466                offsetof(struct xfs_btree_block, bb_numrecs),
1467                offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1468                offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1469                offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1470                offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1471                offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1472                offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1473                offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1474                XFS_BTREE_SBLOCK_CRC_LEN
1475        };
1476        static const short      loffsets[] = {  /* table of offsets (long) */
1477                offsetof(struct xfs_btree_block, bb_magic),
1478                offsetof(struct xfs_btree_block, bb_level),
1479                offsetof(struct xfs_btree_block, bb_numrecs),
1480                offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1481                offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1482                offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1483                offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1484                offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1485                offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1486                offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1487                offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1488                XFS_BTREE_LBLOCK_CRC_LEN
1489        };
1490
1491        if (bp) {
1492                int nbits;
1493
1494                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1495                        /*
1496                         * We don't log the CRC when updating a btree
1497                         * block but instead recreate it during log
1498                         * recovery.  As the log buffers have checksums
1499                         * of their own this is safe and avoids logging a crc
1500                         * update in a lot of places.
1501                         */
1502                        if (fields == XFS_BB_ALL_BITS)
1503                                fields = XFS_BB_ALL_BITS_CRC;
1504                        nbits = XFS_BB_NUM_BITS_CRC;
1505                } else {
1506                        nbits = XFS_BB_NUM_BITS;
1507                }
1508                xfs_btree_offsets(fields,
1509                                  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1510                                        loffsets : soffsets,
1511                                  nbits, &first, &last);
1512                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1513                xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1514        } else {
1515                xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1516                        xfs_ilog_fbroot(cur->bc_ino.whichfork));
1517        }
1518}
1519
1520/*
1521 * Increment cursor by one record at the level.
1522 * For nonzero levels the leaf-ward information is untouched.
1523 */
1524int                                             /* error */
1525xfs_btree_increment(
1526        struct xfs_btree_cur    *cur,
1527        int                     level,
1528        int                     *stat)          /* success/failure */
1529{
1530        struct xfs_btree_block  *block;
1531        union xfs_btree_ptr     ptr;
1532        struct xfs_buf          *bp;
1533        int                     error;          /* error return value */
1534        int                     lev;
1535
1536        ASSERT(level < cur->bc_nlevels);
1537
1538        /* Read-ahead to the right at this level. */
1539        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1540
1541        /* Get a pointer to the btree block. */
1542        block = xfs_btree_get_block(cur, level, &bp);
1543
1544#ifdef DEBUG
1545        error = xfs_btree_check_block(cur, block, level, bp);
1546        if (error)
1547                goto error0;
1548#endif
1549
1550        /* We're done if we remain in the block after the increment. */
1551        if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1552                goto out1;
1553
1554        /* Fail if we just went off the right edge of the tree. */
1555        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1556        if (xfs_btree_ptr_is_null(cur, &ptr))
1557                goto out0;
1558
1559        XFS_BTREE_STATS_INC(cur, increment);
1560
1561        /*
1562         * March up the tree incrementing pointers.
1563         * Stop when we don't go off the right edge of a block.
1564         */
1565        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1566                block = xfs_btree_get_block(cur, lev, &bp);
1567
1568#ifdef DEBUG
1569                error = xfs_btree_check_block(cur, block, lev, bp);
1570                if (error)
1571                        goto error0;
1572#endif
1573
1574                if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1575                        break;
1576
1577                /* Read-ahead the right block for the next loop. */
1578                xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1579        }
1580
1581        /*
1582         * If we went off the root then we are either seriously
1583         * confused or have the tree root in an inode.
1584         */
1585        if (lev == cur->bc_nlevels) {
1586                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1587                        goto out0;
1588                ASSERT(0);
1589                error = -EFSCORRUPTED;
1590                goto error0;
1591        }
1592        ASSERT(lev < cur->bc_nlevels);
1593
1594        /*
1595         * Now walk back down the tree, fixing up the cursor's buffer
1596         * pointers and key numbers.
1597         */
1598        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1599                union xfs_btree_ptr     *ptrp;
1600
1601                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1602                --lev;
1603                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1604                if (error)
1605                        goto error0;
1606
1607                xfs_btree_setbuf(cur, lev, bp);
1608                cur->bc_ptrs[lev] = 1;
1609        }
1610out1:
1611        *stat = 1;
1612        return 0;
1613
1614out0:
1615        *stat = 0;
1616        return 0;
1617
1618error0:
1619        return error;
1620}
1621
1622/*
1623 * Decrement cursor by one record at the level.
1624 * For nonzero levels the leaf-ward information is untouched.
1625 */
1626int                                             /* error */
1627xfs_btree_decrement(
1628        struct xfs_btree_cur    *cur,
1629        int                     level,
1630        int                     *stat)          /* success/failure */
1631{
1632        struct xfs_btree_block  *block;
1633        struct xfs_buf          *bp;
1634        int                     error;          /* error return value */
1635        int                     lev;
1636        union xfs_btree_ptr     ptr;
1637
1638        ASSERT(level < cur->bc_nlevels);
1639
1640        /* Read-ahead to the left at this level. */
1641        xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1642
1643        /* We're done if we remain in the block after the decrement. */
1644        if (--cur->bc_ptrs[level] > 0)
1645                goto out1;
1646
1647        /* Get a pointer to the btree block. */
1648        block = xfs_btree_get_block(cur, level, &bp);
1649
1650#ifdef DEBUG
1651        error = xfs_btree_check_block(cur, block, level, bp);
1652        if (error)
1653                goto error0;
1654#endif
1655
1656        /* Fail if we just went off the left edge of the tree. */
1657        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1658        if (xfs_btree_ptr_is_null(cur, &ptr))
1659                goto out0;
1660
1661        XFS_BTREE_STATS_INC(cur, decrement);
1662
1663        /*
1664         * March up the tree decrementing pointers.
1665         * Stop when we don't go off the left edge of a block.
1666         */
1667        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1668                if (--cur->bc_ptrs[lev] > 0)
1669                        break;
1670                /* Read-ahead the left block for the next loop. */
1671                xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1672        }
1673
1674        /*
1675         * If we went off the root then we are seriously confused.
1676         * or the root of the tree is in an inode.
1677         */
1678        if (lev == cur->bc_nlevels) {
1679                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1680                        goto out0;
1681                ASSERT(0);
1682                error = -EFSCORRUPTED;
1683                goto error0;
1684        }
1685        ASSERT(lev < cur->bc_nlevels);
1686
1687        /*
1688         * Now walk back down the tree, fixing up the cursor's buffer
1689         * pointers and key numbers.
1690         */
1691        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1692                union xfs_btree_ptr     *ptrp;
1693
1694                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1695                --lev;
1696                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1697                if (error)
1698                        goto error0;
1699                xfs_btree_setbuf(cur, lev, bp);
1700                cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1701        }
1702out1:
1703        *stat = 1;
1704        return 0;
1705
1706out0:
1707        *stat = 0;
1708        return 0;
1709
1710error0:
1711        return error;
1712}
1713
1714int
1715xfs_btree_lookup_get_block(
1716        struct xfs_btree_cur            *cur,   /* btree cursor */
1717        int                             level,  /* level in the btree */
1718        const union xfs_btree_ptr       *pp,    /* ptr to btree block */
1719        struct xfs_btree_block          **blkp) /* return btree block */
1720{
1721        struct xfs_buf          *bp;    /* buffer pointer for btree block */
1722        xfs_daddr_t             daddr;
1723        int                     error = 0;
1724
1725        /* special case the root block if in an inode */
1726        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1727            (level == cur->bc_nlevels - 1)) {
1728                *blkp = xfs_btree_get_iroot(cur);
1729                return 0;
1730        }
1731
1732        /*
1733         * If the old buffer at this level for the disk address we are
1734         * looking for re-use it.
1735         *
1736         * Otherwise throw it away and get a new one.
1737         */
1738        bp = cur->bc_bufs[level];
1739        error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
1740        if (error)
1741                return error;
1742        if (bp && xfs_buf_daddr(bp) == daddr) {
1743                *blkp = XFS_BUF_TO_BLOCK(bp);
1744                return 0;
1745        }
1746
1747        error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1748        if (error)
1749                return error;
1750
1751        /* Check the inode owner since the verifiers don't. */
1752        if (xfs_has_crc(cur->bc_mp) &&
1753            !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
1754            (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
1755            be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
1756                        cur->bc_ino.ip->i_ino)
1757                goto out_bad;
1758
1759        /* Did we get the level we were looking for? */
1760        if (be16_to_cpu((*blkp)->bb_level) != level)
1761                goto out_bad;
1762
1763        /* Check that internal nodes have at least one record. */
1764        if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
1765                goto out_bad;
1766
1767        xfs_btree_setbuf(cur, level, bp);
1768        return 0;
1769
1770out_bad:
1771        *blkp = NULL;
1772        xfs_buf_mark_corrupt(bp);
1773        xfs_trans_brelse(cur->bc_tp, bp);
1774        return -EFSCORRUPTED;
1775}
1776
1777/*
1778 * Get current search key.  For level 0 we don't actually have a key
1779 * structure so we make one up from the record.  For all other levels
1780 * we just return the right key.
1781 */
1782STATIC union xfs_btree_key *
1783xfs_lookup_get_search_key(
1784        struct xfs_btree_cur    *cur,
1785        int                     level,
1786        int                     keyno,
1787        struct xfs_btree_block  *block,
1788        union xfs_btree_key     *kp)
1789{
1790        if (level == 0) {
1791                cur->bc_ops->init_key_from_rec(kp,
1792                                xfs_btree_rec_addr(cur, keyno, block));
1793                return kp;
1794        }
1795
1796        return xfs_btree_key_addr(cur, keyno, block);
1797}
1798
1799/*
1800 * Lookup the record.  The cursor is made to point to it, based on dir.
1801 * stat is set to 0 if can't find any such record, 1 for success.
1802 */
1803int                                     /* error */
1804xfs_btree_lookup(
1805        struct xfs_btree_cur    *cur,   /* btree cursor */
1806        xfs_lookup_t            dir,    /* <=, ==, or >= */
1807        int                     *stat)  /* success/failure */
1808{
1809        struct xfs_btree_block  *block; /* current btree block */
1810        int64_t                 diff;   /* difference for the current key */
1811        int                     error;  /* error return value */
1812        int                     keyno;  /* current key number */
1813        int                     level;  /* level in the btree */
1814        union xfs_btree_ptr     *pp;    /* ptr to btree block */
1815        union xfs_btree_ptr     ptr;    /* ptr to btree block */
1816
1817        XFS_BTREE_STATS_INC(cur, lookup);
1818
1819        /* No such thing as a zero-level tree. */
1820        if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0))
1821                return -EFSCORRUPTED;
1822
1823        block = NULL;
1824        keyno = 0;
1825
1826        /* initialise start pointer from cursor */
1827        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1828        pp = &ptr;
1829
1830        /*
1831         * Iterate over each level in the btree, starting at the root.
1832         * For each level above the leaves, find the key we need, based
1833         * on the lookup record, then follow the corresponding block
1834         * pointer down to the next level.
1835         */
1836        for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1837                /* Get the block we need to do the lookup on. */
1838                error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1839                if (error)
1840                        goto error0;
1841
1842                if (diff == 0) {
1843                        /*
1844                         * If we already had a key match at a higher level, we
1845                         * know we need to use the first entry in this block.
1846                         */
1847                        keyno = 1;
1848                } else {
1849                        /* Otherwise search this block. Do a binary search. */
1850
1851                        int     high;   /* high entry number */
1852                        int     low;    /* low entry number */
1853
1854                        /* Set low and high entry numbers, 1-based. */
1855                        low = 1;
1856                        high = xfs_btree_get_numrecs(block);
1857                        if (!high) {
1858                                /* Block is empty, must be an empty leaf. */
1859                                if (level != 0 || cur->bc_nlevels != 1) {
1860                                        XFS_CORRUPTION_ERROR(__func__,
1861                                                        XFS_ERRLEVEL_LOW,
1862                                                        cur->bc_mp, block,
1863                                                        sizeof(*block));
1864                                        return -EFSCORRUPTED;
1865                                }
1866
1867                                cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1868                                *stat = 0;
1869                                return 0;
1870                        }
1871
1872                        /* Binary search the block. */
1873                        while (low <= high) {
1874                                union xfs_btree_key     key;
1875                                union xfs_btree_key     *kp;
1876
1877                                XFS_BTREE_STATS_INC(cur, compare);
1878
1879                                /* keyno is average of low and high. */
1880                                keyno = (low + high) >> 1;
1881
1882                                /* Get current search key */
1883                                kp = xfs_lookup_get_search_key(cur, level,
1884                                                keyno, block, &key);
1885
1886                                /*
1887                                 * Compute difference to get next direction:
1888                                 *  - less than, move right
1889                                 *  - greater than, move left
1890                                 *  - equal, we're done
1891                                 */
1892                                diff = cur->bc_ops->key_diff(cur, kp);
1893                                if (diff < 0)
1894                                        low = keyno + 1;
1895                                else if (diff > 0)
1896                                        high = keyno - 1;
1897                                else
1898                                        break;
1899                        }
1900                }
1901
1902                /*
1903                 * If there are more levels, set up for the next level
1904                 * by getting the block number and filling in the cursor.
1905                 */
1906                if (level > 0) {
1907                        /*
1908                         * If we moved left, need the previous key number,
1909                         * unless there isn't one.
1910                         */
1911                        if (diff > 0 && --keyno < 1)
1912                                keyno = 1;
1913                        pp = xfs_btree_ptr_addr(cur, keyno, block);
1914
1915                        error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
1916                        if (error)
1917                                goto error0;
1918
1919                        cur->bc_ptrs[level] = keyno;
1920                }
1921        }
1922
1923        /* Done with the search. See if we need to adjust the results. */
1924        if (dir != XFS_LOOKUP_LE && diff < 0) {
1925                keyno++;
1926                /*
1927                 * If ge search and we went off the end of the block, but it's
1928                 * not the last block, we're in the wrong block.
1929                 */
1930                xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1931                if (dir == XFS_LOOKUP_GE &&
1932                    keyno > xfs_btree_get_numrecs(block) &&
1933                    !xfs_btree_ptr_is_null(cur, &ptr)) {
1934                        int     i;
1935
1936                        cur->bc_ptrs[0] = keyno;
1937                        error = xfs_btree_increment(cur, 0, &i);
1938                        if (error)
1939                                goto error0;
1940                        if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
1941                                return -EFSCORRUPTED;
1942                        *stat = 1;
1943                        return 0;
1944                }
1945        } else if (dir == XFS_LOOKUP_LE && diff > 0)
1946                keyno--;
1947        cur->bc_ptrs[0] = keyno;
1948
1949        /* Return if we succeeded or not. */
1950        if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1951                *stat = 0;
1952        else if (dir != XFS_LOOKUP_EQ || diff == 0)
1953                *stat = 1;
1954        else
1955                *stat = 0;
1956        return 0;
1957
1958error0:
1959        return error;
1960}
1961
1962/* Find the high key storage area from a regular key. */
1963union xfs_btree_key *
1964xfs_btree_high_key_from_key(
1965        struct xfs_btree_cur    *cur,
1966        union xfs_btree_key     *key)
1967{
1968        ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
1969        return (union xfs_btree_key *)((char *)key +
1970                        (cur->bc_ops->key_len / 2));
1971}
1972
1973/* Determine the low (and high if overlapped) keys of a leaf block */
1974STATIC void
1975xfs_btree_get_leaf_keys(
1976        struct xfs_btree_cur    *cur,
1977        struct xfs_btree_block  *block,
1978        union xfs_btree_key     *key)
1979{
1980        union xfs_btree_key     max_hkey;
1981        union xfs_btree_key     hkey;
1982        union xfs_btree_rec     *rec;
1983        union xfs_btree_key     *high;
1984        int                     n;
1985
1986        rec = xfs_btree_rec_addr(cur, 1, block);
1987        cur->bc_ops->init_key_from_rec(key, rec);
1988
1989        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
1990
1991                cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
1992                for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
1993                        rec = xfs_btree_rec_addr(cur, n, block);
1994                        cur->bc_ops->init_high_key_from_rec(&hkey, rec);
1995                        if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
1996                                        > 0)
1997                                max_hkey = hkey;
1998                }
1999
2000                high = xfs_btree_high_key_from_key(cur, key);
2001                memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2002        }
2003}
2004
2005/* Determine the low (and high if overlapped) keys of a node block */
2006STATIC void
2007xfs_btree_get_node_keys(
2008        struct xfs_btree_cur    *cur,
2009        struct xfs_btree_block  *block,
2010        union xfs_btree_key     *key)
2011{
2012        union xfs_btree_key     *hkey;
2013        union xfs_btree_key     *max_hkey;
2014        union xfs_btree_key     *high;
2015        int                     n;
2016
2017        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2018                memcpy(key, xfs_btree_key_addr(cur, 1, block),
2019                                cur->bc_ops->key_len / 2);
2020
2021                max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2022                for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2023                        hkey = xfs_btree_high_key_addr(cur, n, block);
2024                        if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
2025                                max_hkey = hkey;
2026                }
2027
2028                high = xfs_btree_high_key_from_key(cur, key);
2029                memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2030        } else {
2031                memcpy(key, xfs_btree_key_addr(cur, 1, block),
2032                                cur->bc_ops->key_len);
2033        }
2034}
2035
2036/* Derive the keys for any btree block. */
2037void
2038xfs_btree_get_keys(
2039        struct xfs_btree_cur    *cur,
2040        struct xfs_btree_block  *block,
2041        union xfs_btree_key     *key)
2042{
2043        if (be16_to_cpu(block->bb_level) == 0)
2044                xfs_btree_get_leaf_keys(cur, block, key);
2045        else
2046                xfs_btree_get_node_keys(cur, block, key);
2047}
2048
2049/*
2050 * Decide if we need to update the parent keys of a btree block.  For
2051 * a standard btree this is only necessary if we're updating the first
2052 * record/key.  For an overlapping btree, we must always update the
2053 * keys because the highest key can be in any of the records or keys
2054 * in the block.
2055 */
2056static inline bool
2057xfs_btree_needs_key_update(
2058        struct xfs_btree_cur    *cur,
2059        int                     ptr)
2060{
2061        return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
2062}
2063
2064/*
2065 * Update the low and high parent keys of the given level, progressing
2066 * towards the root.  If force_all is false, stop if the keys for a given
2067 * level do not need updating.
2068 */
2069STATIC int
2070__xfs_btree_updkeys(
2071        struct xfs_btree_cur    *cur,
2072        int                     level,
2073        struct xfs_btree_block  *block,
2074        struct xfs_buf          *bp0,
2075        bool                    force_all)
2076{
2077        union xfs_btree_key     key;    /* keys from current level */
2078        union xfs_btree_key     *lkey;  /* keys from the next level up */
2079        union xfs_btree_key     *hkey;
2080        union xfs_btree_key     *nlkey; /* keys from the next level up */
2081        union xfs_btree_key     *nhkey;
2082        struct xfs_buf          *bp;
2083        int                     ptr;
2084
2085        ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2086
2087        /* Exit if there aren't any parent levels to update. */
2088        if (level + 1 >= cur->bc_nlevels)
2089                return 0;
2090
2091        trace_xfs_btree_updkeys(cur, level, bp0);
2092
2093        lkey = &key;
2094        hkey = xfs_btree_high_key_from_key(cur, lkey);
2095        xfs_btree_get_keys(cur, block, lkey);
2096        for (level++; level < cur->bc_nlevels; level++) {
2097#ifdef DEBUG
2098                int             error;
2099#endif
2100                block = xfs_btree_get_block(cur, level, &bp);
2101                trace_xfs_btree_updkeys(cur, level, bp);
2102#ifdef DEBUG
2103                error = xfs_btree_check_block(cur, block, level, bp);
2104                if (error)
2105                        return error;
2106#endif
2107                ptr = cur->bc_ptrs[level];
2108                nlkey = xfs_btree_key_addr(cur, ptr, block);
2109                nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2110                if (!force_all &&
2111                    !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
2112                      cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
2113                        break;
2114                xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2115                xfs_btree_log_keys(cur, bp, ptr, ptr);
2116                if (level + 1 >= cur->bc_nlevels)
2117                        break;
2118                xfs_btree_get_node_keys(cur, block, lkey);
2119        }
2120
2121        return 0;
2122}
2123
2124/* Update all the keys from some level in cursor back to the root. */
2125STATIC int
2126xfs_btree_updkeys_force(
2127        struct xfs_btree_cur    *cur,
2128        int                     level)
2129{
2130        struct xfs_buf          *bp;
2131        struct xfs_btree_block  *block;
2132
2133        block = xfs_btree_get_block(cur, level, &bp);
2134        return __xfs_btree_updkeys(cur, level, block, bp, true);
2135}
2136
2137/*
2138 * Update the parent keys of the given level, progressing towards the root.
2139 */
2140STATIC int
2141xfs_btree_update_keys(
2142        struct xfs_btree_cur    *cur,
2143        int                     level)
2144{
2145        struct xfs_btree_block  *block;
2146        struct xfs_buf          *bp;
2147        union xfs_btree_key     *kp;
2148        union xfs_btree_key     key;
2149        int                     ptr;
2150
2151        ASSERT(level >= 0);
2152
2153        block = xfs_btree_get_block(cur, level, &bp);
2154        if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
2155                return __xfs_btree_updkeys(cur, level, block, bp, false);
2156
2157        /*
2158         * Go up the tree from this level toward the root.
2159         * At each level, update the key value to the value input.
2160         * Stop when we reach a level where the cursor isn't pointing
2161         * at the first entry in the block.
2162         */
2163        xfs_btree_get_keys(cur, block, &key);
2164        for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2165#ifdef DEBUG
2166                int             error;
2167#endif
2168                block = xfs_btree_get_block(cur, level, &bp);
2169#ifdef DEBUG
2170                error = xfs_btree_check_block(cur, block, level, bp);
2171                if (error)
2172                        return error;
2173#endif
2174                ptr = cur->bc_ptrs[level];
2175                kp = xfs_btree_key_addr(cur, ptr, block);
2176                xfs_btree_copy_keys(cur, kp, &key, 1);
2177                xfs_btree_log_keys(cur, bp, ptr, ptr);
2178        }
2179
2180        return 0;
2181}
2182
2183/*
2184 * Update the record referred to by cur to the value in the
2185 * given record. This either works (return 0) or gets an
2186 * EFSCORRUPTED error.
2187 */
2188int
2189xfs_btree_update(
2190        struct xfs_btree_cur    *cur,
2191        union xfs_btree_rec     *rec)
2192{
2193        struct xfs_btree_block  *block;
2194        struct xfs_buf          *bp;
2195        int                     error;
2196        int                     ptr;
2197        union xfs_btree_rec     *rp;
2198
2199        /* Pick up the current block. */
2200        block = xfs_btree_get_block(cur, 0, &bp);
2201
2202#ifdef DEBUG
2203        error = xfs_btree_check_block(cur, block, 0, bp);
2204        if (error)
2205                goto error0;
2206#endif
2207        /* Get the address of the rec to be updated. */
2208        ptr = cur->bc_ptrs[0];
2209        rp = xfs_btree_rec_addr(cur, ptr, block);
2210
2211        /* Fill in the new contents and log them. */
2212        xfs_btree_copy_recs(cur, rp, rec, 1);
2213        xfs_btree_log_recs(cur, bp, ptr, ptr);
2214
2215        /*
2216         * If we are tracking the last record in the tree and
2217         * we are at the far right edge of the tree, update it.
2218         */
2219        if (xfs_btree_is_lastrec(cur, block, 0)) {
2220                cur->bc_ops->update_lastrec(cur, block, rec,
2221                                            ptr, LASTREC_UPDATE);
2222        }
2223
2224        /* Pass new key value up to our parent. */
2225        if (xfs_btree_needs_key_update(cur, ptr)) {
2226                error = xfs_btree_update_keys(cur, 0);
2227                if (error)
2228                        goto error0;
2229        }
2230
2231        return 0;
2232
2233error0:
2234        return error;
2235}
2236
2237/*
2238 * Move 1 record left from cur/level if possible.
2239 * Update cur to reflect the new path.
2240 */
2241STATIC int                                      /* error */
2242xfs_btree_lshift(
2243        struct xfs_btree_cur    *cur,
2244        int                     level,
2245        int                     *stat)          /* success/failure */
2246{
2247        struct xfs_buf          *lbp;           /* left buffer pointer */
2248        struct xfs_btree_block  *left;          /* left btree block */
2249        int                     lrecs;          /* left record count */
2250        struct xfs_buf          *rbp;           /* right buffer pointer */
2251        struct xfs_btree_block  *right;         /* right btree block */
2252        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2253        int                     rrecs;          /* right record count */
2254        union xfs_btree_ptr     lptr;           /* left btree pointer */
2255        union xfs_btree_key     *rkp = NULL;    /* right btree key */
2256        union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
2257        union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
2258        int                     error;          /* error return value */
2259        int                     i;
2260
2261        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2262            level == cur->bc_nlevels - 1)
2263                goto out0;
2264
2265        /* Set up variables for this block as "right". */
2266        right = xfs_btree_get_block(cur, level, &rbp);
2267
2268#ifdef DEBUG
2269        error = xfs_btree_check_block(cur, right, level, rbp);
2270        if (error)
2271                goto error0;
2272#endif
2273
2274        /* If we've got no left sibling then we can't shift an entry left. */
2275        xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2276        if (xfs_btree_ptr_is_null(cur, &lptr))
2277                goto out0;
2278
2279        /*
2280         * If the cursor entry is the one that would be moved, don't
2281         * do it... it's too complicated.
2282         */
2283        if (cur->bc_ptrs[level] <= 1)
2284                goto out0;
2285
2286        /* Set up the left neighbor as "left". */
2287        error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2288        if (error)
2289                goto error0;
2290
2291        /* If it's full, it can't take another entry. */
2292        lrecs = xfs_btree_get_numrecs(left);
2293        if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2294                goto out0;
2295
2296        rrecs = xfs_btree_get_numrecs(right);
2297
2298        /*
2299         * We add one entry to the left side and remove one for the right side.
2300         * Account for it here, the changes will be updated on disk and logged
2301         * later.
2302         */
2303        lrecs++;
2304        rrecs--;
2305
2306        XFS_BTREE_STATS_INC(cur, lshift);
2307        XFS_BTREE_STATS_ADD(cur, moves, 1);
2308
2309        /*
2310         * If non-leaf, copy a key and a ptr to the left block.
2311         * Log the changes to the left block.
2312         */
2313        if (level > 0) {
2314                /* It's a non-leaf.  Move keys and pointers. */
2315                union xfs_btree_key     *lkp;   /* left btree key */
2316                union xfs_btree_ptr     *lpp;   /* left address pointer */
2317
2318                lkp = xfs_btree_key_addr(cur, lrecs, left);
2319                rkp = xfs_btree_key_addr(cur, 1, right);
2320
2321                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2322                rpp = xfs_btree_ptr_addr(cur, 1, right);
2323
2324                error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
2325                if (error)
2326                        goto error0;
2327
2328                xfs_btree_copy_keys(cur, lkp, rkp, 1);
2329                xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2330
2331                xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2332                xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2333
2334                ASSERT(cur->bc_ops->keys_inorder(cur,
2335                        xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2336        } else {
2337                /* It's a leaf.  Move records.  */
2338                union xfs_btree_rec     *lrp;   /* left record pointer */
2339
2340                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2341                rrp = xfs_btree_rec_addr(cur, 1, right);
2342
2343                xfs_btree_copy_recs(cur, lrp, rrp, 1);
2344                xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2345
2346                ASSERT(cur->bc_ops->recs_inorder(cur,
2347                        xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2348        }
2349
2350        xfs_btree_set_numrecs(left, lrecs);
2351        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2352
2353        xfs_btree_set_numrecs(right, rrecs);
2354        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2355
2356        /*
2357         * Slide the contents of right down one entry.
2358         */
2359        XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2360        if (level > 0) {
2361                /* It's a nonleaf. operate on keys and ptrs */
2362                for (i = 0; i < rrecs; i++) {
2363                        error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
2364                        if (error)
2365                                goto error0;
2366                }
2367
2368                xfs_btree_shift_keys(cur,
2369                                xfs_btree_key_addr(cur, 2, right),
2370                                -1, rrecs);
2371                xfs_btree_shift_ptrs(cur,
2372                                xfs_btree_ptr_addr(cur, 2, right),
2373                                -1, rrecs);
2374
2375                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2376                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2377        } else {
2378                /* It's a leaf. operate on records */
2379                xfs_btree_shift_recs(cur,
2380                        xfs_btree_rec_addr(cur, 2, right),
2381                        -1, rrecs);
2382                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2383        }
2384
2385        /*
2386         * Using a temporary cursor, update the parent key values of the
2387         * block on the left.
2388         */
2389        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2390                error = xfs_btree_dup_cursor(cur, &tcur);
2391                if (error)
2392                        goto error0;
2393                i = xfs_btree_firstrec(tcur, level);
2394                if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2395                        error = -EFSCORRUPTED;
2396                        goto error0;
2397                }
2398
2399                error = xfs_btree_decrement(tcur, level, &i);
2400                if (error)
2401                        goto error1;
2402
2403                /* Update the parent high keys of the left block, if needed. */
2404                error = xfs_btree_update_keys(tcur, level);
2405                if (error)
2406                        goto error1;
2407
2408                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2409        }
2410
2411        /* Update the parent keys of the right block. */
2412        error = xfs_btree_update_keys(cur, level);
2413        if (error)
2414                goto error0;
2415
2416        /* Slide the cursor value left one. */
2417        cur->bc_ptrs[level]--;
2418
2419        *stat = 1;
2420        return 0;
2421
2422out0:
2423        *stat = 0;
2424        return 0;
2425
2426error0:
2427        return error;
2428
2429error1:
2430        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2431        return error;
2432}
2433
2434/*
2435 * Move 1 record right from cur/level if possible.
2436 * Update cur to reflect the new path.
2437 */
2438STATIC int                                      /* error */
2439xfs_btree_rshift(
2440        struct xfs_btree_cur    *cur,
2441        int                     level,
2442        int                     *stat)          /* success/failure */
2443{
2444        struct xfs_buf          *lbp;           /* left buffer pointer */
2445        struct xfs_btree_block  *left;          /* left btree block */
2446        struct xfs_buf          *rbp;           /* right buffer pointer */
2447        struct xfs_btree_block  *right;         /* right btree block */
2448        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2449        union xfs_btree_ptr     rptr;           /* right block pointer */
2450        union xfs_btree_key     *rkp;           /* right btree key */
2451        int                     rrecs;          /* right record count */
2452        int                     lrecs;          /* left record count */
2453        int                     error;          /* error return value */
2454        int                     i;              /* loop counter */
2455
2456        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2457            (level == cur->bc_nlevels - 1))
2458                goto out0;
2459
2460        /* Set up variables for this block as "left". */
2461        left = xfs_btree_get_block(cur, level, &lbp);
2462
2463#ifdef DEBUG
2464        error = xfs_btree_check_block(cur, left, level, lbp);
2465        if (error)
2466                goto error0;
2467#endif
2468
2469        /* If we've got no right sibling then we can't shift an entry right. */
2470        xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2471        if (xfs_btree_ptr_is_null(cur, &rptr))
2472                goto out0;
2473
2474        /*
2475         * If the cursor entry is the one that would be moved, don't
2476         * do it... it's too complicated.
2477         */
2478        lrecs = xfs_btree_get_numrecs(left);
2479        if (cur->bc_ptrs[level] >= lrecs)
2480                goto out0;
2481
2482        /* Set up the right neighbor as "right". */
2483        error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2484        if (error)
2485                goto error0;
2486
2487        /* If it's full, it can't take another entry. */
2488        rrecs = xfs_btree_get_numrecs(right);
2489        if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2490                goto out0;
2491
2492        XFS_BTREE_STATS_INC(cur, rshift);
2493        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2494
2495        /*
2496         * Make a hole at the start of the right neighbor block, then
2497         * copy the last left block entry to the hole.
2498         */
2499        if (level > 0) {
2500                /* It's a nonleaf. make a hole in the keys and ptrs */
2501                union xfs_btree_key     *lkp;
2502                union xfs_btree_ptr     *lpp;
2503                union xfs_btree_ptr     *rpp;
2504
2505                lkp = xfs_btree_key_addr(cur, lrecs, left);
2506                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2507                rkp = xfs_btree_key_addr(cur, 1, right);
2508                rpp = xfs_btree_ptr_addr(cur, 1, right);
2509
2510                for (i = rrecs - 1; i >= 0; i--) {
2511                        error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
2512                        if (error)
2513                                goto error0;
2514                }
2515
2516                xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2517                xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2518
2519                error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
2520                if (error)
2521                        goto error0;
2522
2523                /* Now put the new data in, and log it. */
2524                xfs_btree_copy_keys(cur, rkp, lkp, 1);
2525                xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2526
2527                xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2528                xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2529
2530                ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2531                        xfs_btree_key_addr(cur, 2, right)));
2532        } else {
2533                /* It's a leaf. make a hole in the records */
2534                union xfs_btree_rec     *lrp;
2535                union xfs_btree_rec     *rrp;
2536
2537                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2538                rrp = xfs_btree_rec_addr(cur, 1, right);
2539
2540                xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2541
2542                /* Now put the new data in, and log it. */
2543                xfs_btree_copy_recs(cur, rrp, lrp, 1);
2544                xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2545        }
2546
2547        /*
2548         * Decrement and log left's numrecs, bump and log right's numrecs.
2549         */
2550        xfs_btree_set_numrecs(left, --lrecs);
2551        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2552
2553        xfs_btree_set_numrecs(right, ++rrecs);
2554        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2555
2556        /*
2557         * Using a temporary cursor, update the parent key values of the
2558         * block on the right.
2559         */
2560        error = xfs_btree_dup_cursor(cur, &tcur);
2561        if (error)
2562                goto error0;
2563        i = xfs_btree_lastrec(tcur, level);
2564        if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2565                error = -EFSCORRUPTED;
2566                goto error0;
2567        }
2568
2569        error = xfs_btree_increment(tcur, level, &i);
2570        if (error)
2571                goto error1;
2572
2573        /* Update the parent high keys of the left block, if needed. */
2574        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2575                error = xfs_btree_update_keys(cur, level);
2576                if (error)
2577                        goto error1;
2578        }
2579
2580        /* Update the parent keys of the right block. */
2581        error = xfs_btree_update_keys(tcur, level);
2582        if (error)
2583                goto error1;
2584
2585        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2586
2587        *stat = 1;
2588        return 0;
2589
2590out0:
2591        *stat = 0;
2592        return 0;
2593
2594error0:
2595        return error;
2596
2597error1:
2598        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2599        return error;
2600}
2601
2602/*
2603 * Split cur/level block in half.
2604 * Return new block number and the key to its first
2605 * record (to be inserted into parent).
2606 */
2607STATIC int                                      /* error */
2608__xfs_btree_split(
2609        struct xfs_btree_cur    *cur,
2610        int                     level,
2611        union xfs_btree_ptr     *ptrp,
2612        union xfs_btree_key     *key,
2613        struct xfs_btree_cur    **curp,
2614        int                     *stat)          /* success/failure */
2615{
2616        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2617        struct xfs_buf          *lbp;           /* left buffer pointer */
2618        struct xfs_btree_block  *left;          /* left btree block */
2619        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2620        struct xfs_buf          *rbp;           /* right buffer pointer */
2621        struct xfs_btree_block  *right;         /* right btree block */
2622        union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2623        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2624        struct xfs_btree_block  *rrblock;       /* right-right btree block */
2625        int                     lrecs;
2626        int                     rrecs;
2627        int                     src_index;
2628        int                     error;          /* error return value */
2629        int                     i;
2630
2631        XFS_BTREE_STATS_INC(cur, split);
2632
2633        /* Set up left block (current one). */
2634        left = xfs_btree_get_block(cur, level, &lbp);
2635
2636#ifdef DEBUG
2637        error = xfs_btree_check_block(cur, left, level, lbp);
2638        if (error)
2639                goto error0;
2640#endif
2641
2642        xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2643
2644        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2645        error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2646        if (error)
2647                goto error0;
2648        if (*stat == 0)
2649                goto out0;
2650        XFS_BTREE_STATS_INC(cur, alloc);
2651
2652        /* Set up the new block as "right". */
2653        error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
2654        if (error)
2655                goto error0;
2656
2657        /* Fill in the btree header for the new right block. */
2658        xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2659
2660        /*
2661         * Split the entries between the old and the new block evenly.
2662         * Make sure that if there's an odd number of entries now, that
2663         * each new block will have the same number of entries.
2664         */
2665        lrecs = xfs_btree_get_numrecs(left);
2666        rrecs = lrecs / 2;
2667        if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2668                rrecs++;
2669        src_index = (lrecs - rrecs + 1);
2670
2671        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2672
2673        /* Adjust numrecs for the later get_*_keys() calls. */
2674        lrecs -= rrecs;
2675        xfs_btree_set_numrecs(left, lrecs);
2676        xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2677
2678        /*
2679         * Copy btree block entries from the left block over to the
2680         * new block, the right. Update the right block and log the
2681         * changes.
2682         */
2683        if (level > 0) {
2684                /* It's a non-leaf.  Move keys and pointers. */
2685                union xfs_btree_key     *lkp;   /* left btree key */
2686                union xfs_btree_ptr     *lpp;   /* left address pointer */
2687                union xfs_btree_key     *rkp;   /* right btree key */
2688                union xfs_btree_ptr     *rpp;   /* right address pointer */
2689
2690                lkp = xfs_btree_key_addr(cur, src_index, left);
2691                lpp = xfs_btree_ptr_addr(cur, src_index, left);
2692                rkp = xfs_btree_key_addr(cur, 1, right);
2693                rpp = xfs_btree_ptr_addr(cur, 1, right);
2694
2695                for (i = src_index; i < rrecs; i++) {
2696                        error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
2697                        if (error)
2698                                goto error0;
2699                }
2700
2701                /* Copy the keys & pointers to the new block. */
2702                xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2703                xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2704
2705                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2706                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2707
2708                /* Stash the keys of the new block for later insertion. */
2709                xfs_btree_get_node_keys(cur, right, key);
2710        } else {
2711                /* It's a leaf.  Move records.  */
2712                union xfs_btree_rec     *lrp;   /* left record pointer */
2713                union xfs_btree_rec     *rrp;   /* right record pointer */
2714
2715                lrp = xfs_btree_rec_addr(cur, src_index, left);
2716                rrp = xfs_btree_rec_addr(cur, 1, right);
2717
2718                /* Copy records to the new block. */
2719                xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2720                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2721
2722                /* Stash the keys of the new block for later insertion. */
2723                xfs_btree_get_leaf_keys(cur, right, key);
2724        }
2725
2726        /*
2727         * Find the left block number by looking in the buffer.
2728         * Adjust sibling pointers.
2729         */
2730        xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2731        xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2732        xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2733        xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2734
2735        xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2736        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2737
2738        /*
2739         * If there's a block to the new block's right, make that block
2740         * point back to right instead of to left.
2741         */
2742        if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2743                error = xfs_btree_read_buf_block(cur, &rrptr,
2744                                                        0, &rrblock, &rrbp);
2745                if (error)
2746                        goto error0;
2747                xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2748                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2749        }
2750
2751        /* Update the parent high keys of the left block, if needed. */
2752        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2753                error = xfs_btree_update_keys(cur, level);
2754                if (error)
2755                        goto error0;
2756        }
2757
2758        /*
2759         * If the cursor is really in the right block, move it there.
2760         * If it's just pointing past the last entry in left, then we'll
2761         * insert there, so don't change anything in that case.
2762         */
2763        if (cur->bc_ptrs[level] > lrecs + 1) {
2764                xfs_btree_setbuf(cur, level, rbp);
2765                cur->bc_ptrs[level] -= lrecs;
2766        }
2767        /*
2768         * If there are more levels, we'll need another cursor which refers
2769         * the right block, no matter where this cursor was.
2770         */
2771        if (level + 1 < cur->bc_nlevels) {
2772                error = xfs_btree_dup_cursor(cur, curp);
2773                if (error)
2774                        goto error0;
2775                (*curp)->bc_ptrs[level + 1]++;
2776        }
2777        *ptrp = rptr;
2778        *stat = 1;
2779        return 0;
2780out0:
2781        *stat = 0;
2782        return 0;
2783
2784error0:
2785        return error;
2786}
2787
2788struct xfs_btree_split_args {
2789        struct xfs_btree_cur    *cur;
2790        int                     level;
2791        union xfs_btree_ptr     *ptrp;
2792        union xfs_btree_key     *key;
2793        struct xfs_btree_cur    **curp;
2794        int                     *stat;          /* success/failure */
2795        int                     result;
2796        bool                    kswapd; /* allocation in kswapd context */
2797        struct completion       *done;
2798        struct work_struct      work;
2799};
2800
2801/*
2802 * Stack switching interfaces for allocation
2803 */
2804static void
2805xfs_btree_split_worker(
2806        struct work_struct      *work)
2807{
2808        struct xfs_btree_split_args     *args = container_of(work,
2809                                                struct xfs_btree_split_args, work);
2810        unsigned long           pflags;
2811        unsigned long           new_pflags = 0;
2812
2813        /*
2814         * we are in a transaction context here, but may also be doing work
2815         * in kswapd context, and hence we may need to inherit that state
2816         * temporarily to ensure that we don't block waiting for memory reclaim
2817         * in any way.
2818         */
2819        if (args->kswapd)
2820                new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2821
2822        current_set_flags_nested(&pflags, new_pflags);
2823        xfs_trans_set_context(args->cur->bc_tp);
2824
2825        args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2826                                         args->key, args->curp, args->stat);
2827
2828        xfs_trans_clear_context(args->cur->bc_tp);
2829        current_restore_flags_nested(&pflags, new_pflags);
2830
2831        /*
2832         * Do not access args after complete() has run here. We don't own args
2833         * and the owner may run and free args before we return here.
2834         */
2835        complete(args->done);
2836
2837}
2838
2839/*
2840 * BMBT split requests often come in with little stack to work on. Push
2841 * them off to a worker thread so there is lots of stack to use. For the other
2842 * btree types, just call directly to avoid the context switch overhead here.
2843 */
2844STATIC int                                      /* error */
2845xfs_btree_split(
2846        struct xfs_btree_cur    *cur,
2847        int                     level,
2848        union xfs_btree_ptr     *ptrp,
2849        union xfs_btree_key     *key,
2850        struct xfs_btree_cur    **curp,
2851        int                     *stat)          /* success/failure */
2852{
2853        struct xfs_btree_split_args     args;
2854        DECLARE_COMPLETION_ONSTACK(done);
2855
2856        if (cur->bc_btnum != XFS_BTNUM_BMAP)
2857                return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2858
2859        args.cur = cur;
2860        args.level = level;
2861        args.ptrp = ptrp;
2862        args.key = key;
2863        args.curp = curp;
2864        args.stat = stat;
2865        args.done = &done;
2866        args.kswapd = current_is_kswapd();
2867        INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2868        queue_work(xfs_alloc_wq, &args.work);
2869        wait_for_completion(&done);
2870        destroy_work_on_stack(&args.work);
2871        return args.result;
2872}
2873
2874
2875/*
2876 * Copy the old inode root contents into a real block and make the
2877 * broot point to it.
2878 */
2879int                                             /* error */
2880xfs_btree_new_iroot(
2881        struct xfs_btree_cur    *cur,           /* btree cursor */
2882        int                     *logflags,      /* logging flags for inode */
2883        int                     *stat)          /* return status - 0 fail */
2884{
2885        struct xfs_buf          *cbp;           /* buffer for cblock */
2886        struct xfs_btree_block  *block;         /* btree block */
2887        struct xfs_btree_block  *cblock;        /* child btree block */
2888        union xfs_btree_key     *ckp;           /* child key pointer */
2889        union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2890        union xfs_btree_key     *kp;            /* pointer to btree key */
2891        union xfs_btree_ptr     *pp;            /* pointer to block addr */
2892        union xfs_btree_ptr     nptr;           /* new block addr */
2893        int                     level;          /* btree level */
2894        int                     error;          /* error return code */
2895        int                     i;              /* loop counter */
2896
2897        XFS_BTREE_STATS_INC(cur, newroot);
2898
2899        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2900
2901        level = cur->bc_nlevels - 1;
2902
2903        block = xfs_btree_get_iroot(cur);
2904        pp = xfs_btree_ptr_addr(cur, 1, block);
2905
2906        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2907        error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2908        if (error)
2909                goto error0;
2910        if (*stat == 0)
2911                return 0;
2912
2913        XFS_BTREE_STATS_INC(cur, alloc);
2914
2915        /* Copy the root into a real block. */
2916        error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
2917        if (error)
2918                goto error0;
2919
2920        /*
2921         * we can't just memcpy() the root in for CRC enabled btree blocks.
2922         * In that case have to also ensure the blkno remains correct
2923         */
2924        memcpy(cblock, block, xfs_btree_block_len(cur));
2925        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2926                __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
2927                if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2928                        cblock->bb_u.l.bb_blkno = bno;
2929                else
2930                        cblock->bb_u.s.bb_blkno = bno;
2931        }
2932
2933        be16_add_cpu(&block->bb_level, 1);
2934        xfs_btree_set_numrecs(block, 1);
2935        cur->bc_nlevels++;
2936        cur->bc_ptrs[level + 1] = 1;
2937
2938        kp = xfs_btree_key_addr(cur, 1, block);
2939        ckp = xfs_btree_key_addr(cur, 1, cblock);
2940        xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2941
2942        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2943        for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2944                error = xfs_btree_debug_check_ptr(cur, pp, i, level);
2945                if (error)
2946                        goto error0;
2947        }
2948
2949        xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2950
2951        error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
2952        if (error)
2953                goto error0;
2954
2955        xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2956
2957        xfs_iroot_realloc(cur->bc_ino.ip,
2958                          1 - xfs_btree_get_numrecs(cblock),
2959                          cur->bc_ino.whichfork);
2960
2961        xfs_btree_setbuf(cur, level, cbp);
2962
2963        /*
2964         * Do all this logging at the end so that
2965         * the root is at the right level.
2966         */
2967        xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2968        xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2969        xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2970
2971        *logflags |=
2972                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
2973        *stat = 1;
2974        return 0;
2975error0:
2976        return error;
2977}
2978
2979/*
2980 * Allocate a new root block, fill it in.
2981 */
2982STATIC int                              /* error */
2983xfs_btree_new_root(
2984        struct xfs_btree_cur    *cur,   /* btree cursor */
2985        int                     *stat)  /* success/failure */
2986{
2987        struct xfs_btree_block  *block; /* one half of the old root block */
2988        struct xfs_buf          *bp;    /* buffer containing block */
2989        int                     error;  /* error return value */
2990        struct xfs_buf          *lbp;   /* left buffer pointer */
2991        struct xfs_btree_block  *left;  /* left btree block */
2992        struct xfs_buf          *nbp;   /* new (root) buffer */
2993        struct xfs_btree_block  *new;   /* new (root) btree block */
2994        int                     nptr;   /* new value for key index, 1 or 2 */
2995        struct xfs_buf          *rbp;   /* right buffer pointer */
2996        struct xfs_btree_block  *right; /* right btree block */
2997        union xfs_btree_ptr     rptr;
2998        union xfs_btree_ptr     lptr;
2999
3000        XFS_BTREE_STATS_INC(cur, newroot);
3001
3002        /* initialise our start point from the cursor */
3003        cur->bc_ops->init_ptr_from_cur(cur, &rptr);
3004
3005        /* Allocate the new block. If we can't do it, we're toast. Give up. */
3006        error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
3007        if (error)
3008                goto error0;
3009        if (*stat == 0)
3010                goto out0;
3011        XFS_BTREE_STATS_INC(cur, alloc);
3012
3013        /* Set up the new block. */
3014        error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
3015        if (error)
3016                goto error0;
3017
3018        /* Set the root in the holding structure  increasing the level by 1. */
3019        cur->bc_ops->set_root(cur, &lptr, 1);
3020
3021        /*
3022         * At the previous root level there are now two blocks: the old root,
3023         * and the new block generated when it was split.  We don't know which
3024         * one the cursor is pointing at, so we set up variables "left" and
3025         * "right" for each case.
3026         */
3027        block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
3028
3029#ifdef DEBUG
3030        error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
3031        if (error)
3032                goto error0;
3033#endif
3034
3035        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3036        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3037                /* Our block is left, pick up the right block. */
3038                lbp = bp;
3039                xfs_btree_buf_to_ptr(cur, lbp, &lptr);
3040                left = block;
3041                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3042                if (error)
3043                        goto error0;
3044                bp = rbp;
3045                nptr = 1;
3046        } else {
3047                /* Our block is right, pick up the left block. */
3048                rbp = bp;
3049                xfs_btree_buf_to_ptr(cur, rbp, &rptr);
3050                right = block;
3051                xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
3052                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3053                if (error)
3054                        goto error0;
3055                bp = lbp;
3056                nptr = 2;
3057        }
3058
3059        /* Fill in the new block's btree header and log it. */
3060        xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
3061        xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
3062        ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
3063                        !xfs_btree_ptr_is_null(cur, &rptr));
3064
3065        /* Fill in the key data in the new root. */
3066        if (xfs_btree_get_level(left) > 0) {
3067                /*
3068                 * Get the keys for the left block's keys and put them directly
3069                 * in the parent block.  Do the same for the right block.
3070                 */
3071                xfs_btree_get_node_keys(cur, left,
3072                                xfs_btree_key_addr(cur, 1, new));
3073                xfs_btree_get_node_keys(cur, right,
3074                                xfs_btree_key_addr(cur, 2, new));
3075        } else {
3076                /*
3077                 * Get the keys for the left block's records and put them
3078                 * directly in the parent block.  Do the same for the right
3079                 * block.
3080                 */
3081                xfs_btree_get_leaf_keys(cur, left,
3082                        xfs_btree_key_addr(cur, 1, new));
3083                xfs_btree_get_leaf_keys(cur, right,
3084                        xfs_btree_key_addr(cur, 2, new));
3085        }
3086        xfs_btree_log_keys(cur, nbp, 1, 2);
3087
3088        /* Fill in the pointer data in the new root. */
3089        xfs_btree_copy_ptrs(cur,
3090                xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
3091        xfs_btree_copy_ptrs(cur,
3092                xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
3093        xfs_btree_log_ptrs(cur, nbp, 1, 2);
3094
3095        /* Fix up the cursor. */
3096        xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
3097        cur->bc_ptrs[cur->bc_nlevels] = nptr;
3098        cur->bc_nlevels++;
3099        *stat = 1;
3100        return 0;
3101error0:
3102        return error;
3103out0:
3104        *stat = 0;
3105        return 0;
3106}
3107
3108STATIC int
3109xfs_btree_make_block_unfull(
3110        struct xfs_btree_cur    *cur,   /* btree cursor */
3111        int                     level,  /* btree level */
3112        int                     numrecs,/* # of recs in block */
3113        int                     *oindex,/* old tree index */
3114        int                     *index, /* new tree index */
3115        union xfs_btree_ptr     *nptr,  /* new btree ptr */
3116        struct xfs_btree_cur    **ncur, /* new btree cursor */
3117        union xfs_btree_key     *key,   /* key of new block */
3118        int                     *stat)
3119{
3120        int                     error = 0;
3121
3122        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3123            level == cur->bc_nlevels - 1) {
3124                struct xfs_inode *ip = cur->bc_ino.ip;
3125
3126                if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3127                        /* A root block that can be made bigger. */
3128                        xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
3129                        *stat = 1;
3130                } else {
3131                        /* A root block that needs replacing */
3132                        int     logflags = 0;
3133
3134                        error = xfs_btree_new_iroot(cur, &logflags, stat);
3135                        if (error || *stat == 0)
3136                                return error;
3137
3138                        xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3139                }
3140
3141                return 0;
3142        }
3143
3144        /* First, try shifting an entry to the right neighbor. */
3145        error = xfs_btree_rshift(cur, level, stat);
3146        if (error || *stat)
3147                return error;
3148
3149        /* Next, try shifting an entry to the left neighbor. */
3150        error = xfs_btree_lshift(cur, level, stat);
3151        if (error)
3152                return error;
3153
3154        if (*stat) {
3155                *oindex = *index = cur->bc_ptrs[level];
3156                return 0;
3157        }
3158
3159        /*
3160         * Next, try splitting the current block in half.
3161         *
3162         * If this works we have to re-set our variables because we
3163         * could be in a different block now.
3164         */
3165        error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3166        if (error || *stat == 0)
3167                return error;
3168
3169
3170        *index = cur->bc_ptrs[level];
3171        return 0;
3172}
3173
3174/*
3175 * Insert one record/level.  Return information to the caller
3176 * allowing the next level up to proceed if necessary.
3177 */
3178STATIC int
3179xfs_btree_insrec(
3180        struct xfs_btree_cur    *cur,   /* btree cursor */
3181        int                     level,  /* level to insert record at */
3182        union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
3183        union xfs_btree_rec     *rec,   /* record to insert */
3184        union xfs_btree_key     *key,   /* i/o: block key for ptrp */
3185        struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
3186        int                     *stat)  /* success/failure */
3187{
3188        struct xfs_btree_block  *block; /* btree block */
3189        struct xfs_buf          *bp;    /* buffer for block */
3190        union xfs_btree_ptr     nptr;   /* new block ptr */
3191        struct xfs_btree_cur    *ncur;  /* new btree cursor */
3192        union xfs_btree_key     nkey;   /* new block key */
3193        union xfs_btree_key     *lkey;
3194        int                     optr;   /* old key/record index */
3195        int                     ptr;    /* key/record index */
3196        int                     numrecs;/* number of records */
3197        int                     error;  /* error return value */
3198        int                     i;
3199        xfs_daddr_t             old_bn;
3200
3201        ncur = NULL;
3202        lkey = &nkey;
3203
3204        /*
3205         * If we have an external root pointer, and we've made it to the
3206         * root level, allocate a new root block and we're done.
3207         */
3208        if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3209            (level >= cur->bc_nlevels)) {
3210                error = xfs_btree_new_root(cur, stat);
3211                xfs_btree_set_ptr_null(cur, ptrp);
3212
3213                return error;
3214        }
3215
3216        /* If we're off the left edge, return failure. */
3217        ptr = cur->bc_ptrs[level];
3218        if (ptr == 0) {
3219                *stat = 0;
3220                return 0;
3221        }
3222
3223        optr = ptr;
3224
3225        XFS_BTREE_STATS_INC(cur, insrec);
3226
3227        /* Get pointers to the btree buffer and block. */
3228        block = xfs_btree_get_block(cur, level, &bp);
3229        old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL;
3230        numrecs = xfs_btree_get_numrecs(block);
3231
3232#ifdef DEBUG
3233        error = xfs_btree_check_block(cur, block, level, bp);
3234        if (error)
3235                goto error0;
3236
3237        /* Check that the new entry is being inserted in the right place. */
3238        if (ptr <= numrecs) {
3239                if (level == 0) {
3240                        ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3241                                xfs_btree_rec_addr(cur, ptr, block)));
3242                } else {
3243                        ASSERT(cur->bc_ops->keys_inorder(cur, key,
3244                                xfs_btree_key_addr(cur, ptr, block)));
3245                }
3246        }
3247#endif
3248
3249        /*
3250         * If the block is full, we can't insert the new entry until we
3251         * make the block un-full.
3252         */
3253        xfs_btree_set_ptr_null(cur, &nptr);
3254        if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3255                error = xfs_btree_make_block_unfull(cur, level, numrecs,
3256                                        &optr, &ptr, &nptr, &ncur, lkey, stat);
3257                if (error || *stat == 0)
3258                        goto error0;
3259        }
3260
3261        /*
3262         * The current block may have changed if the block was
3263         * previously full and we have just made space in it.
3264         */
3265        block = xfs_btree_get_block(cur, level, &bp);
3266        numrecs = xfs_btree_get_numrecs(block);
3267
3268#ifdef DEBUG
3269        error = xfs_btree_check_block(cur, block, level, bp);
3270        if (error)
3271                return error;
3272#endif
3273
3274        /*
3275         * At this point we know there's room for our new entry in the block
3276         * we're pointing at.
3277         */
3278        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3279
3280        if (level > 0) {
3281                /* It's a nonleaf. make a hole in the keys and ptrs */
3282                union xfs_btree_key     *kp;
3283                union xfs_btree_ptr     *pp;
3284
3285                kp = xfs_btree_key_addr(cur, ptr, block);
3286                pp = xfs_btree_ptr_addr(cur, ptr, block);
3287
3288                for (i = numrecs - ptr; i >= 0; i--) {
3289                        error = xfs_btree_debug_check_ptr(cur, pp, i, level);
3290                        if (error)
3291                                return error;
3292                }
3293
3294                xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3295                xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3296
3297                error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
3298                if (error)
3299                        goto error0;
3300
3301                /* Now put the new data in, bump numrecs and log it. */
3302                xfs_btree_copy_keys(cur, kp, key, 1);
3303                xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3304                numrecs++;
3305                xfs_btree_set_numrecs(block, numrecs);
3306                xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3307                xfs_btree_log_keys(cur, bp, ptr, numrecs);
3308#ifdef DEBUG
3309                if (ptr < numrecs) {
3310                        ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3311                                xfs_btree_key_addr(cur, ptr + 1, block)));
3312                }
3313#endif
3314        } else {
3315                /* It's a leaf. make a hole in the records */
3316                union xfs_btree_rec             *rp;
3317
3318                rp = xfs_btree_rec_addr(cur, ptr, block);
3319
3320                xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3321
3322                /* Now put the new data in, bump numrecs and log it. */
3323                xfs_btree_copy_recs(cur, rp, rec, 1);
3324                xfs_btree_set_numrecs(block, ++numrecs);
3325                xfs_btree_log_recs(cur, bp, ptr, numrecs);
3326#ifdef DEBUG
3327                if (ptr < numrecs) {
3328                        ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3329                                xfs_btree_rec_addr(cur, ptr + 1, block)));
3330                }
3331#endif
3332        }
3333
3334        /* Log the new number of records in the btree header. */
3335        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3336
3337        /*
3338         * If we just inserted into a new tree block, we have to
3339         * recalculate nkey here because nkey is out of date.
3340         *
3341         * Otherwise we're just updating an existing block (having shoved
3342         * some records into the new tree block), so use the regular key
3343         * update mechanism.
3344         */
3345        if (bp && xfs_buf_daddr(bp) != old_bn) {
3346                xfs_btree_get_keys(cur, block, lkey);
3347        } else if (xfs_btree_needs_key_update(cur, optr)) {
3348                error = xfs_btree_update_keys(cur, level);
3349                if (error)
3350                        goto error0;
3351        }
3352
3353        /*
3354         * If we are tracking the last record in the tree and
3355         * we are at the far right edge of the tree, update it.
3356         */
3357        if (xfs_btree_is_lastrec(cur, block, level)) {
3358                cur->bc_ops->update_lastrec(cur, block, rec,
3359                                            ptr, LASTREC_INSREC);
3360        }
3361
3362        /*
3363         * Return the new block number, if any.
3364         * If there is one, give back a record value and a cursor too.
3365         */
3366        *ptrp = nptr;
3367        if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3368                xfs_btree_copy_keys(cur, key, lkey, 1);
3369                *curp = ncur;
3370        }
3371
3372        *stat = 1;
3373        return 0;
3374
3375error0:
3376        return error;
3377}
3378
3379/*
3380 * Insert the record at the point referenced by cur.
3381 *
3382 * A multi-level split of the tree on insert will invalidate the original
3383 * cursor.  All callers of this function should assume that the cursor is
3384 * no longer valid and revalidate it.
3385 */
3386int
3387xfs_btree_insert(
3388        struct xfs_btree_cur    *cur,
3389        int                     *stat)
3390{
3391        int                     error;  /* error return value */
3392        int                     i;      /* result value, 0 for failure */
3393        int                     level;  /* current level number in btree */
3394        union xfs_btree_ptr     nptr;   /* new block number (split result) */
3395        struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3396        struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3397        union xfs_btree_key     bkey;   /* key of block to insert */
3398        union xfs_btree_key     *key;
3399        union xfs_btree_rec     rec;    /* record to insert */
3400
3401        level = 0;
3402        ncur = NULL;
3403        pcur = cur;
3404        key = &bkey;
3405
3406        xfs_btree_set_ptr_null(cur, &nptr);
3407
3408        /* Make a key out of the record data to be inserted, and save it. */
3409        cur->bc_ops->init_rec_from_cur(cur, &rec);
3410        cur->bc_ops->init_key_from_rec(key, &rec);
3411
3412        /*
3413         * Loop going up the tree, starting at the leaf level.
3414         * Stop when we don't get a split block, that must mean that
3415         * the insert is finished with this level.
3416         */
3417        do {
3418                /*
3419                 * Insert nrec/nptr into this level of the tree.
3420                 * Note if we fail, nptr will be null.
3421                 */
3422                error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3423                                &ncur, &i);
3424                if (error) {
3425                        if (pcur != cur)
3426                                xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3427                        goto error0;
3428                }
3429
3430                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3431                        error = -EFSCORRUPTED;
3432                        goto error0;
3433                }
3434                level++;
3435
3436                /*
3437                 * See if the cursor we just used is trash.
3438                 * Can't trash the caller's cursor, but otherwise we should
3439                 * if ncur is a new cursor or we're about to be done.
3440                 */
3441                if (pcur != cur &&
3442                    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3443                        /* Save the state from the cursor before we trash it */
3444                        if (cur->bc_ops->update_cursor)
3445                                cur->bc_ops->update_cursor(pcur, cur);
3446                        cur->bc_nlevels = pcur->bc_nlevels;
3447                        xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3448                }
3449                /* If we got a new cursor, switch to it. */
3450                if (ncur) {
3451                        pcur = ncur;
3452                        ncur = NULL;
3453                }
3454        } while (!xfs_btree_ptr_is_null(cur, &nptr));
3455
3456        *stat = i;
3457        return 0;
3458error0:
3459        return error;
3460}
3461
3462/*
3463 * Try to merge a non-leaf block back into the inode root.
3464 *
3465 * Note: the killroot names comes from the fact that we're effectively
3466 * killing the old root block.  But because we can't just delete the
3467 * inode we have to copy the single block it was pointing to into the
3468 * inode.
3469 */
3470STATIC int
3471xfs_btree_kill_iroot(
3472        struct xfs_btree_cur    *cur)
3473{
3474        int                     whichfork = cur->bc_ino.whichfork;
3475        struct xfs_inode        *ip = cur->bc_ino.ip;
3476        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3477        struct xfs_btree_block  *block;
3478        struct xfs_btree_block  *cblock;
3479        union xfs_btree_key     *kp;
3480        union xfs_btree_key     *ckp;
3481        union xfs_btree_ptr     *pp;
3482        union xfs_btree_ptr     *cpp;
3483        struct xfs_buf          *cbp;
3484        int                     level;
3485        int                     index;
3486        int                     numrecs;
3487        int                     error;
3488#ifdef DEBUG
3489        union xfs_btree_ptr     ptr;
3490#endif
3491        int                     i;
3492
3493        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3494        ASSERT(cur->bc_nlevels > 1);
3495
3496        /*
3497         * Don't deal with the root block needs to be a leaf case.
3498         * We're just going to turn the thing back into extents anyway.
3499         */
3500        level = cur->bc_nlevels - 1;
3501        if (level == 1)
3502                goto out0;
3503
3504        /*
3505         * Give up if the root has multiple children.
3506         */
3507        block = xfs_btree_get_iroot(cur);
3508        if (xfs_btree_get_numrecs(block) != 1)
3509                goto out0;
3510
3511        cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3512        numrecs = xfs_btree_get_numrecs(cblock);
3513
3514        /*
3515         * Only do this if the next level will fit.
3516         * Then the data must be copied up to the inode,
3517         * instead of freeing the root you free the next level.
3518         */
3519        if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3520                goto out0;
3521
3522        XFS_BTREE_STATS_INC(cur, killroot);
3523
3524#ifdef DEBUG
3525        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3526        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3527        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3528        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3529#endif
3530
3531        index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3532        if (index) {
3533                xfs_iroot_realloc(cur->bc_ino.ip, index,
3534                                  cur->bc_ino.whichfork);
3535                block = ifp->if_broot;
3536        }
3537
3538        be16_add_cpu(&block->bb_numrecs, index);
3539        ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3540
3541        kp = xfs_btree_key_addr(cur, 1, block);
3542        ckp = xfs_btree_key_addr(cur, 1, cblock);
3543        xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3544
3545        pp = xfs_btree_ptr_addr(cur, 1, block);
3546        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3547
3548        for (i = 0; i < numrecs; i++) {
3549                error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
3550                if (error)
3551                        return error;
3552        }
3553
3554        xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3555
3556        error = xfs_btree_free_block(cur, cbp);
3557        if (error)
3558                return error;
3559
3560        cur->bc_bufs[level - 1] = NULL;
3561        be16_add_cpu(&block->bb_level, -1);
3562        xfs_trans_log_inode(cur->bc_tp, ip,
3563                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
3564        cur->bc_nlevels--;
3565out0:
3566        return 0;
3567}
3568
3569/*
3570 * Kill the current root node, and replace it with it's only child node.
3571 */
3572STATIC int
3573xfs_btree_kill_root(
3574        struct xfs_btree_cur    *cur,
3575        struct xfs_buf          *bp,
3576        int                     level,
3577        union xfs_btree_ptr     *newroot)
3578{
3579        int                     error;
3580
3581        XFS_BTREE_STATS_INC(cur, killroot);
3582
3583        /*
3584         * Update the root pointer, decreasing the level by 1 and then
3585         * free the old root.
3586         */
3587        cur->bc_ops->set_root(cur, newroot, -1);
3588
3589        error = xfs_btree_free_block(cur, bp);
3590        if (error)
3591                return error;
3592
3593        cur->bc_bufs[level] = NULL;
3594        cur->bc_ra[level] = 0;
3595        cur->bc_nlevels--;
3596
3597        return 0;
3598}
3599
3600STATIC int
3601xfs_btree_dec_cursor(
3602        struct xfs_btree_cur    *cur,
3603        int                     level,
3604        int                     *stat)
3605{
3606        int                     error;
3607        int                     i;
3608
3609        if (level > 0) {
3610                error = xfs_btree_decrement(cur, level, &i);
3611                if (error)
3612                        return error;
3613        }
3614
3615        *stat = 1;
3616        return 0;
3617}
3618
3619/*
3620 * Single level of the btree record deletion routine.
3621 * Delete record pointed to by cur/level.
3622 * Remove the record from its block then rebalance the tree.
3623 * Return 0 for error, 1 for done, 2 to go on to the next level.
3624 */
3625STATIC int                                      /* error */
3626xfs_btree_delrec(
3627        struct xfs_btree_cur    *cur,           /* btree cursor */
3628        int                     level,          /* level removing record from */
3629        int                     *stat)          /* fail/done/go-on */
3630{
3631        struct xfs_btree_block  *block;         /* btree block */
3632        union xfs_btree_ptr     cptr;           /* current block ptr */
3633        struct xfs_buf          *bp;            /* buffer for block */
3634        int                     error;          /* error return value */
3635        int                     i;              /* loop counter */
3636        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3637        struct xfs_buf          *lbp;           /* left buffer pointer */
3638        struct xfs_btree_block  *left;          /* left btree block */
3639        int                     lrecs = 0;      /* left record count */
3640        int                     ptr;            /* key/record index */
3641        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3642        struct xfs_buf          *rbp;           /* right buffer pointer */
3643        struct xfs_btree_block  *right;         /* right btree block */
3644        struct xfs_btree_block  *rrblock;       /* right-right btree block */
3645        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3646        int                     rrecs = 0;      /* right record count */
3647        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3648        int                     numrecs;        /* temporary numrec count */
3649
3650        tcur = NULL;
3651
3652        /* Get the index of the entry being deleted, check for nothing there. */
3653        ptr = cur->bc_ptrs[level];
3654        if (ptr == 0) {
3655                *stat = 0;
3656                return 0;
3657        }
3658
3659        /* Get the buffer & block containing the record or key/ptr. */
3660        block = xfs_btree_get_block(cur, level, &bp);
3661        numrecs = xfs_btree_get_numrecs(block);
3662
3663#ifdef DEBUG
3664        error = xfs_btree_check_block(cur, block, level, bp);
3665        if (error)
3666                goto error0;
3667#endif
3668
3669        /* Fail if we're off the end of the block. */
3670        if (ptr > numrecs) {
3671                *stat = 0;
3672                return 0;
3673        }
3674
3675        XFS_BTREE_STATS_INC(cur, delrec);
3676        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3677
3678        /* Excise the entries being deleted. */
3679        if (level > 0) {
3680                /* It's a nonleaf. operate on keys and ptrs */
3681                union xfs_btree_key     *lkp;
3682                union xfs_btree_ptr     *lpp;
3683
3684                lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3685                lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3686
3687                for (i = 0; i < numrecs - ptr; i++) {
3688                        error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
3689                        if (error)
3690                                goto error0;
3691                }
3692
3693                if (ptr < numrecs) {
3694                        xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3695                        xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3696                        xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3697                        xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3698                }
3699        } else {
3700                /* It's a leaf. operate on records */
3701                if (ptr < numrecs) {
3702                        xfs_btree_shift_recs(cur,
3703                                xfs_btree_rec_addr(cur, ptr + 1, block),
3704                                -1, numrecs - ptr);
3705                        xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3706                }
3707        }
3708
3709        /*
3710         * Decrement and log the number of entries in the block.
3711         */
3712        xfs_btree_set_numrecs(block, --numrecs);
3713        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3714
3715        /*
3716         * If we are tracking the last record in the tree and
3717         * we are at the far right edge of the tree, update it.
3718         */
3719        if (xfs_btree_is_lastrec(cur, block, level)) {
3720                cur->bc_ops->update_lastrec(cur, block, NULL,
3721                                            ptr, LASTREC_DELREC);
3722        }
3723
3724        /*
3725         * We're at the root level.  First, shrink the root block in-memory.
3726         * Try to get rid of the next level down.  If we can't then there's
3727         * nothing left to do.
3728         */
3729        if (level == cur->bc_nlevels - 1) {
3730                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3731                        xfs_iroot_realloc(cur->bc_ino.ip, -1,
3732                                          cur->bc_ino.whichfork);
3733
3734                        error = xfs_btree_kill_iroot(cur);
3735                        if (error)
3736                                goto error0;
3737
3738                        error = xfs_btree_dec_cursor(cur, level, stat);
3739                        if (error)
3740                                goto error0;
3741                        *stat = 1;
3742                        return 0;
3743                }
3744
3745                /*
3746                 * If this is the root level, and there's only one entry left,
3747                 * and it's NOT the leaf level, then we can get rid of this
3748                 * level.
3749                 */
3750                if (numrecs == 1 && level > 0) {
3751                        union xfs_btree_ptr     *pp;
3752                        /*
3753                         * pp is still set to the first pointer in the block.
3754                         * Make it the new root of the btree.
3755                         */
3756                        pp = xfs_btree_ptr_addr(cur, 1, block);
3757                        error = xfs_btree_kill_root(cur, bp, level, pp);
3758                        if (error)
3759                                goto error0;
3760                } else if (level > 0) {
3761                        error = xfs_btree_dec_cursor(cur, level, stat);
3762                        if (error)
3763                                goto error0;
3764                }
3765                *stat = 1;
3766                return 0;
3767        }
3768
3769        /*
3770         * If we deleted the leftmost entry in the block, update the
3771         * key values above us in the tree.
3772         */
3773        if (xfs_btree_needs_key_update(cur, ptr)) {
3774                error = xfs_btree_update_keys(cur, level);
3775                if (error)
3776                        goto error0;
3777        }
3778
3779        /*
3780         * If the number of records remaining in the block is at least
3781         * the minimum, we're done.
3782         */
3783        if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3784                error = xfs_btree_dec_cursor(cur, level, stat);
3785                if (error)
3786                        goto error0;
3787                return 0;
3788        }
3789
3790        /*
3791         * Otherwise, we have to move some records around to keep the
3792         * tree balanced.  Look at the left and right sibling blocks to
3793         * see if we can re-balance by moving only one record.
3794         */
3795        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3796        xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3797
3798        if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3799                /*
3800                 * One child of root, need to get a chance to copy its contents
3801                 * into the root and delete it. Can't go up to next level,
3802                 * there's nothing to delete there.
3803                 */
3804                if (xfs_btree_ptr_is_null(cur, &rptr) &&
3805                    xfs_btree_ptr_is_null(cur, &lptr) &&
3806                    level == cur->bc_nlevels - 2) {
3807                        error = xfs_btree_kill_iroot(cur);
3808                        if (!error)
3809                                error = xfs_btree_dec_cursor(cur, level, stat);
3810                        if (error)
3811                                goto error0;
3812                        return 0;
3813                }
3814        }
3815
3816        ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3817               !xfs_btree_ptr_is_null(cur, &lptr));
3818
3819        /*
3820         * Duplicate the cursor so our btree manipulations here won't
3821         * disrupt the next level up.
3822         */
3823        error = xfs_btree_dup_cursor(cur, &tcur);
3824        if (error)
3825                goto error0;
3826
3827        /*
3828         * If there's a right sibling, see if it's ok to shift an entry
3829         * out of it.
3830         */
3831        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3832                /*
3833                 * Move the temp cursor to the last entry in the next block.
3834                 * Actually any entry but the first would suffice.
3835                 */
3836                i = xfs_btree_lastrec(tcur, level);
3837                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3838                        error = -EFSCORRUPTED;
3839                        goto error0;
3840                }
3841
3842                error = xfs_btree_increment(tcur, level, &i);
3843                if (error)
3844                        goto error0;
3845                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3846                        error = -EFSCORRUPTED;
3847                        goto error0;
3848                }
3849
3850                i = xfs_btree_lastrec(tcur, level);
3851                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3852                        error = -EFSCORRUPTED;
3853                        goto error0;
3854                }
3855
3856                /* Grab a pointer to the block. */
3857                right = xfs_btree_get_block(tcur, level, &rbp);
3858#ifdef DEBUG
3859                error = xfs_btree_check_block(tcur, right, level, rbp);
3860                if (error)
3861                        goto error0;
3862#endif
3863                /* Grab the current block number, for future use. */
3864                xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3865
3866                /*
3867                 * If right block is full enough so that removing one entry
3868                 * won't make it too empty, and left-shifting an entry out
3869                 * of right to us works, we're done.
3870                 */
3871                if (xfs_btree_get_numrecs(right) - 1 >=
3872                    cur->bc_ops->get_minrecs(tcur, level)) {
3873                        error = xfs_btree_lshift(tcur, level, &i);
3874                        if (error)
3875                                goto error0;
3876                        if (i) {
3877                                ASSERT(xfs_btree_get_numrecs(block) >=
3878                                       cur->bc_ops->get_minrecs(tcur, level));
3879
3880                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3881                                tcur = NULL;
3882
3883                                error = xfs_btree_dec_cursor(cur, level, stat);
3884                                if (error)
3885                                        goto error0;
3886                                return 0;
3887                        }
3888                }
3889
3890                /*
3891                 * Otherwise, grab the number of records in right for
3892                 * future reference, and fix up the temp cursor to point
3893                 * to our block again (last record).
3894                 */
3895                rrecs = xfs_btree_get_numrecs(right);
3896                if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3897                        i = xfs_btree_firstrec(tcur, level);
3898                        if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3899                                error = -EFSCORRUPTED;
3900                                goto error0;
3901                        }
3902
3903                        error = xfs_btree_decrement(tcur, level, &i);
3904                        if (error)
3905                                goto error0;
3906                        if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3907                                error = -EFSCORRUPTED;
3908                                goto error0;
3909                        }
3910                }
3911        }
3912
3913        /*
3914         * If there's a left sibling, see if it's ok to shift an entry
3915         * out of it.
3916         */
3917        if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3918                /*
3919                 * Move the temp cursor to the first entry in the
3920                 * previous block.
3921                 */
3922                i = xfs_btree_firstrec(tcur, level);
3923                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3924                        error = -EFSCORRUPTED;
3925                        goto error0;
3926                }
3927
3928                error = xfs_btree_decrement(tcur, level, &i);
3929                if (error)
3930                        goto error0;
3931                i = xfs_btree_firstrec(tcur, level);
3932                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3933                        error = -EFSCORRUPTED;
3934                        goto error0;
3935                }
3936
3937                /* Grab a pointer to the block. */
3938                left = xfs_btree_get_block(tcur, level, &lbp);
3939#ifdef DEBUG
3940                error = xfs_btree_check_block(cur, left, level, lbp);
3941                if (error)
3942                        goto error0;
3943#endif
3944                /* Grab the current block number, for future use. */
3945                xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3946
3947                /*
3948                 * If left block is full enough so that removing one entry
3949                 * won't make it too empty, and right-shifting an entry out
3950                 * of left to us works, we're done.
3951                 */
3952                if (xfs_btree_get_numrecs(left) - 1 >=
3953                    cur->bc_ops->get_minrecs(tcur, level)) {
3954                        error = xfs_btree_rshift(tcur, level, &i);
3955                        if (error)
3956                                goto error0;
3957                        if (i) {
3958                                ASSERT(xfs_btree_get_numrecs(block) >=
3959                                       cur->bc_ops->get_minrecs(tcur, level));
3960                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3961                                tcur = NULL;
3962                                if (level == 0)
3963                                        cur->bc_ptrs[0]++;
3964
3965                                *stat = 1;
3966                                return 0;
3967                        }
3968                }
3969
3970                /*
3971                 * Otherwise, grab the number of records in right for
3972                 * future reference.
3973                 */
3974                lrecs = xfs_btree_get_numrecs(left);
3975        }
3976
3977        /* Delete the temp cursor, we're done with it. */
3978        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3979        tcur = NULL;
3980
3981        /* If here, we need to do a join to keep the tree balanced. */
3982        ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3983
3984        if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3985            lrecs + xfs_btree_get_numrecs(block) <=
3986                        cur->bc_ops->get_maxrecs(cur, level)) {
3987                /*
3988                 * Set "right" to be the starting block,
3989                 * "left" to be the left neighbor.
3990                 */
3991                rptr = cptr;
3992                right = block;
3993                rbp = bp;
3994                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3995                if (error)
3996                        goto error0;
3997
3998        /*
3999         * If that won't work, see if we can join with the right neighbor block.
4000         */
4001        } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4002                   rrecs + xfs_btree_get_numrecs(block) <=
4003                        cur->bc_ops->get_maxrecs(cur, level)) {
4004                /*
4005                 * Set "left" to be the starting block,
4006                 * "right" to be the right neighbor.
4007                 */
4008                lptr = cptr;
4009                left = block;
4010                lbp = bp;
4011                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4012                if (error)
4013                        goto error0;
4014
4015        /*
4016         * Otherwise, we can't fix the imbalance.
4017         * Just return.  This is probably a logic error, but it's not fatal.
4018         */
4019        } else {
4020                error = xfs_btree_dec_cursor(cur, level, stat);
4021                if (error)
4022                        goto error0;
4023                return 0;
4024        }
4025
4026        rrecs = xfs_btree_get_numrecs(right);
4027        lrecs = xfs_btree_get_numrecs(left);
4028
4029        /*
4030         * We're now going to join "left" and "right" by moving all the stuff
4031         * in "right" to "left" and deleting "right".
4032         */
4033        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4034        if (level > 0) {
4035                /* It's a non-leaf.  Move keys and pointers. */
4036                union xfs_btree_key     *lkp;   /* left btree key */
4037                union xfs_btree_ptr     *lpp;   /* left address pointer */
4038                union xfs_btree_key     *rkp;   /* right btree key */
4039                union xfs_btree_ptr     *rpp;   /* right address pointer */
4040
4041                lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4042                lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4043                rkp = xfs_btree_key_addr(cur, 1, right);
4044                rpp = xfs_btree_ptr_addr(cur, 1, right);
4045
4046                for (i = 1; i < rrecs; i++) {
4047                        error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
4048                        if (error)
4049                                goto error0;
4050                }
4051
4052                xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4053                xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4054
4055                xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4056                xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4057        } else {
4058                /* It's a leaf.  Move records.  */
4059                union xfs_btree_rec     *lrp;   /* left record pointer */
4060                union xfs_btree_rec     *rrp;   /* right record pointer */
4061
4062                lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4063                rrp = xfs_btree_rec_addr(cur, 1, right);
4064
4065                xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4066                xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4067        }
4068
4069        XFS_BTREE_STATS_INC(cur, join);
4070
4071        /*
4072         * Fix up the number of records and right block pointer in the
4073         * surviving block, and log it.
4074         */
4075        xfs_btree_set_numrecs(left, lrecs + rrecs);
4076        xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
4077        xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4078        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4079
4080        /* If there is a right sibling, point it to the remaining block. */
4081        xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4082        if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4083                error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4084                if (error)
4085                        goto error0;
4086                xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4087                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4088        }
4089
4090        /* Free the deleted block. */
4091        error = xfs_btree_free_block(cur, rbp);
4092        if (error)
4093                goto error0;
4094
4095        /*
4096         * If we joined with the left neighbor, set the buffer in the
4097         * cursor to the left block, and fix up the index.
4098         */
4099        if (bp != lbp) {
4100                cur->bc_bufs[level] = lbp;
4101                cur->bc_ptrs[level] += lrecs;
4102                cur->bc_ra[level] = 0;
4103        }
4104        /*
4105         * If we joined with the right neighbor and there's a level above
4106         * us, increment the cursor at that level.
4107         */
4108        else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
4109                   (level + 1 < cur->bc_nlevels)) {
4110                error = xfs_btree_increment(cur, level + 1, &i);
4111                if (error)
4112                        goto error0;
4113        }
4114
4115        /*
4116         * Readjust the ptr at this level if it's not a leaf, since it's
4117         * still pointing at the deletion point, which makes the cursor
4118         * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4119         * We can't use decrement because it would change the next level up.
4120         */
4121        if (level > 0)
4122                cur->bc_ptrs[level]--;
4123
4124        /*
4125         * We combined blocks, so we have to update the parent keys if the
4126         * btree supports overlapped intervals.  However, bc_ptrs[level + 1]
4127         * points to the old block so that the caller knows which record to
4128         * delete.  Therefore, the caller must be savvy enough to call updkeys
4129         * for us if we return stat == 2.  The other exit points from this
4130         * function don't require deletions further up the tree, so they can
4131         * call updkeys directly.
4132         */
4133
4134        /* Return value means the next level up has something to do. */
4135        *stat = 2;
4136        return 0;
4137
4138error0:
4139        if (tcur)
4140                xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4141        return error;
4142}
4143
4144/*
4145 * Delete the record pointed to by cur.
4146 * The cursor refers to the place where the record was (could be inserted)
4147 * when the operation returns.
4148 */
4149int                                     /* error */
4150xfs_btree_delete(
4151        struct xfs_btree_cur    *cur,
4152        int                     *stat)  /* success/failure */
4153{
4154        int                     error;  /* error return value */
4155        int                     level;
4156        int                     i;
4157        bool                    joined = false;
4158
4159        /*
4160         * Go up the tree, starting at leaf level.
4161         *
4162         * If 2 is returned then a join was done; go to the next level.
4163         * Otherwise we are done.
4164         */
4165        for (level = 0, i = 2; i == 2; level++) {
4166                error = xfs_btree_delrec(cur, level, &i);
4167                if (error)
4168                        goto error0;
4169                if (i == 2)
4170                        joined = true;
4171        }
4172
4173        /*
4174         * If we combined blocks as part of deleting the record, delrec won't
4175         * have updated the parent high keys so we have to do that here.
4176         */
4177        if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
4178                error = xfs_btree_updkeys_force(cur, 0);
4179                if (error)
4180                        goto error0;
4181        }
4182
4183        if (i == 0) {
4184                for (level = 1; level < cur->bc_nlevels; level++) {
4185                        if (cur->bc_ptrs[level] == 0) {
4186                                error = xfs_btree_decrement(cur, level, &i);
4187                                if (error)
4188                                        goto error0;
4189                                break;
4190                        }
4191                }
4192        }
4193
4194        *stat = i;
4195        return 0;
4196error0:
4197        return error;
4198}
4199
4200/*
4201 * Get the data from the pointed-to record.
4202 */
4203int                                     /* error */
4204xfs_btree_get_rec(
4205        struct xfs_btree_cur    *cur,   /* btree cursor */
4206        union xfs_btree_rec     **recp, /* output: btree record */
4207        int                     *stat)  /* output: success/failure */
4208{
4209        struct xfs_btree_block  *block; /* btree block */
4210        struct xfs_buf          *bp;    /* buffer pointer */
4211        int                     ptr;    /* record number */
4212#ifdef DEBUG
4213        int                     error;  /* error return value */
4214#endif
4215
4216        ptr = cur->bc_ptrs[0];
4217        block = xfs_btree_get_block(cur, 0, &bp);
4218
4219#ifdef DEBUG
4220        error = xfs_btree_check_block(cur, block, 0, bp);
4221        if (error)
4222                return error;
4223#endif
4224
4225        /*
4226         * Off the right end or left end, return failure.
4227         */
4228        if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4229                *stat = 0;
4230                return 0;
4231        }
4232
4233        /*
4234         * Point to the record and extract its data.
4235         */
4236        *recp = xfs_btree_rec_addr(cur, ptr, block);
4237        *stat = 1;
4238        return 0;
4239}
4240
4241/* Visit a block in a btree. */
4242STATIC int
4243xfs_btree_visit_block(
4244        struct xfs_btree_cur            *cur,
4245        int                             level,
4246        xfs_btree_visit_blocks_fn       fn,
4247        void                            *data)
4248{
4249        struct xfs_btree_block          *block;
4250        struct xfs_buf                  *bp;
4251        union xfs_btree_ptr             rptr;
4252        int                             error;
4253
4254        /* do right sibling readahead */
4255        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4256        block = xfs_btree_get_block(cur, level, &bp);
4257
4258        /* process the block */
4259        error = fn(cur, level, data);
4260        if (error)
4261                return error;
4262
4263        /* now read rh sibling block for next iteration */
4264        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4265        if (xfs_btree_ptr_is_null(cur, &rptr))
4266                return -ENOENT;
4267
4268        return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4269}
4270
4271
4272/* Visit every block in a btree. */
4273int
4274xfs_btree_visit_blocks(
4275        struct xfs_btree_cur            *cur,
4276        xfs_btree_visit_blocks_fn       fn,
4277        unsigned int                    flags,
4278        void                            *data)
4279{
4280        union xfs_btree_ptr             lptr;
4281        int                             level;
4282        struct xfs_btree_block          *block = NULL;
4283        int                             error = 0;
4284
4285        cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4286
4287        /* for each level */
4288        for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4289                /* grab the left hand block */
4290                error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4291                if (error)
4292                        return error;
4293
4294                /* readahead the left most block for the next level down */
4295                if (level > 0) {
4296                        union xfs_btree_ptr     *ptr;
4297
4298                        ptr = xfs_btree_ptr_addr(cur, 1, block);
4299                        xfs_btree_readahead_ptr(cur, ptr, 1);
4300
4301                        /* save for the next iteration of the loop */
4302                        xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4303
4304                        if (!(flags & XFS_BTREE_VISIT_LEAVES))
4305                                continue;
4306                } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
4307                        continue;
4308                }
4309
4310                /* for each buffer in the level */
4311                do {
4312                        error = xfs_btree_visit_block(cur, level, fn, data);
4313                } while (!error);
4314
4315                if (error != -ENOENT)
4316                        return error;
4317        }
4318
4319        return 0;
4320}
4321
4322/*
4323 * Change the owner of a btree.
4324 *
4325 * The mechanism we use here is ordered buffer logging. Because we don't know
4326 * how many buffers were are going to need to modify, we don't really want to
4327 * have to make transaction reservations for the worst case of every buffer in a
4328 * full size btree as that may be more space that we can fit in the log....
4329 *
4330 * We do the btree walk in the most optimal manner possible - we have sibling
4331 * pointers so we can just walk all the blocks on each level from left to right
4332 * in a single pass, and then move to the next level and do the same. We can
4333 * also do readahead on the sibling pointers to get IO moving more quickly,
4334 * though for slow disks this is unlikely to make much difference to performance
4335 * as the amount of CPU work we have to do before moving to the next block is
4336 * relatively small.
4337 *
4338 * For each btree block that we load, modify the owner appropriately, set the
4339 * buffer as an ordered buffer and log it appropriately. We need to ensure that
4340 * we mark the region we change dirty so that if the buffer is relogged in
4341 * a subsequent transaction the changes we make here as an ordered buffer are
4342 * correctly relogged in that transaction.  If we are in recovery context, then
4343 * just queue the modified buffer as delayed write buffer so the transaction
4344 * recovery completion writes the changes to disk.
4345 */
4346struct xfs_btree_block_change_owner_info {
4347        uint64_t                new_owner;
4348        struct list_head        *buffer_list;
4349};
4350
4351static int
4352xfs_btree_block_change_owner(
4353        struct xfs_btree_cur    *cur,
4354        int                     level,
4355        void                    *data)
4356{
4357        struct xfs_btree_block_change_owner_info        *bbcoi = data;
4358        struct xfs_btree_block  *block;
4359        struct xfs_buf          *bp;
4360
4361        /* modify the owner */
4362        block = xfs_btree_get_block(cur, level, &bp);
4363        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
4364                if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
4365                        return 0;
4366                block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4367        } else {
4368                if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
4369                        return 0;
4370                block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4371        }
4372
4373        /*
4374         * If the block is a root block hosted in an inode, we might not have a
4375         * buffer pointer here and we shouldn't attempt to log the change as the
4376         * information is already held in the inode and discarded when the root
4377         * block is formatted into the on-disk inode fork. We still change it,
4378         * though, so everything is consistent in memory.
4379         */
4380        if (!bp) {
4381                ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4382                ASSERT(level == cur->bc_nlevels - 1);
4383                return 0;
4384        }
4385
4386        if (cur->bc_tp) {
4387                if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
4388                        xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4389                        return -EAGAIN;
4390                }
4391        } else {
4392                xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4393        }
4394
4395        return 0;
4396}
4397
4398int
4399xfs_btree_change_owner(
4400        struct xfs_btree_cur    *cur,
4401        uint64_t                new_owner,
4402        struct list_head        *buffer_list)
4403{
4404        struct xfs_btree_block_change_owner_info        bbcoi;
4405
4406        bbcoi.new_owner = new_owner;
4407        bbcoi.buffer_list = buffer_list;
4408
4409        return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4410                        XFS_BTREE_VISIT_ALL, &bbcoi);
4411}
4412
4413/* Verify the v5 fields of a long-format btree block. */
4414xfs_failaddr_t
4415xfs_btree_lblock_v5hdr_verify(
4416        struct xfs_buf          *bp,
4417        uint64_t                owner)
4418{
4419        struct xfs_mount        *mp = bp->b_mount;
4420        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4421
4422        if (!xfs_has_crc(mp))
4423                return __this_address;
4424        if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
4425                return __this_address;
4426        if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
4427                return __this_address;
4428        if (owner != XFS_RMAP_OWN_UNKNOWN &&
4429            be64_to_cpu(block->bb_u.l.bb_owner) != owner)
4430                return __this_address;
4431        return NULL;
4432}
4433
4434/* Verify a long-format btree block. */
4435xfs_failaddr_t
4436xfs_btree_lblock_verify(
4437        struct xfs_buf          *bp,
4438        unsigned int            max_recs)
4439{
4440        struct xfs_mount        *mp = bp->b_mount;
4441        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4442
4443        /* numrecs verification */
4444        if (be16_to_cpu(block->bb_numrecs) > max_recs)
4445                return __this_address;
4446
4447        /* sibling pointer verification */
4448        if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
4449            !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
4450                return __this_address;
4451        if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
4452            !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
4453                return __this_address;
4454
4455        return NULL;
4456}
4457
4458/**
4459 * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4460 *                                    btree block
4461 *
4462 * @bp: buffer containing the btree block
4463 */
4464xfs_failaddr_t
4465xfs_btree_sblock_v5hdr_verify(
4466        struct xfs_buf          *bp)
4467{
4468        struct xfs_mount        *mp = bp->b_mount;
4469        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4470        struct xfs_perag        *pag = bp->b_pag;
4471
4472        if (!xfs_has_crc(mp))
4473                return __this_address;
4474        if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4475                return __this_address;
4476        if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
4477                return __this_address;
4478        if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4479                return __this_address;
4480        return NULL;
4481}
4482
4483/**
4484 * xfs_btree_sblock_verify() -- verify a short-format btree block
4485 *
4486 * @bp: buffer containing the btree block
4487 * @max_recs: maximum records allowed in this btree node
4488 */
4489xfs_failaddr_t
4490xfs_btree_sblock_verify(
4491        struct xfs_buf          *bp,
4492        unsigned int            max_recs)
4493{
4494        struct xfs_mount        *mp = bp->b_mount;
4495        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4496        xfs_agblock_t           agno;
4497
4498        /* numrecs verification */
4499        if (be16_to_cpu(block->bb_numrecs) > max_recs)
4500                return __this_address;
4501
4502        /* sibling pointer verification */
4503        agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
4504        if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
4505            !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
4506                return __this_address;
4507        if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
4508            !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
4509                return __this_address;
4510
4511        return NULL;
4512}
4513
4514/*
4515 * Calculate the number of btree levels needed to store a given number of
4516 * records in a short-format btree.
4517 */
4518uint
4519xfs_btree_compute_maxlevels(
4520        uint                    *limits,
4521        unsigned long           len)
4522{
4523        uint                    level;
4524        unsigned long           maxblocks;
4525
4526        maxblocks = (len + limits[0] - 1) / limits[0];
4527        for (level = 1; maxblocks > 1; level++)
4528                maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4529        return level;
4530}
4531
4532/*
4533 * Query a regular btree for all records overlapping a given interval.
4534 * Start with a LE lookup of the key of low_rec and return all records
4535 * until we find a record with a key greater than the key of high_rec.
4536 */
4537STATIC int
4538xfs_btree_simple_query_range(
4539        struct xfs_btree_cur            *cur,
4540        const union xfs_btree_key       *low_key,
4541        const union xfs_btree_key       *high_key,
4542        xfs_btree_query_range_fn        fn,
4543        void                            *priv)
4544{
4545        union xfs_btree_rec             *recp;
4546        union xfs_btree_key             rec_key;
4547        int64_t                         diff;
4548        int                             stat;
4549        bool                            firstrec = true;
4550        int                             error;
4551
4552        ASSERT(cur->bc_ops->init_high_key_from_rec);
4553        ASSERT(cur->bc_ops->diff_two_keys);
4554
4555        /*
4556         * Find the leftmost record.  The btree cursor must be set
4557         * to the low record used to generate low_key.
4558         */
4559        stat = 0;
4560        error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
4561        if (error)
4562                goto out;
4563
4564        /* Nothing?  See if there's anything to the right. */
4565        if (!stat) {
4566                error = xfs_btree_increment(cur, 0, &stat);
4567                if (error)
4568                        goto out;
4569        }
4570
4571        while (stat) {
4572                /* Find the record. */
4573                error = xfs_btree_get_rec(cur, &recp, &stat);
4574                if (error || !stat)
4575                        break;
4576
4577                /* Skip if high_key(rec) < low_key. */
4578                if (firstrec) {
4579                        cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4580                        firstrec = false;
4581                        diff = cur->bc_ops->diff_two_keys(cur, low_key,
4582                                        &rec_key);
4583                        if (diff > 0)
4584                                goto advloop;
4585                }
4586
4587                /* Stop if high_key < low_key(rec). */
4588                cur->bc_ops->init_key_from_rec(&rec_key, recp);
4589                diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
4590                if (diff > 0)
4591                        break;
4592
4593                /* Callback */
4594                error = fn(cur, recp, priv);
4595                if (error)
4596                        break;
4597
4598advloop:
4599                /* Move on to the next record. */
4600                error = xfs_btree_increment(cur, 0, &stat);
4601                if (error)
4602                        break;
4603        }
4604
4605out:
4606        return error;
4607}
4608
4609/*
4610 * Query an overlapped interval btree for all records overlapping a given
4611 * interval.  This function roughly follows the algorithm given in
4612 * "Interval Trees" of _Introduction to Algorithms_, which is section
4613 * 14.3 in the 2nd and 3rd editions.
4614 *
4615 * First, generate keys for the low and high records passed in.
4616 *
4617 * For any leaf node, generate the high and low keys for the record.
4618 * If the record keys overlap with the query low/high keys, pass the
4619 * record to the function iterator.
4620 *
4621 * For any internal node, compare the low and high keys of each
4622 * pointer against the query low/high keys.  If there's an overlap,
4623 * follow the pointer.
4624 *
4625 * As an optimization, we stop scanning a block when we find a low key
4626 * that is greater than the query's high key.
4627 */
4628STATIC int
4629xfs_btree_overlapped_query_range(
4630        struct xfs_btree_cur            *cur,
4631        const union xfs_btree_key       *low_key,
4632        const union xfs_btree_key       *high_key,
4633        xfs_btree_query_range_fn        fn,
4634        void                            *priv)
4635{
4636        union xfs_btree_ptr             ptr;
4637        union xfs_btree_ptr             *pp;
4638        union xfs_btree_key             rec_key;
4639        union xfs_btree_key             rec_hkey;
4640        union xfs_btree_key             *lkp;
4641        union xfs_btree_key             *hkp;
4642        union xfs_btree_rec             *recp;
4643        struct xfs_btree_block          *block;
4644        int64_t                         ldiff;
4645        int64_t                         hdiff;
4646        int                             level;
4647        struct xfs_buf                  *bp;
4648        int                             i;
4649        int                             error;
4650
4651        /* Load the root of the btree. */
4652        level = cur->bc_nlevels - 1;
4653        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
4654        error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
4655        if (error)
4656                return error;
4657        xfs_btree_get_block(cur, level, &bp);
4658        trace_xfs_btree_overlapped_query_range(cur, level, bp);
4659#ifdef DEBUG
4660        error = xfs_btree_check_block(cur, block, level, bp);
4661        if (error)
4662                goto out;
4663#endif
4664        cur->bc_ptrs[level] = 1;
4665
4666        while (level < cur->bc_nlevels) {
4667                block = xfs_btree_get_block(cur, level, &bp);
4668
4669                /* End of node, pop back towards the root. */
4670                if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
4671pop_up:
4672                        if (level < cur->bc_nlevels - 1)
4673                                cur->bc_ptrs[level + 1]++;
4674                        level++;
4675                        continue;
4676                }
4677
4678                if (level == 0) {
4679                        /* Handle a leaf node. */
4680                        recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
4681
4682                        cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
4683                        ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
4684                                        low_key);
4685
4686                        cur->bc_ops->init_key_from_rec(&rec_key, recp);
4687                        hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
4688                                        &rec_key);
4689
4690                        /*
4691                         * If (record's high key >= query's low key) and
4692                         *    (query's high key >= record's low key), then
4693                         * this record overlaps the query range; callback.
4694                         */
4695                        if (ldiff >= 0 && hdiff >= 0) {
4696                                error = fn(cur, recp, priv);
4697                                if (error)
4698                                        break;
4699                        } else if (hdiff < 0) {
4700                                /* Record is larger than high key; pop. */
4701                                goto pop_up;
4702                        }
4703                        cur->bc_ptrs[level]++;
4704                        continue;
4705                }
4706
4707                /* Handle an internal node. */
4708                lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
4709                hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
4710                pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
4711
4712                ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
4713                hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
4714
4715                /*
4716                 * If (pointer's high key >= query's low key) and
4717                 *    (query's high key >= pointer's low key), then
4718                 * this record overlaps the query range; follow pointer.
4719                 */
4720                if (ldiff >= 0 && hdiff >= 0) {
4721                        level--;
4722                        error = xfs_btree_lookup_get_block(cur, level, pp,
4723                                        &block);
4724                        if (error)
4725                                goto out;
4726                        xfs_btree_get_block(cur, level, &bp);
4727                        trace_xfs_btree_overlapped_query_range(cur, level, bp);
4728#ifdef DEBUG
4729                        error = xfs_btree_check_block(cur, block, level, bp);
4730                        if (error)
4731                                goto out;
4732#endif
4733                        cur->bc_ptrs[level] = 1;
4734                        continue;
4735                } else if (hdiff < 0) {
4736                        /* The low key is larger than the upper range; pop. */
4737                        goto pop_up;
4738                }
4739                cur->bc_ptrs[level]++;
4740        }
4741
4742out:
4743        /*
4744         * If we don't end this function with the cursor pointing at a record
4745         * block, a subsequent non-error cursor deletion will not release
4746         * node-level buffers, causing a buffer leak.  This is quite possible
4747         * with a zero-results range query, so release the buffers if we
4748         * failed to return any results.
4749         */
4750        if (cur->bc_bufs[0] == NULL) {
4751                for (i = 0; i < cur->bc_nlevels; i++) {
4752                        if (cur->bc_bufs[i]) {
4753                                xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
4754                                cur->bc_bufs[i] = NULL;
4755                                cur->bc_ptrs[i] = 0;
4756                                cur->bc_ra[i] = 0;
4757                        }
4758                }
4759        }
4760
4761        return error;
4762}
4763
4764/*
4765 * Query a btree for all records overlapping a given interval of keys.  The
4766 * supplied function will be called with each record found; return one of the
4767 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4768 * code.  This function returns -ECANCELED, zero, or a negative error code.
4769 */
4770int
4771xfs_btree_query_range(
4772        struct xfs_btree_cur            *cur,
4773        const union xfs_btree_irec      *low_rec,
4774        const union xfs_btree_irec      *high_rec,
4775        xfs_btree_query_range_fn        fn,
4776        void                            *priv)
4777{
4778        union xfs_btree_rec             rec;
4779        union xfs_btree_key             low_key;
4780        union xfs_btree_key             high_key;
4781
4782        /* Find the keys of both ends of the interval. */
4783        cur->bc_rec = *high_rec;
4784        cur->bc_ops->init_rec_from_cur(cur, &rec);
4785        cur->bc_ops->init_key_from_rec(&high_key, &rec);
4786
4787        cur->bc_rec = *low_rec;
4788        cur->bc_ops->init_rec_from_cur(cur, &rec);
4789        cur->bc_ops->init_key_from_rec(&low_key, &rec);
4790
4791        /* Enforce low key < high key. */
4792        if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
4793                return -EINVAL;
4794
4795        if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
4796                return xfs_btree_simple_query_range(cur, &low_key,
4797                                &high_key, fn, priv);
4798        return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4799                        fn, priv);
4800}
4801
4802/* Query a btree for all records. */
4803int
4804xfs_btree_query_all(
4805        struct xfs_btree_cur            *cur,
4806        xfs_btree_query_range_fn        fn,
4807        void                            *priv)
4808{
4809        union xfs_btree_key             low_key;
4810        union xfs_btree_key             high_key;
4811
4812        memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
4813        memset(&low_key, 0, sizeof(low_key));
4814        memset(&high_key, 0xFF, sizeof(high_key));
4815
4816        return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
4817}
4818
4819/*
4820 * Calculate the number of blocks needed to store a given number of records
4821 * in a short-format (per-AG metadata) btree.
4822 */
4823unsigned long long
4824xfs_btree_calc_size(
4825        uint                    *limits,
4826        unsigned long long      len)
4827{
4828        int                     level;
4829        int                     maxrecs;
4830        unsigned long long      rval;
4831
4832        maxrecs = limits[0];
4833        for (level = 0, rval = 0; len > 1; level++) {
4834                len += maxrecs - 1;
4835                do_div(len, maxrecs);
4836                maxrecs = limits[1];
4837                rval += len;
4838        }
4839        return rval;
4840}
4841
4842static int
4843xfs_btree_count_blocks_helper(
4844        struct xfs_btree_cur    *cur,
4845        int                     level,
4846        void                    *data)
4847{
4848        xfs_extlen_t            *blocks = data;
4849        (*blocks)++;
4850
4851        return 0;
4852}
4853
4854/* Count the blocks in a btree and return the result in *blocks. */
4855int
4856xfs_btree_count_blocks(
4857        struct xfs_btree_cur    *cur,
4858        xfs_extlen_t            *blocks)
4859{
4860        *blocks = 0;
4861        return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4862                        XFS_BTREE_VISIT_ALL, blocks);
4863}
4864
4865/* Compare two btree pointers. */
4866int64_t
4867xfs_btree_diff_two_ptrs(
4868        struct xfs_btree_cur            *cur,
4869        const union xfs_btree_ptr       *a,
4870        const union xfs_btree_ptr       *b)
4871{
4872        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4873                return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
4874        return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
4875}
4876
4877/* If there's an extent, we're done. */
4878STATIC int
4879xfs_btree_has_record_helper(
4880        struct xfs_btree_cur            *cur,
4881        const union xfs_btree_rec       *rec,
4882        void                            *priv)
4883{
4884        return -ECANCELED;
4885}
4886
4887/* Is there a record covering a given range of keys? */
4888int
4889xfs_btree_has_record(
4890        struct xfs_btree_cur            *cur,
4891        const union xfs_btree_irec      *low,
4892        const union xfs_btree_irec      *high,
4893        bool                            *exists)
4894{
4895        int                             error;
4896
4897        error = xfs_btree_query_range(cur, low, high,
4898                        &xfs_btree_has_record_helper, NULL);
4899        if (error == -ECANCELED) {
4900                *exists = true;
4901                return 0;
4902        }
4903        *exists = false;
4904        return error;
4905}
4906
4907/* Are there more records in this btree? */
4908bool
4909xfs_btree_has_more_records(
4910        struct xfs_btree_cur    *cur)
4911{
4912        struct xfs_btree_block  *block;
4913        struct xfs_buf          *bp;
4914
4915        block = xfs_btree_get_block(cur, 0, &bp);
4916
4917        /* There are still records in this block. */
4918        if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
4919                return true;
4920
4921        /* There are more record blocks. */
4922        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4923                return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
4924        else
4925                return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
4926}
4927