linux/fs/xfs/libxfs/xfs_btree.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   4 * All Rights Reserved.
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_log_format.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_bit.h"
  13#include "xfs_mount.h"
  14#include "xfs_inode.h"
  15#include "xfs_trans.h"
  16#include "xfs_buf_item.h"
  17#include "xfs_btree.h"
  18#include "xfs_errortag.h"
  19#include "xfs_error.h"
  20#include "xfs_trace.h"
  21#include "xfs_alloc.h"
  22#include "xfs_log.h"
  23#include "xfs_btree_staging.h"
  24#include "xfs_ag.h"
  25
  26/*
  27 * Cursor allocation zone.
  28 */
  29kmem_zone_t     *xfs_btree_cur_zone;
  30
  31/*
  32 * Btree magic numbers.
  33 */
  34static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
  35        { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
  36          XFS_FIBT_MAGIC, 0 },
  37        { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
  38          XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
  39          XFS_REFC_CRC_MAGIC }
  40};
  41
  42uint32_t
  43xfs_btree_magic(
  44        int                     crc,
  45        xfs_btnum_t             btnum)
  46{
  47        uint32_t                magic = xfs_magics[crc][btnum];
  48
  49        /* Ensure we asked for crc for crc-only magics. */
  50        ASSERT(magic != 0);
  51        return magic;
  52}
  53
  54/*
  55 * Check a long btree block header.  Return the address of the failing check,
  56 * or NULL if everything is ok.
  57 */
  58xfs_failaddr_t
  59__xfs_btree_check_lblock(
  60        struct xfs_btree_cur    *cur,
  61        struct xfs_btree_block  *block,
  62        int                     level,
  63        struct xfs_buf          *bp)
  64{
  65        struct xfs_mount        *mp = cur->bc_mp;
  66        xfs_btnum_t             btnum = cur->bc_btnum;
  67        int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
  68
  69        if (crc) {
  70                if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
  71                        return __this_address;
  72                if (block->bb_u.l.bb_blkno !=
  73                    cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
  74                        return __this_address;
  75                if (block->bb_u.l.bb_pad != cpu_to_be32(0))
  76                        return __this_address;
  77        }
  78
  79        if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
  80                return __this_address;
  81        if (be16_to_cpu(block->bb_level) != level)
  82                return __this_address;
  83        if (be16_to_cpu(block->bb_numrecs) >
  84            cur->bc_ops->get_maxrecs(cur, level))
  85                return __this_address;
  86        if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
  87            !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
  88                        level + 1))
  89                return __this_address;
  90        if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
  91            !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
  92                        level + 1))
  93                return __this_address;
  94
  95        return NULL;
  96}
  97
  98/* Check a long btree block header. */
  99static int
 100xfs_btree_check_lblock(
 101        struct xfs_btree_cur    *cur,
 102        struct xfs_btree_block  *block,
 103        int                     level,
 104        struct xfs_buf          *bp)
 105{
 106        struct xfs_mount        *mp = cur->bc_mp;
 107        xfs_failaddr_t          fa;
 108
 109        fa = __xfs_btree_check_lblock(cur, block, level, bp);
 110        if (XFS_IS_CORRUPT(mp, fa != NULL) ||
 111            XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
 112                if (bp)
 113                        trace_xfs_btree_corrupt(bp, _RET_IP_);
 114                return -EFSCORRUPTED;
 115        }
 116        return 0;
 117}
 118
 119/*
 120 * Check a short btree block header.  Return the address of the failing check,
 121 * or NULL if everything is ok.
 122 */
 123xfs_failaddr_t
 124__xfs_btree_check_sblock(
 125        struct xfs_btree_cur    *cur,
 126        struct xfs_btree_block  *block,
 127        int                     level,
 128        struct xfs_buf          *bp)
 129{
 130        struct xfs_mount        *mp = cur->bc_mp;
 131        xfs_btnum_t             btnum = cur->bc_btnum;
 132        int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
 133
 134        if (crc) {
 135                if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
 136                        return __this_address;
 137                if (block->bb_u.s.bb_blkno !=
 138                    cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
 139                        return __this_address;
 140        }
 141
 142        if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
 143                return __this_address;
 144        if (be16_to_cpu(block->bb_level) != level)
 145                return __this_address;
 146        if (be16_to_cpu(block->bb_numrecs) >
 147            cur->bc_ops->get_maxrecs(cur, level))
 148                return __this_address;
 149        if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
 150            !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
 151                        level + 1))
 152                return __this_address;
 153        if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
 154            !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
 155                        level + 1))
 156                return __this_address;
 157
 158        return NULL;
 159}
 160
 161/* Check a short btree block header. */
 162STATIC int
 163xfs_btree_check_sblock(
 164        struct xfs_btree_cur    *cur,
 165        struct xfs_btree_block  *block,
 166        int                     level,
 167        struct xfs_buf          *bp)
 168{
 169        struct xfs_mount        *mp = cur->bc_mp;
 170        xfs_failaddr_t          fa;
 171
 172        fa = __xfs_btree_check_sblock(cur, block, level, bp);
 173        if (XFS_IS_CORRUPT(mp, fa != NULL) ||
 174            XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
 175                if (bp)
 176                        trace_xfs_btree_corrupt(bp, _RET_IP_);
 177                return -EFSCORRUPTED;
 178        }
 179        return 0;
 180}
 181
 182/*
 183 * Debug routine: check that block header is ok.
 184 */
 185int
 186xfs_btree_check_block(
 187        struct xfs_btree_cur    *cur,   /* btree cursor */
 188        struct xfs_btree_block  *block, /* generic btree block pointer */
 189        int                     level,  /* level of the btree block */
 190        struct xfs_buf          *bp)    /* buffer containing block, if any */
 191{
 192        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 193                return xfs_btree_check_lblock(cur, block, level, bp);
 194        else
 195                return xfs_btree_check_sblock(cur, block, level, bp);
 196}
 197
 198/* Check that this long pointer is valid and points within the fs. */
 199bool
 200xfs_btree_check_lptr(
 201        struct xfs_btree_cur    *cur,
 202        xfs_fsblock_t           fsbno,
 203        int                     level)
 204{
 205        if (level <= 0)
 206                return false;
 207        return xfs_verify_fsbno(cur->bc_mp, fsbno);
 208}
 209
 210/* Check that this short pointer is valid and points within the AG. */
 211bool
 212xfs_btree_check_sptr(
 213        struct xfs_btree_cur    *cur,
 214        xfs_agblock_t           agbno,
 215        int                     level)
 216{
 217        if (level <= 0)
 218                return false;
 219        return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno);
 220}
 221
 222/*
 223 * Check that a given (indexed) btree pointer at a certain level of a
 224 * btree is valid and doesn't point past where it should.
 225 */
 226static int
 227xfs_btree_check_ptr(
 228        struct xfs_btree_cur    *cur,
 229        union xfs_btree_ptr     *ptr,
 230        int                     index,
 231        int                     level)
 232{
 233        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 234                if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
 235                                level))
 236                        return 0;
 237                xfs_err(cur->bc_mp,
 238"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
 239                                cur->bc_ino.ip->i_ino,
 240                                cur->bc_ino.whichfork, cur->bc_btnum,
 241                                level, index);
 242        } else {
 243                if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
 244                                level))
 245                        return 0;
 246                xfs_err(cur->bc_mp,
 247"AG %u: Corrupt btree %d pointer at level %d index %d.",
 248                                cur->bc_ag.pag->pag_agno, cur->bc_btnum,
 249                                level, index);
 250        }
 251
 252        return -EFSCORRUPTED;
 253}
 254
 255#ifdef DEBUG
 256# define xfs_btree_debug_check_ptr      xfs_btree_check_ptr
 257#else
 258# define xfs_btree_debug_check_ptr(...) (0)
 259#endif
 260
 261/*
 262 * Calculate CRC on the whole btree block and stuff it into the
 263 * long-form btree header.
 264 *
 265 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 266 * it into the buffer so recovery knows what the last modification was that made
 267 * it to disk.
 268 */
 269void
 270xfs_btree_lblock_calc_crc(
 271        struct xfs_buf          *bp)
 272{
 273        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 274        struct xfs_buf_log_item *bip = bp->b_log_item;
 275
 276        if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
 277                return;
 278        if (bip)
 279                block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 280        xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 281}
 282
 283bool
 284xfs_btree_lblock_verify_crc(
 285        struct xfs_buf          *bp)
 286{
 287        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 288        struct xfs_mount        *mp = bp->b_mount;
 289
 290        if (xfs_sb_version_hascrc(&mp->m_sb)) {
 291                if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
 292                        return false;
 293                return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 294        }
 295
 296        return true;
 297}
 298
 299/*
 300 * Calculate CRC on the whole btree block and stuff it into the
 301 * short-form btree header.
 302 *
 303 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 304 * it into the buffer so recovery knows what the last modification was that made
 305 * it to disk.
 306 */
 307void
 308xfs_btree_sblock_calc_crc(
 309        struct xfs_buf          *bp)
 310{
 311        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 312        struct xfs_buf_log_item *bip = bp->b_log_item;
 313
 314        if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
 315                return;
 316        if (bip)
 317                block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 318        xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 319}
 320
 321bool
 322xfs_btree_sblock_verify_crc(
 323        struct xfs_buf          *bp)
 324{
 325        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 326        struct xfs_mount        *mp = bp->b_mount;
 327
 328        if (xfs_sb_version_hascrc(&mp->m_sb)) {
 329                if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
 330                        return false;
 331                return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 332        }
 333
 334        return true;
 335}
 336
 337static int
 338xfs_btree_free_block(
 339        struct xfs_btree_cur    *cur,
 340        struct xfs_buf          *bp)
 341{
 342        int                     error;
 343
 344        error = cur->bc_ops->free_block(cur, bp);
 345        if (!error) {
 346                xfs_trans_binval(cur->bc_tp, bp);
 347                XFS_BTREE_STATS_INC(cur, free);
 348        }
 349        return error;
 350}
 351
 352/*
 353 * Delete the btree cursor.
 354 */
 355void
 356xfs_btree_del_cursor(
 357        struct xfs_btree_cur    *cur,           /* btree cursor */
 358        int                     error)          /* del because of error */
 359{
 360        int                     i;              /* btree level */
 361
 362        /*
 363         * Clear the buffer pointers and release the buffers. If we're doing
 364         * this because of an error, inspect all of the entries in the bc_bufs
 365         * array for buffers to be unlocked. This is because some of the btree
 366         * code works from level n down to 0, and if we get an error along the
 367         * way we won't have initialized all the entries down to 0.
 368         */
 369        for (i = 0; i < cur->bc_nlevels; i++) {
 370                if (cur->bc_bufs[i])
 371                        xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
 372                else if (!error)
 373                        break;
 374        }
 375
 376        ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
 377               XFS_FORCED_SHUTDOWN(cur->bc_mp));
 378        if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
 379                kmem_free(cur->bc_ops);
 380        if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
 381                xfs_perag_put(cur->bc_ag.pag);
 382        kmem_cache_free(xfs_btree_cur_zone, cur);
 383}
 384
 385/*
 386 * Duplicate the btree cursor.
 387 * Allocate a new one, copy the record, re-get the buffers.
 388 */
 389int                                     /* error */
 390xfs_btree_dup_cursor(
 391        xfs_btree_cur_t *cur,           /* input cursor */
 392        xfs_btree_cur_t **ncur)         /* output cursor */
 393{
 394        struct xfs_buf  *bp;            /* btree block's buffer pointer */
 395        int             error;          /* error return value */
 396        int             i;              /* level number of btree block */
 397        xfs_mount_t     *mp;            /* mount structure for filesystem */
 398        xfs_btree_cur_t *new;           /* new cursor value */
 399        xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
 400
 401        tp = cur->bc_tp;
 402        mp = cur->bc_mp;
 403
 404        /*
 405         * Allocate a new cursor like the old one.
 406         */
 407        new = cur->bc_ops->dup_cursor(cur);
 408
 409        /*
 410         * Copy the record currently in the cursor.
 411         */
 412        new->bc_rec = cur->bc_rec;
 413
 414        /*
 415         * For each level current, re-get the buffer and copy the ptr value.
 416         */
 417        for (i = 0; i < new->bc_nlevels; i++) {
 418                new->bc_ptrs[i] = cur->bc_ptrs[i];
 419                new->bc_ra[i] = cur->bc_ra[i];
 420                bp = cur->bc_bufs[i];
 421                if (bp) {
 422                        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 423                                                   XFS_BUF_ADDR(bp), mp->m_bsize,
 424                                                   0, &bp,
 425                                                   cur->bc_ops->buf_ops);
 426                        if (error) {
 427                                xfs_btree_del_cursor(new, error);
 428                                *ncur = NULL;
 429                                return error;
 430                        }
 431                }
 432                new->bc_bufs[i] = bp;
 433        }
 434        *ncur = new;
 435        return 0;
 436}
 437
 438/*
 439 * XFS btree block layout and addressing:
 440 *
 441 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 442 *
 443 * The leaf record start with a header then followed by records containing
 444 * the values.  A non-leaf block also starts with the same header, and
 445 * then first contains lookup keys followed by an equal number of pointers
 446 * to the btree blocks at the previous level.
 447 *
 448 *              +--------+-------+-------+-------+-------+-------+-------+
 449 * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 450 *              +--------+-------+-------+-------+-------+-------+-------+
 451 *
 452 *              +--------+-------+-------+-------+-------+-------+-------+
 453 * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 454 *              +--------+-------+-------+-------+-------+-------+-------+
 455 *
 456 * The header is called struct xfs_btree_block for reasons better left unknown
 457 * and comes in different versions for short (32bit) and long (64bit) block
 458 * pointers.  The record and key structures are defined by the btree instances
 459 * and opaque to the btree core.  The block pointers are simple disk endian
 460 * integers, available in a short (32bit) and long (64bit) variant.
 461 *
 462 * The helpers below calculate the offset of a given record, key or pointer
 463 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 464 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
 465 * inside the btree block is done using indices starting at one, not zero!
 466 *
 467 * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
 468 * overlapping intervals.  In such a tree, records are still sorted lowest to
 469 * highest and indexed by the smallest key value that refers to the record.
 470 * However, nodes are different: each pointer has two associated keys -- one
 471 * indexing the lowest key available in the block(s) below (the same behavior
 472 * as the key in a regular btree) and another indexing the highest key
 473 * available in the block(s) below.  Because records are /not/ sorted by the
 474 * highest key, all leaf block updates require us to compute the highest key
 475 * that matches any record in the leaf and to recursively update the high keys
 476 * in the nodes going further up in the tree, if necessary.  Nodes look like
 477 * this:
 478 *
 479 *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
 480 * Non-Leaf:    | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
 481 *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
 482 *
 483 * To perform an interval query on an overlapped tree, perform the usual
 484 * depth-first search and use the low and high keys to decide if we can skip
 485 * that particular node.  If a leaf node is reached, return the records that
 486 * intersect the interval.  Note that an interval query may return numerous
 487 * entries.  For a non-overlapped tree, simply search for the record associated
 488 * with the lowest key and iterate forward until a non-matching record is
 489 * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
 490 * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
 491 * more detail.
 492 *
 493 * Why do we care about overlapping intervals?  Let's say you have a bunch of
 494 * reverse mapping records on a reflink filesystem:
 495 *
 496 * 1: +- file A startblock B offset C length D -----------+
 497 * 2:      +- file E startblock F offset G length H --------------+
 498 * 3:      +- file I startblock F offset J length K --+
 499 * 4:                                                        +- file L... --+
 500 *
 501 * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
 502 * we'd simply increment the length of record 1.  But how do we find the record
 503 * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
 504 * record 3 because the keys are ordered first by startblock.  An interval
 505 * query would return records 1 and 2 because they both overlap (B+D-1), and
 506 * from that we can pick out record 1 as the appropriate left neighbor.
 507 *
 508 * In the non-overlapped case you can do a LE lookup and decrement the cursor
 509 * because a record's interval must end before the next record.
 510 */
 511
 512/*
 513 * Return size of the btree block header for this btree instance.
 514 */
 515static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
 516{
 517        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 518                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 519                        return XFS_BTREE_LBLOCK_CRC_LEN;
 520                return XFS_BTREE_LBLOCK_LEN;
 521        }
 522        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 523                return XFS_BTREE_SBLOCK_CRC_LEN;
 524        return XFS_BTREE_SBLOCK_LEN;
 525}
 526
 527/*
 528 * Return size of btree block pointers for this btree instance.
 529 */
 530static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
 531{
 532        return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
 533                sizeof(__be64) : sizeof(__be32);
 534}
 535
 536/*
 537 * Calculate offset of the n-th record in a btree block.
 538 */
 539STATIC size_t
 540xfs_btree_rec_offset(
 541        struct xfs_btree_cur    *cur,
 542        int                     n)
 543{
 544        return xfs_btree_block_len(cur) +
 545                (n - 1) * cur->bc_ops->rec_len;
 546}
 547
 548/*
 549 * Calculate offset of the n-th key in a btree block.
 550 */
 551STATIC size_t
 552xfs_btree_key_offset(
 553        struct xfs_btree_cur    *cur,
 554        int                     n)
 555{
 556        return xfs_btree_block_len(cur) +
 557                (n - 1) * cur->bc_ops->key_len;
 558}
 559
 560/*
 561 * Calculate offset of the n-th high key in a btree block.
 562 */
 563STATIC size_t
 564xfs_btree_high_key_offset(
 565        struct xfs_btree_cur    *cur,
 566        int                     n)
 567{
 568        return xfs_btree_block_len(cur) +
 569                (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
 570}
 571
 572/*
 573 * Calculate offset of the n-th block pointer in a btree block.
 574 */
 575STATIC size_t
 576xfs_btree_ptr_offset(
 577        struct xfs_btree_cur    *cur,
 578        int                     n,
 579        int                     level)
 580{
 581        return xfs_btree_block_len(cur) +
 582                cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
 583                (n - 1) * xfs_btree_ptr_len(cur);
 584}
 585
 586/*
 587 * Return a pointer to the n-th record in the btree block.
 588 */
 589union xfs_btree_rec *
 590xfs_btree_rec_addr(
 591        struct xfs_btree_cur    *cur,
 592        int                     n,
 593        struct xfs_btree_block  *block)
 594{
 595        return (union xfs_btree_rec *)
 596                ((char *)block + xfs_btree_rec_offset(cur, n));
 597}
 598
 599/*
 600 * Return a pointer to the n-th key in the btree block.
 601 */
 602union xfs_btree_key *
 603xfs_btree_key_addr(
 604        struct xfs_btree_cur    *cur,
 605        int                     n,
 606        struct xfs_btree_block  *block)
 607{
 608        return (union xfs_btree_key *)
 609                ((char *)block + xfs_btree_key_offset(cur, n));
 610}
 611
 612/*
 613 * Return a pointer to the n-th high key in the btree block.
 614 */
 615union xfs_btree_key *
 616xfs_btree_high_key_addr(
 617        struct xfs_btree_cur    *cur,
 618        int                     n,
 619        struct xfs_btree_block  *block)
 620{
 621        return (union xfs_btree_key *)
 622                ((char *)block + xfs_btree_high_key_offset(cur, n));
 623}
 624
 625/*
 626 * Return a pointer to the n-th block pointer in the btree block.
 627 */
 628union xfs_btree_ptr *
 629xfs_btree_ptr_addr(
 630        struct xfs_btree_cur    *cur,
 631        int                     n,
 632        struct xfs_btree_block  *block)
 633{
 634        int                     level = xfs_btree_get_level(block);
 635
 636        ASSERT(block->bb_level != 0);
 637
 638        return (union xfs_btree_ptr *)
 639                ((char *)block + xfs_btree_ptr_offset(cur, n, level));
 640}
 641
 642struct xfs_ifork *
 643xfs_btree_ifork_ptr(
 644        struct xfs_btree_cur    *cur)
 645{
 646        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
 647
 648        if (cur->bc_flags & XFS_BTREE_STAGING)
 649                return cur->bc_ino.ifake->if_fork;
 650        return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork);
 651}
 652
 653/*
 654 * Get the root block which is stored in the inode.
 655 *
 656 * For now this btree implementation assumes the btree root is always
 657 * stored in the if_broot field of an inode fork.
 658 */
 659STATIC struct xfs_btree_block *
 660xfs_btree_get_iroot(
 661        struct xfs_btree_cur    *cur)
 662{
 663        struct xfs_ifork        *ifp = xfs_btree_ifork_ptr(cur);
 664
 665        return (struct xfs_btree_block *)ifp->if_broot;
 666}
 667
 668/*
 669 * Retrieve the block pointer from the cursor at the given level.
 670 * This may be an inode btree root or from a buffer.
 671 */
 672struct xfs_btree_block *                /* generic btree block pointer */
 673xfs_btree_get_block(
 674        struct xfs_btree_cur    *cur,   /* btree cursor */
 675        int                     level,  /* level in btree */
 676        struct xfs_buf          **bpp)  /* buffer containing the block */
 677{
 678        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 679            (level == cur->bc_nlevels - 1)) {
 680                *bpp = NULL;
 681                return xfs_btree_get_iroot(cur);
 682        }
 683
 684        *bpp = cur->bc_bufs[level];
 685        return XFS_BUF_TO_BLOCK(*bpp);
 686}
 687
 688/*
 689 * Change the cursor to point to the first record at the given level.
 690 * Other levels are unaffected.
 691 */
 692STATIC int                              /* success=1, failure=0 */
 693xfs_btree_firstrec(
 694        xfs_btree_cur_t         *cur,   /* btree cursor */
 695        int                     level)  /* level to change */
 696{
 697        struct xfs_btree_block  *block; /* generic btree block pointer */
 698        struct xfs_buf          *bp;    /* buffer containing block */
 699
 700        /*
 701         * Get the block pointer for this level.
 702         */
 703        block = xfs_btree_get_block(cur, level, &bp);
 704        if (xfs_btree_check_block(cur, block, level, bp))
 705                return 0;
 706        /*
 707         * It's empty, there is no such record.
 708         */
 709        if (!block->bb_numrecs)
 710                return 0;
 711        /*
 712         * Set the ptr value to 1, that's the first record/key.
 713         */
 714        cur->bc_ptrs[level] = 1;
 715        return 1;
 716}
 717
 718/*
 719 * Change the cursor to point to the last record in the current block
 720 * at the given level.  Other levels are unaffected.
 721 */
 722STATIC int                              /* success=1, failure=0 */
 723xfs_btree_lastrec(
 724        xfs_btree_cur_t         *cur,   /* btree cursor */
 725        int                     level)  /* level to change */
 726{
 727        struct xfs_btree_block  *block; /* generic btree block pointer */
 728        struct xfs_buf          *bp;    /* buffer containing block */
 729
 730        /*
 731         * Get the block pointer for this level.
 732         */
 733        block = xfs_btree_get_block(cur, level, &bp);
 734        if (xfs_btree_check_block(cur, block, level, bp))
 735                return 0;
 736        /*
 737         * It's empty, there is no such record.
 738         */
 739        if (!block->bb_numrecs)
 740                return 0;
 741        /*
 742         * Set the ptr value to numrecs, that's the last record/key.
 743         */
 744        cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
 745        return 1;
 746}
 747
 748/*
 749 * Compute first and last byte offsets for the fields given.
 750 * Interprets the offsets table, which contains struct field offsets.
 751 */
 752void
 753xfs_btree_offsets(
 754        int64_t         fields,         /* bitmask of fields */
 755        const short     *offsets,       /* table of field offsets */
 756        int             nbits,          /* number of bits to inspect */
 757        int             *first,         /* output: first byte offset */
 758        int             *last)          /* output: last byte offset */
 759{
 760        int             i;              /* current bit number */
 761        int64_t         imask;          /* mask for current bit number */
 762
 763        ASSERT(fields != 0);
 764        /*
 765         * Find the lowest bit, so the first byte offset.
 766         */
 767        for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
 768                if (imask & fields) {
 769                        *first = offsets[i];
 770                        break;
 771                }
 772        }
 773        /*
 774         * Find the highest bit, so the last byte offset.
 775         */
 776        for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
 777                if (imask & fields) {
 778                        *last = offsets[i + 1] - 1;
 779                        break;
 780                }
 781        }
 782}
 783
 784/*
 785 * Get a buffer for the block, return it read in.
 786 * Long-form addressing.
 787 */
 788int
 789xfs_btree_read_bufl(
 790        struct xfs_mount        *mp,            /* file system mount point */
 791        struct xfs_trans        *tp,            /* transaction pointer */
 792        xfs_fsblock_t           fsbno,          /* file system block number */
 793        struct xfs_buf          **bpp,          /* buffer for fsbno */
 794        int                     refval,         /* ref count value for buffer */
 795        const struct xfs_buf_ops *ops)
 796{
 797        struct xfs_buf          *bp;            /* return value */
 798        xfs_daddr_t             d;              /* real disk block address */
 799        int                     error;
 800
 801        if (!xfs_verify_fsbno(mp, fsbno))
 802                return -EFSCORRUPTED;
 803        d = XFS_FSB_TO_DADDR(mp, fsbno);
 804        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
 805                                   mp->m_bsize, 0, &bp, ops);
 806        if (error)
 807                return error;
 808        if (bp)
 809                xfs_buf_set_ref(bp, refval);
 810        *bpp = bp;
 811        return 0;
 812}
 813
 814/*
 815 * Read-ahead the block, don't wait for it, don't return a buffer.
 816 * Long-form addressing.
 817 */
 818/* ARGSUSED */
 819void
 820xfs_btree_reada_bufl(
 821        struct xfs_mount        *mp,            /* file system mount point */
 822        xfs_fsblock_t           fsbno,          /* file system block number */
 823        xfs_extlen_t            count,          /* count of filesystem blocks */
 824        const struct xfs_buf_ops *ops)
 825{
 826        xfs_daddr_t             d;
 827
 828        ASSERT(fsbno != NULLFSBLOCK);
 829        d = XFS_FSB_TO_DADDR(mp, fsbno);
 830        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 831}
 832
 833/*
 834 * Read-ahead the block, don't wait for it, don't return a buffer.
 835 * Short-form addressing.
 836 */
 837/* ARGSUSED */
 838void
 839xfs_btree_reada_bufs(
 840        struct xfs_mount        *mp,            /* file system mount point */
 841        xfs_agnumber_t          agno,           /* allocation group number */
 842        xfs_agblock_t           agbno,          /* allocation group block number */
 843        xfs_extlen_t            count,          /* count of filesystem blocks */
 844        const struct xfs_buf_ops *ops)
 845{
 846        xfs_daddr_t             d;
 847
 848        ASSERT(agno != NULLAGNUMBER);
 849        ASSERT(agbno != NULLAGBLOCK);
 850        d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 851        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 852}
 853
 854STATIC int
 855xfs_btree_readahead_lblock(
 856        struct xfs_btree_cur    *cur,
 857        int                     lr,
 858        struct xfs_btree_block  *block)
 859{
 860        int                     rval = 0;
 861        xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
 862        xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 863
 864        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
 865                xfs_btree_reada_bufl(cur->bc_mp, left, 1,
 866                                     cur->bc_ops->buf_ops);
 867                rval++;
 868        }
 869
 870        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
 871                xfs_btree_reada_bufl(cur->bc_mp, right, 1,
 872                                     cur->bc_ops->buf_ops);
 873                rval++;
 874        }
 875
 876        return rval;
 877}
 878
 879STATIC int
 880xfs_btree_readahead_sblock(
 881        struct xfs_btree_cur    *cur,
 882        int                     lr,
 883        struct xfs_btree_block *block)
 884{
 885        int                     rval = 0;
 886        xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
 887        xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
 888
 889
 890        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
 891                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
 892                                     left, 1, cur->bc_ops->buf_ops);
 893                rval++;
 894        }
 895
 896        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
 897                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
 898                                     right, 1, cur->bc_ops->buf_ops);
 899                rval++;
 900        }
 901
 902        return rval;
 903}
 904
 905/*
 906 * Read-ahead btree blocks, at the given level.
 907 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 908 */
 909STATIC int
 910xfs_btree_readahead(
 911        struct xfs_btree_cur    *cur,           /* btree cursor */
 912        int                     lev,            /* level in btree */
 913        int                     lr)             /* left/right bits */
 914{
 915        struct xfs_btree_block  *block;
 916
 917        /*
 918         * No readahead needed if we are at the root level and the
 919         * btree root is stored in the inode.
 920         */
 921        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 922            (lev == cur->bc_nlevels - 1))
 923                return 0;
 924
 925        if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
 926                return 0;
 927
 928        cur->bc_ra[lev] |= lr;
 929        block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
 930
 931        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 932                return xfs_btree_readahead_lblock(cur, lr, block);
 933        return xfs_btree_readahead_sblock(cur, lr, block);
 934}
 935
 936STATIC int
 937xfs_btree_ptr_to_daddr(
 938        struct xfs_btree_cur    *cur,
 939        union xfs_btree_ptr     *ptr,
 940        xfs_daddr_t             *daddr)
 941{
 942        xfs_fsblock_t           fsbno;
 943        xfs_agblock_t           agbno;
 944        int                     error;
 945
 946        error = xfs_btree_check_ptr(cur, ptr, 0, 1);
 947        if (error)
 948                return error;
 949
 950        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 951                fsbno = be64_to_cpu(ptr->l);
 952                *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
 953        } else {
 954                agbno = be32_to_cpu(ptr->s);
 955                *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
 956                                agbno);
 957        }
 958
 959        return 0;
 960}
 961
 962/*
 963 * Readahead @count btree blocks at the given @ptr location.
 964 *
 965 * We don't need to care about long or short form btrees here as we have a
 966 * method of converting the ptr directly to a daddr available to us.
 967 */
 968STATIC void
 969xfs_btree_readahead_ptr(
 970        struct xfs_btree_cur    *cur,
 971        union xfs_btree_ptr     *ptr,
 972        xfs_extlen_t            count)
 973{
 974        xfs_daddr_t             daddr;
 975
 976        if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
 977                return;
 978        xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
 979                          cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
 980}
 981
 982/*
 983 * Set the buffer for level "lev" in the cursor to bp, releasing
 984 * any previous buffer.
 985 */
 986STATIC void
 987xfs_btree_setbuf(
 988        xfs_btree_cur_t         *cur,   /* btree cursor */
 989        int                     lev,    /* level in btree */
 990        struct xfs_buf          *bp)    /* new buffer to set */
 991{
 992        struct xfs_btree_block  *b;     /* btree block */
 993
 994        if (cur->bc_bufs[lev])
 995                xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
 996        cur->bc_bufs[lev] = bp;
 997        cur->bc_ra[lev] = 0;
 998
 999        b = XFS_BUF_TO_BLOCK(bp);
1000        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1001                if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1002                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1003                if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1004                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1005        } else {
1006                if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1007                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1008                if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1009                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1010        }
1011}
1012
1013bool
1014xfs_btree_ptr_is_null(
1015        struct xfs_btree_cur    *cur,
1016        union xfs_btree_ptr     *ptr)
1017{
1018        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1019                return ptr->l == cpu_to_be64(NULLFSBLOCK);
1020        else
1021                return ptr->s == cpu_to_be32(NULLAGBLOCK);
1022}
1023
1024void
1025xfs_btree_set_ptr_null(
1026        struct xfs_btree_cur    *cur,
1027        union xfs_btree_ptr     *ptr)
1028{
1029        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1030                ptr->l = cpu_to_be64(NULLFSBLOCK);
1031        else
1032                ptr->s = cpu_to_be32(NULLAGBLOCK);
1033}
1034
1035/*
1036 * Get/set/init sibling pointers
1037 */
1038void
1039xfs_btree_get_sibling(
1040        struct xfs_btree_cur    *cur,
1041        struct xfs_btree_block  *block,
1042        union xfs_btree_ptr     *ptr,
1043        int                     lr)
1044{
1045        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1046
1047        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1048                if (lr == XFS_BB_RIGHTSIB)
1049                        ptr->l = block->bb_u.l.bb_rightsib;
1050                else
1051                        ptr->l = block->bb_u.l.bb_leftsib;
1052        } else {
1053                if (lr == XFS_BB_RIGHTSIB)
1054                        ptr->s = block->bb_u.s.bb_rightsib;
1055                else
1056                        ptr->s = block->bb_u.s.bb_leftsib;
1057        }
1058}
1059
1060void
1061xfs_btree_set_sibling(
1062        struct xfs_btree_cur    *cur,
1063        struct xfs_btree_block  *block,
1064        union xfs_btree_ptr     *ptr,
1065        int                     lr)
1066{
1067        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1068
1069        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1070                if (lr == XFS_BB_RIGHTSIB)
1071                        block->bb_u.l.bb_rightsib = ptr->l;
1072                else
1073                        block->bb_u.l.bb_leftsib = ptr->l;
1074        } else {
1075                if (lr == XFS_BB_RIGHTSIB)
1076                        block->bb_u.s.bb_rightsib = ptr->s;
1077                else
1078                        block->bb_u.s.bb_leftsib = ptr->s;
1079        }
1080}
1081
1082void
1083xfs_btree_init_block_int(
1084        struct xfs_mount        *mp,
1085        struct xfs_btree_block  *buf,
1086        xfs_daddr_t             blkno,
1087        xfs_btnum_t             btnum,
1088        __u16                   level,
1089        __u16                   numrecs,
1090        __u64                   owner,
1091        unsigned int            flags)
1092{
1093        int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
1094        __u32                   magic = xfs_btree_magic(crc, btnum);
1095
1096        buf->bb_magic = cpu_to_be32(magic);
1097        buf->bb_level = cpu_to_be16(level);
1098        buf->bb_numrecs = cpu_to_be16(numrecs);
1099
1100        if (flags & XFS_BTREE_LONG_PTRS) {
1101                buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1102                buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1103                if (crc) {
1104                        buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1105                        buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1106                        uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1107                        buf->bb_u.l.bb_pad = 0;
1108                        buf->bb_u.l.bb_lsn = 0;
1109                }
1110        } else {
1111                /* owner is a 32 bit value on short blocks */
1112                __u32 __owner = (__u32)owner;
1113
1114                buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1115                buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1116                if (crc) {
1117                        buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1118                        buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1119                        uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1120                        buf->bb_u.s.bb_lsn = 0;
1121                }
1122        }
1123}
1124
1125void
1126xfs_btree_init_block(
1127        struct xfs_mount *mp,
1128        struct xfs_buf  *bp,
1129        xfs_btnum_t     btnum,
1130        __u16           level,
1131        __u16           numrecs,
1132        __u64           owner)
1133{
1134        xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1135                                 btnum, level, numrecs, owner, 0);
1136}
1137
1138void
1139xfs_btree_init_block_cur(
1140        struct xfs_btree_cur    *cur,
1141        struct xfs_buf          *bp,
1142        int                     level,
1143        int                     numrecs)
1144{
1145        __u64                   owner;
1146
1147        /*
1148         * we can pull the owner from the cursor right now as the different
1149         * owners align directly with the pointer size of the btree. This may
1150         * change in future, but is safe for current users of the generic btree
1151         * code.
1152         */
1153        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1154                owner = cur->bc_ino.ip->i_ino;
1155        else
1156                owner = cur->bc_ag.pag->pag_agno;
1157
1158        xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1159                                 cur->bc_btnum, level, numrecs,
1160                                 owner, cur->bc_flags);
1161}
1162
1163/*
1164 * Return true if ptr is the last record in the btree and
1165 * we need to track updates to this record.  The decision
1166 * will be further refined in the update_lastrec method.
1167 */
1168STATIC int
1169xfs_btree_is_lastrec(
1170        struct xfs_btree_cur    *cur,
1171        struct xfs_btree_block  *block,
1172        int                     level)
1173{
1174        union xfs_btree_ptr     ptr;
1175
1176        if (level > 0)
1177                return 0;
1178        if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1179                return 0;
1180
1181        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1182        if (!xfs_btree_ptr_is_null(cur, &ptr))
1183                return 0;
1184        return 1;
1185}
1186
1187STATIC void
1188xfs_btree_buf_to_ptr(
1189        struct xfs_btree_cur    *cur,
1190        struct xfs_buf          *bp,
1191        union xfs_btree_ptr     *ptr)
1192{
1193        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1194                ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1195                                        XFS_BUF_ADDR(bp)));
1196        else {
1197                ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1198                                        XFS_BUF_ADDR(bp)));
1199        }
1200}
1201
1202STATIC void
1203xfs_btree_set_refs(
1204        struct xfs_btree_cur    *cur,
1205        struct xfs_buf          *bp)
1206{
1207        switch (cur->bc_btnum) {
1208        case XFS_BTNUM_BNO:
1209        case XFS_BTNUM_CNT:
1210                xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1211                break;
1212        case XFS_BTNUM_INO:
1213        case XFS_BTNUM_FINO:
1214                xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1215                break;
1216        case XFS_BTNUM_BMAP:
1217                xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1218                break;
1219        case XFS_BTNUM_RMAP:
1220                xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
1221                break;
1222        case XFS_BTNUM_REFC:
1223                xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
1224                break;
1225        default:
1226                ASSERT(0);
1227        }
1228}
1229
1230int
1231xfs_btree_get_buf_block(
1232        struct xfs_btree_cur    *cur,
1233        union xfs_btree_ptr     *ptr,
1234        struct xfs_btree_block  **block,
1235        struct xfs_buf          **bpp)
1236{
1237        struct xfs_mount        *mp = cur->bc_mp;
1238        xfs_daddr_t             d;
1239        int                     error;
1240
1241        error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1242        if (error)
1243                return error;
1244        error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
1245                        0, bpp);
1246        if (error)
1247                return error;
1248
1249        (*bpp)->b_ops = cur->bc_ops->buf_ops;
1250        *block = XFS_BUF_TO_BLOCK(*bpp);
1251        return 0;
1252}
1253
1254/*
1255 * Read in the buffer at the given ptr and return the buffer and
1256 * the block pointer within the buffer.
1257 */
1258STATIC int
1259xfs_btree_read_buf_block(
1260        struct xfs_btree_cur    *cur,
1261        union xfs_btree_ptr     *ptr,
1262        int                     flags,
1263        struct xfs_btree_block  **block,
1264        struct xfs_buf          **bpp)
1265{
1266        struct xfs_mount        *mp = cur->bc_mp;
1267        xfs_daddr_t             d;
1268        int                     error;
1269
1270        /* need to sort out how callers deal with failures first */
1271        ASSERT(!(flags & XBF_TRYLOCK));
1272
1273        error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1274        if (error)
1275                return error;
1276        error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1277                                   mp->m_bsize, flags, bpp,
1278                                   cur->bc_ops->buf_ops);
1279        if (error)
1280                return error;
1281
1282        xfs_btree_set_refs(cur, *bpp);
1283        *block = XFS_BUF_TO_BLOCK(*bpp);
1284        return 0;
1285}
1286
1287/*
1288 * Copy keys from one btree block to another.
1289 */
1290void
1291xfs_btree_copy_keys(
1292        struct xfs_btree_cur    *cur,
1293        union xfs_btree_key     *dst_key,
1294        union xfs_btree_key     *src_key,
1295        int                     numkeys)
1296{
1297        ASSERT(numkeys >= 0);
1298        memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1299}
1300
1301/*
1302 * Copy records from one btree block to another.
1303 */
1304STATIC void
1305xfs_btree_copy_recs(
1306        struct xfs_btree_cur    *cur,
1307        union xfs_btree_rec     *dst_rec,
1308        union xfs_btree_rec     *src_rec,
1309        int                     numrecs)
1310{
1311        ASSERT(numrecs >= 0);
1312        memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1313}
1314
1315/*
1316 * Copy block pointers from one btree block to another.
1317 */
1318void
1319xfs_btree_copy_ptrs(
1320        struct xfs_btree_cur    *cur,
1321        union xfs_btree_ptr     *dst_ptr,
1322        const union xfs_btree_ptr *src_ptr,
1323        int                     numptrs)
1324{
1325        ASSERT(numptrs >= 0);
1326        memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1327}
1328
1329/*
1330 * Shift keys one index left/right inside a single btree block.
1331 */
1332STATIC void
1333xfs_btree_shift_keys(
1334        struct xfs_btree_cur    *cur,
1335        union xfs_btree_key     *key,
1336        int                     dir,
1337        int                     numkeys)
1338{
1339        char                    *dst_key;
1340
1341        ASSERT(numkeys >= 0);
1342        ASSERT(dir == 1 || dir == -1);
1343
1344        dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1345        memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1346}
1347
1348/*
1349 * Shift records one index left/right inside a single btree block.
1350 */
1351STATIC void
1352xfs_btree_shift_recs(
1353        struct xfs_btree_cur    *cur,
1354        union xfs_btree_rec     *rec,
1355        int                     dir,
1356        int                     numrecs)
1357{
1358        char                    *dst_rec;
1359
1360        ASSERT(numrecs >= 0);
1361        ASSERT(dir == 1 || dir == -1);
1362
1363        dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1364        memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1365}
1366
1367/*
1368 * Shift block pointers one index left/right inside a single btree block.
1369 */
1370STATIC void
1371xfs_btree_shift_ptrs(
1372        struct xfs_btree_cur    *cur,
1373        union xfs_btree_ptr     *ptr,
1374        int                     dir,
1375        int                     numptrs)
1376{
1377        char                    *dst_ptr;
1378
1379        ASSERT(numptrs >= 0);
1380        ASSERT(dir == 1 || dir == -1);
1381
1382        dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1383        memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1384}
1385
1386/*
1387 * Log key values from the btree block.
1388 */
1389STATIC void
1390xfs_btree_log_keys(
1391        struct xfs_btree_cur    *cur,
1392        struct xfs_buf          *bp,
1393        int                     first,
1394        int                     last)
1395{
1396
1397        if (bp) {
1398                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1399                xfs_trans_log_buf(cur->bc_tp, bp,
1400                                  xfs_btree_key_offset(cur, first),
1401                                  xfs_btree_key_offset(cur, last + 1) - 1);
1402        } else {
1403                xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1404                                xfs_ilog_fbroot(cur->bc_ino.whichfork));
1405        }
1406}
1407
1408/*
1409 * Log record values from the btree block.
1410 */
1411void
1412xfs_btree_log_recs(
1413        struct xfs_btree_cur    *cur,
1414        struct xfs_buf          *bp,
1415        int                     first,
1416        int                     last)
1417{
1418
1419        xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1420        xfs_trans_log_buf(cur->bc_tp, bp,
1421                          xfs_btree_rec_offset(cur, first),
1422                          xfs_btree_rec_offset(cur, last + 1) - 1);
1423
1424}
1425
1426/*
1427 * Log block pointer fields from a btree block (nonleaf).
1428 */
1429STATIC void
1430xfs_btree_log_ptrs(
1431        struct xfs_btree_cur    *cur,   /* btree cursor */
1432        struct xfs_buf          *bp,    /* buffer containing btree block */
1433        int                     first,  /* index of first pointer to log */
1434        int                     last)   /* index of last pointer to log */
1435{
1436
1437        if (bp) {
1438                struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1439                int                     level = xfs_btree_get_level(block);
1440
1441                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1442                xfs_trans_log_buf(cur->bc_tp, bp,
1443                                xfs_btree_ptr_offset(cur, first, level),
1444                                xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1445        } else {
1446                xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1447                        xfs_ilog_fbroot(cur->bc_ino.whichfork));
1448        }
1449
1450}
1451
1452/*
1453 * Log fields from a btree block header.
1454 */
1455void
1456xfs_btree_log_block(
1457        struct xfs_btree_cur    *cur,   /* btree cursor */
1458        struct xfs_buf          *bp,    /* buffer containing btree block */
1459        int                     fields) /* mask of fields: XFS_BB_... */
1460{
1461        int                     first;  /* first byte offset logged */
1462        int                     last;   /* last byte offset logged */
1463        static const short      soffsets[] = {  /* table of offsets (short) */
1464                offsetof(struct xfs_btree_block, bb_magic),
1465                offsetof(struct xfs_btree_block, bb_level),
1466                offsetof(struct xfs_btree_block, bb_numrecs),
1467                offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1468                offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1469                offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1470                offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1471                offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1472                offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1473                offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1474                XFS_BTREE_SBLOCK_CRC_LEN
1475        };
1476        static const short      loffsets[] = {  /* table of offsets (long) */
1477                offsetof(struct xfs_btree_block, bb_magic),
1478                offsetof(struct xfs_btree_block, bb_level),
1479                offsetof(struct xfs_btree_block, bb_numrecs),
1480                offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1481                offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1482                offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1483                offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1484                offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1485                offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1486                offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1487                offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1488                XFS_BTREE_LBLOCK_CRC_LEN
1489        };
1490
1491        if (bp) {
1492                int nbits;
1493
1494                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1495                        /*
1496                         * We don't log the CRC when updating a btree
1497                         * block but instead recreate it during log
1498                         * recovery.  As the log buffers have checksums
1499                         * of their own this is safe and avoids logging a crc
1500                         * update in a lot of places.
1501                         */
1502                        if (fields == XFS_BB_ALL_BITS)
1503                                fields = XFS_BB_ALL_BITS_CRC;
1504                        nbits = XFS_BB_NUM_BITS_CRC;
1505                } else {
1506                        nbits = XFS_BB_NUM_BITS;
1507                }
1508                xfs_btree_offsets(fields,
1509                                  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1510                                        loffsets : soffsets,
1511                                  nbits, &first, &last);
1512                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1513                xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1514        } else {
1515                xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1516                        xfs_ilog_fbroot(cur->bc_ino.whichfork));
1517        }
1518}
1519
1520/*
1521 * Increment cursor by one record at the level.
1522 * For nonzero levels the leaf-ward information is untouched.
1523 */
1524int                                             /* error */
1525xfs_btree_increment(
1526        struct xfs_btree_cur    *cur,
1527        int                     level,
1528        int                     *stat)          /* success/failure */
1529{
1530        struct xfs_btree_block  *block;
1531        union xfs_btree_ptr     ptr;
1532        struct xfs_buf          *bp;
1533        int                     error;          /* error return value */
1534        int                     lev;
1535
1536        ASSERT(level < cur->bc_nlevels);
1537
1538        /* Read-ahead to the right at this level. */
1539        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1540
1541        /* Get a pointer to the btree block. */
1542        block = xfs_btree_get_block(cur, level, &bp);
1543
1544#ifdef DEBUG
1545        error = xfs_btree_check_block(cur, block, level, bp);
1546        if (error)
1547                goto error0;
1548#endif
1549
1550        /* We're done if we remain in the block after the increment. */
1551        if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1552                goto out1;
1553
1554        /* Fail if we just went off the right edge of the tree. */
1555        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1556        if (xfs_btree_ptr_is_null(cur, &ptr))
1557                goto out0;
1558
1559        XFS_BTREE_STATS_INC(cur, increment);
1560
1561        /*
1562         * March up the tree incrementing pointers.
1563         * Stop when we don't go off the right edge of a block.
1564         */
1565        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1566                block = xfs_btree_get_block(cur, lev, &bp);
1567
1568#ifdef DEBUG
1569                error = xfs_btree_check_block(cur, block, lev, bp);
1570                if (error)
1571                        goto error0;
1572#endif
1573
1574                if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1575                        break;
1576
1577                /* Read-ahead the right block for the next loop. */
1578                xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1579        }
1580
1581        /*
1582         * If we went off the root then we are either seriously
1583         * confused or have the tree root in an inode.
1584         */
1585        if (lev == cur->bc_nlevels) {
1586                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1587                        goto out0;
1588                ASSERT(0);
1589                error = -EFSCORRUPTED;
1590                goto error0;
1591        }
1592        ASSERT(lev < cur->bc_nlevels);
1593
1594        /*
1595         * Now walk back down the tree, fixing up the cursor's buffer
1596         * pointers and key numbers.
1597         */
1598        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1599                union xfs_btree_ptr     *ptrp;
1600
1601                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1602                --lev;
1603                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1604                if (error)
1605                        goto error0;
1606
1607                xfs_btree_setbuf(cur, lev, bp);
1608                cur->bc_ptrs[lev] = 1;
1609        }
1610out1:
1611        *stat = 1;
1612        return 0;
1613
1614out0:
1615        *stat = 0;
1616        return 0;
1617
1618error0:
1619        return error;
1620}
1621
1622/*
1623 * Decrement cursor by one record at the level.
1624 * For nonzero levels the leaf-ward information is untouched.
1625 */
1626int                                             /* error */
1627xfs_btree_decrement(
1628        struct xfs_btree_cur    *cur,
1629        int                     level,
1630        int                     *stat)          /* success/failure */
1631{
1632        struct xfs_btree_block  *block;
1633        struct xfs_buf          *bp;
1634        int                     error;          /* error return value */
1635        int                     lev;
1636        union xfs_btree_ptr     ptr;
1637
1638        ASSERT(level < cur->bc_nlevels);
1639
1640        /* Read-ahead to the left at this level. */
1641        xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1642
1643        /* We're done if we remain in the block after the decrement. */
1644        if (--cur->bc_ptrs[level] > 0)
1645                goto out1;
1646
1647        /* Get a pointer to the btree block. */
1648        block = xfs_btree_get_block(cur, level, &bp);
1649
1650#ifdef DEBUG
1651        error = xfs_btree_check_block(cur, block, level, bp);
1652        if (error)
1653                goto error0;
1654#endif
1655
1656        /* Fail if we just went off the left edge of the tree. */
1657        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1658        if (xfs_btree_ptr_is_null(cur, &ptr))
1659                goto out0;
1660
1661        XFS_BTREE_STATS_INC(cur, decrement);
1662
1663        /*
1664         * March up the tree decrementing pointers.
1665         * Stop when we don't go off the left edge of a block.
1666         */
1667        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1668                if (--cur->bc_ptrs[lev] > 0)
1669                        break;
1670                /* Read-ahead the left block for the next loop. */
1671                xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1672        }
1673
1674        /*
1675         * If we went off the root then we are seriously confused.
1676         * or the root of the tree is in an inode.
1677         */
1678        if (lev == cur->bc_nlevels) {
1679                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1680                        goto out0;
1681                ASSERT(0);
1682                error = -EFSCORRUPTED;
1683                goto error0;
1684        }
1685        ASSERT(lev < cur->bc_nlevels);
1686
1687        /*
1688         * Now walk back down the tree, fixing up the cursor's buffer
1689         * pointers and key numbers.
1690         */
1691        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1692                union xfs_btree_ptr     *ptrp;
1693
1694                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1695                --lev;
1696                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1697                if (error)
1698                        goto error0;
1699                xfs_btree_setbuf(cur, lev, bp);
1700                cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1701        }
1702out1:
1703        *stat = 1;
1704        return 0;
1705
1706out0:
1707        *stat = 0;
1708        return 0;
1709
1710error0:
1711        return error;
1712}
1713
1714int
1715xfs_btree_lookup_get_block(
1716        struct xfs_btree_cur    *cur,   /* btree cursor */
1717        int                     level,  /* level in the btree */
1718        union xfs_btree_ptr     *pp,    /* ptr to btree block */
1719        struct xfs_btree_block  **blkp) /* return btree block */
1720{
1721        struct xfs_buf          *bp;    /* buffer pointer for btree block */
1722        xfs_daddr_t             daddr;
1723        int                     error = 0;
1724
1725        /* special case the root block if in an inode */
1726        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1727            (level == cur->bc_nlevels - 1)) {
1728                *blkp = xfs_btree_get_iroot(cur);
1729                return 0;
1730        }
1731
1732        /*
1733         * If the old buffer at this level for the disk address we are
1734         * looking for re-use it.
1735         *
1736         * Otherwise throw it away and get a new one.
1737         */
1738        bp = cur->bc_bufs[level];
1739        error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
1740        if (error)
1741                return error;
1742        if (bp && XFS_BUF_ADDR(bp) == daddr) {
1743                *blkp = XFS_BUF_TO_BLOCK(bp);
1744                return 0;
1745        }
1746
1747        error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1748        if (error)
1749                return error;
1750
1751        /* Check the inode owner since the verifiers don't. */
1752        if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
1753            !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
1754            (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
1755            be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
1756                        cur->bc_ino.ip->i_ino)
1757                goto out_bad;
1758
1759        /* Did we get the level we were looking for? */
1760        if (be16_to_cpu((*blkp)->bb_level) != level)
1761                goto out_bad;
1762
1763        /* Check that internal nodes have at least one record. */
1764        if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
1765                goto out_bad;
1766
1767        xfs_btree_setbuf(cur, level, bp);
1768        return 0;
1769
1770out_bad:
1771        *blkp = NULL;
1772        xfs_buf_mark_corrupt(bp);
1773        xfs_trans_brelse(cur->bc_tp, bp);
1774        return -EFSCORRUPTED;
1775}
1776
1777/*
1778 * Get current search key.  For level 0 we don't actually have a key
1779 * structure so we make one up from the record.  For all other levels
1780 * we just return the right key.
1781 */
1782STATIC union xfs_btree_key *
1783xfs_lookup_get_search_key(
1784        struct xfs_btree_cur    *cur,
1785        int                     level,
1786        int                     keyno,
1787        struct xfs_btree_block  *block,
1788        union xfs_btree_key     *kp)
1789{
1790        if (level == 0) {
1791                cur->bc_ops->init_key_from_rec(kp,
1792                                xfs_btree_rec_addr(cur, keyno, block));
1793                return kp;
1794        }
1795
1796        return xfs_btree_key_addr(cur, keyno, block);
1797}
1798
1799/*
1800 * Lookup the record.  The cursor is made to point to it, based on dir.
1801 * stat is set to 0 if can't find any such record, 1 for success.
1802 */
1803int                                     /* error */
1804xfs_btree_lookup(
1805        struct xfs_btree_cur    *cur,   /* btree cursor */
1806        xfs_lookup_t            dir,    /* <=, ==, or >= */
1807        int                     *stat)  /* success/failure */
1808{
1809        struct xfs_btree_block  *block; /* current btree block */
1810        int64_t                 diff;   /* difference for the current key */
1811        int                     error;  /* error return value */
1812        int                     keyno;  /* current key number */
1813        int                     level;  /* level in the btree */
1814        union xfs_btree_ptr     *pp;    /* ptr to btree block */
1815        union xfs_btree_ptr     ptr;    /* ptr to btree block */
1816
1817        XFS_BTREE_STATS_INC(cur, lookup);
1818
1819        /* No such thing as a zero-level tree. */
1820        if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0))
1821                return -EFSCORRUPTED;
1822
1823        block = NULL;
1824        keyno = 0;
1825
1826        /* initialise start pointer from cursor */
1827        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1828        pp = &ptr;
1829
1830        /*
1831         * Iterate over each level in the btree, starting at the root.
1832         * For each level above the leaves, find the key we need, based
1833         * on the lookup record, then follow the corresponding block
1834         * pointer down to the next level.
1835         */
1836        for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1837                /* Get the block we need to do the lookup on. */
1838                error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1839                if (error)
1840                        goto error0;
1841
1842                if (diff == 0) {
1843                        /*
1844                         * If we already had a key match at a higher level, we
1845                         * know we need to use the first entry in this block.
1846                         */
1847                        keyno = 1;
1848                } else {
1849                        /* Otherwise search this block. Do a binary search. */
1850
1851                        int     high;   /* high entry number */
1852                        int     low;    /* low entry number */
1853
1854                        /* Set low and high entry numbers, 1-based. */
1855                        low = 1;
1856                        high = xfs_btree_get_numrecs(block);
1857                        if (!high) {
1858                                /* Block is empty, must be an empty leaf. */
1859                                if (level != 0 || cur->bc_nlevels != 1) {
1860                                        XFS_CORRUPTION_ERROR(__func__,
1861                                                        XFS_ERRLEVEL_LOW,
1862                                                        cur->bc_mp, block,
1863                                                        sizeof(*block));
1864                                        return -EFSCORRUPTED;
1865                                }
1866
1867                                cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1868                                *stat = 0;
1869                                return 0;
1870                        }
1871
1872                        /* Binary search the block. */
1873                        while (low <= high) {
1874                                union xfs_btree_key     key;
1875                                union xfs_btree_key     *kp;
1876
1877                                XFS_BTREE_STATS_INC(cur, compare);
1878
1879                                /* keyno is average of low and high. */
1880                                keyno = (low + high) >> 1;
1881
1882                                /* Get current search key */
1883                                kp = xfs_lookup_get_search_key(cur, level,
1884                                                keyno, block, &key);
1885
1886                                /*
1887                                 * Compute difference to get next direction:
1888                                 *  - less than, move right
1889                                 *  - greater than, move left
1890                                 *  - equal, we're done
1891                                 */
1892                                diff = cur->bc_ops->key_diff(cur, kp);
1893                                if (diff < 0)
1894                                        low = keyno + 1;
1895                                else if (diff > 0)
1896                                        high = keyno - 1;
1897                                else
1898                                        break;
1899                        }
1900                }
1901
1902                /*
1903                 * If there are more levels, set up for the next level
1904                 * by getting the block number and filling in the cursor.
1905                 */
1906                if (level > 0) {
1907                        /*
1908                         * If we moved left, need the previous key number,
1909                         * unless there isn't one.
1910                         */
1911                        if (diff > 0 && --keyno < 1)
1912                                keyno = 1;
1913                        pp = xfs_btree_ptr_addr(cur, keyno, block);
1914
1915                        error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
1916                        if (error)
1917                                goto error0;
1918
1919                        cur->bc_ptrs[level] = keyno;
1920                }
1921        }
1922
1923        /* Done with the search. See if we need to adjust the results. */
1924        if (dir != XFS_LOOKUP_LE && diff < 0) {
1925                keyno++;
1926                /*
1927                 * If ge search and we went off the end of the block, but it's
1928                 * not the last block, we're in the wrong block.
1929                 */
1930                xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1931                if (dir == XFS_LOOKUP_GE &&
1932                    keyno > xfs_btree_get_numrecs(block) &&
1933                    !xfs_btree_ptr_is_null(cur, &ptr)) {
1934                        int     i;
1935
1936                        cur->bc_ptrs[0] = keyno;
1937                        error = xfs_btree_increment(cur, 0, &i);
1938                        if (error)
1939                                goto error0;
1940                        if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
1941                                return -EFSCORRUPTED;
1942                        *stat = 1;
1943                        return 0;
1944                }
1945        } else if (dir == XFS_LOOKUP_LE && diff > 0)
1946                keyno--;
1947        cur->bc_ptrs[0] = keyno;
1948
1949        /* Return if we succeeded or not. */
1950        if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1951                *stat = 0;
1952        else if (dir != XFS_LOOKUP_EQ || diff == 0)
1953                *stat = 1;
1954        else
1955                *stat = 0;
1956        return 0;
1957
1958error0:
1959        return error;
1960}
1961
1962/* Find the high key storage area from a regular key. */
1963union xfs_btree_key *
1964xfs_btree_high_key_from_key(
1965        struct xfs_btree_cur    *cur,
1966        union xfs_btree_key     *key)
1967{
1968        ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
1969        return (union xfs_btree_key *)((char *)key +
1970                        (cur->bc_ops->key_len / 2));
1971}
1972
1973/* Determine the low (and high if overlapped) keys of a leaf block */
1974STATIC void
1975xfs_btree_get_leaf_keys(
1976        struct xfs_btree_cur    *cur,
1977        struct xfs_btree_block  *block,
1978        union xfs_btree_key     *key)
1979{
1980        union xfs_btree_key     max_hkey;
1981        union xfs_btree_key     hkey;
1982        union xfs_btree_rec     *rec;
1983        union xfs_btree_key     *high;
1984        int                     n;
1985
1986        rec = xfs_btree_rec_addr(cur, 1, block);
1987        cur->bc_ops->init_key_from_rec(key, rec);
1988
1989        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
1990
1991                cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
1992                for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
1993                        rec = xfs_btree_rec_addr(cur, n, block);
1994                        cur->bc_ops->init_high_key_from_rec(&hkey, rec);
1995                        if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
1996                                        > 0)
1997                                max_hkey = hkey;
1998                }
1999
2000                high = xfs_btree_high_key_from_key(cur, key);
2001                memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2002        }
2003}
2004
2005/* Determine the low (and high if overlapped) keys of a node block */
2006STATIC void
2007xfs_btree_get_node_keys(
2008        struct xfs_btree_cur    *cur,
2009        struct xfs_btree_block  *block,
2010        union xfs_btree_key     *key)
2011{
2012        union xfs_btree_key     *hkey;
2013        union xfs_btree_key     *max_hkey;
2014        union xfs_btree_key     *high;
2015        int                     n;
2016
2017        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2018                memcpy(key, xfs_btree_key_addr(cur, 1, block),
2019                                cur->bc_ops->key_len / 2);
2020
2021                max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2022                for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2023                        hkey = xfs_btree_high_key_addr(cur, n, block);
2024                        if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
2025                                max_hkey = hkey;
2026                }
2027
2028                high = xfs_btree_high_key_from_key(cur, key);
2029                memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2030        } else {
2031                memcpy(key, xfs_btree_key_addr(cur, 1, block),
2032                                cur->bc_ops->key_len);
2033        }
2034}
2035
2036/* Derive the keys for any btree block. */
2037void
2038xfs_btree_get_keys(
2039        struct xfs_btree_cur    *cur,
2040        struct xfs_btree_block  *block,
2041        union xfs_btree_key     *key)
2042{
2043        if (be16_to_cpu(block->bb_level) == 0)
2044                xfs_btree_get_leaf_keys(cur, block, key);
2045        else
2046                xfs_btree_get_node_keys(cur, block, key);
2047}
2048
2049/*
2050 * Decide if we need to update the parent keys of a btree block.  For
2051 * a standard btree this is only necessary if we're updating the first
2052 * record/key.  For an overlapping btree, we must always update the
2053 * keys because the highest key can be in any of the records or keys
2054 * in the block.
2055 */
2056static inline bool
2057xfs_btree_needs_key_update(
2058        struct xfs_btree_cur    *cur,
2059        int                     ptr)
2060{
2061        return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
2062}
2063
2064/*
2065 * Update the low and high parent keys of the given level, progressing
2066 * towards the root.  If force_all is false, stop if the keys for a given
2067 * level do not need updating.
2068 */
2069STATIC int
2070__xfs_btree_updkeys(
2071        struct xfs_btree_cur    *cur,
2072        int                     level,
2073        struct xfs_btree_block  *block,
2074        struct xfs_buf          *bp0,
2075        bool                    force_all)
2076{
2077        union xfs_btree_key     key;    /* keys from current level */
2078        union xfs_btree_key     *lkey;  /* keys from the next level up */
2079        union xfs_btree_key     *hkey;
2080        union xfs_btree_key     *nlkey; /* keys from the next level up */
2081        union xfs_btree_key     *nhkey;
2082        struct xfs_buf          *bp;
2083        int                     ptr;
2084
2085        ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2086
2087        /* Exit if there aren't any parent levels to update. */
2088        if (level + 1 >= cur->bc_nlevels)
2089                return 0;
2090
2091        trace_xfs_btree_updkeys(cur, level, bp0);
2092
2093        lkey = &key;
2094        hkey = xfs_btree_high_key_from_key(cur, lkey);
2095        xfs_btree_get_keys(cur, block, lkey);
2096        for (level++; level < cur->bc_nlevels; level++) {
2097#ifdef DEBUG
2098                int             error;
2099#endif
2100                block = xfs_btree_get_block(cur, level, &bp);
2101                trace_xfs_btree_updkeys(cur, level, bp);
2102#ifdef DEBUG
2103                error = xfs_btree_check_block(cur, block, level, bp);
2104                if (error)
2105                        return error;
2106#endif
2107                ptr = cur->bc_ptrs[level];
2108                nlkey = xfs_btree_key_addr(cur, ptr, block);
2109                nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2110                if (!force_all &&
2111                    !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
2112                      cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
2113                        break;
2114                xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2115                xfs_btree_log_keys(cur, bp, ptr, ptr);
2116                if (level + 1 >= cur->bc_nlevels)
2117                        break;
2118                xfs_btree_get_node_keys(cur, block, lkey);
2119        }
2120
2121        return 0;
2122}
2123
2124/* Update all the keys from some level in cursor back to the root. */
2125STATIC int
2126xfs_btree_updkeys_force(
2127        struct xfs_btree_cur    *cur,
2128        int                     level)
2129{
2130        struct xfs_buf          *bp;
2131        struct xfs_btree_block  *block;
2132
2133        block = xfs_btree_get_block(cur, level, &bp);
2134        return __xfs_btree_updkeys(cur, level, block, bp, true);
2135}
2136
2137/*
2138 * Update the parent keys of the given level, progressing towards the root.
2139 */
2140STATIC int
2141xfs_btree_update_keys(
2142        struct xfs_btree_cur    *cur,
2143        int                     level)
2144{
2145        struct xfs_btree_block  *block;
2146        struct xfs_buf          *bp;
2147        union xfs_btree_key     *kp;
2148        union xfs_btree_key     key;
2149        int                     ptr;
2150
2151        ASSERT(level >= 0);
2152
2153        block = xfs_btree_get_block(cur, level, &bp);
2154        if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
2155                return __xfs_btree_updkeys(cur, level, block, bp, false);
2156
2157        /*
2158         * Go up the tree from this level toward the root.
2159         * At each level, update the key value to the value input.
2160         * Stop when we reach a level where the cursor isn't pointing
2161         * at the first entry in the block.
2162         */
2163        xfs_btree_get_keys(cur, block, &key);
2164        for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2165#ifdef DEBUG
2166                int             error;
2167#endif
2168                block = xfs_btree_get_block(cur, level, &bp);
2169#ifdef DEBUG
2170                error = xfs_btree_check_block(cur, block, level, bp);
2171                if (error)
2172                        return error;
2173#endif
2174                ptr = cur->bc_ptrs[level];
2175                kp = xfs_btree_key_addr(cur, ptr, block);
2176                xfs_btree_copy_keys(cur, kp, &key, 1);
2177                xfs_btree_log_keys(cur, bp, ptr, ptr);
2178        }
2179
2180        return 0;
2181}
2182
2183/*
2184 * Update the record referred to by cur to the value in the
2185 * given record. This either works (return 0) or gets an
2186 * EFSCORRUPTED error.
2187 */
2188int
2189xfs_btree_update(
2190        struct xfs_btree_cur    *cur,
2191        union xfs_btree_rec     *rec)
2192{
2193        struct xfs_btree_block  *block;
2194        struct xfs_buf          *bp;
2195        int                     error;
2196        int                     ptr;
2197        union xfs_btree_rec     *rp;
2198
2199        /* Pick up the current block. */
2200        block = xfs_btree_get_block(cur, 0, &bp);
2201
2202#ifdef DEBUG
2203        error = xfs_btree_check_block(cur, block, 0, bp);
2204        if (error)
2205                goto error0;
2206#endif
2207        /* Get the address of the rec to be updated. */
2208        ptr = cur->bc_ptrs[0];
2209        rp = xfs_btree_rec_addr(cur, ptr, block);
2210
2211        /* Fill in the new contents and log them. */
2212        xfs_btree_copy_recs(cur, rp, rec, 1);
2213        xfs_btree_log_recs(cur, bp, ptr, ptr);
2214
2215        /*
2216         * If we are tracking the last record in the tree and
2217         * we are at the far right edge of the tree, update it.
2218         */
2219        if (xfs_btree_is_lastrec(cur, block, 0)) {
2220                cur->bc_ops->update_lastrec(cur, block, rec,
2221                                            ptr, LASTREC_UPDATE);
2222        }
2223
2224        /* Pass new key value up to our parent. */
2225        if (xfs_btree_needs_key_update(cur, ptr)) {
2226                error = xfs_btree_update_keys(cur, 0);
2227                if (error)
2228                        goto error0;
2229        }
2230
2231        return 0;
2232
2233error0:
2234        return error;
2235}
2236
2237/*
2238 * Move 1 record left from cur/level if possible.
2239 * Update cur to reflect the new path.
2240 */
2241STATIC int                                      /* error */
2242xfs_btree_lshift(
2243        struct xfs_btree_cur    *cur,
2244        int                     level,
2245        int                     *stat)          /* success/failure */
2246{
2247        struct xfs_buf          *lbp;           /* left buffer pointer */
2248        struct xfs_btree_block  *left;          /* left btree block */
2249        int                     lrecs;          /* left record count */
2250        struct xfs_buf          *rbp;           /* right buffer pointer */
2251        struct xfs_btree_block  *right;         /* right btree block */
2252        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2253        int                     rrecs;          /* right record count */
2254        union xfs_btree_ptr     lptr;           /* left btree pointer */
2255        union xfs_btree_key     *rkp = NULL;    /* right btree key */
2256        union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
2257        union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
2258        int                     error;          /* error return value */
2259        int                     i;
2260
2261        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2262            level == cur->bc_nlevels - 1)
2263                goto out0;
2264
2265        /* Set up variables for this block as "right". */
2266        right = xfs_btree_get_block(cur, level, &rbp);
2267
2268#ifdef DEBUG
2269        error = xfs_btree_check_block(cur, right, level, rbp);
2270        if (error)
2271                goto error0;
2272#endif
2273
2274        /* If we've got no left sibling then we can't shift an entry left. */
2275        xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2276        if (xfs_btree_ptr_is_null(cur, &lptr))
2277                goto out0;
2278
2279        /*
2280         * If the cursor entry is the one that would be moved, don't
2281         * do it... it's too complicated.
2282         */
2283        if (cur->bc_ptrs[level] <= 1)
2284                goto out0;
2285
2286        /* Set up the left neighbor as "left". */
2287        error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2288        if (error)
2289                goto error0;
2290
2291        /* If it's full, it can't take another entry. */
2292        lrecs = xfs_btree_get_numrecs(left);
2293        if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2294                goto out0;
2295
2296        rrecs = xfs_btree_get_numrecs(right);
2297
2298        /*
2299         * We add one entry to the left side and remove one for the right side.
2300         * Account for it here, the changes will be updated on disk and logged
2301         * later.
2302         */
2303        lrecs++;
2304        rrecs--;
2305
2306        XFS_BTREE_STATS_INC(cur, lshift);
2307        XFS_BTREE_STATS_ADD(cur, moves, 1);
2308
2309        /*
2310         * If non-leaf, copy a key and a ptr to the left block.
2311         * Log the changes to the left block.
2312         */
2313        if (level > 0) {
2314                /* It's a non-leaf.  Move keys and pointers. */
2315                union xfs_btree_key     *lkp;   /* left btree key */
2316                union xfs_btree_ptr     *lpp;   /* left address pointer */
2317
2318                lkp = xfs_btree_key_addr(cur, lrecs, left);
2319                rkp = xfs_btree_key_addr(cur, 1, right);
2320
2321                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2322                rpp = xfs_btree_ptr_addr(cur, 1, right);
2323
2324                error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
2325                if (error)
2326                        goto error0;
2327
2328                xfs_btree_copy_keys(cur, lkp, rkp, 1);
2329                xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2330
2331                xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2332                xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2333
2334                ASSERT(cur->bc_ops->keys_inorder(cur,
2335                        xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2336        } else {
2337                /* It's a leaf.  Move records.  */
2338                union xfs_btree_rec     *lrp;   /* left record pointer */
2339
2340                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2341                rrp = xfs_btree_rec_addr(cur, 1, right);
2342
2343                xfs_btree_copy_recs(cur, lrp, rrp, 1);
2344                xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2345
2346                ASSERT(cur->bc_ops->recs_inorder(cur,
2347                        xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2348        }
2349
2350        xfs_btree_set_numrecs(left, lrecs);
2351        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2352
2353        xfs_btree_set_numrecs(right, rrecs);
2354        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2355
2356        /*
2357         * Slide the contents of right down one entry.
2358         */
2359        XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2360        if (level > 0) {
2361                /* It's a nonleaf. operate on keys and ptrs */
2362                for (i = 0; i < rrecs; i++) {
2363                        error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
2364                        if (error)
2365                                goto error0;
2366                }
2367
2368                xfs_btree_shift_keys(cur,
2369                                xfs_btree_key_addr(cur, 2, right),
2370                                -1, rrecs);
2371                xfs_btree_shift_ptrs(cur,
2372                                xfs_btree_ptr_addr(cur, 2, right),
2373                                -1, rrecs);
2374
2375                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2376                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2377        } else {
2378                /* It's a leaf. operate on records */
2379                xfs_btree_shift_recs(cur,
2380                        xfs_btree_rec_addr(cur, 2, right),
2381                        -1, rrecs);
2382                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2383        }
2384
2385        /*
2386         * Using a temporary cursor, update the parent key values of the
2387         * block on the left.
2388         */
2389        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2390                error = xfs_btree_dup_cursor(cur, &tcur);
2391                if (error)
2392                        goto error0;
2393                i = xfs_btree_firstrec(tcur, level);
2394                if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2395                        error = -EFSCORRUPTED;
2396                        goto error0;
2397                }
2398
2399                error = xfs_btree_decrement(tcur, level, &i);
2400                if (error)
2401                        goto error1;
2402
2403                /* Update the parent high keys of the left block, if needed. */
2404                error = xfs_btree_update_keys(tcur, level);
2405                if (error)
2406                        goto error1;
2407
2408                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2409        }
2410
2411        /* Update the parent keys of the right block. */
2412        error = xfs_btree_update_keys(cur, level);
2413        if (error)
2414                goto error0;
2415
2416        /* Slide the cursor value left one. */
2417        cur->bc_ptrs[level]--;
2418
2419        *stat = 1;
2420        return 0;
2421
2422out0:
2423        *stat = 0;
2424        return 0;
2425
2426error0:
2427        return error;
2428
2429error1:
2430        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2431        return error;
2432}
2433
2434/*
2435 * Move 1 record right from cur/level if possible.
2436 * Update cur to reflect the new path.
2437 */
2438STATIC int                                      /* error */
2439xfs_btree_rshift(
2440        struct xfs_btree_cur    *cur,
2441        int                     level,
2442        int                     *stat)          /* success/failure */
2443{
2444        struct xfs_buf          *lbp;           /* left buffer pointer */
2445        struct xfs_btree_block  *left;          /* left btree block */
2446        struct xfs_buf          *rbp;           /* right buffer pointer */
2447        struct xfs_btree_block  *right;         /* right btree block */
2448        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2449        union xfs_btree_ptr     rptr;           /* right block pointer */
2450        union xfs_btree_key     *rkp;           /* right btree key */
2451        int                     rrecs;          /* right record count */
2452        int                     lrecs;          /* left record count */
2453        int                     error;          /* error return value */
2454        int                     i;              /* loop counter */
2455
2456        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2457            (level == cur->bc_nlevels - 1))
2458                goto out0;
2459
2460        /* Set up variables for this block as "left". */
2461        left = xfs_btree_get_block(cur, level, &lbp);
2462
2463#ifdef DEBUG
2464        error = xfs_btree_check_block(cur, left, level, lbp);
2465        if (error)
2466                goto error0;
2467#endif
2468
2469        /* If we've got no right sibling then we can't shift an entry right. */
2470        xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2471        if (xfs_btree_ptr_is_null(cur, &rptr))
2472                goto out0;
2473
2474        /*
2475         * If the cursor entry is the one that would be moved, don't
2476         * do it... it's too complicated.
2477         */
2478        lrecs = xfs_btree_get_numrecs(left);
2479        if (cur->bc_ptrs[level] >= lrecs)
2480                goto out0;
2481
2482        /* Set up the right neighbor as "right". */
2483        error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2484        if (error)
2485                goto error0;
2486
2487        /* If it's full, it can't take another entry. */
2488        rrecs = xfs_btree_get_numrecs(right);
2489        if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2490                goto out0;
2491
2492        XFS_BTREE_STATS_INC(cur, rshift);
2493        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2494
2495        /*
2496         * Make a hole at the start of the right neighbor block, then
2497         * copy the last left block entry to the hole.
2498         */
2499        if (level > 0) {
2500                /* It's a nonleaf. make a hole in the keys and ptrs */
2501                union xfs_btree_key     *lkp;
2502                union xfs_btree_ptr     *lpp;
2503                union xfs_btree_ptr     *rpp;
2504
2505                lkp = xfs_btree_key_addr(cur, lrecs, left);
2506                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2507                rkp = xfs_btree_key_addr(cur, 1, right);
2508                rpp = xfs_btree_ptr_addr(cur, 1, right);
2509
2510                for (i = rrecs - 1; i >= 0; i--) {
2511                        error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
2512                        if (error)
2513                                goto error0;
2514                }
2515
2516                xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2517                xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2518
2519                error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
2520                if (error)
2521                        goto error0;
2522
2523                /* Now put the new data in, and log it. */
2524                xfs_btree_copy_keys(cur, rkp, lkp, 1);
2525                xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2526
2527                xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2528                xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2529
2530                ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2531                        xfs_btree_key_addr(cur, 2, right)));
2532        } else {
2533                /* It's a leaf. make a hole in the records */
2534                union xfs_btree_rec     *lrp;
2535                union xfs_btree_rec     *rrp;
2536
2537                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2538                rrp = xfs_btree_rec_addr(cur, 1, right);
2539
2540                xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2541
2542                /* Now put the new data in, and log it. */
2543                xfs_btree_copy_recs(cur, rrp, lrp, 1);
2544                xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2545        }
2546
2547        /*
2548         * Decrement and log left's numrecs, bump and log right's numrecs.
2549         */
2550        xfs_btree_set_numrecs(left, --lrecs);
2551        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2552
2553        xfs_btree_set_numrecs(right, ++rrecs);
2554        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2555
2556        /*
2557         * Using a temporary cursor, update the parent key values of the
2558         * block on the right.
2559         */
2560        error = xfs_btree_dup_cursor(cur, &tcur);
2561        if (error)
2562                goto error0;
2563        i = xfs_btree_lastrec(tcur, level);
2564        if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2565                error = -EFSCORRUPTED;
2566                goto error0;
2567        }
2568
2569        error = xfs_btree_increment(tcur, level, &i);
2570        if (error)
2571                goto error1;
2572
2573        /* Update the parent high keys of the left block, if needed. */
2574        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2575                error = xfs_btree_update_keys(cur, level);
2576                if (error)
2577                        goto error1;
2578        }
2579
2580        /* Update the parent keys of the right block. */
2581        error = xfs_btree_update_keys(tcur, level);
2582        if (error)
2583                goto error1;
2584
2585        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2586
2587        *stat = 1;
2588        return 0;
2589
2590out0:
2591        *stat = 0;
2592        return 0;
2593
2594error0:
2595        return error;
2596
2597error1:
2598        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2599        return error;
2600}
2601
2602/*
2603 * Split cur/level block in half.
2604 * Return new block number and the key to its first
2605 * record (to be inserted into parent).
2606 */
2607STATIC int                                      /* error */
2608__xfs_btree_split(
2609        struct xfs_btree_cur    *cur,
2610        int                     level,
2611        union xfs_btree_ptr     *ptrp,
2612        union xfs_btree_key     *key,
2613        struct xfs_btree_cur    **curp,
2614        int                     *stat)          /* success/failure */
2615{
2616        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2617        struct xfs_buf          *lbp;           /* left buffer pointer */
2618        struct xfs_btree_block  *left;          /* left btree block */
2619        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2620        struct xfs_buf          *rbp;           /* right buffer pointer */
2621        struct xfs_btree_block  *right;         /* right btree block */
2622        union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2623        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2624        struct xfs_btree_block  *rrblock;       /* right-right btree block */
2625        int                     lrecs;
2626        int                     rrecs;
2627        int                     src_index;
2628        int                     error;          /* error return value */
2629        int                     i;
2630
2631        XFS_BTREE_STATS_INC(cur, split);
2632
2633        /* Set up left block (current one). */
2634        left = xfs_btree_get_block(cur, level, &lbp);
2635
2636#ifdef DEBUG
2637        error = xfs_btree_check_block(cur, left, level, lbp);
2638        if (error)
2639                goto error0;
2640#endif
2641
2642        xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2643
2644        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2645        error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2646        if (error)
2647                goto error0;
2648        if (*stat == 0)
2649                goto out0;
2650        XFS_BTREE_STATS_INC(cur, alloc);
2651
2652        /* Set up the new block as "right". */
2653        error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
2654        if (error)
2655                goto error0;
2656
2657        /* Fill in the btree header for the new right block. */
2658        xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2659
2660        /*
2661         * Split the entries between the old and the new block evenly.
2662         * Make sure that if there's an odd number of entries now, that
2663         * each new block will have the same number of entries.
2664         */
2665        lrecs = xfs_btree_get_numrecs(left);
2666        rrecs = lrecs / 2;
2667        if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2668                rrecs++;
2669        src_index = (lrecs - rrecs + 1);
2670
2671        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2672
2673        /* Adjust numrecs for the later get_*_keys() calls. */
2674        lrecs -= rrecs;
2675        xfs_btree_set_numrecs(left, lrecs);
2676        xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2677
2678        /*
2679         * Copy btree block entries from the left block over to the
2680         * new block, the right. Update the right block and log the
2681         * changes.
2682         */
2683        if (level > 0) {
2684                /* It's a non-leaf.  Move keys and pointers. */
2685                union xfs_btree_key     *lkp;   /* left btree key */
2686                union xfs_btree_ptr     *lpp;   /* left address pointer */
2687                union xfs_btree_key     *rkp;   /* right btree key */
2688                union xfs_btree_ptr     *rpp;   /* right address pointer */
2689
2690                lkp = xfs_btree_key_addr(cur, src_index, left);
2691                lpp = xfs_btree_ptr_addr(cur, src_index, left);
2692                rkp = xfs_btree_key_addr(cur, 1, right);
2693                rpp = xfs_btree_ptr_addr(cur, 1, right);
2694
2695                for (i = src_index; i < rrecs; i++) {
2696                        error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
2697                        if (error)
2698                                goto error0;
2699                }
2700
2701                /* Copy the keys & pointers to the new block. */
2702                xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2703                xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2704
2705                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2706                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2707
2708                /* Stash the keys of the new block for later insertion. */
2709                xfs_btree_get_node_keys(cur, right, key);
2710        } else {
2711                /* It's a leaf.  Move records.  */
2712                union xfs_btree_rec     *lrp;   /* left record pointer */
2713                union xfs_btree_rec     *rrp;   /* right record pointer */
2714
2715                lrp = xfs_btree_rec_addr(cur, src_index, left);
2716                rrp = xfs_btree_rec_addr(cur, 1, right);
2717
2718                /* Copy records to the new block. */
2719                xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2720                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2721
2722                /* Stash the keys of the new block for later insertion. */
2723                xfs_btree_get_leaf_keys(cur, right, key);
2724        }
2725
2726        /*
2727         * Find the left block number by looking in the buffer.
2728         * Adjust sibling pointers.
2729         */
2730        xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2731        xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2732        xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2733        xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2734
2735        xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2736        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2737
2738        /*
2739         * If there's a block to the new block's right, make that block
2740         * point back to right instead of to left.
2741         */
2742        if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2743                error = xfs_btree_read_buf_block(cur, &rrptr,
2744                                                        0, &rrblock, &rrbp);
2745                if (error)
2746                        goto error0;
2747                xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2748                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2749        }
2750
2751        /* Update the parent high keys of the left block, if needed. */
2752        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2753                error = xfs_btree_update_keys(cur, level);
2754                if (error)
2755                        goto error0;
2756        }
2757
2758        /*
2759         * If the cursor is really in the right block, move it there.
2760         * If it's just pointing past the last entry in left, then we'll
2761         * insert there, so don't change anything in that case.
2762         */
2763        if (cur->bc_ptrs[level] > lrecs + 1) {
2764                xfs_btree_setbuf(cur, level, rbp);
2765                cur->bc_ptrs[level] -= lrecs;
2766        }
2767        /*
2768         * If there are more levels, we'll need another cursor which refers
2769         * the right block, no matter where this cursor was.
2770         */
2771        if (level + 1 < cur->bc_nlevels) {
2772                error = xfs_btree_dup_cursor(cur, curp);
2773                if (error)
2774                        goto error0;
2775                (*curp)->bc_ptrs[level + 1]++;
2776        }
2777        *ptrp = rptr;
2778        *stat = 1;
2779        return 0;
2780out0:
2781        *stat = 0;
2782        return 0;
2783
2784error0:
2785        return error;
2786}
2787
2788struct xfs_btree_split_args {
2789        struct xfs_btree_cur    *cur;
2790        int                     level;
2791        union xfs_btree_ptr     *ptrp;
2792        union xfs_btree_key     *key;
2793        struct xfs_btree_cur    **curp;
2794        int                     *stat;          /* success/failure */
2795        int                     result;
2796        bool                    kswapd; /* allocation in kswapd context */
2797        struct completion       *done;
2798        struct work_struct      work;
2799};
2800
2801/*
2802 * Stack switching interfaces for allocation
2803 */
2804static void
2805xfs_btree_split_worker(
2806        struct work_struct      *work)
2807{
2808        struct xfs_btree_split_args     *args = container_of(work,
2809                                                struct xfs_btree_split_args, work);
2810        unsigned long           pflags;
2811        unsigned long           new_pflags = 0;
2812
2813        /*
2814         * we are in a transaction context here, but may also be doing work
2815         * in kswapd context, and hence we may need to inherit that state
2816         * temporarily to ensure that we don't block waiting for memory reclaim
2817         * in any way.
2818         */
2819        if (args->kswapd)
2820                new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2821
2822        current_set_flags_nested(&pflags, new_pflags);
2823        xfs_trans_set_context(args->cur->bc_tp);
2824
2825        args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2826                                         args->key, args->curp, args->stat);
2827
2828        xfs_trans_clear_context(args->cur->bc_tp);
2829        current_restore_flags_nested(&pflags, new_pflags);
2830
2831        /*
2832         * Do not access args after complete() has run here. We don't own args
2833         * and the owner may run and free args before we return here.
2834         */
2835        complete(args->done);
2836
2837}
2838
2839/*
2840 * BMBT split requests often come in with little stack to work on. Push
2841 * them off to a worker thread so there is lots of stack to use. For the other
2842 * btree types, just call directly to avoid the context switch overhead here.
2843 */
2844STATIC int                                      /* error */
2845xfs_btree_split(
2846        struct xfs_btree_cur    *cur,
2847        int                     level,
2848        union xfs_btree_ptr     *ptrp,
2849        union xfs_btree_key     *key,
2850        struct xfs_btree_cur    **curp,
2851        int                     *stat)          /* success/failure */
2852{
2853        struct xfs_btree_split_args     args;
2854        DECLARE_COMPLETION_ONSTACK(done);
2855
2856        if (cur->bc_btnum != XFS_BTNUM_BMAP)
2857                return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2858
2859        args.cur = cur;
2860        args.level = level;
2861        args.ptrp = ptrp;
2862        args.key = key;
2863        args.curp = curp;
2864        args.stat = stat;
2865        args.done = &done;
2866        args.kswapd = current_is_kswapd();
2867        INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2868        queue_work(xfs_alloc_wq, &args.work);
2869        wait_for_completion(&done);
2870        destroy_work_on_stack(&args.work);
2871        return args.result;
2872}
2873
2874
2875/*
2876 * Copy the old inode root contents into a real block and make the
2877 * broot point to it.
2878 */
2879int                                             /* error */
2880xfs_btree_new_iroot(
2881        struct xfs_btree_cur    *cur,           /* btree cursor */
2882        int                     *logflags,      /* logging flags for inode */
2883        int                     *stat)          /* return status - 0 fail */
2884{
2885        struct xfs_buf          *cbp;           /* buffer for cblock */
2886        struct xfs_btree_block  *block;         /* btree block */
2887        struct xfs_btree_block  *cblock;        /* child btree block */
2888        union xfs_btree_key     *ckp;           /* child key pointer */
2889        union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2890        union xfs_btree_key     *kp;            /* pointer to btree key */
2891        union xfs_btree_ptr     *pp;            /* pointer to block addr */
2892        union xfs_btree_ptr     nptr;           /* new block addr */
2893        int                     level;          /* btree level */
2894        int                     error;          /* error return code */
2895        int                     i;              /* loop counter */
2896
2897        XFS_BTREE_STATS_INC(cur, newroot);
2898
2899        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2900
2901        level = cur->bc_nlevels - 1;
2902
2903        block = xfs_btree_get_iroot(cur);
2904        pp = xfs_btree_ptr_addr(cur, 1, block);
2905
2906        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2907        error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2908        if (error)
2909                goto error0;
2910        if (*stat == 0)
2911                return 0;
2912
2913        XFS_BTREE_STATS_INC(cur, alloc);
2914
2915        /* Copy the root into a real block. */
2916        error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
2917        if (error)
2918                goto error0;
2919
2920        /*
2921         * we can't just memcpy() the root in for CRC enabled btree blocks.
2922         * In that case have to also ensure the blkno remains correct
2923         */
2924        memcpy(cblock, block, xfs_btree_block_len(cur));
2925        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2926                if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2927                        cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2928                else
2929                        cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2930        }
2931
2932        be16_add_cpu(&block->bb_level, 1);
2933        xfs_btree_set_numrecs(block, 1);
2934        cur->bc_nlevels++;
2935        cur->bc_ptrs[level + 1] = 1;
2936
2937        kp = xfs_btree_key_addr(cur, 1, block);
2938        ckp = xfs_btree_key_addr(cur, 1, cblock);
2939        xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2940
2941        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2942        for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2943                error = xfs_btree_debug_check_ptr(cur, pp, i, level);
2944                if (error)
2945                        goto error0;
2946        }
2947
2948        xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2949
2950        error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
2951        if (error)
2952                goto error0;
2953
2954        xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2955
2956        xfs_iroot_realloc(cur->bc_ino.ip,
2957                          1 - xfs_btree_get_numrecs(cblock),
2958                          cur->bc_ino.whichfork);
2959
2960        xfs_btree_setbuf(cur, level, cbp);
2961
2962        /*
2963         * Do all this logging at the end so that
2964         * the root is at the right level.
2965         */
2966        xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2967        xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2968        xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2969
2970        *logflags |=
2971                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
2972        *stat = 1;
2973        return 0;
2974error0:
2975        return error;
2976}
2977
2978/*
2979 * Allocate a new root block, fill it in.
2980 */
2981STATIC int                              /* error */
2982xfs_btree_new_root(
2983        struct xfs_btree_cur    *cur,   /* btree cursor */
2984        int                     *stat)  /* success/failure */
2985{
2986        struct xfs_btree_block  *block; /* one half of the old root block */
2987        struct xfs_buf          *bp;    /* buffer containing block */
2988        int                     error;  /* error return value */
2989        struct xfs_buf          *lbp;   /* left buffer pointer */
2990        struct xfs_btree_block  *left;  /* left btree block */
2991        struct xfs_buf          *nbp;   /* new (root) buffer */
2992        struct xfs_btree_block  *new;   /* new (root) btree block */
2993        int                     nptr;   /* new value for key index, 1 or 2 */
2994        struct xfs_buf          *rbp;   /* right buffer pointer */
2995        struct xfs_btree_block  *right; /* right btree block */
2996        union xfs_btree_ptr     rptr;
2997        union xfs_btree_ptr     lptr;
2998
2999        XFS_BTREE_STATS_INC(cur, newroot);
3000
3001        /* initialise our start point from the cursor */
3002        cur->bc_ops->init_ptr_from_cur(cur, &rptr);
3003
3004        /* Allocate the new block. If we can't do it, we're toast. Give up. */
3005        error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
3006        if (error)
3007                goto error0;
3008        if (*stat == 0)
3009                goto out0;
3010        XFS_BTREE_STATS_INC(cur, alloc);
3011
3012        /* Set up the new block. */
3013        error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
3014        if (error)
3015                goto error0;
3016
3017        /* Set the root in the holding structure  increasing the level by 1. */
3018        cur->bc_ops->set_root(cur, &lptr, 1);
3019
3020        /*
3021         * At the previous root level there are now two blocks: the old root,
3022         * and the new block generated when it was split.  We don't know which
3023         * one the cursor is pointing at, so we set up variables "left" and
3024         * "right" for each case.
3025         */
3026        block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
3027
3028#ifdef DEBUG
3029        error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
3030        if (error)
3031                goto error0;
3032#endif
3033
3034        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3035        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3036                /* Our block is left, pick up the right block. */
3037                lbp = bp;
3038                xfs_btree_buf_to_ptr(cur, lbp, &lptr);
3039                left = block;
3040                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3041                if (error)
3042                        goto error0;
3043                bp = rbp;
3044                nptr = 1;
3045        } else {
3046                /* Our block is right, pick up the left block. */
3047                rbp = bp;
3048                xfs_btree_buf_to_ptr(cur, rbp, &rptr);
3049                right = block;
3050                xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
3051                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3052                if (error)
3053                        goto error0;
3054                bp = lbp;
3055                nptr = 2;
3056        }
3057
3058        /* Fill in the new block's btree header and log it. */
3059        xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
3060        xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
3061        ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
3062                        !xfs_btree_ptr_is_null(cur, &rptr));
3063
3064        /* Fill in the key data in the new root. */
3065        if (xfs_btree_get_level(left) > 0) {
3066                /*
3067                 * Get the keys for the left block's keys and put them directly
3068                 * in the parent block.  Do the same for the right block.
3069                 */
3070                xfs_btree_get_node_keys(cur, left,
3071                                xfs_btree_key_addr(cur, 1, new));
3072                xfs_btree_get_node_keys(cur, right,
3073                                xfs_btree_key_addr(cur, 2, new));
3074        } else {
3075                /*
3076                 * Get the keys for the left block's records and put them
3077                 * directly in the parent block.  Do the same for the right
3078                 * block.
3079                 */
3080                xfs_btree_get_leaf_keys(cur, left,
3081                        xfs_btree_key_addr(cur, 1, new));
3082                xfs_btree_get_leaf_keys(cur, right,
3083                        xfs_btree_key_addr(cur, 2, new));
3084        }
3085        xfs_btree_log_keys(cur, nbp, 1, 2);
3086
3087        /* Fill in the pointer data in the new root. */
3088        xfs_btree_copy_ptrs(cur,
3089                xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
3090        xfs_btree_copy_ptrs(cur,
3091                xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
3092        xfs_btree_log_ptrs(cur, nbp, 1, 2);
3093
3094        /* Fix up the cursor. */
3095        xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
3096        cur->bc_ptrs[cur->bc_nlevels] = nptr;
3097        cur->bc_nlevels++;
3098        *stat = 1;
3099        return 0;
3100error0:
3101        return error;
3102out0:
3103        *stat = 0;
3104        return 0;
3105}
3106
3107STATIC int
3108xfs_btree_make_block_unfull(
3109        struct xfs_btree_cur    *cur,   /* btree cursor */
3110        int                     level,  /* btree level */
3111        int                     numrecs,/* # of recs in block */
3112        int                     *oindex,/* old tree index */
3113        int                     *index, /* new tree index */
3114        union xfs_btree_ptr     *nptr,  /* new btree ptr */
3115        struct xfs_btree_cur    **ncur, /* new btree cursor */
3116        union xfs_btree_key     *key,   /* key of new block */
3117        int                     *stat)
3118{
3119        int                     error = 0;
3120
3121        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3122            level == cur->bc_nlevels - 1) {
3123                struct xfs_inode *ip = cur->bc_ino.ip;
3124
3125                if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3126                        /* A root block that can be made bigger. */
3127                        xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
3128                        *stat = 1;
3129                } else {
3130                        /* A root block that needs replacing */
3131                        int     logflags = 0;
3132
3133                        error = xfs_btree_new_iroot(cur, &logflags, stat);
3134                        if (error || *stat == 0)
3135                                return error;
3136
3137                        xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3138                }
3139
3140                return 0;
3141        }
3142
3143        /* First, try shifting an entry to the right neighbor. */
3144        error = xfs_btree_rshift(cur, level, stat);
3145        if (error || *stat)
3146                return error;
3147
3148        /* Next, try shifting an entry to the left neighbor. */
3149        error = xfs_btree_lshift(cur, level, stat);
3150        if (error)
3151                return error;
3152
3153        if (*stat) {
3154                *oindex = *index = cur->bc_ptrs[level];
3155                return 0;
3156        }
3157
3158        /*
3159         * Next, try splitting the current block in half.
3160         *
3161         * If this works we have to re-set our variables because we
3162         * could be in a different block now.
3163         */
3164        error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3165        if (error || *stat == 0)
3166                return error;
3167
3168
3169        *index = cur->bc_ptrs[level];
3170        return 0;
3171}
3172
3173/*
3174 * Insert one record/level.  Return information to the caller
3175 * allowing the next level up to proceed if necessary.
3176 */
3177STATIC int
3178xfs_btree_insrec(
3179        struct xfs_btree_cur    *cur,   /* btree cursor */
3180        int                     level,  /* level to insert record at */
3181        union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
3182        union xfs_btree_rec     *rec,   /* record to insert */
3183        union xfs_btree_key     *key,   /* i/o: block key for ptrp */
3184        struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
3185        int                     *stat)  /* success/failure */
3186{
3187        struct xfs_btree_block  *block; /* btree block */
3188        struct xfs_buf          *bp;    /* buffer for block */
3189        union xfs_btree_ptr     nptr;   /* new block ptr */
3190        struct xfs_btree_cur    *ncur;  /* new btree cursor */
3191        union xfs_btree_key     nkey;   /* new block key */
3192        union xfs_btree_key     *lkey;
3193        int                     optr;   /* old key/record index */
3194        int                     ptr;    /* key/record index */
3195        int                     numrecs;/* number of records */
3196        int                     error;  /* error return value */
3197        int                     i;
3198        xfs_daddr_t             old_bn;
3199
3200        ncur = NULL;
3201        lkey = &nkey;
3202
3203        /*
3204         * If we have an external root pointer, and we've made it to the
3205         * root level, allocate a new root block and we're done.
3206         */
3207        if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3208            (level >= cur->bc_nlevels)) {
3209                error = xfs_btree_new_root(cur, stat);
3210                xfs_btree_set_ptr_null(cur, ptrp);
3211
3212                return error;
3213        }
3214
3215        /* If we're off the left edge, return failure. */
3216        ptr = cur->bc_ptrs[level];
3217        if (ptr == 0) {
3218                *stat = 0;
3219                return 0;
3220        }
3221
3222        optr = ptr;
3223
3224        XFS_BTREE_STATS_INC(cur, insrec);
3225
3226        /* Get pointers to the btree buffer and block. */
3227        block = xfs_btree_get_block(cur, level, &bp);
3228        old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
3229        numrecs = xfs_btree_get_numrecs(block);
3230
3231#ifdef DEBUG
3232        error = xfs_btree_check_block(cur, block, level, bp);
3233        if (error)
3234                goto error0;
3235
3236        /* Check that the new entry is being inserted in the right place. */
3237        if (ptr <= numrecs) {
3238                if (level == 0) {
3239                        ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3240                                xfs_btree_rec_addr(cur, ptr, block)));
3241                } else {
3242                        ASSERT(cur->bc_ops->keys_inorder(cur, key,
3243                                xfs_btree_key_addr(cur, ptr, block)));
3244                }
3245        }
3246#endif
3247
3248        /*
3249         * If the block is full, we can't insert the new entry until we
3250         * make the block un-full.
3251         */
3252        xfs_btree_set_ptr_null(cur, &nptr);
3253        if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3254                error = xfs_btree_make_block_unfull(cur, level, numrecs,
3255                                        &optr, &ptr, &nptr, &ncur, lkey, stat);
3256                if (error || *stat == 0)
3257                        goto error0;
3258        }
3259
3260        /*
3261         * The current block may have changed if the block was
3262         * previously full and we have just made space in it.
3263         */
3264        block = xfs_btree_get_block(cur, level, &bp);
3265        numrecs = xfs_btree_get_numrecs(block);
3266
3267#ifdef DEBUG
3268        error = xfs_btree_check_block(cur, block, level, bp);
3269        if (error)
3270                return error;
3271#endif
3272
3273        /*
3274         * At this point we know there's room for our new entry in the block
3275         * we're pointing at.
3276         */
3277        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3278
3279        if (level > 0) {
3280                /* It's a nonleaf. make a hole in the keys and ptrs */
3281                union xfs_btree_key     *kp;
3282                union xfs_btree_ptr     *pp;
3283
3284                kp = xfs_btree_key_addr(cur, ptr, block);
3285                pp = xfs_btree_ptr_addr(cur, ptr, block);
3286
3287                for (i = numrecs - ptr; i >= 0; i--) {
3288                        error = xfs_btree_debug_check_ptr(cur, pp, i, level);
3289                        if (error)
3290                                return error;
3291                }
3292
3293                xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3294                xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3295
3296                error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
3297                if (error)
3298                        goto error0;
3299
3300                /* Now put the new data in, bump numrecs and log it. */
3301                xfs_btree_copy_keys(cur, kp, key, 1);
3302                xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3303                numrecs++;
3304                xfs_btree_set_numrecs(block, numrecs);
3305                xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3306                xfs_btree_log_keys(cur, bp, ptr, numrecs);
3307#ifdef DEBUG
3308                if (ptr < numrecs) {
3309                        ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3310                                xfs_btree_key_addr(cur, ptr + 1, block)));
3311                }
3312#endif
3313        } else {
3314                /* It's a leaf. make a hole in the records */
3315                union xfs_btree_rec             *rp;
3316
3317                rp = xfs_btree_rec_addr(cur, ptr, block);
3318
3319                xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3320
3321                /* Now put the new data in, bump numrecs and log it. */
3322                xfs_btree_copy_recs(cur, rp, rec, 1);
3323                xfs_btree_set_numrecs(block, ++numrecs);
3324                xfs_btree_log_recs(cur, bp, ptr, numrecs);
3325#ifdef DEBUG
3326                if (ptr < numrecs) {
3327                        ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3328                                xfs_btree_rec_addr(cur, ptr + 1, block)));
3329                }
3330#endif
3331        }
3332
3333        /* Log the new number of records in the btree header. */
3334        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3335
3336        /*
3337         * If we just inserted into a new tree block, we have to
3338         * recalculate nkey here because nkey is out of date.
3339         *
3340         * Otherwise we're just updating an existing block (having shoved
3341         * some records into the new tree block), so use the regular key
3342         * update mechanism.
3343         */
3344        if (bp && bp->b_bn != old_bn) {
3345                xfs_btree_get_keys(cur, block, lkey);
3346        } else if (xfs_btree_needs_key_update(cur, optr)) {
3347                error = xfs_btree_update_keys(cur, level);
3348                if (error)
3349                        goto error0;
3350        }
3351
3352        /*
3353         * If we are tracking the last record in the tree and
3354         * we are at the far right edge of the tree, update it.
3355         */
3356        if (xfs_btree_is_lastrec(cur, block, level)) {
3357                cur->bc_ops->update_lastrec(cur, block, rec,
3358                                            ptr, LASTREC_INSREC);
3359        }
3360
3361        /*
3362         * Return the new block number, if any.
3363         * If there is one, give back a record value and a cursor too.
3364         */
3365        *ptrp = nptr;
3366        if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3367                xfs_btree_copy_keys(cur, key, lkey, 1);
3368                *curp = ncur;
3369        }
3370
3371        *stat = 1;
3372        return 0;
3373
3374error0:
3375        return error;
3376}
3377
3378/*
3379 * Insert the record at the point referenced by cur.
3380 *
3381 * A multi-level split of the tree on insert will invalidate the original
3382 * cursor.  All callers of this function should assume that the cursor is
3383 * no longer valid and revalidate it.
3384 */
3385int
3386xfs_btree_insert(
3387        struct xfs_btree_cur    *cur,
3388        int                     *stat)
3389{
3390        int                     error;  /* error return value */
3391        int                     i;      /* result value, 0 for failure */
3392        int                     level;  /* current level number in btree */
3393        union xfs_btree_ptr     nptr;   /* new block number (split result) */
3394        struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3395        struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3396        union xfs_btree_key     bkey;   /* key of block to insert */
3397        union xfs_btree_key     *key;
3398        union xfs_btree_rec     rec;    /* record to insert */
3399
3400        level = 0;
3401        ncur = NULL;
3402        pcur = cur;
3403        key = &bkey;
3404
3405        xfs_btree_set_ptr_null(cur, &nptr);
3406
3407        /* Make a key out of the record data to be inserted, and save it. */
3408        cur->bc_ops->init_rec_from_cur(cur, &rec);
3409        cur->bc_ops->init_key_from_rec(key, &rec);
3410
3411        /*
3412         * Loop going up the tree, starting at the leaf level.
3413         * Stop when we don't get a split block, that must mean that
3414         * the insert is finished with this level.
3415         */
3416        do {
3417                /*
3418                 * Insert nrec/nptr into this level of the tree.
3419                 * Note if we fail, nptr will be null.
3420                 */
3421                error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3422                                &ncur, &i);
3423                if (error) {
3424                        if (pcur != cur)
3425                                xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3426                        goto error0;
3427                }
3428
3429                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3430                        error = -EFSCORRUPTED;
3431                        goto error0;
3432                }
3433                level++;
3434
3435                /*
3436                 * See if the cursor we just used is trash.
3437                 * Can't trash the caller's cursor, but otherwise we should
3438                 * if ncur is a new cursor or we're about to be done.
3439                 */
3440                if (pcur != cur &&
3441                    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3442                        /* Save the state from the cursor before we trash it */
3443                        if (cur->bc_ops->update_cursor)
3444                                cur->bc_ops->update_cursor(pcur, cur);
3445                        cur->bc_nlevels = pcur->bc_nlevels;
3446                        xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3447                }
3448                /* If we got a new cursor, switch to it. */
3449                if (ncur) {
3450                        pcur = ncur;
3451                        ncur = NULL;
3452                }
3453        } while (!xfs_btree_ptr_is_null(cur, &nptr));
3454
3455        *stat = i;
3456        return 0;
3457error0:
3458        return error;
3459}
3460
3461/*
3462 * Try to merge a non-leaf block back into the inode root.
3463 *
3464 * Note: the killroot names comes from the fact that we're effectively
3465 * killing the old root block.  But because we can't just delete the
3466 * inode we have to copy the single block it was pointing to into the
3467 * inode.
3468 */
3469STATIC int
3470xfs_btree_kill_iroot(
3471        struct xfs_btree_cur    *cur)
3472{
3473        int                     whichfork = cur->bc_ino.whichfork;
3474        struct xfs_inode        *ip = cur->bc_ino.ip;
3475        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3476        struct xfs_btree_block  *block;
3477        struct xfs_btree_block  *cblock;
3478        union xfs_btree_key     *kp;
3479        union xfs_btree_key     *ckp;
3480        union xfs_btree_ptr     *pp;
3481        union xfs_btree_ptr     *cpp;
3482        struct xfs_buf          *cbp;
3483        int                     level;
3484        int                     index;
3485        int                     numrecs;
3486        int                     error;
3487#ifdef DEBUG
3488        union xfs_btree_ptr     ptr;
3489#endif
3490        int                     i;
3491
3492        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3493        ASSERT(cur->bc_nlevels > 1);
3494
3495        /*
3496         * Don't deal with the root block needs to be a leaf case.
3497         * We're just going to turn the thing back into extents anyway.
3498         */
3499        level = cur->bc_nlevels - 1;
3500        if (level == 1)
3501                goto out0;
3502
3503        /*
3504         * Give up if the root has multiple children.
3505         */
3506        block = xfs_btree_get_iroot(cur);
3507        if (xfs_btree_get_numrecs(block) != 1)
3508                goto out0;
3509
3510        cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3511        numrecs = xfs_btree_get_numrecs(cblock);
3512
3513        /*
3514         * Only do this if the next level will fit.
3515         * Then the data must be copied up to the inode,
3516         * instead of freeing the root you free the next level.
3517         */
3518        if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3519                goto out0;
3520
3521        XFS_BTREE_STATS_INC(cur, killroot);
3522
3523#ifdef DEBUG
3524        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3525        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3526        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3527        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3528#endif
3529
3530        index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3531        if (index) {
3532                xfs_iroot_realloc(cur->bc_ino.ip, index,
3533                                  cur->bc_ino.whichfork);
3534                block = ifp->if_broot;
3535        }
3536
3537        be16_add_cpu(&block->bb_numrecs, index);
3538        ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3539
3540        kp = xfs_btree_key_addr(cur, 1, block);
3541        ckp = xfs_btree_key_addr(cur, 1, cblock);
3542        xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3543
3544        pp = xfs_btree_ptr_addr(cur, 1, block);
3545        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3546
3547        for (i = 0; i < numrecs; i++) {
3548                error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
3549                if (error)
3550                        return error;
3551        }
3552
3553        xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3554
3555        error = xfs_btree_free_block(cur, cbp);
3556        if (error)
3557                return error;
3558
3559        cur->bc_bufs[level - 1] = NULL;
3560        be16_add_cpu(&block->bb_level, -1);
3561        xfs_trans_log_inode(cur->bc_tp, ip,
3562                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
3563        cur->bc_nlevels--;
3564out0:
3565        return 0;
3566}
3567
3568/*
3569 * Kill the current root node, and replace it with it's only child node.
3570 */
3571STATIC int
3572xfs_btree_kill_root(
3573        struct xfs_btree_cur    *cur,
3574        struct xfs_buf          *bp,
3575        int                     level,
3576        union xfs_btree_ptr     *newroot)
3577{
3578        int                     error;
3579
3580        XFS_BTREE_STATS_INC(cur, killroot);
3581
3582        /*
3583         * Update the root pointer, decreasing the level by 1 and then
3584         * free the old root.
3585         */
3586        cur->bc_ops->set_root(cur, newroot, -1);
3587
3588        error = xfs_btree_free_block(cur, bp);
3589        if (error)
3590                return error;
3591
3592        cur->bc_bufs[level] = NULL;
3593        cur->bc_ra[level] = 0;
3594        cur->bc_nlevels--;
3595
3596        return 0;
3597}
3598
3599STATIC int
3600xfs_btree_dec_cursor(
3601        struct xfs_btree_cur    *cur,
3602        int                     level,
3603        int                     *stat)
3604{
3605        int                     error;
3606        int                     i;
3607
3608        if (level > 0) {
3609                error = xfs_btree_decrement(cur, level, &i);
3610                if (error)
3611                        return error;
3612        }
3613
3614        *stat = 1;
3615        return 0;
3616}
3617
3618/*
3619 * Single level of the btree record deletion routine.
3620 * Delete record pointed to by cur/level.
3621 * Remove the record from its block then rebalance the tree.
3622 * Return 0 for error, 1 for done, 2 to go on to the next level.
3623 */
3624STATIC int                                      /* error */
3625xfs_btree_delrec(
3626        struct xfs_btree_cur    *cur,           /* btree cursor */
3627        int                     level,          /* level removing record from */
3628        int                     *stat)          /* fail/done/go-on */
3629{
3630        struct xfs_btree_block  *block;         /* btree block */
3631        union xfs_btree_ptr     cptr;           /* current block ptr */
3632        struct xfs_buf          *bp;            /* buffer for block */
3633        int                     error;          /* error return value */
3634        int                     i;              /* loop counter */
3635        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3636        struct xfs_buf          *lbp;           /* left buffer pointer */
3637        struct xfs_btree_block  *left;          /* left btree block */
3638        int                     lrecs = 0;      /* left record count */
3639        int                     ptr;            /* key/record index */
3640        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3641        struct xfs_buf          *rbp;           /* right buffer pointer */
3642        struct xfs_btree_block  *right;         /* right btree block */
3643        struct xfs_btree_block  *rrblock;       /* right-right btree block */
3644        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3645        int                     rrecs = 0;      /* right record count */
3646        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3647        int                     numrecs;        /* temporary numrec count */
3648
3649        tcur = NULL;
3650
3651        /* Get the index of the entry being deleted, check for nothing there. */
3652        ptr = cur->bc_ptrs[level];
3653        if (ptr == 0) {
3654                *stat = 0;
3655                return 0;
3656        }
3657
3658        /* Get the buffer & block containing the record or key/ptr. */
3659        block = xfs_btree_get_block(cur, level, &bp);
3660        numrecs = xfs_btree_get_numrecs(block);
3661
3662#ifdef DEBUG
3663        error = xfs_btree_check_block(cur, block, level, bp);
3664        if (error)
3665                goto error0;
3666#endif
3667
3668        /* Fail if we're off the end of the block. */
3669        if (ptr > numrecs) {
3670                *stat = 0;
3671                return 0;
3672        }
3673
3674        XFS_BTREE_STATS_INC(cur, delrec);
3675        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3676
3677        /* Excise the entries being deleted. */
3678        if (level > 0) {
3679                /* It's a nonleaf. operate on keys and ptrs */
3680                union xfs_btree_key     *lkp;
3681                union xfs_btree_ptr     *lpp;
3682
3683                lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3684                lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3685
3686                for (i = 0; i < numrecs - ptr; i++) {
3687                        error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
3688                        if (error)
3689                                goto error0;
3690                }
3691
3692                if (ptr < numrecs) {
3693                        xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3694                        xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3695                        xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3696                        xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3697                }
3698        } else {
3699                /* It's a leaf. operate on records */
3700                if (ptr < numrecs) {
3701                        xfs_btree_shift_recs(cur,
3702                                xfs_btree_rec_addr(cur, ptr + 1, block),
3703                                -1, numrecs - ptr);
3704                        xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3705                }
3706        }
3707
3708        /*
3709         * Decrement and log the number of entries in the block.
3710         */
3711        xfs_btree_set_numrecs(block, --numrecs);
3712        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3713
3714        /*
3715         * If we are tracking the last record in the tree and
3716         * we are at the far right edge of the tree, update it.
3717         */
3718        if (xfs_btree_is_lastrec(cur, block, level)) {
3719                cur->bc_ops->update_lastrec(cur, block, NULL,
3720                                            ptr, LASTREC_DELREC);
3721        }
3722
3723        /*
3724         * We're at the root level.  First, shrink the root block in-memory.
3725         * Try to get rid of the next level down.  If we can't then there's
3726         * nothing left to do.
3727         */
3728        if (level == cur->bc_nlevels - 1) {
3729                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3730                        xfs_iroot_realloc(cur->bc_ino.ip, -1,
3731                                          cur->bc_ino.whichfork);
3732
3733                        error = xfs_btree_kill_iroot(cur);
3734                        if (error)
3735                                goto error0;
3736
3737                        error = xfs_btree_dec_cursor(cur, level, stat);
3738                        if (error)
3739                                goto error0;
3740                        *stat = 1;
3741                        return 0;
3742                }
3743
3744                /*
3745                 * If this is the root level, and there's only one entry left,
3746                 * and it's NOT the leaf level, then we can get rid of this
3747                 * level.
3748                 */
3749                if (numrecs == 1 && level > 0) {
3750                        union xfs_btree_ptr     *pp;
3751                        /*
3752                         * pp is still set to the first pointer in the block.
3753                         * Make it the new root of the btree.
3754                         */
3755                        pp = xfs_btree_ptr_addr(cur, 1, block);
3756                        error = xfs_btree_kill_root(cur, bp, level, pp);
3757                        if (error)
3758                                goto error0;
3759                } else if (level > 0) {
3760                        error = xfs_btree_dec_cursor(cur, level, stat);
3761                        if (error)
3762                                goto error0;
3763                }
3764                *stat = 1;
3765                return 0;
3766        }
3767
3768        /*
3769         * If we deleted the leftmost entry in the block, update the
3770         * key values above us in the tree.
3771         */
3772        if (xfs_btree_needs_key_update(cur, ptr)) {
3773                error = xfs_btree_update_keys(cur, level);
3774                if (error)
3775                        goto error0;
3776        }
3777
3778        /*
3779         * If the number of records remaining in the block is at least
3780         * the minimum, we're done.
3781         */
3782        if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3783                error = xfs_btree_dec_cursor(cur, level, stat);
3784                if (error)
3785                        goto error0;
3786                return 0;
3787        }
3788
3789        /*
3790         * Otherwise, we have to move some records around to keep the
3791         * tree balanced.  Look at the left and right sibling blocks to
3792         * see if we can re-balance by moving only one record.
3793         */
3794        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3795        xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3796
3797        if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3798                /*
3799                 * One child of root, need to get a chance to copy its contents
3800                 * into the root and delete it. Can't go up to next level,
3801                 * there's nothing to delete there.
3802                 */
3803                if (xfs_btree_ptr_is_null(cur, &rptr) &&
3804                    xfs_btree_ptr_is_null(cur, &lptr) &&
3805                    level == cur->bc_nlevels - 2) {
3806                        error = xfs_btree_kill_iroot(cur);
3807                        if (!error)
3808                                error = xfs_btree_dec_cursor(cur, level, stat);
3809                        if (error)
3810                                goto error0;
3811                        return 0;
3812                }
3813        }
3814
3815        ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3816               !xfs_btree_ptr_is_null(cur, &lptr));
3817
3818        /*
3819         * Duplicate the cursor so our btree manipulations here won't
3820         * disrupt the next level up.
3821         */
3822        error = xfs_btree_dup_cursor(cur, &tcur);
3823        if (error)
3824                goto error0;
3825
3826        /*
3827         * If there's a right sibling, see if it's ok to shift an entry
3828         * out of it.
3829         */
3830        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3831                /*
3832                 * Move the temp cursor to the last entry in the next block.
3833                 * Actually any entry but the first would suffice.
3834                 */
3835                i = xfs_btree_lastrec(tcur, level);
3836                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3837                        error = -EFSCORRUPTED;
3838                        goto error0;
3839                }
3840
3841                error = xfs_btree_increment(tcur, level, &i);
3842                if (error)
3843                        goto error0;
3844                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3845                        error = -EFSCORRUPTED;
3846                        goto error0;
3847                }
3848
3849                i = xfs_btree_lastrec(tcur, level);
3850                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3851                        error = -EFSCORRUPTED;
3852                        goto error0;
3853                }
3854
3855                /* Grab a pointer to the block. */
3856                right = xfs_btree_get_block(tcur, level, &rbp);
3857#ifdef DEBUG
3858                error = xfs_btree_check_block(tcur, right, level, rbp);
3859                if (error)
3860                        goto error0;
3861#endif
3862                /* Grab the current block number, for future use. */
3863                xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3864
3865                /*
3866                 * If right block is full enough so that removing one entry
3867                 * won't make it too empty, and left-shifting an entry out
3868                 * of right to us works, we're done.
3869                 */
3870                if (xfs_btree_get_numrecs(right) - 1 >=
3871                    cur->bc_ops->get_minrecs(tcur, level)) {
3872                        error = xfs_btree_lshift(tcur, level, &i);
3873                        if (error)
3874                                goto error0;
3875                        if (i) {
3876                                ASSERT(xfs_btree_get_numrecs(block) >=
3877                                       cur->bc_ops->get_minrecs(tcur, level));
3878
3879                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3880                                tcur = NULL;
3881
3882                                error = xfs_btree_dec_cursor(cur, level, stat);
3883                                if (error)
3884                                        goto error0;
3885                                return 0;
3886                        }
3887                }
3888
3889                /*
3890                 * Otherwise, grab the number of records in right for
3891                 * future reference, and fix up the temp cursor to point
3892                 * to our block again (last record).
3893                 */
3894                rrecs = xfs_btree_get_numrecs(right);
3895                if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3896                        i = xfs_btree_firstrec(tcur, level);
3897                        if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3898                                error = -EFSCORRUPTED;
3899                                goto error0;
3900                        }
3901
3902                        error = xfs_btree_decrement(tcur, level, &i);
3903                        if (error)
3904                                goto error0;
3905                        if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3906                                error = -EFSCORRUPTED;
3907                                goto error0;
3908                        }
3909                }
3910        }
3911
3912        /*
3913         * If there's a left sibling, see if it's ok to shift an entry
3914         * out of it.
3915         */
3916        if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3917                /*
3918                 * Move the temp cursor to the first entry in the
3919                 * previous block.
3920                 */
3921                i = xfs_btree_firstrec(tcur, level);
3922                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3923                        error = -EFSCORRUPTED;
3924                        goto error0;
3925                }
3926
3927                error = xfs_btree_decrement(tcur, level, &i);
3928                if (error)
3929                        goto error0;
3930                i = xfs_btree_firstrec(tcur, level);
3931                if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3932                        error = -EFSCORRUPTED;
3933                        goto error0;
3934                }
3935
3936                /* Grab a pointer to the block. */
3937                left = xfs_btree_get_block(tcur, level, &lbp);
3938#ifdef DEBUG
3939                error = xfs_btree_check_block(cur, left, level, lbp);
3940                if (error)
3941                        goto error0;
3942#endif
3943                /* Grab the current block number, for future use. */
3944                xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3945
3946                /*
3947                 * If left block is full enough so that removing one entry
3948                 * won't make it too empty, and right-shifting an entry out
3949                 * of left to us works, we're done.
3950                 */
3951                if (xfs_btree_get_numrecs(left) - 1 >=
3952                    cur->bc_ops->get_minrecs(tcur, level)) {
3953                        error = xfs_btree_rshift(tcur, level, &i);
3954                        if (error)
3955                                goto error0;
3956                        if (i) {
3957                                ASSERT(xfs_btree_get_numrecs(block) >=
3958                                       cur->bc_ops->get_minrecs(tcur, level));
3959                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3960                                tcur = NULL;
3961                                if (level == 0)
3962                                        cur->bc_ptrs[0]++;
3963
3964                                *stat = 1;
3965                                return 0;
3966                        }
3967                }
3968
3969                /*
3970                 * Otherwise, grab the number of records in right for
3971                 * future reference.
3972                 */
3973                lrecs = xfs_btree_get_numrecs(left);
3974        }
3975
3976        /* Delete the temp cursor, we're done with it. */
3977        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3978        tcur = NULL;
3979
3980        /* If here, we need to do a join to keep the tree balanced. */
3981        ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3982
3983        if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3984            lrecs + xfs_btree_get_numrecs(block) <=
3985                        cur->bc_ops->get_maxrecs(cur, level)) {
3986                /*
3987                 * Set "right" to be the starting block,
3988                 * "left" to be the left neighbor.
3989                 */
3990                rptr = cptr;
3991                right = block;
3992                rbp = bp;
3993                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3994                if (error)
3995                        goto error0;
3996
3997        /*
3998         * If that won't work, see if we can join with the right neighbor block.
3999         */
4000        } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4001                   rrecs + xfs_btree_get_numrecs(block) <=
4002                        cur->bc_ops->get_maxrecs(cur, level)) {
4003                /*
4004                 * Set "left" to be the starting block,
4005                 * "right" to be the right neighbor.
4006                 */
4007                lptr = cptr;
4008                left = block;
4009                lbp = bp;
4010                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4011                if (error)
4012                        goto error0;
4013
4014        /*
4015         * Otherwise, we can't fix the imbalance.
4016         * Just return.  This is probably a logic error, but it's not fatal.
4017         */
4018        } else {
4019                error = xfs_btree_dec_cursor(cur, level, stat);
4020                if (error)
4021                        goto error0;
4022                return 0;
4023        }
4024
4025        rrecs = xfs_btree_get_numrecs(right);
4026        lrecs = xfs_btree_get_numrecs(left);
4027
4028        /*
4029         * We're now going to join "left" and "right" by moving all the stuff
4030         * in "right" to "left" and deleting "right".
4031         */
4032        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4033        if (level > 0) {
4034                /* It's a non-leaf.  Move keys and pointers. */
4035                union xfs_btree_key     *lkp;   /* left btree key */
4036                union xfs_btree_ptr     *lpp;   /* left address pointer */
4037                union xfs_btree_key     *rkp;   /* right btree key */
4038                union xfs_btree_ptr     *rpp;   /* right address pointer */
4039
4040                lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4041                lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4042                rkp = xfs_btree_key_addr(cur, 1, right);
4043                rpp = xfs_btree_ptr_addr(cur, 1, right);
4044
4045                for (i = 1; i < rrecs; i++) {
4046                        error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
4047                        if (error)
4048                                goto error0;
4049                }
4050
4051                xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4052                xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4053
4054                xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4055                xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4056        } else {
4057                /* It's a leaf.  Move records.  */
4058                union xfs_btree_rec     *lrp;   /* left record pointer */
4059                union xfs_btree_rec     *rrp;   /* right record pointer */
4060
4061                lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4062                rrp = xfs_btree_rec_addr(cur, 1, right);
4063
4064                xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4065                xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4066        }
4067
4068        XFS_BTREE_STATS_INC(cur, join);
4069
4070        /*
4071         * Fix up the number of records and right block pointer in the
4072         * surviving block, and log it.
4073         */
4074        xfs_btree_set_numrecs(left, lrecs + rrecs);
4075        xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
4076        xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4077        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4078
4079        /* If there is a right sibling, point it to the remaining block. */
4080        xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4081        if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4082                error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4083                if (error)
4084                        goto error0;
4085                xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4086                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4087        }
4088
4089        /* Free the deleted block. */
4090        error = xfs_btree_free_block(cur, rbp);
4091        if (error)
4092                goto error0;
4093
4094        /*
4095         * If we joined with the left neighbor, set the buffer in the
4096         * cursor to the left block, and fix up the index.
4097         */
4098        if (bp != lbp) {
4099                cur->bc_bufs[level] = lbp;
4100                cur->bc_ptrs[level] += lrecs;
4101                cur->bc_ra[level] = 0;
4102        }
4103        /*
4104         * If we joined with the right neighbor and there's a level above
4105         * us, increment the cursor at that level.
4106         */
4107        else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
4108                   (level + 1 < cur->bc_nlevels)) {
4109                error = xfs_btree_increment(cur, level + 1, &i);
4110                if (error)
4111                        goto error0;
4112        }
4113
4114        /*
4115         * Readjust the ptr at this level if it's not a leaf, since it's
4116         * still pointing at the deletion point, which makes the cursor
4117         * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4118         * We can't use decrement because it would change the next level up.
4119         */
4120        if (level > 0)
4121                cur->bc_ptrs[level]--;
4122
4123        /*
4124         * We combined blocks, so we have to update the parent keys if the
4125         * btree supports overlapped intervals.  However, bc_ptrs[level + 1]
4126         * points to the old block so that the caller knows which record to
4127         * delete.  Therefore, the caller must be savvy enough to call updkeys
4128         * for us if we return stat == 2.  The other exit points from this
4129         * function don't require deletions further up the tree, so they can
4130         * call updkeys directly.
4131         */
4132
4133        /* Return value means the next level up has something to do. */
4134        *stat = 2;
4135        return 0;
4136
4137error0:
4138        if (tcur)
4139                xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4140        return error;
4141}
4142
4143/*
4144 * Delete the record pointed to by cur.
4145 * The cursor refers to the place where the record was (could be inserted)
4146 * when the operation returns.
4147 */
4148int                                     /* error */
4149xfs_btree_delete(
4150        struct xfs_btree_cur    *cur,
4151        int                     *stat)  /* success/failure */
4152{
4153        int                     error;  /* error return value */
4154        int                     level;
4155        int                     i;
4156        bool                    joined = false;
4157
4158        /*
4159         * Go up the tree, starting at leaf level.
4160         *
4161         * If 2 is returned then a join was done; go to the next level.
4162         * Otherwise we are done.
4163         */
4164        for (level = 0, i = 2; i == 2; level++) {
4165                error = xfs_btree_delrec(cur, level, &i);
4166                if (error)
4167                        goto error0;
4168                if (i == 2)
4169                        joined = true;
4170        }
4171
4172        /*
4173         * If we combined blocks as part of deleting the record, delrec won't
4174         * have updated the parent high keys so we have to do that here.
4175         */
4176        if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
4177                error = xfs_btree_updkeys_force(cur, 0);
4178                if (error)
4179                        goto error0;
4180        }
4181
4182        if (i == 0) {
4183                for (level = 1; level < cur->bc_nlevels; level++) {
4184                        if (cur->bc_ptrs[level] == 0) {
4185                                error = xfs_btree_decrement(cur, level, &i);
4186                                if (error)
4187                                        goto error0;
4188                                break;
4189                        }
4190                }
4191        }
4192
4193        *stat = i;
4194        return 0;
4195error0:
4196        return error;
4197}
4198
4199/*
4200 * Get the data from the pointed-to record.
4201 */
4202int                                     /* error */
4203xfs_btree_get_rec(
4204        struct xfs_btree_cur    *cur,   /* btree cursor */
4205        union xfs_btree_rec     **recp, /* output: btree record */
4206        int                     *stat)  /* output: success/failure */
4207{
4208        struct xfs_btree_block  *block; /* btree block */
4209        struct xfs_buf          *bp;    /* buffer pointer */
4210        int                     ptr;    /* record number */
4211#ifdef DEBUG
4212        int                     error;  /* error return value */
4213#endif
4214
4215        ptr = cur->bc_ptrs[0];
4216        block = xfs_btree_get_block(cur, 0, &bp);
4217
4218#ifdef DEBUG
4219        error = xfs_btree_check_block(cur, block, 0, bp);
4220        if (error)
4221                return error;
4222#endif
4223
4224        /*
4225         * Off the right end or left end, return failure.
4226         */
4227        if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4228                *stat = 0;
4229                return 0;
4230        }
4231
4232        /*
4233         * Point to the record and extract its data.
4234         */
4235        *recp = xfs_btree_rec_addr(cur, ptr, block);
4236        *stat = 1;
4237        return 0;
4238}
4239
4240/* Visit a block in a btree. */
4241STATIC int
4242xfs_btree_visit_block(
4243        struct xfs_btree_cur            *cur,
4244        int                             level,
4245        xfs_btree_visit_blocks_fn       fn,
4246        void                            *data)
4247{
4248        struct xfs_btree_block          *block;
4249        struct xfs_buf                  *bp;
4250        union xfs_btree_ptr             rptr;
4251        int                             error;
4252
4253        /* do right sibling readahead */
4254        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4255        block = xfs_btree_get_block(cur, level, &bp);
4256
4257        /* process the block */
4258        error = fn(cur, level, data);
4259        if (error)
4260                return error;
4261
4262        /* now read rh sibling block for next iteration */
4263        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4264        if (xfs_btree_ptr_is_null(cur, &rptr))
4265                return -ENOENT;
4266
4267        return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4268}
4269
4270
4271/* Visit every block in a btree. */
4272int
4273xfs_btree_visit_blocks(
4274        struct xfs_btree_cur            *cur,
4275        xfs_btree_visit_blocks_fn       fn,
4276        unsigned int                    flags,
4277        void                            *data)
4278{
4279        union xfs_btree_ptr             lptr;
4280        int                             level;
4281        struct xfs_btree_block          *block = NULL;
4282        int                             error = 0;
4283
4284        cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4285
4286        /* for each level */
4287        for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4288                /* grab the left hand block */
4289                error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4290                if (error)
4291                        return error;
4292
4293                /* readahead the left most block for the next level down */
4294                if (level > 0) {
4295                        union xfs_btree_ptr     *ptr;
4296
4297                        ptr = xfs_btree_ptr_addr(cur, 1, block);
4298                        xfs_btree_readahead_ptr(cur, ptr, 1);
4299
4300                        /* save for the next iteration of the loop */
4301                        xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4302
4303                        if (!(flags & XFS_BTREE_VISIT_LEAVES))
4304                                continue;
4305                } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
4306                        continue;
4307                }
4308
4309                /* for each buffer in the level */
4310                do {
4311                        error = xfs_btree_visit_block(cur, level, fn, data);
4312                } while (!error);
4313
4314                if (error != -ENOENT)
4315                        return error;
4316        }
4317
4318        return 0;
4319}
4320
4321/*
4322 * Change the owner of a btree.
4323 *
4324 * The mechanism we use here is ordered buffer logging. Because we don't know
4325 * how many buffers were are going to need to modify, we don't really want to
4326 * have to make transaction reservations for the worst case of every buffer in a
4327 * full size btree as that may be more space that we can fit in the log....
4328 *
4329 * We do the btree walk in the most optimal manner possible - we have sibling
4330 * pointers so we can just walk all the blocks on each level from left to right
4331 * in a single pass, and then move to the next level and do the same. We can
4332 * also do readahead on the sibling pointers to get IO moving more quickly,
4333 * though for slow disks this is unlikely to make much difference to performance
4334 * as the amount of CPU work we have to do before moving to the next block is
4335 * relatively small.
4336 *
4337 * For each btree block that we load, modify the owner appropriately, set the
4338 * buffer as an ordered buffer and log it appropriately. We need to ensure that
4339 * we mark the region we change dirty so that if the buffer is relogged in
4340 * a subsequent transaction the changes we make here as an ordered buffer are
4341 * correctly relogged in that transaction.  If we are in recovery context, then
4342 * just queue the modified buffer as delayed write buffer so the transaction
4343 * recovery completion writes the changes to disk.
4344 */
4345struct xfs_btree_block_change_owner_info {
4346        uint64_t                new_owner;
4347        struct list_head        *buffer_list;
4348};
4349
4350static int
4351xfs_btree_block_change_owner(
4352        struct xfs_btree_cur    *cur,
4353        int                     level,
4354        void                    *data)
4355{
4356        struct xfs_btree_block_change_owner_info        *bbcoi = data;
4357        struct xfs_btree_block  *block;
4358        struct xfs_buf          *bp;
4359
4360        /* modify the owner */
4361        block = xfs_btree_get_block(cur, level, &bp);
4362        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
4363                if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
4364                        return 0;
4365                block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4366        } else {
4367                if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
4368                        return 0;
4369                block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4370        }
4371
4372        /*
4373         * If the block is a root block hosted in an inode, we might not have a
4374         * buffer pointer here and we shouldn't attempt to log the change as the
4375         * information is already held in the inode and discarded when the root
4376         * block is formatted into the on-disk inode fork. We still change it,
4377         * though, so everything is consistent in memory.
4378         */
4379        if (!bp) {
4380                ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4381                ASSERT(level == cur->bc_nlevels - 1);
4382                return 0;
4383        }
4384
4385        if (cur->bc_tp) {
4386                if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
4387                        xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4388                        return -EAGAIN;
4389                }
4390        } else {
4391                xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4392        }
4393
4394        return 0;
4395}
4396
4397int
4398xfs_btree_change_owner(
4399        struct xfs_btree_cur    *cur,
4400        uint64_t                new_owner,
4401        struct list_head        *buffer_list)
4402{
4403        struct xfs_btree_block_change_owner_info        bbcoi;
4404
4405        bbcoi.new_owner = new_owner;
4406        bbcoi.buffer_list = buffer_list;
4407
4408        return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4409                        XFS_BTREE_VISIT_ALL, &bbcoi);
4410}
4411
4412/* Verify the v5 fields of a long-format btree block. */
4413xfs_failaddr_t
4414xfs_btree_lblock_v5hdr_verify(
4415        struct xfs_buf          *bp,
4416        uint64_t                owner)
4417{
4418        struct xfs_mount        *mp = bp->b_mount;
4419        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4420
4421        if (!xfs_sb_version_hascrc(&mp->m_sb))
4422                return __this_address;
4423        if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
4424                return __this_address;
4425        if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn))
4426                return __this_address;
4427        if (owner != XFS_RMAP_OWN_UNKNOWN &&
4428            be64_to_cpu(block->bb_u.l.bb_owner) != owner)
4429                return __this_address;
4430        return NULL;
4431}
4432
4433/* Verify a long-format btree block. */
4434xfs_failaddr_t
4435xfs_btree_lblock_verify(
4436        struct xfs_buf          *bp,
4437        unsigned int            max_recs)
4438{
4439        struct xfs_mount        *mp = bp->b_mount;
4440        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4441
4442        /* numrecs verification */
4443        if (be16_to_cpu(block->bb_numrecs) > max_recs)
4444                return __this_address;
4445
4446        /* sibling pointer verification */
4447        if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
4448            !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
4449                return __this_address;
4450        if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
4451            !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
4452                return __this_address;
4453
4454        return NULL;
4455}
4456
4457/**
4458 * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4459 *                                    btree block
4460 *
4461 * @bp: buffer containing the btree block
4462 */
4463xfs_failaddr_t
4464xfs_btree_sblock_v5hdr_verify(
4465        struct xfs_buf          *bp)
4466{
4467        struct xfs_mount        *mp = bp->b_mount;
4468        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4469        struct xfs_perag        *pag = bp->b_pag;
4470
4471        if (!xfs_sb_version_hascrc(&mp->m_sb))
4472                return __this_address;
4473        if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4474                return __this_address;
4475        if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
4476                return __this_address;
4477        if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4478                return __this_address;
4479        return NULL;
4480}
4481
4482/**
4483 * xfs_btree_sblock_verify() -- verify a short-format btree block
4484 *
4485 * @bp: buffer containing the btree block
4486 * @max_recs: maximum records allowed in this btree node
4487 */
4488xfs_failaddr_t
4489xfs_btree_sblock_verify(
4490        struct xfs_buf          *bp,
4491        unsigned int            max_recs)
4492{
4493        struct xfs_mount        *mp = bp->b_mount;
4494        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4495        xfs_agblock_t           agno;
4496
4497        /* numrecs verification */
4498        if (be16_to_cpu(block->bb_numrecs) > max_recs)
4499                return __this_address;
4500
4501        /* sibling pointer verification */
4502        agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
4503        if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
4504            !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
4505                return __this_address;
4506        if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
4507            !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
4508                return __this_address;
4509
4510        return NULL;
4511}
4512
4513/*
4514 * Calculate the number of btree levels needed to store a given number of
4515 * records in a short-format btree.
4516 */
4517uint
4518xfs_btree_compute_maxlevels(
4519        uint                    *limits,
4520        unsigned long           len)
4521{
4522        uint                    level;
4523        unsigned long           maxblocks;
4524
4525        maxblocks = (len + limits[0] - 1) / limits[0];
4526        for (level = 1; maxblocks > 1; level++)
4527                maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4528        return level;
4529}
4530
4531/*
4532 * Query a regular btree for all records overlapping a given interval.
4533 * Start with a LE lookup of the key of low_rec and return all records
4534 * until we find a record with a key greater than the key of high_rec.
4535 */
4536STATIC int
4537xfs_btree_simple_query_range(
4538        struct xfs_btree_cur            *cur,
4539        union xfs_btree_key             *low_key,
4540        union xfs_btree_key             *high_key,
4541        xfs_btree_query_range_fn        fn,
4542        void                            *priv)
4543{
4544        union xfs_btree_rec             *recp;
4545        union xfs_btree_key             rec_key;
4546        int64_t                         diff;
4547        int                             stat;
4548        bool                            firstrec = true;
4549        int                             error;
4550
4551        ASSERT(cur->bc_ops->init_high_key_from_rec);
4552        ASSERT(cur->bc_ops->diff_two_keys);
4553
4554        /*
4555         * Find the leftmost record.  The btree cursor must be set
4556         * to the low record used to generate low_key.
4557         */
4558        stat = 0;
4559        error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
4560        if (error)
4561                goto out;
4562
4563        /* Nothing?  See if there's anything to the right. */
4564        if (!stat) {
4565                error = xfs_btree_increment(cur, 0, &stat);
4566                if (error)
4567                        goto out;
4568        }
4569
4570        while (stat) {
4571                /* Find the record. */
4572                error = xfs_btree_get_rec(cur, &recp, &stat);
4573                if (error || !stat)
4574                        break;
4575
4576                /* Skip if high_key(rec) < low_key. */
4577                if (firstrec) {
4578                        cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4579                        firstrec = false;
4580                        diff = cur->bc_ops->diff_two_keys(cur, low_key,
4581                                        &rec_key);
4582                        if (diff > 0)
4583                                goto advloop;
4584                }
4585
4586                /* Stop if high_key < low_key(rec). */
4587                cur->bc_ops->init_key_from_rec(&rec_key, recp);
4588                diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
4589                if (diff > 0)
4590                        break;
4591
4592                /* Callback */
4593                error = fn(cur, recp, priv);
4594                if (error)
4595                        break;
4596
4597advloop:
4598                /* Move on to the next record. */
4599                error = xfs_btree_increment(cur, 0, &stat);
4600                if (error)
4601                        break;
4602        }
4603
4604out:
4605        return error;
4606}
4607
4608/*
4609 * Query an overlapped interval btree for all records overlapping a given
4610 * interval.  This function roughly follows the algorithm given in
4611 * "Interval Trees" of _Introduction to Algorithms_, which is section
4612 * 14.3 in the 2nd and 3rd editions.
4613 *
4614 * First, generate keys for the low and high records passed in.
4615 *
4616 * For any leaf node, generate the high and low keys for the record.
4617 * If the record keys overlap with the query low/high keys, pass the
4618 * record to the function iterator.
4619 *
4620 * For any internal node, compare the low and high keys of each
4621 * pointer against the query low/high keys.  If there's an overlap,
4622 * follow the pointer.
4623 *
4624 * As an optimization, we stop scanning a block when we find a low key
4625 * that is greater than the query's high key.
4626 */
4627STATIC int
4628xfs_btree_overlapped_query_range(
4629        struct xfs_btree_cur            *cur,
4630        union xfs_btree_key             *low_key,
4631        union xfs_btree_key             *high_key,
4632        xfs_btree_query_range_fn        fn,
4633        void                            *priv)
4634{
4635        union xfs_btree_ptr             ptr;
4636        union xfs_btree_ptr             *pp;
4637        union xfs_btree_key             rec_key;
4638        union xfs_btree_key             rec_hkey;
4639        union xfs_btree_key             *lkp;
4640        union xfs_btree_key             *hkp;
4641        union xfs_btree_rec             *recp;
4642        struct xfs_btree_block          *block;
4643        int64_t                         ldiff;
4644        int64_t                         hdiff;
4645        int                             level;
4646        struct xfs_buf                  *bp;
4647        int                             i;
4648        int                             error;
4649
4650        /* Load the root of the btree. */
4651        level = cur->bc_nlevels - 1;
4652        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
4653        error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
4654        if (error)
4655                return error;
4656        xfs_btree_get_block(cur, level, &bp);
4657        trace_xfs_btree_overlapped_query_range(cur, level, bp);
4658#ifdef DEBUG
4659        error = xfs_btree_check_block(cur, block, level, bp);
4660        if (error)
4661                goto out;
4662#endif
4663        cur->bc_ptrs[level] = 1;
4664
4665        while (level < cur->bc_nlevels) {
4666                block = xfs_btree_get_block(cur, level, &bp);
4667
4668                /* End of node, pop back towards the root. */
4669                if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
4670pop_up:
4671                        if (level < cur->bc_nlevels - 1)
4672                                cur->bc_ptrs[level + 1]++;
4673                        level++;
4674                        continue;
4675                }
4676
4677                if (level == 0) {
4678                        /* Handle a leaf node. */
4679                        recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
4680
4681                        cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
4682                        ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
4683                                        low_key);
4684
4685                        cur->bc_ops->init_key_from_rec(&rec_key, recp);
4686                        hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
4687                                        &rec_key);
4688
4689                        /*
4690                         * If (record's high key >= query's low key) and
4691                         *    (query's high key >= record's low key), then
4692                         * this record overlaps the query range; callback.
4693                         */
4694                        if (ldiff >= 0 && hdiff >= 0) {
4695                                error = fn(cur, recp, priv);
4696                                if (error)
4697                                        break;
4698                        } else if (hdiff < 0) {
4699                                /* Record is larger than high key; pop. */
4700                                goto pop_up;
4701                        }
4702                        cur->bc_ptrs[level]++;
4703                        continue;
4704                }
4705
4706                /* Handle an internal node. */
4707                lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
4708                hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
4709                pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
4710
4711                ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
4712                hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
4713
4714                /*
4715                 * If (pointer's high key >= query's low key) and
4716                 *    (query's high key >= pointer's low key), then
4717                 * this record overlaps the query range; follow pointer.
4718                 */
4719                if (ldiff >= 0 && hdiff >= 0) {
4720                        level--;
4721                        error = xfs_btree_lookup_get_block(cur, level, pp,
4722                                        &block);
4723                        if (error)
4724                                goto out;
4725                        xfs_btree_get_block(cur, level, &bp);
4726                        trace_xfs_btree_overlapped_query_range(cur, level, bp);
4727#ifdef DEBUG
4728                        error = xfs_btree_check_block(cur, block, level, bp);
4729                        if (error)
4730                                goto out;
4731#endif
4732                        cur->bc_ptrs[level] = 1;
4733                        continue;
4734                } else if (hdiff < 0) {
4735                        /* The low key is larger than the upper range; pop. */
4736                        goto pop_up;
4737                }
4738                cur->bc_ptrs[level]++;
4739        }
4740
4741out:
4742        /*
4743         * If we don't end this function with the cursor pointing at a record
4744         * block, a subsequent non-error cursor deletion will not release
4745         * node-level buffers, causing a buffer leak.  This is quite possible
4746         * with a zero-results range query, so release the buffers if we
4747         * failed to return any results.
4748         */
4749        if (cur->bc_bufs[0] == NULL) {
4750                for (i = 0; i < cur->bc_nlevels; i++) {
4751                        if (cur->bc_bufs[i]) {
4752                                xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
4753                                cur->bc_bufs[i] = NULL;
4754                                cur->bc_ptrs[i] = 0;
4755                                cur->bc_ra[i] = 0;
4756                        }
4757                }
4758        }
4759
4760        return error;
4761}
4762
4763/*
4764 * Query a btree for all records overlapping a given interval of keys.  The
4765 * supplied function will be called with each record found; return one of the
4766 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4767 * code.  This function returns -ECANCELED, zero, or a negative error code.
4768 */
4769int
4770xfs_btree_query_range(
4771        struct xfs_btree_cur            *cur,
4772        union xfs_btree_irec            *low_rec,
4773        union xfs_btree_irec            *high_rec,
4774        xfs_btree_query_range_fn        fn,
4775        void                            *priv)
4776{
4777        union xfs_btree_rec             rec;
4778        union xfs_btree_key             low_key;
4779        union xfs_btree_key             high_key;
4780
4781        /* Find the keys of both ends of the interval. */
4782        cur->bc_rec = *high_rec;
4783        cur->bc_ops->init_rec_from_cur(cur, &rec);
4784        cur->bc_ops->init_key_from_rec(&high_key, &rec);
4785
4786        cur->bc_rec = *low_rec;
4787        cur->bc_ops->init_rec_from_cur(cur, &rec);
4788        cur->bc_ops->init_key_from_rec(&low_key, &rec);
4789
4790        /* Enforce low key < high key. */
4791        if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
4792                return -EINVAL;
4793
4794        if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
4795                return xfs_btree_simple_query_range(cur, &low_key,
4796                                &high_key, fn, priv);
4797        return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4798                        fn, priv);
4799}
4800
4801/* Query a btree for all records. */
4802int
4803xfs_btree_query_all(
4804        struct xfs_btree_cur            *cur,
4805        xfs_btree_query_range_fn        fn,
4806        void                            *priv)
4807{
4808        union xfs_btree_key             low_key;
4809        union xfs_btree_key             high_key;
4810
4811        memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
4812        memset(&low_key, 0, sizeof(low_key));
4813        memset(&high_key, 0xFF, sizeof(high_key));
4814
4815        return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
4816}
4817
4818/*
4819 * Calculate the number of blocks needed to store a given number of records
4820 * in a short-format (per-AG metadata) btree.
4821 */
4822unsigned long long
4823xfs_btree_calc_size(
4824        uint                    *limits,
4825        unsigned long long      len)
4826{
4827        int                     level;
4828        int                     maxrecs;
4829        unsigned long long      rval;
4830
4831        maxrecs = limits[0];
4832        for (level = 0, rval = 0; len > 1; level++) {
4833                len += maxrecs - 1;
4834                do_div(len, maxrecs);
4835                maxrecs = limits[1];
4836                rval += len;
4837        }
4838        return rval;
4839}
4840
4841static int
4842xfs_btree_count_blocks_helper(
4843        struct xfs_btree_cur    *cur,
4844        int                     level,
4845        void                    *data)
4846{
4847        xfs_extlen_t            *blocks = data;
4848        (*blocks)++;
4849
4850        return 0;
4851}
4852
4853/* Count the blocks in a btree and return the result in *blocks. */
4854int
4855xfs_btree_count_blocks(
4856        struct xfs_btree_cur    *cur,
4857        xfs_extlen_t            *blocks)
4858{
4859        *blocks = 0;
4860        return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4861                        XFS_BTREE_VISIT_ALL, blocks);
4862}
4863
4864/* Compare two btree pointers. */
4865int64_t
4866xfs_btree_diff_two_ptrs(
4867        struct xfs_btree_cur            *cur,
4868        const union xfs_btree_ptr       *a,
4869        const union xfs_btree_ptr       *b)
4870{
4871        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4872                return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
4873        return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
4874}
4875
4876/* If there's an extent, we're done. */
4877STATIC int
4878xfs_btree_has_record_helper(
4879        struct xfs_btree_cur            *cur,
4880        union xfs_btree_rec             *rec,
4881        void                            *priv)
4882{
4883        return -ECANCELED;
4884}
4885
4886/* Is there a record covering a given range of keys? */
4887int
4888xfs_btree_has_record(
4889        struct xfs_btree_cur    *cur,
4890        union xfs_btree_irec    *low,
4891        union xfs_btree_irec    *high,
4892        bool                    *exists)
4893{
4894        int                     error;
4895
4896        error = xfs_btree_query_range(cur, low, high,
4897                        &xfs_btree_has_record_helper, NULL);
4898        if (error == -ECANCELED) {
4899                *exists = true;
4900                return 0;
4901        }
4902        *exists = false;
4903        return error;
4904}
4905
4906/* Are there more records in this btree? */
4907bool
4908xfs_btree_has_more_records(
4909        struct xfs_btree_cur    *cur)
4910{
4911        struct xfs_btree_block  *block;
4912        struct xfs_buf          *bp;
4913
4914        block = xfs_btree_get_block(cur, 0, &bp);
4915
4916        /* There are still records in this block. */
4917        if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
4918                return true;
4919
4920        /* There are more record blocks. */
4921        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4922                return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
4923        else
4924                return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
4925}
4926