linux/fs/xfs/libxfs/xfs_btree.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_shared.h"
  21#include "xfs_format.h"
  22#include "xfs_log_format.h"
  23#include "xfs_trans_resv.h"
  24#include "xfs_bit.h"
  25#include "xfs_mount.h"
  26#include "xfs_inode.h"
  27#include "xfs_trans.h"
  28#include "xfs_inode_item.h"
  29#include "xfs_buf_item.h"
  30#include "xfs_btree.h"
  31#include "xfs_error.h"
  32#include "xfs_trace.h"
  33#include "xfs_cksum.h"
  34#include "xfs_alloc.h"
  35#include "xfs_log.h"
  36
  37/*
  38 * Cursor allocation zone.
  39 */
  40kmem_zone_t     *xfs_btree_cur_zone;
  41
  42/*
  43 * Btree magic numbers.
  44 */
  45static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
  46        { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
  47          XFS_FIBT_MAGIC },
  48        { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
  49          XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
  50};
  51#define xfs_btree_magic(cur) \
  52        xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
  53
  54
  55STATIC int                              /* error (0 or EFSCORRUPTED) */
  56xfs_btree_check_lblock(
  57        struct xfs_btree_cur    *cur,   /* btree cursor */
  58        struct xfs_btree_block  *block, /* btree long form block pointer */
  59        int                     level,  /* level of the btree block */
  60        struct xfs_buf          *bp)    /* buffer for block, if any */
  61{
  62        int                     lblock_ok = 1; /* block passes checks */
  63        struct xfs_mount        *mp;    /* file system mount point */
  64
  65        mp = cur->bc_mp;
  66
  67        if (xfs_sb_version_hascrc(&mp->m_sb)) {
  68                lblock_ok = lblock_ok &&
  69                        uuid_equal(&block->bb_u.l.bb_uuid,
  70                                   &mp->m_sb.sb_meta_uuid) &&
  71                        block->bb_u.l.bb_blkno == cpu_to_be64(
  72                                bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
  73        }
  74
  75        lblock_ok = lblock_ok &&
  76                be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
  77                be16_to_cpu(block->bb_level) == level &&
  78                be16_to_cpu(block->bb_numrecs) <=
  79                        cur->bc_ops->get_maxrecs(cur, level) &&
  80                block->bb_u.l.bb_leftsib &&
  81                (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
  82                 XFS_FSB_SANITY_CHECK(mp,
  83                        be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
  84                block->bb_u.l.bb_rightsib &&
  85                (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
  86                 XFS_FSB_SANITY_CHECK(mp,
  87                        be64_to_cpu(block->bb_u.l.bb_rightsib)));
  88
  89        if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
  90                        XFS_ERRTAG_BTREE_CHECK_LBLOCK,
  91                        XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
  92                if (bp)
  93                        trace_xfs_btree_corrupt(bp, _RET_IP_);
  94                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
  95                return -EFSCORRUPTED;
  96        }
  97        return 0;
  98}
  99
 100STATIC int                              /* error (0 or EFSCORRUPTED) */
 101xfs_btree_check_sblock(
 102        struct xfs_btree_cur    *cur,   /* btree cursor */
 103        struct xfs_btree_block  *block, /* btree short form block pointer */
 104        int                     level,  /* level of the btree block */
 105        struct xfs_buf          *bp)    /* buffer containing block */
 106{
 107        struct xfs_mount        *mp;    /* file system mount point */
 108        struct xfs_buf          *agbp;  /* buffer for ag. freespace struct */
 109        struct xfs_agf          *agf;   /* ag. freespace structure */
 110        xfs_agblock_t           agflen; /* native ag. freespace length */
 111        int                     sblock_ok = 1; /* block passes checks */
 112
 113        mp = cur->bc_mp;
 114        agbp = cur->bc_private.a.agbp;
 115        agf = XFS_BUF_TO_AGF(agbp);
 116        agflen = be32_to_cpu(agf->agf_length);
 117
 118        if (xfs_sb_version_hascrc(&mp->m_sb)) {
 119                sblock_ok = sblock_ok &&
 120                        uuid_equal(&block->bb_u.s.bb_uuid,
 121                                   &mp->m_sb.sb_meta_uuid) &&
 122                        block->bb_u.s.bb_blkno == cpu_to_be64(
 123                                bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
 124        }
 125
 126        sblock_ok = sblock_ok &&
 127                be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
 128                be16_to_cpu(block->bb_level) == level &&
 129                be16_to_cpu(block->bb_numrecs) <=
 130                        cur->bc_ops->get_maxrecs(cur, level) &&
 131                (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
 132                 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
 133                block->bb_u.s.bb_leftsib &&
 134                (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
 135                 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
 136                block->bb_u.s.bb_rightsib;
 137
 138        if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
 139                        XFS_ERRTAG_BTREE_CHECK_SBLOCK,
 140                        XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
 141                if (bp)
 142                        trace_xfs_btree_corrupt(bp, _RET_IP_);
 143                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
 144                return -EFSCORRUPTED;
 145        }
 146        return 0;
 147}
 148
 149/*
 150 * Debug routine: check that block header is ok.
 151 */
 152int
 153xfs_btree_check_block(
 154        struct xfs_btree_cur    *cur,   /* btree cursor */
 155        struct xfs_btree_block  *block, /* generic btree block pointer */
 156        int                     level,  /* level of the btree block */
 157        struct xfs_buf          *bp)    /* buffer containing block, if any */
 158{
 159        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 160                return xfs_btree_check_lblock(cur, block, level, bp);
 161        else
 162                return xfs_btree_check_sblock(cur, block, level, bp);
 163}
 164
 165/*
 166 * Check that (long) pointer is ok.
 167 */
 168int                                     /* error (0 or EFSCORRUPTED) */
 169xfs_btree_check_lptr(
 170        struct xfs_btree_cur    *cur,   /* btree cursor */
 171        xfs_fsblock_t           bno,    /* btree block disk address */
 172        int                     level)  /* btree block level */
 173{
 174        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
 175                level > 0 &&
 176                bno != NULLFSBLOCK &&
 177                XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
 178        return 0;
 179}
 180
 181#ifdef DEBUG
 182/*
 183 * Check that (short) pointer is ok.
 184 */
 185STATIC int                              /* error (0 or EFSCORRUPTED) */
 186xfs_btree_check_sptr(
 187        struct xfs_btree_cur    *cur,   /* btree cursor */
 188        xfs_agblock_t           bno,    /* btree block disk address */
 189        int                     level)  /* btree block level */
 190{
 191        xfs_agblock_t           agblocks = cur->bc_mp->m_sb.sb_agblocks;
 192
 193        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
 194                level > 0 &&
 195                bno != NULLAGBLOCK &&
 196                bno != 0 &&
 197                bno < agblocks);
 198        return 0;
 199}
 200
 201/*
 202 * Check that block ptr is ok.
 203 */
 204STATIC int                              /* error (0 or EFSCORRUPTED) */
 205xfs_btree_check_ptr(
 206        struct xfs_btree_cur    *cur,   /* btree cursor */
 207        union xfs_btree_ptr     *ptr,   /* btree block disk address */
 208        int                     index,  /* offset from ptr to check */
 209        int                     level)  /* btree block level */
 210{
 211        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 212                return xfs_btree_check_lptr(cur,
 213                                be64_to_cpu((&ptr->l)[index]), level);
 214        } else {
 215                return xfs_btree_check_sptr(cur,
 216                                be32_to_cpu((&ptr->s)[index]), level);
 217        }
 218}
 219#endif
 220
 221/*
 222 * Calculate CRC on the whole btree block and stuff it into the
 223 * long-form btree header.
 224 *
 225 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 226 * it into the buffer so recovery knows what the last modification was that made
 227 * it to disk.
 228 */
 229void
 230xfs_btree_lblock_calc_crc(
 231        struct xfs_buf          *bp)
 232{
 233        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 234        struct xfs_buf_log_item *bip = bp->b_fspriv;
 235
 236        if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
 237                return;
 238        if (bip)
 239                block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 240        xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 241}
 242
 243bool
 244xfs_btree_lblock_verify_crc(
 245        struct xfs_buf          *bp)
 246{
 247        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 248        struct xfs_mount        *mp = bp->b_target->bt_mount;
 249
 250        if (xfs_sb_version_hascrc(&mp->m_sb)) {
 251                if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
 252                        return false;
 253                return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 254        }
 255
 256        return true;
 257}
 258
 259/*
 260 * Calculate CRC on the whole btree block and stuff it into the
 261 * short-form btree header.
 262 *
 263 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 264 * it into the buffer so recovery knows what the last modification was that made
 265 * it to disk.
 266 */
 267void
 268xfs_btree_sblock_calc_crc(
 269        struct xfs_buf          *bp)
 270{
 271        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 272        struct xfs_buf_log_item *bip = bp->b_fspriv;
 273
 274        if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
 275                return;
 276        if (bip)
 277                block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 278        xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 279}
 280
 281bool
 282xfs_btree_sblock_verify_crc(
 283        struct xfs_buf          *bp)
 284{
 285        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 286        struct xfs_mount        *mp = bp->b_target->bt_mount;
 287
 288        if (xfs_sb_version_hascrc(&mp->m_sb)) {
 289                if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
 290                        return false;
 291                return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 292        }
 293
 294        return true;
 295}
 296
 297static int
 298xfs_btree_free_block(
 299        struct xfs_btree_cur    *cur,
 300        struct xfs_buf          *bp)
 301{
 302        int                     error;
 303
 304        error = cur->bc_ops->free_block(cur, bp);
 305        if (!error) {
 306                xfs_trans_binval(cur->bc_tp, bp);
 307                XFS_BTREE_STATS_INC(cur, free);
 308        }
 309        return error;
 310}
 311
 312/*
 313 * Delete the btree cursor.
 314 */
 315void
 316xfs_btree_del_cursor(
 317        xfs_btree_cur_t *cur,           /* btree cursor */
 318        int             error)          /* del because of error */
 319{
 320        int             i;              /* btree level */
 321
 322        /*
 323         * Clear the buffer pointers, and release the buffers.
 324         * If we're doing this in the face of an error, we
 325         * need to make sure to inspect all of the entries
 326         * in the bc_bufs array for buffers to be unlocked.
 327         * This is because some of the btree code works from
 328         * level n down to 0, and if we get an error along
 329         * the way we won't have initialized all the entries
 330         * down to 0.
 331         */
 332        for (i = 0; i < cur->bc_nlevels; i++) {
 333                if (cur->bc_bufs[i])
 334                        xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
 335                else if (!error)
 336                        break;
 337        }
 338        /*
 339         * Can't free a bmap cursor without having dealt with the
 340         * allocated indirect blocks' accounting.
 341         */
 342        ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
 343               cur->bc_private.b.allocated == 0);
 344        /*
 345         * Free the cursor.
 346         */
 347        kmem_zone_free(xfs_btree_cur_zone, cur);
 348}
 349
 350/*
 351 * Duplicate the btree cursor.
 352 * Allocate a new one, copy the record, re-get the buffers.
 353 */
 354int                                     /* error */
 355xfs_btree_dup_cursor(
 356        xfs_btree_cur_t *cur,           /* input cursor */
 357        xfs_btree_cur_t **ncur)         /* output cursor */
 358{
 359        xfs_buf_t       *bp;            /* btree block's buffer pointer */
 360        int             error;          /* error return value */
 361        int             i;              /* level number of btree block */
 362        xfs_mount_t     *mp;            /* mount structure for filesystem */
 363        xfs_btree_cur_t *new;           /* new cursor value */
 364        xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
 365
 366        tp = cur->bc_tp;
 367        mp = cur->bc_mp;
 368
 369        /*
 370         * Allocate a new cursor like the old one.
 371         */
 372        new = cur->bc_ops->dup_cursor(cur);
 373
 374        /*
 375         * Copy the record currently in the cursor.
 376         */
 377        new->bc_rec = cur->bc_rec;
 378
 379        /*
 380         * For each level current, re-get the buffer and copy the ptr value.
 381         */
 382        for (i = 0; i < new->bc_nlevels; i++) {
 383                new->bc_ptrs[i] = cur->bc_ptrs[i];
 384                new->bc_ra[i] = cur->bc_ra[i];
 385                bp = cur->bc_bufs[i];
 386                if (bp) {
 387                        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 388                                                   XFS_BUF_ADDR(bp), mp->m_bsize,
 389                                                   0, &bp,
 390                                                   cur->bc_ops->buf_ops);
 391                        if (error) {
 392                                xfs_btree_del_cursor(new, error);
 393                                *ncur = NULL;
 394                                return error;
 395                        }
 396                }
 397                new->bc_bufs[i] = bp;
 398        }
 399        *ncur = new;
 400        return 0;
 401}
 402
 403/*
 404 * XFS btree block layout and addressing:
 405 *
 406 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 407 *
 408 * The leaf record start with a header then followed by records containing
 409 * the values.  A non-leaf block also starts with the same header, and
 410 * then first contains lookup keys followed by an equal number of pointers
 411 * to the btree blocks at the previous level.
 412 *
 413 *              +--------+-------+-------+-------+-------+-------+-------+
 414 * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 415 *              +--------+-------+-------+-------+-------+-------+-------+
 416 *
 417 *              +--------+-------+-------+-------+-------+-------+-------+
 418 * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 419 *              +--------+-------+-------+-------+-------+-------+-------+
 420 *
 421 * The header is called struct xfs_btree_block for reasons better left unknown
 422 * and comes in different versions for short (32bit) and long (64bit) block
 423 * pointers.  The record and key structures are defined by the btree instances
 424 * and opaque to the btree core.  The block pointers are simple disk endian
 425 * integers, available in a short (32bit) and long (64bit) variant.
 426 *
 427 * The helpers below calculate the offset of a given record, key or pointer
 428 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 429 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
 430 * inside the btree block is done using indices starting at one, not zero!
 431 */
 432
 433/*
 434 * Return size of the btree block header for this btree instance.
 435 */
 436static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
 437{
 438        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 439                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 440                        return XFS_BTREE_LBLOCK_CRC_LEN;
 441                return XFS_BTREE_LBLOCK_LEN;
 442        }
 443        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 444                return XFS_BTREE_SBLOCK_CRC_LEN;
 445        return XFS_BTREE_SBLOCK_LEN;
 446}
 447
 448/*
 449 * Return size of btree block pointers for this btree instance.
 450 */
 451static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
 452{
 453        return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
 454                sizeof(__be64) : sizeof(__be32);
 455}
 456
 457/*
 458 * Calculate offset of the n-th record in a btree block.
 459 */
 460STATIC size_t
 461xfs_btree_rec_offset(
 462        struct xfs_btree_cur    *cur,
 463        int                     n)
 464{
 465        return xfs_btree_block_len(cur) +
 466                (n - 1) * cur->bc_ops->rec_len;
 467}
 468
 469/*
 470 * Calculate offset of the n-th key in a btree block.
 471 */
 472STATIC size_t
 473xfs_btree_key_offset(
 474        struct xfs_btree_cur    *cur,
 475        int                     n)
 476{
 477        return xfs_btree_block_len(cur) +
 478                (n - 1) * cur->bc_ops->key_len;
 479}
 480
 481/*
 482 * Calculate offset of the n-th block pointer in a btree block.
 483 */
 484STATIC size_t
 485xfs_btree_ptr_offset(
 486        struct xfs_btree_cur    *cur,
 487        int                     n,
 488        int                     level)
 489{
 490        return xfs_btree_block_len(cur) +
 491                cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
 492                (n - 1) * xfs_btree_ptr_len(cur);
 493}
 494
 495/*
 496 * Return a pointer to the n-th record in the btree block.
 497 */
 498STATIC union xfs_btree_rec *
 499xfs_btree_rec_addr(
 500        struct xfs_btree_cur    *cur,
 501        int                     n,
 502        struct xfs_btree_block  *block)
 503{
 504        return (union xfs_btree_rec *)
 505                ((char *)block + xfs_btree_rec_offset(cur, n));
 506}
 507
 508/*
 509 * Return a pointer to the n-th key in the btree block.
 510 */
 511STATIC union xfs_btree_key *
 512xfs_btree_key_addr(
 513        struct xfs_btree_cur    *cur,
 514        int                     n,
 515        struct xfs_btree_block  *block)
 516{
 517        return (union xfs_btree_key *)
 518                ((char *)block + xfs_btree_key_offset(cur, n));
 519}
 520
 521/*
 522 * Return a pointer to the n-th block pointer in the btree block.
 523 */
 524STATIC union xfs_btree_ptr *
 525xfs_btree_ptr_addr(
 526        struct xfs_btree_cur    *cur,
 527        int                     n,
 528        struct xfs_btree_block  *block)
 529{
 530        int                     level = xfs_btree_get_level(block);
 531
 532        ASSERT(block->bb_level != 0);
 533
 534        return (union xfs_btree_ptr *)
 535                ((char *)block + xfs_btree_ptr_offset(cur, n, level));
 536}
 537
 538/*
 539 * Get the root block which is stored in the inode.
 540 *
 541 * For now this btree implementation assumes the btree root is always
 542 * stored in the if_broot field of an inode fork.
 543 */
 544STATIC struct xfs_btree_block *
 545xfs_btree_get_iroot(
 546       struct xfs_btree_cur    *cur)
 547{
 548       struct xfs_ifork        *ifp;
 549
 550       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
 551       return (struct xfs_btree_block *)ifp->if_broot;
 552}
 553
 554/*
 555 * Retrieve the block pointer from the cursor at the given level.
 556 * This may be an inode btree root or from a buffer.
 557 */
 558STATIC struct xfs_btree_block *         /* generic btree block pointer */
 559xfs_btree_get_block(
 560        struct xfs_btree_cur    *cur,   /* btree cursor */
 561        int                     level,  /* level in btree */
 562        struct xfs_buf          **bpp)  /* buffer containing the block */
 563{
 564        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 565            (level == cur->bc_nlevels - 1)) {
 566                *bpp = NULL;
 567                return xfs_btree_get_iroot(cur);
 568        }
 569
 570        *bpp = cur->bc_bufs[level];
 571        return XFS_BUF_TO_BLOCK(*bpp);
 572}
 573
 574/*
 575 * Get a buffer for the block, return it with no data read.
 576 * Long-form addressing.
 577 */
 578xfs_buf_t *                             /* buffer for fsbno */
 579xfs_btree_get_bufl(
 580        xfs_mount_t     *mp,            /* file system mount point */
 581        xfs_trans_t     *tp,            /* transaction pointer */
 582        xfs_fsblock_t   fsbno,          /* file system block number */
 583        uint            lock)           /* lock flags for get_buf */
 584{
 585        xfs_daddr_t             d;              /* real disk block address */
 586
 587        ASSERT(fsbno != NULLFSBLOCK);
 588        d = XFS_FSB_TO_DADDR(mp, fsbno);
 589        return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
 590}
 591
 592/*
 593 * Get a buffer for the block, return it with no data read.
 594 * Short-form addressing.
 595 */
 596xfs_buf_t *                             /* buffer for agno/agbno */
 597xfs_btree_get_bufs(
 598        xfs_mount_t     *mp,            /* file system mount point */
 599        xfs_trans_t     *tp,            /* transaction pointer */
 600        xfs_agnumber_t  agno,           /* allocation group number */
 601        xfs_agblock_t   agbno,          /* allocation group block number */
 602        uint            lock)           /* lock flags for get_buf */
 603{
 604        xfs_daddr_t             d;              /* real disk block address */
 605
 606        ASSERT(agno != NULLAGNUMBER);
 607        ASSERT(agbno != NULLAGBLOCK);
 608        d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 609        return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
 610}
 611
 612/*
 613 * Check for the cursor referring to the last block at the given level.
 614 */
 615int                                     /* 1=is last block, 0=not last block */
 616xfs_btree_islastblock(
 617        xfs_btree_cur_t         *cur,   /* btree cursor */
 618        int                     level)  /* level to check */
 619{
 620        struct xfs_btree_block  *block; /* generic btree block pointer */
 621        xfs_buf_t               *bp;    /* buffer containing block */
 622
 623        block = xfs_btree_get_block(cur, level, &bp);
 624        xfs_btree_check_block(cur, block, level, bp);
 625        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 626                return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
 627        else
 628                return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
 629}
 630
 631/*
 632 * Change the cursor to point to the first record at the given level.
 633 * Other levels are unaffected.
 634 */
 635STATIC int                              /* success=1, failure=0 */
 636xfs_btree_firstrec(
 637        xfs_btree_cur_t         *cur,   /* btree cursor */
 638        int                     level)  /* level to change */
 639{
 640        struct xfs_btree_block  *block; /* generic btree block pointer */
 641        xfs_buf_t               *bp;    /* buffer containing block */
 642
 643        /*
 644         * Get the block pointer for this level.
 645         */
 646        block = xfs_btree_get_block(cur, level, &bp);
 647        xfs_btree_check_block(cur, block, level, bp);
 648        /*
 649         * It's empty, there is no such record.
 650         */
 651        if (!block->bb_numrecs)
 652                return 0;
 653        /*
 654         * Set the ptr value to 1, that's the first record/key.
 655         */
 656        cur->bc_ptrs[level] = 1;
 657        return 1;
 658}
 659
 660/*
 661 * Change the cursor to point to the last record in the current block
 662 * at the given level.  Other levels are unaffected.
 663 */
 664STATIC int                              /* success=1, failure=0 */
 665xfs_btree_lastrec(
 666        xfs_btree_cur_t         *cur,   /* btree cursor */
 667        int                     level)  /* level to change */
 668{
 669        struct xfs_btree_block  *block; /* generic btree block pointer */
 670        xfs_buf_t               *bp;    /* buffer containing block */
 671
 672        /*
 673         * Get the block pointer for this level.
 674         */
 675        block = xfs_btree_get_block(cur, level, &bp);
 676        xfs_btree_check_block(cur, block, level, bp);
 677        /*
 678         * It's empty, there is no such record.
 679         */
 680        if (!block->bb_numrecs)
 681                return 0;
 682        /*
 683         * Set the ptr value to numrecs, that's the last record/key.
 684         */
 685        cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
 686        return 1;
 687}
 688
 689/*
 690 * Compute first and last byte offsets for the fields given.
 691 * Interprets the offsets table, which contains struct field offsets.
 692 */
 693void
 694xfs_btree_offsets(
 695        __int64_t       fields,         /* bitmask of fields */
 696        const short     *offsets,       /* table of field offsets */
 697        int             nbits,          /* number of bits to inspect */
 698        int             *first,         /* output: first byte offset */
 699        int             *last)          /* output: last byte offset */
 700{
 701        int             i;              /* current bit number */
 702        __int64_t       imask;          /* mask for current bit number */
 703
 704        ASSERT(fields != 0);
 705        /*
 706         * Find the lowest bit, so the first byte offset.
 707         */
 708        for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
 709                if (imask & fields) {
 710                        *first = offsets[i];
 711                        break;
 712                }
 713        }
 714        /*
 715         * Find the highest bit, so the last byte offset.
 716         */
 717        for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
 718                if (imask & fields) {
 719                        *last = offsets[i + 1] - 1;
 720                        break;
 721                }
 722        }
 723}
 724
 725/*
 726 * Get a buffer for the block, return it read in.
 727 * Long-form addressing.
 728 */
 729int
 730xfs_btree_read_bufl(
 731        struct xfs_mount        *mp,            /* file system mount point */
 732        struct xfs_trans        *tp,            /* transaction pointer */
 733        xfs_fsblock_t           fsbno,          /* file system block number */
 734        uint                    lock,           /* lock flags for read_buf */
 735        struct xfs_buf          **bpp,          /* buffer for fsbno */
 736        int                     refval,         /* ref count value for buffer */
 737        const struct xfs_buf_ops *ops)
 738{
 739        struct xfs_buf          *bp;            /* return value */
 740        xfs_daddr_t             d;              /* real disk block address */
 741        int                     error;
 742
 743        ASSERT(fsbno != NULLFSBLOCK);
 744        d = XFS_FSB_TO_DADDR(mp, fsbno);
 745        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
 746                                   mp->m_bsize, lock, &bp, ops);
 747        if (error)
 748                return error;
 749        if (bp)
 750                xfs_buf_set_ref(bp, refval);
 751        *bpp = bp;
 752        return 0;
 753}
 754
 755/*
 756 * Read-ahead the block, don't wait for it, don't return a buffer.
 757 * Long-form addressing.
 758 */
 759/* ARGSUSED */
 760void
 761xfs_btree_reada_bufl(
 762        struct xfs_mount        *mp,            /* file system mount point */
 763        xfs_fsblock_t           fsbno,          /* file system block number */
 764        xfs_extlen_t            count,          /* count of filesystem blocks */
 765        const struct xfs_buf_ops *ops)
 766{
 767        xfs_daddr_t             d;
 768
 769        ASSERT(fsbno != NULLFSBLOCK);
 770        d = XFS_FSB_TO_DADDR(mp, fsbno);
 771        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 772}
 773
 774/*
 775 * Read-ahead the block, don't wait for it, don't return a buffer.
 776 * Short-form addressing.
 777 */
 778/* ARGSUSED */
 779void
 780xfs_btree_reada_bufs(
 781        struct xfs_mount        *mp,            /* file system mount point */
 782        xfs_agnumber_t          agno,           /* allocation group number */
 783        xfs_agblock_t           agbno,          /* allocation group block number */
 784        xfs_extlen_t            count,          /* count of filesystem blocks */
 785        const struct xfs_buf_ops *ops)
 786{
 787        xfs_daddr_t             d;
 788
 789        ASSERT(agno != NULLAGNUMBER);
 790        ASSERT(agbno != NULLAGBLOCK);
 791        d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 792        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 793}
 794
 795STATIC int
 796xfs_btree_readahead_lblock(
 797        struct xfs_btree_cur    *cur,
 798        int                     lr,
 799        struct xfs_btree_block  *block)
 800{
 801        int                     rval = 0;
 802        xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
 803        xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 804
 805        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
 806                xfs_btree_reada_bufl(cur->bc_mp, left, 1,
 807                                     cur->bc_ops->buf_ops);
 808                rval++;
 809        }
 810
 811        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
 812                xfs_btree_reada_bufl(cur->bc_mp, right, 1,
 813                                     cur->bc_ops->buf_ops);
 814                rval++;
 815        }
 816
 817        return rval;
 818}
 819
 820STATIC int
 821xfs_btree_readahead_sblock(
 822        struct xfs_btree_cur    *cur,
 823        int                     lr,
 824        struct xfs_btree_block *block)
 825{
 826        int                     rval = 0;
 827        xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
 828        xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
 829
 830
 831        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
 832                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
 833                                     left, 1, cur->bc_ops->buf_ops);
 834                rval++;
 835        }
 836
 837        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
 838                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
 839                                     right, 1, cur->bc_ops->buf_ops);
 840                rval++;
 841        }
 842
 843        return rval;
 844}
 845
 846/*
 847 * Read-ahead btree blocks, at the given level.
 848 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 849 */
 850STATIC int
 851xfs_btree_readahead(
 852        struct xfs_btree_cur    *cur,           /* btree cursor */
 853        int                     lev,            /* level in btree */
 854        int                     lr)             /* left/right bits */
 855{
 856        struct xfs_btree_block  *block;
 857
 858        /*
 859         * No readahead needed if we are at the root level and the
 860         * btree root is stored in the inode.
 861         */
 862        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 863            (lev == cur->bc_nlevels - 1))
 864                return 0;
 865
 866        if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
 867                return 0;
 868
 869        cur->bc_ra[lev] |= lr;
 870        block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
 871
 872        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 873                return xfs_btree_readahead_lblock(cur, lr, block);
 874        return xfs_btree_readahead_sblock(cur, lr, block);
 875}
 876
 877STATIC xfs_daddr_t
 878xfs_btree_ptr_to_daddr(
 879        struct xfs_btree_cur    *cur,
 880        union xfs_btree_ptr     *ptr)
 881{
 882        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 883                ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
 884
 885                return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
 886        } else {
 887                ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
 888                ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
 889
 890                return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
 891                                        be32_to_cpu(ptr->s));
 892        }
 893}
 894
 895/*
 896 * Readahead @count btree blocks at the given @ptr location.
 897 *
 898 * We don't need to care about long or short form btrees here as we have a
 899 * method of converting the ptr directly to a daddr available to us.
 900 */
 901STATIC void
 902xfs_btree_readahead_ptr(
 903        struct xfs_btree_cur    *cur,
 904        union xfs_btree_ptr     *ptr,
 905        xfs_extlen_t            count)
 906{
 907        xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
 908                          xfs_btree_ptr_to_daddr(cur, ptr),
 909                          cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
 910}
 911
 912/*
 913 * Set the buffer for level "lev" in the cursor to bp, releasing
 914 * any previous buffer.
 915 */
 916STATIC void
 917xfs_btree_setbuf(
 918        xfs_btree_cur_t         *cur,   /* btree cursor */
 919        int                     lev,    /* level in btree */
 920        xfs_buf_t               *bp)    /* new buffer to set */
 921{
 922        struct xfs_btree_block  *b;     /* btree block */
 923
 924        if (cur->bc_bufs[lev])
 925                xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
 926        cur->bc_bufs[lev] = bp;
 927        cur->bc_ra[lev] = 0;
 928
 929        b = XFS_BUF_TO_BLOCK(bp);
 930        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 931                if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
 932                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
 933                if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
 934                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
 935        } else {
 936                if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
 937                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
 938                if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
 939                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
 940        }
 941}
 942
 943STATIC int
 944xfs_btree_ptr_is_null(
 945        struct xfs_btree_cur    *cur,
 946        union xfs_btree_ptr     *ptr)
 947{
 948        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 949                return ptr->l == cpu_to_be64(NULLFSBLOCK);
 950        else
 951                return ptr->s == cpu_to_be32(NULLAGBLOCK);
 952}
 953
 954STATIC void
 955xfs_btree_set_ptr_null(
 956        struct xfs_btree_cur    *cur,
 957        union xfs_btree_ptr     *ptr)
 958{
 959        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 960                ptr->l = cpu_to_be64(NULLFSBLOCK);
 961        else
 962                ptr->s = cpu_to_be32(NULLAGBLOCK);
 963}
 964
 965/*
 966 * Get/set/init sibling pointers
 967 */
 968STATIC void
 969xfs_btree_get_sibling(
 970        struct xfs_btree_cur    *cur,
 971        struct xfs_btree_block  *block,
 972        union xfs_btree_ptr     *ptr,
 973        int                     lr)
 974{
 975        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
 976
 977        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 978                if (lr == XFS_BB_RIGHTSIB)
 979                        ptr->l = block->bb_u.l.bb_rightsib;
 980                else
 981                        ptr->l = block->bb_u.l.bb_leftsib;
 982        } else {
 983                if (lr == XFS_BB_RIGHTSIB)
 984                        ptr->s = block->bb_u.s.bb_rightsib;
 985                else
 986                        ptr->s = block->bb_u.s.bb_leftsib;
 987        }
 988}
 989
 990STATIC void
 991xfs_btree_set_sibling(
 992        struct xfs_btree_cur    *cur,
 993        struct xfs_btree_block  *block,
 994        union xfs_btree_ptr     *ptr,
 995        int                     lr)
 996{
 997        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
 998
 999        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1000                if (lr == XFS_BB_RIGHTSIB)
1001                        block->bb_u.l.bb_rightsib = ptr->l;
1002                else
1003                        block->bb_u.l.bb_leftsib = ptr->l;
1004        } else {
1005                if (lr == XFS_BB_RIGHTSIB)
1006                        block->bb_u.s.bb_rightsib = ptr->s;
1007                else
1008                        block->bb_u.s.bb_leftsib = ptr->s;
1009        }
1010}
1011
1012void
1013xfs_btree_init_block_int(
1014        struct xfs_mount        *mp,
1015        struct xfs_btree_block  *buf,
1016        xfs_daddr_t             blkno,
1017        __u32                   magic,
1018        __u16                   level,
1019        __u16                   numrecs,
1020        __u64                   owner,
1021        unsigned int            flags)
1022{
1023        buf->bb_magic = cpu_to_be32(magic);
1024        buf->bb_level = cpu_to_be16(level);
1025        buf->bb_numrecs = cpu_to_be16(numrecs);
1026
1027        if (flags & XFS_BTREE_LONG_PTRS) {
1028                buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1029                buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1030                if (flags & XFS_BTREE_CRC_BLOCKS) {
1031                        buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1032                        buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1033                        uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1034                        buf->bb_u.l.bb_pad = 0;
1035                        buf->bb_u.l.bb_lsn = 0;
1036                }
1037        } else {
1038                /* owner is a 32 bit value on short blocks */
1039                __u32 __owner = (__u32)owner;
1040
1041                buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1042                buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1043                if (flags & XFS_BTREE_CRC_BLOCKS) {
1044                        buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1045                        buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1046                        uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1047                        buf->bb_u.s.bb_lsn = 0;
1048                }
1049        }
1050}
1051
1052void
1053xfs_btree_init_block(
1054        struct xfs_mount *mp,
1055        struct xfs_buf  *bp,
1056        __u32           magic,
1057        __u16           level,
1058        __u16           numrecs,
1059        __u64           owner,
1060        unsigned int    flags)
1061{
1062        xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1063                                 magic, level, numrecs, owner, flags);
1064}
1065
1066STATIC void
1067xfs_btree_init_block_cur(
1068        struct xfs_btree_cur    *cur,
1069        struct xfs_buf          *bp,
1070        int                     level,
1071        int                     numrecs)
1072{
1073        __u64 owner;
1074
1075        /*
1076         * we can pull the owner from the cursor right now as the different
1077         * owners align directly with the pointer size of the btree. This may
1078         * change in future, but is safe for current users of the generic btree
1079         * code.
1080         */
1081        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1082                owner = cur->bc_private.b.ip->i_ino;
1083        else
1084                owner = cur->bc_private.a.agno;
1085
1086        xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1087                                 xfs_btree_magic(cur), level, numrecs,
1088                                 owner, cur->bc_flags);
1089}
1090
1091/*
1092 * Return true if ptr is the last record in the btree and
1093 * we need to track updates to this record.  The decision
1094 * will be further refined in the update_lastrec method.
1095 */
1096STATIC int
1097xfs_btree_is_lastrec(
1098        struct xfs_btree_cur    *cur,
1099        struct xfs_btree_block  *block,
1100        int                     level)
1101{
1102        union xfs_btree_ptr     ptr;
1103
1104        if (level > 0)
1105                return 0;
1106        if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1107                return 0;
1108
1109        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1110        if (!xfs_btree_ptr_is_null(cur, &ptr))
1111                return 0;
1112        return 1;
1113}
1114
1115STATIC void
1116xfs_btree_buf_to_ptr(
1117        struct xfs_btree_cur    *cur,
1118        struct xfs_buf          *bp,
1119        union xfs_btree_ptr     *ptr)
1120{
1121        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1122                ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1123                                        XFS_BUF_ADDR(bp)));
1124        else {
1125                ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1126                                        XFS_BUF_ADDR(bp)));
1127        }
1128}
1129
1130STATIC void
1131xfs_btree_set_refs(
1132        struct xfs_btree_cur    *cur,
1133        struct xfs_buf          *bp)
1134{
1135        switch (cur->bc_btnum) {
1136        case XFS_BTNUM_BNO:
1137        case XFS_BTNUM_CNT:
1138                xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1139                break;
1140        case XFS_BTNUM_INO:
1141        case XFS_BTNUM_FINO:
1142                xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1143                break;
1144        case XFS_BTNUM_BMAP:
1145                xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1146                break;
1147        default:
1148                ASSERT(0);
1149        }
1150}
1151
1152STATIC int
1153xfs_btree_get_buf_block(
1154        struct xfs_btree_cur    *cur,
1155        union xfs_btree_ptr     *ptr,
1156        int                     flags,
1157        struct xfs_btree_block  **block,
1158        struct xfs_buf          **bpp)
1159{
1160        struct xfs_mount        *mp = cur->bc_mp;
1161        xfs_daddr_t             d;
1162
1163        /* need to sort out how callers deal with failures first */
1164        ASSERT(!(flags & XBF_TRYLOCK));
1165
1166        d = xfs_btree_ptr_to_daddr(cur, ptr);
1167        *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1168                                 mp->m_bsize, flags);
1169
1170        if (!*bpp)
1171                return -ENOMEM;
1172
1173        (*bpp)->b_ops = cur->bc_ops->buf_ops;
1174        *block = XFS_BUF_TO_BLOCK(*bpp);
1175        return 0;
1176}
1177
1178/*
1179 * Read in the buffer at the given ptr and return the buffer and
1180 * the block pointer within the buffer.
1181 */
1182STATIC int
1183xfs_btree_read_buf_block(
1184        struct xfs_btree_cur    *cur,
1185        union xfs_btree_ptr     *ptr,
1186        int                     flags,
1187        struct xfs_btree_block  **block,
1188        struct xfs_buf          **bpp)
1189{
1190        struct xfs_mount        *mp = cur->bc_mp;
1191        xfs_daddr_t             d;
1192        int                     error;
1193
1194        /* need to sort out how callers deal with failures first */
1195        ASSERT(!(flags & XBF_TRYLOCK));
1196
1197        d = xfs_btree_ptr_to_daddr(cur, ptr);
1198        error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1199                                   mp->m_bsize, flags, bpp,
1200                                   cur->bc_ops->buf_ops);
1201        if (error)
1202                return error;
1203
1204        xfs_btree_set_refs(cur, *bpp);
1205        *block = XFS_BUF_TO_BLOCK(*bpp);
1206        return 0;
1207}
1208
1209/*
1210 * Copy keys from one btree block to another.
1211 */
1212STATIC void
1213xfs_btree_copy_keys(
1214        struct xfs_btree_cur    *cur,
1215        union xfs_btree_key     *dst_key,
1216        union xfs_btree_key     *src_key,
1217        int                     numkeys)
1218{
1219        ASSERT(numkeys >= 0);
1220        memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1221}
1222
1223/*
1224 * Copy records from one btree block to another.
1225 */
1226STATIC void
1227xfs_btree_copy_recs(
1228        struct xfs_btree_cur    *cur,
1229        union xfs_btree_rec     *dst_rec,
1230        union xfs_btree_rec     *src_rec,
1231        int                     numrecs)
1232{
1233        ASSERT(numrecs >= 0);
1234        memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1235}
1236
1237/*
1238 * Copy block pointers from one btree block to another.
1239 */
1240STATIC void
1241xfs_btree_copy_ptrs(
1242        struct xfs_btree_cur    *cur,
1243        union xfs_btree_ptr     *dst_ptr,
1244        union xfs_btree_ptr     *src_ptr,
1245        int                     numptrs)
1246{
1247        ASSERT(numptrs >= 0);
1248        memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1249}
1250
1251/*
1252 * Shift keys one index left/right inside a single btree block.
1253 */
1254STATIC void
1255xfs_btree_shift_keys(
1256        struct xfs_btree_cur    *cur,
1257        union xfs_btree_key     *key,
1258        int                     dir,
1259        int                     numkeys)
1260{
1261        char                    *dst_key;
1262
1263        ASSERT(numkeys >= 0);
1264        ASSERT(dir == 1 || dir == -1);
1265
1266        dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1267        memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1268}
1269
1270/*
1271 * Shift records one index left/right inside a single btree block.
1272 */
1273STATIC void
1274xfs_btree_shift_recs(
1275        struct xfs_btree_cur    *cur,
1276        union xfs_btree_rec     *rec,
1277        int                     dir,
1278        int                     numrecs)
1279{
1280        char                    *dst_rec;
1281
1282        ASSERT(numrecs >= 0);
1283        ASSERT(dir == 1 || dir == -1);
1284
1285        dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1286        memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1287}
1288
1289/*
1290 * Shift block pointers one index left/right inside a single btree block.
1291 */
1292STATIC void
1293xfs_btree_shift_ptrs(
1294        struct xfs_btree_cur    *cur,
1295        union xfs_btree_ptr     *ptr,
1296        int                     dir,
1297        int                     numptrs)
1298{
1299        char                    *dst_ptr;
1300
1301        ASSERT(numptrs >= 0);
1302        ASSERT(dir == 1 || dir == -1);
1303
1304        dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1305        memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1306}
1307
1308/*
1309 * Log key values from the btree block.
1310 */
1311STATIC void
1312xfs_btree_log_keys(
1313        struct xfs_btree_cur    *cur,
1314        struct xfs_buf          *bp,
1315        int                     first,
1316        int                     last)
1317{
1318        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1319        XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1320
1321        if (bp) {
1322                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1323                xfs_trans_log_buf(cur->bc_tp, bp,
1324                                  xfs_btree_key_offset(cur, first),
1325                                  xfs_btree_key_offset(cur, last + 1) - 1);
1326        } else {
1327                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1328                                xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1329        }
1330
1331        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1332}
1333
1334/*
1335 * Log record values from the btree block.
1336 */
1337void
1338xfs_btree_log_recs(
1339        struct xfs_btree_cur    *cur,
1340        struct xfs_buf          *bp,
1341        int                     first,
1342        int                     last)
1343{
1344        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1345        XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1346
1347        xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1348        xfs_trans_log_buf(cur->bc_tp, bp,
1349                          xfs_btree_rec_offset(cur, first),
1350                          xfs_btree_rec_offset(cur, last + 1) - 1);
1351
1352        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1353}
1354
1355/*
1356 * Log block pointer fields from a btree block (nonleaf).
1357 */
1358STATIC void
1359xfs_btree_log_ptrs(
1360        struct xfs_btree_cur    *cur,   /* btree cursor */
1361        struct xfs_buf          *bp,    /* buffer containing btree block */
1362        int                     first,  /* index of first pointer to log */
1363        int                     last)   /* index of last pointer to log */
1364{
1365        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1366        XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1367
1368        if (bp) {
1369                struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1370                int                     level = xfs_btree_get_level(block);
1371
1372                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1373                xfs_trans_log_buf(cur->bc_tp, bp,
1374                                xfs_btree_ptr_offset(cur, first, level),
1375                                xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1376        } else {
1377                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1378                        xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1379        }
1380
1381        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1382}
1383
1384/*
1385 * Log fields from a btree block header.
1386 */
1387void
1388xfs_btree_log_block(
1389        struct xfs_btree_cur    *cur,   /* btree cursor */
1390        struct xfs_buf          *bp,    /* buffer containing btree block */
1391        int                     fields) /* mask of fields: XFS_BB_... */
1392{
1393        int                     first;  /* first byte offset logged */
1394        int                     last;   /* last byte offset logged */
1395        static const short      soffsets[] = {  /* table of offsets (short) */
1396                offsetof(struct xfs_btree_block, bb_magic),
1397                offsetof(struct xfs_btree_block, bb_level),
1398                offsetof(struct xfs_btree_block, bb_numrecs),
1399                offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1400                offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1401                offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1402                offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1403                offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1404                offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1405                offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1406                XFS_BTREE_SBLOCK_CRC_LEN
1407        };
1408        static const short      loffsets[] = {  /* table of offsets (long) */
1409                offsetof(struct xfs_btree_block, bb_magic),
1410                offsetof(struct xfs_btree_block, bb_level),
1411                offsetof(struct xfs_btree_block, bb_numrecs),
1412                offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1413                offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1414                offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1415                offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1416                offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1417                offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1418                offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1419                offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1420                XFS_BTREE_LBLOCK_CRC_LEN
1421        };
1422
1423        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1424        XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
1425
1426        if (bp) {
1427                int nbits;
1428
1429                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1430                        /*
1431                         * We don't log the CRC when updating a btree
1432                         * block but instead recreate it during log
1433                         * recovery.  As the log buffers have checksums
1434                         * of their own this is safe and avoids logging a crc
1435                         * update in a lot of places.
1436                         */
1437                        if (fields == XFS_BB_ALL_BITS)
1438                                fields = XFS_BB_ALL_BITS_CRC;
1439                        nbits = XFS_BB_NUM_BITS_CRC;
1440                } else {
1441                        nbits = XFS_BB_NUM_BITS;
1442                }
1443                xfs_btree_offsets(fields,
1444                                  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1445                                        loffsets : soffsets,
1446                                  nbits, &first, &last);
1447                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1448                xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1449        } else {
1450                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1451                        xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1452        }
1453
1454        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1455}
1456
1457/*
1458 * Increment cursor by one record at the level.
1459 * For nonzero levels the leaf-ward information is untouched.
1460 */
1461int                                             /* error */
1462xfs_btree_increment(
1463        struct xfs_btree_cur    *cur,
1464        int                     level,
1465        int                     *stat)          /* success/failure */
1466{
1467        struct xfs_btree_block  *block;
1468        union xfs_btree_ptr     ptr;
1469        struct xfs_buf          *bp;
1470        int                     error;          /* error return value */
1471        int                     lev;
1472
1473        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1474        XFS_BTREE_TRACE_ARGI(cur, level);
1475
1476        ASSERT(level < cur->bc_nlevels);
1477
1478        /* Read-ahead to the right at this level. */
1479        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1480
1481        /* Get a pointer to the btree block. */
1482        block = xfs_btree_get_block(cur, level, &bp);
1483
1484#ifdef DEBUG
1485        error = xfs_btree_check_block(cur, block, level, bp);
1486        if (error)
1487                goto error0;
1488#endif
1489
1490        /* We're done if we remain in the block after the increment. */
1491        if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1492                goto out1;
1493
1494        /* Fail if we just went off the right edge of the tree. */
1495        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1496        if (xfs_btree_ptr_is_null(cur, &ptr))
1497                goto out0;
1498
1499        XFS_BTREE_STATS_INC(cur, increment);
1500
1501        /*
1502         * March up the tree incrementing pointers.
1503         * Stop when we don't go off the right edge of a block.
1504         */
1505        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1506                block = xfs_btree_get_block(cur, lev, &bp);
1507
1508#ifdef DEBUG
1509                error = xfs_btree_check_block(cur, block, lev, bp);
1510                if (error)
1511                        goto error0;
1512#endif
1513
1514                if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1515                        break;
1516
1517                /* Read-ahead the right block for the next loop. */
1518                xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1519        }
1520
1521        /*
1522         * If we went off the root then we are either seriously
1523         * confused or have the tree root in an inode.
1524         */
1525        if (lev == cur->bc_nlevels) {
1526                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1527                        goto out0;
1528                ASSERT(0);
1529                error = -EFSCORRUPTED;
1530                goto error0;
1531        }
1532        ASSERT(lev < cur->bc_nlevels);
1533
1534        /*
1535         * Now walk back down the tree, fixing up the cursor's buffer
1536         * pointers and key numbers.
1537         */
1538        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1539                union xfs_btree_ptr     *ptrp;
1540
1541                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1542                --lev;
1543                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1544                if (error)
1545                        goto error0;
1546
1547                xfs_btree_setbuf(cur, lev, bp);
1548                cur->bc_ptrs[lev] = 1;
1549        }
1550out1:
1551        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1552        *stat = 1;
1553        return 0;
1554
1555out0:
1556        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1557        *stat = 0;
1558        return 0;
1559
1560error0:
1561        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1562        return error;
1563}
1564
1565/*
1566 * Decrement cursor by one record at the level.
1567 * For nonzero levels the leaf-ward information is untouched.
1568 */
1569int                                             /* error */
1570xfs_btree_decrement(
1571        struct xfs_btree_cur    *cur,
1572        int                     level,
1573        int                     *stat)          /* success/failure */
1574{
1575        struct xfs_btree_block  *block;
1576        xfs_buf_t               *bp;
1577        int                     error;          /* error return value */
1578        int                     lev;
1579        union xfs_btree_ptr     ptr;
1580
1581        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1582        XFS_BTREE_TRACE_ARGI(cur, level);
1583
1584        ASSERT(level < cur->bc_nlevels);
1585
1586        /* Read-ahead to the left at this level. */
1587        xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1588
1589        /* We're done if we remain in the block after the decrement. */
1590        if (--cur->bc_ptrs[level] > 0)
1591                goto out1;
1592
1593        /* Get a pointer to the btree block. */
1594        block = xfs_btree_get_block(cur, level, &bp);
1595
1596#ifdef DEBUG
1597        error = xfs_btree_check_block(cur, block, level, bp);
1598        if (error)
1599                goto error0;
1600#endif
1601
1602        /* Fail if we just went off the left edge of the tree. */
1603        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1604        if (xfs_btree_ptr_is_null(cur, &ptr))
1605                goto out0;
1606
1607        XFS_BTREE_STATS_INC(cur, decrement);
1608
1609        /*
1610         * March up the tree decrementing pointers.
1611         * Stop when we don't go off the left edge of a block.
1612         */
1613        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1614                if (--cur->bc_ptrs[lev] > 0)
1615                        break;
1616                /* Read-ahead the left block for the next loop. */
1617                xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1618        }
1619
1620        /*
1621         * If we went off the root then we are seriously confused.
1622         * or the root of the tree is in an inode.
1623         */
1624        if (lev == cur->bc_nlevels) {
1625                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1626                        goto out0;
1627                ASSERT(0);
1628                error = -EFSCORRUPTED;
1629                goto error0;
1630        }
1631        ASSERT(lev < cur->bc_nlevels);
1632
1633        /*
1634         * Now walk back down the tree, fixing up the cursor's buffer
1635         * pointers and key numbers.
1636         */
1637        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1638                union xfs_btree_ptr     *ptrp;
1639
1640                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1641                --lev;
1642                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1643                if (error)
1644                        goto error0;
1645                xfs_btree_setbuf(cur, lev, bp);
1646                cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1647        }
1648out1:
1649        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1650        *stat = 1;
1651        return 0;
1652
1653out0:
1654        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1655        *stat = 0;
1656        return 0;
1657
1658error0:
1659        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1660        return error;
1661}
1662
1663STATIC int
1664xfs_btree_lookup_get_block(
1665        struct xfs_btree_cur    *cur,   /* btree cursor */
1666        int                     level,  /* level in the btree */
1667        union xfs_btree_ptr     *pp,    /* ptr to btree block */
1668        struct xfs_btree_block  **blkp) /* return btree block */
1669{
1670        struct xfs_buf          *bp;    /* buffer pointer for btree block */
1671        int                     error = 0;
1672
1673        /* special case the root block if in an inode */
1674        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1675            (level == cur->bc_nlevels - 1)) {
1676                *blkp = xfs_btree_get_iroot(cur);
1677                return 0;
1678        }
1679
1680        /*
1681         * If the old buffer at this level for the disk address we are
1682         * looking for re-use it.
1683         *
1684         * Otherwise throw it away and get a new one.
1685         */
1686        bp = cur->bc_bufs[level];
1687        if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1688                *blkp = XFS_BUF_TO_BLOCK(bp);
1689                return 0;
1690        }
1691
1692        error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1693        if (error)
1694                return error;
1695
1696        xfs_btree_setbuf(cur, level, bp);
1697        return 0;
1698}
1699
1700/*
1701 * Get current search key.  For level 0 we don't actually have a key
1702 * structure so we make one up from the record.  For all other levels
1703 * we just return the right key.
1704 */
1705STATIC union xfs_btree_key *
1706xfs_lookup_get_search_key(
1707        struct xfs_btree_cur    *cur,
1708        int                     level,
1709        int                     keyno,
1710        struct xfs_btree_block  *block,
1711        union xfs_btree_key     *kp)
1712{
1713        if (level == 0) {
1714                cur->bc_ops->init_key_from_rec(kp,
1715                                xfs_btree_rec_addr(cur, keyno, block));
1716                return kp;
1717        }
1718
1719        return xfs_btree_key_addr(cur, keyno, block);
1720}
1721
1722/*
1723 * Lookup the record.  The cursor is made to point to it, based on dir.
1724 * stat is set to 0 if can't find any such record, 1 for success.
1725 */
1726int                                     /* error */
1727xfs_btree_lookup(
1728        struct xfs_btree_cur    *cur,   /* btree cursor */
1729        xfs_lookup_t            dir,    /* <=, ==, or >= */
1730        int                     *stat)  /* success/failure */
1731{
1732        struct xfs_btree_block  *block; /* current btree block */
1733        __int64_t               diff;   /* difference for the current key */
1734        int                     error;  /* error return value */
1735        int                     keyno;  /* current key number */
1736        int                     level;  /* level in the btree */
1737        union xfs_btree_ptr     *pp;    /* ptr to btree block */
1738        union xfs_btree_ptr     ptr;    /* ptr to btree block */
1739
1740        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1741        XFS_BTREE_TRACE_ARGI(cur, dir);
1742
1743        XFS_BTREE_STATS_INC(cur, lookup);
1744
1745        block = NULL;
1746        keyno = 0;
1747
1748        /* initialise start pointer from cursor */
1749        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1750        pp = &ptr;
1751
1752        /*
1753         * Iterate over each level in the btree, starting at the root.
1754         * For each level above the leaves, find the key we need, based
1755         * on the lookup record, then follow the corresponding block
1756         * pointer down to the next level.
1757         */
1758        for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1759                /* Get the block we need to do the lookup on. */
1760                error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1761                if (error)
1762                        goto error0;
1763
1764                if (diff == 0) {
1765                        /*
1766                         * If we already had a key match at a higher level, we
1767                         * know we need to use the first entry in this block.
1768                         */
1769                        keyno = 1;
1770                } else {
1771                        /* Otherwise search this block. Do a binary search. */
1772
1773                        int     high;   /* high entry number */
1774                        int     low;    /* low entry number */
1775
1776                        /* Set low and high entry numbers, 1-based. */
1777                        low = 1;
1778                        high = xfs_btree_get_numrecs(block);
1779                        if (!high) {
1780                                /* Block is empty, must be an empty leaf. */
1781                                ASSERT(level == 0 && cur->bc_nlevels == 1);
1782
1783                                cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1784                                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1785                                *stat = 0;
1786                                return 0;
1787                        }
1788
1789                        /* Binary search the block. */
1790                        while (low <= high) {
1791                                union xfs_btree_key     key;
1792                                union xfs_btree_key     *kp;
1793
1794                                XFS_BTREE_STATS_INC(cur, compare);
1795
1796                                /* keyno is average of low and high. */
1797                                keyno = (low + high) >> 1;
1798
1799                                /* Get current search key */
1800                                kp = xfs_lookup_get_search_key(cur, level,
1801                                                keyno, block, &key);
1802
1803                                /*
1804                                 * Compute difference to get next direction:
1805                                 *  - less than, move right
1806                                 *  - greater than, move left
1807                                 *  - equal, we're done
1808                                 */
1809                                diff = cur->bc_ops->key_diff(cur, kp);
1810                                if (diff < 0)
1811                                        low = keyno + 1;
1812                                else if (diff > 0)
1813                                        high = keyno - 1;
1814                                else
1815                                        break;
1816                        }
1817                }
1818
1819                /*
1820                 * If there are more levels, set up for the next level
1821                 * by getting the block number and filling in the cursor.
1822                 */
1823                if (level > 0) {
1824                        /*
1825                         * If we moved left, need the previous key number,
1826                         * unless there isn't one.
1827                         */
1828                        if (diff > 0 && --keyno < 1)
1829                                keyno = 1;
1830                        pp = xfs_btree_ptr_addr(cur, keyno, block);
1831
1832#ifdef DEBUG
1833                        error = xfs_btree_check_ptr(cur, pp, 0, level);
1834                        if (error)
1835                                goto error0;
1836#endif
1837                        cur->bc_ptrs[level] = keyno;
1838                }
1839        }
1840
1841        /* Done with the search. See if we need to adjust the results. */
1842        if (dir != XFS_LOOKUP_LE && diff < 0) {
1843                keyno++;
1844                /*
1845                 * If ge search and we went off the end of the block, but it's
1846                 * not the last block, we're in the wrong block.
1847                 */
1848                xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1849                if (dir == XFS_LOOKUP_GE &&
1850                    keyno > xfs_btree_get_numrecs(block) &&
1851                    !xfs_btree_ptr_is_null(cur, &ptr)) {
1852                        int     i;
1853
1854                        cur->bc_ptrs[0] = keyno;
1855                        error = xfs_btree_increment(cur, 0, &i);
1856                        if (error)
1857                                goto error0;
1858                        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1859                        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1860                        *stat = 1;
1861                        return 0;
1862                }
1863        } else if (dir == XFS_LOOKUP_LE && diff > 0)
1864                keyno--;
1865        cur->bc_ptrs[0] = keyno;
1866
1867        /* Return if we succeeded or not. */
1868        if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1869                *stat = 0;
1870        else if (dir != XFS_LOOKUP_EQ || diff == 0)
1871                *stat = 1;
1872        else
1873                *stat = 0;
1874        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1875        return 0;
1876
1877error0:
1878        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1879        return error;
1880}
1881
1882/*
1883 * Update keys at all levels from here to the root along the cursor's path.
1884 */
1885STATIC int
1886xfs_btree_updkey(
1887        struct xfs_btree_cur    *cur,
1888        union xfs_btree_key     *keyp,
1889        int                     level)
1890{
1891        struct xfs_btree_block  *block;
1892        struct xfs_buf          *bp;
1893        union xfs_btree_key     *kp;
1894        int                     ptr;
1895
1896        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1897        XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
1898
1899        ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
1900
1901        /*
1902         * Go up the tree from this level toward the root.
1903         * At each level, update the key value to the value input.
1904         * Stop when we reach a level where the cursor isn't pointing
1905         * at the first entry in the block.
1906         */
1907        for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1908#ifdef DEBUG
1909                int             error;
1910#endif
1911                block = xfs_btree_get_block(cur, level, &bp);
1912#ifdef DEBUG
1913                error = xfs_btree_check_block(cur, block, level, bp);
1914                if (error) {
1915                        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1916                        return error;
1917                }
1918#endif
1919                ptr = cur->bc_ptrs[level];
1920                kp = xfs_btree_key_addr(cur, ptr, block);
1921                xfs_btree_copy_keys(cur, kp, keyp, 1);
1922                xfs_btree_log_keys(cur, bp, ptr, ptr);
1923        }
1924
1925        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1926        return 0;
1927}
1928
1929/*
1930 * Update the record referred to by cur to the value in the
1931 * given record. This either works (return 0) or gets an
1932 * EFSCORRUPTED error.
1933 */
1934int
1935xfs_btree_update(
1936        struct xfs_btree_cur    *cur,
1937        union xfs_btree_rec     *rec)
1938{
1939        struct xfs_btree_block  *block;
1940        struct xfs_buf          *bp;
1941        int                     error;
1942        int                     ptr;
1943        union xfs_btree_rec     *rp;
1944
1945        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1946        XFS_BTREE_TRACE_ARGR(cur, rec);
1947
1948        /* Pick up the current block. */
1949        block = xfs_btree_get_block(cur, 0, &bp);
1950
1951#ifdef DEBUG
1952        error = xfs_btree_check_block(cur, block, 0, bp);
1953        if (error)
1954                goto error0;
1955#endif
1956        /* Get the address of the rec to be updated. */
1957        ptr = cur->bc_ptrs[0];
1958        rp = xfs_btree_rec_addr(cur, ptr, block);
1959
1960        /* Fill in the new contents and log them. */
1961        xfs_btree_copy_recs(cur, rp, rec, 1);
1962        xfs_btree_log_recs(cur, bp, ptr, ptr);
1963
1964        /*
1965         * If we are tracking the last record in the tree and
1966         * we are at the far right edge of the tree, update it.
1967         */
1968        if (xfs_btree_is_lastrec(cur, block, 0)) {
1969                cur->bc_ops->update_lastrec(cur, block, rec,
1970                                            ptr, LASTREC_UPDATE);
1971        }
1972
1973        /* Updating first rec in leaf. Pass new key value up to our parent. */
1974        if (ptr == 1) {
1975                union xfs_btree_key     key;
1976
1977                cur->bc_ops->init_key_from_rec(&key, rec);
1978                error = xfs_btree_updkey(cur, &key, 1);
1979                if (error)
1980                        goto error0;
1981        }
1982
1983        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1984        return 0;
1985
1986error0:
1987        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1988        return error;
1989}
1990
1991/*
1992 * Move 1 record left from cur/level if possible.
1993 * Update cur to reflect the new path.
1994 */
1995STATIC int                                      /* error */
1996xfs_btree_lshift(
1997        struct xfs_btree_cur    *cur,
1998        int                     level,
1999        int                     *stat)          /* success/failure */
2000{
2001        union xfs_btree_key     key;            /* btree key */
2002        struct xfs_buf          *lbp;           /* left buffer pointer */
2003        struct xfs_btree_block  *left;          /* left btree block */
2004        int                     lrecs;          /* left record count */
2005        struct xfs_buf          *rbp;           /* right buffer pointer */
2006        struct xfs_btree_block  *right;         /* right btree block */
2007        int                     rrecs;          /* right record count */
2008        union xfs_btree_ptr     lptr;           /* left btree pointer */
2009        union xfs_btree_key     *rkp = NULL;    /* right btree key */
2010        union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
2011        union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
2012        int                     error;          /* error return value */
2013
2014        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2015        XFS_BTREE_TRACE_ARGI(cur, level);
2016
2017        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2018            level == cur->bc_nlevels - 1)
2019                goto out0;
2020
2021        /* Set up variables for this block as "right". */
2022        right = xfs_btree_get_block(cur, level, &rbp);
2023
2024#ifdef DEBUG
2025        error = xfs_btree_check_block(cur, right, level, rbp);
2026        if (error)
2027                goto error0;
2028#endif
2029
2030        /* If we've got no left sibling then we can't shift an entry left. */
2031        xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2032        if (xfs_btree_ptr_is_null(cur, &lptr))
2033                goto out0;
2034
2035        /*
2036         * If the cursor entry is the one that would be moved, don't
2037         * do it... it's too complicated.
2038         */
2039        if (cur->bc_ptrs[level] <= 1)
2040                goto out0;
2041
2042        /* Set up the left neighbor as "left". */
2043        error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2044        if (error)
2045                goto error0;
2046
2047        /* If it's full, it can't take another entry. */
2048        lrecs = xfs_btree_get_numrecs(left);
2049        if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2050                goto out0;
2051
2052        rrecs = xfs_btree_get_numrecs(right);
2053
2054        /*
2055         * We add one entry to the left side and remove one for the right side.
2056         * Account for it here, the changes will be updated on disk and logged
2057         * later.
2058         */
2059        lrecs++;
2060        rrecs--;
2061
2062        XFS_BTREE_STATS_INC(cur, lshift);
2063        XFS_BTREE_STATS_ADD(cur, moves, 1);
2064
2065        /*
2066         * If non-leaf, copy a key and a ptr to the left block.
2067         * Log the changes to the left block.
2068         */
2069        if (level > 0) {
2070                /* It's a non-leaf.  Move keys and pointers. */
2071                union xfs_btree_key     *lkp;   /* left btree key */
2072                union xfs_btree_ptr     *lpp;   /* left address pointer */
2073
2074                lkp = xfs_btree_key_addr(cur, lrecs, left);
2075                rkp = xfs_btree_key_addr(cur, 1, right);
2076
2077                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2078                rpp = xfs_btree_ptr_addr(cur, 1, right);
2079#ifdef DEBUG
2080                error = xfs_btree_check_ptr(cur, rpp, 0, level);
2081                if (error)
2082                        goto error0;
2083#endif
2084                xfs_btree_copy_keys(cur, lkp, rkp, 1);
2085                xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2086
2087                xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2088                xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2089
2090                ASSERT(cur->bc_ops->keys_inorder(cur,
2091                        xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2092        } else {
2093                /* It's a leaf.  Move records.  */
2094                union xfs_btree_rec     *lrp;   /* left record pointer */
2095
2096                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2097                rrp = xfs_btree_rec_addr(cur, 1, right);
2098
2099                xfs_btree_copy_recs(cur, lrp, rrp, 1);
2100                xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2101
2102                ASSERT(cur->bc_ops->recs_inorder(cur,
2103                        xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2104        }
2105
2106        xfs_btree_set_numrecs(left, lrecs);
2107        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2108
2109        xfs_btree_set_numrecs(right, rrecs);
2110        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2111
2112        /*
2113         * Slide the contents of right down one entry.
2114         */
2115        XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2116        if (level > 0) {
2117                /* It's a nonleaf. operate on keys and ptrs */
2118#ifdef DEBUG
2119                int                     i;              /* loop index */
2120
2121                for (i = 0; i < rrecs; i++) {
2122                        error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
2123                        if (error)
2124                                goto error0;
2125                }
2126#endif
2127                xfs_btree_shift_keys(cur,
2128                                xfs_btree_key_addr(cur, 2, right),
2129                                -1, rrecs);
2130                xfs_btree_shift_ptrs(cur,
2131                                xfs_btree_ptr_addr(cur, 2, right),
2132                                -1, rrecs);
2133
2134                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2135                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2136        } else {
2137                /* It's a leaf. operate on records */
2138                xfs_btree_shift_recs(cur,
2139                        xfs_btree_rec_addr(cur, 2, right),
2140                        -1, rrecs);
2141                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2142
2143                /*
2144                 * If it's the first record in the block, we'll need a key
2145                 * structure to pass up to the next level (updkey).
2146                 */
2147                cur->bc_ops->init_key_from_rec(&key,
2148                        xfs_btree_rec_addr(cur, 1, right));
2149                rkp = &key;
2150        }
2151
2152        /* Update the parent key values of right. */
2153        error = xfs_btree_updkey(cur, rkp, level + 1);
2154        if (error)
2155                goto error0;
2156
2157        /* Slide the cursor value left one. */
2158        cur->bc_ptrs[level]--;
2159
2160        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2161        *stat = 1;
2162        return 0;
2163
2164out0:
2165        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2166        *stat = 0;
2167        return 0;
2168
2169error0:
2170        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2171        return error;
2172}
2173
2174/*
2175 * Move 1 record right from cur/level if possible.
2176 * Update cur to reflect the new path.
2177 */
2178STATIC int                                      /* error */
2179xfs_btree_rshift(
2180        struct xfs_btree_cur    *cur,
2181        int                     level,
2182        int                     *stat)          /* success/failure */
2183{
2184        union xfs_btree_key     key;            /* btree key */
2185        struct xfs_buf          *lbp;           /* left buffer pointer */
2186        struct xfs_btree_block  *left;          /* left btree block */
2187        struct xfs_buf          *rbp;           /* right buffer pointer */
2188        struct xfs_btree_block  *right;         /* right btree block */
2189        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2190        union xfs_btree_ptr     rptr;           /* right block pointer */
2191        union xfs_btree_key     *rkp;           /* right btree key */
2192        int                     rrecs;          /* right record count */
2193        int                     lrecs;          /* left record count */
2194        int                     error;          /* error return value */
2195        int                     i;              /* loop counter */
2196
2197        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2198        XFS_BTREE_TRACE_ARGI(cur, level);
2199
2200        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2201            (level == cur->bc_nlevels - 1))
2202                goto out0;
2203
2204        /* Set up variables for this block as "left". */
2205        left = xfs_btree_get_block(cur, level, &lbp);
2206
2207#ifdef DEBUG
2208        error = xfs_btree_check_block(cur, left, level, lbp);
2209        if (error)
2210                goto error0;
2211#endif
2212
2213        /* If we've got no right sibling then we can't shift an entry right. */
2214        xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2215        if (xfs_btree_ptr_is_null(cur, &rptr))
2216                goto out0;
2217
2218        /*
2219         * If the cursor entry is the one that would be moved, don't
2220         * do it... it's too complicated.
2221         */
2222        lrecs = xfs_btree_get_numrecs(left);
2223        if (cur->bc_ptrs[level] >= lrecs)
2224                goto out0;
2225
2226        /* Set up the right neighbor as "right". */
2227        error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2228        if (error)
2229                goto error0;
2230
2231        /* If it's full, it can't take another entry. */
2232        rrecs = xfs_btree_get_numrecs(right);
2233        if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2234                goto out0;
2235
2236        XFS_BTREE_STATS_INC(cur, rshift);
2237        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2238
2239        /*
2240         * Make a hole at the start of the right neighbor block, then
2241         * copy the last left block entry to the hole.
2242         */
2243        if (level > 0) {
2244                /* It's a nonleaf. make a hole in the keys and ptrs */
2245                union xfs_btree_key     *lkp;
2246                union xfs_btree_ptr     *lpp;
2247                union xfs_btree_ptr     *rpp;
2248
2249                lkp = xfs_btree_key_addr(cur, lrecs, left);
2250                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2251                rkp = xfs_btree_key_addr(cur, 1, right);
2252                rpp = xfs_btree_ptr_addr(cur, 1, right);
2253
2254#ifdef DEBUG
2255                for (i = rrecs - 1; i >= 0; i--) {
2256                        error = xfs_btree_check_ptr(cur, rpp, i, level);
2257                        if (error)
2258                                goto error0;
2259                }
2260#endif
2261
2262                xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2263                xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2264
2265#ifdef DEBUG
2266                error = xfs_btree_check_ptr(cur, lpp, 0, level);
2267                if (error)
2268                        goto error0;
2269#endif
2270
2271                /* Now put the new data in, and log it. */
2272                xfs_btree_copy_keys(cur, rkp, lkp, 1);
2273                xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2274
2275                xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2276                xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2277
2278                ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2279                        xfs_btree_key_addr(cur, 2, right)));
2280        } else {
2281                /* It's a leaf. make a hole in the records */
2282                union xfs_btree_rec     *lrp;
2283                union xfs_btree_rec     *rrp;
2284
2285                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2286                rrp = xfs_btree_rec_addr(cur, 1, right);
2287
2288                xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2289
2290                /* Now put the new data in, and log it. */
2291                xfs_btree_copy_recs(cur, rrp, lrp, 1);
2292                xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2293
2294                cur->bc_ops->init_key_from_rec(&key, rrp);
2295                rkp = &key;
2296
2297                ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
2298                        xfs_btree_rec_addr(cur, 2, right)));
2299        }
2300
2301        /*
2302         * Decrement and log left's numrecs, bump and log right's numrecs.
2303         */
2304        xfs_btree_set_numrecs(left, --lrecs);
2305        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2306
2307        xfs_btree_set_numrecs(right, ++rrecs);
2308        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2309
2310        /*
2311         * Using a temporary cursor, update the parent key values of the
2312         * block on the right.
2313         */
2314        error = xfs_btree_dup_cursor(cur, &tcur);
2315        if (error)
2316                goto error0;
2317        i = xfs_btree_lastrec(tcur, level);
2318        XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
2319
2320        error = xfs_btree_increment(tcur, level, &i);
2321        if (error)
2322                goto error1;
2323
2324        error = xfs_btree_updkey(tcur, rkp, level + 1);
2325        if (error)
2326                goto error1;
2327
2328        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2329
2330        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2331        *stat = 1;
2332        return 0;
2333
2334out0:
2335        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2336        *stat = 0;
2337        return 0;
2338
2339error0:
2340        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2341        return error;
2342
2343error1:
2344        XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2345        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2346        return error;
2347}
2348
2349/*
2350 * Split cur/level block in half.
2351 * Return new block number and the key to its first
2352 * record (to be inserted into parent).
2353 */
2354STATIC int                                      /* error */
2355__xfs_btree_split(
2356        struct xfs_btree_cur    *cur,
2357        int                     level,
2358        union xfs_btree_ptr     *ptrp,
2359        union xfs_btree_key     *key,
2360        struct xfs_btree_cur    **curp,
2361        int                     *stat)          /* success/failure */
2362{
2363        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2364        struct xfs_buf          *lbp;           /* left buffer pointer */
2365        struct xfs_btree_block  *left;          /* left btree block */
2366        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2367        struct xfs_buf          *rbp;           /* right buffer pointer */
2368        struct xfs_btree_block  *right;         /* right btree block */
2369        union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2370        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2371        struct xfs_btree_block  *rrblock;       /* right-right btree block */
2372        int                     lrecs;
2373        int                     rrecs;
2374        int                     src_index;
2375        int                     error;          /* error return value */
2376#ifdef DEBUG
2377        int                     i;
2378#endif
2379
2380        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2381        XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2382
2383        XFS_BTREE_STATS_INC(cur, split);
2384
2385        /* Set up left block (current one). */
2386        left = xfs_btree_get_block(cur, level, &lbp);
2387
2388#ifdef DEBUG
2389        error = xfs_btree_check_block(cur, left, level, lbp);
2390        if (error)
2391                goto error0;
2392#endif
2393
2394        xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2395
2396        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2397        error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2398        if (error)
2399                goto error0;
2400        if (*stat == 0)
2401                goto out0;
2402        XFS_BTREE_STATS_INC(cur, alloc);
2403
2404        /* Set up the new block as "right". */
2405        error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2406        if (error)
2407                goto error0;
2408
2409        /* Fill in the btree header for the new right block. */
2410        xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2411
2412        /*
2413         * Split the entries between the old and the new block evenly.
2414         * Make sure that if there's an odd number of entries now, that
2415         * each new block will have the same number of entries.
2416         */
2417        lrecs = xfs_btree_get_numrecs(left);
2418        rrecs = lrecs / 2;
2419        if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2420                rrecs++;
2421        src_index = (lrecs - rrecs + 1);
2422
2423        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2424
2425        /*
2426         * Copy btree block entries from the left block over to the
2427         * new block, the right. Update the right block and log the
2428         * changes.
2429         */
2430        if (level > 0) {
2431                /* It's a non-leaf.  Move keys and pointers. */
2432                union xfs_btree_key     *lkp;   /* left btree key */
2433                union xfs_btree_ptr     *lpp;   /* left address pointer */
2434                union xfs_btree_key     *rkp;   /* right btree key */
2435                union xfs_btree_ptr     *rpp;   /* right address pointer */
2436
2437                lkp = xfs_btree_key_addr(cur, src_index, left);
2438                lpp = xfs_btree_ptr_addr(cur, src_index, left);
2439                rkp = xfs_btree_key_addr(cur, 1, right);
2440                rpp = xfs_btree_ptr_addr(cur, 1, right);
2441
2442#ifdef DEBUG
2443                for (i = src_index; i < rrecs; i++) {
2444                        error = xfs_btree_check_ptr(cur, lpp, i, level);
2445                        if (error)
2446                                goto error0;
2447                }
2448#endif
2449
2450                xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2451                xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2452
2453                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2454                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2455
2456                /* Grab the keys to the entries moved to the right block */
2457                xfs_btree_copy_keys(cur, key, rkp, 1);
2458        } else {
2459                /* It's a leaf.  Move records.  */
2460                union xfs_btree_rec     *lrp;   /* left record pointer */
2461                union xfs_btree_rec     *rrp;   /* right record pointer */
2462
2463                lrp = xfs_btree_rec_addr(cur, src_index, left);
2464                rrp = xfs_btree_rec_addr(cur, 1, right);
2465
2466                xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2467                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2468
2469                cur->bc_ops->init_key_from_rec(key,
2470                        xfs_btree_rec_addr(cur, 1, right));
2471        }
2472
2473
2474        /*
2475         * Find the left block number by looking in the buffer.
2476         * Adjust numrecs, sibling pointers.
2477         */
2478        xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2479        xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2480        xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2481        xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2482
2483        lrecs -= rrecs;
2484        xfs_btree_set_numrecs(left, lrecs);
2485        xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2486
2487        xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2488        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2489
2490        /*
2491         * If there's a block to the new block's right, make that block
2492         * point back to right instead of to left.
2493         */
2494        if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2495                error = xfs_btree_read_buf_block(cur, &rrptr,
2496                                                        0, &rrblock, &rrbp);
2497                if (error)
2498                        goto error0;
2499                xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2500                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2501        }
2502        /*
2503         * If the cursor is really in the right block, move it there.
2504         * If it's just pointing past the last entry in left, then we'll
2505         * insert there, so don't change anything in that case.
2506         */
2507        if (cur->bc_ptrs[level] > lrecs + 1) {
2508                xfs_btree_setbuf(cur, level, rbp);
2509                cur->bc_ptrs[level] -= lrecs;
2510        }
2511        /*
2512         * If there are more levels, we'll need another cursor which refers
2513         * the right block, no matter where this cursor was.
2514         */
2515        if (level + 1 < cur->bc_nlevels) {
2516                error = xfs_btree_dup_cursor(cur, curp);
2517                if (error)
2518                        goto error0;
2519                (*curp)->bc_ptrs[level + 1]++;
2520        }
2521        *ptrp = rptr;
2522        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2523        *stat = 1;
2524        return 0;
2525out0:
2526        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2527        *stat = 0;
2528        return 0;
2529
2530error0:
2531        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2532        return error;
2533}
2534
2535struct xfs_btree_split_args {
2536        struct xfs_btree_cur    *cur;
2537        int                     level;
2538        union xfs_btree_ptr     *ptrp;
2539        union xfs_btree_key     *key;
2540        struct xfs_btree_cur    **curp;
2541        int                     *stat;          /* success/failure */
2542        int                     result;
2543        bool                    kswapd; /* allocation in kswapd context */
2544        struct completion       *done;
2545        struct work_struct      work;
2546};
2547
2548/*
2549 * Stack switching interfaces for allocation
2550 */
2551static void
2552xfs_btree_split_worker(
2553        struct work_struct      *work)
2554{
2555        struct xfs_btree_split_args     *args = container_of(work,
2556                                                struct xfs_btree_split_args, work);
2557        unsigned long           pflags;
2558        unsigned long           new_pflags = PF_FSTRANS;
2559
2560        /*
2561         * we are in a transaction context here, but may also be doing work
2562         * in kswapd context, and hence we may need to inherit that state
2563         * temporarily to ensure that we don't block waiting for memory reclaim
2564         * in any way.
2565         */
2566        if (args->kswapd)
2567                new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2568
2569        current_set_flags_nested(&pflags, new_pflags);
2570
2571        args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2572                                         args->key, args->curp, args->stat);
2573        complete(args->done);
2574
2575        current_restore_flags_nested(&pflags, new_pflags);
2576}
2577
2578/*
2579 * BMBT split requests often come in with little stack to work on. Push
2580 * them off to a worker thread so there is lots of stack to use. For the other
2581 * btree types, just call directly to avoid the context switch overhead here.
2582 */
2583STATIC int                                      /* error */
2584xfs_btree_split(
2585        struct xfs_btree_cur    *cur,
2586        int                     level,
2587        union xfs_btree_ptr     *ptrp,
2588        union xfs_btree_key     *key,
2589        struct xfs_btree_cur    **curp,
2590        int                     *stat)          /* success/failure */
2591{
2592        struct xfs_btree_split_args     args;
2593        DECLARE_COMPLETION_ONSTACK(done);
2594
2595        if (cur->bc_btnum != XFS_BTNUM_BMAP)
2596                return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2597
2598        args.cur = cur;
2599        args.level = level;
2600        args.ptrp = ptrp;
2601        args.key = key;
2602        args.curp = curp;
2603        args.stat = stat;
2604        args.done = &done;
2605        args.kswapd = current_is_kswapd();
2606        INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2607        queue_work(xfs_alloc_wq, &args.work);
2608        wait_for_completion(&done);
2609        destroy_work_on_stack(&args.work);
2610        return args.result;
2611}
2612
2613
2614/*
2615 * Copy the old inode root contents into a real block and make the
2616 * broot point to it.
2617 */
2618int                                             /* error */
2619xfs_btree_new_iroot(
2620        struct xfs_btree_cur    *cur,           /* btree cursor */
2621        int                     *logflags,      /* logging flags for inode */
2622        int                     *stat)          /* return status - 0 fail */
2623{
2624        struct xfs_buf          *cbp;           /* buffer for cblock */
2625        struct xfs_btree_block  *block;         /* btree block */
2626        struct xfs_btree_block  *cblock;        /* child btree block */
2627        union xfs_btree_key     *ckp;           /* child key pointer */
2628        union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2629        union xfs_btree_key     *kp;            /* pointer to btree key */
2630        union xfs_btree_ptr     *pp;            /* pointer to block addr */
2631        union xfs_btree_ptr     nptr;           /* new block addr */
2632        int                     level;          /* btree level */
2633        int                     error;          /* error return code */
2634#ifdef DEBUG
2635        int                     i;              /* loop counter */
2636#endif
2637
2638        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2639        XFS_BTREE_STATS_INC(cur, newroot);
2640
2641        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2642
2643        level = cur->bc_nlevels - 1;
2644
2645        block = xfs_btree_get_iroot(cur);
2646        pp = xfs_btree_ptr_addr(cur, 1, block);
2647
2648        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2649        error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2650        if (error)
2651                goto error0;
2652        if (*stat == 0) {
2653                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2654                return 0;
2655        }
2656        XFS_BTREE_STATS_INC(cur, alloc);
2657
2658        /* Copy the root into a real block. */
2659        error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2660        if (error)
2661                goto error0;
2662
2663        /*
2664         * we can't just memcpy() the root in for CRC enabled btree blocks.
2665         * In that case have to also ensure the blkno remains correct
2666         */
2667        memcpy(cblock, block, xfs_btree_block_len(cur));
2668        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2669                if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2670                        cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2671                else
2672                        cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2673        }
2674
2675        be16_add_cpu(&block->bb_level, 1);
2676        xfs_btree_set_numrecs(block, 1);
2677        cur->bc_nlevels++;
2678        cur->bc_ptrs[level + 1] = 1;
2679
2680        kp = xfs_btree_key_addr(cur, 1, block);
2681        ckp = xfs_btree_key_addr(cur, 1, cblock);
2682        xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2683
2684        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2685#ifdef DEBUG
2686        for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2687                error = xfs_btree_check_ptr(cur, pp, i, level);
2688                if (error)
2689                        goto error0;
2690        }
2691#endif
2692        xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2693
2694#ifdef DEBUG
2695        error = xfs_btree_check_ptr(cur, &nptr, 0, level);
2696        if (error)
2697                goto error0;
2698#endif
2699        xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2700
2701        xfs_iroot_realloc(cur->bc_private.b.ip,
2702                          1 - xfs_btree_get_numrecs(cblock),
2703                          cur->bc_private.b.whichfork);
2704
2705        xfs_btree_setbuf(cur, level, cbp);
2706
2707        /*
2708         * Do all this logging at the end so that
2709         * the root is at the right level.
2710         */
2711        xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2712        xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2713        xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2714
2715        *logflags |=
2716                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
2717        *stat = 1;
2718        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2719        return 0;
2720error0:
2721        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2722        return error;
2723}
2724
2725/*
2726 * Allocate a new root block, fill it in.
2727 */
2728STATIC int                              /* error */
2729xfs_btree_new_root(
2730        struct xfs_btree_cur    *cur,   /* btree cursor */
2731        int                     *stat)  /* success/failure */
2732{
2733        struct xfs_btree_block  *block; /* one half of the old root block */
2734        struct xfs_buf          *bp;    /* buffer containing block */
2735        int                     error;  /* error return value */
2736        struct xfs_buf          *lbp;   /* left buffer pointer */
2737        struct xfs_btree_block  *left;  /* left btree block */
2738        struct xfs_buf          *nbp;   /* new (root) buffer */
2739        struct xfs_btree_block  *new;   /* new (root) btree block */
2740        int                     nptr;   /* new value for key index, 1 or 2 */
2741        struct xfs_buf          *rbp;   /* right buffer pointer */
2742        struct xfs_btree_block  *right; /* right btree block */
2743        union xfs_btree_ptr     rptr;
2744        union xfs_btree_ptr     lptr;
2745
2746        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2747        XFS_BTREE_STATS_INC(cur, newroot);
2748
2749        /* initialise our start point from the cursor */
2750        cur->bc_ops->init_ptr_from_cur(cur, &rptr);
2751
2752        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2753        error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
2754        if (error)
2755                goto error0;
2756        if (*stat == 0)
2757                goto out0;
2758        XFS_BTREE_STATS_INC(cur, alloc);
2759
2760        /* Set up the new block. */
2761        error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
2762        if (error)
2763                goto error0;
2764
2765        /* Set the root in the holding structure  increasing the level by 1. */
2766        cur->bc_ops->set_root(cur, &lptr, 1);
2767
2768        /*
2769         * At the previous root level there are now two blocks: the old root,
2770         * and the new block generated when it was split.  We don't know which
2771         * one the cursor is pointing at, so we set up variables "left" and
2772         * "right" for each case.
2773         */
2774        block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
2775
2776#ifdef DEBUG
2777        error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
2778        if (error)
2779                goto error0;
2780#endif
2781
2782        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
2783        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
2784                /* Our block is left, pick up the right block. */
2785                lbp = bp;
2786                xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2787                left = block;
2788                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2789                if (error)
2790                        goto error0;
2791                bp = rbp;
2792                nptr = 1;
2793        } else {
2794                /* Our block is right, pick up the left block. */
2795                rbp = bp;
2796                xfs_btree_buf_to_ptr(cur, rbp, &rptr);
2797                right = block;
2798                xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2799                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2800                if (error)
2801                        goto error0;
2802                bp = lbp;
2803                nptr = 2;
2804        }
2805        /* Fill in the new block's btree header and log it. */
2806        xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
2807        xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2808        ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2809                        !xfs_btree_ptr_is_null(cur, &rptr));
2810
2811        /* Fill in the key data in the new root. */
2812        if (xfs_btree_get_level(left) > 0) {
2813                xfs_btree_copy_keys(cur,
2814                                xfs_btree_key_addr(cur, 1, new),
2815                                xfs_btree_key_addr(cur, 1, left), 1);
2816                xfs_btree_copy_keys(cur,
2817                                xfs_btree_key_addr(cur, 2, new),
2818                                xfs_btree_key_addr(cur, 1, right), 1);
2819        } else {
2820                cur->bc_ops->init_key_from_rec(
2821                                xfs_btree_key_addr(cur, 1, new),
2822                                xfs_btree_rec_addr(cur, 1, left));
2823                cur->bc_ops->init_key_from_rec(
2824                                xfs_btree_key_addr(cur, 2, new),
2825                                xfs_btree_rec_addr(cur, 1, right));
2826        }
2827        xfs_btree_log_keys(cur, nbp, 1, 2);
2828
2829        /* Fill in the pointer data in the new root. */
2830        xfs_btree_copy_ptrs(cur,
2831                xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
2832        xfs_btree_copy_ptrs(cur,
2833                xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
2834        xfs_btree_log_ptrs(cur, nbp, 1, 2);
2835
2836        /* Fix up the cursor. */
2837        xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
2838        cur->bc_ptrs[cur->bc_nlevels] = nptr;
2839        cur->bc_nlevels++;
2840        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2841        *stat = 1;
2842        return 0;
2843error0:
2844        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2845        return error;
2846out0:
2847        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2848        *stat = 0;
2849        return 0;
2850}
2851
2852STATIC int
2853xfs_btree_make_block_unfull(
2854        struct xfs_btree_cur    *cur,   /* btree cursor */
2855        int                     level,  /* btree level */
2856        int                     numrecs,/* # of recs in block */
2857        int                     *oindex,/* old tree index */
2858        int                     *index, /* new tree index */
2859        union xfs_btree_ptr     *nptr,  /* new btree ptr */
2860        struct xfs_btree_cur    **ncur, /* new btree cursor */
2861        union xfs_btree_rec     *nrec,  /* new record */
2862        int                     *stat)
2863{
2864        union xfs_btree_key     key;    /* new btree key value */
2865        int                     error = 0;
2866
2867        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2868            level == cur->bc_nlevels - 1) {
2869                struct xfs_inode *ip = cur->bc_private.b.ip;
2870
2871                if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2872                        /* A root block that can be made bigger. */
2873                        xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2874                } else {
2875                        /* A root block that needs replacing */
2876                        int     logflags = 0;
2877
2878                        error = xfs_btree_new_iroot(cur, &logflags, stat);
2879                        if (error || *stat == 0)
2880                                return error;
2881
2882                        xfs_trans_log_inode(cur->bc_tp, ip, logflags);
2883                }
2884
2885                return 0;
2886        }
2887
2888        /* First, try shifting an entry to the right neighbor. */
2889        error = xfs_btree_rshift(cur, level, stat);
2890        if (error || *stat)
2891                return error;
2892
2893        /* Next, try shifting an entry to the left neighbor. */
2894        error = xfs_btree_lshift(cur, level, stat);
2895        if (error)
2896                return error;
2897
2898        if (*stat) {
2899                *oindex = *index = cur->bc_ptrs[level];
2900                return 0;
2901        }
2902
2903        /*
2904         * Next, try splitting the current block in half.
2905         *
2906         * If this works we have to re-set our variables because we
2907         * could be in a different block now.
2908         */
2909        error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
2910        if (error || *stat == 0)
2911                return error;
2912
2913
2914        *index = cur->bc_ptrs[level];
2915        cur->bc_ops->init_rec_from_key(&key, nrec);
2916        return 0;
2917}
2918
2919/*
2920 * Insert one record/level.  Return information to the caller
2921 * allowing the next level up to proceed if necessary.
2922 */
2923STATIC int
2924xfs_btree_insrec(
2925        struct xfs_btree_cur    *cur,   /* btree cursor */
2926        int                     level,  /* level to insert record at */
2927        union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
2928        union xfs_btree_rec     *recp,  /* i/o: record data inserted */
2929        struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
2930        int                     *stat)  /* success/failure */
2931{
2932        struct xfs_btree_block  *block; /* btree block */
2933        struct xfs_buf          *bp;    /* buffer for block */
2934        union xfs_btree_key     key;    /* btree key */
2935        union xfs_btree_ptr     nptr;   /* new block ptr */
2936        struct xfs_btree_cur    *ncur;  /* new btree cursor */
2937        union xfs_btree_rec     nrec;   /* new record count */
2938        int                     optr;   /* old key/record index */
2939        int                     ptr;    /* key/record index */
2940        int                     numrecs;/* number of records */
2941        int                     error;  /* error return value */
2942#ifdef DEBUG
2943        int                     i;
2944#endif
2945
2946        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2947        XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
2948
2949        ncur = NULL;
2950
2951        /*
2952         * If we have an external root pointer, and we've made it to the
2953         * root level, allocate a new root block and we're done.
2954         */
2955        if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2956            (level >= cur->bc_nlevels)) {
2957                error = xfs_btree_new_root(cur, stat);
2958                xfs_btree_set_ptr_null(cur, ptrp);
2959
2960                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2961                return error;
2962        }
2963
2964        /* If we're off the left edge, return failure. */
2965        ptr = cur->bc_ptrs[level];
2966        if (ptr == 0) {
2967                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2968                *stat = 0;
2969                return 0;
2970        }
2971
2972        /* Make a key out of the record data to be inserted, and save it. */
2973        cur->bc_ops->init_key_from_rec(&key, recp);
2974
2975        optr = ptr;
2976
2977        XFS_BTREE_STATS_INC(cur, insrec);
2978
2979        /* Get pointers to the btree buffer and block. */
2980        block = xfs_btree_get_block(cur, level, &bp);
2981        numrecs = xfs_btree_get_numrecs(block);
2982
2983#ifdef DEBUG
2984        error = xfs_btree_check_block(cur, block, level, bp);
2985        if (error)
2986                goto error0;
2987
2988        /* Check that the new entry is being inserted in the right place. */
2989        if (ptr <= numrecs) {
2990                if (level == 0) {
2991                        ASSERT(cur->bc_ops->recs_inorder(cur, recp,
2992                                xfs_btree_rec_addr(cur, ptr, block)));
2993                } else {
2994                        ASSERT(cur->bc_ops->keys_inorder(cur, &key,
2995                                xfs_btree_key_addr(cur, ptr, block)));
2996                }
2997        }
2998#endif
2999
3000        /*
3001         * If the block is full, we can't insert the new entry until we
3002         * make the block un-full.
3003         */
3004        xfs_btree_set_ptr_null(cur, &nptr);
3005        if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3006                error = xfs_btree_make_block_unfull(cur, level, numrecs,
3007                                        &optr, &ptr, &nptr, &ncur, &nrec, stat);
3008                if (error || *stat == 0)
3009                        goto error0;
3010        }
3011
3012        /*
3013         * The current block may have changed if the block was
3014         * previously full and we have just made space in it.
3015         */
3016        block = xfs_btree_get_block(cur, level, &bp);
3017        numrecs = xfs_btree_get_numrecs(block);
3018
3019#ifdef DEBUG
3020        error = xfs_btree_check_block(cur, block, level, bp);
3021        if (error)
3022                return error;
3023#endif
3024
3025        /*
3026         * At this point we know there's room for our new entry in the block
3027         * we're pointing at.
3028         */
3029        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3030
3031        if (level > 0) {
3032                /* It's a nonleaf. make a hole in the keys and ptrs */
3033                union xfs_btree_key     *kp;
3034                union xfs_btree_ptr     *pp;
3035
3036                kp = xfs_btree_key_addr(cur, ptr, block);
3037                pp = xfs_btree_ptr_addr(cur, ptr, block);
3038
3039#ifdef DEBUG
3040                for (i = numrecs - ptr; i >= 0; i--) {
3041                        error = xfs_btree_check_ptr(cur, pp, i, level);
3042                        if (error)
3043                                return error;
3044                }
3045#endif
3046
3047                xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3048                xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3049
3050#ifdef DEBUG
3051                error = xfs_btree_check_ptr(cur, ptrp, 0, level);
3052                if (error)
3053                        goto error0;
3054#endif
3055
3056                /* Now put the new data in, bump numrecs and log it. */
3057                xfs_btree_copy_keys(cur, kp, &key, 1);
3058                xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3059                numrecs++;
3060                xfs_btree_set_numrecs(block, numrecs);
3061                xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3062                xfs_btree_log_keys(cur, bp, ptr, numrecs);
3063#ifdef DEBUG
3064                if (ptr < numrecs) {
3065                        ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3066                                xfs_btree_key_addr(cur, ptr + 1, block)));
3067                }
3068#endif
3069        } else {
3070                /* It's a leaf. make a hole in the records */
3071                union xfs_btree_rec             *rp;
3072
3073                rp = xfs_btree_rec_addr(cur, ptr, block);
3074
3075                xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3076
3077                /* Now put the new data in, bump numrecs and log it. */
3078                xfs_btree_copy_recs(cur, rp, recp, 1);
3079                xfs_btree_set_numrecs(block, ++numrecs);
3080                xfs_btree_log_recs(cur, bp, ptr, numrecs);
3081#ifdef DEBUG
3082                if (ptr < numrecs) {
3083                        ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3084                                xfs_btree_rec_addr(cur, ptr + 1, block)));
3085                }
3086#endif
3087        }
3088
3089        /* Log the new number of records in the btree header. */
3090        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3091
3092        /* If we inserted at the start of a block, update the parents' keys. */
3093        if (optr == 1) {
3094                error = xfs_btree_updkey(cur, &key, level + 1);
3095                if (error)
3096                        goto error0;
3097        }
3098
3099        /*
3100         * If we are tracking the last record in the tree and
3101         * we are at the far right edge of the tree, update it.
3102         */
3103        if (xfs_btree_is_lastrec(cur, block, level)) {
3104                cur->bc_ops->update_lastrec(cur, block, recp,
3105                                            ptr, LASTREC_INSREC);
3106        }
3107
3108        /*
3109         * Return the new block number, if any.
3110         * If there is one, give back a record value and a cursor too.
3111         */
3112        *ptrp = nptr;
3113        if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3114                *recp = nrec;
3115                *curp = ncur;
3116        }
3117
3118        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3119        *stat = 1;
3120        return 0;
3121
3122error0:
3123        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3124        return error;
3125}
3126
3127/*
3128 * Insert the record at the point referenced by cur.
3129 *
3130 * A multi-level split of the tree on insert will invalidate the original
3131 * cursor.  All callers of this function should assume that the cursor is
3132 * no longer valid and revalidate it.
3133 */
3134int
3135xfs_btree_insert(
3136        struct xfs_btree_cur    *cur,
3137        int                     *stat)
3138{
3139        int                     error;  /* error return value */
3140        int                     i;      /* result value, 0 for failure */
3141        int                     level;  /* current level number in btree */
3142        union xfs_btree_ptr     nptr;   /* new block number (split result) */
3143        struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3144        struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3145        union xfs_btree_rec     rec;    /* record to insert */
3146
3147        level = 0;
3148        ncur = NULL;
3149        pcur = cur;
3150
3151        xfs_btree_set_ptr_null(cur, &nptr);
3152        cur->bc_ops->init_rec_from_cur(cur, &rec);
3153
3154        /*
3155         * Loop going up the tree, starting at the leaf level.
3156         * Stop when we don't get a split block, that must mean that
3157         * the insert is finished with this level.
3158         */
3159        do {
3160                /*
3161                 * Insert nrec/nptr into this level of the tree.
3162                 * Note if we fail, nptr will be null.
3163                 */
3164                error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
3165                if (error) {
3166                        if (pcur != cur)
3167                                xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3168                        goto error0;
3169                }
3170
3171                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3172                level++;
3173
3174                /*
3175                 * See if the cursor we just used is trash.
3176                 * Can't trash the caller's cursor, but otherwise we should
3177                 * if ncur is a new cursor or we're about to be done.
3178                 */
3179                if (pcur != cur &&
3180                    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3181                        /* Save the state from the cursor before we trash it */
3182                        if (cur->bc_ops->update_cursor)
3183                                cur->bc_ops->update_cursor(pcur, cur);
3184                        cur->bc_nlevels = pcur->bc_nlevels;
3185                        xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3186                }
3187                /* If we got a new cursor, switch to it. */
3188                if (ncur) {
3189                        pcur = ncur;
3190                        ncur = NULL;
3191                }
3192        } while (!xfs_btree_ptr_is_null(cur, &nptr));
3193
3194        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3195        *stat = i;
3196        return 0;
3197error0:
3198        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3199        return error;
3200}
3201
3202/*
3203 * Try to merge a non-leaf block back into the inode root.
3204 *
3205 * Note: the killroot names comes from the fact that we're effectively
3206 * killing the old root block.  But because we can't just delete the
3207 * inode we have to copy the single block it was pointing to into the
3208 * inode.
3209 */
3210STATIC int
3211xfs_btree_kill_iroot(
3212        struct xfs_btree_cur    *cur)
3213{
3214        int                     whichfork = cur->bc_private.b.whichfork;
3215        struct xfs_inode        *ip = cur->bc_private.b.ip;
3216        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3217        struct xfs_btree_block  *block;
3218        struct xfs_btree_block  *cblock;
3219        union xfs_btree_key     *kp;
3220        union xfs_btree_key     *ckp;
3221        union xfs_btree_ptr     *pp;
3222        union xfs_btree_ptr     *cpp;
3223        struct xfs_buf          *cbp;
3224        int                     level;
3225        int                     index;
3226        int                     numrecs;
3227        int                     error;
3228#ifdef DEBUG
3229        union xfs_btree_ptr     ptr;
3230        int                     i;
3231#endif
3232
3233        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3234
3235        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3236        ASSERT(cur->bc_nlevels > 1);
3237
3238        /*
3239         * Don't deal with the root block needs to be a leaf case.
3240         * We're just going to turn the thing back into extents anyway.
3241         */
3242        level = cur->bc_nlevels - 1;
3243        if (level == 1)
3244                goto out0;
3245
3246        /*
3247         * Give up if the root has multiple children.
3248         */
3249        block = xfs_btree_get_iroot(cur);
3250        if (xfs_btree_get_numrecs(block) != 1)
3251                goto out0;
3252
3253        cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3254        numrecs = xfs_btree_get_numrecs(cblock);
3255
3256        /*
3257         * Only do this if the next level will fit.
3258         * Then the data must be copied up to the inode,
3259         * instead of freeing the root you free the next level.
3260         */
3261        if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3262                goto out0;
3263
3264        XFS_BTREE_STATS_INC(cur, killroot);
3265
3266#ifdef DEBUG
3267        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3268        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3269        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3270        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3271#endif
3272
3273        index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3274        if (index) {
3275                xfs_iroot_realloc(cur->bc_private.b.ip, index,
3276                                  cur->bc_private.b.whichfork);
3277                block = ifp->if_broot;
3278        }
3279
3280        be16_add_cpu(&block->bb_numrecs, index);
3281        ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3282
3283        kp = xfs_btree_key_addr(cur, 1, block);
3284        ckp = xfs_btree_key_addr(cur, 1, cblock);
3285        xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3286
3287        pp = xfs_btree_ptr_addr(cur, 1, block);
3288        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3289#ifdef DEBUG
3290        for (i = 0; i < numrecs; i++) {
3291                error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3292                if (error) {
3293                        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3294                        return error;
3295                }
3296        }
3297#endif
3298        xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3299
3300        error = xfs_btree_free_block(cur, cbp);
3301        if (error) {
3302                XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3303                return error;
3304        }
3305
3306        cur->bc_bufs[level - 1] = NULL;
3307        be16_add_cpu(&block->bb_level, -1);
3308        xfs_trans_log_inode(cur->bc_tp, ip,
3309                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3310        cur->bc_nlevels--;
3311out0:
3312        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3313        return 0;
3314}
3315
3316/*
3317 * Kill the current root node, and replace it with it's only child node.
3318 */
3319STATIC int
3320xfs_btree_kill_root(
3321        struct xfs_btree_cur    *cur,
3322        struct xfs_buf          *bp,
3323        int                     level,
3324        union xfs_btree_ptr     *newroot)
3325{
3326        int                     error;
3327
3328        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3329        XFS_BTREE_STATS_INC(cur, killroot);
3330
3331        /*
3332         * Update the root pointer, decreasing the level by 1 and then
3333         * free the old root.
3334         */
3335        cur->bc_ops->set_root(cur, newroot, -1);
3336
3337        error = xfs_btree_free_block(cur, bp);
3338        if (error) {
3339                XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3340                return error;
3341        }
3342
3343        cur->bc_bufs[level] = NULL;
3344        cur->bc_ra[level] = 0;
3345        cur->bc_nlevels--;
3346
3347        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3348        return 0;
3349}
3350
3351STATIC int
3352xfs_btree_dec_cursor(
3353        struct xfs_btree_cur    *cur,
3354        int                     level,
3355        int                     *stat)
3356{
3357        int                     error;
3358        int                     i;
3359
3360        if (level > 0) {
3361                error = xfs_btree_decrement(cur, level, &i);
3362                if (error)
3363                        return error;
3364        }
3365
3366        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3367        *stat = 1;
3368        return 0;
3369}
3370
3371/*
3372 * Single level of the btree record deletion routine.
3373 * Delete record pointed to by cur/level.
3374 * Remove the record from its block then rebalance the tree.
3375 * Return 0 for error, 1 for done, 2 to go on to the next level.
3376 */
3377STATIC int                                      /* error */
3378xfs_btree_delrec(
3379        struct xfs_btree_cur    *cur,           /* btree cursor */
3380        int                     level,          /* level removing record from */
3381        int                     *stat)          /* fail/done/go-on */
3382{
3383        struct xfs_btree_block  *block;         /* btree block */
3384        union xfs_btree_ptr     cptr;           /* current block ptr */
3385        struct xfs_buf          *bp;            /* buffer for block */
3386        int                     error;          /* error return value */
3387        int                     i;              /* loop counter */
3388        union xfs_btree_key     key;            /* storage for keyp */
3389        union xfs_btree_key     *keyp = &key;   /* passed to the next level */
3390        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3391        struct xfs_buf          *lbp;           /* left buffer pointer */
3392        struct xfs_btree_block  *left;          /* left btree block */
3393        int                     lrecs = 0;      /* left record count */
3394        int                     ptr;            /* key/record index */
3395        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3396        struct xfs_buf          *rbp;           /* right buffer pointer */
3397        struct xfs_btree_block  *right;         /* right btree block */
3398        struct xfs_btree_block  *rrblock;       /* right-right btree block */
3399        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3400        int                     rrecs = 0;      /* right record count */
3401        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3402        int                     numrecs;        /* temporary numrec count */
3403
3404        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3405        XFS_BTREE_TRACE_ARGI(cur, level);
3406
3407        tcur = NULL;
3408
3409        /* Get the index of the entry being deleted, check for nothing there. */
3410        ptr = cur->bc_ptrs[level];
3411        if (ptr == 0) {
3412                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3413                *stat = 0;
3414                return 0;
3415        }
3416
3417        /* Get the buffer & block containing the record or key/ptr. */
3418        block = xfs_btree_get_block(cur, level, &bp);
3419        numrecs = xfs_btree_get_numrecs(block);
3420
3421#ifdef DEBUG
3422        error = xfs_btree_check_block(cur, block, level, bp);
3423        if (error)
3424                goto error0;
3425#endif
3426
3427        /* Fail if we're off the end of the block. */
3428        if (ptr > numrecs) {
3429                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3430                *stat = 0;
3431                return 0;
3432        }
3433
3434        XFS_BTREE_STATS_INC(cur, delrec);
3435        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3436
3437        /* Excise the entries being deleted. */
3438        if (level > 0) {
3439                /* It's a nonleaf. operate on keys and ptrs */
3440                union xfs_btree_key     *lkp;
3441                union xfs_btree_ptr     *lpp;
3442
3443                lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3444                lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3445
3446#ifdef DEBUG
3447                for (i = 0; i < numrecs - ptr; i++) {
3448                        error = xfs_btree_check_ptr(cur, lpp, i, level);
3449                        if (error)
3450                                goto error0;
3451                }
3452#endif
3453
3454                if (ptr < numrecs) {
3455                        xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3456                        xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3457                        xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3458                        xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3459                }
3460
3461                /*
3462                 * If it's the first record in the block, we'll need to pass a
3463                 * key up to the next level (updkey).
3464                 */
3465                if (ptr == 1)
3466                        keyp = xfs_btree_key_addr(cur, 1, block);
3467        } else {
3468                /* It's a leaf. operate on records */
3469                if (ptr < numrecs) {
3470                        xfs_btree_shift_recs(cur,
3471                                xfs_btree_rec_addr(cur, ptr + 1, block),
3472                                -1, numrecs - ptr);
3473                        xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3474                }
3475
3476                /*
3477                 * If it's the first record in the block, we'll need a key
3478                 * structure to pass up to the next level (updkey).
3479                 */
3480                if (ptr == 1) {
3481                        cur->bc_ops->init_key_from_rec(&key,
3482                                        xfs_btree_rec_addr(cur, 1, block));
3483                        keyp = &key;
3484                }
3485        }
3486
3487        /*
3488         * Decrement and log the number of entries in the block.
3489         */
3490        xfs_btree_set_numrecs(block, --numrecs);
3491        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3492
3493        /*
3494         * If we are tracking the last record in the tree and
3495         * we are at the far right edge of the tree, update it.
3496         */
3497        if (xfs_btree_is_lastrec(cur, block, level)) {
3498                cur->bc_ops->update_lastrec(cur, block, NULL,
3499                                            ptr, LASTREC_DELREC);
3500        }
3501
3502        /*
3503         * We're at the root level.  First, shrink the root block in-memory.
3504         * Try to get rid of the next level down.  If we can't then there's
3505         * nothing left to do.
3506         */
3507        if (level == cur->bc_nlevels - 1) {
3508                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3509                        xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3510                                          cur->bc_private.b.whichfork);
3511
3512                        error = xfs_btree_kill_iroot(cur);
3513                        if (error)
3514                                goto error0;
3515
3516                        error = xfs_btree_dec_cursor(cur, level, stat);
3517                        if (error)
3518                                goto error0;
3519                        *stat = 1;
3520                        return 0;
3521                }
3522
3523                /*
3524                 * If this is the root level, and there's only one entry left,
3525                 * and it's NOT the leaf level, then we can get rid of this
3526                 * level.
3527                 */
3528                if (numrecs == 1 && level > 0) {
3529                        union xfs_btree_ptr     *pp;
3530                        /*
3531                         * pp is still set to the first pointer in the block.
3532                         * Make it the new root of the btree.
3533                         */
3534                        pp = xfs_btree_ptr_addr(cur, 1, block);
3535                        error = xfs_btree_kill_root(cur, bp, level, pp);
3536                        if (error)
3537                                goto error0;
3538                } else if (level > 0) {
3539                        error = xfs_btree_dec_cursor(cur, level, stat);
3540                        if (error)
3541                                goto error0;
3542                }
3543                *stat = 1;
3544                return 0;
3545        }
3546
3547        /*
3548         * If we deleted the leftmost entry in the block, update the
3549         * key values above us in the tree.
3550         */
3551        if (ptr == 1) {
3552                error = xfs_btree_updkey(cur, keyp, level + 1);
3553                if (error)
3554                        goto error0;
3555        }
3556
3557        /*
3558         * If the number of records remaining in the block is at least
3559         * the minimum, we're done.
3560         */
3561        if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3562                error = xfs_btree_dec_cursor(cur, level, stat);
3563                if (error)
3564                        goto error0;
3565                return 0;
3566        }
3567
3568        /*
3569         * Otherwise, we have to move some records around to keep the
3570         * tree balanced.  Look at the left and right sibling blocks to
3571         * see if we can re-balance by moving only one record.
3572         */
3573        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3574        xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3575
3576        if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3577                /*
3578                 * One child of root, need to get a chance to copy its contents
3579                 * into the root and delete it. Can't go up to next level,
3580                 * there's nothing to delete there.
3581                 */
3582                if (xfs_btree_ptr_is_null(cur, &rptr) &&
3583                    xfs_btree_ptr_is_null(cur, &lptr) &&
3584                    level == cur->bc_nlevels - 2) {
3585                        error = xfs_btree_kill_iroot(cur);
3586                        if (!error)
3587                                error = xfs_btree_dec_cursor(cur, level, stat);
3588                        if (error)
3589                                goto error0;
3590                        return 0;
3591                }
3592        }
3593
3594        ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3595               !xfs_btree_ptr_is_null(cur, &lptr));
3596
3597        /*
3598         * Duplicate the cursor so our btree manipulations here won't
3599         * disrupt the next level up.
3600         */
3601        error = xfs_btree_dup_cursor(cur, &tcur);
3602        if (error)
3603                goto error0;
3604
3605        /*
3606         * If there's a right sibling, see if it's ok to shift an entry
3607         * out of it.
3608         */
3609        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3610                /*
3611                 * Move the temp cursor to the last entry in the next block.
3612                 * Actually any entry but the first would suffice.
3613                 */
3614                i = xfs_btree_lastrec(tcur, level);
3615                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3616
3617                error = xfs_btree_increment(tcur, level, &i);
3618                if (error)
3619                        goto error0;
3620                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3621
3622                i = xfs_btree_lastrec(tcur, level);
3623                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3624
3625                /* Grab a pointer to the block. */
3626                right = xfs_btree_get_block(tcur, level, &rbp);
3627#ifdef DEBUG
3628                error = xfs_btree_check_block(tcur, right, level, rbp);
3629                if (error)
3630                        goto error0;
3631#endif
3632                /* Grab the current block number, for future use. */
3633                xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3634
3635                /*
3636                 * If right block is full enough so that removing one entry
3637                 * won't make it too empty, and left-shifting an entry out
3638                 * of right to us works, we're done.
3639                 */
3640                if (xfs_btree_get_numrecs(right) - 1 >=
3641                    cur->bc_ops->get_minrecs(tcur, level)) {
3642                        error = xfs_btree_lshift(tcur, level, &i);
3643                        if (error)
3644                                goto error0;
3645                        if (i) {
3646                                ASSERT(xfs_btree_get_numrecs(block) >=
3647                                       cur->bc_ops->get_minrecs(tcur, level));
3648
3649                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3650                                tcur = NULL;
3651
3652                                error = xfs_btree_dec_cursor(cur, level, stat);
3653                                if (error)
3654                                        goto error0;
3655                                return 0;
3656                        }
3657                }
3658
3659                /*
3660                 * Otherwise, grab the number of records in right for
3661                 * future reference, and fix up the temp cursor to point
3662                 * to our block again (last record).
3663                 */
3664                rrecs = xfs_btree_get_numrecs(right);
3665                if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3666                        i = xfs_btree_firstrec(tcur, level);
3667                        XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3668
3669                        error = xfs_btree_decrement(tcur, level, &i);
3670                        if (error)
3671                                goto error0;
3672                        XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3673                }
3674        }
3675
3676        /*
3677         * If there's a left sibling, see if it's ok to shift an entry
3678         * out of it.
3679         */
3680        if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3681                /*
3682                 * Move the temp cursor to the first entry in the
3683                 * previous block.
3684                 */
3685                i = xfs_btree_firstrec(tcur, level);
3686                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3687
3688                error = xfs_btree_decrement(tcur, level, &i);
3689                if (error)
3690                        goto error0;
3691                i = xfs_btree_firstrec(tcur, level);
3692                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3693
3694                /* Grab a pointer to the block. */
3695                left = xfs_btree_get_block(tcur, level, &lbp);
3696#ifdef DEBUG
3697                error = xfs_btree_check_block(cur, left, level, lbp);
3698                if (error)
3699                        goto error0;
3700#endif
3701                /* Grab the current block number, for future use. */
3702                xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3703
3704                /*
3705                 * If left block is full enough so that removing one entry
3706                 * won't make it too empty, and right-shifting an entry out
3707                 * of left to us works, we're done.
3708                 */
3709                if (xfs_btree_get_numrecs(left) - 1 >=
3710                    cur->bc_ops->get_minrecs(tcur, level)) {
3711                        error = xfs_btree_rshift(tcur, level, &i);
3712                        if (error)
3713                                goto error0;
3714                        if (i) {
3715                                ASSERT(xfs_btree_get_numrecs(block) >=
3716                                       cur->bc_ops->get_minrecs(tcur, level));
3717                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3718                                tcur = NULL;
3719                                if (level == 0)
3720                                        cur->bc_ptrs[0]++;
3721                                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3722                                *stat = 1;
3723                                return 0;
3724                        }
3725                }
3726
3727                /*
3728                 * Otherwise, grab the number of records in right for
3729                 * future reference.
3730                 */
3731                lrecs = xfs_btree_get_numrecs(left);
3732        }
3733
3734        /* Delete the temp cursor, we're done with it. */
3735        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3736        tcur = NULL;
3737
3738        /* If here, we need to do a join to keep the tree balanced. */
3739        ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3740
3741        if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3742            lrecs + xfs_btree_get_numrecs(block) <=
3743                        cur->bc_ops->get_maxrecs(cur, level)) {
3744                /*
3745                 * Set "right" to be the starting block,
3746                 * "left" to be the left neighbor.
3747                 */
3748                rptr = cptr;
3749                right = block;
3750                rbp = bp;
3751                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3752                if (error)
3753                        goto error0;
3754
3755        /*
3756         * If that won't work, see if we can join with the right neighbor block.
3757         */
3758        } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
3759                   rrecs + xfs_btree_get_numrecs(block) <=
3760                        cur->bc_ops->get_maxrecs(cur, level)) {
3761                /*
3762                 * Set "left" to be the starting block,
3763                 * "right" to be the right neighbor.
3764                 */
3765                lptr = cptr;
3766                left = block;
3767                lbp = bp;
3768                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3769                if (error)
3770                        goto error0;
3771
3772        /*
3773         * Otherwise, we can't fix the imbalance.
3774         * Just return.  This is probably a logic error, but it's not fatal.
3775         */
3776        } else {
3777                error = xfs_btree_dec_cursor(cur, level, stat);
3778                if (error)
3779                        goto error0;
3780                return 0;
3781        }
3782
3783        rrecs = xfs_btree_get_numrecs(right);
3784        lrecs = xfs_btree_get_numrecs(left);
3785
3786        /*
3787         * We're now going to join "left" and "right" by moving all the stuff
3788         * in "right" to "left" and deleting "right".
3789         */
3790        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
3791        if (level > 0) {
3792                /* It's a non-leaf.  Move keys and pointers. */
3793                union xfs_btree_key     *lkp;   /* left btree key */
3794                union xfs_btree_ptr     *lpp;   /* left address pointer */
3795                union xfs_btree_key     *rkp;   /* right btree key */
3796                union xfs_btree_ptr     *rpp;   /* right address pointer */
3797
3798                lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
3799                lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
3800                rkp = xfs_btree_key_addr(cur, 1, right);
3801                rpp = xfs_btree_ptr_addr(cur, 1, right);
3802#ifdef DEBUG
3803                for (i = 1; i < rrecs; i++) {
3804                        error = xfs_btree_check_ptr(cur, rpp, i, level);
3805                        if (error)
3806                                goto error0;
3807                }
3808#endif
3809                xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
3810                xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
3811
3812                xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
3813                xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
3814        } else {
3815                /* It's a leaf.  Move records.  */
3816                union xfs_btree_rec     *lrp;   /* left record pointer */
3817                union xfs_btree_rec     *rrp;   /* right record pointer */
3818
3819                lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
3820                rrp = xfs_btree_rec_addr(cur, 1, right);
3821
3822                xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
3823                xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
3824        }
3825
3826        XFS_BTREE_STATS_INC(cur, join);
3827
3828        /*
3829         * Fix up the number of records and right block pointer in the
3830         * surviving block, and log it.
3831         */
3832        xfs_btree_set_numrecs(left, lrecs + rrecs);
3833        xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
3834        xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3835        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
3836
3837        /* If there is a right sibling, point it to the remaining block. */
3838        xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3839        if (!xfs_btree_ptr_is_null(cur, &cptr)) {
3840                error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
3841                if (error)
3842                        goto error0;
3843                xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
3844                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
3845        }
3846
3847        /* Free the deleted block. */
3848        error = xfs_btree_free_block(cur, rbp);
3849        if (error)
3850                goto error0;
3851
3852        /*
3853         * If we joined with the left neighbor, set the buffer in the
3854         * cursor to the left block, and fix up the index.
3855         */
3856        if (bp != lbp) {
3857                cur->bc_bufs[level] = lbp;
3858                cur->bc_ptrs[level] += lrecs;
3859                cur->bc_ra[level] = 0;
3860        }
3861        /*
3862         * If we joined with the right neighbor and there's a level above
3863         * us, increment the cursor at that level.
3864         */
3865        else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
3866                   (level + 1 < cur->bc_nlevels)) {
3867                error = xfs_btree_increment(cur, level + 1, &i);
3868                if (error)
3869                        goto error0;
3870        }
3871
3872        /*
3873         * Readjust the ptr at this level if it's not a leaf, since it's
3874         * still pointing at the deletion point, which makes the cursor
3875         * inconsistent.  If this makes the ptr 0, the caller fixes it up.
3876         * We can't use decrement because it would change the next level up.
3877         */
3878        if (level > 0)
3879                cur->bc_ptrs[level]--;
3880
3881        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3882        /* Return value means the next level up has something to do. */
3883        *stat = 2;
3884        return 0;
3885
3886error0:
3887        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3888        if (tcur)
3889                xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
3890        return error;
3891}
3892
3893/*
3894 * Delete the record pointed to by cur.
3895 * The cursor refers to the place where the record was (could be inserted)
3896 * when the operation returns.
3897 */
3898int                                     /* error */
3899xfs_btree_delete(
3900        struct xfs_btree_cur    *cur,
3901        int                     *stat)  /* success/failure */
3902{
3903        int                     error;  /* error return value */
3904        int                     level;
3905        int                     i;
3906
3907        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3908
3909        /*
3910         * Go up the tree, starting at leaf level.
3911         *
3912         * If 2 is returned then a join was done; go to the next level.
3913         * Otherwise we are done.
3914         */
3915        for (level = 0, i = 2; i == 2; level++) {
3916                error = xfs_btree_delrec(cur, level, &i);
3917                if (error)
3918                        goto error0;
3919        }
3920
3921        if (i == 0) {
3922                for (level = 1; level < cur->bc_nlevels; level++) {
3923                        if (cur->bc_ptrs[level] == 0) {
3924                                error = xfs_btree_decrement(cur, level, &i);
3925                                if (error)
3926                                        goto error0;
3927                                break;
3928                        }
3929                }
3930        }
3931
3932        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3933        *stat = i;
3934        return 0;
3935error0:
3936        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3937        return error;
3938}
3939
3940/*
3941 * Get the data from the pointed-to record.
3942 */
3943int                                     /* error */
3944xfs_btree_get_rec(
3945        struct xfs_btree_cur    *cur,   /* btree cursor */
3946        union xfs_btree_rec     **recp, /* output: btree record */
3947        int                     *stat)  /* output: success/failure */
3948{
3949        struct xfs_btree_block  *block; /* btree block */
3950        struct xfs_buf          *bp;    /* buffer pointer */
3951        int                     ptr;    /* record number */
3952#ifdef DEBUG
3953        int                     error;  /* error return value */
3954#endif
3955
3956        ptr = cur->bc_ptrs[0];
3957        block = xfs_btree_get_block(cur, 0, &bp);
3958
3959#ifdef DEBUG
3960        error = xfs_btree_check_block(cur, block, 0, bp);
3961        if (error)
3962                return error;
3963#endif
3964
3965        /*
3966         * Off the right end or left end, return failure.
3967         */
3968        if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
3969                *stat = 0;
3970                return 0;
3971        }
3972
3973        /*
3974         * Point to the record and extract its data.
3975         */
3976        *recp = xfs_btree_rec_addr(cur, ptr, block);
3977        *stat = 1;
3978        return 0;
3979}
3980
3981/*
3982 * Change the owner of a btree.
3983 *
3984 * The mechanism we use here is ordered buffer logging. Because we don't know
3985 * how many buffers were are going to need to modify, we don't really want to
3986 * have to make transaction reservations for the worst case of every buffer in a
3987 * full size btree as that may be more space that we can fit in the log....
3988 *
3989 * We do the btree walk in the most optimal manner possible - we have sibling
3990 * pointers so we can just walk all the blocks on each level from left to right
3991 * in a single pass, and then move to the next level and do the same. We can
3992 * also do readahead on the sibling pointers to get IO moving more quickly,
3993 * though for slow disks this is unlikely to make much difference to performance
3994 * as the amount of CPU work we have to do before moving to the next block is
3995 * relatively small.
3996 *
3997 * For each btree block that we load, modify the owner appropriately, set the
3998 * buffer as an ordered buffer and log it appropriately. We need to ensure that
3999 * we mark the region we change dirty so that if the buffer is relogged in
4000 * a subsequent transaction the changes we make here as an ordered buffer are
4001 * correctly relogged in that transaction.  If we are in recovery context, then
4002 * just queue the modified buffer as delayed write buffer so the transaction
4003 * recovery completion writes the changes to disk.
4004 */
4005static int
4006xfs_btree_block_change_owner(
4007        struct xfs_btree_cur    *cur,
4008        int                     level,
4009        __uint64_t              new_owner,
4010        struct list_head        *buffer_list)
4011{
4012        struct xfs_btree_block  *block;
4013        struct xfs_buf          *bp;
4014        union xfs_btree_ptr     rptr;
4015
4016        /* do right sibling readahead */
4017        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4018
4019        /* modify the owner */
4020        block = xfs_btree_get_block(cur, level, &bp);
4021        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4022                block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
4023        else
4024                block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
4025
4026        /*
4027         * If the block is a root block hosted in an inode, we might not have a
4028         * buffer pointer here and we shouldn't attempt to log the change as the
4029         * information is already held in the inode and discarded when the root
4030         * block is formatted into the on-disk inode fork. We still change it,
4031         * though, so everything is consistent in memory.
4032         */
4033        if (bp) {
4034                if (cur->bc_tp) {
4035                        xfs_trans_ordered_buf(cur->bc_tp, bp);
4036                        xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4037                } else {
4038                        xfs_buf_delwri_queue(bp, buffer_list);
4039                }
4040        } else {
4041                ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4042                ASSERT(level == cur->bc_nlevels - 1);
4043        }
4044
4045        /* now read rh sibling block for next iteration */
4046        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4047        if (xfs_btree_ptr_is_null(cur, &rptr))
4048                return -ENOENT;
4049
4050        return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4051}
4052
4053int
4054xfs_btree_change_owner(
4055        struct xfs_btree_cur    *cur,
4056        __uint64_t              new_owner,
4057        struct list_head        *buffer_list)
4058{
4059        union xfs_btree_ptr     lptr;
4060        int                     level;
4061        struct xfs_btree_block  *block = NULL;
4062        int                     error = 0;
4063
4064        cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4065
4066        /* for each level */
4067        for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4068                /* grab the left hand block */
4069                error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4070                if (error)
4071                        return error;
4072
4073                /* readahead the left most block for the next level down */
4074                if (level > 0) {
4075                        union xfs_btree_ptr     *ptr;
4076
4077                        ptr = xfs_btree_ptr_addr(cur, 1, block);
4078                        xfs_btree_readahead_ptr(cur, ptr, 1);
4079
4080                        /* save for the next iteration of the loop */
4081                        lptr = *ptr;
4082                }
4083
4084                /* for each buffer in the level */
4085                do {
4086                        error = xfs_btree_block_change_owner(cur, level,
4087                                                             new_owner,
4088                                                             buffer_list);
4089                } while (!error);
4090
4091                if (error != -ENOENT)
4092                        return error;
4093        }
4094
4095        return 0;
4096}
4097
4098/**
4099 * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4100 *                                    btree block
4101 *
4102 * @bp: buffer containing the btree block
4103 * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
4104 * @pag_max_level: pointer to the per-ag max level field
4105 */
4106bool
4107xfs_btree_sblock_v5hdr_verify(
4108        struct xfs_buf          *bp)
4109{
4110        struct xfs_mount        *mp = bp->b_target->bt_mount;
4111        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4112        struct xfs_perag        *pag = bp->b_pag;
4113
4114        if (!xfs_sb_version_hascrc(&mp->m_sb))
4115                return false;
4116        if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4117                return false;
4118        if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
4119                return false;
4120        if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4121                return false;
4122        return true;
4123}
4124
4125/**
4126 * xfs_btree_sblock_verify() -- verify a short-format btree block
4127 *
4128 * @bp: buffer containing the btree block
4129 * @max_recs: maximum records allowed in this btree node
4130 */
4131bool
4132xfs_btree_sblock_verify(
4133        struct xfs_buf          *bp,
4134        unsigned int            max_recs)
4135{
4136        struct xfs_mount        *mp = bp->b_target->bt_mount;
4137        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4138
4139        /* numrecs verification */
4140        if (be16_to_cpu(block->bb_numrecs) > max_recs)
4141                return false;
4142
4143        /* sibling pointer verification */
4144        if (!block->bb_u.s.bb_leftsib ||
4145            (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
4146             block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
4147                return false;
4148        if (!block->bb_u.s.bb_rightsib ||
4149            (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
4150             block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
4151                return false;
4152
4153        return true;
4154}
4155