linux/fs/xfs/libxfs/xfs_btree.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_shared.h"
  21#include "xfs_format.h"
  22#include "xfs_log_format.h"
  23#include "xfs_trans_resv.h"
  24#include "xfs_bit.h"
  25#include "xfs_mount.h"
  26#include "xfs_inode.h"
  27#include "xfs_trans.h"
  28#include "xfs_inode_item.h"
  29#include "xfs_buf_item.h"
  30#include "xfs_btree.h"
  31#include "xfs_error.h"
  32#include "xfs_trace.h"
  33#include "xfs_cksum.h"
  34#include "xfs_alloc.h"
  35
  36/*
  37 * Cursor allocation zone.
  38 */
  39kmem_zone_t     *xfs_btree_cur_zone;
  40
  41/*
  42 * Btree magic numbers.
  43 */
  44static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
  45        { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
  46          XFS_FIBT_MAGIC },
  47        { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
  48          XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
  49};
  50#define xfs_btree_magic(cur) \
  51        xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
  52
  53
  54STATIC int                              /* error (0 or EFSCORRUPTED) */
  55xfs_btree_check_lblock(
  56        struct xfs_btree_cur    *cur,   /* btree cursor */
  57        struct xfs_btree_block  *block, /* btree long form block pointer */
  58        int                     level,  /* level of the btree block */
  59        struct xfs_buf          *bp)    /* buffer for block, if any */
  60{
  61        int                     lblock_ok = 1; /* block passes checks */
  62        struct xfs_mount        *mp;    /* file system mount point */
  63
  64        mp = cur->bc_mp;
  65
  66        if (xfs_sb_version_hascrc(&mp->m_sb)) {
  67                lblock_ok = lblock_ok &&
  68                        uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
  69                        block->bb_u.l.bb_blkno == cpu_to_be64(
  70                                bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
  71        }
  72
  73        lblock_ok = lblock_ok &&
  74                be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
  75                be16_to_cpu(block->bb_level) == level &&
  76                be16_to_cpu(block->bb_numrecs) <=
  77                        cur->bc_ops->get_maxrecs(cur, level) &&
  78                block->bb_u.l.bb_leftsib &&
  79                (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
  80                 XFS_FSB_SANITY_CHECK(mp,
  81                        be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
  82                block->bb_u.l.bb_rightsib &&
  83                (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
  84                 XFS_FSB_SANITY_CHECK(mp,
  85                        be64_to_cpu(block->bb_u.l.bb_rightsib)));
  86
  87        if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
  88                        XFS_ERRTAG_BTREE_CHECK_LBLOCK,
  89                        XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
  90                if (bp)
  91                        trace_xfs_btree_corrupt(bp, _RET_IP_);
  92                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
  93                return -EFSCORRUPTED;
  94        }
  95        return 0;
  96}
  97
  98STATIC int                              /* error (0 or EFSCORRUPTED) */
  99xfs_btree_check_sblock(
 100        struct xfs_btree_cur    *cur,   /* btree cursor */
 101        struct xfs_btree_block  *block, /* btree short form block pointer */
 102        int                     level,  /* level of the btree block */
 103        struct xfs_buf          *bp)    /* buffer containing block */
 104{
 105        struct xfs_mount        *mp;    /* file system mount point */
 106        struct xfs_buf          *agbp;  /* buffer for ag. freespace struct */
 107        struct xfs_agf          *agf;   /* ag. freespace structure */
 108        xfs_agblock_t           agflen; /* native ag. freespace length */
 109        int                     sblock_ok = 1; /* block passes checks */
 110
 111        mp = cur->bc_mp;
 112        agbp = cur->bc_private.a.agbp;
 113        agf = XFS_BUF_TO_AGF(agbp);
 114        agflen = be32_to_cpu(agf->agf_length);
 115
 116        if (xfs_sb_version_hascrc(&mp->m_sb)) {
 117                sblock_ok = sblock_ok &&
 118                        uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
 119                        block->bb_u.s.bb_blkno == cpu_to_be64(
 120                                bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
 121        }
 122
 123        sblock_ok = sblock_ok &&
 124                be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
 125                be16_to_cpu(block->bb_level) == level &&
 126                be16_to_cpu(block->bb_numrecs) <=
 127                        cur->bc_ops->get_maxrecs(cur, level) &&
 128                (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
 129                 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
 130                block->bb_u.s.bb_leftsib &&
 131                (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
 132                 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
 133                block->bb_u.s.bb_rightsib;
 134
 135        if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
 136                        XFS_ERRTAG_BTREE_CHECK_SBLOCK,
 137                        XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
 138                if (bp)
 139                        trace_xfs_btree_corrupt(bp, _RET_IP_);
 140                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
 141                return -EFSCORRUPTED;
 142        }
 143        return 0;
 144}
 145
 146/*
 147 * Debug routine: check that block header is ok.
 148 */
 149int
 150xfs_btree_check_block(
 151        struct xfs_btree_cur    *cur,   /* btree cursor */
 152        struct xfs_btree_block  *block, /* generic btree block pointer */
 153        int                     level,  /* level of the btree block */
 154        struct xfs_buf          *bp)    /* buffer containing block, if any */
 155{
 156        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 157                return xfs_btree_check_lblock(cur, block, level, bp);
 158        else
 159                return xfs_btree_check_sblock(cur, block, level, bp);
 160}
 161
 162/*
 163 * Check that (long) pointer is ok.
 164 */
 165int                                     /* error (0 or EFSCORRUPTED) */
 166xfs_btree_check_lptr(
 167        struct xfs_btree_cur    *cur,   /* btree cursor */
 168        xfs_fsblock_t           bno,    /* btree block disk address */
 169        int                     level)  /* btree block level */
 170{
 171        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
 172                level > 0 &&
 173                bno != NULLFSBLOCK &&
 174                XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
 175        return 0;
 176}
 177
 178#ifdef DEBUG
 179/*
 180 * Check that (short) pointer is ok.
 181 */
 182STATIC int                              /* error (0 or EFSCORRUPTED) */
 183xfs_btree_check_sptr(
 184        struct xfs_btree_cur    *cur,   /* btree cursor */
 185        xfs_agblock_t           bno,    /* btree block disk address */
 186        int                     level)  /* btree block level */
 187{
 188        xfs_agblock_t           agblocks = cur->bc_mp->m_sb.sb_agblocks;
 189
 190        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
 191                level > 0 &&
 192                bno != NULLAGBLOCK &&
 193                bno != 0 &&
 194                bno < agblocks);
 195        return 0;
 196}
 197
 198/*
 199 * Check that block ptr is ok.
 200 */
 201STATIC int                              /* error (0 or EFSCORRUPTED) */
 202xfs_btree_check_ptr(
 203        struct xfs_btree_cur    *cur,   /* btree cursor */
 204        union xfs_btree_ptr     *ptr,   /* btree block disk address */
 205        int                     index,  /* offset from ptr to check */
 206        int                     level)  /* btree block level */
 207{
 208        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 209                return xfs_btree_check_lptr(cur,
 210                                be64_to_cpu((&ptr->l)[index]), level);
 211        } else {
 212                return xfs_btree_check_sptr(cur,
 213                                be32_to_cpu((&ptr->s)[index]), level);
 214        }
 215}
 216#endif
 217
 218/*
 219 * Calculate CRC on the whole btree block and stuff it into the
 220 * long-form btree header.
 221 *
 222 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 223 * it into the buffer so recovery knows what the last modifcation was that made
 224 * it to disk.
 225 */
 226void
 227xfs_btree_lblock_calc_crc(
 228        struct xfs_buf          *bp)
 229{
 230        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 231        struct xfs_buf_log_item *bip = bp->b_fspriv;
 232
 233        if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
 234                return;
 235        if (bip)
 236                block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 237        xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 238}
 239
 240bool
 241xfs_btree_lblock_verify_crc(
 242        struct xfs_buf          *bp)
 243{
 244        if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
 245                return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 246
 247        return true;
 248}
 249
 250/*
 251 * Calculate CRC on the whole btree block and stuff it into the
 252 * short-form btree header.
 253 *
 254 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 255 * it into the buffer so recovery knows what the last modifcation was that made
 256 * it to disk.
 257 */
 258void
 259xfs_btree_sblock_calc_crc(
 260        struct xfs_buf          *bp)
 261{
 262        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 263        struct xfs_buf_log_item *bip = bp->b_fspriv;
 264
 265        if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
 266                return;
 267        if (bip)
 268                block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 269        xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 270}
 271
 272bool
 273xfs_btree_sblock_verify_crc(
 274        struct xfs_buf          *bp)
 275{
 276        if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
 277                return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 278
 279        return true;
 280}
 281
 282/*
 283 * Delete the btree cursor.
 284 */
 285void
 286xfs_btree_del_cursor(
 287        xfs_btree_cur_t *cur,           /* btree cursor */
 288        int             error)          /* del because of error */
 289{
 290        int             i;              /* btree level */
 291
 292        /*
 293         * Clear the buffer pointers, and release the buffers.
 294         * If we're doing this in the face of an error, we
 295         * need to make sure to inspect all of the entries
 296         * in the bc_bufs array for buffers to be unlocked.
 297         * This is because some of the btree code works from
 298         * level n down to 0, and if we get an error along
 299         * the way we won't have initialized all the entries
 300         * down to 0.
 301         */
 302        for (i = 0; i < cur->bc_nlevels; i++) {
 303                if (cur->bc_bufs[i])
 304                        xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
 305                else if (!error)
 306                        break;
 307        }
 308        /*
 309         * Can't free a bmap cursor without having dealt with the
 310         * allocated indirect blocks' accounting.
 311         */
 312        ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
 313               cur->bc_private.b.allocated == 0);
 314        /*
 315         * Free the cursor.
 316         */
 317        kmem_zone_free(xfs_btree_cur_zone, cur);
 318}
 319
 320/*
 321 * Duplicate the btree cursor.
 322 * Allocate a new one, copy the record, re-get the buffers.
 323 */
 324int                                     /* error */
 325xfs_btree_dup_cursor(
 326        xfs_btree_cur_t *cur,           /* input cursor */
 327        xfs_btree_cur_t **ncur)         /* output cursor */
 328{
 329        xfs_buf_t       *bp;            /* btree block's buffer pointer */
 330        int             error;          /* error return value */
 331        int             i;              /* level number of btree block */
 332        xfs_mount_t     *mp;            /* mount structure for filesystem */
 333        xfs_btree_cur_t *new;           /* new cursor value */
 334        xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
 335
 336        tp = cur->bc_tp;
 337        mp = cur->bc_mp;
 338
 339        /*
 340         * Allocate a new cursor like the old one.
 341         */
 342        new = cur->bc_ops->dup_cursor(cur);
 343
 344        /*
 345         * Copy the record currently in the cursor.
 346         */
 347        new->bc_rec = cur->bc_rec;
 348
 349        /*
 350         * For each level current, re-get the buffer and copy the ptr value.
 351         */
 352        for (i = 0; i < new->bc_nlevels; i++) {
 353                new->bc_ptrs[i] = cur->bc_ptrs[i];
 354                new->bc_ra[i] = cur->bc_ra[i];
 355                bp = cur->bc_bufs[i];
 356                if (bp) {
 357                        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 358                                                   XFS_BUF_ADDR(bp), mp->m_bsize,
 359                                                   0, &bp,
 360                                                   cur->bc_ops->buf_ops);
 361                        if (error) {
 362                                xfs_btree_del_cursor(new, error);
 363                                *ncur = NULL;
 364                                return error;
 365                        }
 366                }
 367                new->bc_bufs[i] = bp;
 368        }
 369        *ncur = new;
 370        return 0;
 371}
 372
 373/*
 374 * XFS btree block layout and addressing:
 375 *
 376 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 377 *
 378 * The leaf record start with a header then followed by records containing
 379 * the values.  A non-leaf block also starts with the same header, and
 380 * then first contains lookup keys followed by an equal number of pointers
 381 * to the btree blocks at the previous level.
 382 *
 383 *              +--------+-------+-------+-------+-------+-------+-------+
 384 * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 385 *              +--------+-------+-------+-------+-------+-------+-------+
 386 *
 387 *              +--------+-------+-------+-------+-------+-------+-------+
 388 * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 389 *              +--------+-------+-------+-------+-------+-------+-------+
 390 *
 391 * The header is called struct xfs_btree_block for reasons better left unknown
 392 * and comes in different versions for short (32bit) and long (64bit) block
 393 * pointers.  The record and key structures are defined by the btree instances
 394 * and opaque to the btree core.  The block pointers are simple disk endian
 395 * integers, available in a short (32bit) and long (64bit) variant.
 396 *
 397 * The helpers below calculate the offset of a given record, key or pointer
 398 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 399 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
 400 * inside the btree block is done using indices starting at one, not zero!
 401 */
 402
 403/*
 404 * Return size of the btree block header for this btree instance.
 405 */
 406static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
 407{
 408        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 409                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 410                        return XFS_BTREE_LBLOCK_CRC_LEN;
 411                return XFS_BTREE_LBLOCK_LEN;
 412        }
 413        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 414                return XFS_BTREE_SBLOCK_CRC_LEN;
 415        return XFS_BTREE_SBLOCK_LEN;
 416}
 417
 418/*
 419 * Return size of btree block pointers for this btree instance.
 420 */
 421static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
 422{
 423        return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
 424                sizeof(__be64) : sizeof(__be32);
 425}
 426
 427/*
 428 * Calculate offset of the n-th record in a btree block.
 429 */
 430STATIC size_t
 431xfs_btree_rec_offset(
 432        struct xfs_btree_cur    *cur,
 433        int                     n)
 434{
 435        return xfs_btree_block_len(cur) +
 436                (n - 1) * cur->bc_ops->rec_len;
 437}
 438
 439/*
 440 * Calculate offset of the n-th key in a btree block.
 441 */
 442STATIC size_t
 443xfs_btree_key_offset(
 444        struct xfs_btree_cur    *cur,
 445        int                     n)
 446{
 447        return xfs_btree_block_len(cur) +
 448                (n - 1) * cur->bc_ops->key_len;
 449}
 450
 451/*
 452 * Calculate offset of the n-th block pointer in a btree block.
 453 */
 454STATIC size_t
 455xfs_btree_ptr_offset(
 456        struct xfs_btree_cur    *cur,
 457        int                     n,
 458        int                     level)
 459{
 460        return xfs_btree_block_len(cur) +
 461                cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
 462                (n - 1) * xfs_btree_ptr_len(cur);
 463}
 464
 465/*
 466 * Return a pointer to the n-th record in the btree block.
 467 */
 468STATIC union xfs_btree_rec *
 469xfs_btree_rec_addr(
 470        struct xfs_btree_cur    *cur,
 471        int                     n,
 472        struct xfs_btree_block  *block)
 473{
 474        return (union xfs_btree_rec *)
 475                ((char *)block + xfs_btree_rec_offset(cur, n));
 476}
 477
 478/*
 479 * Return a pointer to the n-th key in the btree block.
 480 */
 481STATIC union xfs_btree_key *
 482xfs_btree_key_addr(
 483        struct xfs_btree_cur    *cur,
 484        int                     n,
 485        struct xfs_btree_block  *block)
 486{
 487        return (union xfs_btree_key *)
 488                ((char *)block + xfs_btree_key_offset(cur, n));
 489}
 490
 491/*
 492 * Return a pointer to the n-th block pointer in the btree block.
 493 */
 494STATIC union xfs_btree_ptr *
 495xfs_btree_ptr_addr(
 496        struct xfs_btree_cur    *cur,
 497        int                     n,
 498        struct xfs_btree_block  *block)
 499{
 500        int                     level = xfs_btree_get_level(block);
 501
 502        ASSERT(block->bb_level != 0);
 503
 504        return (union xfs_btree_ptr *)
 505                ((char *)block + xfs_btree_ptr_offset(cur, n, level));
 506}
 507
 508/*
 509 * Get the root block which is stored in the inode.
 510 *
 511 * For now this btree implementation assumes the btree root is always
 512 * stored in the if_broot field of an inode fork.
 513 */
 514STATIC struct xfs_btree_block *
 515xfs_btree_get_iroot(
 516       struct xfs_btree_cur    *cur)
 517{
 518       struct xfs_ifork        *ifp;
 519
 520       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
 521       return (struct xfs_btree_block *)ifp->if_broot;
 522}
 523
 524/*
 525 * Retrieve the block pointer from the cursor at the given level.
 526 * This may be an inode btree root or from a buffer.
 527 */
 528STATIC struct xfs_btree_block *         /* generic btree block pointer */
 529xfs_btree_get_block(
 530        struct xfs_btree_cur    *cur,   /* btree cursor */
 531        int                     level,  /* level in btree */
 532        struct xfs_buf          **bpp)  /* buffer containing the block */
 533{
 534        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 535            (level == cur->bc_nlevels - 1)) {
 536                *bpp = NULL;
 537                return xfs_btree_get_iroot(cur);
 538        }
 539
 540        *bpp = cur->bc_bufs[level];
 541        return XFS_BUF_TO_BLOCK(*bpp);
 542}
 543
 544/*
 545 * Get a buffer for the block, return it with no data read.
 546 * Long-form addressing.
 547 */
 548xfs_buf_t *                             /* buffer for fsbno */
 549xfs_btree_get_bufl(
 550        xfs_mount_t     *mp,            /* file system mount point */
 551        xfs_trans_t     *tp,            /* transaction pointer */
 552        xfs_fsblock_t   fsbno,          /* file system block number */
 553        uint            lock)           /* lock flags for get_buf */
 554{
 555        xfs_daddr_t             d;              /* real disk block address */
 556
 557        ASSERT(fsbno != NULLFSBLOCK);
 558        d = XFS_FSB_TO_DADDR(mp, fsbno);
 559        return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
 560}
 561
 562/*
 563 * Get a buffer for the block, return it with no data read.
 564 * Short-form addressing.
 565 */
 566xfs_buf_t *                             /* buffer for agno/agbno */
 567xfs_btree_get_bufs(
 568        xfs_mount_t     *mp,            /* file system mount point */
 569        xfs_trans_t     *tp,            /* transaction pointer */
 570        xfs_agnumber_t  agno,           /* allocation group number */
 571        xfs_agblock_t   agbno,          /* allocation group block number */
 572        uint            lock)           /* lock flags for get_buf */
 573{
 574        xfs_daddr_t             d;              /* real disk block address */
 575
 576        ASSERT(agno != NULLAGNUMBER);
 577        ASSERT(agbno != NULLAGBLOCK);
 578        d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 579        return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
 580}
 581
 582/*
 583 * Check for the cursor referring to the last block at the given level.
 584 */
 585int                                     /* 1=is last block, 0=not last block */
 586xfs_btree_islastblock(
 587        xfs_btree_cur_t         *cur,   /* btree cursor */
 588        int                     level)  /* level to check */
 589{
 590        struct xfs_btree_block  *block; /* generic btree block pointer */
 591        xfs_buf_t               *bp;    /* buffer containing block */
 592
 593        block = xfs_btree_get_block(cur, level, &bp);
 594        xfs_btree_check_block(cur, block, level, bp);
 595        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 596                return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
 597        else
 598                return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
 599}
 600
 601/*
 602 * Change the cursor to point to the first record at the given level.
 603 * Other levels are unaffected.
 604 */
 605STATIC int                              /* success=1, failure=0 */
 606xfs_btree_firstrec(
 607        xfs_btree_cur_t         *cur,   /* btree cursor */
 608        int                     level)  /* level to change */
 609{
 610        struct xfs_btree_block  *block; /* generic btree block pointer */
 611        xfs_buf_t               *bp;    /* buffer containing block */
 612
 613        /*
 614         * Get the block pointer for this level.
 615         */
 616        block = xfs_btree_get_block(cur, level, &bp);
 617        xfs_btree_check_block(cur, block, level, bp);
 618        /*
 619         * It's empty, there is no such record.
 620         */
 621        if (!block->bb_numrecs)
 622                return 0;
 623        /*
 624         * Set the ptr value to 1, that's the first record/key.
 625         */
 626        cur->bc_ptrs[level] = 1;
 627        return 1;
 628}
 629
 630/*
 631 * Change the cursor to point to the last record in the current block
 632 * at the given level.  Other levels are unaffected.
 633 */
 634STATIC int                              /* success=1, failure=0 */
 635xfs_btree_lastrec(
 636        xfs_btree_cur_t         *cur,   /* btree cursor */
 637        int                     level)  /* level to change */
 638{
 639        struct xfs_btree_block  *block; /* generic btree block pointer */
 640        xfs_buf_t               *bp;    /* buffer containing block */
 641
 642        /*
 643         * Get the block pointer for this level.
 644         */
 645        block = xfs_btree_get_block(cur, level, &bp);
 646        xfs_btree_check_block(cur, block, level, bp);
 647        /*
 648         * It's empty, there is no such record.
 649         */
 650        if (!block->bb_numrecs)
 651                return 0;
 652        /*
 653         * Set the ptr value to numrecs, that's the last record/key.
 654         */
 655        cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
 656        return 1;
 657}
 658
 659/*
 660 * Compute first and last byte offsets for the fields given.
 661 * Interprets the offsets table, which contains struct field offsets.
 662 */
 663void
 664xfs_btree_offsets(
 665        __int64_t       fields,         /* bitmask of fields */
 666        const short     *offsets,       /* table of field offsets */
 667        int             nbits,          /* number of bits to inspect */
 668        int             *first,         /* output: first byte offset */
 669        int             *last)          /* output: last byte offset */
 670{
 671        int             i;              /* current bit number */
 672        __int64_t       imask;          /* mask for current bit number */
 673
 674        ASSERT(fields != 0);
 675        /*
 676         * Find the lowest bit, so the first byte offset.
 677         */
 678        for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
 679                if (imask & fields) {
 680                        *first = offsets[i];
 681                        break;
 682                }
 683        }
 684        /*
 685         * Find the highest bit, so the last byte offset.
 686         */
 687        for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
 688                if (imask & fields) {
 689                        *last = offsets[i + 1] - 1;
 690                        break;
 691                }
 692        }
 693}
 694
 695/*
 696 * Get a buffer for the block, return it read in.
 697 * Long-form addressing.
 698 */
 699int
 700xfs_btree_read_bufl(
 701        struct xfs_mount        *mp,            /* file system mount point */
 702        struct xfs_trans        *tp,            /* transaction pointer */
 703        xfs_fsblock_t           fsbno,          /* file system block number */
 704        uint                    lock,           /* lock flags for read_buf */
 705        struct xfs_buf          **bpp,          /* buffer for fsbno */
 706        int                     refval,         /* ref count value for buffer */
 707        const struct xfs_buf_ops *ops)
 708{
 709        struct xfs_buf          *bp;            /* return value */
 710        xfs_daddr_t             d;              /* real disk block address */
 711        int                     error;
 712
 713        ASSERT(fsbno != NULLFSBLOCK);
 714        d = XFS_FSB_TO_DADDR(mp, fsbno);
 715        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
 716                                   mp->m_bsize, lock, &bp, ops);
 717        if (error)
 718                return error;
 719        if (bp)
 720                xfs_buf_set_ref(bp, refval);
 721        *bpp = bp;
 722        return 0;
 723}
 724
 725/*
 726 * Read-ahead the block, don't wait for it, don't return a buffer.
 727 * Long-form addressing.
 728 */
 729/* ARGSUSED */
 730void
 731xfs_btree_reada_bufl(
 732        struct xfs_mount        *mp,            /* file system mount point */
 733        xfs_fsblock_t           fsbno,          /* file system block number */
 734        xfs_extlen_t            count,          /* count of filesystem blocks */
 735        const struct xfs_buf_ops *ops)
 736{
 737        xfs_daddr_t             d;
 738
 739        ASSERT(fsbno != NULLFSBLOCK);
 740        d = XFS_FSB_TO_DADDR(mp, fsbno);
 741        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 742}
 743
 744/*
 745 * Read-ahead the block, don't wait for it, don't return a buffer.
 746 * Short-form addressing.
 747 */
 748/* ARGSUSED */
 749void
 750xfs_btree_reada_bufs(
 751        struct xfs_mount        *mp,            /* file system mount point */
 752        xfs_agnumber_t          agno,           /* allocation group number */
 753        xfs_agblock_t           agbno,          /* allocation group block number */
 754        xfs_extlen_t            count,          /* count of filesystem blocks */
 755        const struct xfs_buf_ops *ops)
 756{
 757        xfs_daddr_t             d;
 758
 759        ASSERT(agno != NULLAGNUMBER);
 760        ASSERT(agbno != NULLAGBLOCK);
 761        d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 762        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 763}
 764
 765STATIC int
 766xfs_btree_readahead_lblock(
 767        struct xfs_btree_cur    *cur,
 768        int                     lr,
 769        struct xfs_btree_block  *block)
 770{
 771        int                     rval = 0;
 772        xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
 773        xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 774
 775        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
 776                xfs_btree_reada_bufl(cur->bc_mp, left, 1,
 777                                     cur->bc_ops->buf_ops);
 778                rval++;
 779        }
 780
 781        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
 782                xfs_btree_reada_bufl(cur->bc_mp, right, 1,
 783                                     cur->bc_ops->buf_ops);
 784                rval++;
 785        }
 786
 787        return rval;
 788}
 789
 790STATIC int
 791xfs_btree_readahead_sblock(
 792        struct xfs_btree_cur    *cur,
 793        int                     lr,
 794        struct xfs_btree_block *block)
 795{
 796        int                     rval = 0;
 797        xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
 798        xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
 799
 800
 801        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
 802                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
 803                                     left, 1, cur->bc_ops->buf_ops);
 804                rval++;
 805        }
 806
 807        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
 808                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
 809                                     right, 1, cur->bc_ops->buf_ops);
 810                rval++;
 811        }
 812
 813        return rval;
 814}
 815
 816/*
 817 * Read-ahead btree blocks, at the given level.
 818 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 819 */
 820STATIC int
 821xfs_btree_readahead(
 822        struct xfs_btree_cur    *cur,           /* btree cursor */
 823        int                     lev,            /* level in btree */
 824        int                     lr)             /* left/right bits */
 825{
 826        struct xfs_btree_block  *block;
 827
 828        /*
 829         * No readahead needed if we are at the root level and the
 830         * btree root is stored in the inode.
 831         */
 832        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 833            (lev == cur->bc_nlevels - 1))
 834                return 0;
 835
 836        if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
 837                return 0;
 838
 839        cur->bc_ra[lev] |= lr;
 840        block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
 841
 842        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 843                return xfs_btree_readahead_lblock(cur, lr, block);
 844        return xfs_btree_readahead_sblock(cur, lr, block);
 845}
 846
 847STATIC xfs_daddr_t
 848xfs_btree_ptr_to_daddr(
 849        struct xfs_btree_cur    *cur,
 850        union xfs_btree_ptr     *ptr)
 851{
 852        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 853                ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
 854
 855                return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
 856        } else {
 857                ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
 858                ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
 859
 860                return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
 861                                        be32_to_cpu(ptr->s));
 862        }
 863}
 864
 865/*
 866 * Readahead @count btree blocks at the given @ptr location.
 867 *
 868 * We don't need to care about long or short form btrees here as we have a
 869 * method of converting the ptr directly to a daddr available to us.
 870 */
 871STATIC void
 872xfs_btree_readahead_ptr(
 873        struct xfs_btree_cur    *cur,
 874        union xfs_btree_ptr     *ptr,
 875        xfs_extlen_t            count)
 876{
 877        xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
 878                          xfs_btree_ptr_to_daddr(cur, ptr),
 879                          cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
 880}
 881
 882/*
 883 * Set the buffer for level "lev" in the cursor to bp, releasing
 884 * any previous buffer.
 885 */
 886STATIC void
 887xfs_btree_setbuf(
 888        xfs_btree_cur_t         *cur,   /* btree cursor */
 889        int                     lev,    /* level in btree */
 890        xfs_buf_t               *bp)    /* new buffer to set */
 891{
 892        struct xfs_btree_block  *b;     /* btree block */
 893
 894        if (cur->bc_bufs[lev])
 895                xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
 896        cur->bc_bufs[lev] = bp;
 897        cur->bc_ra[lev] = 0;
 898
 899        b = XFS_BUF_TO_BLOCK(bp);
 900        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 901                if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
 902                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
 903                if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
 904                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
 905        } else {
 906                if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
 907                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
 908                if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
 909                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
 910        }
 911}
 912
 913STATIC int
 914xfs_btree_ptr_is_null(
 915        struct xfs_btree_cur    *cur,
 916        union xfs_btree_ptr     *ptr)
 917{
 918        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 919                return ptr->l == cpu_to_be64(NULLFSBLOCK);
 920        else
 921                return ptr->s == cpu_to_be32(NULLAGBLOCK);
 922}
 923
 924STATIC void
 925xfs_btree_set_ptr_null(
 926        struct xfs_btree_cur    *cur,
 927        union xfs_btree_ptr     *ptr)
 928{
 929        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 930                ptr->l = cpu_to_be64(NULLFSBLOCK);
 931        else
 932                ptr->s = cpu_to_be32(NULLAGBLOCK);
 933}
 934
 935/*
 936 * Get/set/init sibling pointers
 937 */
 938STATIC void
 939xfs_btree_get_sibling(
 940        struct xfs_btree_cur    *cur,
 941        struct xfs_btree_block  *block,
 942        union xfs_btree_ptr     *ptr,
 943        int                     lr)
 944{
 945        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
 946
 947        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 948                if (lr == XFS_BB_RIGHTSIB)
 949                        ptr->l = block->bb_u.l.bb_rightsib;
 950                else
 951                        ptr->l = block->bb_u.l.bb_leftsib;
 952        } else {
 953                if (lr == XFS_BB_RIGHTSIB)
 954                        ptr->s = block->bb_u.s.bb_rightsib;
 955                else
 956                        ptr->s = block->bb_u.s.bb_leftsib;
 957        }
 958}
 959
 960STATIC void
 961xfs_btree_set_sibling(
 962        struct xfs_btree_cur    *cur,
 963        struct xfs_btree_block  *block,
 964        union xfs_btree_ptr     *ptr,
 965        int                     lr)
 966{
 967        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
 968
 969        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 970                if (lr == XFS_BB_RIGHTSIB)
 971                        block->bb_u.l.bb_rightsib = ptr->l;
 972                else
 973                        block->bb_u.l.bb_leftsib = ptr->l;
 974        } else {
 975                if (lr == XFS_BB_RIGHTSIB)
 976                        block->bb_u.s.bb_rightsib = ptr->s;
 977                else
 978                        block->bb_u.s.bb_leftsib = ptr->s;
 979        }
 980}
 981
 982void
 983xfs_btree_init_block_int(
 984        struct xfs_mount        *mp,
 985        struct xfs_btree_block  *buf,
 986        xfs_daddr_t             blkno,
 987        __u32                   magic,
 988        __u16                   level,
 989        __u16                   numrecs,
 990        __u64                   owner,
 991        unsigned int            flags)
 992{
 993        buf->bb_magic = cpu_to_be32(magic);
 994        buf->bb_level = cpu_to_be16(level);
 995        buf->bb_numrecs = cpu_to_be16(numrecs);
 996
 997        if (flags & XFS_BTREE_LONG_PTRS) {
 998                buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
 999                buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1000                if (flags & XFS_BTREE_CRC_BLOCKS) {
1001                        buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1002                        buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1003                        uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
1004                        buf->bb_u.l.bb_pad = 0;
1005                        buf->bb_u.l.bb_lsn = 0;
1006                }
1007        } else {
1008                /* owner is a 32 bit value on short blocks */
1009                __u32 __owner = (__u32)owner;
1010
1011                buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1012                buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1013                if (flags & XFS_BTREE_CRC_BLOCKS) {
1014                        buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1015                        buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1016                        uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
1017                        buf->bb_u.s.bb_lsn = 0;
1018                }
1019        }
1020}
1021
1022void
1023xfs_btree_init_block(
1024        struct xfs_mount *mp,
1025        struct xfs_buf  *bp,
1026        __u32           magic,
1027        __u16           level,
1028        __u16           numrecs,
1029        __u64           owner,
1030        unsigned int    flags)
1031{
1032        xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1033                                 magic, level, numrecs, owner, flags);
1034}
1035
1036STATIC void
1037xfs_btree_init_block_cur(
1038        struct xfs_btree_cur    *cur,
1039        struct xfs_buf          *bp,
1040        int                     level,
1041        int                     numrecs)
1042{
1043        __u64 owner;
1044
1045        /*
1046         * we can pull the owner from the cursor right now as the different
1047         * owners align directly with the pointer size of the btree. This may
1048         * change in future, but is safe for current users of the generic btree
1049         * code.
1050         */
1051        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1052                owner = cur->bc_private.b.ip->i_ino;
1053        else
1054                owner = cur->bc_private.a.agno;
1055
1056        xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1057                                 xfs_btree_magic(cur), level, numrecs,
1058                                 owner, cur->bc_flags);
1059}
1060
1061/*
1062 * Return true if ptr is the last record in the btree and
1063 * we need to track updates to this record.  The decision
1064 * will be further refined in the update_lastrec method.
1065 */
1066STATIC int
1067xfs_btree_is_lastrec(
1068        struct xfs_btree_cur    *cur,
1069        struct xfs_btree_block  *block,
1070        int                     level)
1071{
1072        union xfs_btree_ptr     ptr;
1073
1074        if (level > 0)
1075                return 0;
1076        if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1077                return 0;
1078
1079        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1080        if (!xfs_btree_ptr_is_null(cur, &ptr))
1081                return 0;
1082        return 1;
1083}
1084
1085STATIC void
1086xfs_btree_buf_to_ptr(
1087        struct xfs_btree_cur    *cur,
1088        struct xfs_buf          *bp,
1089        union xfs_btree_ptr     *ptr)
1090{
1091        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1092                ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1093                                        XFS_BUF_ADDR(bp)));
1094        else {
1095                ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1096                                        XFS_BUF_ADDR(bp)));
1097        }
1098}
1099
1100STATIC void
1101xfs_btree_set_refs(
1102        struct xfs_btree_cur    *cur,
1103        struct xfs_buf          *bp)
1104{
1105        switch (cur->bc_btnum) {
1106        case XFS_BTNUM_BNO:
1107        case XFS_BTNUM_CNT:
1108                xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1109                break;
1110        case XFS_BTNUM_INO:
1111        case XFS_BTNUM_FINO:
1112                xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1113                break;
1114        case XFS_BTNUM_BMAP:
1115                xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1116                break;
1117        default:
1118                ASSERT(0);
1119        }
1120}
1121
1122STATIC int
1123xfs_btree_get_buf_block(
1124        struct xfs_btree_cur    *cur,
1125        union xfs_btree_ptr     *ptr,
1126        int                     flags,
1127        struct xfs_btree_block  **block,
1128        struct xfs_buf          **bpp)
1129{
1130        struct xfs_mount        *mp = cur->bc_mp;
1131        xfs_daddr_t             d;
1132
1133        /* need to sort out how callers deal with failures first */
1134        ASSERT(!(flags & XBF_TRYLOCK));
1135
1136        d = xfs_btree_ptr_to_daddr(cur, ptr);
1137        *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1138                                 mp->m_bsize, flags);
1139
1140        if (!*bpp)
1141                return -ENOMEM;
1142
1143        (*bpp)->b_ops = cur->bc_ops->buf_ops;
1144        *block = XFS_BUF_TO_BLOCK(*bpp);
1145        return 0;
1146}
1147
1148/*
1149 * Read in the buffer at the given ptr and return the buffer and
1150 * the block pointer within the buffer.
1151 */
1152STATIC int
1153xfs_btree_read_buf_block(
1154        struct xfs_btree_cur    *cur,
1155        union xfs_btree_ptr     *ptr,
1156        int                     flags,
1157        struct xfs_btree_block  **block,
1158        struct xfs_buf          **bpp)
1159{
1160        struct xfs_mount        *mp = cur->bc_mp;
1161        xfs_daddr_t             d;
1162        int                     error;
1163
1164        /* need to sort out how callers deal with failures first */
1165        ASSERT(!(flags & XBF_TRYLOCK));
1166
1167        d = xfs_btree_ptr_to_daddr(cur, ptr);
1168        error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1169                                   mp->m_bsize, flags, bpp,
1170                                   cur->bc_ops->buf_ops);
1171        if (error)
1172                return error;
1173
1174        xfs_btree_set_refs(cur, *bpp);
1175        *block = XFS_BUF_TO_BLOCK(*bpp);
1176        return 0;
1177}
1178
1179/*
1180 * Copy keys from one btree block to another.
1181 */
1182STATIC void
1183xfs_btree_copy_keys(
1184        struct xfs_btree_cur    *cur,
1185        union xfs_btree_key     *dst_key,
1186        union xfs_btree_key     *src_key,
1187        int                     numkeys)
1188{
1189        ASSERT(numkeys >= 0);
1190        memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1191}
1192
1193/*
1194 * Copy records from one btree block to another.
1195 */
1196STATIC void
1197xfs_btree_copy_recs(
1198        struct xfs_btree_cur    *cur,
1199        union xfs_btree_rec     *dst_rec,
1200        union xfs_btree_rec     *src_rec,
1201        int                     numrecs)
1202{
1203        ASSERT(numrecs >= 0);
1204        memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1205}
1206
1207/*
1208 * Copy block pointers from one btree block to another.
1209 */
1210STATIC void
1211xfs_btree_copy_ptrs(
1212        struct xfs_btree_cur    *cur,
1213        union xfs_btree_ptr     *dst_ptr,
1214        union xfs_btree_ptr     *src_ptr,
1215        int                     numptrs)
1216{
1217        ASSERT(numptrs >= 0);
1218        memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1219}
1220
1221/*
1222 * Shift keys one index left/right inside a single btree block.
1223 */
1224STATIC void
1225xfs_btree_shift_keys(
1226        struct xfs_btree_cur    *cur,
1227        union xfs_btree_key     *key,
1228        int                     dir,
1229        int                     numkeys)
1230{
1231        char                    *dst_key;
1232
1233        ASSERT(numkeys >= 0);
1234        ASSERT(dir == 1 || dir == -1);
1235
1236        dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1237        memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1238}
1239
1240/*
1241 * Shift records one index left/right inside a single btree block.
1242 */
1243STATIC void
1244xfs_btree_shift_recs(
1245        struct xfs_btree_cur    *cur,
1246        union xfs_btree_rec     *rec,
1247        int                     dir,
1248        int                     numrecs)
1249{
1250        char                    *dst_rec;
1251
1252        ASSERT(numrecs >= 0);
1253        ASSERT(dir == 1 || dir == -1);
1254
1255        dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1256        memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1257}
1258
1259/*
1260 * Shift block pointers one index left/right inside a single btree block.
1261 */
1262STATIC void
1263xfs_btree_shift_ptrs(
1264        struct xfs_btree_cur    *cur,
1265        union xfs_btree_ptr     *ptr,
1266        int                     dir,
1267        int                     numptrs)
1268{
1269        char                    *dst_ptr;
1270
1271        ASSERT(numptrs >= 0);
1272        ASSERT(dir == 1 || dir == -1);
1273
1274        dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1275        memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1276}
1277
1278/*
1279 * Log key values from the btree block.
1280 */
1281STATIC void
1282xfs_btree_log_keys(
1283        struct xfs_btree_cur    *cur,
1284        struct xfs_buf          *bp,
1285        int                     first,
1286        int                     last)
1287{
1288        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1289        XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1290
1291        if (bp) {
1292                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1293                xfs_trans_log_buf(cur->bc_tp, bp,
1294                                  xfs_btree_key_offset(cur, first),
1295                                  xfs_btree_key_offset(cur, last + 1) - 1);
1296        } else {
1297                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1298                                xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1299        }
1300
1301        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1302}
1303
1304/*
1305 * Log record values from the btree block.
1306 */
1307void
1308xfs_btree_log_recs(
1309        struct xfs_btree_cur    *cur,
1310        struct xfs_buf          *bp,
1311        int                     first,
1312        int                     last)
1313{
1314        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1315        XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1316
1317        xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1318        xfs_trans_log_buf(cur->bc_tp, bp,
1319                          xfs_btree_rec_offset(cur, first),
1320                          xfs_btree_rec_offset(cur, last + 1) - 1);
1321
1322        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1323}
1324
1325/*
1326 * Log block pointer fields from a btree block (nonleaf).
1327 */
1328STATIC void
1329xfs_btree_log_ptrs(
1330        struct xfs_btree_cur    *cur,   /* btree cursor */
1331        struct xfs_buf          *bp,    /* buffer containing btree block */
1332        int                     first,  /* index of first pointer to log */
1333        int                     last)   /* index of last pointer to log */
1334{
1335        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1336        XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1337
1338        if (bp) {
1339                struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1340                int                     level = xfs_btree_get_level(block);
1341
1342                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1343                xfs_trans_log_buf(cur->bc_tp, bp,
1344                                xfs_btree_ptr_offset(cur, first, level),
1345                                xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1346        } else {
1347                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1348                        xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1349        }
1350
1351        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1352}
1353
1354/*
1355 * Log fields from a btree block header.
1356 */
1357void
1358xfs_btree_log_block(
1359        struct xfs_btree_cur    *cur,   /* btree cursor */
1360        struct xfs_buf          *bp,    /* buffer containing btree block */
1361        int                     fields) /* mask of fields: XFS_BB_... */
1362{
1363        int                     first;  /* first byte offset logged */
1364        int                     last;   /* last byte offset logged */
1365        static const short      soffsets[] = {  /* table of offsets (short) */
1366                offsetof(struct xfs_btree_block, bb_magic),
1367                offsetof(struct xfs_btree_block, bb_level),
1368                offsetof(struct xfs_btree_block, bb_numrecs),
1369                offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1370                offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1371                offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1372                offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1373                offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1374                offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1375                offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1376                XFS_BTREE_SBLOCK_CRC_LEN
1377        };
1378        static const short      loffsets[] = {  /* table of offsets (long) */
1379                offsetof(struct xfs_btree_block, bb_magic),
1380                offsetof(struct xfs_btree_block, bb_level),
1381                offsetof(struct xfs_btree_block, bb_numrecs),
1382                offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1383                offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1384                offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1385                offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1386                offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1387                offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1388                offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1389                offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1390                XFS_BTREE_LBLOCK_CRC_LEN
1391        };
1392
1393        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1394        XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
1395
1396        if (bp) {
1397                int nbits;
1398
1399                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1400                        /*
1401                         * We don't log the CRC when updating a btree
1402                         * block but instead recreate it during log
1403                         * recovery.  As the log buffers have checksums
1404                         * of their own this is safe and avoids logging a crc
1405                         * update in a lot of places.
1406                         */
1407                        if (fields == XFS_BB_ALL_BITS)
1408                                fields = XFS_BB_ALL_BITS_CRC;
1409                        nbits = XFS_BB_NUM_BITS_CRC;
1410                } else {
1411                        nbits = XFS_BB_NUM_BITS;
1412                }
1413                xfs_btree_offsets(fields,
1414                                  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1415                                        loffsets : soffsets,
1416                                  nbits, &first, &last);
1417                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1418                xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1419        } else {
1420                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1421                        xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1422        }
1423
1424        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1425}
1426
1427/*
1428 * Increment cursor by one record at the level.
1429 * For nonzero levels the leaf-ward information is untouched.
1430 */
1431int                                             /* error */
1432xfs_btree_increment(
1433        struct xfs_btree_cur    *cur,
1434        int                     level,
1435        int                     *stat)          /* success/failure */
1436{
1437        struct xfs_btree_block  *block;
1438        union xfs_btree_ptr     ptr;
1439        struct xfs_buf          *bp;
1440        int                     error;          /* error return value */
1441        int                     lev;
1442
1443        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1444        XFS_BTREE_TRACE_ARGI(cur, level);
1445
1446        ASSERT(level < cur->bc_nlevels);
1447
1448        /* Read-ahead to the right at this level. */
1449        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1450
1451        /* Get a pointer to the btree block. */
1452        block = xfs_btree_get_block(cur, level, &bp);
1453
1454#ifdef DEBUG
1455        error = xfs_btree_check_block(cur, block, level, bp);
1456        if (error)
1457                goto error0;
1458#endif
1459
1460        /* We're done if we remain in the block after the increment. */
1461        if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1462                goto out1;
1463
1464        /* Fail if we just went off the right edge of the tree. */
1465        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1466        if (xfs_btree_ptr_is_null(cur, &ptr))
1467                goto out0;
1468
1469        XFS_BTREE_STATS_INC(cur, increment);
1470
1471        /*
1472         * March up the tree incrementing pointers.
1473         * Stop when we don't go off the right edge of a block.
1474         */
1475        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1476                block = xfs_btree_get_block(cur, lev, &bp);
1477
1478#ifdef DEBUG
1479                error = xfs_btree_check_block(cur, block, lev, bp);
1480                if (error)
1481                        goto error0;
1482#endif
1483
1484                if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1485                        break;
1486
1487                /* Read-ahead the right block for the next loop. */
1488                xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1489        }
1490
1491        /*
1492         * If we went off the root then we are either seriously
1493         * confused or have the tree root in an inode.
1494         */
1495        if (lev == cur->bc_nlevels) {
1496                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1497                        goto out0;
1498                ASSERT(0);
1499                error = -EFSCORRUPTED;
1500                goto error0;
1501        }
1502        ASSERT(lev < cur->bc_nlevels);
1503
1504        /*
1505         * Now walk back down the tree, fixing up the cursor's buffer
1506         * pointers and key numbers.
1507         */
1508        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1509                union xfs_btree_ptr     *ptrp;
1510
1511                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1512                --lev;
1513                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1514                if (error)
1515                        goto error0;
1516
1517                xfs_btree_setbuf(cur, lev, bp);
1518                cur->bc_ptrs[lev] = 1;
1519        }
1520out1:
1521        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1522        *stat = 1;
1523        return 0;
1524
1525out0:
1526        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1527        *stat = 0;
1528        return 0;
1529
1530error0:
1531        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1532        return error;
1533}
1534
1535/*
1536 * Decrement cursor by one record at the level.
1537 * For nonzero levels the leaf-ward information is untouched.
1538 */
1539int                                             /* error */
1540xfs_btree_decrement(
1541        struct xfs_btree_cur    *cur,
1542        int                     level,
1543        int                     *stat)          /* success/failure */
1544{
1545        struct xfs_btree_block  *block;
1546        xfs_buf_t               *bp;
1547        int                     error;          /* error return value */
1548        int                     lev;
1549        union xfs_btree_ptr     ptr;
1550
1551        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1552        XFS_BTREE_TRACE_ARGI(cur, level);
1553
1554        ASSERT(level < cur->bc_nlevels);
1555
1556        /* Read-ahead to the left at this level. */
1557        xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1558
1559        /* We're done if we remain in the block after the decrement. */
1560        if (--cur->bc_ptrs[level] > 0)
1561                goto out1;
1562
1563        /* Get a pointer to the btree block. */
1564        block = xfs_btree_get_block(cur, level, &bp);
1565
1566#ifdef DEBUG
1567        error = xfs_btree_check_block(cur, block, level, bp);
1568        if (error)
1569                goto error0;
1570#endif
1571
1572        /* Fail if we just went off the left edge of the tree. */
1573        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1574        if (xfs_btree_ptr_is_null(cur, &ptr))
1575                goto out0;
1576
1577        XFS_BTREE_STATS_INC(cur, decrement);
1578
1579        /*
1580         * March up the tree decrementing pointers.
1581         * Stop when we don't go off the left edge of a block.
1582         */
1583        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1584                if (--cur->bc_ptrs[lev] > 0)
1585                        break;
1586                /* Read-ahead the left block for the next loop. */
1587                xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1588        }
1589
1590        /*
1591         * If we went off the root then we are seriously confused.
1592         * or the root of the tree is in an inode.
1593         */
1594        if (lev == cur->bc_nlevels) {
1595                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1596                        goto out0;
1597                ASSERT(0);
1598                error = -EFSCORRUPTED;
1599                goto error0;
1600        }
1601        ASSERT(lev < cur->bc_nlevels);
1602
1603        /*
1604         * Now walk back down the tree, fixing up the cursor's buffer
1605         * pointers and key numbers.
1606         */
1607        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1608                union xfs_btree_ptr     *ptrp;
1609
1610                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1611                --lev;
1612                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1613                if (error)
1614                        goto error0;
1615                xfs_btree_setbuf(cur, lev, bp);
1616                cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1617        }
1618out1:
1619        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1620        *stat = 1;
1621        return 0;
1622
1623out0:
1624        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1625        *stat = 0;
1626        return 0;
1627
1628error0:
1629        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1630        return error;
1631}
1632
1633STATIC int
1634xfs_btree_lookup_get_block(
1635        struct xfs_btree_cur    *cur,   /* btree cursor */
1636        int                     level,  /* level in the btree */
1637        union xfs_btree_ptr     *pp,    /* ptr to btree block */
1638        struct xfs_btree_block  **blkp) /* return btree block */
1639{
1640        struct xfs_buf          *bp;    /* buffer pointer for btree block */
1641        int                     error = 0;
1642
1643        /* special case the root block if in an inode */
1644        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1645            (level == cur->bc_nlevels - 1)) {
1646                *blkp = xfs_btree_get_iroot(cur);
1647                return 0;
1648        }
1649
1650        /*
1651         * If the old buffer at this level for the disk address we are
1652         * looking for re-use it.
1653         *
1654         * Otherwise throw it away and get a new one.
1655         */
1656        bp = cur->bc_bufs[level];
1657        if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1658                *blkp = XFS_BUF_TO_BLOCK(bp);
1659                return 0;
1660        }
1661
1662        error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1663        if (error)
1664                return error;
1665
1666        xfs_btree_setbuf(cur, level, bp);
1667        return 0;
1668}
1669
1670/*
1671 * Get current search key.  For level 0 we don't actually have a key
1672 * structure so we make one up from the record.  For all other levels
1673 * we just return the right key.
1674 */
1675STATIC union xfs_btree_key *
1676xfs_lookup_get_search_key(
1677        struct xfs_btree_cur    *cur,
1678        int                     level,
1679        int                     keyno,
1680        struct xfs_btree_block  *block,
1681        union xfs_btree_key     *kp)
1682{
1683        if (level == 0) {
1684                cur->bc_ops->init_key_from_rec(kp,
1685                                xfs_btree_rec_addr(cur, keyno, block));
1686                return kp;
1687        }
1688
1689        return xfs_btree_key_addr(cur, keyno, block);
1690}
1691
1692/*
1693 * Lookup the record.  The cursor is made to point to it, based on dir.
1694 * stat is set to 0 if can't find any such record, 1 for success.
1695 */
1696int                                     /* error */
1697xfs_btree_lookup(
1698        struct xfs_btree_cur    *cur,   /* btree cursor */
1699        xfs_lookup_t            dir,    /* <=, ==, or >= */
1700        int                     *stat)  /* success/failure */
1701{
1702        struct xfs_btree_block  *block; /* current btree block */
1703        __int64_t               diff;   /* difference for the current key */
1704        int                     error;  /* error return value */
1705        int                     keyno;  /* current key number */
1706        int                     level;  /* level in the btree */
1707        union xfs_btree_ptr     *pp;    /* ptr to btree block */
1708        union xfs_btree_ptr     ptr;    /* ptr to btree block */
1709
1710        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1711        XFS_BTREE_TRACE_ARGI(cur, dir);
1712
1713        XFS_BTREE_STATS_INC(cur, lookup);
1714
1715        block = NULL;
1716        keyno = 0;
1717
1718        /* initialise start pointer from cursor */
1719        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1720        pp = &ptr;
1721
1722        /*
1723         * Iterate over each level in the btree, starting at the root.
1724         * For each level above the leaves, find the key we need, based
1725         * on the lookup record, then follow the corresponding block
1726         * pointer down to the next level.
1727         */
1728        for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1729                /* Get the block we need to do the lookup on. */
1730                error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1731                if (error)
1732                        goto error0;
1733
1734                if (diff == 0) {
1735                        /*
1736                         * If we already had a key match at a higher level, we
1737                         * know we need to use the first entry in this block.
1738                         */
1739                        keyno = 1;
1740                } else {
1741                        /* Otherwise search this block. Do a binary search. */
1742
1743                        int     high;   /* high entry number */
1744                        int     low;    /* low entry number */
1745
1746                        /* Set low and high entry numbers, 1-based. */
1747                        low = 1;
1748                        high = xfs_btree_get_numrecs(block);
1749                        if (!high) {
1750                                /* Block is empty, must be an empty leaf. */
1751                                ASSERT(level == 0 && cur->bc_nlevels == 1);
1752
1753                                cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1754                                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1755                                *stat = 0;
1756                                return 0;
1757                        }
1758
1759                        /* Binary search the block. */
1760                        while (low <= high) {
1761                                union xfs_btree_key     key;
1762                                union xfs_btree_key     *kp;
1763
1764                                XFS_BTREE_STATS_INC(cur, compare);
1765
1766                                /* keyno is average of low and high. */
1767                                keyno = (low + high) >> 1;
1768
1769                                /* Get current search key */
1770                                kp = xfs_lookup_get_search_key(cur, level,
1771                                                keyno, block, &key);
1772
1773                                /*
1774                                 * Compute difference to get next direction:
1775                                 *  - less than, move right
1776                                 *  - greater than, move left
1777                                 *  - equal, we're done
1778                                 */
1779                                diff = cur->bc_ops->key_diff(cur, kp);
1780                                if (diff < 0)
1781                                        low = keyno + 1;
1782                                else if (diff > 0)
1783                                        high = keyno - 1;
1784                                else
1785                                        break;
1786                        }
1787                }
1788
1789                /*
1790                 * If there are more levels, set up for the next level
1791                 * by getting the block number and filling in the cursor.
1792                 */
1793                if (level > 0) {
1794                        /*
1795                         * If we moved left, need the previous key number,
1796                         * unless there isn't one.
1797                         */
1798                        if (diff > 0 && --keyno < 1)
1799                                keyno = 1;
1800                        pp = xfs_btree_ptr_addr(cur, keyno, block);
1801
1802#ifdef DEBUG
1803                        error = xfs_btree_check_ptr(cur, pp, 0, level);
1804                        if (error)
1805                                goto error0;
1806#endif
1807                        cur->bc_ptrs[level] = keyno;
1808                }
1809        }
1810
1811        /* Done with the search. See if we need to adjust the results. */
1812        if (dir != XFS_LOOKUP_LE && diff < 0) {
1813                keyno++;
1814                /*
1815                 * If ge search and we went off the end of the block, but it's
1816                 * not the last block, we're in the wrong block.
1817                 */
1818                xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1819                if (dir == XFS_LOOKUP_GE &&
1820                    keyno > xfs_btree_get_numrecs(block) &&
1821                    !xfs_btree_ptr_is_null(cur, &ptr)) {
1822                        int     i;
1823
1824                        cur->bc_ptrs[0] = keyno;
1825                        error = xfs_btree_increment(cur, 0, &i);
1826                        if (error)
1827                                goto error0;
1828                        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1829                        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1830                        *stat = 1;
1831                        return 0;
1832                }
1833        } else if (dir == XFS_LOOKUP_LE && diff > 0)
1834                keyno--;
1835        cur->bc_ptrs[0] = keyno;
1836
1837        /* Return if we succeeded or not. */
1838        if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1839                *stat = 0;
1840        else if (dir != XFS_LOOKUP_EQ || diff == 0)
1841                *stat = 1;
1842        else
1843                *stat = 0;
1844        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1845        return 0;
1846
1847error0:
1848        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1849        return error;
1850}
1851
1852/*
1853 * Update keys at all levels from here to the root along the cursor's path.
1854 */
1855STATIC int
1856xfs_btree_updkey(
1857        struct xfs_btree_cur    *cur,
1858        union xfs_btree_key     *keyp,
1859        int                     level)
1860{
1861        struct xfs_btree_block  *block;
1862        struct xfs_buf          *bp;
1863        union xfs_btree_key     *kp;
1864        int                     ptr;
1865
1866        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1867        XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
1868
1869        ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
1870
1871        /*
1872         * Go up the tree from this level toward the root.
1873         * At each level, update the key value to the value input.
1874         * Stop when we reach a level where the cursor isn't pointing
1875         * at the first entry in the block.
1876         */
1877        for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1878#ifdef DEBUG
1879                int             error;
1880#endif
1881                block = xfs_btree_get_block(cur, level, &bp);
1882#ifdef DEBUG
1883                error = xfs_btree_check_block(cur, block, level, bp);
1884                if (error) {
1885                        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1886                        return error;
1887                }
1888#endif
1889                ptr = cur->bc_ptrs[level];
1890                kp = xfs_btree_key_addr(cur, ptr, block);
1891                xfs_btree_copy_keys(cur, kp, keyp, 1);
1892                xfs_btree_log_keys(cur, bp, ptr, ptr);
1893        }
1894
1895        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1896        return 0;
1897}
1898
1899/*
1900 * Update the record referred to by cur to the value in the
1901 * given record. This either works (return 0) or gets an
1902 * EFSCORRUPTED error.
1903 */
1904int
1905xfs_btree_update(
1906        struct xfs_btree_cur    *cur,
1907        union xfs_btree_rec     *rec)
1908{
1909        struct xfs_btree_block  *block;
1910        struct xfs_buf          *bp;
1911        int                     error;
1912        int                     ptr;
1913        union xfs_btree_rec     *rp;
1914
1915        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1916        XFS_BTREE_TRACE_ARGR(cur, rec);
1917
1918        /* Pick up the current block. */
1919        block = xfs_btree_get_block(cur, 0, &bp);
1920
1921#ifdef DEBUG
1922        error = xfs_btree_check_block(cur, block, 0, bp);
1923        if (error)
1924                goto error0;
1925#endif
1926        /* Get the address of the rec to be updated. */
1927        ptr = cur->bc_ptrs[0];
1928        rp = xfs_btree_rec_addr(cur, ptr, block);
1929
1930        /* Fill in the new contents and log them. */
1931        xfs_btree_copy_recs(cur, rp, rec, 1);
1932        xfs_btree_log_recs(cur, bp, ptr, ptr);
1933
1934        /*
1935         * If we are tracking the last record in the tree and
1936         * we are at the far right edge of the tree, update it.
1937         */
1938        if (xfs_btree_is_lastrec(cur, block, 0)) {
1939                cur->bc_ops->update_lastrec(cur, block, rec,
1940                                            ptr, LASTREC_UPDATE);
1941        }
1942
1943        /* Updating first rec in leaf. Pass new key value up to our parent. */
1944        if (ptr == 1) {
1945                union xfs_btree_key     key;
1946
1947                cur->bc_ops->init_key_from_rec(&key, rec);
1948                error = xfs_btree_updkey(cur, &key, 1);
1949                if (error)
1950                        goto error0;
1951        }
1952
1953        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1954        return 0;
1955
1956error0:
1957        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1958        return error;
1959}
1960
1961/*
1962 * Move 1 record left from cur/level if possible.
1963 * Update cur to reflect the new path.
1964 */
1965STATIC int                                      /* error */
1966xfs_btree_lshift(
1967        struct xfs_btree_cur    *cur,
1968        int                     level,
1969        int                     *stat)          /* success/failure */
1970{
1971        union xfs_btree_key     key;            /* btree key */
1972        struct xfs_buf          *lbp;           /* left buffer pointer */
1973        struct xfs_btree_block  *left;          /* left btree block */
1974        int                     lrecs;          /* left record count */
1975        struct xfs_buf          *rbp;           /* right buffer pointer */
1976        struct xfs_btree_block  *right;         /* right btree block */
1977        int                     rrecs;          /* right record count */
1978        union xfs_btree_ptr     lptr;           /* left btree pointer */
1979        union xfs_btree_key     *rkp = NULL;    /* right btree key */
1980        union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
1981        union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
1982        int                     error;          /* error return value */
1983
1984        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1985        XFS_BTREE_TRACE_ARGI(cur, level);
1986
1987        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1988            level == cur->bc_nlevels - 1)
1989                goto out0;
1990
1991        /* Set up variables for this block as "right". */
1992        right = xfs_btree_get_block(cur, level, &rbp);
1993
1994#ifdef DEBUG
1995        error = xfs_btree_check_block(cur, right, level, rbp);
1996        if (error)
1997                goto error0;
1998#endif
1999
2000        /* If we've got no left sibling then we can't shift an entry left. */
2001        xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2002        if (xfs_btree_ptr_is_null(cur, &lptr))
2003                goto out0;
2004
2005        /*
2006         * If the cursor entry is the one that would be moved, don't
2007         * do it... it's too complicated.
2008         */
2009        if (cur->bc_ptrs[level] <= 1)
2010                goto out0;
2011
2012        /* Set up the left neighbor as "left". */
2013        error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2014        if (error)
2015                goto error0;
2016
2017        /* If it's full, it can't take another entry. */
2018        lrecs = xfs_btree_get_numrecs(left);
2019        if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2020                goto out0;
2021
2022        rrecs = xfs_btree_get_numrecs(right);
2023
2024        /*
2025         * We add one entry to the left side and remove one for the right side.
2026         * Account for it here, the changes will be updated on disk and logged
2027         * later.
2028         */
2029        lrecs++;
2030        rrecs--;
2031
2032        XFS_BTREE_STATS_INC(cur, lshift);
2033        XFS_BTREE_STATS_ADD(cur, moves, 1);
2034
2035        /*
2036         * If non-leaf, copy a key and a ptr to the left block.
2037         * Log the changes to the left block.
2038         */
2039        if (level > 0) {
2040                /* It's a non-leaf.  Move keys and pointers. */
2041                union xfs_btree_key     *lkp;   /* left btree key */
2042                union xfs_btree_ptr     *lpp;   /* left address pointer */
2043
2044                lkp = xfs_btree_key_addr(cur, lrecs, left);
2045                rkp = xfs_btree_key_addr(cur, 1, right);
2046
2047                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2048                rpp = xfs_btree_ptr_addr(cur, 1, right);
2049#ifdef DEBUG
2050                error = xfs_btree_check_ptr(cur, rpp, 0, level);
2051                if (error)
2052                        goto error0;
2053#endif
2054                xfs_btree_copy_keys(cur, lkp, rkp, 1);
2055                xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2056
2057                xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2058                xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2059
2060                ASSERT(cur->bc_ops->keys_inorder(cur,
2061                        xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2062        } else {
2063                /* It's a leaf.  Move records.  */
2064                union xfs_btree_rec     *lrp;   /* left record pointer */
2065
2066                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2067                rrp = xfs_btree_rec_addr(cur, 1, right);
2068
2069                xfs_btree_copy_recs(cur, lrp, rrp, 1);
2070                xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2071
2072                ASSERT(cur->bc_ops->recs_inorder(cur,
2073                        xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2074        }
2075
2076        xfs_btree_set_numrecs(left, lrecs);
2077        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2078
2079        xfs_btree_set_numrecs(right, rrecs);
2080        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2081
2082        /*
2083         * Slide the contents of right down one entry.
2084         */
2085        XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2086        if (level > 0) {
2087                /* It's a nonleaf. operate on keys and ptrs */
2088#ifdef DEBUG
2089                int                     i;              /* loop index */
2090
2091                for (i = 0; i < rrecs; i++) {
2092                        error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
2093                        if (error)
2094                                goto error0;
2095                }
2096#endif
2097                xfs_btree_shift_keys(cur,
2098                                xfs_btree_key_addr(cur, 2, right),
2099                                -1, rrecs);
2100                xfs_btree_shift_ptrs(cur,
2101                                xfs_btree_ptr_addr(cur, 2, right),
2102                                -1, rrecs);
2103
2104                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2105                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2106        } else {
2107                /* It's a leaf. operate on records */
2108                xfs_btree_shift_recs(cur,
2109                        xfs_btree_rec_addr(cur, 2, right),
2110                        -1, rrecs);
2111                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2112
2113                /*
2114                 * If it's the first record in the block, we'll need a key
2115                 * structure to pass up to the next level (updkey).
2116                 */
2117                cur->bc_ops->init_key_from_rec(&key,
2118                        xfs_btree_rec_addr(cur, 1, right));
2119                rkp = &key;
2120        }
2121
2122        /* Update the parent key values of right. */
2123        error = xfs_btree_updkey(cur, rkp, level + 1);
2124        if (error)
2125                goto error0;
2126
2127        /* Slide the cursor value left one. */
2128        cur->bc_ptrs[level]--;
2129
2130        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2131        *stat = 1;
2132        return 0;
2133
2134out0:
2135        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2136        *stat = 0;
2137        return 0;
2138
2139error0:
2140        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2141        return error;
2142}
2143
2144/*
2145 * Move 1 record right from cur/level if possible.
2146 * Update cur to reflect the new path.
2147 */
2148STATIC int                                      /* error */
2149xfs_btree_rshift(
2150        struct xfs_btree_cur    *cur,
2151        int                     level,
2152        int                     *stat)          /* success/failure */
2153{
2154        union xfs_btree_key     key;            /* btree key */
2155        struct xfs_buf          *lbp;           /* left buffer pointer */
2156        struct xfs_btree_block  *left;          /* left btree block */
2157        struct xfs_buf          *rbp;           /* right buffer pointer */
2158        struct xfs_btree_block  *right;         /* right btree block */
2159        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2160        union xfs_btree_ptr     rptr;           /* right block pointer */
2161        union xfs_btree_key     *rkp;           /* right btree key */
2162        int                     rrecs;          /* right record count */
2163        int                     lrecs;          /* left record count */
2164        int                     error;          /* error return value */
2165        int                     i;              /* loop counter */
2166
2167        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2168        XFS_BTREE_TRACE_ARGI(cur, level);
2169
2170        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2171            (level == cur->bc_nlevels - 1))
2172                goto out0;
2173
2174        /* Set up variables for this block as "left". */
2175        left = xfs_btree_get_block(cur, level, &lbp);
2176
2177#ifdef DEBUG
2178        error = xfs_btree_check_block(cur, left, level, lbp);
2179        if (error)
2180                goto error0;
2181#endif
2182
2183        /* If we've got no right sibling then we can't shift an entry right. */
2184        xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2185        if (xfs_btree_ptr_is_null(cur, &rptr))
2186                goto out0;
2187
2188        /*
2189         * If the cursor entry is the one that would be moved, don't
2190         * do it... it's too complicated.
2191         */
2192        lrecs = xfs_btree_get_numrecs(left);
2193        if (cur->bc_ptrs[level] >= lrecs)
2194                goto out0;
2195
2196        /* Set up the right neighbor as "right". */
2197        error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2198        if (error)
2199                goto error0;
2200
2201        /* If it's full, it can't take another entry. */
2202        rrecs = xfs_btree_get_numrecs(right);
2203        if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2204                goto out0;
2205
2206        XFS_BTREE_STATS_INC(cur, rshift);
2207        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2208
2209        /*
2210         * Make a hole at the start of the right neighbor block, then
2211         * copy the last left block entry to the hole.
2212         */
2213        if (level > 0) {
2214                /* It's a nonleaf. make a hole in the keys and ptrs */
2215                union xfs_btree_key     *lkp;
2216                union xfs_btree_ptr     *lpp;
2217                union xfs_btree_ptr     *rpp;
2218
2219                lkp = xfs_btree_key_addr(cur, lrecs, left);
2220                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2221                rkp = xfs_btree_key_addr(cur, 1, right);
2222                rpp = xfs_btree_ptr_addr(cur, 1, right);
2223
2224#ifdef DEBUG
2225                for (i = rrecs - 1; i >= 0; i--) {
2226                        error = xfs_btree_check_ptr(cur, rpp, i, level);
2227                        if (error)
2228                                goto error0;
2229                }
2230#endif
2231
2232                xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2233                xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2234
2235#ifdef DEBUG
2236                error = xfs_btree_check_ptr(cur, lpp, 0, level);
2237                if (error)
2238                        goto error0;
2239#endif
2240
2241                /* Now put the new data in, and log it. */
2242                xfs_btree_copy_keys(cur, rkp, lkp, 1);
2243                xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2244
2245                xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2246                xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2247
2248                ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2249                        xfs_btree_key_addr(cur, 2, right)));
2250        } else {
2251                /* It's a leaf. make a hole in the records */
2252                union xfs_btree_rec     *lrp;
2253                union xfs_btree_rec     *rrp;
2254
2255                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2256                rrp = xfs_btree_rec_addr(cur, 1, right);
2257
2258                xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2259
2260                /* Now put the new data in, and log it. */
2261                xfs_btree_copy_recs(cur, rrp, lrp, 1);
2262                xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2263
2264                cur->bc_ops->init_key_from_rec(&key, rrp);
2265                rkp = &key;
2266
2267                ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
2268                        xfs_btree_rec_addr(cur, 2, right)));
2269        }
2270
2271        /*
2272         * Decrement and log left's numrecs, bump and log right's numrecs.
2273         */
2274        xfs_btree_set_numrecs(left, --lrecs);
2275        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2276
2277        xfs_btree_set_numrecs(right, ++rrecs);
2278        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2279
2280        /*
2281         * Using a temporary cursor, update the parent key values of the
2282         * block on the right.
2283         */
2284        error = xfs_btree_dup_cursor(cur, &tcur);
2285        if (error)
2286                goto error0;
2287        i = xfs_btree_lastrec(tcur, level);
2288        XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
2289
2290        error = xfs_btree_increment(tcur, level, &i);
2291        if (error)
2292                goto error1;
2293
2294        error = xfs_btree_updkey(tcur, rkp, level + 1);
2295        if (error)
2296                goto error1;
2297
2298        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2299
2300        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2301        *stat = 1;
2302        return 0;
2303
2304out0:
2305        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2306        *stat = 0;
2307        return 0;
2308
2309error0:
2310        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2311        return error;
2312
2313error1:
2314        XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2315        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2316        return error;
2317}
2318
2319/*
2320 * Split cur/level block in half.
2321 * Return new block number and the key to its first
2322 * record (to be inserted into parent).
2323 */
2324STATIC int                                      /* error */
2325__xfs_btree_split(
2326        struct xfs_btree_cur    *cur,
2327        int                     level,
2328        union xfs_btree_ptr     *ptrp,
2329        union xfs_btree_key     *key,
2330        struct xfs_btree_cur    **curp,
2331        int                     *stat)          /* success/failure */
2332{
2333        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2334        struct xfs_buf          *lbp;           /* left buffer pointer */
2335        struct xfs_btree_block  *left;          /* left btree block */
2336        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2337        struct xfs_buf          *rbp;           /* right buffer pointer */
2338        struct xfs_btree_block  *right;         /* right btree block */
2339        union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2340        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2341        struct xfs_btree_block  *rrblock;       /* right-right btree block */
2342        int                     lrecs;
2343        int                     rrecs;
2344        int                     src_index;
2345        int                     error;          /* error return value */
2346#ifdef DEBUG
2347        int                     i;
2348#endif
2349
2350        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2351        XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2352
2353        XFS_BTREE_STATS_INC(cur, split);
2354
2355        /* Set up left block (current one). */
2356        left = xfs_btree_get_block(cur, level, &lbp);
2357
2358#ifdef DEBUG
2359        error = xfs_btree_check_block(cur, left, level, lbp);
2360        if (error)
2361                goto error0;
2362#endif
2363
2364        xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2365
2366        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2367        error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2368        if (error)
2369                goto error0;
2370        if (*stat == 0)
2371                goto out0;
2372        XFS_BTREE_STATS_INC(cur, alloc);
2373
2374        /* Set up the new block as "right". */
2375        error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2376        if (error)
2377                goto error0;
2378
2379        /* Fill in the btree header for the new right block. */
2380        xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2381
2382        /*
2383         * Split the entries between the old and the new block evenly.
2384         * Make sure that if there's an odd number of entries now, that
2385         * each new block will have the same number of entries.
2386         */
2387        lrecs = xfs_btree_get_numrecs(left);
2388        rrecs = lrecs / 2;
2389        if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2390                rrecs++;
2391        src_index = (lrecs - rrecs + 1);
2392
2393        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2394
2395        /*
2396         * Copy btree block entries from the left block over to the
2397         * new block, the right. Update the right block and log the
2398         * changes.
2399         */
2400        if (level > 0) {
2401                /* It's a non-leaf.  Move keys and pointers. */
2402                union xfs_btree_key     *lkp;   /* left btree key */
2403                union xfs_btree_ptr     *lpp;   /* left address pointer */
2404                union xfs_btree_key     *rkp;   /* right btree key */
2405                union xfs_btree_ptr     *rpp;   /* right address pointer */
2406
2407                lkp = xfs_btree_key_addr(cur, src_index, left);
2408                lpp = xfs_btree_ptr_addr(cur, src_index, left);
2409                rkp = xfs_btree_key_addr(cur, 1, right);
2410                rpp = xfs_btree_ptr_addr(cur, 1, right);
2411
2412#ifdef DEBUG
2413                for (i = src_index; i < rrecs; i++) {
2414                        error = xfs_btree_check_ptr(cur, lpp, i, level);
2415                        if (error)
2416                                goto error0;
2417                }
2418#endif
2419
2420                xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2421                xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2422
2423                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2424                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2425
2426                /* Grab the keys to the entries moved to the right block */
2427                xfs_btree_copy_keys(cur, key, rkp, 1);
2428        } else {
2429                /* It's a leaf.  Move records.  */
2430                union xfs_btree_rec     *lrp;   /* left record pointer */
2431                union xfs_btree_rec     *rrp;   /* right record pointer */
2432
2433                lrp = xfs_btree_rec_addr(cur, src_index, left);
2434                rrp = xfs_btree_rec_addr(cur, 1, right);
2435
2436                xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2437                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2438
2439                cur->bc_ops->init_key_from_rec(key,
2440                        xfs_btree_rec_addr(cur, 1, right));
2441        }
2442
2443
2444        /*
2445         * Find the left block number by looking in the buffer.
2446         * Adjust numrecs, sibling pointers.
2447         */
2448        xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2449        xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2450        xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2451        xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2452
2453        lrecs -= rrecs;
2454        xfs_btree_set_numrecs(left, lrecs);
2455        xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2456
2457        xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2458        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2459
2460        /*
2461         * If there's a block to the new block's right, make that block
2462         * point back to right instead of to left.
2463         */
2464        if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2465                error = xfs_btree_read_buf_block(cur, &rrptr,
2466                                                        0, &rrblock, &rrbp);
2467                if (error)
2468                        goto error0;
2469                xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2470                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2471        }
2472        /*
2473         * If the cursor is really in the right block, move it there.
2474         * If it's just pointing past the last entry in left, then we'll
2475         * insert there, so don't change anything in that case.
2476         */
2477        if (cur->bc_ptrs[level] > lrecs + 1) {
2478                xfs_btree_setbuf(cur, level, rbp);
2479                cur->bc_ptrs[level] -= lrecs;
2480        }
2481        /*
2482         * If there are more levels, we'll need another cursor which refers
2483         * the right block, no matter where this cursor was.
2484         */
2485        if (level + 1 < cur->bc_nlevels) {
2486                error = xfs_btree_dup_cursor(cur, curp);
2487                if (error)
2488                        goto error0;
2489                (*curp)->bc_ptrs[level + 1]++;
2490        }
2491        *ptrp = rptr;
2492        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2493        *stat = 1;
2494        return 0;
2495out0:
2496        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2497        *stat = 0;
2498        return 0;
2499
2500error0:
2501        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2502        return error;
2503}
2504
2505struct xfs_btree_split_args {
2506        struct xfs_btree_cur    *cur;
2507        int                     level;
2508        union xfs_btree_ptr     *ptrp;
2509        union xfs_btree_key     *key;
2510        struct xfs_btree_cur    **curp;
2511        int                     *stat;          /* success/failure */
2512        int                     result;
2513        bool                    kswapd; /* allocation in kswapd context */
2514        struct completion       *done;
2515        struct work_struct      work;
2516};
2517
2518/*
2519 * Stack switching interfaces for allocation
2520 */
2521static void
2522xfs_btree_split_worker(
2523        struct work_struct      *work)
2524{
2525        struct xfs_btree_split_args     *args = container_of(work,
2526                                                struct xfs_btree_split_args, work);
2527        unsigned long           pflags;
2528        unsigned long           new_pflags = PF_FSTRANS;
2529
2530        /*
2531         * we are in a transaction context here, but may also be doing work
2532         * in kswapd context, and hence we may need to inherit that state
2533         * temporarily to ensure that we don't block waiting for memory reclaim
2534         * in any way.
2535         */
2536        if (args->kswapd)
2537                new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2538
2539        current_set_flags_nested(&pflags, new_pflags);
2540
2541        args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2542                                         args->key, args->curp, args->stat);
2543        complete(args->done);
2544
2545        current_restore_flags_nested(&pflags, new_pflags);
2546}
2547
2548/*
2549 * BMBT split requests often come in with little stack to work on. Push
2550 * them off to a worker thread so there is lots of stack to use. For the other
2551 * btree types, just call directly to avoid the context switch overhead here.
2552 */
2553STATIC int                                      /* error */
2554xfs_btree_split(
2555        struct xfs_btree_cur    *cur,
2556        int                     level,
2557        union xfs_btree_ptr     *ptrp,
2558        union xfs_btree_key     *key,
2559        struct xfs_btree_cur    **curp,
2560        int                     *stat)          /* success/failure */
2561{
2562        struct xfs_btree_split_args     args;
2563        DECLARE_COMPLETION_ONSTACK(done);
2564
2565        if (cur->bc_btnum != XFS_BTNUM_BMAP)
2566                return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2567
2568        args.cur = cur;
2569        args.level = level;
2570        args.ptrp = ptrp;
2571        args.key = key;
2572        args.curp = curp;
2573        args.stat = stat;
2574        args.done = &done;
2575        args.kswapd = current_is_kswapd();
2576        INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2577        queue_work(xfs_alloc_wq, &args.work);
2578        wait_for_completion(&done);
2579        destroy_work_on_stack(&args.work);
2580        return args.result;
2581}
2582
2583
2584/*
2585 * Copy the old inode root contents into a real block and make the
2586 * broot point to it.
2587 */
2588int                                             /* error */
2589xfs_btree_new_iroot(
2590        struct xfs_btree_cur    *cur,           /* btree cursor */
2591        int                     *logflags,      /* logging flags for inode */
2592        int                     *stat)          /* return status - 0 fail */
2593{
2594        struct xfs_buf          *cbp;           /* buffer for cblock */
2595        struct xfs_btree_block  *block;         /* btree block */
2596        struct xfs_btree_block  *cblock;        /* child btree block */
2597        union xfs_btree_key     *ckp;           /* child key pointer */
2598        union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2599        union xfs_btree_key     *kp;            /* pointer to btree key */
2600        union xfs_btree_ptr     *pp;            /* pointer to block addr */
2601        union xfs_btree_ptr     nptr;           /* new block addr */
2602        int                     level;          /* btree level */
2603        int                     error;          /* error return code */
2604#ifdef DEBUG
2605        int                     i;              /* loop counter */
2606#endif
2607
2608        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2609        XFS_BTREE_STATS_INC(cur, newroot);
2610
2611        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2612
2613        level = cur->bc_nlevels - 1;
2614
2615        block = xfs_btree_get_iroot(cur);
2616        pp = xfs_btree_ptr_addr(cur, 1, block);
2617
2618        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2619        error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2620        if (error)
2621                goto error0;
2622        if (*stat == 0) {
2623                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2624                return 0;
2625        }
2626        XFS_BTREE_STATS_INC(cur, alloc);
2627
2628        /* Copy the root into a real block. */
2629        error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2630        if (error)
2631                goto error0;
2632
2633        /*
2634         * we can't just memcpy() the root in for CRC enabled btree blocks.
2635         * In that case have to also ensure the blkno remains correct
2636         */
2637        memcpy(cblock, block, xfs_btree_block_len(cur));
2638        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2639                if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2640                        cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2641                else
2642                        cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2643        }
2644
2645        be16_add_cpu(&block->bb_level, 1);
2646        xfs_btree_set_numrecs(block, 1);
2647        cur->bc_nlevels++;
2648        cur->bc_ptrs[level + 1] = 1;
2649
2650        kp = xfs_btree_key_addr(cur, 1, block);
2651        ckp = xfs_btree_key_addr(cur, 1, cblock);
2652        xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2653
2654        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2655#ifdef DEBUG
2656        for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2657                error = xfs_btree_check_ptr(cur, pp, i, level);
2658                if (error)
2659                        goto error0;
2660        }
2661#endif
2662        xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2663
2664#ifdef DEBUG
2665        error = xfs_btree_check_ptr(cur, &nptr, 0, level);
2666        if (error)
2667                goto error0;
2668#endif
2669        xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2670
2671        xfs_iroot_realloc(cur->bc_private.b.ip,
2672                          1 - xfs_btree_get_numrecs(cblock),
2673                          cur->bc_private.b.whichfork);
2674
2675        xfs_btree_setbuf(cur, level, cbp);
2676
2677        /*
2678         * Do all this logging at the end so that
2679         * the root is at the right level.
2680         */
2681        xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2682        xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2683        xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2684
2685        *logflags |=
2686                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
2687        *stat = 1;
2688        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2689        return 0;
2690error0:
2691        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2692        return error;
2693}
2694
2695/*
2696 * Allocate a new root block, fill it in.
2697 */
2698STATIC int                              /* error */
2699xfs_btree_new_root(
2700        struct xfs_btree_cur    *cur,   /* btree cursor */
2701        int                     *stat)  /* success/failure */
2702{
2703        struct xfs_btree_block  *block; /* one half of the old root block */
2704        struct xfs_buf          *bp;    /* buffer containing block */
2705        int                     error;  /* error return value */
2706        struct xfs_buf          *lbp;   /* left buffer pointer */
2707        struct xfs_btree_block  *left;  /* left btree block */
2708        struct xfs_buf          *nbp;   /* new (root) buffer */
2709        struct xfs_btree_block  *new;   /* new (root) btree block */
2710        int                     nptr;   /* new value for key index, 1 or 2 */
2711        struct xfs_buf          *rbp;   /* right buffer pointer */
2712        struct xfs_btree_block  *right; /* right btree block */
2713        union xfs_btree_ptr     rptr;
2714        union xfs_btree_ptr     lptr;
2715
2716        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2717        XFS_BTREE_STATS_INC(cur, newroot);
2718
2719        /* initialise our start point from the cursor */
2720        cur->bc_ops->init_ptr_from_cur(cur, &rptr);
2721
2722        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2723        error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
2724        if (error)
2725                goto error0;
2726        if (*stat == 0)
2727                goto out0;
2728        XFS_BTREE_STATS_INC(cur, alloc);
2729
2730        /* Set up the new block. */
2731        error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
2732        if (error)
2733                goto error0;
2734
2735        /* Set the root in the holding structure  increasing the level by 1. */
2736        cur->bc_ops->set_root(cur, &lptr, 1);
2737
2738        /*
2739         * At the previous root level there are now two blocks: the old root,
2740         * and the new block generated when it was split.  We don't know which
2741         * one the cursor is pointing at, so we set up variables "left" and
2742         * "right" for each case.
2743         */
2744        block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
2745
2746#ifdef DEBUG
2747        error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
2748        if (error)
2749                goto error0;
2750#endif
2751
2752        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
2753        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
2754                /* Our block is left, pick up the right block. */
2755                lbp = bp;
2756                xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2757                left = block;
2758                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2759                if (error)
2760                        goto error0;
2761                bp = rbp;
2762                nptr = 1;
2763        } else {
2764                /* Our block is right, pick up the left block. */
2765                rbp = bp;
2766                xfs_btree_buf_to_ptr(cur, rbp, &rptr);
2767                right = block;
2768                xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2769                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2770                if (error)
2771                        goto error0;
2772                bp = lbp;
2773                nptr = 2;
2774        }
2775        /* Fill in the new block's btree header and log it. */
2776        xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
2777        xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2778        ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2779                        !xfs_btree_ptr_is_null(cur, &rptr));
2780
2781        /* Fill in the key data in the new root. */
2782        if (xfs_btree_get_level(left) > 0) {
2783                xfs_btree_copy_keys(cur,
2784                                xfs_btree_key_addr(cur, 1, new),
2785                                xfs_btree_key_addr(cur, 1, left), 1);
2786                xfs_btree_copy_keys(cur,
2787                                xfs_btree_key_addr(cur, 2, new),
2788                                xfs_btree_key_addr(cur, 1, right), 1);
2789        } else {
2790                cur->bc_ops->init_key_from_rec(
2791                                xfs_btree_key_addr(cur, 1, new),
2792                                xfs_btree_rec_addr(cur, 1, left));
2793                cur->bc_ops->init_key_from_rec(
2794                                xfs_btree_key_addr(cur, 2, new),
2795                                xfs_btree_rec_addr(cur, 1, right));
2796        }
2797        xfs_btree_log_keys(cur, nbp, 1, 2);
2798
2799        /* Fill in the pointer data in the new root. */
2800        xfs_btree_copy_ptrs(cur,
2801                xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
2802        xfs_btree_copy_ptrs(cur,
2803                xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
2804        xfs_btree_log_ptrs(cur, nbp, 1, 2);
2805
2806        /* Fix up the cursor. */
2807        xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
2808        cur->bc_ptrs[cur->bc_nlevels] = nptr;
2809        cur->bc_nlevels++;
2810        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2811        *stat = 1;
2812        return 0;
2813error0:
2814        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2815        return error;
2816out0:
2817        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2818        *stat = 0;
2819        return 0;
2820}
2821
2822STATIC int
2823xfs_btree_make_block_unfull(
2824        struct xfs_btree_cur    *cur,   /* btree cursor */
2825        int                     level,  /* btree level */
2826        int                     numrecs,/* # of recs in block */
2827        int                     *oindex,/* old tree index */
2828        int                     *index, /* new tree index */
2829        union xfs_btree_ptr     *nptr,  /* new btree ptr */
2830        struct xfs_btree_cur    **ncur, /* new btree cursor */
2831        union xfs_btree_rec     *nrec,  /* new record */
2832        int                     *stat)
2833{
2834        union xfs_btree_key     key;    /* new btree key value */
2835        int                     error = 0;
2836
2837        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2838            level == cur->bc_nlevels - 1) {
2839                struct xfs_inode *ip = cur->bc_private.b.ip;
2840
2841                if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2842                        /* A root block that can be made bigger. */
2843                        xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2844                } else {
2845                        /* A root block that needs replacing */
2846                        int     logflags = 0;
2847
2848                        error = xfs_btree_new_iroot(cur, &logflags, stat);
2849                        if (error || *stat == 0)
2850                                return error;
2851
2852                        xfs_trans_log_inode(cur->bc_tp, ip, logflags);
2853                }
2854
2855                return 0;
2856        }
2857
2858        /* First, try shifting an entry to the right neighbor. */
2859        error = xfs_btree_rshift(cur, level, stat);
2860        if (error || *stat)
2861                return error;
2862
2863        /* Next, try shifting an entry to the left neighbor. */
2864        error = xfs_btree_lshift(cur, level, stat);
2865        if (error)
2866                return error;
2867
2868        if (*stat) {
2869                *oindex = *index = cur->bc_ptrs[level];
2870                return 0;
2871        }
2872
2873        /*
2874         * Next, try splitting the current block in half.
2875         *
2876         * If this works we have to re-set our variables because we
2877         * could be in a different block now.
2878         */
2879        error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
2880        if (error || *stat == 0)
2881                return error;
2882
2883
2884        *index = cur->bc_ptrs[level];
2885        cur->bc_ops->init_rec_from_key(&key, nrec);
2886        return 0;
2887}
2888
2889/*
2890 * Insert one record/level.  Return information to the caller
2891 * allowing the next level up to proceed if necessary.
2892 */
2893STATIC int
2894xfs_btree_insrec(
2895        struct xfs_btree_cur    *cur,   /* btree cursor */
2896        int                     level,  /* level to insert record at */
2897        union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
2898        union xfs_btree_rec     *recp,  /* i/o: record data inserted */
2899        struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
2900        int                     *stat)  /* success/failure */
2901{
2902        struct xfs_btree_block  *block; /* btree block */
2903        struct xfs_buf          *bp;    /* buffer for block */
2904        union xfs_btree_key     key;    /* btree key */
2905        union xfs_btree_ptr     nptr;   /* new block ptr */
2906        struct xfs_btree_cur    *ncur;  /* new btree cursor */
2907        union xfs_btree_rec     nrec;   /* new record count */
2908        int                     optr;   /* old key/record index */
2909        int                     ptr;    /* key/record index */
2910        int                     numrecs;/* number of records */
2911        int                     error;  /* error return value */
2912#ifdef DEBUG
2913        int                     i;
2914#endif
2915
2916        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2917        XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
2918
2919        ncur = NULL;
2920
2921        /*
2922         * If we have an external root pointer, and we've made it to the
2923         * root level, allocate a new root block and we're done.
2924         */
2925        if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2926            (level >= cur->bc_nlevels)) {
2927                error = xfs_btree_new_root(cur, stat);
2928                xfs_btree_set_ptr_null(cur, ptrp);
2929
2930                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2931                return error;
2932        }
2933
2934        /* If we're off the left edge, return failure. */
2935        ptr = cur->bc_ptrs[level];
2936        if (ptr == 0) {
2937                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2938                *stat = 0;
2939                return 0;
2940        }
2941
2942        /* Make a key out of the record data to be inserted, and save it. */
2943        cur->bc_ops->init_key_from_rec(&key, recp);
2944
2945        optr = ptr;
2946
2947        XFS_BTREE_STATS_INC(cur, insrec);
2948
2949        /* Get pointers to the btree buffer and block. */
2950        block = xfs_btree_get_block(cur, level, &bp);
2951        numrecs = xfs_btree_get_numrecs(block);
2952
2953#ifdef DEBUG
2954        error = xfs_btree_check_block(cur, block, level, bp);
2955        if (error)
2956                goto error0;
2957
2958        /* Check that the new entry is being inserted in the right place. */
2959        if (ptr <= numrecs) {
2960                if (level == 0) {
2961                        ASSERT(cur->bc_ops->recs_inorder(cur, recp,
2962                                xfs_btree_rec_addr(cur, ptr, block)));
2963                } else {
2964                        ASSERT(cur->bc_ops->keys_inorder(cur, &key,
2965                                xfs_btree_key_addr(cur, ptr, block)));
2966                }
2967        }
2968#endif
2969
2970        /*
2971         * If the block is full, we can't insert the new entry until we
2972         * make the block un-full.
2973         */
2974        xfs_btree_set_ptr_null(cur, &nptr);
2975        if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
2976                error = xfs_btree_make_block_unfull(cur, level, numrecs,
2977                                        &optr, &ptr, &nptr, &ncur, &nrec, stat);
2978                if (error || *stat == 0)
2979                        goto error0;
2980        }
2981
2982        /*
2983         * The current block may have changed if the block was
2984         * previously full and we have just made space in it.
2985         */
2986        block = xfs_btree_get_block(cur, level, &bp);
2987        numrecs = xfs_btree_get_numrecs(block);
2988
2989#ifdef DEBUG
2990        error = xfs_btree_check_block(cur, block, level, bp);
2991        if (error)
2992                return error;
2993#endif
2994
2995        /*
2996         * At this point we know there's room for our new entry in the block
2997         * we're pointing at.
2998         */
2999        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3000
3001        if (level > 0) {
3002                /* It's a nonleaf. make a hole in the keys and ptrs */
3003                union xfs_btree_key     *kp;
3004                union xfs_btree_ptr     *pp;
3005
3006                kp = xfs_btree_key_addr(cur, ptr, block);
3007                pp = xfs_btree_ptr_addr(cur, ptr, block);
3008
3009#ifdef DEBUG
3010                for (i = numrecs - ptr; i >= 0; i--) {
3011                        error = xfs_btree_check_ptr(cur, pp, i, level);
3012                        if (error)
3013                                return error;
3014                }
3015#endif
3016
3017                xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3018                xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3019
3020#ifdef DEBUG
3021                error = xfs_btree_check_ptr(cur, ptrp, 0, level);
3022                if (error)
3023                        goto error0;
3024#endif
3025
3026                /* Now put the new data in, bump numrecs and log it. */
3027                xfs_btree_copy_keys(cur, kp, &key, 1);
3028                xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3029                numrecs++;
3030                xfs_btree_set_numrecs(block, numrecs);
3031                xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3032                xfs_btree_log_keys(cur, bp, ptr, numrecs);
3033#ifdef DEBUG
3034                if (ptr < numrecs) {
3035                        ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3036                                xfs_btree_key_addr(cur, ptr + 1, block)));
3037                }
3038#endif
3039        } else {
3040                /* It's a leaf. make a hole in the records */
3041                union xfs_btree_rec             *rp;
3042
3043                rp = xfs_btree_rec_addr(cur, ptr, block);
3044
3045                xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3046
3047                /* Now put the new data in, bump numrecs and log it. */
3048                xfs_btree_copy_recs(cur, rp, recp, 1);
3049                xfs_btree_set_numrecs(block, ++numrecs);
3050                xfs_btree_log_recs(cur, bp, ptr, numrecs);
3051#ifdef DEBUG
3052                if (ptr < numrecs) {
3053                        ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3054                                xfs_btree_rec_addr(cur, ptr + 1, block)));
3055                }
3056#endif
3057        }
3058
3059        /* Log the new number of records in the btree header. */
3060        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3061
3062        /* If we inserted at the start of a block, update the parents' keys. */
3063        if (optr == 1) {
3064                error = xfs_btree_updkey(cur, &key, level + 1);
3065                if (error)
3066                        goto error0;
3067        }
3068
3069        /*
3070         * If we are tracking the last record in the tree and
3071         * we are at the far right edge of the tree, update it.
3072         */
3073        if (xfs_btree_is_lastrec(cur, block, level)) {
3074                cur->bc_ops->update_lastrec(cur, block, recp,
3075                                            ptr, LASTREC_INSREC);
3076        }
3077
3078        /*
3079         * Return the new block number, if any.
3080         * If there is one, give back a record value and a cursor too.
3081         */
3082        *ptrp = nptr;
3083        if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3084                *recp = nrec;
3085                *curp = ncur;
3086        }
3087
3088        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3089        *stat = 1;
3090        return 0;
3091
3092error0:
3093        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3094        return error;
3095}
3096
3097/*
3098 * Insert the record at the point referenced by cur.
3099 *
3100 * A multi-level split of the tree on insert will invalidate the original
3101 * cursor.  All callers of this function should assume that the cursor is
3102 * no longer valid and revalidate it.
3103 */
3104int
3105xfs_btree_insert(
3106        struct xfs_btree_cur    *cur,
3107        int                     *stat)
3108{
3109        int                     error;  /* error return value */
3110        int                     i;      /* result value, 0 for failure */
3111        int                     level;  /* current level number in btree */
3112        union xfs_btree_ptr     nptr;   /* new block number (split result) */
3113        struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3114        struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3115        union xfs_btree_rec     rec;    /* record to insert */
3116
3117        level = 0;
3118        ncur = NULL;
3119        pcur = cur;
3120
3121        xfs_btree_set_ptr_null(cur, &nptr);
3122        cur->bc_ops->init_rec_from_cur(cur, &rec);
3123
3124        /*
3125         * Loop going up the tree, starting at the leaf level.
3126         * Stop when we don't get a split block, that must mean that
3127         * the insert is finished with this level.
3128         */
3129        do {
3130                /*
3131                 * Insert nrec/nptr into this level of the tree.
3132                 * Note if we fail, nptr will be null.
3133                 */
3134                error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
3135                if (error) {
3136                        if (pcur != cur)
3137                                xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3138                        goto error0;
3139                }
3140
3141                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3142                level++;
3143
3144                /*
3145                 * See if the cursor we just used is trash.
3146                 * Can't trash the caller's cursor, but otherwise we should
3147                 * if ncur is a new cursor or we're about to be done.
3148                 */
3149                if (pcur != cur &&
3150                    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3151                        /* Save the state from the cursor before we trash it */
3152                        if (cur->bc_ops->update_cursor)
3153                                cur->bc_ops->update_cursor(pcur, cur);
3154                        cur->bc_nlevels = pcur->bc_nlevels;
3155                        xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3156                }
3157                /* If we got a new cursor, switch to it. */
3158                if (ncur) {
3159                        pcur = ncur;
3160                        ncur = NULL;
3161                }
3162        } while (!xfs_btree_ptr_is_null(cur, &nptr));
3163
3164        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3165        *stat = i;
3166        return 0;
3167error0:
3168        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3169        return error;
3170}
3171
3172/*
3173 * Try to merge a non-leaf block back into the inode root.
3174 *
3175 * Note: the killroot names comes from the fact that we're effectively
3176 * killing the old root block.  But because we can't just delete the
3177 * inode we have to copy the single block it was pointing to into the
3178 * inode.
3179 */
3180STATIC int
3181xfs_btree_kill_iroot(
3182        struct xfs_btree_cur    *cur)
3183{
3184        int                     whichfork = cur->bc_private.b.whichfork;
3185        struct xfs_inode        *ip = cur->bc_private.b.ip;
3186        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3187        struct xfs_btree_block  *block;
3188        struct xfs_btree_block  *cblock;
3189        union xfs_btree_key     *kp;
3190        union xfs_btree_key     *ckp;
3191        union xfs_btree_ptr     *pp;
3192        union xfs_btree_ptr     *cpp;
3193        struct xfs_buf          *cbp;
3194        int                     level;
3195        int                     index;
3196        int                     numrecs;
3197#ifdef DEBUG
3198        union xfs_btree_ptr     ptr;
3199        int                     i;
3200#endif
3201
3202        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3203
3204        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3205        ASSERT(cur->bc_nlevels > 1);
3206
3207        /*
3208         * Don't deal with the root block needs to be a leaf case.
3209         * We're just going to turn the thing back into extents anyway.
3210         */
3211        level = cur->bc_nlevels - 1;
3212        if (level == 1)
3213                goto out0;
3214
3215        /*
3216         * Give up if the root has multiple children.
3217         */
3218        block = xfs_btree_get_iroot(cur);
3219        if (xfs_btree_get_numrecs(block) != 1)
3220                goto out0;
3221
3222        cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3223        numrecs = xfs_btree_get_numrecs(cblock);
3224
3225        /*
3226         * Only do this if the next level will fit.
3227         * Then the data must be copied up to the inode,
3228         * instead of freeing the root you free the next level.
3229         */
3230        if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3231                goto out0;
3232
3233        XFS_BTREE_STATS_INC(cur, killroot);
3234
3235#ifdef DEBUG
3236        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3237        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3238        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3239        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3240#endif
3241
3242        index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3243        if (index) {
3244                xfs_iroot_realloc(cur->bc_private.b.ip, index,
3245                                  cur->bc_private.b.whichfork);
3246                block = ifp->if_broot;
3247        }
3248
3249        be16_add_cpu(&block->bb_numrecs, index);
3250        ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3251
3252        kp = xfs_btree_key_addr(cur, 1, block);
3253        ckp = xfs_btree_key_addr(cur, 1, cblock);
3254        xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3255
3256        pp = xfs_btree_ptr_addr(cur, 1, block);
3257        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3258#ifdef DEBUG
3259        for (i = 0; i < numrecs; i++) {
3260                int             error;
3261
3262                error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3263                if (error) {
3264                        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3265                        return error;
3266                }
3267        }
3268#endif
3269        xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3270
3271        cur->bc_ops->free_block(cur, cbp);
3272        XFS_BTREE_STATS_INC(cur, free);
3273
3274        cur->bc_bufs[level - 1] = NULL;
3275        be16_add_cpu(&block->bb_level, -1);
3276        xfs_trans_log_inode(cur->bc_tp, ip,
3277                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3278        cur->bc_nlevels--;
3279out0:
3280        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3281        return 0;
3282}
3283
3284/*
3285 * Kill the current root node, and replace it with it's only child node.
3286 */
3287STATIC int
3288xfs_btree_kill_root(
3289        struct xfs_btree_cur    *cur,
3290        struct xfs_buf          *bp,
3291        int                     level,
3292        union xfs_btree_ptr     *newroot)
3293{
3294        int                     error;
3295
3296        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3297        XFS_BTREE_STATS_INC(cur, killroot);
3298
3299        /*
3300         * Update the root pointer, decreasing the level by 1 and then
3301         * free the old root.
3302         */
3303        cur->bc_ops->set_root(cur, newroot, -1);
3304
3305        error = cur->bc_ops->free_block(cur, bp);
3306        if (error) {
3307                XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3308                return error;
3309        }
3310
3311        XFS_BTREE_STATS_INC(cur, free);
3312
3313        cur->bc_bufs[level] = NULL;
3314        cur->bc_ra[level] = 0;
3315        cur->bc_nlevels--;
3316
3317        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3318        return 0;
3319}
3320
3321STATIC int
3322xfs_btree_dec_cursor(
3323        struct xfs_btree_cur    *cur,
3324        int                     level,
3325        int                     *stat)
3326{
3327        int                     error;
3328        int                     i;
3329
3330        if (level > 0) {
3331                error = xfs_btree_decrement(cur, level, &i);
3332                if (error)
3333                        return error;
3334        }
3335
3336        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3337        *stat = 1;
3338        return 0;
3339}
3340
3341/*
3342 * Single level of the btree record deletion routine.
3343 * Delete record pointed to by cur/level.
3344 * Remove the record from its block then rebalance the tree.
3345 * Return 0 for error, 1 for done, 2 to go on to the next level.
3346 */
3347STATIC int                                      /* error */
3348xfs_btree_delrec(
3349        struct xfs_btree_cur    *cur,           /* btree cursor */
3350        int                     level,          /* level removing record from */
3351        int                     *stat)          /* fail/done/go-on */
3352{
3353        struct xfs_btree_block  *block;         /* btree block */
3354        union xfs_btree_ptr     cptr;           /* current block ptr */
3355        struct xfs_buf          *bp;            /* buffer for block */
3356        int                     error;          /* error return value */
3357        int                     i;              /* loop counter */
3358        union xfs_btree_key     key;            /* storage for keyp */
3359        union xfs_btree_key     *keyp = &key;   /* passed to the next level */
3360        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3361        struct xfs_buf          *lbp;           /* left buffer pointer */
3362        struct xfs_btree_block  *left;          /* left btree block */
3363        int                     lrecs = 0;      /* left record count */
3364        int                     ptr;            /* key/record index */
3365        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3366        struct xfs_buf          *rbp;           /* right buffer pointer */
3367        struct xfs_btree_block  *right;         /* right btree block */
3368        struct xfs_btree_block  *rrblock;       /* right-right btree block */
3369        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3370        int                     rrecs = 0;      /* right record count */
3371        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3372        int                     numrecs;        /* temporary numrec count */
3373
3374        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3375        XFS_BTREE_TRACE_ARGI(cur, level);
3376
3377        tcur = NULL;
3378
3379        /* Get the index of the entry being deleted, check for nothing there. */
3380        ptr = cur->bc_ptrs[level];
3381        if (ptr == 0) {
3382                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3383                *stat = 0;
3384                return 0;
3385        }
3386
3387        /* Get the buffer & block containing the record or key/ptr. */
3388        block = xfs_btree_get_block(cur, level, &bp);
3389        numrecs = xfs_btree_get_numrecs(block);
3390
3391#ifdef DEBUG
3392        error = xfs_btree_check_block(cur, block, level, bp);
3393        if (error)
3394                goto error0;
3395#endif
3396
3397        /* Fail if we're off the end of the block. */
3398        if (ptr > numrecs) {
3399                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3400                *stat = 0;
3401                return 0;
3402        }
3403
3404        XFS_BTREE_STATS_INC(cur, delrec);
3405        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3406
3407        /* Excise the entries being deleted. */
3408        if (level > 0) {
3409                /* It's a nonleaf. operate on keys and ptrs */
3410                union xfs_btree_key     *lkp;
3411                union xfs_btree_ptr     *lpp;
3412
3413                lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3414                lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3415
3416#ifdef DEBUG
3417                for (i = 0; i < numrecs - ptr; i++) {
3418                        error = xfs_btree_check_ptr(cur, lpp, i, level);
3419                        if (error)
3420                                goto error0;
3421                }
3422#endif
3423
3424                if (ptr < numrecs) {
3425                        xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3426                        xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3427                        xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3428                        xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3429                }
3430
3431                /*
3432                 * If it's the first record in the block, we'll need to pass a
3433                 * key up to the next level (updkey).
3434                 */
3435                if (ptr == 1)
3436                        keyp = xfs_btree_key_addr(cur, 1, block);
3437        } else {
3438                /* It's a leaf. operate on records */
3439                if (ptr < numrecs) {
3440                        xfs_btree_shift_recs(cur,
3441                                xfs_btree_rec_addr(cur, ptr + 1, block),
3442                                -1, numrecs - ptr);
3443                        xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3444                }
3445
3446                /*
3447                 * If it's the first record in the block, we'll need a key
3448                 * structure to pass up to the next level (updkey).
3449                 */
3450                if (ptr == 1) {
3451                        cur->bc_ops->init_key_from_rec(&key,
3452                                        xfs_btree_rec_addr(cur, 1, block));
3453                        keyp = &key;
3454                }
3455        }
3456
3457        /*
3458         * Decrement and log the number of entries in the block.
3459         */
3460        xfs_btree_set_numrecs(block, --numrecs);
3461        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3462
3463        /*
3464         * If we are tracking the last record in the tree and
3465         * we are at the far right edge of the tree, update it.
3466         */
3467        if (xfs_btree_is_lastrec(cur, block, level)) {
3468                cur->bc_ops->update_lastrec(cur, block, NULL,
3469                                            ptr, LASTREC_DELREC);
3470        }
3471
3472        /*
3473         * We're at the root level.  First, shrink the root block in-memory.
3474         * Try to get rid of the next level down.  If we can't then there's
3475         * nothing left to do.
3476         */
3477        if (level == cur->bc_nlevels - 1) {
3478                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3479                        xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3480                                          cur->bc_private.b.whichfork);
3481
3482                        error = xfs_btree_kill_iroot(cur);
3483                        if (error)
3484                                goto error0;
3485
3486                        error = xfs_btree_dec_cursor(cur, level, stat);
3487                        if (error)
3488                                goto error0;
3489                        *stat = 1;
3490                        return 0;
3491                }
3492
3493                /*
3494                 * If this is the root level, and there's only one entry left,
3495                 * and it's NOT the leaf level, then we can get rid of this
3496                 * level.
3497                 */
3498                if (numrecs == 1 && level > 0) {
3499                        union xfs_btree_ptr     *pp;
3500                        /*
3501                         * pp is still set to the first pointer in the block.
3502                         * Make it the new root of the btree.
3503                         */
3504                        pp = xfs_btree_ptr_addr(cur, 1, block);
3505                        error = xfs_btree_kill_root(cur, bp, level, pp);
3506                        if (error)
3507                                goto error0;
3508                } else if (level > 0) {
3509                        error = xfs_btree_dec_cursor(cur, level, stat);
3510                        if (error)
3511                                goto error0;
3512                }
3513                *stat = 1;
3514                return 0;
3515        }
3516
3517        /*
3518         * If we deleted the leftmost entry in the block, update the
3519         * key values above us in the tree.
3520         */
3521        if (ptr == 1) {
3522                error = xfs_btree_updkey(cur, keyp, level + 1);
3523                if (error)
3524                        goto error0;
3525        }
3526
3527        /*
3528         * If the number of records remaining in the block is at least
3529         * the minimum, we're done.
3530         */
3531        if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3532                error = xfs_btree_dec_cursor(cur, level, stat);
3533                if (error)
3534                        goto error0;
3535                return 0;
3536        }
3537
3538        /*
3539         * Otherwise, we have to move some records around to keep the
3540         * tree balanced.  Look at the left and right sibling blocks to
3541         * see if we can re-balance by moving only one record.
3542         */
3543        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3544        xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3545
3546        if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3547                /*
3548                 * One child of root, need to get a chance to copy its contents
3549                 * into the root and delete it. Can't go up to next level,
3550                 * there's nothing to delete there.
3551                 */
3552                if (xfs_btree_ptr_is_null(cur, &rptr) &&
3553                    xfs_btree_ptr_is_null(cur, &lptr) &&
3554                    level == cur->bc_nlevels - 2) {
3555                        error = xfs_btree_kill_iroot(cur);
3556                        if (!error)
3557                                error = xfs_btree_dec_cursor(cur, level, stat);
3558                        if (error)
3559                                goto error0;
3560                        return 0;
3561                }
3562        }
3563
3564        ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3565               !xfs_btree_ptr_is_null(cur, &lptr));
3566
3567        /*
3568         * Duplicate the cursor so our btree manipulations here won't
3569         * disrupt the next level up.
3570         */
3571        error = xfs_btree_dup_cursor(cur, &tcur);
3572        if (error)
3573                goto error0;
3574
3575        /*
3576         * If there's a right sibling, see if it's ok to shift an entry
3577         * out of it.
3578         */
3579        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3580                /*
3581                 * Move the temp cursor to the last entry in the next block.
3582                 * Actually any entry but the first would suffice.
3583                 */
3584                i = xfs_btree_lastrec(tcur, level);
3585                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3586
3587                error = xfs_btree_increment(tcur, level, &i);
3588                if (error)
3589                        goto error0;
3590                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3591
3592                i = xfs_btree_lastrec(tcur, level);
3593                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3594
3595                /* Grab a pointer to the block. */
3596                right = xfs_btree_get_block(tcur, level, &rbp);
3597#ifdef DEBUG
3598                error = xfs_btree_check_block(tcur, right, level, rbp);
3599                if (error)
3600                        goto error0;
3601#endif
3602                /* Grab the current block number, for future use. */
3603                xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3604
3605                /*
3606                 * If right block is full enough so that removing one entry
3607                 * won't make it too empty, and left-shifting an entry out
3608                 * of right to us works, we're done.
3609                 */
3610                if (xfs_btree_get_numrecs(right) - 1 >=
3611                    cur->bc_ops->get_minrecs(tcur, level)) {
3612                        error = xfs_btree_lshift(tcur, level, &i);
3613                        if (error)
3614                                goto error0;
3615                        if (i) {
3616                                ASSERT(xfs_btree_get_numrecs(block) >=
3617                                       cur->bc_ops->get_minrecs(tcur, level));
3618
3619                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3620                                tcur = NULL;
3621
3622                                error = xfs_btree_dec_cursor(cur, level, stat);
3623                                if (error)
3624                                        goto error0;
3625                                return 0;
3626                        }
3627                }
3628
3629                /*
3630                 * Otherwise, grab the number of records in right for
3631                 * future reference, and fix up the temp cursor to point
3632                 * to our block again (last record).
3633                 */
3634                rrecs = xfs_btree_get_numrecs(right);
3635                if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3636                        i = xfs_btree_firstrec(tcur, level);
3637                        XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3638
3639                        error = xfs_btree_decrement(tcur, level, &i);
3640                        if (error)
3641                                goto error0;
3642                        XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3643                }
3644        }
3645
3646        /*
3647         * If there's a left sibling, see if it's ok to shift an entry
3648         * out of it.
3649         */
3650        if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3651                /*
3652                 * Move the temp cursor to the first entry in the
3653                 * previous block.
3654                 */
3655                i = xfs_btree_firstrec(tcur, level);
3656                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3657
3658                error = xfs_btree_decrement(tcur, level, &i);
3659                if (error)
3660                        goto error0;
3661                i = xfs_btree_firstrec(tcur, level);
3662                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3663
3664                /* Grab a pointer to the block. */
3665                left = xfs_btree_get_block(tcur, level, &lbp);
3666#ifdef DEBUG
3667                error = xfs_btree_check_block(cur, left, level, lbp);
3668                if (error)
3669                        goto error0;
3670#endif
3671                /* Grab the current block number, for future use. */
3672                xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3673
3674                /*
3675                 * If left block is full enough so that removing one entry
3676                 * won't make it too empty, and right-shifting an entry out
3677                 * of left to us works, we're done.
3678                 */
3679                if (xfs_btree_get_numrecs(left) - 1 >=
3680                    cur->bc_ops->get_minrecs(tcur, level)) {
3681                        error = xfs_btree_rshift(tcur, level, &i);
3682                        if (error)
3683                                goto error0;
3684                        if (i) {
3685                                ASSERT(xfs_btree_get_numrecs(block) >=
3686                                       cur->bc_ops->get_minrecs(tcur, level));
3687                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3688                                tcur = NULL;
3689                                if (level == 0)
3690                                        cur->bc_ptrs[0]++;
3691                                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3692                                *stat = 1;
3693                                return 0;
3694                        }
3695                }
3696
3697                /*
3698                 * Otherwise, grab the number of records in right for
3699                 * future reference.
3700                 */
3701                lrecs = xfs_btree_get_numrecs(left);
3702        }
3703
3704        /* Delete the temp cursor, we're done with it. */
3705        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3706        tcur = NULL;
3707
3708        /* If here, we need to do a join to keep the tree balanced. */
3709        ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3710
3711        if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3712            lrecs + xfs_btree_get_numrecs(block) <=
3713                        cur->bc_ops->get_maxrecs(cur, level)) {
3714                /*
3715                 * Set "right" to be the starting block,
3716                 * "left" to be the left neighbor.
3717                 */
3718                rptr = cptr;
3719                right = block;
3720                rbp = bp;
3721                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3722                if (error)
3723                        goto error0;
3724
3725        /*
3726         * If that won't work, see if we can join with the right neighbor block.
3727         */
3728        } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
3729                   rrecs + xfs_btree_get_numrecs(block) <=
3730                        cur->bc_ops->get_maxrecs(cur, level)) {
3731                /*
3732                 * Set "left" to be the starting block,
3733                 * "right" to be the right neighbor.
3734                 */
3735                lptr = cptr;
3736                left = block;
3737                lbp = bp;
3738                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3739                if (error)
3740                        goto error0;
3741
3742        /*
3743         * Otherwise, we can't fix the imbalance.
3744         * Just return.  This is probably a logic error, but it's not fatal.
3745         */
3746        } else {
3747                error = xfs_btree_dec_cursor(cur, level, stat);
3748                if (error)
3749                        goto error0;
3750                return 0;
3751        }
3752
3753        rrecs = xfs_btree_get_numrecs(right);
3754        lrecs = xfs_btree_get_numrecs(left);
3755
3756        /*
3757         * We're now going to join "left" and "right" by moving all the stuff
3758         * in "right" to "left" and deleting "right".
3759         */
3760        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
3761        if (level > 0) {
3762                /* It's a non-leaf.  Move keys and pointers. */
3763                union xfs_btree_key     *lkp;   /* left btree key */
3764                union xfs_btree_ptr     *lpp;   /* left address pointer */
3765                union xfs_btree_key     *rkp;   /* right btree key */
3766                union xfs_btree_ptr     *rpp;   /* right address pointer */
3767
3768                lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
3769                lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
3770                rkp = xfs_btree_key_addr(cur, 1, right);
3771                rpp = xfs_btree_ptr_addr(cur, 1, right);
3772#ifdef DEBUG
3773                for (i = 1; i < rrecs; i++) {
3774                        error = xfs_btree_check_ptr(cur, rpp, i, level);
3775                        if (error)
3776                                goto error0;
3777                }
3778#endif
3779                xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
3780                xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
3781
3782                xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
3783                xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
3784        } else {
3785                /* It's a leaf.  Move records.  */
3786                union xfs_btree_rec     *lrp;   /* left record pointer */
3787                union xfs_btree_rec     *rrp;   /* right record pointer */
3788
3789                lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
3790                rrp = xfs_btree_rec_addr(cur, 1, right);
3791
3792                xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
3793                xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
3794        }
3795
3796        XFS_BTREE_STATS_INC(cur, join);
3797
3798        /*
3799         * Fix up the number of records and right block pointer in the
3800         * surviving block, and log it.
3801         */
3802        xfs_btree_set_numrecs(left, lrecs + rrecs);
3803        xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
3804        xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3805        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
3806
3807        /* If there is a right sibling, point it to the remaining block. */
3808        xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3809        if (!xfs_btree_ptr_is_null(cur, &cptr)) {
3810                error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
3811                if (error)
3812                        goto error0;
3813                xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
3814                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
3815        }
3816
3817        /* Free the deleted block. */
3818        error = cur->bc_ops->free_block(cur, rbp);
3819        if (error)
3820                goto error0;
3821        XFS_BTREE_STATS_INC(cur, free);
3822
3823        /*
3824         * If we joined with the left neighbor, set the buffer in the
3825         * cursor to the left block, and fix up the index.
3826         */
3827        if (bp != lbp) {
3828                cur->bc_bufs[level] = lbp;
3829                cur->bc_ptrs[level] += lrecs;
3830                cur->bc_ra[level] = 0;
3831        }
3832        /*
3833         * If we joined with the right neighbor and there's a level above
3834         * us, increment the cursor at that level.
3835         */
3836        else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
3837                   (level + 1 < cur->bc_nlevels)) {
3838                error = xfs_btree_increment(cur, level + 1, &i);
3839                if (error)
3840                        goto error0;
3841        }
3842
3843        /*
3844         * Readjust the ptr at this level if it's not a leaf, since it's
3845         * still pointing at the deletion point, which makes the cursor
3846         * inconsistent.  If this makes the ptr 0, the caller fixes it up.
3847         * We can't use decrement because it would change the next level up.
3848         */
3849        if (level > 0)
3850                cur->bc_ptrs[level]--;
3851
3852        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3853        /* Return value means the next level up has something to do. */
3854        *stat = 2;
3855        return 0;
3856
3857error0:
3858        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3859        if (tcur)
3860                xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
3861        return error;
3862}
3863
3864/*
3865 * Delete the record pointed to by cur.
3866 * The cursor refers to the place where the record was (could be inserted)
3867 * when the operation returns.
3868 */
3869int                                     /* error */
3870xfs_btree_delete(
3871        struct xfs_btree_cur    *cur,
3872        int                     *stat)  /* success/failure */
3873{
3874        int                     error;  /* error return value */
3875        int                     level;
3876        int                     i;
3877
3878        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3879
3880        /*
3881         * Go up the tree, starting at leaf level.
3882         *
3883         * If 2 is returned then a join was done; go to the next level.
3884         * Otherwise we are done.
3885         */
3886        for (level = 0, i = 2; i == 2; level++) {
3887                error = xfs_btree_delrec(cur, level, &i);
3888                if (error)
3889                        goto error0;
3890        }
3891
3892        if (i == 0) {
3893                for (level = 1; level < cur->bc_nlevels; level++) {
3894                        if (cur->bc_ptrs[level] == 0) {
3895                                error = xfs_btree_decrement(cur, level, &i);
3896                                if (error)
3897                                        goto error0;
3898                                break;
3899                        }
3900                }
3901        }
3902
3903        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3904        *stat = i;
3905        return 0;
3906error0:
3907        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3908        return error;
3909}
3910
3911/*
3912 * Get the data from the pointed-to record.
3913 */
3914int                                     /* error */
3915xfs_btree_get_rec(
3916        struct xfs_btree_cur    *cur,   /* btree cursor */
3917        union xfs_btree_rec     **recp, /* output: btree record */
3918        int                     *stat)  /* output: success/failure */
3919{
3920        struct xfs_btree_block  *block; /* btree block */
3921        struct xfs_buf          *bp;    /* buffer pointer */
3922        int                     ptr;    /* record number */
3923#ifdef DEBUG
3924        int                     error;  /* error return value */
3925#endif
3926
3927        ptr = cur->bc_ptrs[0];
3928        block = xfs_btree_get_block(cur, 0, &bp);
3929
3930#ifdef DEBUG
3931        error = xfs_btree_check_block(cur, block, 0, bp);
3932        if (error)
3933                return error;
3934#endif
3935
3936        /*
3937         * Off the right end or left end, return failure.
3938         */
3939        if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
3940                *stat = 0;
3941                return 0;
3942        }
3943
3944        /*
3945         * Point to the record and extract its data.
3946         */
3947        *recp = xfs_btree_rec_addr(cur, ptr, block);
3948        *stat = 1;
3949        return 0;
3950}
3951
3952/*
3953 * Change the owner of a btree.
3954 *
3955 * The mechanism we use here is ordered buffer logging. Because we don't know
3956 * how many buffers were are going to need to modify, we don't really want to
3957 * have to make transaction reservations for the worst case of every buffer in a
3958 * full size btree as that may be more space that we can fit in the log....
3959 *
3960 * We do the btree walk in the most optimal manner possible - we have sibling
3961 * pointers so we can just walk all the blocks on each level from left to right
3962 * in a single pass, and then move to the next level and do the same. We can
3963 * also do readahead on the sibling pointers to get IO moving more quickly,
3964 * though for slow disks this is unlikely to make much difference to performance
3965 * as the amount of CPU work we have to do before moving to the next block is
3966 * relatively small.
3967 *
3968 * For each btree block that we load, modify the owner appropriately, set the
3969 * buffer as an ordered buffer and log it appropriately. We need to ensure that
3970 * we mark the region we change dirty so that if the buffer is relogged in
3971 * a subsequent transaction the changes we make here as an ordered buffer are
3972 * correctly relogged in that transaction.  If we are in recovery context, then
3973 * just queue the modified buffer as delayed write buffer so the transaction
3974 * recovery completion writes the changes to disk.
3975 */
3976static int
3977xfs_btree_block_change_owner(
3978        struct xfs_btree_cur    *cur,
3979        int                     level,
3980        __uint64_t              new_owner,
3981        struct list_head        *buffer_list)
3982{
3983        struct xfs_btree_block  *block;
3984        struct xfs_buf          *bp;
3985        union xfs_btree_ptr     rptr;
3986
3987        /* do right sibling readahead */
3988        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
3989
3990        /* modify the owner */
3991        block = xfs_btree_get_block(cur, level, &bp);
3992        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
3993                block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
3994        else
3995                block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
3996
3997        /*
3998         * If the block is a root block hosted in an inode, we might not have a
3999         * buffer pointer here and we shouldn't attempt to log the change as the
4000         * information is already held in the inode and discarded when the root
4001         * block is formatted into the on-disk inode fork. We still change it,
4002         * though, so everything is consistent in memory.
4003         */
4004        if (bp) {
4005                if (cur->bc_tp) {
4006                        xfs_trans_ordered_buf(cur->bc_tp, bp);
4007                        xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4008                } else {
4009                        xfs_buf_delwri_queue(bp, buffer_list);
4010                }
4011        } else {
4012                ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4013                ASSERT(level == cur->bc_nlevels - 1);
4014        }
4015
4016        /* now read rh sibling block for next iteration */
4017        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4018        if (xfs_btree_ptr_is_null(cur, &rptr))
4019                return -ENOENT;
4020
4021        return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4022}
4023
4024int
4025xfs_btree_change_owner(
4026        struct xfs_btree_cur    *cur,
4027        __uint64_t              new_owner,
4028        struct list_head        *buffer_list)
4029{
4030        union xfs_btree_ptr     lptr;
4031        int                     level;
4032        struct xfs_btree_block  *block = NULL;
4033        int                     error = 0;
4034
4035        cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4036
4037        /* for each level */
4038        for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4039                /* grab the left hand block */
4040                error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4041                if (error)
4042                        return error;
4043
4044                /* readahead the left most block for the next level down */
4045                if (level > 0) {
4046                        union xfs_btree_ptr     *ptr;
4047
4048                        ptr = xfs_btree_ptr_addr(cur, 1, block);
4049                        xfs_btree_readahead_ptr(cur, ptr, 1);
4050
4051                        /* save for the next iteration of the loop */
4052                        lptr = *ptr;
4053                }
4054
4055                /* for each buffer in the level */
4056                do {
4057                        error = xfs_btree_block_change_owner(cur, level,
4058                                                             new_owner,
4059                                                             buffer_list);
4060                } while (!error);
4061
4062                if (error != -ENOENT)
4063                        return error;
4064        }
4065
4066        return 0;
4067}
4068