linux/fs/xfs/libxfs/xfs_btree.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   4 * All Rights Reserved.
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_log_format.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_bit.h"
  13#include "xfs_mount.h"
  14#include "xfs_inode.h"
  15#include "xfs_trans.h"
  16#include "xfs_buf_item.h"
  17#include "xfs_btree.h"
  18#include "xfs_errortag.h"
  19#include "xfs_error.h"
  20#include "xfs_trace.h"
  21#include "xfs_alloc.h"
  22#include "xfs_log.h"
  23
  24/*
  25 * Cursor allocation zone.
  26 */
  27kmem_zone_t     *xfs_btree_cur_zone;
  28
  29/*
  30 * Btree magic numbers.
  31 */
  32static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
  33        { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
  34          XFS_FIBT_MAGIC, 0 },
  35        { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
  36          XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
  37          XFS_REFC_CRC_MAGIC }
  38};
  39
  40uint32_t
  41xfs_btree_magic(
  42        int                     crc,
  43        xfs_btnum_t             btnum)
  44{
  45        uint32_t                magic = xfs_magics[crc][btnum];
  46
  47        /* Ensure we asked for crc for crc-only magics. */
  48        ASSERT(magic != 0);
  49        return magic;
  50}
  51
  52/*
  53 * Check a long btree block header.  Return the address of the failing check,
  54 * or NULL if everything is ok.
  55 */
  56xfs_failaddr_t
  57__xfs_btree_check_lblock(
  58        struct xfs_btree_cur    *cur,
  59        struct xfs_btree_block  *block,
  60        int                     level,
  61        struct xfs_buf          *bp)
  62{
  63        struct xfs_mount        *mp = cur->bc_mp;
  64        xfs_btnum_t             btnum = cur->bc_btnum;
  65        int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
  66
  67        if (crc) {
  68                if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
  69                        return __this_address;
  70                if (block->bb_u.l.bb_blkno !=
  71                    cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
  72                        return __this_address;
  73                if (block->bb_u.l.bb_pad != cpu_to_be32(0))
  74                        return __this_address;
  75        }
  76
  77        if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
  78                return __this_address;
  79        if (be16_to_cpu(block->bb_level) != level)
  80                return __this_address;
  81        if (be16_to_cpu(block->bb_numrecs) >
  82            cur->bc_ops->get_maxrecs(cur, level))
  83                return __this_address;
  84        if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
  85            !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
  86                        level + 1))
  87                return __this_address;
  88        if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
  89            !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
  90                        level + 1))
  91                return __this_address;
  92
  93        return NULL;
  94}
  95
  96/* Check a long btree block header. */
  97static int
  98xfs_btree_check_lblock(
  99        struct xfs_btree_cur    *cur,
 100        struct xfs_btree_block  *block,
 101        int                     level,
 102        struct xfs_buf          *bp)
 103{
 104        struct xfs_mount        *mp = cur->bc_mp;
 105        xfs_failaddr_t          fa;
 106
 107        fa = __xfs_btree_check_lblock(cur, block, level, bp);
 108        if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
 109                        XFS_ERRTAG_BTREE_CHECK_LBLOCK))) {
 110                if (bp)
 111                        trace_xfs_btree_corrupt(bp, _RET_IP_);
 112                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
 113                return -EFSCORRUPTED;
 114        }
 115        return 0;
 116}
 117
 118/*
 119 * Check a short btree block header.  Return the address of the failing check,
 120 * or NULL if everything is ok.
 121 */
 122xfs_failaddr_t
 123__xfs_btree_check_sblock(
 124        struct xfs_btree_cur    *cur,
 125        struct xfs_btree_block  *block,
 126        int                     level,
 127        struct xfs_buf          *bp)
 128{
 129        struct xfs_mount        *mp = cur->bc_mp;
 130        xfs_btnum_t             btnum = cur->bc_btnum;
 131        int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
 132
 133        if (crc) {
 134                if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
 135                        return __this_address;
 136                if (block->bb_u.s.bb_blkno !=
 137                    cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL))
 138                        return __this_address;
 139        }
 140
 141        if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
 142                return __this_address;
 143        if (be16_to_cpu(block->bb_level) != level)
 144                return __this_address;
 145        if (be16_to_cpu(block->bb_numrecs) >
 146            cur->bc_ops->get_maxrecs(cur, level))
 147                return __this_address;
 148        if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
 149            !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
 150                        level + 1))
 151                return __this_address;
 152        if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
 153            !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
 154                        level + 1))
 155                return __this_address;
 156
 157        return NULL;
 158}
 159
 160/* Check a short btree block header. */
 161STATIC int
 162xfs_btree_check_sblock(
 163        struct xfs_btree_cur    *cur,
 164        struct xfs_btree_block  *block,
 165        int                     level,
 166        struct xfs_buf          *bp)
 167{
 168        struct xfs_mount        *mp = cur->bc_mp;
 169        xfs_failaddr_t          fa;
 170
 171        fa = __xfs_btree_check_sblock(cur, block, level, bp);
 172        if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
 173                        XFS_ERRTAG_BTREE_CHECK_SBLOCK))) {
 174                if (bp)
 175                        trace_xfs_btree_corrupt(bp, _RET_IP_);
 176                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
 177                return -EFSCORRUPTED;
 178        }
 179        return 0;
 180}
 181
 182/*
 183 * Debug routine: check that block header is ok.
 184 */
 185int
 186xfs_btree_check_block(
 187        struct xfs_btree_cur    *cur,   /* btree cursor */
 188        struct xfs_btree_block  *block, /* generic btree block pointer */
 189        int                     level,  /* level of the btree block */
 190        struct xfs_buf          *bp)    /* buffer containing block, if any */
 191{
 192        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 193                return xfs_btree_check_lblock(cur, block, level, bp);
 194        else
 195                return xfs_btree_check_sblock(cur, block, level, bp);
 196}
 197
 198/* Check that this long pointer is valid and points within the fs. */
 199bool
 200xfs_btree_check_lptr(
 201        struct xfs_btree_cur    *cur,
 202        xfs_fsblock_t           fsbno,
 203        int                     level)
 204{
 205        if (level <= 0)
 206                return false;
 207        return xfs_verify_fsbno(cur->bc_mp, fsbno);
 208}
 209
 210/* Check that this short pointer is valid and points within the AG. */
 211bool
 212xfs_btree_check_sptr(
 213        struct xfs_btree_cur    *cur,
 214        xfs_agblock_t           agbno,
 215        int                     level)
 216{
 217        if (level <= 0)
 218                return false;
 219        return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno);
 220}
 221
 222/*
 223 * Check that a given (indexed) btree pointer at a certain level of a
 224 * btree is valid and doesn't point past where it should.
 225 */
 226static int
 227xfs_btree_check_ptr(
 228        struct xfs_btree_cur    *cur,
 229        union xfs_btree_ptr     *ptr,
 230        int                     index,
 231        int                     level)
 232{
 233        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 234                if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
 235                                level))
 236                        return 0;
 237                xfs_err(cur->bc_mp,
 238"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
 239                                cur->bc_private.b.ip->i_ino,
 240                                cur->bc_private.b.whichfork, cur->bc_btnum,
 241                                level, index);
 242        } else {
 243                if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
 244                                level))
 245                        return 0;
 246                xfs_err(cur->bc_mp,
 247"AG %u: Corrupt btree %d pointer at level %d index %d.",
 248                                cur->bc_private.a.agno, cur->bc_btnum,
 249                                level, index);
 250        }
 251
 252        return -EFSCORRUPTED;
 253}
 254
 255#ifdef DEBUG
 256# define xfs_btree_debug_check_ptr      xfs_btree_check_ptr
 257#else
 258# define xfs_btree_debug_check_ptr(...) (0)
 259#endif
 260
 261/*
 262 * Calculate CRC on the whole btree block and stuff it into the
 263 * long-form btree header.
 264 *
 265 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 266 * it into the buffer so recovery knows what the last modification was that made
 267 * it to disk.
 268 */
 269void
 270xfs_btree_lblock_calc_crc(
 271        struct xfs_buf          *bp)
 272{
 273        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 274        struct xfs_buf_log_item *bip = bp->b_log_item;
 275
 276        if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
 277                return;
 278        if (bip)
 279                block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 280        xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 281}
 282
 283bool
 284xfs_btree_lblock_verify_crc(
 285        struct xfs_buf          *bp)
 286{
 287        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 288        struct xfs_mount        *mp = bp->b_mount;
 289
 290        if (xfs_sb_version_hascrc(&mp->m_sb)) {
 291                if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
 292                        return false;
 293                return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 294        }
 295
 296        return true;
 297}
 298
 299/*
 300 * Calculate CRC on the whole btree block and stuff it into the
 301 * short-form btree header.
 302 *
 303 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 304 * it into the buffer so recovery knows what the last modification was that made
 305 * it to disk.
 306 */
 307void
 308xfs_btree_sblock_calc_crc(
 309        struct xfs_buf          *bp)
 310{
 311        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 312        struct xfs_buf_log_item *bip = bp->b_log_item;
 313
 314        if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
 315                return;
 316        if (bip)
 317                block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 318        xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 319}
 320
 321bool
 322xfs_btree_sblock_verify_crc(
 323        struct xfs_buf          *bp)
 324{
 325        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 326        struct xfs_mount        *mp = bp->b_mount;
 327
 328        if (xfs_sb_version_hascrc(&mp->m_sb)) {
 329                if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
 330                        return false;
 331                return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 332        }
 333
 334        return true;
 335}
 336
 337static int
 338xfs_btree_free_block(
 339        struct xfs_btree_cur    *cur,
 340        struct xfs_buf          *bp)
 341{
 342        int                     error;
 343
 344        error = cur->bc_ops->free_block(cur, bp);
 345        if (!error) {
 346                xfs_trans_binval(cur->bc_tp, bp);
 347                XFS_BTREE_STATS_INC(cur, free);
 348        }
 349        return error;
 350}
 351
 352/*
 353 * Delete the btree cursor.
 354 */
 355void
 356xfs_btree_del_cursor(
 357        xfs_btree_cur_t *cur,           /* btree cursor */
 358        int             error)          /* del because of error */
 359{
 360        int             i;              /* btree level */
 361
 362        /*
 363         * Clear the buffer pointers, and release the buffers.
 364         * If we're doing this in the face of an error, we
 365         * need to make sure to inspect all of the entries
 366         * in the bc_bufs array for buffers to be unlocked.
 367         * This is because some of the btree code works from
 368         * level n down to 0, and if we get an error along
 369         * the way we won't have initialized all the entries
 370         * down to 0.
 371         */
 372        for (i = 0; i < cur->bc_nlevels; i++) {
 373                if (cur->bc_bufs[i])
 374                        xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
 375                else if (!error)
 376                        break;
 377        }
 378        /*
 379         * Can't free a bmap cursor without having dealt with the
 380         * allocated indirect blocks' accounting.
 381         */
 382        ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
 383               cur->bc_private.b.allocated == 0);
 384        /*
 385         * Free the cursor.
 386         */
 387        kmem_zone_free(xfs_btree_cur_zone, cur);
 388}
 389
 390/*
 391 * Duplicate the btree cursor.
 392 * Allocate a new one, copy the record, re-get the buffers.
 393 */
 394int                                     /* error */
 395xfs_btree_dup_cursor(
 396        xfs_btree_cur_t *cur,           /* input cursor */
 397        xfs_btree_cur_t **ncur)         /* output cursor */
 398{
 399        xfs_buf_t       *bp;            /* btree block's buffer pointer */
 400        int             error;          /* error return value */
 401        int             i;              /* level number of btree block */
 402        xfs_mount_t     *mp;            /* mount structure for filesystem */
 403        xfs_btree_cur_t *new;           /* new cursor value */
 404        xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
 405
 406        tp = cur->bc_tp;
 407        mp = cur->bc_mp;
 408
 409        /*
 410         * Allocate a new cursor like the old one.
 411         */
 412        new = cur->bc_ops->dup_cursor(cur);
 413
 414        /*
 415         * Copy the record currently in the cursor.
 416         */
 417        new->bc_rec = cur->bc_rec;
 418
 419        /*
 420         * For each level current, re-get the buffer and copy the ptr value.
 421         */
 422        for (i = 0; i < new->bc_nlevels; i++) {
 423                new->bc_ptrs[i] = cur->bc_ptrs[i];
 424                new->bc_ra[i] = cur->bc_ra[i];
 425                bp = cur->bc_bufs[i];
 426                if (bp) {
 427                        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 428                                                   XFS_BUF_ADDR(bp), mp->m_bsize,
 429                                                   0, &bp,
 430                                                   cur->bc_ops->buf_ops);
 431                        if (error) {
 432                                xfs_btree_del_cursor(new, error);
 433                                *ncur = NULL;
 434                                return error;
 435                        }
 436                }
 437                new->bc_bufs[i] = bp;
 438        }
 439        *ncur = new;
 440        return 0;
 441}
 442
 443/*
 444 * XFS btree block layout and addressing:
 445 *
 446 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 447 *
 448 * The leaf record start with a header then followed by records containing
 449 * the values.  A non-leaf block also starts with the same header, and
 450 * then first contains lookup keys followed by an equal number of pointers
 451 * to the btree blocks at the previous level.
 452 *
 453 *              +--------+-------+-------+-------+-------+-------+-------+
 454 * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 455 *              +--------+-------+-------+-------+-------+-------+-------+
 456 *
 457 *              +--------+-------+-------+-------+-------+-------+-------+
 458 * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 459 *              +--------+-------+-------+-------+-------+-------+-------+
 460 *
 461 * The header is called struct xfs_btree_block for reasons better left unknown
 462 * and comes in different versions for short (32bit) and long (64bit) block
 463 * pointers.  The record and key structures are defined by the btree instances
 464 * and opaque to the btree core.  The block pointers are simple disk endian
 465 * integers, available in a short (32bit) and long (64bit) variant.
 466 *
 467 * The helpers below calculate the offset of a given record, key or pointer
 468 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 469 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
 470 * inside the btree block is done using indices starting at one, not zero!
 471 *
 472 * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
 473 * overlapping intervals.  In such a tree, records are still sorted lowest to
 474 * highest and indexed by the smallest key value that refers to the record.
 475 * However, nodes are different: each pointer has two associated keys -- one
 476 * indexing the lowest key available in the block(s) below (the same behavior
 477 * as the key in a regular btree) and another indexing the highest key
 478 * available in the block(s) below.  Because records are /not/ sorted by the
 479 * highest key, all leaf block updates require us to compute the highest key
 480 * that matches any record in the leaf and to recursively update the high keys
 481 * in the nodes going further up in the tree, if necessary.  Nodes look like
 482 * this:
 483 *
 484 *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
 485 * Non-Leaf:    | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
 486 *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
 487 *
 488 * To perform an interval query on an overlapped tree, perform the usual
 489 * depth-first search and use the low and high keys to decide if we can skip
 490 * that particular node.  If a leaf node is reached, return the records that
 491 * intersect the interval.  Note that an interval query may return numerous
 492 * entries.  For a non-overlapped tree, simply search for the record associated
 493 * with the lowest key and iterate forward until a non-matching record is
 494 * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
 495 * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
 496 * more detail.
 497 *
 498 * Why do we care about overlapping intervals?  Let's say you have a bunch of
 499 * reverse mapping records on a reflink filesystem:
 500 *
 501 * 1: +- file A startblock B offset C length D -----------+
 502 * 2:      +- file E startblock F offset G length H --------------+
 503 * 3:      +- file I startblock F offset J length K --+
 504 * 4:                                                        +- file L... --+
 505 *
 506 * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
 507 * we'd simply increment the length of record 1.  But how do we find the record
 508 * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
 509 * record 3 because the keys are ordered first by startblock.  An interval
 510 * query would return records 1 and 2 because they both overlap (B+D-1), and
 511 * from that we can pick out record 1 as the appropriate left neighbor.
 512 *
 513 * In the non-overlapped case you can do a LE lookup and decrement the cursor
 514 * because a record's interval must end before the next record.
 515 */
 516
 517/*
 518 * Return size of the btree block header for this btree instance.
 519 */
 520static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
 521{
 522        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 523                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 524                        return XFS_BTREE_LBLOCK_CRC_LEN;
 525                return XFS_BTREE_LBLOCK_LEN;
 526        }
 527        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 528                return XFS_BTREE_SBLOCK_CRC_LEN;
 529        return XFS_BTREE_SBLOCK_LEN;
 530}
 531
 532/*
 533 * Return size of btree block pointers for this btree instance.
 534 */
 535static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
 536{
 537        return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
 538                sizeof(__be64) : sizeof(__be32);
 539}
 540
 541/*
 542 * Calculate offset of the n-th record in a btree block.
 543 */
 544STATIC size_t
 545xfs_btree_rec_offset(
 546        struct xfs_btree_cur    *cur,
 547        int                     n)
 548{
 549        return xfs_btree_block_len(cur) +
 550                (n - 1) * cur->bc_ops->rec_len;
 551}
 552
 553/*
 554 * Calculate offset of the n-th key in a btree block.
 555 */
 556STATIC size_t
 557xfs_btree_key_offset(
 558        struct xfs_btree_cur    *cur,
 559        int                     n)
 560{
 561        return xfs_btree_block_len(cur) +
 562                (n - 1) * cur->bc_ops->key_len;
 563}
 564
 565/*
 566 * Calculate offset of the n-th high key in a btree block.
 567 */
 568STATIC size_t
 569xfs_btree_high_key_offset(
 570        struct xfs_btree_cur    *cur,
 571        int                     n)
 572{
 573        return xfs_btree_block_len(cur) +
 574                (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
 575}
 576
 577/*
 578 * Calculate offset of the n-th block pointer in a btree block.
 579 */
 580STATIC size_t
 581xfs_btree_ptr_offset(
 582        struct xfs_btree_cur    *cur,
 583        int                     n,
 584        int                     level)
 585{
 586        return xfs_btree_block_len(cur) +
 587                cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
 588                (n - 1) * xfs_btree_ptr_len(cur);
 589}
 590
 591/*
 592 * Return a pointer to the n-th record in the btree block.
 593 */
 594union xfs_btree_rec *
 595xfs_btree_rec_addr(
 596        struct xfs_btree_cur    *cur,
 597        int                     n,
 598        struct xfs_btree_block  *block)
 599{
 600        return (union xfs_btree_rec *)
 601                ((char *)block + xfs_btree_rec_offset(cur, n));
 602}
 603
 604/*
 605 * Return a pointer to the n-th key in the btree block.
 606 */
 607union xfs_btree_key *
 608xfs_btree_key_addr(
 609        struct xfs_btree_cur    *cur,
 610        int                     n,
 611        struct xfs_btree_block  *block)
 612{
 613        return (union xfs_btree_key *)
 614                ((char *)block + xfs_btree_key_offset(cur, n));
 615}
 616
 617/*
 618 * Return a pointer to the n-th high key in the btree block.
 619 */
 620union xfs_btree_key *
 621xfs_btree_high_key_addr(
 622        struct xfs_btree_cur    *cur,
 623        int                     n,
 624        struct xfs_btree_block  *block)
 625{
 626        return (union xfs_btree_key *)
 627                ((char *)block + xfs_btree_high_key_offset(cur, n));
 628}
 629
 630/*
 631 * Return a pointer to the n-th block pointer in the btree block.
 632 */
 633union xfs_btree_ptr *
 634xfs_btree_ptr_addr(
 635        struct xfs_btree_cur    *cur,
 636        int                     n,
 637        struct xfs_btree_block  *block)
 638{
 639        int                     level = xfs_btree_get_level(block);
 640
 641        ASSERT(block->bb_level != 0);
 642
 643        return (union xfs_btree_ptr *)
 644                ((char *)block + xfs_btree_ptr_offset(cur, n, level));
 645}
 646
 647/*
 648 * Get the root block which is stored in the inode.
 649 *
 650 * For now this btree implementation assumes the btree root is always
 651 * stored in the if_broot field of an inode fork.
 652 */
 653STATIC struct xfs_btree_block *
 654xfs_btree_get_iroot(
 655        struct xfs_btree_cur    *cur)
 656{
 657        struct xfs_ifork        *ifp;
 658
 659        ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
 660        return (struct xfs_btree_block *)ifp->if_broot;
 661}
 662
 663/*
 664 * Retrieve the block pointer from the cursor at the given level.
 665 * This may be an inode btree root or from a buffer.
 666 */
 667struct xfs_btree_block *                /* generic btree block pointer */
 668xfs_btree_get_block(
 669        struct xfs_btree_cur    *cur,   /* btree cursor */
 670        int                     level,  /* level in btree */
 671        struct xfs_buf          **bpp)  /* buffer containing the block */
 672{
 673        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 674            (level == cur->bc_nlevels - 1)) {
 675                *bpp = NULL;
 676                return xfs_btree_get_iroot(cur);
 677        }
 678
 679        *bpp = cur->bc_bufs[level];
 680        return XFS_BUF_TO_BLOCK(*bpp);
 681}
 682
 683/*
 684 * Get a buffer for the block, return it with no data read.
 685 * Long-form addressing.
 686 */
 687xfs_buf_t *                             /* buffer for fsbno */
 688xfs_btree_get_bufl(
 689        xfs_mount_t     *mp,            /* file system mount point */
 690        xfs_trans_t     *tp,            /* transaction pointer */
 691        xfs_fsblock_t   fsbno)          /* file system block number */
 692{
 693        xfs_daddr_t             d;              /* real disk block address */
 694
 695        ASSERT(fsbno != NULLFSBLOCK);
 696        d = XFS_FSB_TO_DADDR(mp, fsbno);
 697        return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
 698}
 699
 700/*
 701 * Get a buffer for the block, return it with no data read.
 702 * Short-form addressing.
 703 */
 704xfs_buf_t *                             /* buffer for agno/agbno */
 705xfs_btree_get_bufs(
 706        xfs_mount_t     *mp,            /* file system mount point */
 707        xfs_trans_t     *tp,            /* transaction pointer */
 708        xfs_agnumber_t  agno,           /* allocation group number */
 709        xfs_agblock_t   agbno)          /* allocation group block number */
 710{
 711        xfs_daddr_t             d;              /* real disk block address */
 712
 713        ASSERT(agno != NULLAGNUMBER);
 714        ASSERT(agbno != NULLAGBLOCK);
 715        d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 716        return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
 717}
 718
 719/*
 720 * Check for the cursor referring to the last block at the given level.
 721 */
 722int                                     /* 1=is last block, 0=not last block */
 723xfs_btree_islastblock(
 724        xfs_btree_cur_t         *cur,   /* btree cursor */
 725        int                     level)  /* level to check */
 726{
 727        struct xfs_btree_block  *block; /* generic btree block pointer */
 728        xfs_buf_t               *bp;    /* buffer containing block */
 729
 730        block = xfs_btree_get_block(cur, level, &bp);
 731        xfs_btree_check_block(cur, block, level, bp);
 732        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 733                return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
 734        else
 735                return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
 736}
 737
 738/*
 739 * Change the cursor to point to the first record at the given level.
 740 * Other levels are unaffected.
 741 */
 742STATIC int                              /* success=1, failure=0 */
 743xfs_btree_firstrec(
 744        xfs_btree_cur_t         *cur,   /* btree cursor */
 745        int                     level)  /* level to change */
 746{
 747        struct xfs_btree_block  *block; /* generic btree block pointer */
 748        xfs_buf_t               *bp;    /* buffer containing block */
 749
 750        /*
 751         * Get the block pointer for this level.
 752         */
 753        block = xfs_btree_get_block(cur, level, &bp);
 754        if (xfs_btree_check_block(cur, block, level, bp))
 755                return 0;
 756        /*
 757         * It's empty, there is no such record.
 758         */
 759        if (!block->bb_numrecs)
 760                return 0;
 761        /*
 762         * Set the ptr value to 1, that's the first record/key.
 763         */
 764        cur->bc_ptrs[level] = 1;
 765        return 1;
 766}
 767
 768/*
 769 * Change the cursor to point to the last record in the current block
 770 * at the given level.  Other levels are unaffected.
 771 */
 772STATIC int                              /* success=1, failure=0 */
 773xfs_btree_lastrec(
 774        xfs_btree_cur_t         *cur,   /* btree cursor */
 775        int                     level)  /* level to change */
 776{
 777        struct xfs_btree_block  *block; /* generic btree block pointer */
 778        xfs_buf_t               *bp;    /* buffer containing block */
 779
 780        /*
 781         * Get the block pointer for this level.
 782         */
 783        block = xfs_btree_get_block(cur, level, &bp);
 784        if (xfs_btree_check_block(cur, block, level, bp))
 785                return 0;
 786        /*
 787         * It's empty, there is no such record.
 788         */
 789        if (!block->bb_numrecs)
 790                return 0;
 791        /*
 792         * Set the ptr value to numrecs, that's the last record/key.
 793         */
 794        cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
 795        return 1;
 796}
 797
 798/*
 799 * Compute first and last byte offsets for the fields given.
 800 * Interprets the offsets table, which contains struct field offsets.
 801 */
 802void
 803xfs_btree_offsets(
 804        int64_t         fields,         /* bitmask of fields */
 805        const short     *offsets,       /* table of field offsets */
 806        int             nbits,          /* number of bits to inspect */
 807        int             *first,         /* output: first byte offset */
 808        int             *last)          /* output: last byte offset */
 809{
 810        int             i;              /* current bit number */
 811        int64_t         imask;          /* mask for current bit number */
 812
 813        ASSERT(fields != 0);
 814        /*
 815         * Find the lowest bit, so the first byte offset.
 816         */
 817        for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
 818                if (imask & fields) {
 819                        *first = offsets[i];
 820                        break;
 821                }
 822        }
 823        /*
 824         * Find the highest bit, so the last byte offset.
 825         */
 826        for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
 827                if (imask & fields) {
 828                        *last = offsets[i + 1] - 1;
 829                        break;
 830                }
 831        }
 832}
 833
 834/*
 835 * Get a buffer for the block, return it read in.
 836 * Long-form addressing.
 837 */
 838int
 839xfs_btree_read_bufl(
 840        struct xfs_mount        *mp,            /* file system mount point */
 841        struct xfs_trans        *tp,            /* transaction pointer */
 842        xfs_fsblock_t           fsbno,          /* file system block number */
 843        struct xfs_buf          **bpp,          /* buffer for fsbno */
 844        int                     refval,         /* ref count value for buffer */
 845        const struct xfs_buf_ops *ops)
 846{
 847        struct xfs_buf          *bp;            /* return value */
 848        xfs_daddr_t             d;              /* real disk block address */
 849        int                     error;
 850
 851        if (!xfs_verify_fsbno(mp, fsbno))
 852                return -EFSCORRUPTED;
 853        d = XFS_FSB_TO_DADDR(mp, fsbno);
 854        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
 855                                   mp->m_bsize, 0, &bp, ops);
 856        if (error)
 857                return error;
 858        if (bp)
 859                xfs_buf_set_ref(bp, refval);
 860        *bpp = bp;
 861        return 0;
 862}
 863
 864/*
 865 * Read-ahead the block, don't wait for it, don't return a buffer.
 866 * Long-form addressing.
 867 */
 868/* ARGSUSED */
 869void
 870xfs_btree_reada_bufl(
 871        struct xfs_mount        *mp,            /* file system mount point */
 872        xfs_fsblock_t           fsbno,          /* file system block number */
 873        xfs_extlen_t            count,          /* count of filesystem blocks */
 874        const struct xfs_buf_ops *ops)
 875{
 876        xfs_daddr_t             d;
 877
 878        ASSERT(fsbno != NULLFSBLOCK);
 879        d = XFS_FSB_TO_DADDR(mp, fsbno);
 880        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 881}
 882
 883/*
 884 * Read-ahead the block, don't wait for it, don't return a buffer.
 885 * Short-form addressing.
 886 */
 887/* ARGSUSED */
 888void
 889xfs_btree_reada_bufs(
 890        struct xfs_mount        *mp,            /* file system mount point */
 891        xfs_agnumber_t          agno,           /* allocation group number */
 892        xfs_agblock_t           agbno,          /* allocation group block number */
 893        xfs_extlen_t            count,          /* count of filesystem blocks */
 894        const struct xfs_buf_ops *ops)
 895{
 896        xfs_daddr_t             d;
 897
 898        ASSERT(agno != NULLAGNUMBER);
 899        ASSERT(agbno != NULLAGBLOCK);
 900        d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 901        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 902}
 903
 904STATIC int
 905xfs_btree_readahead_lblock(
 906        struct xfs_btree_cur    *cur,
 907        int                     lr,
 908        struct xfs_btree_block  *block)
 909{
 910        int                     rval = 0;
 911        xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
 912        xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 913
 914        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
 915                xfs_btree_reada_bufl(cur->bc_mp, left, 1,
 916                                     cur->bc_ops->buf_ops);
 917                rval++;
 918        }
 919
 920        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
 921                xfs_btree_reada_bufl(cur->bc_mp, right, 1,
 922                                     cur->bc_ops->buf_ops);
 923                rval++;
 924        }
 925
 926        return rval;
 927}
 928
 929STATIC int
 930xfs_btree_readahead_sblock(
 931        struct xfs_btree_cur    *cur,
 932        int                     lr,
 933        struct xfs_btree_block *block)
 934{
 935        int                     rval = 0;
 936        xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
 937        xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
 938
 939
 940        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
 941                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
 942                                     left, 1, cur->bc_ops->buf_ops);
 943                rval++;
 944        }
 945
 946        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
 947                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
 948                                     right, 1, cur->bc_ops->buf_ops);
 949                rval++;
 950        }
 951
 952        return rval;
 953}
 954
 955/*
 956 * Read-ahead btree blocks, at the given level.
 957 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 958 */
 959STATIC int
 960xfs_btree_readahead(
 961        struct xfs_btree_cur    *cur,           /* btree cursor */
 962        int                     lev,            /* level in btree */
 963        int                     lr)             /* left/right bits */
 964{
 965        struct xfs_btree_block  *block;
 966
 967        /*
 968         * No readahead needed if we are at the root level and the
 969         * btree root is stored in the inode.
 970         */
 971        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 972            (lev == cur->bc_nlevels - 1))
 973                return 0;
 974
 975        if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
 976                return 0;
 977
 978        cur->bc_ra[lev] |= lr;
 979        block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
 980
 981        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 982                return xfs_btree_readahead_lblock(cur, lr, block);
 983        return xfs_btree_readahead_sblock(cur, lr, block);
 984}
 985
 986STATIC int
 987xfs_btree_ptr_to_daddr(
 988        struct xfs_btree_cur    *cur,
 989        union xfs_btree_ptr     *ptr,
 990        xfs_daddr_t             *daddr)
 991{
 992        xfs_fsblock_t           fsbno;
 993        xfs_agblock_t           agbno;
 994        int                     error;
 995
 996        error = xfs_btree_check_ptr(cur, ptr, 0, 1);
 997        if (error)
 998                return error;
 999
1000        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1001                fsbno = be64_to_cpu(ptr->l);
1002                *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
1003        } else {
1004                agbno = be32_to_cpu(ptr->s);
1005                *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
1006                                agbno);
1007        }
1008
1009        return 0;
1010}
1011
1012/*
1013 * Readahead @count btree blocks at the given @ptr location.
1014 *
1015 * We don't need to care about long or short form btrees here as we have a
1016 * method of converting the ptr directly to a daddr available to us.
1017 */
1018STATIC void
1019xfs_btree_readahead_ptr(
1020        struct xfs_btree_cur    *cur,
1021        union xfs_btree_ptr     *ptr,
1022        xfs_extlen_t            count)
1023{
1024        xfs_daddr_t             daddr;
1025
1026        if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
1027                return;
1028        xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
1029                          cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
1030}
1031
1032/*
1033 * Set the buffer for level "lev" in the cursor to bp, releasing
1034 * any previous buffer.
1035 */
1036STATIC void
1037xfs_btree_setbuf(
1038        xfs_btree_cur_t         *cur,   /* btree cursor */
1039        int                     lev,    /* level in btree */
1040        xfs_buf_t               *bp)    /* new buffer to set */
1041{
1042        struct xfs_btree_block  *b;     /* btree block */
1043
1044        if (cur->bc_bufs[lev])
1045                xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
1046        cur->bc_bufs[lev] = bp;
1047        cur->bc_ra[lev] = 0;
1048
1049        b = XFS_BUF_TO_BLOCK(bp);
1050        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1051                if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1052                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1053                if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1054                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1055        } else {
1056                if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1057                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1058                if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1059                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1060        }
1061}
1062
1063bool
1064xfs_btree_ptr_is_null(
1065        struct xfs_btree_cur    *cur,
1066        union xfs_btree_ptr     *ptr)
1067{
1068        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1069                return ptr->l == cpu_to_be64(NULLFSBLOCK);
1070        else
1071                return ptr->s == cpu_to_be32(NULLAGBLOCK);
1072}
1073
1074STATIC void
1075xfs_btree_set_ptr_null(
1076        struct xfs_btree_cur    *cur,
1077        union xfs_btree_ptr     *ptr)
1078{
1079        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1080                ptr->l = cpu_to_be64(NULLFSBLOCK);
1081        else
1082                ptr->s = cpu_to_be32(NULLAGBLOCK);
1083}
1084
1085/*
1086 * Get/set/init sibling pointers
1087 */
1088void
1089xfs_btree_get_sibling(
1090        struct xfs_btree_cur    *cur,
1091        struct xfs_btree_block  *block,
1092        union xfs_btree_ptr     *ptr,
1093        int                     lr)
1094{
1095        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1096
1097        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1098                if (lr == XFS_BB_RIGHTSIB)
1099                        ptr->l = block->bb_u.l.bb_rightsib;
1100                else
1101                        ptr->l = block->bb_u.l.bb_leftsib;
1102        } else {
1103                if (lr == XFS_BB_RIGHTSIB)
1104                        ptr->s = block->bb_u.s.bb_rightsib;
1105                else
1106                        ptr->s = block->bb_u.s.bb_leftsib;
1107        }
1108}
1109
1110STATIC void
1111xfs_btree_set_sibling(
1112        struct xfs_btree_cur    *cur,
1113        struct xfs_btree_block  *block,
1114        union xfs_btree_ptr     *ptr,
1115        int                     lr)
1116{
1117        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1118
1119        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1120                if (lr == XFS_BB_RIGHTSIB)
1121                        block->bb_u.l.bb_rightsib = ptr->l;
1122                else
1123                        block->bb_u.l.bb_leftsib = ptr->l;
1124        } else {
1125                if (lr == XFS_BB_RIGHTSIB)
1126                        block->bb_u.s.bb_rightsib = ptr->s;
1127                else
1128                        block->bb_u.s.bb_leftsib = ptr->s;
1129        }
1130}
1131
1132void
1133xfs_btree_init_block_int(
1134        struct xfs_mount        *mp,
1135        struct xfs_btree_block  *buf,
1136        xfs_daddr_t             blkno,
1137        xfs_btnum_t             btnum,
1138        __u16                   level,
1139        __u16                   numrecs,
1140        __u64                   owner,
1141        unsigned int            flags)
1142{
1143        int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
1144        __u32                   magic = xfs_btree_magic(crc, btnum);
1145
1146        buf->bb_magic = cpu_to_be32(magic);
1147        buf->bb_level = cpu_to_be16(level);
1148        buf->bb_numrecs = cpu_to_be16(numrecs);
1149
1150        if (flags & XFS_BTREE_LONG_PTRS) {
1151                buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1152                buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1153                if (crc) {
1154                        buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1155                        buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1156                        uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1157                        buf->bb_u.l.bb_pad = 0;
1158                        buf->bb_u.l.bb_lsn = 0;
1159                }
1160        } else {
1161                /* owner is a 32 bit value on short blocks */
1162                __u32 __owner = (__u32)owner;
1163
1164                buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1165                buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1166                if (crc) {
1167                        buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1168                        buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1169                        uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1170                        buf->bb_u.s.bb_lsn = 0;
1171                }
1172        }
1173}
1174
1175void
1176xfs_btree_init_block(
1177        struct xfs_mount *mp,
1178        struct xfs_buf  *bp,
1179        xfs_btnum_t     btnum,
1180        __u16           level,
1181        __u16           numrecs,
1182        __u64           owner)
1183{
1184        xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1185                                 btnum, level, numrecs, owner, 0);
1186}
1187
1188STATIC void
1189xfs_btree_init_block_cur(
1190        struct xfs_btree_cur    *cur,
1191        struct xfs_buf          *bp,
1192        int                     level,
1193        int                     numrecs)
1194{
1195        __u64                   owner;
1196
1197        /*
1198         * we can pull the owner from the cursor right now as the different
1199         * owners align directly with the pointer size of the btree. This may
1200         * change in future, but is safe for current users of the generic btree
1201         * code.
1202         */
1203        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1204                owner = cur->bc_private.b.ip->i_ino;
1205        else
1206                owner = cur->bc_private.a.agno;
1207
1208        xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1209                                 cur->bc_btnum, level, numrecs,
1210                                 owner, cur->bc_flags);
1211}
1212
1213/*
1214 * Return true if ptr is the last record in the btree and
1215 * we need to track updates to this record.  The decision
1216 * will be further refined in the update_lastrec method.
1217 */
1218STATIC int
1219xfs_btree_is_lastrec(
1220        struct xfs_btree_cur    *cur,
1221        struct xfs_btree_block  *block,
1222        int                     level)
1223{
1224        union xfs_btree_ptr     ptr;
1225
1226        if (level > 0)
1227                return 0;
1228        if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1229                return 0;
1230
1231        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1232        if (!xfs_btree_ptr_is_null(cur, &ptr))
1233                return 0;
1234        return 1;
1235}
1236
1237STATIC void
1238xfs_btree_buf_to_ptr(
1239        struct xfs_btree_cur    *cur,
1240        struct xfs_buf          *bp,
1241        union xfs_btree_ptr     *ptr)
1242{
1243        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1244                ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1245                                        XFS_BUF_ADDR(bp)));
1246        else {
1247                ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1248                                        XFS_BUF_ADDR(bp)));
1249        }
1250}
1251
1252STATIC void
1253xfs_btree_set_refs(
1254        struct xfs_btree_cur    *cur,
1255        struct xfs_buf          *bp)
1256{
1257        switch (cur->bc_btnum) {
1258        case XFS_BTNUM_BNO:
1259        case XFS_BTNUM_CNT:
1260                xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1261                break;
1262        case XFS_BTNUM_INO:
1263        case XFS_BTNUM_FINO:
1264                xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1265                break;
1266        case XFS_BTNUM_BMAP:
1267                xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1268                break;
1269        case XFS_BTNUM_RMAP:
1270                xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
1271                break;
1272        case XFS_BTNUM_REFC:
1273                xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
1274                break;
1275        default:
1276                ASSERT(0);
1277        }
1278}
1279
1280STATIC int
1281xfs_btree_get_buf_block(
1282        struct xfs_btree_cur    *cur,
1283        union xfs_btree_ptr     *ptr,
1284        struct xfs_btree_block  **block,
1285        struct xfs_buf          **bpp)
1286{
1287        struct xfs_mount        *mp = cur->bc_mp;
1288        xfs_daddr_t             d;
1289        int                     error;
1290
1291        error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1292        if (error)
1293                return error;
1294        *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1295                                 mp->m_bsize, 0);
1296
1297        if (!*bpp)
1298                return -ENOMEM;
1299
1300        (*bpp)->b_ops = cur->bc_ops->buf_ops;
1301        *block = XFS_BUF_TO_BLOCK(*bpp);
1302        return 0;
1303}
1304
1305/*
1306 * Read in the buffer at the given ptr and return the buffer and
1307 * the block pointer within the buffer.
1308 */
1309STATIC int
1310xfs_btree_read_buf_block(
1311        struct xfs_btree_cur    *cur,
1312        union xfs_btree_ptr     *ptr,
1313        int                     flags,
1314        struct xfs_btree_block  **block,
1315        struct xfs_buf          **bpp)
1316{
1317        struct xfs_mount        *mp = cur->bc_mp;
1318        xfs_daddr_t             d;
1319        int                     error;
1320
1321        /* need to sort out how callers deal with failures first */
1322        ASSERT(!(flags & XBF_TRYLOCK));
1323
1324        error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1325        if (error)
1326                return error;
1327        error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1328                                   mp->m_bsize, flags, bpp,
1329                                   cur->bc_ops->buf_ops);
1330        if (error)
1331                return error;
1332
1333        xfs_btree_set_refs(cur, *bpp);
1334        *block = XFS_BUF_TO_BLOCK(*bpp);
1335        return 0;
1336}
1337
1338/*
1339 * Copy keys from one btree block to another.
1340 */
1341STATIC void
1342xfs_btree_copy_keys(
1343        struct xfs_btree_cur    *cur,
1344        union xfs_btree_key     *dst_key,
1345        union xfs_btree_key     *src_key,
1346        int                     numkeys)
1347{
1348        ASSERT(numkeys >= 0);
1349        memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1350}
1351
1352/*
1353 * Copy records from one btree block to another.
1354 */
1355STATIC void
1356xfs_btree_copy_recs(
1357        struct xfs_btree_cur    *cur,
1358        union xfs_btree_rec     *dst_rec,
1359        union xfs_btree_rec     *src_rec,
1360        int                     numrecs)
1361{
1362        ASSERT(numrecs >= 0);
1363        memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1364}
1365
1366/*
1367 * Copy block pointers from one btree block to another.
1368 */
1369STATIC void
1370xfs_btree_copy_ptrs(
1371        struct xfs_btree_cur    *cur,
1372        union xfs_btree_ptr     *dst_ptr,
1373        union xfs_btree_ptr     *src_ptr,
1374        int                     numptrs)
1375{
1376        ASSERT(numptrs >= 0);
1377        memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1378}
1379
1380/*
1381 * Shift keys one index left/right inside a single btree block.
1382 */
1383STATIC void
1384xfs_btree_shift_keys(
1385        struct xfs_btree_cur    *cur,
1386        union xfs_btree_key     *key,
1387        int                     dir,
1388        int                     numkeys)
1389{
1390        char                    *dst_key;
1391
1392        ASSERT(numkeys >= 0);
1393        ASSERT(dir == 1 || dir == -1);
1394
1395        dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1396        memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1397}
1398
1399/*
1400 * Shift records one index left/right inside a single btree block.
1401 */
1402STATIC void
1403xfs_btree_shift_recs(
1404        struct xfs_btree_cur    *cur,
1405        union xfs_btree_rec     *rec,
1406        int                     dir,
1407        int                     numrecs)
1408{
1409        char                    *dst_rec;
1410
1411        ASSERT(numrecs >= 0);
1412        ASSERT(dir == 1 || dir == -1);
1413
1414        dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1415        memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1416}
1417
1418/*
1419 * Shift block pointers one index left/right inside a single btree block.
1420 */
1421STATIC void
1422xfs_btree_shift_ptrs(
1423        struct xfs_btree_cur    *cur,
1424        union xfs_btree_ptr     *ptr,
1425        int                     dir,
1426        int                     numptrs)
1427{
1428        char                    *dst_ptr;
1429
1430        ASSERT(numptrs >= 0);
1431        ASSERT(dir == 1 || dir == -1);
1432
1433        dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1434        memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1435}
1436
1437/*
1438 * Log key values from the btree block.
1439 */
1440STATIC void
1441xfs_btree_log_keys(
1442        struct xfs_btree_cur    *cur,
1443        struct xfs_buf          *bp,
1444        int                     first,
1445        int                     last)
1446{
1447
1448        if (bp) {
1449                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1450                xfs_trans_log_buf(cur->bc_tp, bp,
1451                                  xfs_btree_key_offset(cur, first),
1452                                  xfs_btree_key_offset(cur, last + 1) - 1);
1453        } else {
1454                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1455                                xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1456        }
1457}
1458
1459/*
1460 * Log record values from the btree block.
1461 */
1462void
1463xfs_btree_log_recs(
1464        struct xfs_btree_cur    *cur,
1465        struct xfs_buf          *bp,
1466        int                     first,
1467        int                     last)
1468{
1469
1470        xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1471        xfs_trans_log_buf(cur->bc_tp, bp,
1472                          xfs_btree_rec_offset(cur, first),
1473                          xfs_btree_rec_offset(cur, last + 1) - 1);
1474
1475}
1476
1477/*
1478 * Log block pointer fields from a btree block (nonleaf).
1479 */
1480STATIC void
1481xfs_btree_log_ptrs(
1482        struct xfs_btree_cur    *cur,   /* btree cursor */
1483        struct xfs_buf          *bp,    /* buffer containing btree block */
1484        int                     first,  /* index of first pointer to log */
1485        int                     last)   /* index of last pointer to log */
1486{
1487
1488        if (bp) {
1489                struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1490                int                     level = xfs_btree_get_level(block);
1491
1492                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1493                xfs_trans_log_buf(cur->bc_tp, bp,
1494                                xfs_btree_ptr_offset(cur, first, level),
1495                                xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1496        } else {
1497                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1498                        xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1499        }
1500
1501}
1502
1503/*
1504 * Log fields from a btree block header.
1505 */
1506void
1507xfs_btree_log_block(
1508        struct xfs_btree_cur    *cur,   /* btree cursor */
1509        struct xfs_buf          *bp,    /* buffer containing btree block */
1510        int                     fields) /* mask of fields: XFS_BB_... */
1511{
1512        int                     first;  /* first byte offset logged */
1513        int                     last;   /* last byte offset logged */
1514        static const short      soffsets[] = {  /* table of offsets (short) */
1515                offsetof(struct xfs_btree_block, bb_magic),
1516                offsetof(struct xfs_btree_block, bb_level),
1517                offsetof(struct xfs_btree_block, bb_numrecs),
1518                offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1519                offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1520                offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1521                offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1522                offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1523                offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1524                offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1525                XFS_BTREE_SBLOCK_CRC_LEN
1526        };
1527        static const short      loffsets[] = {  /* table of offsets (long) */
1528                offsetof(struct xfs_btree_block, bb_magic),
1529                offsetof(struct xfs_btree_block, bb_level),
1530                offsetof(struct xfs_btree_block, bb_numrecs),
1531                offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1532                offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1533                offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1534                offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1535                offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1536                offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1537                offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1538                offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1539                XFS_BTREE_LBLOCK_CRC_LEN
1540        };
1541
1542        if (bp) {
1543                int nbits;
1544
1545                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1546                        /*
1547                         * We don't log the CRC when updating a btree
1548                         * block but instead recreate it during log
1549                         * recovery.  As the log buffers have checksums
1550                         * of their own this is safe and avoids logging a crc
1551                         * update in a lot of places.
1552                         */
1553                        if (fields == XFS_BB_ALL_BITS)
1554                                fields = XFS_BB_ALL_BITS_CRC;
1555                        nbits = XFS_BB_NUM_BITS_CRC;
1556                } else {
1557                        nbits = XFS_BB_NUM_BITS;
1558                }
1559                xfs_btree_offsets(fields,
1560                                  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1561                                        loffsets : soffsets,
1562                                  nbits, &first, &last);
1563                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1564                xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1565        } else {
1566                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1567                        xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1568        }
1569}
1570
1571/*
1572 * Increment cursor by one record at the level.
1573 * For nonzero levels the leaf-ward information is untouched.
1574 */
1575int                                             /* error */
1576xfs_btree_increment(
1577        struct xfs_btree_cur    *cur,
1578        int                     level,
1579        int                     *stat)          /* success/failure */
1580{
1581        struct xfs_btree_block  *block;
1582        union xfs_btree_ptr     ptr;
1583        struct xfs_buf          *bp;
1584        int                     error;          /* error return value */
1585        int                     lev;
1586
1587        ASSERT(level < cur->bc_nlevels);
1588
1589        /* Read-ahead to the right at this level. */
1590        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1591
1592        /* Get a pointer to the btree block. */
1593        block = xfs_btree_get_block(cur, level, &bp);
1594
1595#ifdef DEBUG
1596        error = xfs_btree_check_block(cur, block, level, bp);
1597        if (error)
1598                goto error0;
1599#endif
1600
1601        /* We're done if we remain in the block after the increment. */
1602        if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1603                goto out1;
1604
1605        /* Fail if we just went off the right edge of the tree. */
1606        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1607        if (xfs_btree_ptr_is_null(cur, &ptr))
1608                goto out0;
1609
1610        XFS_BTREE_STATS_INC(cur, increment);
1611
1612        /*
1613         * March up the tree incrementing pointers.
1614         * Stop when we don't go off the right edge of a block.
1615         */
1616        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1617                block = xfs_btree_get_block(cur, lev, &bp);
1618
1619#ifdef DEBUG
1620                error = xfs_btree_check_block(cur, block, lev, bp);
1621                if (error)
1622                        goto error0;
1623#endif
1624
1625                if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1626                        break;
1627
1628                /* Read-ahead the right block for the next loop. */
1629                xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1630        }
1631
1632        /*
1633         * If we went off the root then we are either seriously
1634         * confused or have the tree root in an inode.
1635         */
1636        if (lev == cur->bc_nlevels) {
1637                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1638                        goto out0;
1639                ASSERT(0);
1640                error = -EFSCORRUPTED;
1641                goto error0;
1642        }
1643        ASSERT(lev < cur->bc_nlevels);
1644
1645        /*
1646         * Now walk back down the tree, fixing up the cursor's buffer
1647         * pointers and key numbers.
1648         */
1649        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1650                union xfs_btree_ptr     *ptrp;
1651
1652                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1653                --lev;
1654                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1655                if (error)
1656                        goto error0;
1657
1658                xfs_btree_setbuf(cur, lev, bp);
1659                cur->bc_ptrs[lev] = 1;
1660        }
1661out1:
1662        *stat = 1;
1663        return 0;
1664
1665out0:
1666        *stat = 0;
1667        return 0;
1668
1669error0:
1670        return error;
1671}
1672
1673/*
1674 * Decrement cursor by one record at the level.
1675 * For nonzero levels the leaf-ward information is untouched.
1676 */
1677int                                             /* error */
1678xfs_btree_decrement(
1679        struct xfs_btree_cur    *cur,
1680        int                     level,
1681        int                     *stat)          /* success/failure */
1682{
1683        struct xfs_btree_block  *block;
1684        xfs_buf_t               *bp;
1685        int                     error;          /* error return value */
1686        int                     lev;
1687        union xfs_btree_ptr     ptr;
1688
1689        ASSERT(level < cur->bc_nlevels);
1690
1691        /* Read-ahead to the left at this level. */
1692        xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1693
1694        /* We're done if we remain in the block after the decrement. */
1695        if (--cur->bc_ptrs[level] > 0)
1696                goto out1;
1697
1698        /* Get a pointer to the btree block. */
1699        block = xfs_btree_get_block(cur, level, &bp);
1700
1701#ifdef DEBUG
1702        error = xfs_btree_check_block(cur, block, level, bp);
1703        if (error)
1704                goto error0;
1705#endif
1706
1707        /* Fail if we just went off the left edge of the tree. */
1708        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1709        if (xfs_btree_ptr_is_null(cur, &ptr))
1710                goto out0;
1711
1712        XFS_BTREE_STATS_INC(cur, decrement);
1713
1714        /*
1715         * March up the tree decrementing pointers.
1716         * Stop when we don't go off the left edge of a block.
1717         */
1718        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1719                if (--cur->bc_ptrs[lev] > 0)
1720                        break;
1721                /* Read-ahead the left block for the next loop. */
1722                xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1723        }
1724
1725        /*
1726         * If we went off the root then we are seriously confused.
1727         * or the root of the tree is in an inode.
1728         */
1729        if (lev == cur->bc_nlevels) {
1730                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1731                        goto out0;
1732                ASSERT(0);
1733                error = -EFSCORRUPTED;
1734                goto error0;
1735        }
1736        ASSERT(lev < cur->bc_nlevels);
1737
1738        /*
1739         * Now walk back down the tree, fixing up the cursor's buffer
1740         * pointers and key numbers.
1741         */
1742        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1743                union xfs_btree_ptr     *ptrp;
1744
1745                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1746                --lev;
1747                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1748                if (error)
1749                        goto error0;
1750                xfs_btree_setbuf(cur, lev, bp);
1751                cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1752        }
1753out1:
1754        *stat = 1;
1755        return 0;
1756
1757out0:
1758        *stat = 0;
1759        return 0;
1760
1761error0:
1762        return error;
1763}
1764
1765int
1766xfs_btree_lookup_get_block(
1767        struct xfs_btree_cur    *cur,   /* btree cursor */
1768        int                     level,  /* level in the btree */
1769        union xfs_btree_ptr     *pp,    /* ptr to btree block */
1770        struct xfs_btree_block  **blkp) /* return btree block */
1771{
1772        struct xfs_buf          *bp;    /* buffer pointer for btree block */
1773        xfs_daddr_t             daddr;
1774        int                     error = 0;
1775
1776        /* special case the root block if in an inode */
1777        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1778            (level == cur->bc_nlevels - 1)) {
1779                *blkp = xfs_btree_get_iroot(cur);
1780                return 0;
1781        }
1782
1783        /*
1784         * If the old buffer at this level for the disk address we are
1785         * looking for re-use it.
1786         *
1787         * Otherwise throw it away and get a new one.
1788         */
1789        bp = cur->bc_bufs[level];
1790        error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
1791        if (error)
1792                return error;
1793        if (bp && XFS_BUF_ADDR(bp) == daddr) {
1794                *blkp = XFS_BUF_TO_BLOCK(bp);
1795                return 0;
1796        }
1797
1798        error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1799        if (error)
1800                return error;
1801
1802        /* Check the inode owner since the verifiers don't. */
1803        if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
1804            !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
1805            (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
1806            be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
1807                        cur->bc_private.b.ip->i_ino)
1808                goto out_bad;
1809
1810        /* Did we get the level we were looking for? */
1811        if (be16_to_cpu((*blkp)->bb_level) != level)
1812                goto out_bad;
1813
1814        /* Check that internal nodes have at least one record. */
1815        if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
1816                goto out_bad;
1817
1818        xfs_btree_setbuf(cur, level, bp);
1819        return 0;
1820
1821out_bad:
1822        *blkp = NULL;
1823        xfs_trans_brelse(cur->bc_tp, bp);
1824        return -EFSCORRUPTED;
1825}
1826
1827/*
1828 * Get current search key.  For level 0 we don't actually have a key
1829 * structure so we make one up from the record.  For all other levels
1830 * we just return the right key.
1831 */
1832STATIC union xfs_btree_key *
1833xfs_lookup_get_search_key(
1834        struct xfs_btree_cur    *cur,
1835        int                     level,
1836        int                     keyno,
1837        struct xfs_btree_block  *block,
1838        union xfs_btree_key     *kp)
1839{
1840        if (level == 0) {
1841                cur->bc_ops->init_key_from_rec(kp,
1842                                xfs_btree_rec_addr(cur, keyno, block));
1843                return kp;
1844        }
1845
1846        return xfs_btree_key_addr(cur, keyno, block);
1847}
1848
1849/*
1850 * Lookup the record.  The cursor is made to point to it, based on dir.
1851 * stat is set to 0 if can't find any such record, 1 for success.
1852 */
1853int                                     /* error */
1854xfs_btree_lookup(
1855        struct xfs_btree_cur    *cur,   /* btree cursor */
1856        xfs_lookup_t            dir,    /* <=, ==, or >= */
1857        int                     *stat)  /* success/failure */
1858{
1859        struct xfs_btree_block  *block; /* current btree block */
1860        int64_t                 diff;   /* difference for the current key */
1861        int                     error;  /* error return value */
1862        int                     keyno;  /* current key number */
1863        int                     level;  /* level in the btree */
1864        union xfs_btree_ptr     *pp;    /* ptr to btree block */
1865        union xfs_btree_ptr     ptr;    /* ptr to btree block */
1866
1867        XFS_BTREE_STATS_INC(cur, lookup);
1868
1869        /* No such thing as a zero-level tree. */
1870        if (cur->bc_nlevels == 0)
1871                return -EFSCORRUPTED;
1872
1873        block = NULL;
1874        keyno = 0;
1875
1876        /* initialise start pointer from cursor */
1877        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1878        pp = &ptr;
1879
1880        /*
1881         * Iterate over each level in the btree, starting at the root.
1882         * For each level above the leaves, find the key we need, based
1883         * on the lookup record, then follow the corresponding block
1884         * pointer down to the next level.
1885         */
1886        for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1887                /* Get the block we need to do the lookup on. */
1888                error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1889                if (error)
1890                        goto error0;
1891
1892                if (diff == 0) {
1893                        /*
1894                         * If we already had a key match at a higher level, we
1895                         * know we need to use the first entry in this block.
1896                         */
1897                        keyno = 1;
1898                } else {
1899                        /* Otherwise search this block. Do a binary search. */
1900
1901                        int     high;   /* high entry number */
1902                        int     low;    /* low entry number */
1903
1904                        /* Set low and high entry numbers, 1-based. */
1905                        low = 1;
1906                        high = xfs_btree_get_numrecs(block);
1907                        if (!high) {
1908                                /* Block is empty, must be an empty leaf. */
1909                                if (level != 0 || cur->bc_nlevels != 1) {
1910                                        XFS_CORRUPTION_ERROR(__func__,
1911                                                        XFS_ERRLEVEL_LOW,
1912                                                        cur->bc_mp, block,
1913                                                        sizeof(*block));
1914                                        return -EFSCORRUPTED;
1915                                }
1916
1917                                cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1918                                *stat = 0;
1919                                return 0;
1920                        }
1921
1922                        /* Binary search the block. */
1923                        while (low <= high) {
1924                                union xfs_btree_key     key;
1925                                union xfs_btree_key     *kp;
1926
1927                                XFS_BTREE_STATS_INC(cur, compare);
1928
1929                                /* keyno is average of low and high. */
1930                                keyno = (low + high) >> 1;
1931
1932                                /* Get current search key */
1933                                kp = xfs_lookup_get_search_key(cur, level,
1934                                                keyno, block, &key);
1935
1936                                /*
1937                                 * Compute difference to get next direction:
1938                                 *  - less than, move right
1939                                 *  - greater than, move left
1940                                 *  - equal, we're done
1941                                 */
1942                                diff = cur->bc_ops->key_diff(cur, kp);
1943                                if (diff < 0)
1944                                        low = keyno + 1;
1945                                else if (diff > 0)
1946                                        high = keyno - 1;
1947                                else
1948                                        break;
1949                        }
1950                }
1951
1952                /*
1953                 * If there are more levels, set up for the next level
1954                 * by getting the block number and filling in the cursor.
1955                 */
1956                if (level > 0) {
1957                        /*
1958                         * If we moved left, need the previous key number,
1959                         * unless there isn't one.
1960                         */
1961                        if (diff > 0 && --keyno < 1)
1962                                keyno = 1;
1963                        pp = xfs_btree_ptr_addr(cur, keyno, block);
1964
1965                        error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
1966                        if (error)
1967                                goto error0;
1968
1969                        cur->bc_ptrs[level] = keyno;
1970                }
1971        }
1972
1973        /* Done with the search. See if we need to adjust the results. */
1974        if (dir != XFS_LOOKUP_LE && diff < 0) {
1975                keyno++;
1976                /*
1977                 * If ge search and we went off the end of the block, but it's
1978                 * not the last block, we're in the wrong block.
1979                 */
1980                xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1981                if (dir == XFS_LOOKUP_GE &&
1982                    keyno > xfs_btree_get_numrecs(block) &&
1983                    !xfs_btree_ptr_is_null(cur, &ptr)) {
1984                        int     i;
1985
1986                        cur->bc_ptrs[0] = keyno;
1987                        error = xfs_btree_increment(cur, 0, &i);
1988                        if (error)
1989                                goto error0;
1990                        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1991                        *stat = 1;
1992                        return 0;
1993                }
1994        } else if (dir == XFS_LOOKUP_LE && diff > 0)
1995                keyno--;
1996        cur->bc_ptrs[0] = keyno;
1997
1998        /* Return if we succeeded or not. */
1999        if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
2000                *stat = 0;
2001        else if (dir != XFS_LOOKUP_EQ || diff == 0)
2002                *stat = 1;
2003        else
2004                *stat = 0;
2005        return 0;
2006
2007error0:
2008        return error;
2009}
2010
2011/* Find the high key storage area from a regular key. */
2012union xfs_btree_key *
2013xfs_btree_high_key_from_key(
2014        struct xfs_btree_cur    *cur,
2015        union xfs_btree_key     *key)
2016{
2017        ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2018        return (union xfs_btree_key *)((char *)key +
2019                        (cur->bc_ops->key_len / 2));
2020}
2021
2022/* Determine the low (and high if overlapped) keys of a leaf block */
2023STATIC void
2024xfs_btree_get_leaf_keys(
2025        struct xfs_btree_cur    *cur,
2026        struct xfs_btree_block  *block,
2027        union xfs_btree_key     *key)
2028{
2029        union xfs_btree_key     max_hkey;
2030        union xfs_btree_key     hkey;
2031        union xfs_btree_rec     *rec;
2032        union xfs_btree_key     *high;
2033        int                     n;
2034
2035        rec = xfs_btree_rec_addr(cur, 1, block);
2036        cur->bc_ops->init_key_from_rec(key, rec);
2037
2038        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2039
2040                cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
2041                for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2042                        rec = xfs_btree_rec_addr(cur, n, block);
2043                        cur->bc_ops->init_high_key_from_rec(&hkey, rec);
2044                        if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
2045                                        > 0)
2046                                max_hkey = hkey;
2047                }
2048
2049                high = xfs_btree_high_key_from_key(cur, key);
2050                memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2051        }
2052}
2053
2054/* Determine the low (and high if overlapped) keys of a node block */
2055STATIC void
2056xfs_btree_get_node_keys(
2057        struct xfs_btree_cur    *cur,
2058        struct xfs_btree_block  *block,
2059        union xfs_btree_key     *key)
2060{
2061        union xfs_btree_key     *hkey;
2062        union xfs_btree_key     *max_hkey;
2063        union xfs_btree_key     *high;
2064        int                     n;
2065
2066        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2067                memcpy(key, xfs_btree_key_addr(cur, 1, block),
2068                                cur->bc_ops->key_len / 2);
2069
2070                max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2071                for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2072                        hkey = xfs_btree_high_key_addr(cur, n, block);
2073                        if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
2074                                max_hkey = hkey;
2075                }
2076
2077                high = xfs_btree_high_key_from_key(cur, key);
2078                memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2079        } else {
2080                memcpy(key, xfs_btree_key_addr(cur, 1, block),
2081                                cur->bc_ops->key_len);
2082        }
2083}
2084
2085/* Derive the keys for any btree block. */
2086void
2087xfs_btree_get_keys(
2088        struct xfs_btree_cur    *cur,
2089        struct xfs_btree_block  *block,
2090        union xfs_btree_key     *key)
2091{
2092        if (be16_to_cpu(block->bb_level) == 0)
2093                xfs_btree_get_leaf_keys(cur, block, key);
2094        else
2095                xfs_btree_get_node_keys(cur, block, key);
2096}
2097
2098/*
2099 * Decide if we need to update the parent keys of a btree block.  For
2100 * a standard btree this is only necessary if we're updating the first
2101 * record/key.  For an overlapping btree, we must always update the
2102 * keys because the highest key can be in any of the records or keys
2103 * in the block.
2104 */
2105static inline bool
2106xfs_btree_needs_key_update(
2107        struct xfs_btree_cur    *cur,
2108        int                     ptr)
2109{
2110        return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
2111}
2112
2113/*
2114 * Update the low and high parent keys of the given level, progressing
2115 * towards the root.  If force_all is false, stop if the keys for a given
2116 * level do not need updating.
2117 */
2118STATIC int
2119__xfs_btree_updkeys(
2120        struct xfs_btree_cur    *cur,
2121        int                     level,
2122        struct xfs_btree_block  *block,
2123        struct xfs_buf          *bp0,
2124        bool                    force_all)
2125{
2126        union xfs_btree_key     key;    /* keys from current level */
2127        union xfs_btree_key     *lkey;  /* keys from the next level up */
2128        union xfs_btree_key     *hkey;
2129        union xfs_btree_key     *nlkey; /* keys from the next level up */
2130        union xfs_btree_key     *nhkey;
2131        struct xfs_buf          *bp;
2132        int                     ptr;
2133
2134        ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2135
2136        /* Exit if there aren't any parent levels to update. */
2137        if (level + 1 >= cur->bc_nlevels)
2138                return 0;
2139
2140        trace_xfs_btree_updkeys(cur, level, bp0);
2141
2142        lkey = &key;
2143        hkey = xfs_btree_high_key_from_key(cur, lkey);
2144        xfs_btree_get_keys(cur, block, lkey);
2145        for (level++; level < cur->bc_nlevels; level++) {
2146#ifdef DEBUG
2147                int             error;
2148#endif
2149                block = xfs_btree_get_block(cur, level, &bp);
2150                trace_xfs_btree_updkeys(cur, level, bp);
2151#ifdef DEBUG
2152                error = xfs_btree_check_block(cur, block, level, bp);
2153                if (error)
2154                        return error;
2155#endif
2156                ptr = cur->bc_ptrs[level];
2157                nlkey = xfs_btree_key_addr(cur, ptr, block);
2158                nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2159                if (!force_all &&
2160                    !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
2161                      cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
2162                        break;
2163                xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2164                xfs_btree_log_keys(cur, bp, ptr, ptr);
2165                if (level + 1 >= cur->bc_nlevels)
2166                        break;
2167                xfs_btree_get_node_keys(cur, block, lkey);
2168        }
2169
2170        return 0;
2171}
2172
2173/* Update all the keys from some level in cursor back to the root. */
2174STATIC int
2175xfs_btree_updkeys_force(
2176        struct xfs_btree_cur    *cur,
2177        int                     level)
2178{
2179        struct xfs_buf          *bp;
2180        struct xfs_btree_block  *block;
2181
2182        block = xfs_btree_get_block(cur, level, &bp);
2183        return __xfs_btree_updkeys(cur, level, block, bp, true);
2184}
2185
2186/*
2187 * Update the parent keys of the given level, progressing towards the root.
2188 */
2189STATIC int
2190xfs_btree_update_keys(
2191        struct xfs_btree_cur    *cur,
2192        int                     level)
2193{
2194        struct xfs_btree_block  *block;
2195        struct xfs_buf          *bp;
2196        union xfs_btree_key     *kp;
2197        union xfs_btree_key     key;
2198        int                     ptr;
2199
2200        ASSERT(level >= 0);
2201
2202        block = xfs_btree_get_block(cur, level, &bp);
2203        if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
2204                return __xfs_btree_updkeys(cur, level, block, bp, false);
2205
2206        /*
2207         * Go up the tree from this level toward the root.
2208         * At each level, update the key value to the value input.
2209         * Stop when we reach a level where the cursor isn't pointing
2210         * at the first entry in the block.
2211         */
2212        xfs_btree_get_keys(cur, block, &key);
2213        for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2214#ifdef DEBUG
2215                int             error;
2216#endif
2217                block = xfs_btree_get_block(cur, level, &bp);
2218#ifdef DEBUG
2219                error = xfs_btree_check_block(cur, block, level, bp);
2220                if (error)
2221                        return error;
2222#endif
2223                ptr = cur->bc_ptrs[level];
2224                kp = xfs_btree_key_addr(cur, ptr, block);
2225                xfs_btree_copy_keys(cur, kp, &key, 1);
2226                xfs_btree_log_keys(cur, bp, ptr, ptr);
2227        }
2228
2229        return 0;
2230}
2231
2232/*
2233 * Update the record referred to by cur to the value in the
2234 * given record. This either works (return 0) or gets an
2235 * EFSCORRUPTED error.
2236 */
2237int
2238xfs_btree_update(
2239        struct xfs_btree_cur    *cur,
2240        union xfs_btree_rec     *rec)
2241{
2242        struct xfs_btree_block  *block;
2243        struct xfs_buf          *bp;
2244        int                     error;
2245        int                     ptr;
2246        union xfs_btree_rec     *rp;
2247
2248        /* Pick up the current block. */
2249        block = xfs_btree_get_block(cur, 0, &bp);
2250
2251#ifdef DEBUG
2252        error = xfs_btree_check_block(cur, block, 0, bp);
2253        if (error)
2254                goto error0;
2255#endif
2256        /* Get the address of the rec to be updated. */
2257        ptr = cur->bc_ptrs[0];
2258        rp = xfs_btree_rec_addr(cur, ptr, block);
2259
2260        /* Fill in the new contents and log them. */
2261        xfs_btree_copy_recs(cur, rp, rec, 1);
2262        xfs_btree_log_recs(cur, bp, ptr, ptr);
2263
2264        /*
2265         * If we are tracking the last record in the tree and
2266         * we are at the far right edge of the tree, update it.
2267         */
2268        if (xfs_btree_is_lastrec(cur, block, 0)) {
2269                cur->bc_ops->update_lastrec(cur, block, rec,
2270                                            ptr, LASTREC_UPDATE);
2271        }
2272
2273        /* Pass new key value up to our parent. */
2274        if (xfs_btree_needs_key_update(cur, ptr)) {
2275                error = xfs_btree_update_keys(cur, 0);
2276                if (error)
2277                        goto error0;
2278        }
2279
2280        return 0;
2281
2282error0:
2283        return error;
2284}
2285
2286/*
2287 * Move 1 record left from cur/level if possible.
2288 * Update cur to reflect the new path.
2289 */
2290STATIC int                                      /* error */
2291xfs_btree_lshift(
2292        struct xfs_btree_cur    *cur,
2293        int                     level,
2294        int                     *stat)          /* success/failure */
2295{
2296        struct xfs_buf          *lbp;           /* left buffer pointer */
2297        struct xfs_btree_block  *left;          /* left btree block */
2298        int                     lrecs;          /* left record count */
2299        struct xfs_buf          *rbp;           /* right buffer pointer */
2300        struct xfs_btree_block  *right;         /* right btree block */
2301        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2302        int                     rrecs;          /* right record count */
2303        union xfs_btree_ptr     lptr;           /* left btree pointer */
2304        union xfs_btree_key     *rkp = NULL;    /* right btree key */
2305        union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
2306        union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
2307        int                     error;          /* error return value */
2308        int                     i;
2309
2310        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2311            level == cur->bc_nlevels - 1)
2312                goto out0;
2313
2314        /* Set up variables for this block as "right". */
2315        right = xfs_btree_get_block(cur, level, &rbp);
2316
2317#ifdef DEBUG
2318        error = xfs_btree_check_block(cur, right, level, rbp);
2319        if (error)
2320                goto error0;
2321#endif
2322
2323        /* If we've got no left sibling then we can't shift an entry left. */
2324        xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2325        if (xfs_btree_ptr_is_null(cur, &lptr))
2326                goto out0;
2327
2328        /*
2329         * If the cursor entry is the one that would be moved, don't
2330         * do it... it's too complicated.
2331         */
2332        if (cur->bc_ptrs[level] <= 1)
2333                goto out0;
2334
2335        /* Set up the left neighbor as "left". */
2336        error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2337        if (error)
2338                goto error0;
2339
2340        /* If it's full, it can't take another entry. */
2341        lrecs = xfs_btree_get_numrecs(left);
2342        if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2343                goto out0;
2344
2345        rrecs = xfs_btree_get_numrecs(right);
2346
2347        /*
2348         * We add one entry to the left side and remove one for the right side.
2349         * Account for it here, the changes will be updated on disk and logged
2350         * later.
2351         */
2352        lrecs++;
2353        rrecs--;
2354
2355        XFS_BTREE_STATS_INC(cur, lshift);
2356        XFS_BTREE_STATS_ADD(cur, moves, 1);
2357
2358        /*
2359         * If non-leaf, copy a key and a ptr to the left block.
2360         * Log the changes to the left block.
2361         */
2362        if (level > 0) {
2363                /* It's a non-leaf.  Move keys and pointers. */
2364                union xfs_btree_key     *lkp;   /* left btree key */
2365                union xfs_btree_ptr     *lpp;   /* left address pointer */
2366
2367                lkp = xfs_btree_key_addr(cur, lrecs, left);
2368                rkp = xfs_btree_key_addr(cur, 1, right);
2369
2370                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2371                rpp = xfs_btree_ptr_addr(cur, 1, right);
2372
2373                error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
2374                if (error)
2375                        goto error0;
2376
2377                xfs_btree_copy_keys(cur, lkp, rkp, 1);
2378                xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2379
2380                xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2381                xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2382
2383                ASSERT(cur->bc_ops->keys_inorder(cur,
2384                        xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2385        } else {
2386                /* It's a leaf.  Move records.  */
2387                union xfs_btree_rec     *lrp;   /* left record pointer */
2388
2389                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2390                rrp = xfs_btree_rec_addr(cur, 1, right);
2391
2392                xfs_btree_copy_recs(cur, lrp, rrp, 1);
2393                xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2394
2395                ASSERT(cur->bc_ops->recs_inorder(cur,
2396                        xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2397        }
2398
2399        xfs_btree_set_numrecs(left, lrecs);
2400        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2401
2402        xfs_btree_set_numrecs(right, rrecs);
2403        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2404
2405        /*
2406         * Slide the contents of right down one entry.
2407         */
2408        XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2409        if (level > 0) {
2410                /* It's a nonleaf. operate on keys and ptrs */
2411                int                     i;              /* loop index */
2412
2413                for (i = 0; i < rrecs; i++) {
2414                        error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
2415                        if (error)
2416                                goto error0;
2417                }
2418
2419                xfs_btree_shift_keys(cur,
2420                                xfs_btree_key_addr(cur, 2, right),
2421                                -1, rrecs);
2422                xfs_btree_shift_ptrs(cur,
2423                                xfs_btree_ptr_addr(cur, 2, right),
2424                                -1, rrecs);
2425
2426                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2427                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2428        } else {
2429                /* It's a leaf. operate on records */
2430                xfs_btree_shift_recs(cur,
2431                        xfs_btree_rec_addr(cur, 2, right),
2432                        -1, rrecs);
2433                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2434        }
2435
2436        /*
2437         * Using a temporary cursor, update the parent key values of the
2438         * block on the left.
2439         */
2440        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2441                error = xfs_btree_dup_cursor(cur, &tcur);
2442                if (error)
2443                        goto error0;
2444                i = xfs_btree_firstrec(tcur, level);
2445                XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2446
2447                error = xfs_btree_decrement(tcur, level, &i);
2448                if (error)
2449                        goto error1;
2450
2451                /* Update the parent high keys of the left block, if needed. */
2452                error = xfs_btree_update_keys(tcur, level);
2453                if (error)
2454                        goto error1;
2455
2456                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2457        }
2458
2459        /* Update the parent keys of the right block. */
2460        error = xfs_btree_update_keys(cur, level);
2461        if (error)
2462                goto error0;
2463
2464        /* Slide the cursor value left one. */
2465        cur->bc_ptrs[level]--;
2466
2467        *stat = 1;
2468        return 0;
2469
2470out0:
2471        *stat = 0;
2472        return 0;
2473
2474error0:
2475        return error;
2476
2477error1:
2478        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2479        return error;
2480}
2481
2482/*
2483 * Move 1 record right from cur/level if possible.
2484 * Update cur to reflect the new path.
2485 */
2486STATIC int                                      /* error */
2487xfs_btree_rshift(
2488        struct xfs_btree_cur    *cur,
2489        int                     level,
2490        int                     *stat)          /* success/failure */
2491{
2492        struct xfs_buf          *lbp;           /* left buffer pointer */
2493        struct xfs_btree_block  *left;          /* left btree block */
2494        struct xfs_buf          *rbp;           /* right buffer pointer */
2495        struct xfs_btree_block  *right;         /* right btree block */
2496        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2497        union xfs_btree_ptr     rptr;           /* right block pointer */
2498        union xfs_btree_key     *rkp;           /* right btree key */
2499        int                     rrecs;          /* right record count */
2500        int                     lrecs;          /* left record count */
2501        int                     error;          /* error return value */
2502        int                     i;              /* loop counter */
2503
2504        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2505            (level == cur->bc_nlevels - 1))
2506                goto out0;
2507
2508        /* Set up variables for this block as "left". */
2509        left = xfs_btree_get_block(cur, level, &lbp);
2510
2511#ifdef DEBUG
2512        error = xfs_btree_check_block(cur, left, level, lbp);
2513        if (error)
2514                goto error0;
2515#endif
2516
2517        /* If we've got no right sibling then we can't shift an entry right. */
2518        xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2519        if (xfs_btree_ptr_is_null(cur, &rptr))
2520                goto out0;
2521
2522        /*
2523         * If the cursor entry is the one that would be moved, don't
2524         * do it... it's too complicated.
2525         */
2526        lrecs = xfs_btree_get_numrecs(left);
2527        if (cur->bc_ptrs[level] >= lrecs)
2528                goto out0;
2529
2530        /* Set up the right neighbor as "right". */
2531        error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2532        if (error)
2533                goto error0;
2534
2535        /* If it's full, it can't take another entry. */
2536        rrecs = xfs_btree_get_numrecs(right);
2537        if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2538                goto out0;
2539
2540        XFS_BTREE_STATS_INC(cur, rshift);
2541        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2542
2543        /*
2544         * Make a hole at the start of the right neighbor block, then
2545         * copy the last left block entry to the hole.
2546         */
2547        if (level > 0) {
2548                /* It's a nonleaf. make a hole in the keys and ptrs */
2549                union xfs_btree_key     *lkp;
2550                union xfs_btree_ptr     *lpp;
2551                union xfs_btree_ptr     *rpp;
2552
2553                lkp = xfs_btree_key_addr(cur, lrecs, left);
2554                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2555                rkp = xfs_btree_key_addr(cur, 1, right);
2556                rpp = xfs_btree_ptr_addr(cur, 1, right);
2557
2558                for (i = rrecs - 1; i >= 0; i--) {
2559                        error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
2560                        if (error)
2561                                goto error0;
2562                }
2563
2564                xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2565                xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2566
2567                error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
2568                if (error)
2569                        goto error0;
2570
2571                /* Now put the new data in, and log it. */
2572                xfs_btree_copy_keys(cur, rkp, lkp, 1);
2573                xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2574
2575                xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2576                xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2577
2578                ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2579                        xfs_btree_key_addr(cur, 2, right)));
2580        } else {
2581                /* It's a leaf. make a hole in the records */
2582                union xfs_btree_rec     *lrp;
2583                union xfs_btree_rec     *rrp;
2584
2585                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2586                rrp = xfs_btree_rec_addr(cur, 1, right);
2587
2588                xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2589
2590                /* Now put the new data in, and log it. */
2591                xfs_btree_copy_recs(cur, rrp, lrp, 1);
2592                xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2593        }
2594
2595        /*
2596         * Decrement and log left's numrecs, bump and log right's numrecs.
2597         */
2598        xfs_btree_set_numrecs(left, --lrecs);
2599        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2600
2601        xfs_btree_set_numrecs(right, ++rrecs);
2602        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2603
2604        /*
2605         * Using a temporary cursor, update the parent key values of the
2606         * block on the right.
2607         */
2608        error = xfs_btree_dup_cursor(cur, &tcur);
2609        if (error)
2610                goto error0;
2611        i = xfs_btree_lastrec(tcur, level);
2612        XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2613
2614        error = xfs_btree_increment(tcur, level, &i);
2615        if (error)
2616                goto error1;
2617
2618        /* Update the parent high keys of the left block, if needed. */
2619        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2620                error = xfs_btree_update_keys(cur, level);
2621                if (error)
2622                        goto error1;
2623        }
2624
2625        /* Update the parent keys of the right block. */
2626        error = xfs_btree_update_keys(tcur, level);
2627        if (error)
2628                goto error1;
2629
2630        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2631
2632        *stat = 1;
2633        return 0;
2634
2635out0:
2636        *stat = 0;
2637        return 0;
2638
2639error0:
2640        return error;
2641
2642error1:
2643        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2644        return error;
2645}
2646
2647/*
2648 * Split cur/level block in half.
2649 * Return new block number and the key to its first
2650 * record (to be inserted into parent).
2651 */
2652STATIC int                                      /* error */
2653__xfs_btree_split(
2654        struct xfs_btree_cur    *cur,
2655        int                     level,
2656        union xfs_btree_ptr     *ptrp,
2657        union xfs_btree_key     *key,
2658        struct xfs_btree_cur    **curp,
2659        int                     *stat)          /* success/failure */
2660{
2661        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2662        struct xfs_buf          *lbp;           /* left buffer pointer */
2663        struct xfs_btree_block  *left;          /* left btree block */
2664        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2665        struct xfs_buf          *rbp;           /* right buffer pointer */
2666        struct xfs_btree_block  *right;         /* right btree block */
2667        union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2668        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2669        struct xfs_btree_block  *rrblock;       /* right-right btree block */
2670        int                     lrecs;
2671        int                     rrecs;
2672        int                     src_index;
2673        int                     error;          /* error return value */
2674        int                     i;
2675
2676        XFS_BTREE_STATS_INC(cur, split);
2677
2678        /* Set up left block (current one). */
2679        left = xfs_btree_get_block(cur, level, &lbp);
2680
2681#ifdef DEBUG
2682        error = xfs_btree_check_block(cur, left, level, lbp);
2683        if (error)
2684                goto error0;
2685#endif
2686
2687        xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2688
2689        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2690        error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2691        if (error)
2692                goto error0;
2693        if (*stat == 0)
2694                goto out0;
2695        XFS_BTREE_STATS_INC(cur, alloc);
2696
2697        /* Set up the new block as "right". */
2698        error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
2699        if (error)
2700                goto error0;
2701
2702        /* Fill in the btree header for the new right block. */
2703        xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2704
2705        /*
2706         * Split the entries between the old and the new block evenly.
2707         * Make sure that if there's an odd number of entries now, that
2708         * each new block will have the same number of entries.
2709         */
2710        lrecs = xfs_btree_get_numrecs(left);
2711        rrecs = lrecs / 2;
2712        if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2713                rrecs++;
2714        src_index = (lrecs - rrecs + 1);
2715
2716        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2717
2718        /* Adjust numrecs for the later get_*_keys() calls. */
2719        lrecs -= rrecs;
2720        xfs_btree_set_numrecs(left, lrecs);
2721        xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2722
2723        /*
2724         * Copy btree block entries from the left block over to the
2725         * new block, the right. Update the right block and log the
2726         * changes.
2727         */
2728        if (level > 0) {
2729                /* It's a non-leaf.  Move keys and pointers. */
2730                union xfs_btree_key     *lkp;   /* left btree key */
2731                union xfs_btree_ptr     *lpp;   /* left address pointer */
2732                union xfs_btree_key     *rkp;   /* right btree key */
2733                union xfs_btree_ptr     *rpp;   /* right address pointer */
2734
2735                lkp = xfs_btree_key_addr(cur, src_index, left);
2736                lpp = xfs_btree_ptr_addr(cur, src_index, left);
2737                rkp = xfs_btree_key_addr(cur, 1, right);
2738                rpp = xfs_btree_ptr_addr(cur, 1, right);
2739
2740                for (i = src_index; i < rrecs; i++) {
2741                        error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
2742                        if (error)
2743                                goto error0;
2744                }
2745
2746                /* Copy the keys & pointers to the new block. */
2747                xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2748                xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2749
2750                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2751                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2752
2753                /* Stash the keys of the new block for later insertion. */
2754                xfs_btree_get_node_keys(cur, right, key);
2755        } else {
2756                /* It's a leaf.  Move records.  */
2757                union xfs_btree_rec     *lrp;   /* left record pointer */
2758                union xfs_btree_rec     *rrp;   /* right record pointer */
2759
2760                lrp = xfs_btree_rec_addr(cur, src_index, left);
2761                rrp = xfs_btree_rec_addr(cur, 1, right);
2762
2763                /* Copy records to the new block. */
2764                xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2765                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2766
2767                /* Stash the keys of the new block for later insertion. */
2768                xfs_btree_get_leaf_keys(cur, right, key);
2769        }
2770
2771        /*
2772         * Find the left block number by looking in the buffer.
2773         * Adjust sibling pointers.
2774         */
2775        xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2776        xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2777        xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2778        xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2779
2780        xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2781        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2782
2783        /*
2784         * If there's a block to the new block's right, make that block
2785         * point back to right instead of to left.
2786         */
2787        if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2788                error = xfs_btree_read_buf_block(cur, &rrptr,
2789                                                        0, &rrblock, &rrbp);
2790                if (error)
2791                        goto error0;
2792                xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2793                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2794        }
2795
2796        /* Update the parent high keys of the left block, if needed. */
2797        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2798                error = xfs_btree_update_keys(cur, level);
2799                if (error)
2800                        goto error0;
2801        }
2802
2803        /*
2804         * If the cursor is really in the right block, move it there.
2805         * If it's just pointing past the last entry in left, then we'll
2806         * insert there, so don't change anything in that case.
2807         */
2808        if (cur->bc_ptrs[level] > lrecs + 1) {
2809                xfs_btree_setbuf(cur, level, rbp);
2810                cur->bc_ptrs[level] -= lrecs;
2811        }
2812        /*
2813         * If there are more levels, we'll need another cursor which refers
2814         * the right block, no matter where this cursor was.
2815         */
2816        if (level + 1 < cur->bc_nlevels) {
2817                error = xfs_btree_dup_cursor(cur, curp);
2818                if (error)
2819                        goto error0;
2820                (*curp)->bc_ptrs[level + 1]++;
2821        }
2822        *ptrp = rptr;
2823        *stat = 1;
2824        return 0;
2825out0:
2826        *stat = 0;
2827        return 0;
2828
2829error0:
2830        return error;
2831}
2832
2833struct xfs_btree_split_args {
2834        struct xfs_btree_cur    *cur;
2835        int                     level;
2836        union xfs_btree_ptr     *ptrp;
2837        union xfs_btree_key     *key;
2838        struct xfs_btree_cur    **curp;
2839        int                     *stat;          /* success/failure */
2840        int                     result;
2841        bool                    kswapd; /* allocation in kswapd context */
2842        struct completion       *done;
2843        struct work_struct      work;
2844};
2845
2846/*
2847 * Stack switching interfaces for allocation
2848 */
2849static void
2850xfs_btree_split_worker(
2851        struct work_struct      *work)
2852{
2853        struct xfs_btree_split_args     *args = container_of(work,
2854                                                struct xfs_btree_split_args, work);
2855        unsigned long           pflags;
2856        unsigned long           new_pflags = PF_MEMALLOC_NOFS;
2857
2858        /*
2859         * we are in a transaction context here, but may also be doing work
2860         * in kswapd context, and hence we may need to inherit that state
2861         * temporarily to ensure that we don't block waiting for memory reclaim
2862         * in any way.
2863         */
2864        if (args->kswapd)
2865                new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2866
2867        current_set_flags_nested(&pflags, new_pflags);
2868
2869        args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2870                                         args->key, args->curp, args->stat);
2871        complete(args->done);
2872
2873        current_restore_flags_nested(&pflags, new_pflags);
2874}
2875
2876/*
2877 * BMBT split requests often come in with little stack to work on. Push
2878 * them off to a worker thread so there is lots of stack to use. For the other
2879 * btree types, just call directly to avoid the context switch overhead here.
2880 */
2881STATIC int                                      /* error */
2882xfs_btree_split(
2883        struct xfs_btree_cur    *cur,
2884        int                     level,
2885        union xfs_btree_ptr     *ptrp,
2886        union xfs_btree_key     *key,
2887        struct xfs_btree_cur    **curp,
2888        int                     *stat)          /* success/failure */
2889{
2890        struct xfs_btree_split_args     args;
2891        DECLARE_COMPLETION_ONSTACK(done);
2892
2893        if (cur->bc_btnum != XFS_BTNUM_BMAP)
2894                return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2895
2896        args.cur = cur;
2897        args.level = level;
2898        args.ptrp = ptrp;
2899        args.key = key;
2900        args.curp = curp;
2901        args.stat = stat;
2902        args.done = &done;
2903        args.kswapd = current_is_kswapd();
2904        INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2905        queue_work(xfs_alloc_wq, &args.work);
2906        wait_for_completion(&done);
2907        destroy_work_on_stack(&args.work);
2908        return args.result;
2909}
2910
2911
2912/*
2913 * Copy the old inode root contents into a real block and make the
2914 * broot point to it.
2915 */
2916int                                             /* error */
2917xfs_btree_new_iroot(
2918        struct xfs_btree_cur    *cur,           /* btree cursor */
2919        int                     *logflags,      /* logging flags for inode */
2920        int                     *stat)          /* return status - 0 fail */
2921{
2922        struct xfs_buf          *cbp;           /* buffer for cblock */
2923        struct xfs_btree_block  *block;         /* btree block */
2924        struct xfs_btree_block  *cblock;        /* child btree block */
2925        union xfs_btree_key     *ckp;           /* child key pointer */
2926        union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2927        union xfs_btree_key     *kp;            /* pointer to btree key */
2928        union xfs_btree_ptr     *pp;            /* pointer to block addr */
2929        union xfs_btree_ptr     nptr;           /* new block addr */
2930        int                     level;          /* btree level */
2931        int                     error;          /* error return code */
2932        int                     i;              /* loop counter */
2933
2934        XFS_BTREE_STATS_INC(cur, newroot);
2935
2936        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2937
2938        level = cur->bc_nlevels - 1;
2939
2940        block = xfs_btree_get_iroot(cur);
2941        pp = xfs_btree_ptr_addr(cur, 1, block);
2942
2943        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2944        error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2945        if (error)
2946                goto error0;
2947        if (*stat == 0)
2948                return 0;
2949
2950        XFS_BTREE_STATS_INC(cur, alloc);
2951
2952        /* Copy the root into a real block. */
2953        error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
2954        if (error)
2955                goto error0;
2956
2957        /*
2958         * we can't just memcpy() the root in for CRC enabled btree blocks.
2959         * In that case have to also ensure the blkno remains correct
2960         */
2961        memcpy(cblock, block, xfs_btree_block_len(cur));
2962        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2963                if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2964                        cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2965                else
2966                        cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2967        }
2968
2969        be16_add_cpu(&block->bb_level, 1);
2970        xfs_btree_set_numrecs(block, 1);
2971        cur->bc_nlevels++;
2972        cur->bc_ptrs[level + 1] = 1;
2973
2974        kp = xfs_btree_key_addr(cur, 1, block);
2975        ckp = xfs_btree_key_addr(cur, 1, cblock);
2976        xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2977
2978        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2979        for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2980                error = xfs_btree_debug_check_ptr(cur, pp, i, level);
2981                if (error)
2982                        goto error0;
2983        }
2984
2985        xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2986
2987        error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
2988        if (error)
2989                goto error0;
2990
2991        xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2992
2993        xfs_iroot_realloc(cur->bc_private.b.ip,
2994                          1 - xfs_btree_get_numrecs(cblock),
2995                          cur->bc_private.b.whichfork);
2996
2997        xfs_btree_setbuf(cur, level, cbp);
2998
2999        /*
3000         * Do all this logging at the end so that
3001         * the root is at the right level.
3002         */
3003        xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
3004        xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3005        xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3006
3007        *logflags |=
3008                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
3009        *stat = 1;
3010        return 0;
3011error0:
3012        return error;
3013}
3014
3015/*
3016 * Allocate a new root block, fill it in.
3017 */
3018STATIC int                              /* error */
3019xfs_btree_new_root(
3020        struct xfs_btree_cur    *cur,   /* btree cursor */
3021        int                     *stat)  /* success/failure */
3022{
3023        struct xfs_btree_block  *block; /* one half of the old root block */
3024        struct xfs_buf          *bp;    /* buffer containing block */
3025        int                     error;  /* error return value */
3026        struct xfs_buf          *lbp;   /* left buffer pointer */
3027        struct xfs_btree_block  *left;  /* left btree block */
3028        struct xfs_buf          *nbp;   /* new (root) buffer */
3029        struct xfs_btree_block  *new;   /* new (root) btree block */
3030        int                     nptr;   /* new value for key index, 1 or 2 */
3031        struct xfs_buf          *rbp;   /* right buffer pointer */
3032        struct xfs_btree_block  *right; /* right btree block */
3033        union xfs_btree_ptr     rptr;
3034        union xfs_btree_ptr     lptr;
3035
3036        XFS_BTREE_STATS_INC(cur, newroot);
3037
3038        /* initialise our start point from the cursor */
3039        cur->bc_ops->init_ptr_from_cur(cur, &rptr);
3040
3041        /* Allocate the new block. If we can't do it, we're toast. Give up. */
3042        error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
3043        if (error)
3044                goto error0;
3045        if (*stat == 0)
3046                goto out0;
3047        XFS_BTREE_STATS_INC(cur, alloc);
3048
3049        /* Set up the new block. */
3050        error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
3051        if (error)
3052                goto error0;
3053
3054        /* Set the root in the holding structure  increasing the level by 1. */
3055        cur->bc_ops->set_root(cur, &lptr, 1);
3056
3057        /*
3058         * At the previous root level there are now two blocks: the old root,
3059         * and the new block generated when it was split.  We don't know which
3060         * one the cursor is pointing at, so we set up variables "left" and
3061         * "right" for each case.
3062         */
3063        block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
3064
3065#ifdef DEBUG
3066        error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
3067        if (error)
3068                goto error0;
3069#endif
3070
3071        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3072        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3073                /* Our block is left, pick up the right block. */
3074                lbp = bp;
3075                xfs_btree_buf_to_ptr(cur, lbp, &lptr);
3076                left = block;
3077                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3078                if (error)
3079                        goto error0;
3080                bp = rbp;
3081                nptr = 1;
3082        } else {
3083                /* Our block is right, pick up the left block. */
3084                rbp = bp;
3085                xfs_btree_buf_to_ptr(cur, rbp, &rptr);
3086                right = block;
3087                xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
3088                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3089                if (error)
3090                        goto error0;
3091                bp = lbp;
3092                nptr = 2;
3093        }
3094
3095        /* Fill in the new block's btree header and log it. */
3096        xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
3097        xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
3098        ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
3099                        !xfs_btree_ptr_is_null(cur, &rptr));
3100
3101        /* Fill in the key data in the new root. */
3102        if (xfs_btree_get_level(left) > 0) {
3103                /*
3104                 * Get the keys for the left block's keys and put them directly
3105                 * in the parent block.  Do the same for the right block.
3106                 */
3107                xfs_btree_get_node_keys(cur, left,
3108                                xfs_btree_key_addr(cur, 1, new));
3109                xfs_btree_get_node_keys(cur, right,
3110                                xfs_btree_key_addr(cur, 2, new));
3111        } else {
3112                /*
3113                 * Get the keys for the left block's records and put them
3114                 * directly in the parent block.  Do the same for the right
3115                 * block.
3116                 */
3117                xfs_btree_get_leaf_keys(cur, left,
3118                        xfs_btree_key_addr(cur, 1, new));
3119                xfs_btree_get_leaf_keys(cur, right,
3120                        xfs_btree_key_addr(cur, 2, new));
3121        }
3122        xfs_btree_log_keys(cur, nbp, 1, 2);
3123
3124        /* Fill in the pointer data in the new root. */
3125        xfs_btree_copy_ptrs(cur,
3126                xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
3127        xfs_btree_copy_ptrs(cur,
3128                xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
3129        xfs_btree_log_ptrs(cur, nbp, 1, 2);
3130
3131        /* Fix up the cursor. */
3132        xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
3133        cur->bc_ptrs[cur->bc_nlevels] = nptr;
3134        cur->bc_nlevels++;
3135        *stat = 1;
3136        return 0;
3137error0:
3138        return error;
3139out0:
3140        *stat = 0;
3141        return 0;
3142}
3143
3144STATIC int
3145xfs_btree_make_block_unfull(
3146        struct xfs_btree_cur    *cur,   /* btree cursor */
3147        int                     level,  /* btree level */
3148        int                     numrecs,/* # of recs in block */
3149        int                     *oindex,/* old tree index */
3150        int                     *index, /* new tree index */
3151        union xfs_btree_ptr     *nptr,  /* new btree ptr */
3152        struct xfs_btree_cur    **ncur, /* new btree cursor */
3153        union xfs_btree_key     *key,   /* key of new block */
3154        int                     *stat)
3155{
3156        int                     error = 0;
3157
3158        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3159            level == cur->bc_nlevels - 1) {
3160                struct xfs_inode *ip = cur->bc_private.b.ip;
3161
3162                if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3163                        /* A root block that can be made bigger. */
3164                        xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
3165                        *stat = 1;
3166                } else {
3167                        /* A root block that needs replacing */
3168                        int     logflags = 0;
3169
3170                        error = xfs_btree_new_iroot(cur, &logflags, stat);
3171                        if (error || *stat == 0)
3172                                return error;
3173
3174                        xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3175                }
3176
3177                return 0;
3178        }
3179
3180        /* First, try shifting an entry to the right neighbor. */
3181        error = xfs_btree_rshift(cur, level, stat);
3182        if (error || *stat)
3183                return error;
3184
3185        /* Next, try shifting an entry to the left neighbor. */
3186        error = xfs_btree_lshift(cur, level, stat);
3187        if (error)
3188                return error;
3189
3190        if (*stat) {
3191                *oindex = *index = cur->bc_ptrs[level];
3192                return 0;
3193        }
3194
3195        /*
3196         * Next, try splitting the current block in half.
3197         *
3198         * If this works we have to re-set our variables because we
3199         * could be in a different block now.
3200         */
3201        error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3202        if (error || *stat == 0)
3203                return error;
3204
3205
3206        *index = cur->bc_ptrs[level];
3207        return 0;
3208}
3209
3210/*
3211 * Insert one record/level.  Return information to the caller
3212 * allowing the next level up to proceed if necessary.
3213 */
3214STATIC int
3215xfs_btree_insrec(
3216        struct xfs_btree_cur    *cur,   /* btree cursor */
3217        int                     level,  /* level to insert record at */
3218        union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
3219        union xfs_btree_rec     *rec,   /* record to insert */
3220        union xfs_btree_key     *key,   /* i/o: block key for ptrp */
3221        struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
3222        int                     *stat)  /* success/failure */
3223{
3224        struct xfs_btree_block  *block; /* btree block */
3225        struct xfs_buf          *bp;    /* buffer for block */
3226        union xfs_btree_ptr     nptr;   /* new block ptr */
3227        struct xfs_btree_cur    *ncur;  /* new btree cursor */
3228        union xfs_btree_key     nkey;   /* new block key */
3229        union xfs_btree_key     *lkey;
3230        int                     optr;   /* old key/record index */
3231        int                     ptr;    /* key/record index */
3232        int                     numrecs;/* number of records */
3233        int                     error;  /* error return value */
3234        int                     i;
3235        xfs_daddr_t             old_bn;
3236
3237        ncur = NULL;
3238        lkey = &nkey;
3239
3240        /*
3241         * If we have an external root pointer, and we've made it to the
3242         * root level, allocate a new root block and we're done.
3243         */
3244        if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3245            (level >= cur->bc_nlevels)) {
3246                error = xfs_btree_new_root(cur, stat);
3247                xfs_btree_set_ptr_null(cur, ptrp);
3248
3249                return error;
3250        }
3251
3252        /* If we're off the left edge, return failure. */
3253        ptr = cur->bc_ptrs[level];
3254        if (ptr == 0) {
3255                *stat = 0;
3256                return 0;
3257        }
3258
3259        optr = ptr;
3260
3261        XFS_BTREE_STATS_INC(cur, insrec);
3262
3263        /* Get pointers to the btree buffer and block. */
3264        block = xfs_btree_get_block(cur, level, &bp);
3265        old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
3266        numrecs = xfs_btree_get_numrecs(block);
3267
3268#ifdef DEBUG
3269        error = xfs_btree_check_block(cur, block, level, bp);
3270        if (error)
3271                goto error0;
3272
3273        /* Check that the new entry is being inserted in the right place. */
3274        if (ptr <= numrecs) {
3275                if (level == 0) {
3276                        ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3277                                xfs_btree_rec_addr(cur, ptr, block)));
3278                } else {
3279                        ASSERT(cur->bc_ops->keys_inorder(cur, key,
3280                                xfs_btree_key_addr(cur, ptr, block)));
3281                }
3282        }
3283#endif
3284
3285        /*
3286         * If the block is full, we can't insert the new entry until we
3287         * make the block un-full.
3288         */
3289        xfs_btree_set_ptr_null(cur, &nptr);
3290        if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3291                error = xfs_btree_make_block_unfull(cur, level, numrecs,
3292                                        &optr, &ptr, &nptr, &ncur, lkey, stat);
3293                if (error || *stat == 0)
3294                        goto error0;
3295        }
3296
3297        /*
3298         * The current block may have changed if the block was
3299         * previously full and we have just made space in it.
3300         */
3301        block = xfs_btree_get_block(cur, level, &bp);
3302        numrecs = xfs_btree_get_numrecs(block);
3303
3304#ifdef DEBUG
3305        error = xfs_btree_check_block(cur, block, level, bp);
3306        if (error)
3307                return error;
3308#endif
3309
3310        /*
3311         * At this point we know there's room for our new entry in the block
3312         * we're pointing at.
3313         */
3314        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3315
3316        if (level > 0) {
3317                /* It's a nonleaf. make a hole in the keys and ptrs */
3318                union xfs_btree_key     *kp;
3319                union xfs_btree_ptr     *pp;
3320
3321                kp = xfs_btree_key_addr(cur, ptr, block);
3322                pp = xfs_btree_ptr_addr(cur, ptr, block);
3323
3324                for (i = numrecs - ptr; i >= 0; i--) {
3325                        error = xfs_btree_debug_check_ptr(cur, pp, i, level);
3326                        if (error)
3327                                return error;
3328                }
3329
3330                xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3331                xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3332
3333                error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
3334                if (error)
3335                        goto error0;
3336
3337                /* Now put the new data in, bump numrecs and log it. */
3338                xfs_btree_copy_keys(cur, kp, key, 1);
3339                xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3340                numrecs++;
3341                xfs_btree_set_numrecs(block, numrecs);
3342                xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3343                xfs_btree_log_keys(cur, bp, ptr, numrecs);
3344#ifdef DEBUG
3345                if (ptr < numrecs) {
3346                        ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3347                                xfs_btree_key_addr(cur, ptr + 1, block)));
3348                }
3349#endif
3350        } else {
3351                /* It's a leaf. make a hole in the records */
3352                union xfs_btree_rec             *rp;
3353
3354                rp = xfs_btree_rec_addr(cur, ptr, block);
3355
3356                xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3357
3358                /* Now put the new data in, bump numrecs and log it. */
3359                xfs_btree_copy_recs(cur, rp, rec, 1);
3360                xfs_btree_set_numrecs(block, ++numrecs);
3361                xfs_btree_log_recs(cur, bp, ptr, numrecs);
3362#ifdef DEBUG
3363                if (ptr < numrecs) {
3364                        ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3365                                xfs_btree_rec_addr(cur, ptr + 1, block)));
3366                }
3367#endif
3368        }
3369
3370        /* Log the new number of records in the btree header. */
3371        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3372
3373        /*
3374         * If we just inserted into a new tree block, we have to
3375         * recalculate nkey here because nkey is out of date.
3376         *
3377         * Otherwise we're just updating an existing block (having shoved
3378         * some records into the new tree block), so use the regular key
3379         * update mechanism.
3380         */
3381        if (bp && bp->b_bn != old_bn) {
3382                xfs_btree_get_keys(cur, block, lkey);
3383        } else if (xfs_btree_needs_key_update(cur, optr)) {
3384                error = xfs_btree_update_keys(cur, level);
3385                if (error)
3386                        goto error0;
3387        }
3388
3389        /*
3390         * If we are tracking the last record in the tree and
3391         * we are at the far right edge of the tree, update it.
3392         */
3393        if (xfs_btree_is_lastrec(cur, block, level)) {
3394                cur->bc_ops->update_lastrec(cur, block, rec,
3395                                            ptr, LASTREC_INSREC);
3396        }
3397
3398        /*
3399         * Return the new block number, if any.
3400         * If there is one, give back a record value and a cursor too.
3401         */
3402        *ptrp = nptr;
3403        if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3404                xfs_btree_copy_keys(cur, key, lkey, 1);
3405                *curp = ncur;
3406        }
3407
3408        *stat = 1;
3409        return 0;
3410
3411error0:
3412        return error;
3413}
3414
3415/*
3416 * Insert the record at the point referenced by cur.
3417 *
3418 * A multi-level split of the tree on insert will invalidate the original
3419 * cursor.  All callers of this function should assume that the cursor is
3420 * no longer valid and revalidate it.
3421 */
3422int
3423xfs_btree_insert(
3424        struct xfs_btree_cur    *cur,
3425        int                     *stat)
3426{
3427        int                     error;  /* error return value */
3428        int                     i;      /* result value, 0 for failure */
3429        int                     level;  /* current level number in btree */
3430        union xfs_btree_ptr     nptr;   /* new block number (split result) */
3431        struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3432        struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3433        union xfs_btree_key     bkey;   /* key of block to insert */
3434        union xfs_btree_key     *key;
3435        union xfs_btree_rec     rec;    /* record to insert */
3436
3437        level = 0;
3438        ncur = NULL;
3439        pcur = cur;
3440        key = &bkey;
3441
3442        xfs_btree_set_ptr_null(cur, &nptr);
3443
3444        /* Make a key out of the record data to be inserted, and save it. */
3445        cur->bc_ops->init_rec_from_cur(cur, &rec);
3446        cur->bc_ops->init_key_from_rec(key, &rec);
3447
3448        /*
3449         * Loop going up the tree, starting at the leaf level.
3450         * Stop when we don't get a split block, that must mean that
3451         * the insert is finished with this level.
3452         */
3453        do {
3454                /*
3455                 * Insert nrec/nptr into this level of the tree.
3456                 * Note if we fail, nptr will be null.
3457                 */
3458                error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3459                                &ncur, &i);
3460                if (error) {
3461                        if (pcur != cur)
3462                                xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3463                        goto error0;
3464                }
3465
3466                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3467                level++;
3468
3469                /*
3470                 * See if the cursor we just used is trash.
3471                 * Can't trash the caller's cursor, but otherwise we should
3472                 * if ncur is a new cursor or we're about to be done.
3473                 */
3474                if (pcur != cur &&
3475                    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3476                        /* Save the state from the cursor before we trash it */
3477                        if (cur->bc_ops->update_cursor)
3478                                cur->bc_ops->update_cursor(pcur, cur);
3479                        cur->bc_nlevels = pcur->bc_nlevels;
3480                        xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3481                }
3482                /* If we got a new cursor, switch to it. */
3483                if (ncur) {
3484                        pcur = ncur;
3485                        ncur = NULL;
3486                }
3487        } while (!xfs_btree_ptr_is_null(cur, &nptr));
3488
3489        *stat = i;
3490        return 0;
3491error0:
3492        return error;
3493}
3494
3495/*
3496 * Try to merge a non-leaf block back into the inode root.
3497 *
3498 * Note: the killroot names comes from the fact that we're effectively
3499 * killing the old root block.  But because we can't just delete the
3500 * inode we have to copy the single block it was pointing to into the
3501 * inode.
3502 */
3503STATIC int
3504xfs_btree_kill_iroot(
3505        struct xfs_btree_cur    *cur)
3506{
3507        int                     whichfork = cur->bc_private.b.whichfork;
3508        struct xfs_inode        *ip = cur->bc_private.b.ip;
3509        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3510        struct xfs_btree_block  *block;
3511        struct xfs_btree_block  *cblock;
3512        union xfs_btree_key     *kp;
3513        union xfs_btree_key     *ckp;
3514        union xfs_btree_ptr     *pp;
3515        union xfs_btree_ptr     *cpp;
3516        struct xfs_buf          *cbp;
3517        int                     level;
3518        int                     index;
3519        int                     numrecs;
3520        int                     error;
3521#ifdef DEBUG
3522        union xfs_btree_ptr     ptr;
3523#endif
3524        int                     i;
3525
3526        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3527        ASSERT(cur->bc_nlevels > 1);
3528
3529        /*
3530         * Don't deal with the root block needs to be a leaf case.
3531         * We're just going to turn the thing back into extents anyway.
3532         */
3533        level = cur->bc_nlevels - 1;
3534        if (level == 1)
3535                goto out0;
3536
3537        /*
3538         * Give up if the root has multiple children.
3539         */
3540        block = xfs_btree_get_iroot(cur);
3541        if (xfs_btree_get_numrecs(block) != 1)
3542                goto out0;
3543
3544        cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3545        numrecs = xfs_btree_get_numrecs(cblock);
3546
3547        /*
3548         * Only do this if the next level will fit.
3549         * Then the data must be copied up to the inode,
3550         * instead of freeing the root you free the next level.
3551         */
3552        if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3553                goto out0;
3554
3555        XFS_BTREE_STATS_INC(cur, killroot);
3556
3557#ifdef DEBUG
3558        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3559        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3560        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3561        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3562#endif
3563
3564        index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3565        if (index) {
3566                xfs_iroot_realloc(cur->bc_private.b.ip, index,
3567                                  cur->bc_private.b.whichfork);
3568                block = ifp->if_broot;
3569        }
3570
3571        be16_add_cpu(&block->bb_numrecs, index);
3572        ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3573
3574        kp = xfs_btree_key_addr(cur, 1, block);
3575        ckp = xfs_btree_key_addr(cur, 1, cblock);
3576        xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3577
3578        pp = xfs_btree_ptr_addr(cur, 1, block);
3579        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3580
3581        for (i = 0; i < numrecs; i++) {
3582                error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
3583                if (error)
3584                        return error;
3585        }
3586
3587        xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3588
3589        error = xfs_btree_free_block(cur, cbp);
3590        if (error)
3591                return error;
3592
3593        cur->bc_bufs[level - 1] = NULL;
3594        be16_add_cpu(&block->bb_level, -1);
3595        xfs_trans_log_inode(cur->bc_tp, ip,
3596                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3597        cur->bc_nlevels--;
3598out0:
3599        return 0;
3600}
3601
3602/*
3603 * Kill the current root node, and replace it with it's only child node.
3604 */
3605STATIC int
3606xfs_btree_kill_root(
3607        struct xfs_btree_cur    *cur,
3608        struct xfs_buf          *bp,
3609        int                     level,
3610        union xfs_btree_ptr     *newroot)
3611{
3612        int                     error;
3613
3614        XFS_BTREE_STATS_INC(cur, killroot);
3615
3616        /*
3617         * Update the root pointer, decreasing the level by 1 and then
3618         * free the old root.
3619         */
3620        cur->bc_ops->set_root(cur, newroot, -1);
3621
3622        error = xfs_btree_free_block(cur, bp);
3623        if (error)
3624                return error;
3625
3626        cur->bc_bufs[level] = NULL;
3627        cur->bc_ra[level] = 0;
3628        cur->bc_nlevels--;
3629
3630        return 0;
3631}
3632
3633STATIC int
3634xfs_btree_dec_cursor(
3635        struct xfs_btree_cur    *cur,
3636        int                     level,
3637        int                     *stat)
3638{
3639        int                     error;
3640        int                     i;
3641
3642        if (level > 0) {
3643                error = xfs_btree_decrement(cur, level, &i);
3644                if (error)
3645                        return error;
3646        }
3647
3648        *stat = 1;
3649        return 0;
3650}
3651
3652/*
3653 * Single level of the btree record deletion routine.
3654 * Delete record pointed to by cur/level.
3655 * Remove the record from its block then rebalance the tree.
3656 * Return 0 for error, 1 for done, 2 to go on to the next level.
3657 */
3658STATIC int                                      /* error */
3659xfs_btree_delrec(
3660        struct xfs_btree_cur    *cur,           /* btree cursor */
3661        int                     level,          /* level removing record from */
3662        int                     *stat)          /* fail/done/go-on */
3663{
3664        struct xfs_btree_block  *block;         /* btree block */
3665        union xfs_btree_ptr     cptr;           /* current block ptr */
3666        struct xfs_buf          *bp;            /* buffer for block */
3667        int                     error;          /* error return value */
3668        int                     i;              /* loop counter */
3669        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3670        struct xfs_buf          *lbp;           /* left buffer pointer */
3671        struct xfs_btree_block  *left;          /* left btree block */
3672        int                     lrecs = 0;      /* left record count */
3673        int                     ptr;            /* key/record index */
3674        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3675        struct xfs_buf          *rbp;           /* right buffer pointer */
3676        struct xfs_btree_block  *right;         /* right btree block */
3677        struct xfs_btree_block  *rrblock;       /* right-right btree block */
3678        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3679        int                     rrecs = 0;      /* right record count */
3680        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3681        int                     numrecs;        /* temporary numrec count */
3682
3683        tcur = NULL;
3684
3685        /* Get the index of the entry being deleted, check for nothing there. */
3686        ptr = cur->bc_ptrs[level];
3687        if (ptr == 0) {
3688                *stat = 0;
3689                return 0;
3690        }
3691
3692        /* Get the buffer & block containing the record or key/ptr. */
3693        block = xfs_btree_get_block(cur, level, &bp);
3694        numrecs = xfs_btree_get_numrecs(block);
3695
3696#ifdef DEBUG
3697        error = xfs_btree_check_block(cur, block, level, bp);
3698        if (error)
3699                goto error0;
3700#endif
3701
3702        /* Fail if we're off the end of the block. */
3703        if (ptr > numrecs) {
3704                *stat = 0;
3705                return 0;
3706        }
3707
3708        XFS_BTREE_STATS_INC(cur, delrec);
3709        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3710
3711        /* Excise the entries being deleted. */
3712        if (level > 0) {
3713                /* It's a nonleaf. operate on keys and ptrs */
3714                union xfs_btree_key     *lkp;
3715                union xfs_btree_ptr     *lpp;
3716
3717                lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3718                lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3719
3720                for (i = 0; i < numrecs - ptr; i++) {
3721                        error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
3722                        if (error)
3723                                goto error0;
3724                }
3725
3726                if (ptr < numrecs) {
3727                        xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3728                        xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3729                        xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3730                        xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3731                }
3732        } else {
3733                /* It's a leaf. operate on records */
3734                if (ptr < numrecs) {
3735                        xfs_btree_shift_recs(cur,
3736                                xfs_btree_rec_addr(cur, ptr + 1, block),
3737                                -1, numrecs - ptr);
3738                        xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3739                }
3740        }
3741
3742        /*
3743         * Decrement and log the number of entries in the block.
3744         */
3745        xfs_btree_set_numrecs(block, --numrecs);
3746        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3747
3748        /*
3749         * If we are tracking the last record in the tree and
3750         * we are at the far right edge of the tree, update it.
3751         */
3752        if (xfs_btree_is_lastrec(cur, block, level)) {
3753                cur->bc_ops->update_lastrec(cur, block, NULL,
3754                                            ptr, LASTREC_DELREC);
3755        }
3756
3757        /*
3758         * We're at the root level.  First, shrink the root block in-memory.
3759         * Try to get rid of the next level down.  If we can't then there's
3760         * nothing left to do.
3761         */
3762        if (level == cur->bc_nlevels - 1) {
3763                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3764                        xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3765                                          cur->bc_private.b.whichfork);
3766
3767                        error = xfs_btree_kill_iroot(cur);
3768                        if (error)
3769                                goto error0;
3770
3771                        error = xfs_btree_dec_cursor(cur, level, stat);
3772                        if (error)
3773                                goto error0;
3774                        *stat = 1;
3775                        return 0;
3776                }
3777
3778                /*
3779                 * If this is the root level, and there's only one entry left,
3780                 * and it's NOT the leaf level, then we can get rid of this
3781                 * level.
3782                 */
3783                if (numrecs == 1 && level > 0) {
3784                        union xfs_btree_ptr     *pp;
3785                        /*
3786                         * pp is still set to the first pointer in the block.
3787                         * Make it the new root of the btree.
3788                         */
3789                        pp = xfs_btree_ptr_addr(cur, 1, block);
3790                        error = xfs_btree_kill_root(cur, bp, level, pp);
3791                        if (error)
3792                                goto error0;
3793                } else if (level > 0) {
3794                        error = xfs_btree_dec_cursor(cur, level, stat);
3795                        if (error)
3796                                goto error0;
3797                }
3798                *stat = 1;
3799                return 0;
3800        }
3801
3802        /*
3803         * If we deleted the leftmost entry in the block, update the
3804         * key values above us in the tree.
3805         */
3806        if (xfs_btree_needs_key_update(cur, ptr)) {
3807                error = xfs_btree_update_keys(cur, level);
3808                if (error)
3809                        goto error0;
3810        }
3811
3812        /*
3813         * If the number of records remaining in the block is at least
3814         * the minimum, we're done.
3815         */
3816        if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3817                error = xfs_btree_dec_cursor(cur, level, stat);
3818                if (error)
3819                        goto error0;
3820                return 0;
3821        }
3822
3823        /*
3824         * Otherwise, we have to move some records around to keep the
3825         * tree balanced.  Look at the left and right sibling blocks to
3826         * see if we can re-balance by moving only one record.
3827         */
3828        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3829        xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3830
3831        if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3832                /*
3833                 * One child of root, need to get a chance to copy its contents
3834                 * into the root and delete it. Can't go up to next level,
3835                 * there's nothing to delete there.
3836                 */
3837                if (xfs_btree_ptr_is_null(cur, &rptr) &&
3838                    xfs_btree_ptr_is_null(cur, &lptr) &&
3839                    level == cur->bc_nlevels - 2) {
3840                        error = xfs_btree_kill_iroot(cur);
3841                        if (!error)
3842                                error = xfs_btree_dec_cursor(cur, level, stat);
3843                        if (error)
3844                                goto error0;
3845                        return 0;
3846                }
3847        }
3848
3849        ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3850               !xfs_btree_ptr_is_null(cur, &lptr));
3851
3852        /*
3853         * Duplicate the cursor so our btree manipulations here won't
3854         * disrupt the next level up.
3855         */
3856        error = xfs_btree_dup_cursor(cur, &tcur);
3857        if (error)
3858                goto error0;
3859
3860        /*
3861         * If there's a right sibling, see if it's ok to shift an entry
3862         * out of it.
3863         */
3864        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3865                /*
3866                 * Move the temp cursor to the last entry in the next block.
3867                 * Actually any entry but the first would suffice.
3868                 */
3869                i = xfs_btree_lastrec(tcur, level);
3870                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3871
3872                error = xfs_btree_increment(tcur, level, &i);
3873                if (error)
3874                        goto error0;
3875                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3876
3877                i = xfs_btree_lastrec(tcur, level);
3878                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3879
3880                /* Grab a pointer to the block. */
3881                right = xfs_btree_get_block(tcur, level, &rbp);
3882#ifdef DEBUG
3883                error = xfs_btree_check_block(tcur, right, level, rbp);
3884                if (error)
3885                        goto error0;
3886#endif
3887                /* Grab the current block number, for future use. */
3888                xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3889
3890                /*
3891                 * If right block is full enough so that removing one entry
3892                 * won't make it too empty, and left-shifting an entry out
3893                 * of right to us works, we're done.
3894                 */
3895                if (xfs_btree_get_numrecs(right) - 1 >=
3896                    cur->bc_ops->get_minrecs(tcur, level)) {
3897                        error = xfs_btree_lshift(tcur, level, &i);
3898                        if (error)
3899                                goto error0;
3900                        if (i) {
3901                                ASSERT(xfs_btree_get_numrecs(block) >=
3902                                       cur->bc_ops->get_minrecs(tcur, level));
3903
3904                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3905                                tcur = NULL;
3906
3907                                error = xfs_btree_dec_cursor(cur, level, stat);
3908                                if (error)
3909                                        goto error0;
3910                                return 0;
3911                        }
3912                }
3913
3914                /*
3915                 * Otherwise, grab the number of records in right for
3916                 * future reference, and fix up the temp cursor to point
3917                 * to our block again (last record).
3918                 */
3919                rrecs = xfs_btree_get_numrecs(right);
3920                if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3921                        i = xfs_btree_firstrec(tcur, level);
3922                        XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3923
3924                        error = xfs_btree_decrement(tcur, level, &i);
3925                        if (error)
3926                                goto error0;
3927                        XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3928                }
3929        }
3930
3931        /*
3932         * If there's a left sibling, see if it's ok to shift an entry
3933         * out of it.
3934         */
3935        if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3936                /*
3937                 * Move the temp cursor to the first entry in the
3938                 * previous block.
3939                 */
3940                i = xfs_btree_firstrec(tcur, level);
3941                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3942
3943                error = xfs_btree_decrement(tcur, level, &i);
3944                if (error)
3945                        goto error0;
3946                i = xfs_btree_firstrec(tcur, level);
3947                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3948
3949                /* Grab a pointer to the block. */
3950                left = xfs_btree_get_block(tcur, level, &lbp);
3951#ifdef DEBUG
3952                error = xfs_btree_check_block(cur, left, level, lbp);
3953                if (error)
3954                        goto error0;
3955#endif
3956                /* Grab the current block number, for future use. */
3957                xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3958
3959                /*
3960                 * If left block is full enough so that removing one entry
3961                 * won't make it too empty, and right-shifting an entry out
3962                 * of left to us works, we're done.
3963                 */
3964                if (xfs_btree_get_numrecs(left) - 1 >=
3965                    cur->bc_ops->get_minrecs(tcur, level)) {
3966                        error = xfs_btree_rshift(tcur, level, &i);
3967                        if (error)
3968                                goto error0;
3969                        if (i) {
3970                                ASSERT(xfs_btree_get_numrecs(block) >=
3971                                       cur->bc_ops->get_minrecs(tcur, level));
3972                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3973                                tcur = NULL;
3974                                if (level == 0)
3975                                        cur->bc_ptrs[0]++;
3976
3977                                *stat = 1;
3978                                return 0;
3979                        }
3980                }
3981
3982                /*
3983                 * Otherwise, grab the number of records in right for
3984                 * future reference.
3985                 */
3986                lrecs = xfs_btree_get_numrecs(left);
3987        }
3988
3989        /* Delete the temp cursor, we're done with it. */
3990        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3991        tcur = NULL;
3992
3993        /* If here, we need to do a join to keep the tree balanced. */
3994        ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3995
3996        if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3997            lrecs + xfs_btree_get_numrecs(block) <=
3998                        cur->bc_ops->get_maxrecs(cur, level)) {
3999                /*
4000                 * Set "right" to be the starting block,
4001                 * "left" to be the left neighbor.
4002                 */
4003                rptr = cptr;
4004                right = block;
4005                rbp = bp;
4006                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
4007                if (error)
4008                        goto error0;
4009
4010        /*
4011         * If that won't work, see if we can join with the right neighbor block.
4012         */
4013        } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4014                   rrecs + xfs_btree_get_numrecs(block) <=
4015                        cur->bc_ops->get_maxrecs(cur, level)) {
4016                /*
4017                 * Set "left" to be the starting block,
4018                 * "right" to be the right neighbor.
4019                 */
4020                lptr = cptr;
4021                left = block;
4022                lbp = bp;
4023                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4024                if (error)
4025                        goto error0;
4026
4027        /*
4028         * Otherwise, we can't fix the imbalance.
4029         * Just return.  This is probably a logic error, but it's not fatal.
4030         */
4031        } else {
4032                error = xfs_btree_dec_cursor(cur, level, stat);
4033                if (error)
4034                        goto error0;
4035                return 0;
4036        }
4037
4038        rrecs = xfs_btree_get_numrecs(right);
4039        lrecs = xfs_btree_get_numrecs(left);
4040
4041        /*
4042         * We're now going to join "left" and "right" by moving all the stuff
4043         * in "right" to "left" and deleting "right".
4044         */
4045        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4046        if (level > 0) {
4047                /* It's a non-leaf.  Move keys and pointers. */
4048                union xfs_btree_key     *lkp;   /* left btree key */
4049                union xfs_btree_ptr     *lpp;   /* left address pointer */
4050                union xfs_btree_key     *rkp;   /* right btree key */
4051                union xfs_btree_ptr     *rpp;   /* right address pointer */
4052
4053                lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4054                lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4055                rkp = xfs_btree_key_addr(cur, 1, right);
4056                rpp = xfs_btree_ptr_addr(cur, 1, right);
4057
4058                for (i = 1; i < rrecs; i++) {
4059                        error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
4060                        if (error)
4061                                goto error0;
4062                }
4063
4064                xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4065                xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4066
4067                xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4068                xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4069        } else {
4070                /* It's a leaf.  Move records.  */
4071                union xfs_btree_rec     *lrp;   /* left record pointer */
4072                union xfs_btree_rec     *rrp;   /* right record pointer */
4073
4074                lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4075                rrp = xfs_btree_rec_addr(cur, 1, right);
4076
4077                xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4078                xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4079        }
4080
4081        XFS_BTREE_STATS_INC(cur, join);
4082
4083        /*
4084         * Fix up the number of records and right block pointer in the
4085         * surviving block, and log it.
4086         */
4087        xfs_btree_set_numrecs(left, lrecs + rrecs);
4088        xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
4089        xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4090        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4091
4092        /* If there is a right sibling, point it to the remaining block. */
4093        xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4094        if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4095                error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4096                if (error)
4097                        goto error0;
4098                xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4099                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4100        }
4101
4102        /* Free the deleted block. */
4103        error = xfs_btree_free_block(cur, rbp);
4104        if (error)
4105                goto error0;
4106
4107        /*
4108         * If we joined with the left neighbor, set the buffer in the
4109         * cursor to the left block, and fix up the index.
4110         */
4111        if (bp != lbp) {
4112                cur->bc_bufs[level] = lbp;
4113                cur->bc_ptrs[level] += lrecs;
4114                cur->bc_ra[level] = 0;
4115        }
4116        /*
4117         * If we joined with the right neighbor and there's a level above
4118         * us, increment the cursor at that level.
4119         */
4120        else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
4121                   (level + 1 < cur->bc_nlevels)) {
4122                error = xfs_btree_increment(cur, level + 1, &i);
4123                if (error)
4124                        goto error0;
4125        }
4126
4127        /*
4128         * Readjust the ptr at this level if it's not a leaf, since it's
4129         * still pointing at the deletion point, which makes the cursor
4130         * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4131         * We can't use decrement because it would change the next level up.
4132         */
4133        if (level > 0)
4134                cur->bc_ptrs[level]--;
4135
4136        /*
4137         * We combined blocks, so we have to update the parent keys if the
4138         * btree supports overlapped intervals.  However, bc_ptrs[level + 1]
4139         * points to the old block so that the caller knows which record to
4140         * delete.  Therefore, the caller must be savvy enough to call updkeys
4141         * for us if we return stat == 2.  The other exit points from this
4142         * function don't require deletions further up the tree, so they can
4143         * call updkeys directly.
4144         */
4145
4146        /* Return value means the next level up has something to do. */
4147        *stat = 2;
4148        return 0;
4149
4150error0:
4151        if (tcur)
4152                xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4153        return error;
4154}
4155
4156/*
4157 * Delete the record pointed to by cur.
4158 * The cursor refers to the place where the record was (could be inserted)
4159 * when the operation returns.
4160 */
4161int                                     /* error */
4162xfs_btree_delete(
4163        struct xfs_btree_cur    *cur,
4164        int                     *stat)  /* success/failure */
4165{
4166        int                     error;  /* error return value */
4167        int                     level;
4168        int                     i;
4169        bool                    joined = false;
4170
4171        /*
4172         * Go up the tree, starting at leaf level.
4173         *
4174         * If 2 is returned then a join was done; go to the next level.
4175         * Otherwise we are done.
4176         */
4177        for (level = 0, i = 2; i == 2; level++) {
4178                error = xfs_btree_delrec(cur, level, &i);
4179                if (error)
4180                        goto error0;
4181                if (i == 2)
4182                        joined = true;
4183        }
4184
4185        /*
4186         * If we combined blocks as part of deleting the record, delrec won't
4187         * have updated the parent high keys so we have to do that here.
4188         */
4189        if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
4190                error = xfs_btree_updkeys_force(cur, 0);
4191                if (error)
4192                        goto error0;
4193        }
4194
4195        if (i == 0) {
4196                for (level = 1; level < cur->bc_nlevels; level++) {
4197                        if (cur->bc_ptrs[level] == 0) {
4198                                error = xfs_btree_decrement(cur, level, &i);
4199                                if (error)
4200                                        goto error0;
4201                                break;
4202                        }
4203                }
4204        }
4205
4206        *stat = i;
4207        return 0;
4208error0:
4209        return error;
4210}
4211
4212/*
4213 * Get the data from the pointed-to record.
4214 */
4215int                                     /* error */
4216xfs_btree_get_rec(
4217        struct xfs_btree_cur    *cur,   /* btree cursor */
4218        union xfs_btree_rec     **recp, /* output: btree record */
4219        int                     *stat)  /* output: success/failure */
4220{
4221        struct xfs_btree_block  *block; /* btree block */
4222        struct xfs_buf          *bp;    /* buffer pointer */
4223        int                     ptr;    /* record number */
4224#ifdef DEBUG
4225        int                     error;  /* error return value */
4226#endif
4227
4228        ptr = cur->bc_ptrs[0];
4229        block = xfs_btree_get_block(cur, 0, &bp);
4230
4231#ifdef DEBUG
4232        error = xfs_btree_check_block(cur, block, 0, bp);
4233        if (error)
4234                return error;
4235#endif
4236
4237        /*
4238         * Off the right end or left end, return failure.
4239         */
4240        if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4241                *stat = 0;
4242                return 0;
4243        }
4244
4245        /*
4246         * Point to the record and extract its data.
4247         */
4248        *recp = xfs_btree_rec_addr(cur, ptr, block);
4249        *stat = 1;
4250        return 0;
4251}
4252
4253/* Visit a block in a btree. */
4254STATIC int
4255xfs_btree_visit_block(
4256        struct xfs_btree_cur            *cur,
4257        int                             level,
4258        xfs_btree_visit_blocks_fn       fn,
4259        void                            *data)
4260{
4261        struct xfs_btree_block          *block;
4262        struct xfs_buf                  *bp;
4263        union xfs_btree_ptr             rptr;
4264        int                             error;
4265
4266        /* do right sibling readahead */
4267        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4268        block = xfs_btree_get_block(cur, level, &bp);
4269
4270        /* process the block */
4271        error = fn(cur, level, data);
4272        if (error)
4273                return error;
4274
4275        /* now read rh sibling block for next iteration */
4276        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4277        if (xfs_btree_ptr_is_null(cur, &rptr))
4278                return -ENOENT;
4279
4280        return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4281}
4282
4283
4284/* Visit every block in a btree. */
4285int
4286xfs_btree_visit_blocks(
4287        struct xfs_btree_cur            *cur,
4288        xfs_btree_visit_blocks_fn       fn,
4289        void                            *data)
4290{
4291        union xfs_btree_ptr             lptr;
4292        int                             level;
4293        struct xfs_btree_block          *block = NULL;
4294        int                             error = 0;
4295
4296        cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4297
4298        /* for each level */
4299        for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4300                /* grab the left hand block */
4301                error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4302                if (error)
4303                        return error;
4304
4305                /* readahead the left most block for the next level down */
4306                if (level > 0) {
4307                        union xfs_btree_ptr     *ptr;
4308
4309                        ptr = xfs_btree_ptr_addr(cur, 1, block);
4310                        xfs_btree_readahead_ptr(cur, ptr, 1);
4311
4312                        /* save for the next iteration of the loop */
4313                        xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4314                }
4315
4316                /* for each buffer in the level */
4317                do {
4318                        error = xfs_btree_visit_block(cur, level, fn, data);
4319                } while (!error);
4320
4321                if (error != -ENOENT)
4322                        return error;
4323        }
4324
4325        return 0;
4326}
4327
4328/*
4329 * Change the owner of a btree.
4330 *
4331 * The mechanism we use here is ordered buffer logging. Because we don't know
4332 * how many buffers were are going to need to modify, we don't really want to
4333 * have to make transaction reservations for the worst case of every buffer in a
4334 * full size btree as that may be more space that we can fit in the log....
4335 *
4336 * We do the btree walk in the most optimal manner possible - we have sibling
4337 * pointers so we can just walk all the blocks on each level from left to right
4338 * in a single pass, and then move to the next level and do the same. We can
4339 * also do readahead on the sibling pointers to get IO moving more quickly,
4340 * though for slow disks this is unlikely to make much difference to performance
4341 * as the amount of CPU work we have to do before moving to the next block is
4342 * relatively small.
4343 *
4344 * For each btree block that we load, modify the owner appropriately, set the
4345 * buffer as an ordered buffer and log it appropriately. We need to ensure that
4346 * we mark the region we change dirty so that if the buffer is relogged in
4347 * a subsequent transaction the changes we make here as an ordered buffer are
4348 * correctly relogged in that transaction.  If we are in recovery context, then
4349 * just queue the modified buffer as delayed write buffer so the transaction
4350 * recovery completion writes the changes to disk.
4351 */
4352struct xfs_btree_block_change_owner_info {
4353        uint64_t                new_owner;
4354        struct list_head        *buffer_list;
4355};
4356
4357static int
4358xfs_btree_block_change_owner(
4359        struct xfs_btree_cur    *cur,
4360        int                     level,
4361        void                    *data)
4362{
4363        struct xfs_btree_block_change_owner_info        *bbcoi = data;
4364        struct xfs_btree_block  *block;
4365        struct xfs_buf          *bp;
4366
4367        /* modify the owner */
4368        block = xfs_btree_get_block(cur, level, &bp);
4369        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
4370                if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
4371                        return 0;
4372                block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4373        } else {
4374                if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
4375                        return 0;
4376                block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4377        }
4378
4379        /*
4380         * If the block is a root block hosted in an inode, we might not have a
4381         * buffer pointer here and we shouldn't attempt to log the change as the
4382         * information is already held in the inode and discarded when the root
4383         * block is formatted into the on-disk inode fork. We still change it,
4384         * though, so everything is consistent in memory.
4385         */
4386        if (!bp) {
4387                ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4388                ASSERT(level == cur->bc_nlevels - 1);
4389                return 0;
4390        }
4391
4392        if (cur->bc_tp) {
4393                if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
4394                        xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4395                        return -EAGAIN;
4396                }
4397        } else {
4398                xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4399        }
4400
4401        return 0;
4402}
4403
4404int
4405xfs_btree_change_owner(
4406        struct xfs_btree_cur    *cur,
4407        uint64_t                new_owner,
4408        struct list_head        *buffer_list)
4409{
4410        struct xfs_btree_block_change_owner_info        bbcoi;
4411
4412        bbcoi.new_owner = new_owner;
4413        bbcoi.buffer_list = buffer_list;
4414
4415        return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4416                        &bbcoi);
4417}
4418
4419/* Verify the v5 fields of a long-format btree block. */
4420xfs_failaddr_t
4421xfs_btree_lblock_v5hdr_verify(
4422        struct xfs_buf          *bp,
4423        uint64_t                owner)
4424{
4425        struct xfs_mount        *mp = bp->b_mount;
4426        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4427
4428        if (!xfs_sb_version_hascrc(&mp->m_sb))
4429                return __this_address;
4430        if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
4431                return __this_address;
4432        if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn))
4433                return __this_address;
4434        if (owner != XFS_RMAP_OWN_UNKNOWN &&
4435            be64_to_cpu(block->bb_u.l.bb_owner) != owner)
4436                return __this_address;
4437        return NULL;
4438}
4439
4440/* Verify a long-format btree block. */
4441xfs_failaddr_t
4442xfs_btree_lblock_verify(
4443        struct xfs_buf          *bp,
4444        unsigned int            max_recs)
4445{
4446        struct xfs_mount        *mp = bp->b_mount;
4447        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4448
4449        /* numrecs verification */
4450        if (be16_to_cpu(block->bb_numrecs) > max_recs)
4451                return __this_address;
4452
4453        /* sibling pointer verification */
4454        if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
4455            !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
4456                return __this_address;
4457        if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
4458            !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
4459                return __this_address;
4460
4461        return NULL;
4462}
4463
4464/**
4465 * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4466 *                                    btree block
4467 *
4468 * @bp: buffer containing the btree block
4469 * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
4470 * @pag_max_level: pointer to the per-ag max level field
4471 */
4472xfs_failaddr_t
4473xfs_btree_sblock_v5hdr_verify(
4474        struct xfs_buf          *bp)
4475{
4476        struct xfs_mount        *mp = bp->b_mount;
4477        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4478        struct xfs_perag        *pag = bp->b_pag;
4479
4480        if (!xfs_sb_version_hascrc(&mp->m_sb))
4481                return __this_address;
4482        if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4483                return __this_address;
4484        if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
4485                return __this_address;
4486        if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4487                return __this_address;
4488        return NULL;
4489}
4490
4491/**
4492 * xfs_btree_sblock_verify() -- verify a short-format btree block
4493 *
4494 * @bp: buffer containing the btree block
4495 * @max_recs: maximum records allowed in this btree node
4496 */
4497xfs_failaddr_t
4498xfs_btree_sblock_verify(
4499        struct xfs_buf          *bp,
4500        unsigned int            max_recs)
4501{
4502        struct xfs_mount        *mp = bp->b_mount;
4503        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4504        xfs_agblock_t           agno;
4505
4506        /* numrecs verification */
4507        if (be16_to_cpu(block->bb_numrecs) > max_recs)
4508                return __this_address;
4509
4510        /* sibling pointer verification */
4511        agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp));
4512        if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
4513            !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
4514                return __this_address;
4515        if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
4516            !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
4517                return __this_address;
4518
4519        return NULL;
4520}
4521
4522/*
4523 * Calculate the number of btree levels needed to store a given number of
4524 * records in a short-format btree.
4525 */
4526uint
4527xfs_btree_compute_maxlevels(
4528        uint                    *limits,
4529        unsigned long           len)
4530{
4531        uint                    level;
4532        unsigned long           maxblocks;
4533
4534        maxblocks = (len + limits[0] - 1) / limits[0];
4535        for (level = 1; maxblocks > 1; level++)
4536                maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4537        return level;
4538}
4539
4540/*
4541 * Query a regular btree for all records overlapping a given interval.
4542 * Start with a LE lookup of the key of low_rec and return all records
4543 * until we find a record with a key greater than the key of high_rec.
4544 */
4545STATIC int
4546xfs_btree_simple_query_range(
4547        struct xfs_btree_cur            *cur,
4548        union xfs_btree_key             *low_key,
4549        union xfs_btree_key             *high_key,
4550        xfs_btree_query_range_fn        fn,
4551        void                            *priv)
4552{
4553        union xfs_btree_rec             *recp;
4554        union xfs_btree_key             rec_key;
4555        int64_t                         diff;
4556        int                             stat;
4557        bool                            firstrec = true;
4558        int                             error;
4559
4560        ASSERT(cur->bc_ops->init_high_key_from_rec);
4561        ASSERT(cur->bc_ops->diff_two_keys);
4562
4563        /*
4564         * Find the leftmost record.  The btree cursor must be set
4565         * to the low record used to generate low_key.
4566         */
4567        stat = 0;
4568        error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
4569        if (error)
4570                goto out;
4571
4572        /* Nothing?  See if there's anything to the right. */
4573        if (!stat) {
4574                error = xfs_btree_increment(cur, 0, &stat);
4575                if (error)
4576                        goto out;
4577        }
4578
4579        while (stat) {
4580                /* Find the record. */
4581                error = xfs_btree_get_rec(cur, &recp, &stat);
4582                if (error || !stat)
4583                        break;
4584
4585                /* Skip if high_key(rec) < low_key. */
4586                if (firstrec) {
4587                        cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4588                        firstrec = false;
4589                        diff = cur->bc_ops->diff_two_keys(cur, low_key,
4590                                        &rec_key);
4591                        if (diff > 0)
4592                                goto advloop;
4593                }
4594
4595                /* Stop if high_key < low_key(rec). */
4596                cur->bc_ops->init_key_from_rec(&rec_key, recp);
4597                diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
4598                if (diff > 0)
4599                        break;
4600
4601                /* Callback */
4602                error = fn(cur, recp, priv);
4603                if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
4604                        break;
4605
4606advloop:
4607                /* Move on to the next record. */
4608                error = xfs_btree_increment(cur, 0, &stat);
4609                if (error)
4610                        break;
4611        }
4612
4613out:
4614        return error;
4615}
4616
4617/*
4618 * Query an overlapped interval btree for all records overlapping a given
4619 * interval.  This function roughly follows the algorithm given in
4620 * "Interval Trees" of _Introduction to Algorithms_, which is section
4621 * 14.3 in the 2nd and 3rd editions.
4622 *
4623 * First, generate keys for the low and high records passed in.
4624 *
4625 * For any leaf node, generate the high and low keys for the record.
4626 * If the record keys overlap with the query low/high keys, pass the
4627 * record to the function iterator.
4628 *
4629 * For any internal node, compare the low and high keys of each
4630 * pointer against the query low/high keys.  If there's an overlap,
4631 * follow the pointer.
4632 *
4633 * As an optimization, we stop scanning a block when we find a low key
4634 * that is greater than the query's high key.
4635 */
4636STATIC int
4637xfs_btree_overlapped_query_range(
4638        struct xfs_btree_cur            *cur,
4639        union xfs_btree_key             *low_key,
4640        union xfs_btree_key             *high_key,
4641        xfs_btree_query_range_fn        fn,
4642        void                            *priv)
4643{
4644        union xfs_btree_ptr             ptr;
4645        union xfs_btree_ptr             *pp;
4646        union xfs_btree_key             rec_key;
4647        union xfs_btree_key             rec_hkey;
4648        union xfs_btree_key             *lkp;
4649        union xfs_btree_key             *hkp;
4650        union xfs_btree_rec             *recp;
4651        struct xfs_btree_block          *block;
4652        int64_t                         ldiff;
4653        int64_t                         hdiff;
4654        int                             level;
4655        struct xfs_buf                  *bp;
4656        int                             i;
4657        int                             error;
4658
4659        /* Load the root of the btree. */
4660        level = cur->bc_nlevels - 1;
4661        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
4662        error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
4663        if (error)
4664                return error;
4665        xfs_btree_get_block(cur, level, &bp);
4666        trace_xfs_btree_overlapped_query_range(cur, level, bp);
4667#ifdef DEBUG
4668        error = xfs_btree_check_block(cur, block, level, bp);
4669        if (error)
4670                goto out;
4671#endif
4672        cur->bc_ptrs[level] = 1;
4673
4674        while (level < cur->bc_nlevels) {
4675                block = xfs_btree_get_block(cur, level, &bp);
4676
4677                /* End of node, pop back towards the root. */
4678                if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
4679pop_up:
4680                        if (level < cur->bc_nlevels - 1)
4681                                cur->bc_ptrs[level + 1]++;
4682                        level++;
4683                        continue;
4684                }
4685
4686                if (level == 0) {
4687                        /* Handle a leaf node. */
4688                        recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
4689
4690                        cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
4691                        ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
4692                                        low_key);
4693
4694                        cur->bc_ops->init_key_from_rec(&rec_key, recp);
4695                        hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
4696                                        &rec_key);
4697
4698                        /*
4699                         * If (record's high key >= query's low key) and
4700                         *    (query's high key >= record's low key), then
4701                         * this record overlaps the query range; callback.
4702                         */
4703                        if (ldiff >= 0 && hdiff >= 0) {
4704                                error = fn(cur, recp, priv);
4705                                if (error < 0 ||
4706                                    error == XFS_BTREE_QUERY_RANGE_ABORT)
4707                                        break;
4708                        } else if (hdiff < 0) {
4709                                /* Record is larger than high key; pop. */
4710                                goto pop_up;
4711                        }
4712                        cur->bc_ptrs[level]++;
4713                        continue;
4714                }
4715
4716                /* Handle an internal node. */
4717                lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
4718                hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
4719                pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
4720
4721                ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
4722                hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
4723
4724                /*
4725                 * If (pointer's high key >= query's low key) and
4726                 *    (query's high key >= pointer's low key), then
4727                 * this record overlaps the query range; follow pointer.
4728                 */
4729                if (ldiff >= 0 && hdiff >= 0) {
4730                        level--;
4731                        error = xfs_btree_lookup_get_block(cur, level, pp,
4732                                        &block);
4733                        if (error)
4734                                goto out;
4735                        xfs_btree_get_block(cur, level, &bp);
4736                        trace_xfs_btree_overlapped_query_range(cur, level, bp);
4737#ifdef DEBUG
4738                        error = xfs_btree_check_block(cur, block, level, bp);
4739                        if (error)
4740                                goto out;
4741#endif
4742                        cur->bc_ptrs[level] = 1;
4743                        continue;
4744                } else if (hdiff < 0) {
4745                        /* The low key is larger than the upper range; pop. */
4746                        goto pop_up;
4747                }
4748                cur->bc_ptrs[level]++;
4749        }
4750
4751out:
4752        /*
4753         * If we don't end this function with the cursor pointing at a record
4754         * block, a subsequent non-error cursor deletion will not release
4755         * node-level buffers, causing a buffer leak.  This is quite possible
4756         * with a zero-results range query, so release the buffers if we
4757         * failed to return any results.
4758         */
4759        if (cur->bc_bufs[0] == NULL) {
4760                for (i = 0; i < cur->bc_nlevels; i++) {
4761                        if (cur->bc_bufs[i]) {
4762                                xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
4763                                cur->bc_bufs[i] = NULL;
4764                                cur->bc_ptrs[i] = 0;
4765                                cur->bc_ra[i] = 0;
4766                        }
4767                }
4768        }
4769
4770        return error;
4771}
4772
4773/*
4774 * Query a btree for all records overlapping a given interval of keys.  The
4775 * supplied function will be called with each record found; return one of the
4776 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4777 * code.  This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
4778 * negative error code.
4779 */
4780int
4781xfs_btree_query_range(
4782        struct xfs_btree_cur            *cur,
4783        union xfs_btree_irec            *low_rec,
4784        union xfs_btree_irec            *high_rec,
4785        xfs_btree_query_range_fn        fn,
4786        void                            *priv)
4787{
4788        union xfs_btree_rec             rec;
4789        union xfs_btree_key             low_key;
4790        union xfs_btree_key             high_key;
4791
4792        /* Find the keys of both ends of the interval. */
4793        cur->bc_rec = *high_rec;
4794        cur->bc_ops->init_rec_from_cur(cur, &rec);
4795        cur->bc_ops->init_key_from_rec(&high_key, &rec);
4796
4797        cur->bc_rec = *low_rec;
4798        cur->bc_ops->init_rec_from_cur(cur, &rec);
4799        cur->bc_ops->init_key_from_rec(&low_key, &rec);
4800
4801        /* Enforce low key < high key. */
4802        if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
4803                return -EINVAL;
4804
4805        if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
4806                return xfs_btree_simple_query_range(cur, &low_key,
4807                                &high_key, fn, priv);
4808        return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4809                        fn, priv);
4810}
4811
4812/* Query a btree for all records. */
4813int
4814xfs_btree_query_all(
4815        struct xfs_btree_cur            *cur,
4816        xfs_btree_query_range_fn        fn,
4817        void                            *priv)
4818{
4819        union xfs_btree_key             low_key;
4820        union xfs_btree_key             high_key;
4821
4822        memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
4823        memset(&low_key, 0, sizeof(low_key));
4824        memset(&high_key, 0xFF, sizeof(high_key));
4825
4826        return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
4827}
4828
4829/*
4830 * Calculate the number of blocks needed to store a given number of records
4831 * in a short-format (per-AG metadata) btree.
4832 */
4833unsigned long long
4834xfs_btree_calc_size(
4835        uint                    *limits,
4836        unsigned long long      len)
4837{
4838        int                     level;
4839        int                     maxrecs;
4840        unsigned long long      rval;
4841
4842        maxrecs = limits[0];
4843        for (level = 0, rval = 0; len > 1; level++) {
4844                len += maxrecs - 1;
4845                do_div(len, maxrecs);
4846                maxrecs = limits[1];
4847                rval += len;
4848        }
4849        return rval;
4850}
4851
4852static int
4853xfs_btree_count_blocks_helper(
4854        struct xfs_btree_cur    *cur,
4855        int                     level,
4856        void                    *data)
4857{
4858        xfs_extlen_t            *blocks = data;
4859        (*blocks)++;
4860
4861        return 0;
4862}
4863
4864/* Count the blocks in a btree and return the result in *blocks. */
4865int
4866xfs_btree_count_blocks(
4867        struct xfs_btree_cur    *cur,
4868        xfs_extlen_t            *blocks)
4869{
4870        *blocks = 0;
4871        return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4872                        blocks);
4873}
4874
4875/* Compare two btree pointers. */
4876int64_t
4877xfs_btree_diff_two_ptrs(
4878        struct xfs_btree_cur            *cur,
4879        const union xfs_btree_ptr       *a,
4880        const union xfs_btree_ptr       *b)
4881{
4882        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4883                return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
4884        return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
4885}
4886
4887/* If there's an extent, we're done. */
4888STATIC int
4889xfs_btree_has_record_helper(
4890        struct xfs_btree_cur            *cur,
4891        union xfs_btree_rec             *rec,
4892        void                            *priv)
4893{
4894        return XFS_BTREE_QUERY_RANGE_ABORT;
4895}
4896
4897/* Is there a record covering a given range of keys? */
4898int
4899xfs_btree_has_record(
4900        struct xfs_btree_cur    *cur,
4901        union xfs_btree_irec    *low,
4902        union xfs_btree_irec    *high,
4903        bool                    *exists)
4904{
4905        int                     error;
4906
4907        error = xfs_btree_query_range(cur, low, high,
4908                        &xfs_btree_has_record_helper, NULL);
4909        if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
4910                *exists = true;
4911                return 0;
4912        }
4913        *exists = false;
4914        return error;
4915}
4916
4917/* Are there more records in this btree? */
4918bool
4919xfs_btree_has_more_records(
4920        struct xfs_btree_cur    *cur)
4921{
4922        struct xfs_btree_block  *block;
4923        struct xfs_buf          *bp;
4924
4925        block = xfs_btree_get_block(cur, 0, &bp);
4926
4927        /* There are still records in this block. */
4928        if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
4929                return true;
4930
4931        /* There are more record blocks. */
4932        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4933                return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
4934        else
4935                return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
4936}
4937