linux/fs/xfs/libxfs/xfs_btree.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_shared.h"
  21#include "xfs_format.h"
  22#include "xfs_log_format.h"
  23#include "xfs_trans_resv.h"
  24#include "xfs_bit.h"
  25#include "xfs_mount.h"
  26#include "xfs_defer.h"
  27#include "xfs_inode.h"
  28#include "xfs_trans.h"
  29#include "xfs_inode_item.h"
  30#include "xfs_buf_item.h"
  31#include "xfs_btree.h"
  32#include "xfs_error.h"
  33#include "xfs_trace.h"
  34#include "xfs_cksum.h"
  35#include "xfs_alloc.h"
  36#include "xfs_log.h"
  37
  38/*
  39 * Cursor allocation zone.
  40 */
  41kmem_zone_t     *xfs_btree_cur_zone;
  42
  43/*
  44 * Btree magic numbers.
  45 */
  46static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
  47        { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
  48          XFS_FIBT_MAGIC, 0 },
  49        { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
  50          XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
  51          XFS_REFC_CRC_MAGIC }
  52};
  53
  54__uint32_t
  55xfs_btree_magic(
  56        int                     crc,
  57        xfs_btnum_t             btnum)
  58{
  59        __uint32_t              magic = xfs_magics[crc][btnum];
  60
  61        /* Ensure we asked for crc for crc-only magics. */
  62        ASSERT(magic != 0);
  63        return magic;
  64}
  65
  66STATIC int                              /* error (0 or EFSCORRUPTED) */
  67xfs_btree_check_lblock(
  68        struct xfs_btree_cur    *cur,   /* btree cursor */
  69        struct xfs_btree_block  *block, /* btree long form block pointer */
  70        int                     level,  /* level of the btree block */
  71        struct xfs_buf          *bp)    /* buffer for block, if any */
  72{
  73        int                     lblock_ok = 1; /* block passes checks */
  74        struct xfs_mount        *mp;    /* file system mount point */
  75        xfs_btnum_t             btnum = cur->bc_btnum;
  76        int                     crc;
  77
  78        mp = cur->bc_mp;
  79        crc = xfs_sb_version_hascrc(&mp->m_sb);
  80
  81        if (crc) {
  82                lblock_ok = lblock_ok &&
  83                        uuid_equal(&block->bb_u.l.bb_uuid,
  84                                   &mp->m_sb.sb_meta_uuid) &&
  85                        block->bb_u.l.bb_blkno == cpu_to_be64(
  86                                bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
  87        }
  88
  89        lblock_ok = lblock_ok &&
  90                be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) &&
  91                be16_to_cpu(block->bb_level) == level &&
  92                be16_to_cpu(block->bb_numrecs) <=
  93                        cur->bc_ops->get_maxrecs(cur, level) &&
  94                block->bb_u.l.bb_leftsib &&
  95                (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
  96                 XFS_FSB_SANITY_CHECK(mp,
  97                        be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
  98                block->bb_u.l.bb_rightsib &&
  99                (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
 100                 XFS_FSB_SANITY_CHECK(mp,
 101                        be64_to_cpu(block->bb_u.l.bb_rightsib)));
 102
 103        if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
 104                        XFS_ERRTAG_BTREE_CHECK_LBLOCK,
 105                        XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
 106                if (bp)
 107                        trace_xfs_btree_corrupt(bp, _RET_IP_);
 108                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
 109                return -EFSCORRUPTED;
 110        }
 111        return 0;
 112}
 113
 114STATIC int                              /* error (0 or EFSCORRUPTED) */
 115xfs_btree_check_sblock(
 116        struct xfs_btree_cur    *cur,   /* btree cursor */
 117        struct xfs_btree_block  *block, /* btree short form block pointer */
 118        int                     level,  /* level of the btree block */
 119        struct xfs_buf          *bp)    /* buffer containing block */
 120{
 121        struct xfs_mount        *mp;    /* file system mount point */
 122        struct xfs_buf          *agbp;  /* buffer for ag. freespace struct */
 123        struct xfs_agf          *agf;   /* ag. freespace structure */
 124        xfs_agblock_t           agflen; /* native ag. freespace length */
 125        int                     sblock_ok = 1; /* block passes checks */
 126        xfs_btnum_t             btnum = cur->bc_btnum;
 127        int                     crc;
 128
 129        mp = cur->bc_mp;
 130        crc = xfs_sb_version_hascrc(&mp->m_sb);
 131        agbp = cur->bc_private.a.agbp;
 132        agf = XFS_BUF_TO_AGF(agbp);
 133        agflen = be32_to_cpu(agf->agf_length);
 134
 135        if (crc) {
 136                sblock_ok = sblock_ok &&
 137                        uuid_equal(&block->bb_u.s.bb_uuid,
 138                                   &mp->m_sb.sb_meta_uuid) &&
 139                        block->bb_u.s.bb_blkno == cpu_to_be64(
 140                                bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
 141        }
 142
 143        sblock_ok = sblock_ok &&
 144                be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) &&
 145                be16_to_cpu(block->bb_level) == level &&
 146                be16_to_cpu(block->bb_numrecs) <=
 147                        cur->bc_ops->get_maxrecs(cur, level) &&
 148                (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
 149                 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
 150                block->bb_u.s.bb_leftsib &&
 151                (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
 152                 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
 153                block->bb_u.s.bb_rightsib;
 154
 155        if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
 156                        XFS_ERRTAG_BTREE_CHECK_SBLOCK,
 157                        XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
 158                if (bp)
 159                        trace_xfs_btree_corrupt(bp, _RET_IP_);
 160                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
 161                return -EFSCORRUPTED;
 162        }
 163        return 0;
 164}
 165
 166/*
 167 * Debug routine: check that block header is ok.
 168 */
 169int
 170xfs_btree_check_block(
 171        struct xfs_btree_cur    *cur,   /* btree cursor */
 172        struct xfs_btree_block  *block, /* generic btree block pointer */
 173        int                     level,  /* level of the btree block */
 174        struct xfs_buf          *bp)    /* buffer containing block, if any */
 175{
 176        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 177                return xfs_btree_check_lblock(cur, block, level, bp);
 178        else
 179                return xfs_btree_check_sblock(cur, block, level, bp);
 180}
 181
 182/*
 183 * Check that (long) pointer is ok.
 184 */
 185int                                     /* error (0 or EFSCORRUPTED) */
 186xfs_btree_check_lptr(
 187        struct xfs_btree_cur    *cur,   /* btree cursor */
 188        xfs_fsblock_t           bno,    /* btree block disk address */
 189        int                     level)  /* btree block level */
 190{
 191        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
 192                level > 0 &&
 193                bno != NULLFSBLOCK &&
 194                XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
 195        return 0;
 196}
 197
 198#ifdef DEBUG
 199/*
 200 * Check that (short) pointer is ok.
 201 */
 202STATIC int                              /* error (0 or EFSCORRUPTED) */
 203xfs_btree_check_sptr(
 204        struct xfs_btree_cur    *cur,   /* btree cursor */
 205        xfs_agblock_t           bno,    /* btree block disk address */
 206        int                     level)  /* btree block level */
 207{
 208        xfs_agblock_t           agblocks = cur->bc_mp->m_sb.sb_agblocks;
 209
 210        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
 211                level > 0 &&
 212                bno != NULLAGBLOCK &&
 213                bno != 0 &&
 214                bno < agblocks);
 215        return 0;
 216}
 217
 218/*
 219 * Check that block ptr is ok.
 220 */
 221STATIC int                              /* error (0 or EFSCORRUPTED) */
 222xfs_btree_check_ptr(
 223        struct xfs_btree_cur    *cur,   /* btree cursor */
 224        union xfs_btree_ptr     *ptr,   /* btree block disk address */
 225        int                     index,  /* offset from ptr to check */
 226        int                     level)  /* btree block level */
 227{
 228        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 229                return xfs_btree_check_lptr(cur,
 230                                be64_to_cpu((&ptr->l)[index]), level);
 231        } else {
 232                return xfs_btree_check_sptr(cur,
 233                                be32_to_cpu((&ptr->s)[index]), level);
 234        }
 235}
 236#endif
 237
 238/*
 239 * Calculate CRC on the whole btree block and stuff it into the
 240 * long-form btree header.
 241 *
 242 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 243 * it into the buffer so recovery knows what the last modification was that made
 244 * it to disk.
 245 */
 246void
 247xfs_btree_lblock_calc_crc(
 248        struct xfs_buf          *bp)
 249{
 250        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 251        struct xfs_buf_log_item *bip = bp->b_fspriv;
 252
 253        if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
 254                return;
 255        if (bip)
 256                block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 257        xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 258}
 259
 260bool
 261xfs_btree_lblock_verify_crc(
 262        struct xfs_buf          *bp)
 263{
 264        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 265        struct xfs_mount        *mp = bp->b_target->bt_mount;
 266
 267        if (xfs_sb_version_hascrc(&mp->m_sb)) {
 268                if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
 269                        return false;
 270                return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
 271        }
 272
 273        return true;
 274}
 275
 276/*
 277 * Calculate CRC on the whole btree block and stuff it into the
 278 * short-form btree header.
 279 *
 280 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
 281 * it into the buffer so recovery knows what the last modification was that made
 282 * it to disk.
 283 */
 284void
 285xfs_btree_sblock_calc_crc(
 286        struct xfs_buf          *bp)
 287{
 288        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 289        struct xfs_buf_log_item *bip = bp->b_fspriv;
 290
 291        if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
 292                return;
 293        if (bip)
 294                block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 295        xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 296}
 297
 298bool
 299xfs_btree_sblock_verify_crc(
 300        struct xfs_buf          *bp)
 301{
 302        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 303        struct xfs_mount        *mp = bp->b_target->bt_mount;
 304
 305        if (xfs_sb_version_hascrc(&mp->m_sb)) {
 306                if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
 307                        return false;
 308                return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 309        }
 310
 311        return true;
 312}
 313
 314static int
 315xfs_btree_free_block(
 316        struct xfs_btree_cur    *cur,
 317        struct xfs_buf          *bp)
 318{
 319        int                     error;
 320
 321        error = cur->bc_ops->free_block(cur, bp);
 322        if (!error) {
 323                xfs_trans_binval(cur->bc_tp, bp);
 324                XFS_BTREE_STATS_INC(cur, free);
 325        }
 326        return error;
 327}
 328
 329/*
 330 * Delete the btree cursor.
 331 */
 332void
 333xfs_btree_del_cursor(
 334        xfs_btree_cur_t *cur,           /* btree cursor */
 335        int             error)          /* del because of error */
 336{
 337        int             i;              /* btree level */
 338
 339        /*
 340         * Clear the buffer pointers, and release the buffers.
 341         * If we're doing this in the face of an error, we
 342         * need to make sure to inspect all of the entries
 343         * in the bc_bufs array for buffers to be unlocked.
 344         * This is because some of the btree code works from
 345         * level n down to 0, and if we get an error along
 346         * the way we won't have initialized all the entries
 347         * down to 0.
 348         */
 349        for (i = 0; i < cur->bc_nlevels; i++) {
 350                if (cur->bc_bufs[i])
 351                        xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
 352                else if (!error)
 353                        break;
 354        }
 355        /*
 356         * Can't free a bmap cursor without having dealt with the
 357         * allocated indirect blocks' accounting.
 358         */
 359        ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
 360               cur->bc_private.b.allocated == 0);
 361        /*
 362         * Free the cursor.
 363         */
 364        kmem_zone_free(xfs_btree_cur_zone, cur);
 365}
 366
 367/*
 368 * Duplicate the btree cursor.
 369 * Allocate a new one, copy the record, re-get the buffers.
 370 */
 371int                                     /* error */
 372xfs_btree_dup_cursor(
 373        xfs_btree_cur_t *cur,           /* input cursor */
 374        xfs_btree_cur_t **ncur)         /* output cursor */
 375{
 376        xfs_buf_t       *bp;            /* btree block's buffer pointer */
 377        int             error;          /* error return value */
 378        int             i;              /* level number of btree block */
 379        xfs_mount_t     *mp;            /* mount structure for filesystem */
 380        xfs_btree_cur_t *new;           /* new cursor value */
 381        xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
 382
 383        tp = cur->bc_tp;
 384        mp = cur->bc_mp;
 385
 386        /*
 387         * Allocate a new cursor like the old one.
 388         */
 389        new = cur->bc_ops->dup_cursor(cur);
 390
 391        /*
 392         * Copy the record currently in the cursor.
 393         */
 394        new->bc_rec = cur->bc_rec;
 395
 396        /*
 397         * For each level current, re-get the buffer and copy the ptr value.
 398         */
 399        for (i = 0; i < new->bc_nlevels; i++) {
 400                new->bc_ptrs[i] = cur->bc_ptrs[i];
 401                new->bc_ra[i] = cur->bc_ra[i];
 402                bp = cur->bc_bufs[i];
 403                if (bp) {
 404                        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 405                                                   XFS_BUF_ADDR(bp), mp->m_bsize,
 406                                                   0, &bp,
 407                                                   cur->bc_ops->buf_ops);
 408                        if (error) {
 409                                xfs_btree_del_cursor(new, error);
 410                                *ncur = NULL;
 411                                return error;
 412                        }
 413                }
 414                new->bc_bufs[i] = bp;
 415        }
 416        *ncur = new;
 417        return 0;
 418}
 419
 420/*
 421 * XFS btree block layout and addressing:
 422 *
 423 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 424 *
 425 * The leaf record start with a header then followed by records containing
 426 * the values.  A non-leaf block also starts with the same header, and
 427 * then first contains lookup keys followed by an equal number of pointers
 428 * to the btree blocks at the previous level.
 429 *
 430 *              +--------+-------+-------+-------+-------+-------+-------+
 431 * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 432 *              +--------+-------+-------+-------+-------+-------+-------+
 433 *
 434 *              +--------+-------+-------+-------+-------+-------+-------+
 435 * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 436 *              +--------+-------+-------+-------+-------+-------+-------+
 437 *
 438 * The header is called struct xfs_btree_block for reasons better left unknown
 439 * and comes in different versions for short (32bit) and long (64bit) block
 440 * pointers.  The record and key structures are defined by the btree instances
 441 * and opaque to the btree core.  The block pointers are simple disk endian
 442 * integers, available in a short (32bit) and long (64bit) variant.
 443 *
 444 * The helpers below calculate the offset of a given record, key or pointer
 445 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 446 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
 447 * inside the btree block is done using indices starting at one, not zero!
 448 *
 449 * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
 450 * overlapping intervals.  In such a tree, records are still sorted lowest to
 451 * highest and indexed by the smallest key value that refers to the record.
 452 * However, nodes are different: each pointer has two associated keys -- one
 453 * indexing the lowest key available in the block(s) below (the same behavior
 454 * as the key in a regular btree) and another indexing the highest key
 455 * available in the block(s) below.  Because records are /not/ sorted by the
 456 * highest key, all leaf block updates require us to compute the highest key
 457 * that matches any record in the leaf and to recursively update the high keys
 458 * in the nodes going further up in the tree, if necessary.  Nodes look like
 459 * this:
 460 *
 461 *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
 462 * Non-Leaf:    | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
 463 *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
 464 *
 465 * To perform an interval query on an overlapped tree, perform the usual
 466 * depth-first search and use the low and high keys to decide if we can skip
 467 * that particular node.  If a leaf node is reached, return the records that
 468 * intersect the interval.  Note that an interval query may return numerous
 469 * entries.  For a non-overlapped tree, simply search for the record associated
 470 * with the lowest key and iterate forward until a non-matching record is
 471 * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
 472 * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
 473 * more detail.
 474 *
 475 * Why do we care about overlapping intervals?  Let's say you have a bunch of
 476 * reverse mapping records on a reflink filesystem:
 477 *
 478 * 1: +- file A startblock B offset C length D -----------+
 479 * 2:      +- file E startblock F offset G length H --------------+
 480 * 3:      +- file I startblock F offset J length K --+
 481 * 4:                                                        +- file L... --+
 482 *
 483 * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
 484 * we'd simply increment the length of record 1.  But how do we find the record
 485 * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
 486 * record 3 because the keys are ordered first by startblock.  An interval
 487 * query would return records 1 and 2 because they both overlap (B+D-1), and
 488 * from that we can pick out record 1 as the appropriate left neighbor.
 489 *
 490 * In the non-overlapped case you can do a LE lookup and decrement the cursor
 491 * because a record's interval must end before the next record.
 492 */
 493
 494/*
 495 * Return size of the btree block header for this btree instance.
 496 */
 497static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
 498{
 499        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 500                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 501                        return XFS_BTREE_LBLOCK_CRC_LEN;
 502                return XFS_BTREE_LBLOCK_LEN;
 503        }
 504        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
 505                return XFS_BTREE_SBLOCK_CRC_LEN;
 506        return XFS_BTREE_SBLOCK_LEN;
 507}
 508
 509/*
 510 * Return size of btree block pointers for this btree instance.
 511 */
 512static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
 513{
 514        return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
 515                sizeof(__be64) : sizeof(__be32);
 516}
 517
 518/*
 519 * Calculate offset of the n-th record in a btree block.
 520 */
 521STATIC size_t
 522xfs_btree_rec_offset(
 523        struct xfs_btree_cur    *cur,
 524        int                     n)
 525{
 526        return xfs_btree_block_len(cur) +
 527                (n - 1) * cur->bc_ops->rec_len;
 528}
 529
 530/*
 531 * Calculate offset of the n-th key in a btree block.
 532 */
 533STATIC size_t
 534xfs_btree_key_offset(
 535        struct xfs_btree_cur    *cur,
 536        int                     n)
 537{
 538        return xfs_btree_block_len(cur) +
 539                (n - 1) * cur->bc_ops->key_len;
 540}
 541
 542/*
 543 * Calculate offset of the n-th high key in a btree block.
 544 */
 545STATIC size_t
 546xfs_btree_high_key_offset(
 547        struct xfs_btree_cur    *cur,
 548        int                     n)
 549{
 550        return xfs_btree_block_len(cur) +
 551                (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
 552}
 553
 554/*
 555 * Calculate offset of the n-th block pointer in a btree block.
 556 */
 557STATIC size_t
 558xfs_btree_ptr_offset(
 559        struct xfs_btree_cur    *cur,
 560        int                     n,
 561        int                     level)
 562{
 563        return xfs_btree_block_len(cur) +
 564                cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
 565                (n - 1) * xfs_btree_ptr_len(cur);
 566}
 567
 568/*
 569 * Return a pointer to the n-th record in the btree block.
 570 */
 571STATIC union xfs_btree_rec *
 572xfs_btree_rec_addr(
 573        struct xfs_btree_cur    *cur,
 574        int                     n,
 575        struct xfs_btree_block  *block)
 576{
 577        return (union xfs_btree_rec *)
 578                ((char *)block + xfs_btree_rec_offset(cur, n));
 579}
 580
 581/*
 582 * Return a pointer to the n-th key in the btree block.
 583 */
 584STATIC union xfs_btree_key *
 585xfs_btree_key_addr(
 586        struct xfs_btree_cur    *cur,
 587        int                     n,
 588        struct xfs_btree_block  *block)
 589{
 590        return (union xfs_btree_key *)
 591                ((char *)block + xfs_btree_key_offset(cur, n));
 592}
 593
 594/*
 595 * Return a pointer to the n-th high key in the btree block.
 596 */
 597STATIC union xfs_btree_key *
 598xfs_btree_high_key_addr(
 599        struct xfs_btree_cur    *cur,
 600        int                     n,
 601        struct xfs_btree_block  *block)
 602{
 603        return (union xfs_btree_key *)
 604                ((char *)block + xfs_btree_high_key_offset(cur, n));
 605}
 606
 607/*
 608 * Return a pointer to the n-th block pointer in the btree block.
 609 */
 610STATIC union xfs_btree_ptr *
 611xfs_btree_ptr_addr(
 612        struct xfs_btree_cur    *cur,
 613        int                     n,
 614        struct xfs_btree_block  *block)
 615{
 616        int                     level = xfs_btree_get_level(block);
 617
 618        ASSERT(block->bb_level != 0);
 619
 620        return (union xfs_btree_ptr *)
 621                ((char *)block + xfs_btree_ptr_offset(cur, n, level));
 622}
 623
 624/*
 625 * Get the root block which is stored in the inode.
 626 *
 627 * For now this btree implementation assumes the btree root is always
 628 * stored in the if_broot field of an inode fork.
 629 */
 630STATIC struct xfs_btree_block *
 631xfs_btree_get_iroot(
 632        struct xfs_btree_cur    *cur)
 633{
 634        struct xfs_ifork        *ifp;
 635
 636        ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
 637        return (struct xfs_btree_block *)ifp->if_broot;
 638}
 639
 640/*
 641 * Retrieve the block pointer from the cursor at the given level.
 642 * This may be an inode btree root or from a buffer.
 643 */
 644STATIC struct xfs_btree_block *         /* generic btree block pointer */
 645xfs_btree_get_block(
 646        struct xfs_btree_cur    *cur,   /* btree cursor */
 647        int                     level,  /* level in btree */
 648        struct xfs_buf          **bpp)  /* buffer containing the block */
 649{
 650        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 651            (level == cur->bc_nlevels - 1)) {
 652                *bpp = NULL;
 653                return xfs_btree_get_iroot(cur);
 654        }
 655
 656        *bpp = cur->bc_bufs[level];
 657        return XFS_BUF_TO_BLOCK(*bpp);
 658}
 659
 660/*
 661 * Get a buffer for the block, return it with no data read.
 662 * Long-form addressing.
 663 */
 664xfs_buf_t *                             /* buffer for fsbno */
 665xfs_btree_get_bufl(
 666        xfs_mount_t     *mp,            /* file system mount point */
 667        xfs_trans_t     *tp,            /* transaction pointer */
 668        xfs_fsblock_t   fsbno,          /* file system block number */
 669        uint            lock)           /* lock flags for get_buf */
 670{
 671        xfs_daddr_t             d;              /* real disk block address */
 672
 673        ASSERT(fsbno != NULLFSBLOCK);
 674        d = XFS_FSB_TO_DADDR(mp, fsbno);
 675        return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
 676}
 677
 678/*
 679 * Get a buffer for the block, return it with no data read.
 680 * Short-form addressing.
 681 */
 682xfs_buf_t *                             /* buffer for agno/agbno */
 683xfs_btree_get_bufs(
 684        xfs_mount_t     *mp,            /* file system mount point */
 685        xfs_trans_t     *tp,            /* transaction pointer */
 686        xfs_agnumber_t  agno,           /* allocation group number */
 687        xfs_agblock_t   agbno,          /* allocation group block number */
 688        uint            lock)           /* lock flags for get_buf */
 689{
 690        xfs_daddr_t             d;              /* real disk block address */
 691
 692        ASSERT(agno != NULLAGNUMBER);
 693        ASSERT(agbno != NULLAGBLOCK);
 694        d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 695        return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
 696}
 697
 698/*
 699 * Check for the cursor referring to the last block at the given level.
 700 */
 701int                                     /* 1=is last block, 0=not last block */
 702xfs_btree_islastblock(
 703        xfs_btree_cur_t         *cur,   /* btree cursor */
 704        int                     level)  /* level to check */
 705{
 706        struct xfs_btree_block  *block; /* generic btree block pointer */
 707        xfs_buf_t               *bp;    /* buffer containing block */
 708
 709        block = xfs_btree_get_block(cur, level, &bp);
 710        xfs_btree_check_block(cur, block, level, bp);
 711        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 712                return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
 713        else
 714                return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
 715}
 716
 717/*
 718 * Change the cursor to point to the first record at the given level.
 719 * Other levels are unaffected.
 720 */
 721STATIC int                              /* success=1, failure=0 */
 722xfs_btree_firstrec(
 723        xfs_btree_cur_t         *cur,   /* btree cursor */
 724        int                     level)  /* level to change */
 725{
 726        struct xfs_btree_block  *block; /* generic btree block pointer */
 727        xfs_buf_t               *bp;    /* buffer containing block */
 728
 729        /*
 730         * Get the block pointer for this level.
 731         */
 732        block = xfs_btree_get_block(cur, level, &bp);
 733        xfs_btree_check_block(cur, block, level, bp);
 734        /*
 735         * It's empty, there is no such record.
 736         */
 737        if (!block->bb_numrecs)
 738                return 0;
 739        /*
 740         * Set the ptr value to 1, that's the first record/key.
 741         */
 742        cur->bc_ptrs[level] = 1;
 743        return 1;
 744}
 745
 746/*
 747 * Change the cursor to point to the last record in the current block
 748 * at the given level.  Other levels are unaffected.
 749 */
 750STATIC int                              /* success=1, failure=0 */
 751xfs_btree_lastrec(
 752        xfs_btree_cur_t         *cur,   /* btree cursor */
 753        int                     level)  /* level to change */
 754{
 755        struct xfs_btree_block  *block; /* generic btree block pointer */
 756        xfs_buf_t               *bp;    /* buffer containing block */
 757
 758        /*
 759         * Get the block pointer for this level.
 760         */
 761        block = xfs_btree_get_block(cur, level, &bp);
 762        xfs_btree_check_block(cur, block, level, bp);
 763        /*
 764         * It's empty, there is no such record.
 765         */
 766        if (!block->bb_numrecs)
 767                return 0;
 768        /*
 769         * Set the ptr value to numrecs, that's the last record/key.
 770         */
 771        cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
 772        return 1;
 773}
 774
 775/*
 776 * Compute first and last byte offsets for the fields given.
 777 * Interprets the offsets table, which contains struct field offsets.
 778 */
 779void
 780xfs_btree_offsets(
 781        __int64_t       fields,         /* bitmask of fields */
 782        const short     *offsets,       /* table of field offsets */
 783        int             nbits,          /* number of bits to inspect */
 784        int             *first,         /* output: first byte offset */
 785        int             *last)          /* output: last byte offset */
 786{
 787        int             i;              /* current bit number */
 788        __int64_t       imask;          /* mask for current bit number */
 789
 790        ASSERT(fields != 0);
 791        /*
 792         * Find the lowest bit, so the first byte offset.
 793         */
 794        for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
 795                if (imask & fields) {
 796                        *first = offsets[i];
 797                        break;
 798                }
 799        }
 800        /*
 801         * Find the highest bit, so the last byte offset.
 802         */
 803        for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
 804                if (imask & fields) {
 805                        *last = offsets[i + 1] - 1;
 806                        break;
 807                }
 808        }
 809}
 810
 811/*
 812 * Get a buffer for the block, return it read in.
 813 * Long-form addressing.
 814 */
 815int
 816xfs_btree_read_bufl(
 817        struct xfs_mount        *mp,            /* file system mount point */
 818        struct xfs_trans        *tp,            /* transaction pointer */
 819        xfs_fsblock_t           fsbno,          /* file system block number */
 820        uint                    lock,           /* lock flags for read_buf */
 821        struct xfs_buf          **bpp,          /* buffer for fsbno */
 822        int                     refval,         /* ref count value for buffer */
 823        const struct xfs_buf_ops *ops)
 824{
 825        struct xfs_buf          *bp;            /* return value */
 826        xfs_daddr_t             d;              /* real disk block address */
 827        int                     error;
 828
 829        if (!XFS_FSB_SANITY_CHECK(mp, fsbno))
 830                return -EFSCORRUPTED;
 831        d = XFS_FSB_TO_DADDR(mp, fsbno);
 832        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
 833                                   mp->m_bsize, lock, &bp, ops);
 834        if (error)
 835                return error;
 836        if (bp)
 837                xfs_buf_set_ref(bp, refval);
 838        *bpp = bp;
 839        return 0;
 840}
 841
 842/*
 843 * Read-ahead the block, don't wait for it, don't return a buffer.
 844 * Long-form addressing.
 845 */
 846/* ARGSUSED */
 847void
 848xfs_btree_reada_bufl(
 849        struct xfs_mount        *mp,            /* file system mount point */
 850        xfs_fsblock_t           fsbno,          /* file system block number */
 851        xfs_extlen_t            count,          /* count of filesystem blocks */
 852        const struct xfs_buf_ops *ops)
 853{
 854        xfs_daddr_t             d;
 855
 856        ASSERT(fsbno != NULLFSBLOCK);
 857        d = XFS_FSB_TO_DADDR(mp, fsbno);
 858        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 859}
 860
 861/*
 862 * Read-ahead the block, don't wait for it, don't return a buffer.
 863 * Short-form addressing.
 864 */
 865/* ARGSUSED */
 866void
 867xfs_btree_reada_bufs(
 868        struct xfs_mount        *mp,            /* file system mount point */
 869        xfs_agnumber_t          agno,           /* allocation group number */
 870        xfs_agblock_t           agbno,          /* allocation group block number */
 871        xfs_extlen_t            count,          /* count of filesystem blocks */
 872        const struct xfs_buf_ops *ops)
 873{
 874        xfs_daddr_t             d;
 875
 876        ASSERT(agno != NULLAGNUMBER);
 877        ASSERT(agbno != NULLAGBLOCK);
 878        d = XFS_AGB_TO_DADDR(mp, agno, agbno);
 879        xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 880}
 881
 882STATIC int
 883xfs_btree_readahead_lblock(
 884        struct xfs_btree_cur    *cur,
 885        int                     lr,
 886        struct xfs_btree_block  *block)
 887{
 888        int                     rval = 0;
 889        xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
 890        xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 891
 892        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
 893                xfs_btree_reada_bufl(cur->bc_mp, left, 1,
 894                                     cur->bc_ops->buf_ops);
 895                rval++;
 896        }
 897
 898        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
 899                xfs_btree_reada_bufl(cur->bc_mp, right, 1,
 900                                     cur->bc_ops->buf_ops);
 901                rval++;
 902        }
 903
 904        return rval;
 905}
 906
 907STATIC int
 908xfs_btree_readahead_sblock(
 909        struct xfs_btree_cur    *cur,
 910        int                     lr,
 911        struct xfs_btree_block *block)
 912{
 913        int                     rval = 0;
 914        xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
 915        xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
 916
 917
 918        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
 919                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
 920                                     left, 1, cur->bc_ops->buf_ops);
 921                rval++;
 922        }
 923
 924        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
 925                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
 926                                     right, 1, cur->bc_ops->buf_ops);
 927                rval++;
 928        }
 929
 930        return rval;
 931}
 932
 933/*
 934 * Read-ahead btree blocks, at the given level.
 935 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 936 */
 937STATIC int
 938xfs_btree_readahead(
 939        struct xfs_btree_cur    *cur,           /* btree cursor */
 940        int                     lev,            /* level in btree */
 941        int                     lr)             /* left/right bits */
 942{
 943        struct xfs_btree_block  *block;
 944
 945        /*
 946         * No readahead needed if we are at the root level and the
 947         * btree root is stored in the inode.
 948         */
 949        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 950            (lev == cur->bc_nlevels - 1))
 951                return 0;
 952
 953        if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
 954                return 0;
 955
 956        cur->bc_ra[lev] |= lr;
 957        block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
 958
 959        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 960                return xfs_btree_readahead_lblock(cur, lr, block);
 961        return xfs_btree_readahead_sblock(cur, lr, block);
 962}
 963
 964STATIC xfs_daddr_t
 965xfs_btree_ptr_to_daddr(
 966        struct xfs_btree_cur    *cur,
 967        union xfs_btree_ptr     *ptr)
 968{
 969        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 970                ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
 971
 972                return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
 973        } else {
 974                ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
 975                ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
 976
 977                return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
 978                                        be32_to_cpu(ptr->s));
 979        }
 980}
 981
 982/*
 983 * Readahead @count btree blocks at the given @ptr location.
 984 *
 985 * We don't need to care about long or short form btrees here as we have a
 986 * method of converting the ptr directly to a daddr available to us.
 987 */
 988STATIC void
 989xfs_btree_readahead_ptr(
 990        struct xfs_btree_cur    *cur,
 991        union xfs_btree_ptr     *ptr,
 992        xfs_extlen_t            count)
 993{
 994        xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
 995                          xfs_btree_ptr_to_daddr(cur, ptr),
 996                          cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
 997}
 998
 999/*
1000 * Set the buffer for level "lev" in the cursor to bp, releasing
1001 * any previous buffer.
1002 */
1003STATIC void
1004xfs_btree_setbuf(
1005        xfs_btree_cur_t         *cur,   /* btree cursor */
1006        int                     lev,    /* level in btree */
1007        xfs_buf_t               *bp)    /* new buffer to set */
1008{
1009        struct xfs_btree_block  *b;     /* btree block */
1010
1011        if (cur->bc_bufs[lev])
1012                xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
1013        cur->bc_bufs[lev] = bp;
1014        cur->bc_ra[lev] = 0;
1015
1016        b = XFS_BUF_TO_BLOCK(bp);
1017        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1018                if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1019                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1020                if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1021                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1022        } else {
1023                if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1024                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
1025                if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1026                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
1027        }
1028}
1029
1030STATIC int
1031xfs_btree_ptr_is_null(
1032        struct xfs_btree_cur    *cur,
1033        union xfs_btree_ptr     *ptr)
1034{
1035        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1036                return ptr->l == cpu_to_be64(NULLFSBLOCK);
1037        else
1038                return ptr->s == cpu_to_be32(NULLAGBLOCK);
1039}
1040
1041STATIC void
1042xfs_btree_set_ptr_null(
1043        struct xfs_btree_cur    *cur,
1044        union xfs_btree_ptr     *ptr)
1045{
1046        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1047                ptr->l = cpu_to_be64(NULLFSBLOCK);
1048        else
1049                ptr->s = cpu_to_be32(NULLAGBLOCK);
1050}
1051
1052/*
1053 * Get/set/init sibling pointers
1054 */
1055STATIC void
1056xfs_btree_get_sibling(
1057        struct xfs_btree_cur    *cur,
1058        struct xfs_btree_block  *block,
1059        union xfs_btree_ptr     *ptr,
1060        int                     lr)
1061{
1062        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1063
1064        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1065                if (lr == XFS_BB_RIGHTSIB)
1066                        ptr->l = block->bb_u.l.bb_rightsib;
1067                else
1068                        ptr->l = block->bb_u.l.bb_leftsib;
1069        } else {
1070                if (lr == XFS_BB_RIGHTSIB)
1071                        ptr->s = block->bb_u.s.bb_rightsib;
1072                else
1073                        ptr->s = block->bb_u.s.bb_leftsib;
1074        }
1075}
1076
1077STATIC void
1078xfs_btree_set_sibling(
1079        struct xfs_btree_cur    *cur,
1080        struct xfs_btree_block  *block,
1081        union xfs_btree_ptr     *ptr,
1082        int                     lr)
1083{
1084        ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1085
1086        if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
1087                if (lr == XFS_BB_RIGHTSIB)
1088                        block->bb_u.l.bb_rightsib = ptr->l;
1089                else
1090                        block->bb_u.l.bb_leftsib = ptr->l;
1091        } else {
1092                if (lr == XFS_BB_RIGHTSIB)
1093                        block->bb_u.s.bb_rightsib = ptr->s;
1094                else
1095                        block->bb_u.s.bb_leftsib = ptr->s;
1096        }
1097}
1098
1099void
1100xfs_btree_init_block_int(
1101        struct xfs_mount        *mp,
1102        struct xfs_btree_block  *buf,
1103        xfs_daddr_t             blkno,
1104        xfs_btnum_t             btnum,
1105        __u16                   level,
1106        __u16                   numrecs,
1107        __u64                   owner,
1108        unsigned int            flags)
1109{
1110        int                     crc = xfs_sb_version_hascrc(&mp->m_sb);
1111        __u32                   magic = xfs_btree_magic(crc, btnum);
1112
1113        buf->bb_magic = cpu_to_be32(magic);
1114        buf->bb_level = cpu_to_be16(level);
1115        buf->bb_numrecs = cpu_to_be16(numrecs);
1116
1117        if (flags & XFS_BTREE_LONG_PTRS) {
1118                buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1119                buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1120                if (crc) {
1121                        buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1122                        buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1123                        uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
1124                        buf->bb_u.l.bb_pad = 0;
1125                        buf->bb_u.l.bb_lsn = 0;
1126                }
1127        } else {
1128                /* owner is a 32 bit value on short blocks */
1129                __u32 __owner = (__u32)owner;
1130
1131                buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1132                buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1133                if (crc) {
1134                        buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1135                        buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1136                        uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
1137                        buf->bb_u.s.bb_lsn = 0;
1138                }
1139        }
1140}
1141
1142void
1143xfs_btree_init_block(
1144        struct xfs_mount *mp,
1145        struct xfs_buf  *bp,
1146        xfs_btnum_t     btnum,
1147        __u16           level,
1148        __u16           numrecs,
1149        __u64           owner,
1150        unsigned int    flags)
1151{
1152        xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1153                                 btnum, level, numrecs, owner, flags);
1154}
1155
1156STATIC void
1157xfs_btree_init_block_cur(
1158        struct xfs_btree_cur    *cur,
1159        struct xfs_buf          *bp,
1160        int                     level,
1161        int                     numrecs)
1162{
1163        __u64                   owner;
1164
1165        /*
1166         * we can pull the owner from the cursor right now as the different
1167         * owners align directly with the pointer size of the btree. This may
1168         * change in future, but is safe for current users of the generic btree
1169         * code.
1170         */
1171        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1172                owner = cur->bc_private.b.ip->i_ino;
1173        else
1174                owner = cur->bc_private.a.agno;
1175
1176        xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1177                                 cur->bc_btnum, level, numrecs,
1178                                 owner, cur->bc_flags);
1179}
1180
1181/*
1182 * Return true if ptr is the last record in the btree and
1183 * we need to track updates to this record.  The decision
1184 * will be further refined in the update_lastrec method.
1185 */
1186STATIC int
1187xfs_btree_is_lastrec(
1188        struct xfs_btree_cur    *cur,
1189        struct xfs_btree_block  *block,
1190        int                     level)
1191{
1192        union xfs_btree_ptr     ptr;
1193
1194        if (level > 0)
1195                return 0;
1196        if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1197                return 0;
1198
1199        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1200        if (!xfs_btree_ptr_is_null(cur, &ptr))
1201                return 0;
1202        return 1;
1203}
1204
1205STATIC void
1206xfs_btree_buf_to_ptr(
1207        struct xfs_btree_cur    *cur,
1208        struct xfs_buf          *bp,
1209        union xfs_btree_ptr     *ptr)
1210{
1211        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1212                ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1213                                        XFS_BUF_ADDR(bp)));
1214        else {
1215                ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1216                                        XFS_BUF_ADDR(bp)));
1217        }
1218}
1219
1220STATIC void
1221xfs_btree_set_refs(
1222        struct xfs_btree_cur    *cur,
1223        struct xfs_buf          *bp)
1224{
1225        switch (cur->bc_btnum) {
1226        case XFS_BTNUM_BNO:
1227        case XFS_BTNUM_CNT:
1228                xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1229                break;
1230        case XFS_BTNUM_INO:
1231        case XFS_BTNUM_FINO:
1232                xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1233                break;
1234        case XFS_BTNUM_BMAP:
1235                xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1236                break;
1237        case XFS_BTNUM_RMAP:
1238                xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
1239                break;
1240        case XFS_BTNUM_REFC:
1241                xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
1242                break;
1243        default:
1244                ASSERT(0);
1245        }
1246}
1247
1248STATIC int
1249xfs_btree_get_buf_block(
1250        struct xfs_btree_cur    *cur,
1251        union xfs_btree_ptr     *ptr,
1252        int                     flags,
1253        struct xfs_btree_block  **block,
1254        struct xfs_buf          **bpp)
1255{
1256        struct xfs_mount        *mp = cur->bc_mp;
1257        xfs_daddr_t             d;
1258
1259        /* need to sort out how callers deal with failures first */
1260        ASSERT(!(flags & XBF_TRYLOCK));
1261
1262        d = xfs_btree_ptr_to_daddr(cur, ptr);
1263        *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1264                                 mp->m_bsize, flags);
1265
1266        if (!*bpp)
1267                return -ENOMEM;
1268
1269        (*bpp)->b_ops = cur->bc_ops->buf_ops;
1270        *block = XFS_BUF_TO_BLOCK(*bpp);
1271        return 0;
1272}
1273
1274/*
1275 * Read in the buffer at the given ptr and return the buffer and
1276 * the block pointer within the buffer.
1277 */
1278STATIC int
1279xfs_btree_read_buf_block(
1280        struct xfs_btree_cur    *cur,
1281        union xfs_btree_ptr     *ptr,
1282        int                     flags,
1283        struct xfs_btree_block  **block,
1284        struct xfs_buf          **bpp)
1285{
1286        struct xfs_mount        *mp = cur->bc_mp;
1287        xfs_daddr_t             d;
1288        int                     error;
1289
1290        /* need to sort out how callers deal with failures first */
1291        ASSERT(!(flags & XBF_TRYLOCK));
1292
1293        d = xfs_btree_ptr_to_daddr(cur, ptr);
1294        error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1295                                   mp->m_bsize, flags, bpp,
1296                                   cur->bc_ops->buf_ops);
1297        if (error)
1298                return error;
1299
1300        xfs_btree_set_refs(cur, *bpp);
1301        *block = XFS_BUF_TO_BLOCK(*bpp);
1302        return 0;
1303}
1304
1305/*
1306 * Copy keys from one btree block to another.
1307 */
1308STATIC void
1309xfs_btree_copy_keys(
1310        struct xfs_btree_cur    *cur,
1311        union xfs_btree_key     *dst_key,
1312        union xfs_btree_key     *src_key,
1313        int                     numkeys)
1314{
1315        ASSERT(numkeys >= 0);
1316        memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1317}
1318
1319/*
1320 * Copy records from one btree block to another.
1321 */
1322STATIC void
1323xfs_btree_copy_recs(
1324        struct xfs_btree_cur    *cur,
1325        union xfs_btree_rec     *dst_rec,
1326        union xfs_btree_rec     *src_rec,
1327        int                     numrecs)
1328{
1329        ASSERT(numrecs >= 0);
1330        memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1331}
1332
1333/*
1334 * Copy block pointers from one btree block to another.
1335 */
1336STATIC void
1337xfs_btree_copy_ptrs(
1338        struct xfs_btree_cur    *cur,
1339        union xfs_btree_ptr     *dst_ptr,
1340        union xfs_btree_ptr     *src_ptr,
1341        int                     numptrs)
1342{
1343        ASSERT(numptrs >= 0);
1344        memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1345}
1346
1347/*
1348 * Shift keys one index left/right inside a single btree block.
1349 */
1350STATIC void
1351xfs_btree_shift_keys(
1352        struct xfs_btree_cur    *cur,
1353        union xfs_btree_key     *key,
1354        int                     dir,
1355        int                     numkeys)
1356{
1357        char                    *dst_key;
1358
1359        ASSERT(numkeys >= 0);
1360        ASSERT(dir == 1 || dir == -1);
1361
1362        dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1363        memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1364}
1365
1366/*
1367 * Shift records one index left/right inside a single btree block.
1368 */
1369STATIC void
1370xfs_btree_shift_recs(
1371        struct xfs_btree_cur    *cur,
1372        union xfs_btree_rec     *rec,
1373        int                     dir,
1374        int                     numrecs)
1375{
1376        char                    *dst_rec;
1377
1378        ASSERT(numrecs >= 0);
1379        ASSERT(dir == 1 || dir == -1);
1380
1381        dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1382        memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1383}
1384
1385/*
1386 * Shift block pointers one index left/right inside a single btree block.
1387 */
1388STATIC void
1389xfs_btree_shift_ptrs(
1390        struct xfs_btree_cur    *cur,
1391        union xfs_btree_ptr     *ptr,
1392        int                     dir,
1393        int                     numptrs)
1394{
1395        char                    *dst_ptr;
1396
1397        ASSERT(numptrs >= 0);
1398        ASSERT(dir == 1 || dir == -1);
1399
1400        dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1401        memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1402}
1403
1404/*
1405 * Log key values from the btree block.
1406 */
1407STATIC void
1408xfs_btree_log_keys(
1409        struct xfs_btree_cur    *cur,
1410        struct xfs_buf          *bp,
1411        int                     first,
1412        int                     last)
1413{
1414        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1415        XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1416
1417        if (bp) {
1418                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1419                xfs_trans_log_buf(cur->bc_tp, bp,
1420                                  xfs_btree_key_offset(cur, first),
1421                                  xfs_btree_key_offset(cur, last + 1) - 1);
1422        } else {
1423                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1424                                xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1425        }
1426
1427        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1428}
1429
1430/*
1431 * Log record values from the btree block.
1432 */
1433void
1434xfs_btree_log_recs(
1435        struct xfs_btree_cur    *cur,
1436        struct xfs_buf          *bp,
1437        int                     first,
1438        int                     last)
1439{
1440        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1441        XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1442
1443        xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1444        xfs_trans_log_buf(cur->bc_tp, bp,
1445                          xfs_btree_rec_offset(cur, first),
1446                          xfs_btree_rec_offset(cur, last + 1) - 1);
1447
1448        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1449}
1450
1451/*
1452 * Log block pointer fields from a btree block (nonleaf).
1453 */
1454STATIC void
1455xfs_btree_log_ptrs(
1456        struct xfs_btree_cur    *cur,   /* btree cursor */
1457        struct xfs_buf          *bp,    /* buffer containing btree block */
1458        int                     first,  /* index of first pointer to log */
1459        int                     last)   /* index of last pointer to log */
1460{
1461        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1462        XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1463
1464        if (bp) {
1465                struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1466                int                     level = xfs_btree_get_level(block);
1467
1468                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1469                xfs_trans_log_buf(cur->bc_tp, bp,
1470                                xfs_btree_ptr_offset(cur, first, level),
1471                                xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1472        } else {
1473                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1474                        xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1475        }
1476
1477        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1478}
1479
1480/*
1481 * Log fields from a btree block header.
1482 */
1483void
1484xfs_btree_log_block(
1485        struct xfs_btree_cur    *cur,   /* btree cursor */
1486        struct xfs_buf          *bp,    /* buffer containing btree block */
1487        int                     fields) /* mask of fields: XFS_BB_... */
1488{
1489        int                     first;  /* first byte offset logged */
1490        int                     last;   /* last byte offset logged */
1491        static const short      soffsets[] = {  /* table of offsets (short) */
1492                offsetof(struct xfs_btree_block, bb_magic),
1493                offsetof(struct xfs_btree_block, bb_level),
1494                offsetof(struct xfs_btree_block, bb_numrecs),
1495                offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1496                offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1497                offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1498                offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1499                offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1500                offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1501                offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1502                XFS_BTREE_SBLOCK_CRC_LEN
1503        };
1504        static const short      loffsets[] = {  /* table of offsets (long) */
1505                offsetof(struct xfs_btree_block, bb_magic),
1506                offsetof(struct xfs_btree_block, bb_level),
1507                offsetof(struct xfs_btree_block, bb_numrecs),
1508                offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1509                offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1510                offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1511                offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1512                offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1513                offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1514                offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1515                offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1516                XFS_BTREE_LBLOCK_CRC_LEN
1517        };
1518
1519        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1520        XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
1521
1522        if (bp) {
1523                int nbits;
1524
1525                if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1526                        /*
1527                         * We don't log the CRC when updating a btree
1528                         * block but instead recreate it during log
1529                         * recovery.  As the log buffers have checksums
1530                         * of their own this is safe and avoids logging a crc
1531                         * update in a lot of places.
1532                         */
1533                        if (fields == XFS_BB_ALL_BITS)
1534                                fields = XFS_BB_ALL_BITS_CRC;
1535                        nbits = XFS_BB_NUM_BITS_CRC;
1536                } else {
1537                        nbits = XFS_BB_NUM_BITS;
1538                }
1539                xfs_btree_offsets(fields,
1540                                  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1541                                        loffsets : soffsets,
1542                                  nbits, &first, &last);
1543                xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1544                xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1545        } else {
1546                xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1547                        xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1548        }
1549
1550        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1551}
1552
1553/*
1554 * Increment cursor by one record at the level.
1555 * For nonzero levels the leaf-ward information is untouched.
1556 */
1557int                                             /* error */
1558xfs_btree_increment(
1559        struct xfs_btree_cur    *cur,
1560        int                     level,
1561        int                     *stat)          /* success/failure */
1562{
1563        struct xfs_btree_block  *block;
1564        union xfs_btree_ptr     ptr;
1565        struct xfs_buf          *bp;
1566        int                     error;          /* error return value */
1567        int                     lev;
1568
1569        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1570        XFS_BTREE_TRACE_ARGI(cur, level);
1571
1572        ASSERT(level < cur->bc_nlevels);
1573
1574        /* Read-ahead to the right at this level. */
1575        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1576
1577        /* Get a pointer to the btree block. */
1578        block = xfs_btree_get_block(cur, level, &bp);
1579
1580#ifdef DEBUG
1581        error = xfs_btree_check_block(cur, block, level, bp);
1582        if (error)
1583                goto error0;
1584#endif
1585
1586        /* We're done if we remain in the block after the increment. */
1587        if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1588                goto out1;
1589
1590        /* Fail if we just went off the right edge of the tree. */
1591        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1592        if (xfs_btree_ptr_is_null(cur, &ptr))
1593                goto out0;
1594
1595        XFS_BTREE_STATS_INC(cur, increment);
1596
1597        /*
1598         * March up the tree incrementing pointers.
1599         * Stop when we don't go off the right edge of a block.
1600         */
1601        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1602                block = xfs_btree_get_block(cur, lev, &bp);
1603
1604#ifdef DEBUG
1605                error = xfs_btree_check_block(cur, block, lev, bp);
1606                if (error)
1607                        goto error0;
1608#endif
1609
1610                if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1611                        break;
1612
1613                /* Read-ahead the right block for the next loop. */
1614                xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1615        }
1616
1617        /*
1618         * If we went off the root then we are either seriously
1619         * confused or have the tree root in an inode.
1620         */
1621        if (lev == cur->bc_nlevels) {
1622                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1623                        goto out0;
1624                ASSERT(0);
1625                error = -EFSCORRUPTED;
1626                goto error0;
1627        }
1628        ASSERT(lev < cur->bc_nlevels);
1629
1630        /*
1631         * Now walk back down the tree, fixing up the cursor's buffer
1632         * pointers and key numbers.
1633         */
1634        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1635                union xfs_btree_ptr     *ptrp;
1636
1637                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1638                --lev;
1639                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1640                if (error)
1641                        goto error0;
1642
1643                xfs_btree_setbuf(cur, lev, bp);
1644                cur->bc_ptrs[lev] = 1;
1645        }
1646out1:
1647        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1648        *stat = 1;
1649        return 0;
1650
1651out0:
1652        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1653        *stat = 0;
1654        return 0;
1655
1656error0:
1657        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1658        return error;
1659}
1660
1661/*
1662 * Decrement cursor by one record at the level.
1663 * For nonzero levels the leaf-ward information is untouched.
1664 */
1665int                                             /* error */
1666xfs_btree_decrement(
1667        struct xfs_btree_cur    *cur,
1668        int                     level,
1669        int                     *stat)          /* success/failure */
1670{
1671        struct xfs_btree_block  *block;
1672        xfs_buf_t               *bp;
1673        int                     error;          /* error return value */
1674        int                     lev;
1675        union xfs_btree_ptr     ptr;
1676
1677        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1678        XFS_BTREE_TRACE_ARGI(cur, level);
1679
1680        ASSERT(level < cur->bc_nlevels);
1681
1682        /* Read-ahead to the left at this level. */
1683        xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1684
1685        /* We're done if we remain in the block after the decrement. */
1686        if (--cur->bc_ptrs[level] > 0)
1687                goto out1;
1688
1689        /* Get a pointer to the btree block. */
1690        block = xfs_btree_get_block(cur, level, &bp);
1691
1692#ifdef DEBUG
1693        error = xfs_btree_check_block(cur, block, level, bp);
1694        if (error)
1695                goto error0;
1696#endif
1697
1698        /* Fail if we just went off the left edge of the tree. */
1699        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1700        if (xfs_btree_ptr_is_null(cur, &ptr))
1701                goto out0;
1702
1703        XFS_BTREE_STATS_INC(cur, decrement);
1704
1705        /*
1706         * March up the tree decrementing pointers.
1707         * Stop when we don't go off the left edge of a block.
1708         */
1709        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1710                if (--cur->bc_ptrs[lev] > 0)
1711                        break;
1712                /* Read-ahead the left block for the next loop. */
1713                xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1714        }
1715
1716        /*
1717         * If we went off the root then we are seriously confused.
1718         * or the root of the tree is in an inode.
1719         */
1720        if (lev == cur->bc_nlevels) {
1721                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1722                        goto out0;
1723                ASSERT(0);
1724                error = -EFSCORRUPTED;
1725                goto error0;
1726        }
1727        ASSERT(lev < cur->bc_nlevels);
1728
1729        /*
1730         * Now walk back down the tree, fixing up the cursor's buffer
1731         * pointers and key numbers.
1732         */
1733        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1734                union xfs_btree_ptr     *ptrp;
1735
1736                ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1737                --lev;
1738                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1739                if (error)
1740                        goto error0;
1741                xfs_btree_setbuf(cur, lev, bp);
1742                cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1743        }
1744out1:
1745        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1746        *stat = 1;
1747        return 0;
1748
1749out0:
1750        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1751        *stat = 0;
1752        return 0;
1753
1754error0:
1755        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1756        return error;
1757}
1758
1759STATIC int
1760xfs_btree_lookup_get_block(
1761        struct xfs_btree_cur    *cur,   /* btree cursor */
1762        int                     level,  /* level in the btree */
1763        union xfs_btree_ptr     *pp,    /* ptr to btree block */
1764        struct xfs_btree_block  **blkp) /* return btree block */
1765{
1766        struct xfs_buf          *bp;    /* buffer pointer for btree block */
1767        int                     error = 0;
1768
1769        /* special case the root block if in an inode */
1770        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1771            (level == cur->bc_nlevels - 1)) {
1772                *blkp = xfs_btree_get_iroot(cur);
1773                return 0;
1774        }
1775
1776        /*
1777         * If the old buffer at this level for the disk address we are
1778         * looking for re-use it.
1779         *
1780         * Otherwise throw it away and get a new one.
1781         */
1782        bp = cur->bc_bufs[level];
1783        if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1784                *blkp = XFS_BUF_TO_BLOCK(bp);
1785                return 0;
1786        }
1787
1788        error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1789        if (error)
1790                return error;
1791
1792        /* Check the inode owner since the verifiers don't. */
1793        if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
1794            (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
1795            be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
1796                        cur->bc_private.b.ip->i_ino)
1797                goto out_bad;
1798
1799        /* Did we get the level we were looking for? */
1800        if (be16_to_cpu((*blkp)->bb_level) != level)
1801                goto out_bad;
1802
1803        /* Check that internal nodes have at least one record. */
1804        if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
1805                goto out_bad;
1806
1807        xfs_btree_setbuf(cur, level, bp);
1808        return 0;
1809
1810out_bad:
1811        *blkp = NULL;
1812        xfs_trans_brelse(cur->bc_tp, bp);
1813        return -EFSCORRUPTED;
1814}
1815
1816/*
1817 * Get current search key.  For level 0 we don't actually have a key
1818 * structure so we make one up from the record.  For all other levels
1819 * we just return the right key.
1820 */
1821STATIC union xfs_btree_key *
1822xfs_lookup_get_search_key(
1823        struct xfs_btree_cur    *cur,
1824        int                     level,
1825        int                     keyno,
1826        struct xfs_btree_block  *block,
1827        union xfs_btree_key     *kp)
1828{
1829        if (level == 0) {
1830                cur->bc_ops->init_key_from_rec(kp,
1831                                xfs_btree_rec_addr(cur, keyno, block));
1832                return kp;
1833        }
1834
1835        return xfs_btree_key_addr(cur, keyno, block);
1836}
1837
1838/*
1839 * Lookup the record.  The cursor is made to point to it, based on dir.
1840 * stat is set to 0 if can't find any such record, 1 for success.
1841 */
1842int                                     /* error */
1843xfs_btree_lookup(
1844        struct xfs_btree_cur    *cur,   /* btree cursor */
1845        xfs_lookup_t            dir,    /* <=, ==, or >= */
1846        int                     *stat)  /* success/failure */
1847{
1848        struct xfs_btree_block  *block; /* current btree block */
1849        __int64_t               diff;   /* difference for the current key */
1850        int                     error;  /* error return value */
1851        int                     keyno;  /* current key number */
1852        int                     level;  /* level in the btree */
1853        union xfs_btree_ptr     *pp;    /* ptr to btree block */
1854        union xfs_btree_ptr     ptr;    /* ptr to btree block */
1855
1856        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1857        XFS_BTREE_TRACE_ARGI(cur, dir);
1858
1859        XFS_BTREE_STATS_INC(cur, lookup);
1860
1861        /* No such thing as a zero-level tree. */
1862        if (cur->bc_nlevels == 0)
1863                return -EFSCORRUPTED;
1864
1865        block = NULL;
1866        keyno = 0;
1867
1868        /* initialise start pointer from cursor */
1869        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1870        pp = &ptr;
1871
1872        /*
1873         * Iterate over each level in the btree, starting at the root.
1874         * For each level above the leaves, find the key we need, based
1875         * on the lookup record, then follow the corresponding block
1876         * pointer down to the next level.
1877         */
1878        for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1879                /* Get the block we need to do the lookup on. */
1880                error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1881                if (error)
1882                        goto error0;
1883
1884                if (diff == 0) {
1885                        /*
1886                         * If we already had a key match at a higher level, we
1887                         * know we need to use the first entry in this block.
1888                         */
1889                        keyno = 1;
1890                } else {
1891                        /* Otherwise search this block. Do a binary search. */
1892
1893                        int     high;   /* high entry number */
1894                        int     low;    /* low entry number */
1895
1896                        /* Set low and high entry numbers, 1-based. */
1897                        low = 1;
1898                        high = xfs_btree_get_numrecs(block);
1899                        if (!high) {
1900                                /* Block is empty, must be an empty leaf. */
1901                                ASSERT(level == 0 && cur->bc_nlevels == 1);
1902
1903                                cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1904                                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1905                                *stat = 0;
1906                                return 0;
1907                        }
1908
1909                        /* Binary search the block. */
1910                        while (low <= high) {
1911                                union xfs_btree_key     key;
1912                                union xfs_btree_key     *kp;
1913
1914                                XFS_BTREE_STATS_INC(cur, compare);
1915
1916                                /* keyno is average of low and high. */
1917                                keyno = (low + high) >> 1;
1918
1919                                /* Get current search key */
1920                                kp = xfs_lookup_get_search_key(cur, level,
1921                                                keyno, block, &key);
1922
1923                                /*
1924                                 * Compute difference to get next direction:
1925                                 *  - less than, move right
1926                                 *  - greater than, move left
1927                                 *  - equal, we're done
1928                                 */
1929                                diff = cur->bc_ops->key_diff(cur, kp);
1930                                if (diff < 0)
1931                                        low = keyno + 1;
1932                                else if (diff > 0)
1933                                        high = keyno - 1;
1934                                else
1935                                        break;
1936                        }
1937                }
1938
1939                /*
1940                 * If there are more levels, set up for the next level
1941                 * by getting the block number and filling in the cursor.
1942                 */
1943                if (level > 0) {
1944                        /*
1945                         * If we moved left, need the previous key number,
1946                         * unless there isn't one.
1947                         */
1948                        if (diff > 0 && --keyno < 1)
1949                                keyno = 1;
1950                        pp = xfs_btree_ptr_addr(cur, keyno, block);
1951
1952#ifdef DEBUG
1953                        error = xfs_btree_check_ptr(cur, pp, 0, level);
1954                        if (error)
1955                                goto error0;
1956#endif
1957                        cur->bc_ptrs[level] = keyno;
1958                }
1959        }
1960
1961        /* Done with the search. See if we need to adjust the results. */
1962        if (dir != XFS_LOOKUP_LE && diff < 0) {
1963                keyno++;
1964                /*
1965                 * If ge search and we went off the end of the block, but it's
1966                 * not the last block, we're in the wrong block.
1967                 */
1968                xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1969                if (dir == XFS_LOOKUP_GE &&
1970                    keyno > xfs_btree_get_numrecs(block) &&
1971                    !xfs_btree_ptr_is_null(cur, &ptr)) {
1972                        int     i;
1973
1974                        cur->bc_ptrs[0] = keyno;
1975                        error = xfs_btree_increment(cur, 0, &i);
1976                        if (error)
1977                                goto error0;
1978                        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1979                        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1980                        *stat = 1;
1981                        return 0;
1982                }
1983        } else if (dir == XFS_LOOKUP_LE && diff > 0)
1984                keyno--;
1985        cur->bc_ptrs[0] = keyno;
1986
1987        /* Return if we succeeded or not. */
1988        if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1989                *stat = 0;
1990        else if (dir != XFS_LOOKUP_EQ || diff == 0)
1991                *stat = 1;
1992        else
1993                *stat = 0;
1994        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1995        return 0;
1996
1997error0:
1998        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1999        return error;
2000}
2001
2002/* Find the high key storage area from a regular key. */
2003STATIC union xfs_btree_key *
2004xfs_btree_high_key_from_key(
2005        struct xfs_btree_cur    *cur,
2006        union xfs_btree_key     *key)
2007{
2008        ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2009        return (union xfs_btree_key *)((char *)key +
2010                        (cur->bc_ops->key_len / 2));
2011}
2012
2013/* Determine the low (and high if overlapped) keys of a leaf block */
2014STATIC void
2015xfs_btree_get_leaf_keys(
2016        struct xfs_btree_cur    *cur,
2017        struct xfs_btree_block  *block,
2018        union xfs_btree_key     *key)
2019{
2020        union xfs_btree_key     max_hkey;
2021        union xfs_btree_key     hkey;
2022        union xfs_btree_rec     *rec;
2023        union xfs_btree_key     *high;
2024        int                     n;
2025
2026        rec = xfs_btree_rec_addr(cur, 1, block);
2027        cur->bc_ops->init_key_from_rec(key, rec);
2028
2029        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2030
2031                cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
2032                for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2033                        rec = xfs_btree_rec_addr(cur, n, block);
2034                        cur->bc_ops->init_high_key_from_rec(&hkey, rec);
2035                        if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey)
2036                                        > 0)
2037                                max_hkey = hkey;
2038                }
2039
2040                high = xfs_btree_high_key_from_key(cur, key);
2041                memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2042        }
2043}
2044
2045/* Determine the low (and high if overlapped) keys of a node block */
2046STATIC void
2047xfs_btree_get_node_keys(
2048        struct xfs_btree_cur    *cur,
2049        struct xfs_btree_block  *block,
2050        union xfs_btree_key     *key)
2051{
2052        union xfs_btree_key     *hkey;
2053        union xfs_btree_key     *max_hkey;
2054        union xfs_btree_key     *high;
2055        int                     n;
2056
2057        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2058                memcpy(key, xfs_btree_key_addr(cur, 1, block),
2059                                cur->bc_ops->key_len / 2);
2060
2061                max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2062                for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2063                        hkey = xfs_btree_high_key_addr(cur, n, block);
2064                        if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0)
2065                                max_hkey = hkey;
2066                }
2067
2068                high = xfs_btree_high_key_from_key(cur, key);
2069                memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2070        } else {
2071                memcpy(key, xfs_btree_key_addr(cur, 1, block),
2072                                cur->bc_ops->key_len);
2073        }
2074}
2075
2076/* Derive the keys for any btree block. */
2077STATIC void
2078xfs_btree_get_keys(
2079        struct xfs_btree_cur    *cur,
2080        struct xfs_btree_block  *block,
2081        union xfs_btree_key     *key)
2082{
2083        if (be16_to_cpu(block->bb_level) == 0)
2084                xfs_btree_get_leaf_keys(cur, block, key);
2085        else
2086                xfs_btree_get_node_keys(cur, block, key);
2087}
2088
2089/*
2090 * Decide if we need to update the parent keys of a btree block.  For
2091 * a standard btree this is only necessary if we're updating the first
2092 * record/key.  For an overlapping btree, we must always update the
2093 * keys because the highest key can be in any of the records or keys
2094 * in the block.
2095 */
2096static inline bool
2097xfs_btree_needs_key_update(
2098        struct xfs_btree_cur    *cur,
2099        int                     ptr)
2100{
2101        return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
2102}
2103
2104/*
2105 * Update the low and high parent keys of the given level, progressing
2106 * towards the root.  If force_all is false, stop if the keys for a given
2107 * level do not need updating.
2108 */
2109STATIC int
2110__xfs_btree_updkeys(
2111        struct xfs_btree_cur    *cur,
2112        int                     level,
2113        struct xfs_btree_block  *block,
2114        struct xfs_buf          *bp0,
2115        bool                    force_all)
2116{
2117        union xfs_btree_key     key;    /* keys from current level */
2118        union xfs_btree_key     *lkey;  /* keys from the next level up */
2119        union xfs_btree_key     *hkey;
2120        union xfs_btree_key     *nlkey; /* keys from the next level up */
2121        union xfs_btree_key     *nhkey;
2122        struct xfs_buf          *bp;
2123        int                     ptr;
2124
2125        ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
2126
2127        /* Exit if there aren't any parent levels to update. */
2128        if (level + 1 >= cur->bc_nlevels)
2129                return 0;
2130
2131        trace_xfs_btree_updkeys(cur, level, bp0);
2132
2133        lkey = &key;
2134        hkey = xfs_btree_high_key_from_key(cur, lkey);
2135        xfs_btree_get_keys(cur, block, lkey);
2136        for (level++; level < cur->bc_nlevels; level++) {
2137#ifdef DEBUG
2138                int             error;
2139#endif
2140                block = xfs_btree_get_block(cur, level, &bp);
2141                trace_xfs_btree_updkeys(cur, level, bp);
2142#ifdef DEBUG
2143                error = xfs_btree_check_block(cur, block, level, bp);
2144                if (error) {
2145                        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2146                        return error;
2147                }
2148#endif
2149                ptr = cur->bc_ptrs[level];
2150                nlkey = xfs_btree_key_addr(cur, ptr, block);
2151                nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2152                if (!force_all &&
2153                    !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 ||
2154                      cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0))
2155                        break;
2156                xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2157                xfs_btree_log_keys(cur, bp, ptr, ptr);
2158                if (level + 1 >= cur->bc_nlevels)
2159                        break;
2160                xfs_btree_get_node_keys(cur, block, lkey);
2161        }
2162
2163        return 0;
2164}
2165
2166/* Update all the keys from some level in cursor back to the root. */
2167STATIC int
2168xfs_btree_updkeys_force(
2169        struct xfs_btree_cur    *cur,
2170        int                     level)
2171{
2172        struct xfs_buf          *bp;
2173        struct xfs_btree_block  *block;
2174
2175        block = xfs_btree_get_block(cur, level, &bp);
2176        return __xfs_btree_updkeys(cur, level, block, bp, true);
2177}
2178
2179/*
2180 * Update the parent keys of the given level, progressing towards the root.
2181 */
2182STATIC int
2183xfs_btree_update_keys(
2184        struct xfs_btree_cur    *cur,
2185        int                     level)
2186{
2187        struct xfs_btree_block  *block;
2188        struct xfs_buf          *bp;
2189        union xfs_btree_key     *kp;
2190        union xfs_btree_key     key;
2191        int                     ptr;
2192
2193        ASSERT(level >= 0);
2194
2195        block = xfs_btree_get_block(cur, level, &bp);
2196        if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
2197                return __xfs_btree_updkeys(cur, level, block, bp, false);
2198
2199        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2200        XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
2201
2202        /*
2203         * Go up the tree from this level toward the root.
2204         * At each level, update the key value to the value input.
2205         * Stop when we reach a level where the cursor isn't pointing
2206         * at the first entry in the block.
2207         */
2208        xfs_btree_get_keys(cur, block, &key);
2209        for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2210#ifdef DEBUG
2211                int             error;
2212#endif
2213                block = xfs_btree_get_block(cur, level, &bp);
2214#ifdef DEBUG
2215                error = xfs_btree_check_block(cur, block, level, bp);
2216                if (error) {
2217                        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2218                        return error;
2219                }
2220#endif
2221                ptr = cur->bc_ptrs[level];
2222                kp = xfs_btree_key_addr(cur, ptr, block);
2223                xfs_btree_copy_keys(cur, kp, &key, 1);
2224                xfs_btree_log_keys(cur, bp, ptr, ptr);
2225        }
2226
2227        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2228        return 0;
2229}
2230
2231/*
2232 * Update the record referred to by cur to the value in the
2233 * given record. This either works (return 0) or gets an
2234 * EFSCORRUPTED error.
2235 */
2236int
2237xfs_btree_update(
2238        struct xfs_btree_cur    *cur,
2239        union xfs_btree_rec     *rec)
2240{
2241        struct xfs_btree_block  *block;
2242        struct xfs_buf          *bp;
2243        int                     error;
2244        int                     ptr;
2245        union xfs_btree_rec     *rp;
2246
2247        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2248        XFS_BTREE_TRACE_ARGR(cur, rec);
2249
2250        /* Pick up the current block. */
2251        block = xfs_btree_get_block(cur, 0, &bp);
2252
2253#ifdef DEBUG
2254        error = xfs_btree_check_block(cur, block, 0, bp);
2255        if (error)
2256                goto error0;
2257#endif
2258        /* Get the address of the rec to be updated. */
2259        ptr = cur->bc_ptrs[0];
2260        rp = xfs_btree_rec_addr(cur, ptr, block);
2261
2262        /* Fill in the new contents and log them. */
2263        xfs_btree_copy_recs(cur, rp, rec, 1);
2264        xfs_btree_log_recs(cur, bp, ptr, ptr);
2265
2266        /*
2267         * If we are tracking the last record in the tree and
2268         * we are at the far right edge of the tree, update it.
2269         */
2270        if (xfs_btree_is_lastrec(cur, block, 0)) {
2271                cur->bc_ops->update_lastrec(cur, block, rec,
2272                                            ptr, LASTREC_UPDATE);
2273        }
2274
2275        /* Pass new key value up to our parent. */
2276        if (xfs_btree_needs_key_update(cur, ptr)) {
2277                error = xfs_btree_update_keys(cur, 0);
2278                if (error)
2279                        goto error0;
2280        }
2281
2282        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2283        return 0;
2284
2285error0:
2286        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2287        return error;
2288}
2289
2290/*
2291 * Move 1 record left from cur/level if possible.
2292 * Update cur to reflect the new path.
2293 */
2294STATIC int                                      /* error */
2295xfs_btree_lshift(
2296        struct xfs_btree_cur    *cur,
2297        int                     level,
2298        int                     *stat)          /* success/failure */
2299{
2300        struct xfs_buf          *lbp;           /* left buffer pointer */
2301        struct xfs_btree_block  *left;          /* left btree block */
2302        int                     lrecs;          /* left record count */
2303        struct xfs_buf          *rbp;           /* right buffer pointer */
2304        struct xfs_btree_block  *right;         /* right btree block */
2305        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2306        int                     rrecs;          /* right record count */
2307        union xfs_btree_ptr     lptr;           /* left btree pointer */
2308        union xfs_btree_key     *rkp = NULL;    /* right btree key */
2309        union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
2310        union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
2311        int                     error;          /* error return value */
2312        int                     i;
2313
2314        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2315        XFS_BTREE_TRACE_ARGI(cur, level);
2316
2317        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2318            level == cur->bc_nlevels - 1)
2319                goto out0;
2320
2321        /* Set up variables for this block as "right". */
2322        right = xfs_btree_get_block(cur, level, &rbp);
2323
2324#ifdef DEBUG
2325        error = xfs_btree_check_block(cur, right, level, rbp);
2326        if (error)
2327                goto error0;
2328#endif
2329
2330        /* If we've got no left sibling then we can't shift an entry left. */
2331        xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2332        if (xfs_btree_ptr_is_null(cur, &lptr))
2333                goto out0;
2334
2335        /*
2336         * If the cursor entry is the one that would be moved, don't
2337         * do it... it's too complicated.
2338         */
2339        if (cur->bc_ptrs[level] <= 1)
2340                goto out0;
2341
2342        /* Set up the left neighbor as "left". */
2343        error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2344        if (error)
2345                goto error0;
2346
2347        /* If it's full, it can't take another entry. */
2348        lrecs = xfs_btree_get_numrecs(left);
2349        if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2350                goto out0;
2351
2352        rrecs = xfs_btree_get_numrecs(right);
2353
2354        /*
2355         * We add one entry to the left side and remove one for the right side.
2356         * Account for it here, the changes will be updated on disk and logged
2357         * later.
2358         */
2359        lrecs++;
2360        rrecs--;
2361
2362        XFS_BTREE_STATS_INC(cur, lshift);
2363        XFS_BTREE_STATS_ADD(cur, moves, 1);
2364
2365        /*
2366         * If non-leaf, copy a key and a ptr to the left block.
2367         * Log the changes to the left block.
2368         */
2369        if (level > 0) {
2370                /* It's a non-leaf.  Move keys and pointers. */
2371                union xfs_btree_key     *lkp;   /* left btree key */
2372                union xfs_btree_ptr     *lpp;   /* left address pointer */
2373
2374                lkp = xfs_btree_key_addr(cur, lrecs, left);
2375                rkp = xfs_btree_key_addr(cur, 1, right);
2376
2377                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2378                rpp = xfs_btree_ptr_addr(cur, 1, right);
2379#ifdef DEBUG
2380                error = xfs_btree_check_ptr(cur, rpp, 0, level);
2381                if (error)
2382                        goto error0;
2383#endif
2384                xfs_btree_copy_keys(cur, lkp, rkp, 1);
2385                xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2386
2387                xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2388                xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2389
2390                ASSERT(cur->bc_ops->keys_inorder(cur,
2391                        xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2392        } else {
2393                /* It's a leaf.  Move records.  */
2394                union xfs_btree_rec     *lrp;   /* left record pointer */
2395
2396                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2397                rrp = xfs_btree_rec_addr(cur, 1, right);
2398
2399                xfs_btree_copy_recs(cur, lrp, rrp, 1);
2400                xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2401
2402                ASSERT(cur->bc_ops->recs_inorder(cur,
2403                        xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2404        }
2405
2406        xfs_btree_set_numrecs(left, lrecs);
2407        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2408
2409        xfs_btree_set_numrecs(right, rrecs);
2410        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2411
2412        /*
2413         * Slide the contents of right down one entry.
2414         */
2415        XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2416        if (level > 0) {
2417                /* It's a nonleaf. operate on keys and ptrs */
2418#ifdef DEBUG
2419                int                     i;              /* loop index */
2420
2421                for (i = 0; i < rrecs; i++) {
2422                        error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
2423                        if (error)
2424                                goto error0;
2425                }
2426#endif
2427                xfs_btree_shift_keys(cur,
2428                                xfs_btree_key_addr(cur, 2, right),
2429                                -1, rrecs);
2430                xfs_btree_shift_ptrs(cur,
2431                                xfs_btree_ptr_addr(cur, 2, right),
2432                                -1, rrecs);
2433
2434                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2435                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2436        } else {
2437                /* It's a leaf. operate on records */
2438                xfs_btree_shift_recs(cur,
2439                        xfs_btree_rec_addr(cur, 2, right),
2440                        -1, rrecs);
2441                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2442        }
2443
2444        /*
2445         * Using a temporary cursor, update the parent key values of the
2446         * block on the left.
2447         */
2448        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2449                error = xfs_btree_dup_cursor(cur, &tcur);
2450                if (error)
2451                        goto error0;
2452                i = xfs_btree_firstrec(tcur, level);
2453                XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2454
2455                error = xfs_btree_decrement(tcur, level, &i);
2456                if (error)
2457                        goto error1;
2458
2459                /* Update the parent high keys of the left block, if needed. */
2460                error = xfs_btree_update_keys(tcur, level);
2461                if (error)
2462                        goto error1;
2463
2464                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2465        }
2466
2467        /* Update the parent keys of the right block. */
2468        error = xfs_btree_update_keys(cur, level);
2469        if (error)
2470                goto error0;
2471
2472        /* Slide the cursor value left one. */
2473        cur->bc_ptrs[level]--;
2474
2475        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2476        *stat = 1;
2477        return 0;
2478
2479out0:
2480        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2481        *stat = 0;
2482        return 0;
2483
2484error0:
2485        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2486        return error;
2487
2488error1:
2489        XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2490        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2491        return error;
2492}
2493
2494/*
2495 * Move 1 record right from cur/level if possible.
2496 * Update cur to reflect the new path.
2497 */
2498STATIC int                                      /* error */
2499xfs_btree_rshift(
2500        struct xfs_btree_cur    *cur,
2501        int                     level,
2502        int                     *stat)          /* success/failure */
2503{
2504        struct xfs_buf          *lbp;           /* left buffer pointer */
2505        struct xfs_btree_block  *left;          /* left btree block */
2506        struct xfs_buf          *rbp;           /* right buffer pointer */
2507        struct xfs_btree_block  *right;         /* right btree block */
2508        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2509        union xfs_btree_ptr     rptr;           /* right block pointer */
2510        union xfs_btree_key     *rkp;           /* right btree key */
2511        int                     rrecs;          /* right record count */
2512        int                     lrecs;          /* left record count */
2513        int                     error;          /* error return value */
2514        int                     i;              /* loop counter */
2515
2516        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2517        XFS_BTREE_TRACE_ARGI(cur, level);
2518
2519        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2520            (level == cur->bc_nlevels - 1))
2521                goto out0;
2522
2523        /* Set up variables for this block as "left". */
2524        left = xfs_btree_get_block(cur, level, &lbp);
2525
2526#ifdef DEBUG
2527        error = xfs_btree_check_block(cur, left, level, lbp);
2528        if (error)
2529                goto error0;
2530#endif
2531
2532        /* If we've got no right sibling then we can't shift an entry right. */
2533        xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2534        if (xfs_btree_ptr_is_null(cur, &rptr))
2535                goto out0;
2536
2537        /*
2538         * If the cursor entry is the one that would be moved, don't
2539         * do it... it's too complicated.
2540         */
2541        lrecs = xfs_btree_get_numrecs(left);
2542        if (cur->bc_ptrs[level] >= lrecs)
2543                goto out0;
2544
2545        /* Set up the right neighbor as "right". */
2546        error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2547        if (error)
2548                goto error0;
2549
2550        /* If it's full, it can't take another entry. */
2551        rrecs = xfs_btree_get_numrecs(right);
2552        if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2553                goto out0;
2554
2555        XFS_BTREE_STATS_INC(cur, rshift);
2556        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2557
2558        /*
2559         * Make a hole at the start of the right neighbor block, then
2560         * copy the last left block entry to the hole.
2561         */
2562        if (level > 0) {
2563                /* It's a nonleaf. make a hole in the keys and ptrs */
2564                union xfs_btree_key     *lkp;
2565                union xfs_btree_ptr     *lpp;
2566                union xfs_btree_ptr     *rpp;
2567
2568                lkp = xfs_btree_key_addr(cur, lrecs, left);
2569                lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2570                rkp = xfs_btree_key_addr(cur, 1, right);
2571                rpp = xfs_btree_ptr_addr(cur, 1, right);
2572
2573#ifdef DEBUG
2574                for (i = rrecs - 1; i >= 0; i--) {
2575                        error = xfs_btree_check_ptr(cur, rpp, i, level);
2576                        if (error)
2577                                goto error0;
2578                }
2579#endif
2580
2581                xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2582                xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2583
2584#ifdef DEBUG
2585                error = xfs_btree_check_ptr(cur, lpp, 0, level);
2586                if (error)
2587                        goto error0;
2588#endif
2589
2590                /* Now put the new data in, and log it. */
2591                xfs_btree_copy_keys(cur, rkp, lkp, 1);
2592                xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2593
2594                xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2595                xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2596
2597                ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2598                        xfs_btree_key_addr(cur, 2, right)));
2599        } else {
2600                /* It's a leaf. make a hole in the records */
2601                union xfs_btree_rec     *lrp;
2602                union xfs_btree_rec     *rrp;
2603
2604                lrp = xfs_btree_rec_addr(cur, lrecs, left);
2605                rrp = xfs_btree_rec_addr(cur, 1, right);
2606
2607                xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2608
2609                /* Now put the new data in, and log it. */
2610                xfs_btree_copy_recs(cur, rrp, lrp, 1);
2611                xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2612        }
2613
2614        /*
2615         * Decrement and log left's numrecs, bump and log right's numrecs.
2616         */
2617        xfs_btree_set_numrecs(left, --lrecs);
2618        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2619
2620        xfs_btree_set_numrecs(right, ++rrecs);
2621        xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2622
2623        /*
2624         * Using a temporary cursor, update the parent key values of the
2625         * block on the right.
2626         */
2627        error = xfs_btree_dup_cursor(cur, &tcur);
2628        if (error)
2629                goto error0;
2630        i = xfs_btree_lastrec(tcur, level);
2631        XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
2632
2633        error = xfs_btree_increment(tcur, level, &i);
2634        if (error)
2635                goto error1;
2636
2637        /* Update the parent high keys of the left block, if needed. */
2638        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2639                error = xfs_btree_update_keys(cur, level);
2640                if (error)
2641                        goto error1;
2642        }
2643
2644        /* Update the parent keys of the right block. */
2645        error = xfs_btree_update_keys(tcur, level);
2646        if (error)
2647                goto error1;
2648
2649        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2650
2651        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2652        *stat = 1;
2653        return 0;
2654
2655out0:
2656        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2657        *stat = 0;
2658        return 0;
2659
2660error0:
2661        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2662        return error;
2663
2664error1:
2665        XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2666        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2667        return error;
2668}
2669
2670/*
2671 * Split cur/level block in half.
2672 * Return new block number and the key to its first
2673 * record (to be inserted into parent).
2674 */
2675STATIC int                                      /* error */
2676__xfs_btree_split(
2677        struct xfs_btree_cur    *cur,
2678        int                     level,
2679        union xfs_btree_ptr     *ptrp,
2680        union xfs_btree_key     *key,
2681        struct xfs_btree_cur    **curp,
2682        int                     *stat)          /* success/failure */
2683{
2684        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2685        struct xfs_buf          *lbp;           /* left buffer pointer */
2686        struct xfs_btree_block  *left;          /* left btree block */
2687        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2688        struct xfs_buf          *rbp;           /* right buffer pointer */
2689        struct xfs_btree_block  *right;         /* right btree block */
2690        union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2691        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2692        struct xfs_btree_block  *rrblock;       /* right-right btree block */
2693        int                     lrecs;
2694        int                     rrecs;
2695        int                     src_index;
2696        int                     error;          /* error return value */
2697#ifdef DEBUG
2698        int                     i;
2699#endif
2700
2701        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2702        XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2703
2704        XFS_BTREE_STATS_INC(cur, split);
2705
2706        /* Set up left block (current one). */
2707        left = xfs_btree_get_block(cur, level, &lbp);
2708
2709#ifdef DEBUG
2710        error = xfs_btree_check_block(cur, left, level, lbp);
2711        if (error)
2712                goto error0;
2713#endif
2714
2715        xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2716
2717        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2718        error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2719        if (error)
2720                goto error0;
2721        if (*stat == 0)
2722                goto out0;
2723        XFS_BTREE_STATS_INC(cur, alloc);
2724
2725        /* Set up the new block as "right". */
2726        error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2727        if (error)
2728                goto error0;
2729
2730        /* Fill in the btree header for the new right block. */
2731        xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2732
2733        /*
2734         * Split the entries between the old and the new block evenly.
2735         * Make sure that if there's an odd number of entries now, that
2736         * each new block will have the same number of entries.
2737         */
2738        lrecs = xfs_btree_get_numrecs(left);
2739        rrecs = lrecs / 2;
2740        if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2741                rrecs++;
2742        src_index = (lrecs - rrecs + 1);
2743
2744        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2745
2746        /* Adjust numrecs for the later get_*_keys() calls. */
2747        lrecs -= rrecs;
2748        xfs_btree_set_numrecs(left, lrecs);
2749        xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2750
2751        /*
2752         * Copy btree block entries from the left block over to the
2753         * new block, the right. Update the right block and log the
2754         * changes.
2755         */
2756        if (level > 0) {
2757                /* It's a non-leaf.  Move keys and pointers. */
2758                union xfs_btree_key     *lkp;   /* left btree key */
2759                union xfs_btree_ptr     *lpp;   /* left address pointer */
2760                union xfs_btree_key     *rkp;   /* right btree key */
2761                union xfs_btree_ptr     *rpp;   /* right address pointer */
2762
2763                lkp = xfs_btree_key_addr(cur, src_index, left);
2764                lpp = xfs_btree_ptr_addr(cur, src_index, left);
2765                rkp = xfs_btree_key_addr(cur, 1, right);
2766                rpp = xfs_btree_ptr_addr(cur, 1, right);
2767
2768#ifdef DEBUG
2769                for (i = src_index; i < rrecs; i++) {
2770                        error = xfs_btree_check_ptr(cur, lpp, i, level);
2771                        if (error)
2772                                goto error0;
2773                }
2774#endif
2775
2776                /* Copy the keys & pointers to the new block. */
2777                xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2778                xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2779
2780                xfs_btree_log_keys(cur, rbp, 1, rrecs);
2781                xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2782
2783                /* Stash the keys of the new block for later insertion. */
2784                xfs_btree_get_node_keys(cur, right, key);
2785        } else {
2786                /* It's a leaf.  Move records.  */
2787                union xfs_btree_rec     *lrp;   /* left record pointer */
2788                union xfs_btree_rec     *rrp;   /* right record pointer */
2789
2790                lrp = xfs_btree_rec_addr(cur, src_index, left);
2791                rrp = xfs_btree_rec_addr(cur, 1, right);
2792
2793                /* Copy records to the new block. */
2794                xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2795                xfs_btree_log_recs(cur, rbp, 1, rrecs);
2796
2797                /* Stash the keys of the new block for later insertion. */
2798                xfs_btree_get_leaf_keys(cur, right, key);
2799        }
2800
2801        /*
2802         * Find the left block number by looking in the buffer.
2803         * Adjust sibling pointers.
2804         */
2805        xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2806        xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2807        xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2808        xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2809
2810        xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2811        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2812
2813        /*
2814         * If there's a block to the new block's right, make that block
2815         * point back to right instead of to left.
2816         */
2817        if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2818                error = xfs_btree_read_buf_block(cur, &rrptr,
2819                                                        0, &rrblock, &rrbp);
2820                if (error)
2821                        goto error0;
2822                xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2823                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2824        }
2825
2826        /* Update the parent high keys of the left block, if needed. */
2827        if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
2828                error = xfs_btree_update_keys(cur, level);
2829                if (error)
2830                        goto error0;
2831        }
2832
2833        /*
2834         * If the cursor is really in the right block, move it there.
2835         * If it's just pointing past the last entry in left, then we'll
2836         * insert there, so don't change anything in that case.
2837         */
2838        if (cur->bc_ptrs[level] > lrecs + 1) {
2839                xfs_btree_setbuf(cur, level, rbp);
2840                cur->bc_ptrs[level] -= lrecs;
2841        }
2842        /*
2843         * If there are more levels, we'll need another cursor which refers
2844         * the right block, no matter where this cursor was.
2845         */
2846        if (level + 1 < cur->bc_nlevels) {
2847                error = xfs_btree_dup_cursor(cur, curp);
2848                if (error)
2849                        goto error0;
2850                (*curp)->bc_ptrs[level + 1]++;
2851        }
2852        *ptrp = rptr;
2853        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2854        *stat = 1;
2855        return 0;
2856out0:
2857        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2858        *stat = 0;
2859        return 0;
2860
2861error0:
2862        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2863        return error;
2864}
2865
2866struct xfs_btree_split_args {
2867        struct xfs_btree_cur    *cur;
2868        int                     level;
2869        union xfs_btree_ptr     *ptrp;
2870        union xfs_btree_key     *key;
2871        struct xfs_btree_cur    **curp;
2872        int                     *stat;          /* success/failure */
2873        int                     result;
2874        bool                    kswapd; /* allocation in kswapd context */
2875        struct completion       *done;
2876        struct work_struct      work;
2877};
2878
2879/*
2880 * Stack switching interfaces for allocation
2881 */
2882static void
2883xfs_btree_split_worker(
2884        struct work_struct      *work)
2885{
2886        struct xfs_btree_split_args     *args = container_of(work,
2887                                                struct xfs_btree_split_args, work);
2888        unsigned long           pflags;
2889        unsigned long           new_pflags = PF_MEMALLOC_NOFS;
2890
2891        /*
2892         * we are in a transaction context here, but may also be doing work
2893         * in kswapd context, and hence we may need to inherit that state
2894         * temporarily to ensure that we don't block waiting for memory reclaim
2895         * in any way.
2896         */
2897        if (args->kswapd)
2898                new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2899
2900        current_set_flags_nested(&pflags, new_pflags);
2901
2902        args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2903                                         args->key, args->curp, args->stat);
2904        complete(args->done);
2905
2906        current_restore_flags_nested(&pflags, new_pflags);
2907}
2908
2909/*
2910 * BMBT split requests often come in with little stack to work on. Push
2911 * them off to a worker thread so there is lots of stack to use. For the other
2912 * btree types, just call directly to avoid the context switch overhead here.
2913 */
2914STATIC int                                      /* error */
2915xfs_btree_split(
2916        struct xfs_btree_cur    *cur,
2917        int                     level,
2918        union xfs_btree_ptr     *ptrp,
2919        union xfs_btree_key     *key,
2920        struct xfs_btree_cur    **curp,
2921        int                     *stat)          /* success/failure */
2922{
2923        struct xfs_btree_split_args     args;
2924        DECLARE_COMPLETION_ONSTACK(done);
2925
2926        if (cur->bc_btnum != XFS_BTNUM_BMAP)
2927                return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2928
2929        args.cur = cur;
2930        args.level = level;
2931        args.ptrp = ptrp;
2932        args.key = key;
2933        args.curp = curp;
2934        args.stat = stat;
2935        args.done = &done;
2936        args.kswapd = current_is_kswapd();
2937        INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2938        queue_work(xfs_alloc_wq, &args.work);
2939        wait_for_completion(&done);
2940        destroy_work_on_stack(&args.work);
2941        return args.result;
2942}
2943
2944
2945/*
2946 * Copy the old inode root contents into a real block and make the
2947 * broot point to it.
2948 */
2949int                                             /* error */
2950xfs_btree_new_iroot(
2951        struct xfs_btree_cur    *cur,           /* btree cursor */
2952        int                     *logflags,      /* logging flags for inode */
2953        int                     *stat)          /* return status - 0 fail */
2954{
2955        struct xfs_buf          *cbp;           /* buffer for cblock */
2956        struct xfs_btree_block  *block;         /* btree block */
2957        struct xfs_btree_block  *cblock;        /* child btree block */
2958        union xfs_btree_key     *ckp;           /* child key pointer */
2959        union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2960        union xfs_btree_key     *kp;            /* pointer to btree key */
2961        union xfs_btree_ptr     *pp;            /* pointer to block addr */
2962        union xfs_btree_ptr     nptr;           /* new block addr */
2963        int                     level;          /* btree level */
2964        int                     error;          /* error return code */
2965#ifdef DEBUG
2966        int                     i;              /* loop counter */
2967#endif
2968
2969        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2970        XFS_BTREE_STATS_INC(cur, newroot);
2971
2972        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2973
2974        level = cur->bc_nlevels - 1;
2975
2976        block = xfs_btree_get_iroot(cur);
2977        pp = xfs_btree_ptr_addr(cur, 1, block);
2978
2979        /* Allocate the new block. If we can't do it, we're toast. Give up. */
2980        error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2981        if (error)
2982                goto error0;
2983        if (*stat == 0) {
2984                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2985                return 0;
2986        }
2987        XFS_BTREE_STATS_INC(cur, alloc);
2988
2989        /* Copy the root into a real block. */
2990        error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2991        if (error)
2992                goto error0;
2993
2994        /*
2995         * we can't just memcpy() the root in for CRC enabled btree blocks.
2996         * In that case have to also ensure the blkno remains correct
2997         */
2998        memcpy(cblock, block, xfs_btree_block_len(cur));
2999        if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
3000                if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
3001                        cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
3002                else
3003                        cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
3004        }
3005
3006        be16_add_cpu(&block->bb_level, 1);
3007        xfs_btree_set_numrecs(block, 1);
3008        cur->bc_nlevels++;
3009        cur->bc_ptrs[level + 1] = 1;
3010
3011        kp = xfs_btree_key_addr(cur, 1, block);
3012        ckp = xfs_btree_key_addr(cur, 1, cblock);
3013        xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
3014
3015        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3016#ifdef DEBUG
3017        for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
3018                error = xfs_btree_check_ptr(cur, pp, i, level);
3019                if (error)
3020                        goto error0;
3021        }
3022#endif
3023        xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
3024
3025#ifdef DEBUG
3026        error = xfs_btree_check_ptr(cur, &nptr, 0, level);
3027        if (error)
3028                goto error0;
3029#endif
3030        xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
3031
3032        xfs_iroot_realloc(cur->bc_private.b.ip,
3033                          1 - xfs_btree_get_numrecs(cblock),
3034                          cur->bc_private.b.whichfork);
3035
3036        xfs_btree_setbuf(cur, level, cbp);
3037
3038        /*
3039         * Do all this logging at the end so that
3040         * the root is at the right level.
3041         */
3042        xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
3043        xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3044        xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3045
3046        *logflags |=
3047                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
3048        *stat = 1;
3049        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3050        return 0;
3051error0:
3052        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3053        return error;
3054}
3055
3056/*
3057 * Allocate a new root block, fill it in.
3058 */
3059STATIC int                              /* error */
3060xfs_btree_new_root(
3061        struct xfs_btree_cur    *cur,   /* btree cursor */
3062        int                     *stat)  /* success/failure */
3063{
3064        struct xfs_btree_block  *block; /* one half of the old root block */
3065        struct xfs_buf          *bp;    /* buffer containing block */
3066        int                     error;  /* error return value */
3067        struct xfs_buf          *lbp;   /* left buffer pointer */
3068        struct xfs_btree_block  *left;  /* left btree block */
3069        struct xfs_buf          *nbp;   /* new (root) buffer */
3070        struct xfs_btree_block  *new;   /* new (root) btree block */
3071        int                     nptr;   /* new value for key index, 1 or 2 */
3072        struct xfs_buf          *rbp;   /* right buffer pointer */
3073        struct xfs_btree_block  *right; /* right btree block */
3074        union xfs_btree_ptr     rptr;
3075        union xfs_btree_ptr     lptr;
3076
3077        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3078        XFS_BTREE_STATS_INC(cur, newroot);
3079
3080        /* initialise our start point from the cursor */
3081        cur->bc_ops->init_ptr_from_cur(cur, &rptr);
3082
3083        /* Allocate the new block. If we can't do it, we're toast. Give up. */
3084        error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
3085        if (error)
3086                goto error0;
3087        if (*stat == 0)
3088                goto out0;
3089        XFS_BTREE_STATS_INC(cur, alloc);
3090
3091        /* Set up the new block. */
3092        error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
3093        if (error)
3094                goto error0;
3095
3096        /* Set the root in the holding structure  increasing the level by 1. */
3097        cur->bc_ops->set_root(cur, &lptr, 1);
3098
3099        /*
3100         * At the previous root level there are now two blocks: the old root,
3101         * and the new block generated when it was split.  We don't know which
3102         * one the cursor is pointing at, so we set up variables "left" and
3103         * "right" for each case.
3104         */
3105        block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
3106
3107#ifdef DEBUG
3108        error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
3109        if (error)
3110                goto error0;
3111#endif
3112
3113        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3114        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3115                /* Our block is left, pick up the right block. */
3116                lbp = bp;
3117                xfs_btree_buf_to_ptr(cur, lbp, &lptr);
3118                left = block;
3119                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3120                if (error)
3121                        goto error0;
3122                bp = rbp;
3123                nptr = 1;
3124        } else {
3125                /* Our block is right, pick up the left block. */
3126                rbp = bp;
3127                xfs_btree_buf_to_ptr(cur, rbp, &rptr);
3128                right = block;
3129                xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
3130                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3131                if (error)
3132                        goto error0;
3133                bp = lbp;
3134                nptr = 2;
3135        }
3136
3137        /* Fill in the new block's btree header and log it. */
3138        xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
3139        xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
3140        ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
3141                        !xfs_btree_ptr_is_null(cur, &rptr));
3142
3143        /* Fill in the key data in the new root. */
3144        if (xfs_btree_get_level(left) > 0) {
3145                /*
3146                 * Get the keys for the left block's keys and put them directly
3147                 * in the parent block.  Do the same for the right block.
3148                 */
3149                xfs_btree_get_node_keys(cur, left,
3150                                xfs_btree_key_addr(cur, 1, new));
3151                xfs_btree_get_node_keys(cur, right,
3152                                xfs_btree_key_addr(cur, 2, new));
3153        } else {
3154                /*
3155                 * Get the keys for the left block's records and put them
3156                 * directly in the parent block.  Do the same for the right
3157                 * block.
3158                 */
3159                xfs_btree_get_leaf_keys(cur, left,
3160                        xfs_btree_key_addr(cur, 1, new));
3161                xfs_btree_get_leaf_keys(cur, right,
3162                        xfs_btree_key_addr(cur, 2, new));
3163        }
3164        xfs_btree_log_keys(cur, nbp, 1, 2);
3165
3166        /* Fill in the pointer data in the new root. */
3167        xfs_btree_copy_ptrs(cur,
3168                xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
3169        xfs_btree_copy_ptrs(cur,
3170                xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
3171        xfs_btree_log_ptrs(cur, nbp, 1, 2);
3172
3173        /* Fix up the cursor. */
3174        xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
3175        cur->bc_ptrs[cur->bc_nlevels] = nptr;
3176        cur->bc_nlevels++;
3177        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3178        *stat = 1;
3179        return 0;
3180error0:
3181        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3182        return error;
3183out0:
3184        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3185        *stat = 0;
3186        return 0;
3187}
3188
3189STATIC int
3190xfs_btree_make_block_unfull(
3191        struct xfs_btree_cur    *cur,   /* btree cursor */
3192        int                     level,  /* btree level */
3193        int                     numrecs,/* # of recs in block */
3194        int                     *oindex,/* old tree index */
3195        int                     *index, /* new tree index */
3196        union xfs_btree_ptr     *nptr,  /* new btree ptr */
3197        struct xfs_btree_cur    **ncur, /* new btree cursor */
3198        union xfs_btree_key     *key,   /* key of new block */
3199        int                     *stat)
3200{
3201        int                     error = 0;
3202
3203        if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3204            level == cur->bc_nlevels - 1) {
3205                struct xfs_inode *ip = cur->bc_private.b.ip;
3206
3207                if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
3208                        /* A root block that can be made bigger. */
3209                        xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
3210                        *stat = 1;
3211                } else {
3212                        /* A root block that needs replacing */
3213                        int     logflags = 0;
3214
3215                        error = xfs_btree_new_iroot(cur, &logflags, stat);
3216                        if (error || *stat == 0)
3217                                return error;
3218
3219                        xfs_trans_log_inode(cur->bc_tp, ip, logflags);
3220                }
3221
3222                return 0;
3223        }
3224
3225        /* First, try shifting an entry to the right neighbor. */
3226        error = xfs_btree_rshift(cur, level, stat);
3227        if (error || *stat)
3228                return error;
3229
3230        /* Next, try shifting an entry to the left neighbor. */
3231        error = xfs_btree_lshift(cur, level, stat);
3232        if (error)
3233                return error;
3234
3235        if (*stat) {
3236                *oindex = *index = cur->bc_ptrs[level];
3237                return 0;
3238        }
3239
3240        /*
3241         * Next, try splitting the current block in half.
3242         *
3243         * If this works we have to re-set our variables because we
3244         * could be in a different block now.
3245         */
3246        error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
3247        if (error || *stat == 0)
3248                return error;
3249
3250
3251        *index = cur->bc_ptrs[level];
3252        return 0;
3253}
3254
3255/*
3256 * Insert one record/level.  Return information to the caller
3257 * allowing the next level up to proceed if necessary.
3258 */
3259STATIC int
3260xfs_btree_insrec(
3261        struct xfs_btree_cur    *cur,   /* btree cursor */
3262        int                     level,  /* level to insert record at */
3263        union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
3264        union xfs_btree_rec     *rec,   /* record to insert */
3265        union xfs_btree_key     *key,   /* i/o: block key for ptrp */
3266        struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
3267        int                     *stat)  /* success/failure */
3268{
3269        struct xfs_btree_block  *block; /* btree block */
3270        struct xfs_buf          *bp;    /* buffer for block */
3271        union xfs_btree_ptr     nptr;   /* new block ptr */
3272        struct xfs_btree_cur    *ncur;  /* new btree cursor */
3273        union xfs_btree_key     nkey;   /* new block key */
3274        union xfs_btree_key     *lkey;
3275        int                     optr;   /* old key/record index */
3276        int                     ptr;    /* key/record index */
3277        int                     numrecs;/* number of records */
3278        int                     error;  /* error return value */
3279#ifdef DEBUG
3280        int                     i;
3281#endif
3282        xfs_daddr_t             old_bn;
3283
3284        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3285        XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec);
3286
3287        ncur = NULL;
3288        lkey = &nkey;
3289
3290        /*
3291         * If we have an external root pointer, and we've made it to the
3292         * root level, allocate a new root block and we're done.
3293         */
3294        if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
3295            (level >= cur->bc_nlevels)) {
3296                error = xfs_btree_new_root(cur, stat);
3297                xfs_btree_set_ptr_null(cur, ptrp);
3298
3299                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3300                return error;
3301        }
3302
3303        /* If we're off the left edge, return failure. */
3304        ptr = cur->bc_ptrs[level];
3305        if (ptr == 0) {
3306                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3307                *stat = 0;
3308                return 0;
3309        }
3310
3311        optr = ptr;
3312
3313        XFS_BTREE_STATS_INC(cur, insrec);
3314
3315        /* Get pointers to the btree buffer and block. */
3316        block = xfs_btree_get_block(cur, level, &bp);
3317        old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL;
3318        numrecs = xfs_btree_get_numrecs(block);
3319
3320#ifdef DEBUG
3321        error = xfs_btree_check_block(cur, block, level, bp);
3322        if (error)
3323                goto error0;
3324
3325        /* Check that the new entry is being inserted in the right place. */
3326        if (ptr <= numrecs) {
3327                if (level == 0) {
3328                        ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3329                                xfs_btree_rec_addr(cur, ptr, block)));
3330                } else {
3331                        ASSERT(cur->bc_ops->keys_inorder(cur, key,
3332                                xfs_btree_key_addr(cur, ptr, block)));
3333                }
3334        }
3335#endif
3336
3337        /*
3338         * If the block is full, we can't insert the new entry until we
3339         * make the block un-full.
3340         */
3341        xfs_btree_set_ptr_null(cur, &nptr);
3342        if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3343                error = xfs_btree_make_block_unfull(cur, level, numrecs,
3344                                        &optr, &ptr, &nptr, &ncur, lkey, stat);
3345                if (error || *stat == 0)
3346                        goto error0;
3347        }
3348
3349        /*
3350         * The current block may have changed if the block was
3351         * previously full and we have just made space in it.
3352         */
3353        block = xfs_btree_get_block(cur, level, &bp);
3354        numrecs = xfs_btree_get_numrecs(block);
3355
3356#ifdef DEBUG
3357        error = xfs_btree_check_block(cur, block, level, bp);
3358        if (error)
3359                return error;
3360#endif
3361
3362        /*
3363         * At this point we know there's room for our new entry in the block
3364         * we're pointing at.
3365         */
3366        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3367
3368        if (level > 0) {
3369                /* It's a nonleaf. make a hole in the keys and ptrs */
3370                union xfs_btree_key     *kp;
3371                union xfs_btree_ptr     *pp;
3372
3373                kp = xfs_btree_key_addr(cur, ptr, block);
3374                pp = xfs_btree_ptr_addr(cur, ptr, block);
3375
3376#ifdef DEBUG
3377                for (i = numrecs - ptr; i >= 0; i--) {
3378                        error = xfs_btree_check_ptr(cur, pp, i, level);
3379                        if (error)
3380                                return error;
3381                }
3382#endif
3383
3384                xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3385                xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3386
3387#ifdef DEBUG
3388                error = xfs_btree_check_ptr(cur, ptrp, 0, level);
3389                if (error)
3390                        goto error0;
3391#endif
3392
3393                /* Now put the new data in, bump numrecs and log it. */
3394                xfs_btree_copy_keys(cur, kp, key, 1);
3395                xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3396                numrecs++;
3397                xfs_btree_set_numrecs(block, numrecs);
3398                xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3399                xfs_btree_log_keys(cur, bp, ptr, numrecs);
3400#ifdef DEBUG
3401                if (ptr < numrecs) {
3402                        ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3403                                xfs_btree_key_addr(cur, ptr + 1, block)));
3404                }
3405#endif
3406        } else {
3407                /* It's a leaf. make a hole in the records */
3408                union xfs_btree_rec             *rp;
3409
3410                rp = xfs_btree_rec_addr(cur, ptr, block);
3411
3412                xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3413
3414                /* Now put the new data in, bump numrecs and log it. */
3415                xfs_btree_copy_recs(cur, rp, rec, 1);
3416                xfs_btree_set_numrecs(block, ++numrecs);
3417                xfs_btree_log_recs(cur, bp, ptr, numrecs);
3418#ifdef DEBUG
3419                if (ptr < numrecs) {
3420                        ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3421                                xfs_btree_rec_addr(cur, ptr + 1, block)));
3422                }
3423#endif
3424        }
3425
3426        /* Log the new number of records in the btree header. */
3427        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3428
3429        /*
3430         * If we just inserted into a new tree block, we have to
3431         * recalculate nkey here because nkey is out of date.
3432         *
3433         * Otherwise we're just updating an existing block (having shoved
3434         * some records into the new tree block), so use the regular key
3435         * update mechanism.
3436         */
3437        if (bp && bp->b_bn != old_bn) {
3438                xfs_btree_get_keys(cur, block, lkey);
3439        } else if (xfs_btree_needs_key_update(cur, optr)) {
3440                error = xfs_btree_update_keys(cur, level);
3441                if (error)
3442                        goto error0;
3443        }
3444
3445        /*
3446         * If we are tracking the last record in the tree and
3447         * we are at the far right edge of the tree, update it.
3448         */
3449        if (xfs_btree_is_lastrec(cur, block, level)) {
3450                cur->bc_ops->update_lastrec(cur, block, rec,
3451                                            ptr, LASTREC_INSREC);
3452        }
3453
3454        /*
3455         * Return the new block number, if any.
3456         * If there is one, give back a record value and a cursor too.
3457         */
3458        *ptrp = nptr;
3459        if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3460                xfs_btree_copy_keys(cur, key, lkey, 1);
3461                *curp = ncur;
3462        }
3463
3464        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3465        *stat = 1;
3466        return 0;
3467
3468error0:
3469        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3470        return error;
3471}
3472
3473/*
3474 * Insert the record at the point referenced by cur.
3475 *
3476 * A multi-level split of the tree on insert will invalidate the original
3477 * cursor.  All callers of this function should assume that the cursor is
3478 * no longer valid and revalidate it.
3479 */
3480int
3481xfs_btree_insert(
3482        struct xfs_btree_cur    *cur,
3483        int                     *stat)
3484{
3485        int                     error;  /* error return value */
3486        int                     i;      /* result value, 0 for failure */
3487        int                     level;  /* current level number in btree */
3488        union xfs_btree_ptr     nptr;   /* new block number (split result) */
3489        struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3490        struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3491        union xfs_btree_key     bkey;   /* key of block to insert */
3492        union xfs_btree_key     *key;
3493        union xfs_btree_rec     rec;    /* record to insert */
3494
3495        level = 0;
3496        ncur = NULL;
3497        pcur = cur;
3498        key = &bkey;
3499
3500        xfs_btree_set_ptr_null(cur, &nptr);
3501
3502        /* Make a key out of the record data to be inserted, and save it. */
3503        cur->bc_ops->init_rec_from_cur(cur, &rec);
3504        cur->bc_ops->init_key_from_rec(key, &rec);
3505
3506        /*
3507         * Loop going up the tree, starting at the leaf level.
3508         * Stop when we don't get a split block, that must mean that
3509         * the insert is finished with this level.
3510         */
3511        do {
3512                /*
3513                 * Insert nrec/nptr into this level of the tree.
3514                 * Note if we fail, nptr will be null.
3515                 */
3516                error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3517                                &ncur, &i);
3518                if (error) {
3519                        if (pcur != cur)
3520                                xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3521                        goto error0;
3522                }
3523
3524                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3525                level++;
3526
3527                /*
3528                 * See if the cursor we just used is trash.
3529                 * Can't trash the caller's cursor, but otherwise we should
3530                 * if ncur is a new cursor or we're about to be done.
3531                 */
3532                if (pcur != cur &&
3533                    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3534                        /* Save the state from the cursor before we trash it */
3535                        if (cur->bc_ops->update_cursor)
3536                                cur->bc_ops->update_cursor(pcur, cur);
3537                        cur->bc_nlevels = pcur->bc_nlevels;
3538                        xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3539                }
3540                /* If we got a new cursor, switch to it. */
3541                if (ncur) {
3542                        pcur = ncur;
3543                        ncur = NULL;
3544                }
3545        } while (!xfs_btree_ptr_is_null(cur, &nptr));
3546
3547        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3548        *stat = i;
3549        return 0;
3550error0:
3551        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3552        return error;
3553}
3554
3555/*
3556 * Try to merge a non-leaf block back into the inode root.
3557 *
3558 * Note: the killroot names comes from the fact that we're effectively
3559 * killing the old root block.  But because we can't just delete the
3560 * inode we have to copy the single block it was pointing to into the
3561 * inode.
3562 */
3563STATIC int
3564xfs_btree_kill_iroot(
3565        struct xfs_btree_cur    *cur)
3566{
3567        int                     whichfork = cur->bc_private.b.whichfork;
3568        struct xfs_inode        *ip = cur->bc_private.b.ip;
3569        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3570        struct xfs_btree_block  *block;
3571        struct xfs_btree_block  *cblock;
3572        union xfs_btree_key     *kp;
3573        union xfs_btree_key     *ckp;
3574        union xfs_btree_ptr     *pp;
3575        union xfs_btree_ptr     *cpp;
3576        struct xfs_buf          *cbp;
3577        int                     level;
3578        int                     index;
3579        int                     numrecs;
3580        int                     error;
3581#ifdef DEBUG
3582        union xfs_btree_ptr     ptr;
3583        int                     i;
3584#endif
3585
3586        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3587
3588        ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3589        ASSERT(cur->bc_nlevels > 1);
3590
3591        /*
3592         * Don't deal with the root block needs to be a leaf case.
3593         * We're just going to turn the thing back into extents anyway.
3594         */
3595        level = cur->bc_nlevels - 1;
3596        if (level == 1)
3597                goto out0;
3598
3599        /*
3600         * Give up if the root has multiple children.
3601         */
3602        block = xfs_btree_get_iroot(cur);
3603        if (xfs_btree_get_numrecs(block) != 1)
3604                goto out0;
3605
3606        cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3607        numrecs = xfs_btree_get_numrecs(cblock);
3608
3609        /*
3610         * Only do this if the next level will fit.
3611         * Then the data must be copied up to the inode,
3612         * instead of freeing the root you free the next level.
3613         */
3614        if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3615                goto out0;
3616
3617        XFS_BTREE_STATS_INC(cur, killroot);
3618
3619#ifdef DEBUG
3620        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3621        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3622        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3623        ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3624#endif
3625
3626        index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3627        if (index) {
3628                xfs_iroot_realloc(cur->bc_private.b.ip, index,
3629                                  cur->bc_private.b.whichfork);
3630                block = ifp->if_broot;
3631        }
3632
3633        be16_add_cpu(&block->bb_numrecs, index);
3634        ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3635
3636        kp = xfs_btree_key_addr(cur, 1, block);
3637        ckp = xfs_btree_key_addr(cur, 1, cblock);
3638        xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3639
3640        pp = xfs_btree_ptr_addr(cur, 1, block);
3641        cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3642#ifdef DEBUG
3643        for (i = 0; i < numrecs; i++) {
3644                error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3645                if (error) {
3646                        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3647                        return error;
3648                }
3649        }
3650#endif
3651        xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3652
3653        error = xfs_btree_free_block(cur, cbp);
3654        if (error) {
3655                XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3656                return error;
3657        }
3658
3659        cur->bc_bufs[level - 1] = NULL;
3660        be16_add_cpu(&block->bb_level, -1);
3661        xfs_trans_log_inode(cur->bc_tp, ip,
3662                XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3663        cur->bc_nlevels--;
3664out0:
3665        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3666        return 0;
3667}
3668
3669/*
3670 * Kill the current root node, and replace it with it's only child node.
3671 */
3672STATIC int
3673xfs_btree_kill_root(
3674        struct xfs_btree_cur    *cur,
3675        struct xfs_buf          *bp,
3676        int                     level,
3677        union xfs_btree_ptr     *newroot)
3678{
3679        int                     error;
3680
3681        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3682        XFS_BTREE_STATS_INC(cur, killroot);
3683
3684        /*
3685         * Update the root pointer, decreasing the level by 1 and then
3686         * free the old root.
3687         */
3688        cur->bc_ops->set_root(cur, newroot, -1);
3689
3690        error = xfs_btree_free_block(cur, bp);
3691        if (error) {
3692                XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3693                return error;
3694        }
3695
3696        cur->bc_bufs[level] = NULL;
3697        cur->bc_ra[level] = 0;
3698        cur->bc_nlevels--;
3699
3700        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3701        return 0;
3702}
3703
3704STATIC int
3705xfs_btree_dec_cursor(
3706        struct xfs_btree_cur    *cur,
3707        int                     level,
3708        int                     *stat)
3709{
3710        int                     error;
3711        int                     i;
3712
3713        if (level > 0) {
3714                error = xfs_btree_decrement(cur, level, &i);
3715                if (error)
3716                        return error;
3717        }
3718
3719        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3720        *stat = 1;
3721        return 0;
3722}
3723
3724/*
3725 * Single level of the btree record deletion routine.
3726 * Delete record pointed to by cur/level.
3727 * Remove the record from its block then rebalance the tree.
3728 * Return 0 for error, 1 for done, 2 to go on to the next level.
3729 */
3730STATIC int                                      /* error */
3731xfs_btree_delrec(
3732        struct xfs_btree_cur    *cur,           /* btree cursor */
3733        int                     level,          /* level removing record from */
3734        int                     *stat)          /* fail/done/go-on */
3735{
3736        struct xfs_btree_block  *block;         /* btree block */
3737        union xfs_btree_ptr     cptr;           /* current block ptr */
3738        struct xfs_buf          *bp;            /* buffer for block */
3739        int                     error;          /* error return value */
3740        int                     i;              /* loop counter */
3741        union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3742        struct xfs_buf          *lbp;           /* left buffer pointer */
3743        struct xfs_btree_block  *left;          /* left btree block */
3744        int                     lrecs = 0;      /* left record count */
3745        int                     ptr;            /* key/record index */
3746        union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3747        struct xfs_buf          *rbp;           /* right buffer pointer */
3748        struct xfs_btree_block  *right;         /* right btree block */
3749        struct xfs_btree_block  *rrblock;       /* right-right btree block */
3750        struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3751        int                     rrecs = 0;      /* right record count */
3752        struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3753        int                     numrecs;        /* temporary numrec count */
3754
3755        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3756        XFS_BTREE_TRACE_ARGI(cur, level);
3757
3758        tcur = NULL;
3759
3760        /* Get the index of the entry being deleted, check for nothing there. */
3761        ptr = cur->bc_ptrs[level];
3762        if (ptr == 0) {
3763                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3764                *stat = 0;
3765                return 0;
3766        }
3767
3768        /* Get the buffer & block containing the record or key/ptr. */
3769        block = xfs_btree_get_block(cur, level, &bp);
3770        numrecs = xfs_btree_get_numrecs(block);
3771
3772#ifdef DEBUG
3773        error = xfs_btree_check_block(cur, block, level, bp);
3774        if (error)
3775                goto error0;
3776#endif
3777
3778        /* Fail if we're off the end of the block. */
3779        if (ptr > numrecs) {
3780                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3781                *stat = 0;
3782                return 0;
3783        }
3784
3785        XFS_BTREE_STATS_INC(cur, delrec);
3786        XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3787
3788        /* Excise the entries being deleted. */
3789        if (level > 0) {
3790                /* It's a nonleaf. operate on keys and ptrs */
3791                union xfs_btree_key     *lkp;
3792                union xfs_btree_ptr     *lpp;
3793
3794                lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3795                lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3796
3797#ifdef DEBUG
3798                for (i = 0; i < numrecs - ptr; i++) {
3799                        error = xfs_btree_check_ptr(cur, lpp, i, level);
3800                        if (error)
3801                                goto error0;
3802                }
3803#endif
3804
3805                if (ptr < numrecs) {
3806                        xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3807                        xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3808                        xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3809                        xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3810                }
3811        } else {
3812                /* It's a leaf. operate on records */
3813                if (ptr < numrecs) {
3814                        xfs_btree_shift_recs(cur,
3815                                xfs_btree_rec_addr(cur, ptr + 1, block),
3816                                -1, numrecs - ptr);
3817                        xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3818                }
3819        }
3820
3821        /*
3822         * Decrement and log the number of entries in the block.
3823         */
3824        xfs_btree_set_numrecs(block, --numrecs);
3825        xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3826
3827        /*
3828         * If we are tracking the last record in the tree and
3829         * we are at the far right edge of the tree, update it.
3830         */
3831        if (xfs_btree_is_lastrec(cur, block, level)) {
3832                cur->bc_ops->update_lastrec(cur, block, NULL,
3833                                            ptr, LASTREC_DELREC);
3834        }
3835
3836        /*
3837         * We're at the root level.  First, shrink the root block in-memory.
3838         * Try to get rid of the next level down.  If we can't then there's
3839         * nothing left to do.
3840         */
3841        if (level == cur->bc_nlevels - 1) {
3842                if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3843                        xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3844                                          cur->bc_private.b.whichfork);
3845
3846                        error = xfs_btree_kill_iroot(cur);
3847                        if (error)
3848                                goto error0;
3849
3850                        error = xfs_btree_dec_cursor(cur, level, stat);
3851                        if (error)
3852                                goto error0;
3853                        *stat = 1;
3854                        return 0;
3855                }
3856
3857                /*
3858                 * If this is the root level, and there's only one entry left,
3859                 * and it's NOT the leaf level, then we can get rid of this
3860                 * level.
3861                 */
3862                if (numrecs == 1 && level > 0) {
3863                        union xfs_btree_ptr     *pp;
3864                        /*
3865                         * pp is still set to the first pointer in the block.
3866                         * Make it the new root of the btree.
3867                         */
3868                        pp = xfs_btree_ptr_addr(cur, 1, block);
3869                        error = xfs_btree_kill_root(cur, bp, level, pp);
3870                        if (error)
3871                                goto error0;
3872                } else if (level > 0) {
3873                        error = xfs_btree_dec_cursor(cur, level, stat);
3874                        if (error)
3875                                goto error0;
3876                }
3877                *stat = 1;
3878                return 0;
3879        }
3880
3881        /*
3882         * If we deleted the leftmost entry in the block, update the
3883         * key values above us in the tree.
3884         */
3885        if (xfs_btree_needs_key_update(cur, ptr)) {
3886                error = xfs_btree_update_keys(cur, level);
3887                if (error)
3888                        goto error0;
3889        }
3890
3891        /*
3892         * If the number of records remaining in the block is at least
3893         * the minimum, we're done.
3894         */
3895        if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3896                error = xfs_btree_dec_cursor(cur, level, stat);
3897                if (error)
3898                        goto error0;
3899                return 0;
3900        }
3901
3902        /*
3903         * Otherwise, we have to move some records around to keep the
3904         * tree balanced.  Look at the left and right sibling blocks to
3905         * see if we can re-balance by moving only one record.
3906         */
3907        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3908        xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3909
3910        if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3911                /*
3912                 * One child of root, need to get a chance to copy its contents
3913                 * into the root and delete it. Can't go up to next level,
3914                 * there's nothing to delete there.
3915                 */
3916                if (xfs_btree_ptr_is_null(cur, &rptr) &&
3917                    xfs_btree_ptr_is_null(cur, &lptr) &&
3918                    level == cur->bc_nlevels - 2) {
3919                        error = xfs_btree_kill_iroot(cur);
3920                        if (!error)
3921                                error = xfs_btree_dec_cursor(cur, level, stat);
3922                        if (error)
3923                                goto error0;
3924                        return 0;
3925                }
3926        }
3927
3928        ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3929               !xfs_btree_ptr_is_null(cur, &lptr));
3930
3931        /*
3932         * Duplicate the cursor so our btree manipulations here won't
3933         * disrupt the next level up.
3934         */
3935        error = xfs_btree_dup_cursor(cur, &tcur);
3936        if (error)
3937                goto error0;
3938
3939        /*
3940         * If there's a right sibling, see if it's ok to shift an entry
3941         * out of it.
3942         */
3943        if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3944                /*
3945                 * Move the temp cursor to the last entry in the next block.
3946                 * Actually any entry but the first would suffice.
3947                 */
3948                i = xfs_btree_lastrec(tcur, level);
3949                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3950
3951                error = xfs_btree_increment(tcur, level, &i);
3952                if (error)
3953                        goto error0;
3954                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3955
3956                i = xfs_btree_lastrec(tcur, level);
3957                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3958
3959                /* Grab a pointer to the block. */
3960                right = xfs_btree_get_block(tcur, level, &rbp);
3961#ifdef DEBUG
3962                error = xfs_btree_check_block(tcur, right, level, rbp);
3963                if (error)
3964                        goto error0;
3965#endif
3966                /* Grab the current block number, for future use. */
3967                xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3968
3969                /*
3970                 * If right block is full enough so that removing one entry
3971                 * won't make it too empty, and left-shifting an entry out
3972                 * of right to us works, we're done.
3973                 */
3974                if (xfs_btree_get_numrecs(right) - 1 >=
3975                    cur->bc_ops->get_minrecs(tcur, level)) {
3976                        error = xfs_btree_lshift(tcur, level, &i);
3977                        if (error)
3978                                goto error0;
3979                        if (i) {
3980                                ASSERT(xfs_btree_get_numrecs(block) >=
3981                                       cur->bc_ops->get_minrecs(tcur, level));
3982
3983                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3984                                tcur = NULL;
3985
3986                                error = xfs_btree_dec_cursor(cur, level, stat);
3987                                if (error)
3988                                        goto error0;
3989                                return 0;
3990                        }
3991                }
3992
3993                /*
3994                 * Otherwise, grab the number of records in right for
3995                 * future reference, and fix up the temp cursor to point
3996                 * to our block again (last record).
3997                 */
3998                rrecs = xfs_btree_get_numrecs(right);
3999                if (!xfs_btree_ptr_is_null(cur, &lptr)) {
4000                        i = xfs_btree_firstrec(tcur, level);
4001                        XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
4002
4003                        error = xfs_btree_decrement(tcur, level, &i);
4004                        if (error)
4005                                goto error0;
4006                        XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
4007                }
4008        }
4009
4010        /*
4011         * If there's a left sibling, see if it's ok to shift an entry
4012         * out of it.
4013         */
4014        if (!xfs_btree_ptr_is_null(cur, &lptr)) {
4015                /*
4016                 * Move the temp cursor to the first entry in the
4017                 * previous block.
4018                 */
4019                i = xfs_btree_firstrec(tcur, level);
4020                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
4021
4022                error = xfs_btree_decrement(tcur, level, &i);
4023                if (error)
4024                        goto error0;
4025                i = xfs_btree_firstrec(tcur, level);
4026                XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
4027
4028                /* Grab a pointer to the block. */
4029                left = xfs_btree_get_block(tcur, level, &lbp);
4030#ifdef DEBUG
4031                error = xfs_btree_check_block(cur, left, level, lbp);
4032                if (error)
4033                        goto error0;
4034#endif
4035                /* Grab the current block number, for future use. */
4036                xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
4037
4038                /*
4039                 * If left block is full enough so that removing one entry
4040                 * won't make it too empty, and right-shifting an entry out
4041                 * of left to us works, we're done.
4042                 */
4043                if (xfs_btree_get_numrecs(left) - 1 >=
4044                    cur->bc_ops->get_minrecs(tcur, level)) {
4045                        error = xfs_btree_rshift(tcur, level, &i);
4046                        if (error)
4047                                goto error0;
4048                        if (i) {
4049                                ASSERT(xfs_btree_get_numrecs(block) >=
4050                                       cur->bc_ops->get_minrecs(tcur, level));
4051                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
4052                                tcur = NULL;
4053                                if (level == 0)
4054                                        cur->bc_ptrs[0]++;
4055                                XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
4056                                *stat = 1;
4057                                return 0;
4058                        }
4059                }
4060
4061                /*
4062                 * Otherwise, grab the number of records in right for
4063                 * future reference.
4064                 */
4065                lrecs = xfs_btree_get_numrecs(left);
4066        }
4067
4068        /* Delete the temp cursor, we're done with it. */
4069        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
4070        tcur = NULL;
4071
4072        /* If here, we need to do a join to keep the tree balanced. */
4073        ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
4074
4075        if (!xfs_btree_ptr_is_null(cur, &lptr) &&
4076            lrecs + xfs_btree_get_numrecs(block) <=
4077                        cur->bc_ops->get_maxrecs(cur, level)) {
4078                /*
4079                 * Set "right" to be the starting block,
4080                 * "left" to be the left neighbor.
4081                 */
4082                rptr = cptr;
4083                right = block;
4084                rbp = bp;
4085                error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
4086                if (error)
4087                        goto error0;
4088
4089        /*
4090         * If that won't work, see if we can join with the right neighbor block.
4091         */
4092        } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4093                   rrecs + xfs_btree_get_numrecs(block) <=
4094                        cur->bc_ops->get_maxrecs(cur, level)) {
4095                /*
4096                 * Set "left" to be the starting block,
4097                 * "right" to be the right neighbor.
4098                 */
4099                lptr = cptr;
4100                left = block;
4101                lbp = bp;
4102                error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4103                if (error)
4104                        goto error0;
4105
4106        /*
4107         * Otherwise, we can't fix the imbalance.
4108         * Just return.  This is probably a logic error, but it's not fatal.
4109         */
4110        } else {
4111                error = xfs_btree_dec_cursor(cur, level, stat);
4112                if (error)
4113                        goto error0;
4114                return 0;
4115        }
4116
4117        rrecs = xfs_btree_get_numrecs(right);
4118        lrecs = xfs_btree_get_numrecs(left);
4119
4120        /*
4121         * We're now going to join "left" and "right" by moving all the stuff
4122         * in "right" to "left" and deleting "right".
4123         */
4124        XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4125        if (level > 0) {
4126                /* It's a non-leaf.  Move keys and pointers. */
4127                union xfs_btree_key     *lkp;   /* left btree key */
4128                union xfs_btree_ptr     *lpp;   /* left address pointer */
4129                union xfs_btree_key     *rkp;   /* right btree key */
4130                union xfs_btree_ptr     *rpp;   /* right address pointer */
4131
4132                lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4133                lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4134                rkp = xfs_btree_key_addr(cur, 1, right);
4135                rpp = xfs_btree_ptr_addr(cur, 1, right);
4136#ifdef DEBUG
4137                for (i = 1; i < rrecs; i++) {
4138                        error = xfs_btree_check_ptr(cur, rpp, i, level);
4139                        if (error)
4140                                goto error0;
4141                }
4142#endif
4143                xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4144                xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4145
4146                xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4147                xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4148        } else {
4149                /* It's a leaf.  Move records.  */
4150                union xfs_btree_rec     *lrp;   /* left record pointer */
4151                union xfs_btree_rec     *rrp;   /* right record pointer */
4152
4153                lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4154                rrp = xfs_btree_rec_addr(cur, 1, right);
4155
4156                xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4157                xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4158        }
4159
4160        XFS_BTREE_STATS_INC(cur, join);
4161
4162        /*
4163         * Fix up the number of records and right block pointer in the
4164         * surviving block, and log it.
4165         */
4166        xfs_btree_set_numrecs(left, lrecs + rrecs);
4167        xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
4168        xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4169        xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4170
4171        /* If there is a right sibling, point it to the remaining block. */
4172        xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4173        if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4174                error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4175                if (error)
4176                        goto error0;
4177                xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4178                xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4179        }
4180
4181        /* Free the deleted block. */
4182        error = xfs_btree_free_block(cur, rbp);
4183        if (error)
4184                goto error0;
4185
4186        /*
4187         * If we joined with the left neighbor, set the buffer in the
4188         * cursor to the left block, and fix up the index.
4189         */
4190        if (bp != lbp) {
4191                cur->bc_bufs[level] = lbp;
4192                cur->bc_ptrs[level] += lrecs;
4193                cur->bc_ra[level] = 0;
4194        }
4195        /*
4196         * If we joined with the right neighbor and there's a level above
4197         * us, increment the cursor at that level.
4198         */
4199        else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
4200                   (level + 1 < cur->bc_nlevels)) {
4201                error = xfs_btree_increment(cur, level + 1, &i);
4202                if (error)
4203                        goto error0;
4204        }
4205
4206        /*
4207         * Readjust the ptr at this level if it's not a leaf, since it's
4208         * still pointing at the deletion point, which makes the cursor
4209         * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4210         * We can't use decrement because it would change the next level up.
4211         */
4212        if (level > 0)
4213                cur->bc_ptrs[level]--;
4214
4215        /*
4216         * We combined blocks, so we have to update the parent keys if the
4217         * btree supports overlapped intervals.  However, bc_ptrs[level + 1]
4218         * points to the old block so that the caller knows which record to
4219         * delete.  Therefore, the caller must be savvy enough to call updkeys
4220         * for us if we return stat == 2.  The other exit points from this
4221         * function don't require deletions further up the tree, so they can
4222         * call updkeys directly.
4223         */
4224
4225        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
4226        /* Return value means the next level up has something to do. */
4227        *stat = 2;
4228        return 0;
4229
4230error0:
4231        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
4232        if (tcur)
4233                xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4234        return error;
4235}
4236
4237/*
4238 * Delete the record pointed to by cur.
4239 * The cursor refers to the place where the record was (could be inserted)
4240 * when the operation returns.
4241 */
4242int                                     /* error */
4243xfs_btree_delete(
4244        struct xfs_btree_cur    *cur,
4245        int                     *stat)  /* success/failure */
4246{
4247        int                     error;  /* error return value */
4248        int                     level;
4249        int                     i;
4250        bool                    joined = false;
4251
4252        XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
4253
4254        /*
4255         * Go up the tree, starting at leaf level.
4256         *
4257         * If 2 is returned then a join was done; go to the next level.
4258         * Otherwise we are done.
4259         */
4260        for (level = 0, i = 2; i == 2; level++) {
4261                error = xfs_btree_delrec(cur, level, &i);
4262                if (error)
4263                        goto error0;
4264                if (i == 2)
4265                        joined = true;
4266        }
4267
4268        /*
4269         * If we combined blocks as part of deleting the record, delrec won't
4270         * have updated the parent high keys so we have to do that here.
4271         */
4272        if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
4273                error = xfs_btree_updkeys_force(cur, 0);
4274                if (error)
4275                        goto error0;
4276        }
4277
4278        if (i == 0) {
4279                for (level = 1; level < cur->bc_nlevels; level++) {
4280                        if (cur->bc_ptrs[level] == 0) {
4281                                error = xfs_btree_decrement(cur, level, &i);
4282                                if (error)
4283                                        goto error0;
4284                                break;
4285                        }
4286                }
4287        }
4288
4289        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
4290        *stat = i;
4291        return 0;
4292error0:
4293        XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
4294        return error;
4295}
4296
4297/*
4298 * Get the data from the pointed-to record.
4299 */
4300int                                     /* error */
4301xfs_btree_get_rec(
4302        struct xfs_btree_cur    *cur,   /* btree cursor */
4303        union xfs_btree_rec     **recp, /* output: btree record */
4304        int                     *stat)  /* output: success/failure */
4305{
4306        struct xfs_btree_block  *block; /* btree block */
4307        struct xfs_buf          *bp;    /* buffer pointer */
4308        int                     ptr;    /* record number */
4309#ifdef DEBUG
4310        int                     error;  /* error return value */
4311#endif
4312
4313        ptr = cur->bc_ptrs[0];
4314        block = xfs_btree_get_block(cur, 0, &bp);
4315
4316#ifdef DEBUG
4317        error = xfs_btree_check_block(cur, block, 0, bp);
4318        if (error)
4319                return error;
4320#endif
4321
4322        /*
4323         * Off the right end or left end, return failure.
4324         */
4325        if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4326                *stat = 0;
4327                return 0;
4328        }
4329
4330        /*
4331         * Point to the record and extract its data.
4332         */
4333        *recp = xfs_btree_rec_addr(cur, ptr, block);
4334        *stat = 1;
4335        return 0;
4336}
4337
4338/* Visit a block in a btree. */
4339STATIC int
4340xfs_btree_visit_block(
4341        struct xfs_btree_cur            *cur,
4342        int                             level,
4343        xfs_btree_visit_blocks_fn       fn,
4344        void                            *data)
4345{
4346        struct xfs_btree_block          *block;
4347        struct xfs_buf                  *bp;
4348        union xfs_btree_ptr             rptr;
4349        int                             error;
4350
4351        /* do right sibling readahead */
4352        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4353        block = xfs_btree_get_block(cur, level, &bp);
4354
4355        /* process the block */
4356        error = fn(cur, level, data);
4357        if (error)
4358                return error;
4359
4360        /* now read rh sibling block for next iteration */
4361        xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4362        if (xfs_btree_ptr_is_null(cur, &rptr))
4363                return -ENOENT;
4364
4365        return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4366}
4367
4368
4369/* Visit every block in a btree. */
4370int
4371xfs_btree_visit_blocks(
4372        struct xfs_btree_cur            *cur,
4373        xfs_btree_visit_blocks_fn       fn,
4374        void                            *data)
4375{
4376        union xfs_btree_ptr             lptr;
4377        int                             level;
4378        struct xfs_btree_block          *block = NULL;
4379        int                             error = 0;
4380
4381        cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4382
4383        /* for each level */
4384        for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4385                /* grab the left hand block */
4386                error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4387                if (error)
4388                        return error;
4389
4390                /* readahead the left most block for the next level down */
4391                if (level > 0) {
4392                        union xfs_btree_ptr     *ptr;
4393
4394                        ptr = xfs_btree_ptr_addr(cur, 1, block);
4395                        xfs_btree_readahead_ptr(cur, ptr, 1);
4396
4397                        /* save for the next iteration of the loop */
4398                        xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4399                }
4400
4401                /* for each buffer in the level */
4402                do {
4403                        error = xfs_btree_visit_block(cur, level, fn, data);
4404                } while (!error);
4405
4406                if (error != -ENOENT)
4407                        return error;
4408        }
4409
4410        return 0;
4411}
4412
4413/*
4414 * Change the owner of a btree.
4415 *
4416 * The mechanism we use here is ordered buffer logging. Because we don't know
4417 * how many buffers were are going to need to modify, we don't really want to
4418 * have to make transaction reservations for the worst case of every buffer in a
4419 * full size btree as that may be more space that we can fit in the log....
4420 *
4421 * We do the btree walk in the most optimal manner possible - we have sibling
4422 * pointers so we can just walk all the blocks on each level from left to right
4423 * in a single pass, and then move to the next level and do the same. We can
4424 * also do readahead on the sibling pointers to get IO moving more quickly,
4425 * though for slow disks this is unlikely to make much difference to performance
4426 * as the amount of CPU work we have to do before moving to the next block is
4427 * relatively small.
4428 *
4429 * For each btree block that we load, modify the owner appropriately, set the
4430 * buffer as an ordered buffer and log it appropriately. We need to ensure that
4431 * we mark the region we change dirty so that if the buffer is relogged in
4432 * a subsequent transaction the changes we make here as an ordered buffer are
4433 * correctly relogged in that transaction.  If we are in recovery context, then
4434 * just queue the modified buffer as delayed write buffer so the transaction
4435 * recovery completion writes the changes to disk.
4436 */
4437struct xfs_btree_block_change_owner_info {
4438        __uint64_t              new_owner;
4439        struct list_head        *buffer_list;
4440};
4441
4442static int
4443xfs_btree_block_change_owner(
4444        struct xfs_btree_cur    *cur,
4445        int                     level,
4446        void                    *data)
4447{
4448        struct xfs_btree_block_change_owner_info        *bbcoi = data;
4449        struct xfs_btree_block  *block;
4450        struct xfs_buf          *bp;
4451
4452        /* modify the owner */
4453        block = xfs_btree_get_block(cur, level, &bp);
4454        if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
4455                block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4456        else
4457                block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4458
4459        /*
4460         * If the block is a root block hosted in an inode, we might not have a
4461         * buffer pointer here and we shouldn't attempt to log the change as the
4462         * information is already held in the inode and discarded when the root
4463         * block is formatted into the on-disk inode fork. We still change it,
4464         * though, so everything is consistent in memory.
4465         */
4466        if (bp) {
4467                if (cur->bc_tp) {
4468                        xfs_trans_ordered_buf(cur->bc_tp, bp);
4469                        xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4470                } else {
4471                        xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4472                }
4473        } else {
4474                ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4475                ASSERT(level == cur->bc_nlevels - 1);
4476        }
4477
4478        return 0;
4479}
4480
4481int
4482xfs_btree_change_owner(
4483        struct xfs_btree_cur    *cur,
4484        __uint64_t              new_owner,
4485        struct list_head        *buffer_list)
4486{
4487        struct xfs_btree_block_change_owner_info        bbcoi;
4488
4489        bbcoi.new_owner = new_owner;
4490        bbcoi.buffer_list = buffer_list;
4491
4492        return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4493                        &bbcoi);
4494}
4495
4496/**
4497 * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
4498 *                                    btree block
4499 *
4500 * @bp: buffer containing the btree block
4501 * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
4502 * @pag_max_level: pointer to the per-ag max level field
4503 */
4504bool
4505xfs_btree_sblock_v5hdr_verify(
4506        struct xfs_buf          *bp)
4507{
4508        struct xfs_mount        *mp = bp->b_target->bt_mount;
4509        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4510        struct xfs_perag        *pag = bp->b_pag;
4511
4512        if (!xfs_sb_version_hascrc(&mp->m_sb))
4513                return false;
4514        if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4515                return false;
4516        if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
4517                return false;
4518        if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4519                return false;
4520        return true;
4521}
4522
4523/**
4524 * xfs_btree_sblock_verify() -- verify a short-format btree block
4525 *
4526 * @bp: buffer containing the btree block
4527 * @max_recs: maximum records allowed in this btree node
4528 */
4529bool
4530xfs_btree_sblock_verify(
4531        struct xfs_buf          *bp,
4532        unsigned int            max_recs)
4533{
4534        struct xfs_mount        *mp = bp->b_target->bt_mount;
4535        struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4536
4537        /* numrecs verification */
4538        if (be16_to_cpu(block->bb_numrecs) > max_recs)
4539                return false;
4540
4541        /* sibling pointer verification */
4542        if (!block->bb_u.s.bb_leftsib ||
4543            (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
4544             block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
4545                return false;
4546        if (!block->bb_u.s.bb_rightsib ||
4547            (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
4548             block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
4549                return false;
4550
4551        return true;
4552}
4553
4554/*
4555 * Calculate the number of btree levels needed to store a given number of
4556 * records in a short-format btree.
4557 */
4558uint
4559xfs_btree_compute_maxlevels(
4560        struct xfs_mount        *mp,
4561        uint                    *limits,
4562        unsigned long           len)
4563{
4564        uint                    level;
4565        unsigned long           maxblocks;
4566
4567        maxblocks = (len + limits[0] - 1) / limits[0];
4568        for (level = 1; maxblocks > 1; level++)
4569                maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4570        return level;
4571}
4572
4573/*
4574 * Query a regular btree for all records overlapping a given interval.
4575 * Start with a LE lookup of the key of low_rec and return all records
4576 * until we find a record with a key greater than the key of high_rec.
4577 */
4578STATIC int
4579xfs_btree_simple_query_range(
4580        struct xfs_btree_cur            *cur,
4581        union xfs_btree_key             *low_key,
4582        union xfs_btree_key             *high_key,
4583        xfs_btree_query_range_fn        fn,
4584        void                            *priv)
4585{
4586        union xfs_btree_rec             *recp;
4587        union xfs_btree_key             rec_key;
4588        __int64_t                       diff;
4589        int                             stat;
4590        bool                            firstrec = true;
4591        int                             error;
4592
4593        ASSERT(cur->bc_ops->init_high_key_from_rec);
4594        ASSERT(cur->bc_ops->diff_two_keys);
4595
4596        /*
4597         * Find the leftmost record.  The btree cursor must be set
4598         * to the low record used to generate low_key.
4599         */
4600        stat = 0;
4601        error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
4602        if (error)
4603                goto out;
4604
4605        /* Nothing?  See if there's anything to the right. */
4606        if (!stat) {
4607                error = xfs_btree_increment(cur, 0, &stat);
4608                if (error)
4609                        goto out;
4610        }
4611
4612        while (stat) {
4613                /* Find the record. */
4614                error = xfs_btree_get_rec(cur, &recp, &stat);
4615                if (error || !stat)
4616                        break;
4617
4618                /* Skip if high_key(rec) < low_key. */
4619                if (firstrec) {
4620                        cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4621                        firstrec = false;
4622                        diff = cur->bc_ops->diff_two_keys(cur, low_key,
4623                                        &rec_key);
4624                        if (diff > 0)
4625                                goto advloop;
4626                }
4627
4628                /* Stop if high_key < low_key(rec). */
4629                cur->bc_ops->init_key_from_rec(&rec_key, recp);
4630                diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
4631                if (diff > 0)
4632                        break;
4633
4634                /* Callback */
4635                error = fn(cur, recp, priv);
4636                if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
4637                        break;
4638
4639advloop:
4640                /* Move on to the next record. */
4641                error = xfs_btree_increment(cur, 0, &stat);
4642                if (error)
4643                        break;
4644        }
4645
4646out:
4647        return error;
4648}
4649
4650/*
4651 * Query an overlapped interval btree for all records overlapping a given
4652 * interval.  This function roughly follows the algorithm given in
4653 * "Interval Trees" of _Introduction to Algorithms_, which is section
4654 * 14.3 in the 2nd and 3rd editions.
4655 *
4656 * First, generate keys for the low and high records passed in.
4657 *
4658 * For any leaf node, generate the high and low keys for the record.
4659 * If the record keys overlap with the query low/high keys, pass the
4660 * record to the function iterator.
4661 *
4662 * For any internal node, compare the low and high keys of each
4663 * pointer against the query low/high keys.  If there's an overlap,
4664 * follow the pointer.
4665 *
4666 * As an optimization, we stop scanning a block when we find a low key
4667 * that is greater than the query's high key.
4668 */
4669STATIC int
4670xfs_btree_overlapped_query_range(
4671        struct xfs_btree_cur            *cur,
4672        union xfs_btree_key             *low_key,
4673        union xfs_btree_key             *high_key,
4674        xfs_btree_query_range_fn        fn,
4675        void                            *priv)
4676{
4677        union xfs_btree_ptr             ptr;
4678        union xfs_btree_ptr             *pp;
4679        union xfs_btree_key             rec_key;
4680        union xfs_btree_key             rec_hkey;
4681        union xfs_btree_key             *lkp;
4682        union xfs_btree_key             *hkp;
4683        union xfs_btree_rec             *recp;
4684        struct xfs_btree_block          *block;
4685        __int64_t                       ldiff;
4686        __int64_t                       hdiff;
4687        int                             level;
4688        struct xfs_buf                  *bp;
4689        int                             i;
4690        int                             error;
4691
4692        /* Load the root of the btree. */
4693        level = cur->bc_nlevels - 1;
4694        cur->bc_ops->init_ptr_from_cur(cur, &ptr);
4695        error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
4696        if (error)
4697                return error;
4698        xfs_btree_get_block(cur, level, &bp);
4699        trace_xfs_btree_overlapped_query_range(cur, level, bp);
4700#ifdef DEBUG
4701        error = xfs_btree_check_block(cur, block, level, bp);
4702        if (error)
4703                goto out;
4704#endif
4705        cur->bc_ptrs[level] = 1;
4706
4707        while (level < cur->bc_nlevels) {
4708                block = xfs_btree_get_block(cur, level, &bp);
4709
4710                /* End of node, pop back towards the root. */
4711                if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) {
4712pop_up:
4713                        if (level < cur->bc_nlevels - 1)
4714                                cur->bc_ptrs[level + 1]++;
4715                        level++;
4716                        continue;
4717                }
4718
4719                if (level == 0) {
4720                        /* Handle a leaf node. */
4721                        recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block);
4722
4723                        cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
4724                        ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey,
4725                                        low_key);
4726
4727                        cur->bc_ops->init_key_from_rec(&rec_key, recp);
4728                        hdiff = cur->bc_ops->diff_two_keys(cur, high_key,
4729                                        &rec_key);
4730
4731                        /*
4732                         * If (record's high key >= query's low key) and
4733                         *    (query's high key >= record's low key), then
4734                         * this record overlaps the query range; callback.
4735                         */
4736                        if (ldiff >= 0 && hdiff >= 0) {
4737                                error = fn(cur, recp, priv);
4738                                if (error < 0 ||
4739                                    error == XFS_BTREE_QUERY_RANGE_ABORT)
4740                                        break;
4741                        } else if (hdiff < 0) {
4742                                /* Record is larger than high key; pop. */
4743                                goto pop_up;
4744                        }
4745                        cur->bc_ptrs[level]++;
4746                        continue;
4747                }
4748
4749                /* Handle an internal node. */
4750                lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block);
4751                hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block);
4752                pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block);
4753
4754                ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key);
4755                hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp);
4756
4757                /*
4758                 * If (pointer's high key >= query's low key) and
4759                 *    (query's high key >= pointer's low key), then
4760                 * this record overlaps the query range; follow pointer.
4761                 */
4762                if (ldiff >= 0 && hdiff >= 0) {
4763                        level--;
4764                        error = xfs_btree_lookup_get_block(cur, level, pp,
4765                                        &block);
4766                        if (error)
4767                                goto out;
4768                        xfs_btree_get_block(cur, level, &bp);
4769                        trace_xfs_btree_overlapped_query_range(cur, level, bp);
4770#ifdef DEBUG
4771                        error = xfs_btree_check_block(cur, block, level, bp);
4772                        if (error)
4773                                goto out;
4774#endif
4775                        cur->bc_ptrs[level] = 1;
4776                        continue;
4777                } else if (hdiff < 0) {
4778                        /* The low key is larger than the upper range; pop. */
4779                        goto pop_up;
4780                }
4781                cur->bc_ptrs[level]++;
4782        }
4783
4784out:
4785        /*
4786         * If we don't end this function with the cursor pointing at a record
4787         * block, a subsequent non-error cursor deletion will not release
4788         * node-level buffers, causing a buffer leak.  This is quite possible
4789         * with a zero-results range query, so release the buffers if we
4790         * failed to return any results.
4791         */
4792        if (cur->bc_bufs[0] == NULL) {
4793                for (i = 0; i < cur->bc_nlevels; i++) {
4794                        if (cur->bc_bufs[i]) {
4795                                xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
4796                                cur->bc_bufs[i] = NULL;
4797                                cur->bc_ptrs[i] = 0;
4798                                cur->bc_ra[i] = 0;
4799                        }
4800                }
4801        }
4802
4803        return error;
4804}
4805
4806/*
4807 * Query a btree for all records overlapping a given interval of keys.  The
4808 * supplied function will be called with each record found; return one of the
4809 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
4810 * code.  This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
4811 * negative error code.
4812 */
4813int
4814xfs_btree_query_range(
4815        struct xfs_btree_cur            *cur,
4816        union xfs_btree_irec            *low_rec,
4817        union xfs_btree_irec            *high_rec,
4818        xfs_btree_query_range_fn        fn,
4819        void                            *priv)
4820{
4821        union xfs_btree_rec             rec;
4822        union xfs_btree_key             low_key;
4823        union xfs_btree_key             high_key;
4824
4825        /* Find the keys of both ends of the interval. */
4826        cur->bc_rec = *high_rec;
4827        cur->bc_ops->init_rec_from_cur(cur, &rec);
4828        cur->bc_ops->init_key_from_rec(&high_key, &rec);
4829
4830        cur->bc_rec = *low_rec;
4831        cur->bc_ops->init_rec_from_cur(cur, &rec);
4832        cur->bc_ops->init_key_from_rec(&low_key, &rec);
4833
4834        /* Enforce low key < high key. */
4835        if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0)
4836                return -EINVAL;
4837
4838        if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
4839                return xfs_btree_simple_query_range(cur, &low_key,
4840                                &high_key, fn, priv);
4841        return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
4842                        fn, priv);
4843}
4844
4845/* Query a btree for all records. */
4846int
4847xfs_btree_query_all(
4848        struct xfs_btree_cur            *cur,
4849        xfs_btree_query_range_fn        fn,
4850        void                            *priv)
4851{
4852        union xfs_btree_irec            low_rec;
4853        union xfs_btree_irec            high_rec;
4854
4855        memset(&low_rec, 0, sizeof(low_rec));
4856        memset(&high_rec, 0xFF, sizeof(high_rec));
4857        return xfs_btree_query_range(cur, &low_rec, &high_rec, fn, priv);
4858}
4859
4860/*
4861 * Calculate the number of blocks needed to store a given number of records
4862 * in a short-format (per-AG metadata) btree.
4863 */
4864xfs_extlen_t
4865xfs_btree_calc_size(
4866        struct xfs_mount        *mp,
4867        uint                    *limits,
4868        unsigned long long      len)
4869{
4870        int                     level;
4871        int                     maxrecs;
4872        xfs_extlen_t            rval;
4873
4874        maxrecs = limits[0];
4875        for (level = 0, rval = 0; len > 1; level++) {
4876                len += maxrecs - 1;
4877                do_div(len, maxrecs);
4878                maxrecs = limits[1];
4879                rval += len;
4880        }
4881        return rval;
4882}
4883
4884static int
4885xfs_btree_count_blocks_helper(
4886        struct xfs_btree_cur    *cur,
4887        int                     level,
4888        void                    *data)
4889{
4890        xfs_extlen_t            *blocks = data;
4891        (*blocks)++;
4892
4893        return 0;
4894}
4895
4896/* Count the blocks in a btree and return the result in *blocks. */
4897int
4898xfs_btree_count_blocks(
4899        struct xfs_btree_cur    *cur,
4900        xfs_extlen_t            *blocks)
4901{
4902        *blocks = 0;
4903        return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
4904                        blocks);
4905}
4906