linux/fs/reiserfs/fix_node.c
<<
>>
Prefs
   1/*
   2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
   3 */
   4
   5#include <linux/time.h>
   6#include <linux/slab.h>
   7#include <linux/string.h>
   8#include "reiserfs.h"
   9#include <linux/buffer_head.h>
  10
  11/*
  12 * To make any changes in the tree we find a node that contains item
  13 * to be changed/deleted or position in the node we insert a new item
  14 * to. We call this node S. To do balancing we need to decide what we
  15 * will shift to left/right neighbor, or to a new node, where new item
  16 * will be etc. To make this analysis simpler we build virtual
  17 * node. Virtual node is an array of items, that will replace items of
  18 * node S. (For instance if we are going to delete an item, virtual
  19 * node does not contain it). Virtual node keeps information about
  20 * item sizes and types, mergeability of first and last items, sizes
  21 * of all entries in directory item. We use this array of items when
  22 * calculating what we can shift to neighbors and how many nodes we
  23 * have to have if we do not any shiftings, if we shift to left/right
  24 * neighbor or to both.
  25 */
  26
  27/*
  28 * Takes item number in virtual node, returns number of item
  29 * that it has in source buffer
  30 */
  31static inline int old_item_num(int new_num, int affected_item_num, int mode)
  32{
  33        if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
  34                return new_num;
  35
  36        if (mode == M_INSERT) {
  37
  38                RFALSE(new_num == 0,
  39                       "vs-8005: for INSERT mode and item number of inserted item");
  40
  41                return new_num - 1;
  42        }
  43
  44        RFALSE(mode != M_DELETE,
  45               "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
  46               mode);
  47        /* delete mode */
  48        return new_num + 1;
  49}
  50
  51static void create_virtual_node(struct tree_balance *tb, int h)
  52{
  53        struct item_head *ih;
  54        struct virtual_node *vn = tb->tb_vn;
  55        int new_num;
  56        struct buffer_head *Sh; /* this comes from tb->S[h] */
  57
  58        Sh = PATH_H_PBUFFER(tb->tb_path, h);
  59
  60        /* size of changed node */
  61        vn->vn_size =
  62            MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
  63
  64        /* for internal nodes array if virtual items is not created */
  65        if (h) {
  66                vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
  67                return;
  68        }
  69
  70        /* number of items in virtual node  */
  71        vn->vn_nr_item =
  72            B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
  73            ((vn->vn_mode == M_DELETE) ? 1 : 0);
  74
  75        /* first virtual item */
  76        vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
  77        memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
  78        vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
  79
  80        /* first item in the node */
  81        ih = item_head(Sh, 0);
  82
  83        /* define the mergeability for 0-th item (if it is not being deleted) */
  84        if (op_is_left_mergeable(&ih->ih_key, Sh->b_size)
  85            && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
  86                vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
  87
  88        /*
  89         * go through all items that remain in the virtual
  90         * node (except for the new (inserted) one)
  91         */
  92        for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
  93                int j;
  94                struct virtual_item *vi = vn->vn_vi + new_num;
  95                int is_affected =
  96                    ((new_num != vn->vn_affected_item_num) ? 0 : 1);
  97
  98                if (is_affected && vn->vn_mode == M_INSERT)
  99                        continue;
 100
 101                /* get item number in source node */
 102                j = old_item_num(new_num, vn->vn_affected_item_num,
 103                                 vn->vn_mode);
 104
 105                vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
 106                vi->vi_ih = ih + j;
 107                vi->vi_item = ih_item_body(Sh, ih + j);
 108                vi->vi_uarea = vn->vn_free_ptr;
 109
 110                /*
 111                 * FIXME: there is no check that item operation did not
 112                 * consume too much memory
 113                 */
 114                vn->vn_free_ptr +=
 115                    op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
 116                if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
 117                        reiserfs_panic(tb->tb_sb, "vs-8030",
 118                                       "virtual node space consumed");
 119
 120                if (!is_affected)
 121                        /* this is not being changed */
 122                        continue;
 123
 124                if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
 125                        vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
 126                        /* pointer to data which is going to be pasted */
 127                        vi->vi_new_data = vn->vn_data;
 128                }
 129        }
 130
 131        /* virtual inserted item is not defined yet */
 132        if (vn->vn_mode == M_INSERT) {
 133                struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
 134
 135                RFALSE(vn->vn_ins_ih == NULL,
 136                       "vs-8040: item header of inserted item is not specified");
 137                vi->vi_item_len = tb->insert_size[0];
 138                vi->vi_ih = vn->vn_ins_ih;
 139                vi->vi_item = vn->vn_data;
 140                vi->vi_uarea = vn->vn_free_ptr;
 141
 142                op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
 143                             tb->insert_size[0]);
 144        }
 145
 146        /*
 147         * set right merge flag we take right delimiting key and
 148         * check whether it is a mergeable item
 149         */
 150        if (tb->CFR[0]) {
 151                struct reiserfs_key *key;
 152
 153                key = internal_key(tb->CFR[0], tb->rkey[0]);
 154                if (op_is_left_mergeable(key, Sh->b_size)
 155                    && (vn->vn_mode != M_DELETE
 156                        || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
 157                        vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
 158                            VI_TYPE_RIGHT_MERGEABLE;
 159
 160#ifdef CONFIG_REISERFS_CHECK
 161                if (op_is_left_mergeable(key, Sh->b_size) &&
 162                    !(vn->vn_mode != M_DELETE
 163                      || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
 164                        /*
 165                         * we delete last item and it could be merged
 166                         * with right neighbor's first item
 167                         */
 168                        if (!
 169                            (B_NR_ITEMS(Sh) == 1
 170                             && is_direntry_le_ih(item_head(Sh, 0))
 171                             && ih_entry_count(item_head(Sh, 0)) == 1)) {
 172                                /*
 173                                 * node contains more than 1 item, or item
 174                                 * is not directory item, or this item
 175                                 * contains more than 1 entry
 176                                 */
 177                                print_block(Sh, 0, -1, -1);
 178                                reiserfs_panic(tb->tb_sb, "vs-8045",
 179                                               "rdkey %k, affected item==%d "
 180                                               "(mode==%c) Must be %c",
 181                                               key, vn->vn_affected_item_num,
 182                                               vn->vn_mode, M_DELETE);
 183                        }
 184                }
 185#endif
 186
 187        }
 188}
 189
 190/*
 191 * Using virtual node check, how many items can be
 192 * shifted to left neighbor
 193 */
 194static void check_left(struct tree_balance *tb, int h, int cur_free)
 195{
 196        int i;
 197        struct virtual_node *vn = tb->tb_vn;
 198        struct virtual_item *vi;
 199        int d_size, ih_size;
 200
 201        RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
 202
 203        /* internal level */
 204        if (h > 0) {
 205                tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
 206                return;
 207        }
 208
 209        /* leaf level */
 210
 211        if (!cur_free || !vn->vn_nr_item) {
 212                /* no free space or nothing to move */
 213                tb->lnum[h] = 0;
 214                tb->lbytes = -1;
 215                return;
 216        }
 217
 218        RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
 219               "vs-8055: parent does not exist or invalid");
 220
 221        vi = vn->vn_vi;
 222        if ((unsigned int)cur_free >=
 223            (vn->vn_size -
 224             ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
 225                /* all contents of S[0] fits into L[0] */
 226
 227                RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
 228                       "vs-8055: invalid mode or balance condition failed");
 229
 230                tb->lnum[0] = vn->vn_nr_item;
 231                tb->lbytes = -1;
 232                return;
 233        }
 234
 235        d_size = 0, ih_size = IH_SIZE;
 236
 237        /* first item may be merge with last item in left neighbor */
 238        if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
 239                d_size = -((int)IH_SIZE), ih_size = 0;
 240
 241        tb->lnum[0] = 0;
 242        for (i = 0; i < vn->vn_nr_item;
 243             i++, ih_size = IH_SIZE, d_size = 0, vi++) {
 244                d_size += vi->vi_item_len;
 245                if (cur_free >= d_size) {
 246                        /* the item can be shifted entirely */
 247                        cur_free -= d_size;
 248                        tb->lnum[0]++;
 249                        continue;
 250                }
 251
 252                /* the item cannot be shifted entirely, try to split it */
 253                /*
 254                 * check whether L[0] can hold ih and at least one byte
 255                 * of the item body
 256                 */
 257
 258                /* cannot shift even a part of the current item */
 259                if (cur_free <= ih_size) {
 260                        tb->lbytes = -1;
 261                        return;
 262                }
 263                cur_free -= ih_size;
 264
 265                tb->lbytes = op_check_left(vi, cur_free, 0, 0);
 266                if (tb->lbytes != -1)
 267                        /* count partially shifted item */
 268                        tb->lnum[0]++;
 269
 270                break;
 271        }
 272
 273        return;
 274}
 275
 276/*
 277 * Using virtual node check, how many items can be
 278 * shifted to right neighbor
 279 */
 280static void check_right(struct tree_balance *tb, int h, int cur_free)
 281{
 282        int i;
 283        struct virtual_node *vn = tb->tb_vn;
 284        struct virtual_item *vi;
 285        int d_size, ih_size;
 286
 287        RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
 288
 289        /* internal level */
 290        if (h > 0) {
 291                tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
 292                return;
 293        }
 294
 295        /* leaf level */
 296
 297        if (!cur_free || !vn->vn_nr_item) {
 298                /* no free space  */
 299                tb->rnum[h] = 0;
 300                tb->rbytes = -1;
 301                return;
 302        }
 303
 304        RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
 305               "vs-8075: parent does not exist or invalid");
 306
 307        vi = vn->vn_vi + vn->vn_nr_item - 1;
 308        if ((unsigned int)cur_free >=
 309            (vn->vn_size -
 310             ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
 311                /* all contents of S[0] fits into R[0] */
 312
 313                RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
 314                       "vs-8080: invalid mode or balance condition failed");
 315
 316                tb->rnum[h] = vn->vn_nr_item;
 317                tb->rbytes = -1;
 318                return;
 319        }
 320
 321        d_size = 0, ih_size = IH_SIZE;
 322
 323        /* last item may be merge with first item in right neighbor */
 324        if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
 325                d_size = -(int)IH_SIZE, ih_size = 0;
 326
 327        tb->rnum[0] = 0;
 328        for (i = vn->vn_nr_item - 1; i >= 0;
 329             i--, d_size = 0, ih_size = IH_SIZE, vi--) {
 330                d_size += vi->vi_item_len;
 331                if (cur_free >= d_size) {
 332                        /* the item can be shifted entirely */
 333                        cur_free -= d_size;
 334                        tb->rnum[0]++;
 335                        continue;
 336                }
 337
 338                /*
 339                 * check whether R[0] can hold ih and at least one
 340                 * byte of the item body
 341                 */
 342
 343                /* cannot shift even a part of the current item */
 344                if (cur_free <= ih_size) {
 345                        tb->rbytes = -1;
 346                        return;
 347                }
 348
 349                /*
 350                 * R[0] can hold the header of the item and at least
 351                 * one byte of its body
 352                 */
 353                cur_free -= ih_size;    /* cur_free is still > 0 */
 354
 355                tb->rbytes = op_check_right(vi, cur_free);
 356                if (tb->rbytes != -1)
 357                        /* count partially shifted item */
 358                        tb->rnum[0]++;
 359
 360                break;
 361        }
 362
 363        return;
 364}
 365
 366/*
 367 * from - number of items, which are shifted to left neighbor entirely
 368 * to - number of item, which are shifted to right neighbor entirely
 369 * from_bytes - number of bytes of boundary item (or directory entries)
 370 *              which are shifted to left neighbor
 371 * to_bytes - number of bytes of boundary item (or directory entries)
 372 *            which are shifted to right neighbor
 373 */
 374static int get_num_ver(int mode, struct tree_balance *tb, int h,
 375                       int from, int from_bytes,
 376                       int to, int to_bytes, short *snum012, int flow)
 377{
 378        int i;
 379        int cur_free;
 380        int units;
 381        struct virtual_node *vn = tb->tb_vn;
 382        int total_node_size, max_node_size, current_item_size;
 383        int needed_nodes;
 384
 385        /* position of item we start filling node from */
 386        int start_item;
 387
 388        /* position of item we finish filling node by */
 389        int end_item;
 390
 391        /*
 392         * number of first bytes (entries for directory) of start_item-th item
 393         * we do not include into node that is being filled
 394         */
 395        int start_bytes;
 396
 397        /*
 398         * number of last bytes (entries for directory) of end_item-th item
 399         * we do node include into node that is being filled
 400         */
 401        int end_bytes;
 402
 403        /*
 404         * these are positions in virtual item of items, that are split
 405         * between S[0] and S1new and S1new and S2new
 406         */
 407        int split_item_positions[2];
 408
 409        split_item_positions[0] = -1;
 410        split_item_positions[1] = -1;
 411
 412        /*
 413         * We only create additional nodes if we are in insert or paste mode
 414         * or we are in replace mode at the internal level. If h is 0 and
 415         * the mode is M_REPLACE then in fix_nodes we change the mode to
 416         * paste or insert before we get here in the code.
 417         */
 418        RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
 419               "vs-8100: insert_size < 0 in overflow");
 420
 421        max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
 422
 423        /*
 424         * snum012 [0-2] - number of items, that lay
 425         * to S[0], first new node and second new node
 426         */
 427        snum012[3] = -1;        /* s1bytes */
 428        snum012[4] = -1;        /* s2bytes */
 429
 430        /* internal level */
 431        if (h > 0) {
 432                i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
 433                if (i == max_node_size)
 434                        return 1;
 435                return (i / max_node_size + 1);
 436        }
 437
 438        /* leaf level */
 439        needed_nodes = 1;
 440        total_node_size = 0;
 441        cur_free = max_node_size;
 442
 443        /* start from 'from'-th item */
 444        start_item = from;
 445        /* skip its first 'start_bytes' units */
 446        start_bytes = ((from_bytes != -1) ? from_bytes : 0);
 447
 448        /* last included item is the 'end_item'-th one */
 449        end_item = vn->vn_nr_item - to - 1;
 450        /* do not count last 'end_bytes' units of 'end_item'-th item */
 451        end_bytes = (to_bytes != -1) ? to_bytes : 0;
 452
 453        /*
 454         * go through all item beginning from the start_item-th item
 455         * and ending by the end_item-th item. Do not count first
 456         * 'start_bytes' units of 'start_item'-th item and last
 457         * 'end_bytes' of 'end_item'-th item
 458         */
 459        for (i = start_item; i <= end_item; i++) {
 460                struct virtual_item *vi = vn->vn_vi + i;
 461                int skip_from_end = ((i == end_item) ? end_bytes : 0);
 462
 463                RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
 464
 465                /* get size of current item */
 466                current_item_size = vi->vi_item_len;
 467
 468                /*
 469                 * do not take in calculation head part (from_bytes)
 470                 * of from-th item
 471                 */
 472                current_item_size -=
 473                    op_part_size(vi, 0 /*from start */ , start_bytes);
 474
 475                /* do not take in calculation tail part of last item */
 476                current_item_size -=
 477                    op_part_size(vi, 1 /*from end */ , skip_from_end);
 478
 479                /* if item fits into current node entierly */
 480                if (total_node_size + current_item_size <= max_node_size) {
 481                        snum012[needed_nodes - 1]++;
 482                        total_node_size += current_item_size;
 483                        start_bytes = 0;
 484                        continue;
 485                }
 486
 487                /*
 488                 * virtual item length is longer, than max size of item in
 489                 * a node. It is impossible for direct item
 490                 */
 491                if (current_item_size > max_node_size) {
 492                        RFALSE(is_direct_le_ih(vi->vi_ih),
 493                               "vs-8110: "
 494                               "direct item length is %d. It can not be longer than %d",
 495                               current_item_size, max_node_size);
 496                        /* we will try to split it */
 497                        flow = 1;
 498                }
 499
 500                /* as we do not split items, take new node and continue */
 501                if (!flow) {
 502                        needed_nodes++;
 503                        i--;
 504                        total_node_size = 0;
 505                        continue;
 506                }
 507
 508                /*
 509                 * calculate number of item units which fit into node being
 510                 * filled
 511                 */
 512                {
 513                        int free_space;
 514
 515                        free_space = max_node_size - total_node_size - IH_SIZE;
 516                        units =
 517                            op_check_left(vi, free_space, start_bytes,
 518                                          skip_from_end);
 519                        /*
 520                         * nothing fits into current node, take new
 521                         * node and continue
 522                         */
 523                        if (units == -1) {
 524                                needed_nodes++, i--, total_node_size = 0;
 525                                continue;
 526                        }
 527                }
 528
 529                /* something fits into the current node */
 530                start_bytes += units;
 531                snum012[needed_nodes - 1 + 3] = units;
 532
 533                if (needed_nodes > 2)
 534                        reiserfs_warning(tb->tb_sb, "vs-8111",
 535                                         "split_item_position is out of range");
 536                snum012[needed_nodes - 1]++;
 537                split_item_positions[needed_nodes - 1] = i;
 538                needed_nodes++;
 539                /* continue from the same item with start_bytes != -1 */
 540                start_item = i;
 541                i--;
 542                total_node_size = 0;
 543        }
 544
 545        /*
 546         * sum012[4] (if it is not -1) contains number of units of which
 547         * are to be in S1new, snum012[3] - to be in S0. They are supposed
 548         * to be S1bytes and S2bytes correspondingly, so recalculate
 549         */
 550        if (snum012[4] > 0) {
 551                int split_item_num;
 552                int bytes_to_r, bytes_to_l;
 553                int bytes_to_S1new;
 554
 555                split_item_num = split_item_positions[1];
 556                bytes_to_l =
 557                    ((from == split_item_num
 558                      && from_bytes != -1) ? from_bytes : 0);
 559                bytes_to_r =
 560                    ((end_item == split_item_num
 561                      && end_bytes != -1) ? end_bytes : 0);
 562                bytes_to_S1new =
 563                    ((split_item_positions[0] ==
 564                      split_item_positions[1]) ? snum012[3] : 0);
 565
 566                /* s2bytes */
 567                snum012[4] =
 568                    op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
 569                    bytes_to_r - bytes_to_l - bytes_to_S1new;
 570
 571                if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
 572                    vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
 573                        reiserfs_warning(tb->tb_sb, "vs-8115",
 574                                         "not directory or indirect item");
 575        }
 576
 577        /* now we know S2bytes, calculate S1bytes */
 578        if (snum012[3] > 0) {
 579                int split_item_num;
 580                int bytes_to_r, bytes_to_l;
 581                int bytes_to_S2new;
 582
 583                split_item_num = split_item_positions[0];
 584                bytes_to_l =
 585                    ((from == split_item_num
 586                      && from_bytes != -1) ? from_bytes : 0);
 587                bytes_to_r =
 588                    ((end_item == split_item_num
 589                      && end_bytes != -1) ? end_bytes : 0);
 590                bytes_to_S2new =
 591                    ((split_item_positions[0] == split_item_positions[1]
 592                      && snum012[4] != -1) ? snum012[4] : 0);
 593
 594                /* s1bytes */
 595                snum012[3] =
 596                    op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
 597                    bytes_to_r - bytes_to_l - bytes_to_S2new;
 598        }
 599
 600        return needed_nodes;
 601}
 602
 603
 604/*
 605 * Set parameters for balancing.
 606 * Performs write of results of analysis of balancing into structure tb,
 607 * where it will later be used by the functions that actually do the balancing.
 608 * Parameters:
 609 *      tb      tree_balance structure;
 610 *      h       current level of the node;
 611 *      lnum    number of items from S[h] that must be shifted to L[h];
 612 *      rnum    number of items from S[h] that must be shifted to R[h];
 613 *      blk_num number of blocks that S[h] will be splitted into;
 614 *      s012    number of items that fall into splitted nodes.
 615 *      lbytes  number of bytes which flow to the left neighbor from the
 616 *              item that is not not shifted entirely
 617 *      rbytes  number of bytes which flow to the right neighbor from the
 618 *              item that is not not shifted entirely
 619 *      s1bytes number of bytes which flow to the first  new node when
 620 *              S[0] splits (this number is contained in s012 array)
 621 */
 622
 623static void set_parameters(struct tree_balance *tb, int h, int lnum,
 624                           int rnum, int blk_num, short *s012, int lb, int rb)
 625{
 626
 627        tb->lnum[h] = lnum;
 628        tb->rnum[h] = rnum;
 629        tb->blknum[h] = blk_num;
 630
 631        /* only for leaf level */
 632        if (h == 0) {
 633                if (s012 != NULL) {
 634                        tb->s0num = *s012++;
 635                        tb->snum[0] = *s012++;
 636                        tb->snum[1] = *s012++;
 637                        tb->sbytes[0] = *s012++;
 638                        tb->sbytes[1] = *s012;
 639                }
 640                tb->lbytes = lb;
 641                tb->rbytes = rb;
 642        }
 643        PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
 644        PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
 645
 646        PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
 647        PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
 648}
 649
 650/*
 651 * check if node disappears if we shift tb->lnum[0] items to left
 652 * neighbor and tb->rnum[0] to the right one.
 653 */
 654static int is_leaf_removable(struct tree_balance *tb)
 655{
 656        struct virtual_node *vn = tb->tb_vn;
 657        int to_left, to_right;
 658        int size;
 659        int remain_items;
 660
 661        /*
 662         * number of items that will be shifted to left (right) neighbor
 663         * entirely
 664         */
 665        to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
 666        to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
 667        remain_items = vn->vn_nr_item;
 668
 669        /* how many items remain in S[0] after shiftings to neighbors */
 670        remain_items -= (to_left + to_right);
 671
 672        /* all content of node can be shifted to neighbors */
 673        if (remain_items < 1) {
 674                set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
 675                               NULL, -1, -1);
 676                return 1;
 677        }
 678
 679        /* S[0] is not removable */
 680        if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
 681                return 0;
 682
 683        /* check whether we can divide 1 remaining item between neighbors */
 684
 685        /* get size of remaining item (in item units) */
 686        size = op_unit_num(&vn->vn_vi[to_left]);
 687
 688        if (tb->lbytes + tb->rbytes >= size) {
 689                set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
 690                               tb->lbytes, -1);
 691                return 1;
 692        }
 693
 694        return 0;
 695}
 696
 697/* check whether L, S, R can be joined in one node */
 698static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
 699{
 700        struct virtual_node *vn = tb->tb_vn;
 701        int ih_size;
 702        struct buffer_head *S0;
 703
 704        S0 = PATH_H_PBUFFER(tb->tb_path, 0);
 705
 706        ih_size = 0;
 707        if (vn->vn_nr_item) {
 708                if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
 709                        ih_size += IH_SIZE;
 710
 711                if (vn->vn_vi[vn->vn_nr_item - 1].
 712                    vi_type & VI_TYPE_RIGHT_MERGEABLE)
 713                        ih_size += IH_SIZE;
 714        } else {
 715                /* there was only one item and it will be deleted */
 716                struct item_head *ih;
 717
 718                RFALSE(B_NR_ITEMS(S0) != 1,
 719                       "vs-8125: item number must be 1: it is %d",
 720                       B_NR_ITEMS(S0));
 721
 722                ih = item_head(S0, 0);
 723                if (tb->CFR[0]
 724                    && !comp_short_le_keys(&ih->ih_key,
 725                                           internal_key(tb->CFR[0],
 726                                                          tb->rkey[0])))
 727                        /*
 728                         * Directory must be in correct state here: that is
 729                         * somewhere at the left side should exist first
 730                         * directory item. But the item being deleted can
 731                         * not be that first one because its right neighbor
 732                         * is item of the same directory. (But first item
 733                         * always gets deleted in last turn). So, neighbors
 734                         * of deleted item can be merged, so we can save
 735                         * ih_size
 736                         */
 737                        if (is_direntry_le_ih(ih)) {
 738                                ih_size = IH_SIZE;
 739
 740                                /*
 741                                 * we might check that left neighbor exists
 742                                 * and is of the same directory
 743                                 */
 744                                RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
 745                                       "vs-8130: first directory item can not be removed until directory is not empty");
 746                        }
 747
 748        }
 749
 750        if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
 751                set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
 752                PROC_INFO_INC(tb->tb_sb, leaves_removable);
 753                return 1;
 754        }
 755        return 0;
 756
 757}
 758
 759/* when we do not split item, lnum and rnum are numbers of entire items */
 760#define SET_PAR_SHIFT_LEFT \
 761if (h)\
 762{\
 763   int to_l;\
 764   \
 765   to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
 766              (MAX_NR_KEY(Sh) + 1 - lpar);\
 767              \
 768              set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
 769}\
 770else \
 771{\
 772   if (lset==LEFT_SHIFT_FLOW)\
 773     set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
 774                     tb->lbytes, -1);\
 775   else\
 776     set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
 777                     -1, -1);\
 778}
 779
 780#define SET_PAR_SHIFT_RIGHT \
 781if (h)\
 782{\
 783   int to_r;\
 784   \
 785   to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
 786   \
 787   set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
 788}\
 789else \
 790{\
 791   if (rset==RIGHT_SHIFT_FLOW)\
 792     set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
 793                  -1, tb->rbytes);\
 794   else\
 795     set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
 796                  -1, -1);\
 797}
 798
 799static void free_buffers_in_tb(struct tree_balance *tb)
 800{
 801        int i;
 802
 803        pathrelse(tb->tb_path);
 804
 805        for (i = 0; i < MAX_HEIGHT; i++) {
 806                brelse(tb->L[i]);
 807                brelse(tb->R[i]);
 808                brelse(tb->FL[i]);
 809                brelse(tb->FR[i]);
 810                brelse(tb->CFL[i]);
 811                brelse(tb->CFR[i]);
 812
 813                tb->L[i] = NULL;
 814                tb->R[i] = NULL;
 815                tb->FL[i] = NULL;
 816                tb->FR[i] = NULL;
 817                tb->CFL[i] = NULL;
 818                tb->CFR[i] = NULL;
 819        }
 820}
 821
 822/*
 823 * Get new buffers for storing new nodes that are created while balancing.
 824 * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
 825 *              CARRY_ON - schedule didn't occur while the function worked;
 826 *              NO_DISK_SPACE - no disk space.
 827 */
 828/* The function is NOT SCHEDULE-SAFE! */
 829static int get_empty_nodes(struct tree_balance *tb, int h)
 830{
 831        struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h);
 832        b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
 833        int counter, number_of_freeblk;
 834        int  amount_needed;     /* number of needed empty blocks */
 835        int  retval = CARRY_ON;
 836        struct super_block *sb = tb->tb_sb;
 837
 838        /*
 839         * number_of_freeblk is the number of empty blocks which have been
 840         * acquired for use by the balancing algorithm minus the number of
 841         * empty blocks used in the previous levels of the analysis,
 842         * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule
 843         * occurs after empty blocks are acquired, and the balancing analysis
 844         * is then restarted, amount_needed is the number needed by this
 845         * level (h) of the balancing analysis.
 846         *
 847         * Note that for systems with many processes writing, it would be
 848         * more layout optimal to calculate the total number needed by all
 849         * levels and then to run reiserfs_new_blocks to get all of them at
 850         * once.
 851         */
 852
 853        /*
 854         * Initiate number_of_freeblk to the amount acquired prior to the
 855         * restart of the analysis or 0 if not restarted, then subtract the
 856         * amount needed by all of the levels of the tree below h.
 857         */
 858        /* blknum includes S[h], so we subtract 1 in this calculation */
 859        for (counter = 0, number_of_freeblk = tb->cur_blknum;
 860             counter < h; counter++)
 861                number_of_freeblk -=
 862                    (tb->blknum[counter]) ? (tb->blknum[counter] -
 863                                                   1) : 0;
 864
 865        /* Allocate missing empty blocks. */
 866        /* if Sh == 0  then we are getting a new root */
 867        amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
 868        /*
 869         * Amount_needed = the amount that we need more than the
 870         * amount that we have.
 871         */
 872        if (amount_needed > number_of_freeblk)
 873                amount_needed -= number_of_freeblk;
 874        else    /* If we have enough already then there is nothing to do. */
 875                return CARRY_ON;
 876
 877        /*
 878         * No need to check quota - is not allocated for blocks used
 879         * for formatted nodes
 880         */
 881        if (reiserfs_new_form_blocknrs(tb, blocknrs,
 882                                       amount_needed) == NO_DISK_SPACE)
 883                return NO_DISK_SPACE;
 884
 885        /* for each blocknumber we just got, get a buffer and stick it on FEB */
 886        for (blocknr = blocknrs, counter = 0;
 887             counter < amount_needed; blocknr++, counter++) {
 888
 889                RFALSE(!*blocknr,
 890                       "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
 891
 892                new_bh = sb_getblk(sb, *blocknr);
 893                RFALSE(buffer_dirty(new_bh) ||
 894                       buffer_journaled(new_bh) ||
 895                       buffer_journal_dirty(new_bh),
 896                       "PAP-8140: journaled or dirty buffer %b for the new block",
 897                       new_bh);
 898
 899                /* Put empty buffers into the array. */
 900                RFALSE(tb->FEB[tb->cur_blknum],
 901                       "PAP-8141: busy slot for new buffer");
 902
 903                set_buffer_journal_new(new_bh);
 904                tb->FEB[tb->cur_blknum++] = new_bh;
 905        }
 906
 907        if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb))
 908                retval = REPEAT_SEARCH;
 909
 910        return retval;
 911}
 912
 913/*
 914 * Get free space of the left neighbor, which is stored in the parent
 915 * node of the left neighbor.
 916 */
 917static int get_lfree(struct tree_balance *tb, int h)
 918{
 919        struct buffer_head *l, *f;
 920        int order;
 921
 922        if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
 923            (l = tb->FL[h]) == NULL)
 924                return 0;
 925
 926        if (f == l)
 927                order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
 928        else {
 929                order = B_NR_ITEMS(l);
 930                f = l;
 931        }
 932
 933        return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
 934}
 935
 936/*
 937 * Get free space of the right neighbor,
 938 * which is stored in the parent node of the right neighbor.
 939 */
 940static int get_rfree(struct tree_balance *tb, int h)
 941{
 942        struct buffer_head *r, *f;
 943        int order;
 944
 945        if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
 946            (r = tb->FR[h]) == NULL)
 947                return 0;
 948
 949        if (f == r)
 950                order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
 951        else {
 952                order = 0;
 953                f = r;
 954        }
 955
 956        return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
 957
 958}
 959
 960/* Check whether left neighbor is in memory. */
 961static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
 962{
 963        struct buffer_head *father, *left;
 964        struct super_block *sb = tb->tb_sb;
 965        b_blocknr_t left_neighbor_blocknr;
 966        int left_neighbor_position;
 967
 968        /* Father of the left neighbor does not exist. */
 969        if (!tb->FL[h])
 970                return 0;
 971
 972        /* Calculate father of the node to be balanced. */
 973        father = PATH_H_PBUFFER(tb->tb_path, h + 1);
 974
 975        RFALSE(!father ||
 976               !B_IS_IN_TREE(father) ||
 977               !B_IS_IN_TREE(tb->FL[h]) ||
 978               !buffer_uptodate(father) ||
 979               !buffer_uptodate(tb->FL[h]),
 980               "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
 981               father, tb->FL[h]);
 982
 983        /*
 984         * Get position of the pointer to the left neighbor
 985         * into the left father.
 986         */
 987        left_neighbor_position = (father == tb->FL[h]) ?
 988            tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
 989        /* Get left neighbor block number. */
 990        left_neighbor_blocknr =
 991            B_N_CHILD_NUM(tb->FL[h], left_neighbor_position);
 992        /* Look for the left neighbor in the cache. */
 993        if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) {
 994
 995                RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
 996                       "vs-8170: left neighbor (%b %z) is not in the tree",
 997                       left, left);
 998                put_bh(left);
 999                return 1;
1000        }
1001
1002        return 0;
1003}
1004
1005#define LEFT_PARENTS  'l'
1006#define RIGHT_PARENTS 'r'
1007
1008static void decrement_key(struct cpu_key *key)
1009{
1010        /* call item specific function for this key */
1011        item_ops[cpu_key_k_type(key)]->decrement_key(key);
1012}
1013
1014/*
1015 * Calculate far left/right parent of the left/right neighbor of the
1016 * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor
1017 * of the parent F[h].
1018 * Calculate left/right common parent of the current node and L[h]/R[h].
1019 * Calculate left/right delimiting key position.
1020 * Returns:     PATH_INCORRECT    - path in the tree is not correct
1021 *              SCHEDULE_OCCURRED - schedule occurred while the function worked
1022 *              CARRY_ON          - schedule didn't occur while the function
1023 *                                  worked
1024 */
1025static int get_far_parent(struct tree_balance *tb,
1026                          int h,
1027                          struct buffer_head **pfather,
1028                          struct buffer_head **pcom_father, char c_lr_par)
1029{
1030        struct buffer_head *parent;
1031        INITIALIZE_PATH(s_path_to_neighbor_father);
1032        struct treepath *path = tb->tb_path;
1033        struct cpu_key s_lr_father_key;
1034        int counter,
1035            position = INT_MAX,
1036            first_last_position = 0,
1037            path_offset = PATH_H_PATH_OFFSET(path, h);
1038
1039        /*
1040         * Starting from F[h] go upwards in the tree, and look for the common
1041         * ancestor of F[h], and its neighbor l/r, that should be obtained.
1042         */
1043
1044        counter = path_offset;
1045
1046        RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET,
1047               "PAP-8180: invalid path length");
1048
1049        for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
1050                /*
1051                 * Check whether parent of the current buffer in the path
1052                 * is really parent in the tree.
1053                 */
1054                if (!B_IS_IN_TREE
1055                    (parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
1056                        return REPEAT_SEARCH;
1057
1058                /* Check whether position in the parent is correct. */
1059                if ((position =
1060                     PATH_OFFSET_POSITION(path,
1061                                          counter - 1)) >
1062                    B_NR_ITEMS(parent))
1063                        return REPEAT_SEARCH;
1064
1065                /*
1066                 * Check whether parent at the path really points
1067                 * to the child.
1068                 */
1069                if (B_N_CHILD_NUM(parent, position) !=
1070                    PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
1071                        return REPEAT_SEARCH;
1072
1073                /*
1074                 * Return delimiting key if position in the parent is not
1075                 * equal to first/last one.
1076                 */
1077                if (c_lr_par == RIGHT_PARENTS)
1078                        first_last_position = B_NR_ITEMS(parent);
1079                if (position != first_last_position) {
1080                        *pcom_father = parent;
1081                        get_bh(*pcom_father);
1082                        /*(*pcom_father = parent)->b_count++; */
1083                        break;
1084                }
1085        }
1086
1087        /* if we are in the root of the tree, then there is no common father */
1088        if (counter == FIRST_PATH_ELEMENT_OFFSET) {
1089                /*
1090                 * Check whether first buffer in the path is the
1091                 * root of the tree.
1092                 */
1093                if (PATH_OFFSET_PBUFFER
1094                    (tb->tb_path,
1095                     FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
1096                    SB_ROOT_BLOCK(tb->tb_sb)) {
1097                        *pfather = *pcom_father = NULL;
1098                        return CARRY_ON;
1099                }
1100                return REPEAT_SEARCH;
1101        }
1102
1103        RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL,
1104               "PAP-8185: (%b %z) level too small",
1105               *pcom_father, *pcom_father);
1106
1107        /* Check whether the common parent is locked. */
1108
1109        if (buffer_locked(*pcom_father)) {
1110
1111                /* Release the write lock while the buffer is busy */
1112                int depth = reiserfs_write_unlock_nested(tb->tb_sb);
1113                __wait_on_buffer(*pcom_father);
1114                reiserfs_write_lock_nested(tb->tb_sb, depth);
1115                if (FILESYSTEM_CHANGED_TB(tb)) {
1116                        brelse(*pcom_father);
1117                        return REPEAT_SEARCH;
1118                }
1119        }
1120
1121        /*
1122         * So, we got common parent of the current node and its
1123         * left/right neighbor.  Now we are getting the parent of the
1124         * left/right neighbor.
1125         */
1126
1127        /* Form key to get parent of the left/right neighbor. */
1128        le_key2cpu_key(&s_lr_father_key,
1129                       internal_key(*pcom_father,
1130                                      (c_lr_par ==
1131                                       LEFT_PARENTS) ? (tb->lkey[h - 1] =
1132                                                        position -
1133                                                        1) : (tb->rkey[h -
1134                                                                           1] =
1135                                                              position)));
1136
1137        if (c_lr_par == LEFT_PARENTS)
1138                decrement_key(&s_lr_father_key);
1139
1140        if (search_by_key
1141            (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
1142             h + 1) == IO_ERROR)
1143                /* path is released */
1144                return IO_ERROR;
1145
1146        if (FILESYSTEM_CHANGED_TB(tb)) {
1147                pathrelse(&s_path_to_neighbor_father);
1148                brelse(*pcom_father);
1149                return REPEAT_SEARCH;
1150        }
1151
1152        *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
1153
1154        RFALSE(B_LEVEL(*pfather) != h + 1,
1155               "PAP-8190: (%b %z) level too small", *pfather, *pfather);
1156        RFALSE(s_path_to_neighbor_father.path_length <
1157               FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
1158
1159        s_path_to_neighbor_father.path_length--;
1160        pathrelse(&s_path_to_neighbor_father);
1161        return CARRY_ON;
1162}
1163
1164/*
1165 * Get parents of neighbors of node in the path(S[path_offset]) and
1166 * common parents of S[path_offset] and L[path_offset]/R[path_offset]:
1167 * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset],
1168 * CFR[path_offset].
1169 * Calculate numbers of left and right delimiting keys position:
1170 * lkey[path_offset], rkey[path_offset].
1171 * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked
1172 *              CARRY_ON - schedule didn't occur while the function worked
1173 */
1174static int get_parents(struct tree_balance *tb, int h)
1175{
1176        struct treepath *path = tb->tb_path;
1177        int position,
1178            ret,
1179            path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
1180        struct buffer_head *curf, *curcf;
1181
1182        /* Current node is the root of the tree or will be root of the tree */
1183        if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1184                /*
1185                 * The root can not have parents.
1186                 * Release nodes which previously were obtained as
1187                 * parents of the current node neighbors.
1188                 */
1189                brelse(tb->FL[h]);
1190                brelse(tb->CFL[h]);
1191                brelse(tb->FR[h]);
1192                brelse(tb->CFR[h]);
1193                tb->FL[h]  = NULL;
1194                tb->CFL[h] = NULL;
1195                tb->FR[h]  = NULL;
1196                tb->CFR[h] = NULL;
1197                return CARRY_ON;
1198        }
1199
1200        /* Get parent FL[path_offset] of L[path_offset]. */
1201        position = PATH_OFFSET_POSITION(path, path_offset - 1);
1202        if (position) {
1203                /* Current node is not the first child of its parent. */
1204                curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1205                curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1206                get_bh(curf);
1207                get_bh(curf);
1208                tb->lkey[h] = position - 1;
1209        } else {
1210                /*
1211                 * Calculate current parent of L[path_offset], which is the
1212                 * left neighbor of the current node.  Calculate current
1213                 * common parent of L[path_offset] and the current node.
1214                 * Note that CFL[path_offset] not equal FL[path_offset] and
1215                 * CFL[path_offset] not equal F[path_offset].
1216                 * Calculate lkey[path_offset].
1217                 */
1218                if ((ret = get_far_parent(tb, h + 1, &curf,
1219                                                  &curcf,
1220                                                  LEFT_PARENTS)) != CARRY_ON)
1221                        return ret;
1222        }
1223
1224        brelse(tb->FL[h]);
1225        tb->FL[h] = curf;       /* New initialization of FL[h]. */
1226        brelse(tb->CFL[h]);
1227        tb->CFL[h] = curcf;     /* New initialization of CFL[h]. */
1228
1229        RFALSE((curf && !B_IS_IN_TREE(curf)) ||
1230               (curcf && !B_IS_IN_TREE(curcf)),
1231               "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
1232
1233        /* Get parent FR[h] of R[h]. */
1234
1235        /* Current node is the last child of F[h]. FR[h] != F[h]. */
1236        if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
1237                /*
1238                 * Calculate current parent of R[h], which is the right
1239                 * neighbor of F[h].  Calculate current common parent of
1240                 * R[h] and current node. Note that CFR[h] not equal
1241                 * FR[path_offset] and CFR[h] not equal F[h].
1242                 */
1243                if ((ret =
1244                     get_far_parent(tb, h + 1, &curf, &curcf,
1245                                    RIGHT_PARENTS)) != CARRY_ON)
1246                        return ret;
1247        } else {
1248                /* Current node is not the last child of its parent F[h]. */
1249                curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1250                curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1251                get_bh(curf);
1252                get_bh(curf);
1253                tb->rkey[h] = position;
1254        }
1255
1256        brelse(tb->FR[h]);
1257        /* New initialization of FR[path_offset]. */
1258        tb->FR[h] = curf;
1259
1260        brelse(tb->CFR[h]);
1261        /* New initialization of CFR[path_offset]. */
1262        tb->CFR[h] = curcf;
1263
1264        RFALSE((curf && !B_IS_IN_TREE(curf)) ||
1265               (curcf && !B_IS_IN_TREE(curcf)),
1266               "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf);
1267
1268        return CARRY_ON;
1269}
1270
1271/*
1272 * it is possible to remove node as result of shiftings to
1273 * neighbors even when we insert or paste item.
1274 */
1275static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
1276                                      struct tree_balance *tb, int h)
1277{
1278        struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
1279        int levbytes = tb->insert_size[h];
1280        struct item_head *ih;
1281        struct reiserfs_key *r_key = NULL;
1282
1283        ih = item_head(Sh, 0);
1284        if (tb->CFR[h])
1285                r_key = internal_key(tb->CFR[h], tb->rkey[h]);
1286
1287        if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
1288            /* shifting may merge items which might save space */
1289            -
1290            ((!h
1291              && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0)
1292            -
1293            ((!h && r_key
1294              && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
1295            + ((h) ? KEY_SIZE : 0)) {
1296                /* node can not be removed */
1297                if (sfree >= levbytes) {
1298                        /* new item fits into node S[h] without any shifting */
1299                        if (!h)
1300                                tb->s0num =
1301                                    B_NR_ITEMS(Sh) +
1302                                    ((mode == M_INSERT) ? 1 : 0);
1303                        set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1304                        return NO_BALANCING_NEEDED;
1305                }
1306        }
1307        PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
1308        return !NO_BALANCING_NEEDED;
1309}
1310
1311/*
1312 * Check whether current node S[h] is balanced when increasing its size by
1313 * Inserting or Pasting.
1314 * Calculate parameters for balancing for current level h.
1315 * Parameters:
1316 *      tb      tree_balance structure;
1317 *      h       current level of the node;
1318 *      inum    item number in S[h];
1319 *      mode    i - insert, p - paste;
1320 * Returns:     1 - schedule occurred;
1321 *              0 - balancing for higher levels needed;
1322 *             -1 - no balancing for higher levels needed;
1323 *             -2 - no disk space.
1324 */
1325/* ip means Inserting or Pasting */
1326static int ip_check_balance(struct tree_balance *tb, int h)
1327{
1328        struct virtual_node *vn = tb->tb_vn;
1329        /*
1330         * Number of bytes that must be inserted into (value is negative
1331         * if bytes are deleted) buffer which contains node being balanced.
1332         * The mnemonic is that the attempted change in node space used
1333         * level is levbytes bytes.
1334         */
1335        int levbytes;
1336        int ret;
1337
1338        int lfree, sfree, rfree /* free space in L, S and R */ ;
1339
1340        /*
1341         * nver is short for number of vertixes, and lnver is the number if
1342         * we shift to the left, rnver is the number if we shift to the
1343         * right, and lrnver is the number if we shift in both directions.
1344         * The goal is to minimize first the number of vertixes, and second,
1345         * the number of vertixes whose contents are changed by shifting,
1346         * and third the number of uncached vertixes whose contents are
1347         * changed by shifting and must be read from disk.
1348         */
1349        int nver, lnver, rnver, lrnver;
1350
1351        /*
1352         * used at leaf level only, S0 = S[0] is the node being balanced,
1353         * sInum [ I = 0,1,2 ] is the number of items that will
1354         * remain in node SI after balancing.  S1 and S2 are new
1355         * nodes that might be created.
1356         */
1357
1358        /*
1359         * we perform 8 calls to get_num_ver().  For each call we
1360         * calculate five parameters.  where 4th parameter is s1bytes
1361         * and 5th - s2bytes
1362         *
1363         * s0num, s1num, s2num for 8 cases
1364         * 0,1 - do not shift and do not shift but bottle
1365         * 2   - shift only whole item to left
1366         * 3   - shift to left and bottle as much as possible
1367         * 4,5 - shift to right (whole items and as much as possible
1368         * 6,7 - shift to both directions (whole items and as much as possible)
1369         */
1370        short snum012[40] = { 0, };
1371
1372        /* Sh is the node whose balance is currently being checked */
1373        struct buffer_head *Sh;
1374
1375        Sh = PATH_H_PBUFFER(tb->tb_path, h);
1376        levbytes = tb->insert_size[h];
1377
1378        /* Calculate balance parameters for creating new root. */
1379        if (!Sh) {
1380                if (!h)
1381                        reiserfs_panic(tb->tb_sb, "vs-8210",
1382                                       "S[0] can not be 0");
1383                switch (ret = get_empty_nodes(tb, h)) {
1384                /* no balancing for higher levels needed */
1385                case CARRY_ON:
1386                        set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1387                        return NO_BALANCING_NEEDED;
1388
1389                case NO_DISK_SPACE:
1390                case REPEAT_SEARCH:
1391                        return ret;
1392                default:
1393                        reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect "
1394                                       "return value of get_empty_nodes");
1395                }
1396        }
1397
1398        /* get parents of S[h] neighbors. */
1399        ret = get_parents(tb, h);
1400        if (ret != CARRY_ON)
1401                return ret;
1402
1403        sfree = B_FREE_SPACE(Sh);
1404
1405        /* get free space of neighbors */
1406        rfree = get_rfree(tb, h);
1407        lfree = get_lfree(tb, h);
1408
1409        /* and new item fits into node S[h] without any shifting */
1410        if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
1411            NO_BALANCING_NEEDED)
1412                return NO_BALANCING_NEEDED;
1413
1414        create_virtual_node(tb, h);
1415
1416        /*
1417         * determine maximal number of items we can shift to the left
1418         * neighbor (in tb structure) and the maximal number of bytes
1419         * that can flow to the left neighbor from the left most liquid
1420         * item that cannot be shifted from S[0] entirely (returned value)
1421         */
1422        check_left(tb, h, lfree);
1423
1424        /*
1425         * determine maximal number of items we can shift to the right
1426         * neighbor (in tb structure) and the maximal number of bytes
1427         * that can flow to the right neighbor from the right most liquid
1428         * item that cannot be shifted from S[0] entirely (returned value)
1429         */
1430        check_right(tb, h, rfree);
1431
1432        /*
1433         * all contents of internal node S[h] can be moved into its
1434         * neighbors, S[h] will be removed after balancing
1435         */
1436        if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
1437                int to_r;
1438
1439                /*
1440                 * Since we are working on internal nodes, and our internal
1441                 * nodes have fixed size entries, then we can balance by the
1442                 * number of items rather than the space they consume.  In this
1443                 * routine we set the left node equal to the right node,
1444                 * allowing a difference of less than or equal to 1 child
1445                 * pointer.
1446                 */
1447                to_r =
1448                    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1449                     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1450                                                tb->rnum[h]);
1451                set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1452                               -1, -1);
1453                return CARRY_ON;
1454        }
1455
1456        /*
1457         * this checks balance condition, that any two neighboring nodes
1458         * can not fit in one node
1459         */
1460        RFALSE(h &&
1461               (tb->lnum[h] >= vn->vn_nr_item + 1 ||
1462                tb->rnum[h] >= vn->vn_nr_item + 1),
1463               "vs-8220: tree is not balanced on internal level");
1464        RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
1465                      (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
1466               "vs-8225: tree is not balanced on leaf level");
1467
1468        /*
1469         * all contents of S[0] can be moved into its neighbors
1470         * S[0] will be removed after balancing.
1471         */
1472        if (!h && is_leaf_removable(tb))
1473                return CARRY_ON;
1474
1475        /*
1476         * why do we perform this check here rather than earlier??
1477         * Answer: we can win 1 node in some cases above. Moreover we
1478         * checked it above, when we checked, that S[0] is not removable
1479         * in principle
1480         */
1481
1482         /* new item fits into node S[h] without any shifting */
1483        if (sfree >= levbytes) {
1484                if (!h)
1485                        tb->s0num = vn->vn_nr_item;
1486                set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1487                return NO_BALANCING_NEEDED;
1488        }
1489
1490        {
1491                int lpar, rpar, nset, lset, rset, lrset;
1492                /* regular overflowing of the node */
1493
1494                /*
1495                 * get_num_ver works in 2 modes (FLOW & NO_FLOW)
1496                 * lpar, rpar - number of items we can shift to left/right
1497                 *              neighbor (including splitting item)
1498                 * nset, lset, rset, lrset - shows, whether flowing items
1499                 *                           give better packing
1500                 */
1501#define FLOW 1
1502#define NO_FLOW 0               /* do not any splitting */
1503
1504                /* we choose one of the following */
1505#define NOTHING_SHIFT_NO_FLOW   0
1506#define NOTHING_SHIFT_FLOW      5
1507#define LEFT_SHIFT_NO_FLOW      10
1508#define LEFT_SHIFT_FLOW         15
1509#define RIGHT_SHIFT_NO_FLOW     20
1510#define RIGHT_SHIFT_FLOW        25
1511#define LR_SHIFT_NO_FLOW        30
1512#define LR_SHIFT_FLOW           35
1513
1514                lpar = tb->lnum[h];
1515                rpar = tb->rnum[h];
1516
1517                /*
1518                 * calculate number of blocks S[h] must be split into when
1519                 * nothing is shifted to the neighbors, as well as number of
1520                 * items in each part of the split node (s012 numbers),
1521                 * and number of bytes (s1bytes) of the shared drop which
1522                 * flow to S1 if any
1523                 */
1524                nset = NOTHING_SHIFT_NO_FLOW;
1525                nver = get_num_ver(vn->vn_mode, tb, h,
1526                                   0, -1, h ? vn->vn_nr_item : 0, -1,
1527                                   snum012, NO_FLOW);
1528
1529                if (!h) {
1530                        int nver1;
1531
1532                        /*
1533                         * note, that in this case we try to bottle
1534                         * between S[0] and S1 (S1 - the first new node)
1535                         */
1536                        nver1 = get_num_ver(vn->vn_mode, tb, h,
1537                                            0, -1, 0, -1,
1538                                            snum012 + NOTHING_SHIFT_FLOW, FLOW);
1539                        if (nver > nver1)
1540                                nset = NOTHING_SHIFT_FLOW, nver = nver1;
1541                }
1542
1543                /*
1544                 * calculate number of blocks S[h] must be split into when
1545                 * l_shift_num first items and l_shift_bytes of the right
1546                 * most liquid item to be shifted are shifted to the left
1547                 * neighbor, as well as number of items in each part of the
1548                 * splitted node (s012 numbers), and number of bytes
1549                 * (s1bytes) of the shared drop which flow to S1 if any
1550                 */
1551                lset = LEFT_SHIFT_NO_FLOW;
1552                lnver = get_num_ver(vn->vn_mode, tb, h,
1553                                    lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1554                                    -1, h ? vn->vn_nr_item : 0, -1,
1555                                    snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
1556                if (!h) {
1557                        int lnver1;
1558
1559                        lnver1 = get_num_ver(vn->vn_mode, tb, h,
1560                                             lpar -
1561                                             ((tb->lbytes != -1) ? 1 : 0),
1562                                             tb->lbytes, 0, -1,
1563                                             snum012 + LEFT_SHIFT_FLOW, FLOW);
1564                        if (lnver > lnver1)
1565                                lset = LEFT_SHIFT_FLOW, lnver = lnver1;
1566                }
1567
1568                /*
1569                 * calculate number of blocks S[h] must be split into when
1570                 * r_shift_num first items and r_shift_bytes of the left most
1571                 * liquid item to be shifted are shifted to the right neighbor,
1572                 * as well as number of items in each part of the splitted
1573                 * node (s012 numbers), and number of bytes (s1bytes) of the
1574                 * shared drop which flow to S1 if any
1575                 */
1576                rset = RIGHT_SHIFT_NO_FLOW;
1577                rnver = get_num_ver(vn->vn_mode, tb, h,
1578                                    0, -1,
1579                                    h ? (vn->vn_nr_item - rpar) : (rpar -
1580                                                                   ((tb->
1581                                                                     rbytes !=
1582                                                                     -1) ? 1 :
1583                                                                    0)), -1,
1584                                    snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
1585                if (!h) {
1586                        int rnver1;
1587
1588                        rnver1 = get_num_ver(vn->vn_mode, tb, h,
1589                                             0, -1,
1590                                             (rpar -
1591                                              ((tb->rbytes != -1) ? 1 : 0)),
1592                                             tb->rbytes,
1593                                             snum012 + RIGHT_SHIFT_FLOW, FLOW);
1594
1595                        if (rnver > rnver1)
1596                                rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
1597                }
1598
1599                /*
1600                 * calculate number of blocks S[h] must be split into when
1601                 * items are shifted in both directions, as well as number
1602                 * of items in each part of the splitted node (s012 numbers),
1603                 * and number of bytes (s1bytes) of the shared drop which
1604                 * flow to S1 if any
1605                 */
1606                lrset = LR_SHIFT_NO_FLOW;
1607                lrnver = get_num_ver(vn->vn_mode, tb, h,
1608                                     lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1609                                     -1,
1610                                     h ? (vn->vn_nr_item - rpar) : (rpar -
1611                                                                    ((tb->
1612                                                                      rbytes !=
1613                                                                      -1) ? 1 :
1614                                                                     0)), -1,
1615                                     snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
1616                if (!h) {
1617                        int lrnver1;
1618
1619                        lrnver1 = get_num_ver(vn->vn_mode, tb, h,
1620                                              lpar -
1621                                              ((tb->lbytes != -1) ? 1 : 0),
1622                                              tb->lbytes,
1623                                              (rpar -
1624                                               ((tb->rbytes != -1) ? 1 : 0)),
1625                                              tb->rbytes,
1626                                              snum012 + LR_SHIFT_FLOW, FLOW);
1627                        if (lrnver > lrnver1)
1628                                lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
1629                }
1630
1631                /*
1632                 * Our general shifting strategy is:
1633                 * 1) to minimized number of new nodes;
1634                 * 2) to minimized number of neighbors involved in shifting;
1635                 * 3) to minimized number of disk reads;
1636                 */
1637
1638                /* we can win TWO or ONE nodes by shifting in both directions */
1639                if (lrnver < lnver && lrnver < rnver) {
1640                        RFALSE(h &&
1641                               (tb->lnum[h] != 1 ||
1642                                tb->rnum[h] != 1 ||
1643                                lrnver != 1 || rnver != 2 || lnver != 2
1644                                || h != 1), "vs-8230: bad h");
1645                        if (lrset == LR_SHIFT_FLOW)
1646                                set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
1647                                               lrnver, snum012 + lrset,
1648                                               tb->lbytes, tb->rbytes);
1649                        else
1650                                set_parameters(tb, h,
1651                                               tb->lnum[h] -
1652                                               ((tb->lbytes == -1) ? 0 : 1),
1653                                               tb->rnum[h] -
1654                                               ((tb->rbytes == -1) ? 0 : 1),
1655                                               lrnver, snum012 + lrset, -1, -1);
1656
1657                        return CARRY_ON;
1658                }
1659
1660                /*
1661                 * if shifting doesn't lead to better packing
1662                 * then don't shift
1663                 */
1664                if (nver == lrnver) {
1665                        set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
1666                                       -1);
1667                        return CARRY_ON;
1668                }
1669
1670                /*
1671                 * now we know that for better packing shifting in only one
1672                 * direction either to the left or to the right is required
1673                 */
1674
1675                /*
1676                 * if shifting to the left is better than
1677                 * shifting to the right
1678                 */
1679                if (lnver < rnver) {
1680                        SET_PAR_SHIFT_LEFT;
1681                        return CARRY_ON;
1682                }
1683
1684                /*
1685                 * if shifting to the right is better than
1686                 * shifting to the left
1687                 */
1688                if (lnver > rnver) {
1689                        SET_PAR_SHIFT_RIGHT;
1690                        return CARRY_ON;
1691                }
1692
1693                /*
1694                 * now shifting in either direction gives the same number
1695                 * of nodes and we can make use of the cached neighbors
1696                 */
1697                if (is_left_neighbor_in_cache(tb, h)) {
1698                        SET_PAR_SHIFT_LEFT;
1699                        return CARRY_ON;
1700                }
1701
1702                /*
1703                 * shift to the right independently on whether the
1704                 * right neighbor in cache or not
1705                 */
1706                SET_PAR_SHIFT_RIGHT;
1707                return CARRY_ON;
1708        }
1709}
1710
1711/*
1712 * Check whether current node S[h] is balanced when Decreasing its size by
1713 * Deleting or Cutting for INTERNAL node of S+tree.
1714 * Calculate parameters for balancing for current level h.
1715 * Parameters:
1716 *      tb      tree_balance structure;
1717 *      h       current level of the node;
1718 *      inum    item number in S[h];
1719 *      mode    i - insert, p - paste;
1720 * Returns:     1 - schedule occurred;
1721 *              0 - balancing for higher levels needed;
1722 *             -1 - no balancing for higher levels needed;
1723 *             -2 - no disk space.
1724 *
1725 * Note: Items of internal nodes have fixed size, so the balance condition for
1726 * the internal part of S+tree is as for the B-trees.
1727 */
1728static int dc_check_balance_internal(struct tree_balance *tb, int h)
1729{
1730        struct virtual_node *vn = tb->tb_vn;
1731
1732        /*
1733         * Sh is the node whose balance is currently being checked,
1734         * and Fh is its father.
1735         */
1736        struct buffer_head *Sh, *Fh;
1737        int maxsize, ret;
1738        int lfree, rfree /* free space in L and R */ ;
1739
1740        Sh = PATH_H_PBUFFER(tb->tb_path, h);
1741        Fh = PATH_H_PPARENT(tb->tb_path, h);
1742
1743        maxsize = MAX_CHILD_SIZE(Sh);
1744
1745        /*
1746         * using tb->insert_size[h], which is negative in this case,
1747         * create_virtual_node calculates:
1748         * new_nr_item = number of items node would have if operation is
1749         * performed without balancing (new_nr_item);
1750         */
1751        create_virtual_node(tb, h);
1752
1753        if (!Fh) {              /* S[h] is the root. */
1754                /* no balancing for higher levels needed */
1755                if (vn->vn_nr_item > 0) {
1756                        set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1757                        return NO_BALANCING_NEEDED;
1758                }
1759                /*
1760                 * new_nr_item == 0.
1761                 * Current root will be deleted resulting in
1762                 * decrementing the tree height.
1763                 */
1764                set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
1765                return CARRY_ON;
1766        }
1767
1768        if ((ret = get_parents(tb, h)) != CARRY_ON)
1769                return ret;
1770
1771        /* get free space of neighbors */
1772        rfree = get_rfree(tb, h);
1773        lfree = get_lfree(tb, h);
1774
1775        /* determine maximal number of items we can fit into neighbors */
1776        check_left(tb, h, lfree);
1777        check_right(tb, h, rfree);
1778
1779        /*
1780         * Balance condition for the internal node is valid.
1781         * In this case we balance only if it leads to better packing.
1782         */
1783        if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) {
1784                /*
1785                 * Here we join S[h] with one of its neighbors,
1786                 * which is impossible with greater values of new_nr_item.
1787                 */
1788                if (vn->vn_nr_item == MIN_NR_KEY(Sh)) {
1789                        /* All contents of S[h] can be moved to L[h]. */
1790                        if (tb->lnum[h] >= vn->vn_nr_item + 1) {
1791                                int n;
1792                                int order_L;
1793
1794                                order_L =
1795                                    ((n =
1796                                      PATH_H_B_ITEM_ORDER(tb->tb_path,
1797                                                          h)) ==
1798                                     0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1799                                n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
1800                                    (DC_SIZE + KEY_SIZE);
1801                                set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
1802                                               -1);
1803                                return CARRY_ON;
1804                        }
1805
1806                        /* All contents of S[h] can be moved to R[h]. */
1807                        if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1808                                int n;
1809                                int order_R;
1810
1811                                order_R =
1812                                    ((n =
1813                                      PATH_H_B_ITEM_ORDER(tb->tb_path,
1814                                                          h)) ==
1815                                     B_NR_ITEMS(Fh)) ? 0 : n + 1;
1816                                n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
1817                                    (DC_SIZE + KEY_SIZE);
1818                                set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
1819                                               -1);
1820                                return CARRY_ON;
1821                        }
1822                }
1823
1824                /*
1825                 * All contents of S[h] can be moved to the neighbors
1826                 * (L[h] & R[h]).
1827                 */
1828                if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1829                        int to_r;
1830
1831                        to_r =
1832                            ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
1833                             tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
1834                            (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1835                        set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
1836                                       0, NULL, -1, -1);
1837                        return CARRY_ON;
1838                }
1839
1840                /* Balancing does not lead to better packing. */
1841                set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1842                return NO_BALANCING_NEEDED;
1843        }
1844
1845        /*
1846         * Current node contain insufficient number of items.
1847         * Balancing is required.
1848         */
1849        /* Check whether we can merge S[h] with left neighbor. */
1850        if (tb->lnum[h] >= vn->vn_nr_item + 1)
1851                if (is_left_neighbor_in_cache(tb, h)
1852                    || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
1853                        int n;
1854                        int order_L;
1855
1856                        order_L =
1857                            ((n =
1858                              PATH_H_B_ITEM_ORDER(tb->tb_path,
1859                                                  h)) ==
1860                             0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1861                        n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
1862                                                                      KEY_SIZE);
1863                        set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
1864                        return CARRY_ON;
1865                }
1866
1867        /* Check whether we can merge S[h] with right neighbor. */
1868        if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1869                int n;
1870                int order_R;
1871
1872                order_R =
1873                    ((n =
1874                      PATH_H_B_ITEM_ORDER(tb->tb_path,
1875                                          h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
1876                n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
1877                                                              KEY_SIZE);
1878                set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
1879                return CARRY_ON;
1880        }
1881
1882        /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1883        if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1884                int to_r;
1885
1886                to_r =
1887                    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1888                     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1889                                                tb->rnum[h]);
1890                set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1891                               -1, -1);
1892                return CARRY_ON;
1893        }
1894
1895        /* For internal nodes try to borrow item from a neighbor */
1896        RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
1897
1898        /* Borrow one or two items from caching neighbor */
1899        if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
1900                int from_l;
1901
1902                from_l =
1903                    (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
1904                     1) / 2 - (vn->vn_nr_item + 1);
1905                set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
1906                return CARRY_ON;
1907        }
1908
1909        set_parameters(tb, h, 0,
1910                       -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
1911                          1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
1912        return CARRY_ON;
1913}
1914
1915/*
1916 * Check whether current node S[h] is balanced when Decreasing its size by
1917 * Deleting or Truncating for LEAF node of S+tree.
1918 * Calculate parameters for balancing for current level h.
1919 * Parameters:
1920 *      tb      tree_balance structure;
1921 *      h       current level of the node;
1922 *      inum    item number in S[h];
1923 *      mode    i - insert, p - paste;
1924 * Returns:     1 - schedule occurred;
1925 *              0 - balancing for higher levels needed;
1926 *             -1 - no balancing for higher levels needed;
1927 *             -2 - no disk space.
1928 */
1929static int dc_check_balance_leaf(struct tree_balance *tb, int h)
1930{
1931        struct virtual_node *vn = tb->tb_vn;
1932
1933        /*
1934         * Number of bytes that must be deleted from
1935         * (value is negative if bytes are deleted) buffer which
1936         * contains node being balanced.  The mnemonic is that the
1937         * attempted change in node space used level is levbytes bytes.
1938         */
1939        int levbytes;
1940
1941        /* the maximal item size */
1942        int maxsize, ret;
1943
1944        /*
1945         * S0 is the node whose balance is currently being checked,
1946         * and F0 is its father.
1947         */
1948        struct buffer_head *S0, *F0;
1949        int lfree, rfree /* free space in L and R */ ;
1950
1951        S0 = PATH_H_PBUFFER(tb->tb_path, 0);
1952        F0 = PATH_H_PPARENT(tb->tb_path, 0);
1953
1954        levbytes = tb->insert_size[h];
1955
1956        maxsize = MAX_CHILD_SIZE(S0);   /* maximal possible size of an item */
1957
1958        if (!F0) {              /* S[0] is the root now. */
1959
1960                RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
1961                       "vs-8240: attempt to create empty buffer tree");
1962
1963                set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1964                return NO_BALANCING_NEEDED;
1965        }
1966
1967        if ((ret = get_parents(tb, h)) != CARRY_ON)
1968                return ret;
1969
1970        /* get free space of neighbors */
1971        rfree = get_rfree(tb, h);
1972        lfree = get_lfree(tb, h);
1973
1974        create_virtual_node(tb, h);
1975
1976        /* if 3 leaves can be merge to one, set parameters and return */
1977        if (are_leaves_removable(tb, lfree, rfree))
1978                return CARRY_ON;
1979
1980        /*
1981         * determine maximal number of items we can shift to the left/right
1982         * neighbor and the maximal number of bytes that can flow to the
1983         * left/right neighbor from the left/right most liquid item that
1984         * cannot be shifted from S[0] entirely
1985         */
1986        check_left(tb, h, lfree);
1987        check_right(tb, h, rfree);
1988
1989        /* check whether we can merge S with left neighbor. */
1990        if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
1991                if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) ||      /* S can not be merged with R */
1992                    !tb->FR[h]) {
1993
1994                        RFALSE(!tb->FL[h],
1995                               "vs-8245: dc_check_balance_leaf: FL[h] must exist");
1996
1997                        /* set parameter to merge S[0] with its left neighbor */
1998                        set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
1999                        return CARRY_ON;
2000                }
2001
2002        /* check whether we can merge S[0] with right neighbor. */
2003        if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
2004                set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
2005                return CARRY_ON;
2006        }
2007
2008        /*
2009         * All contents of S[0] can be moved to the neighbors (L[0] & R[0]).
2010         * Set parameters and return
2011         */
2012        if (is_leaf_removable(tb))
2013                return CARRY_ON;
2014
2015        /* Balancing is not required. */
2016        tb->s0num = vn->vn_nr_item;
2017        set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
2018        return NO_BALANCING_NEEDED;
2019}
2020
2021/*
2022 * Check whether current node S[h] is balanced when Decreasing its size by
2023 * Deleting or Cutting.
2024 * Calculate parameters for balancing for current level h.
2025 * Parameters:
2026 *      tb      tree_balance structure;
2027 *      h       current level of the node;
2028 *      inum    item number in S[h];
2029 *      mode    d - delete, c - cut.
2030 * Returns:     1 - schedule occurred;
2031 *              0 - balancing for higher levels needed;
2032 *             -1 - no balancing for higher levels needed;
2033 *             -2 - no disk space.
2034 */
2035static int dc_check_balance(struct tree_balance *tb, int h)
2036{
2037        RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
2038               "vs-8250: S is not initialized");
2039
2040        if (h)
2041                return dc_check_balance_internal(tb, h);
2042        else
2043                return dc_check_balance_leaf(tb, h);
2044}
2045
2046/*
2047 * Check whether current node S[h] is balanced.
2048 * Calculate parameters for balancing for current level h.
2049 * Parameters:
2050 *
2051 *      tb      tree_balance structure:
2052 *
2053 *              tb is a large structure that must be read about in the header
2054 *              file at the same time as this procedure if the reader is
2055 *              to successfully understand this procedure
2056 *
2057 *      h       current level of the node;
2058 *      inum    item number in S[h];
2059 *      mode    i - insert, p - paste, d - delete, c - cut.
2060 * Returns:     1 - schedule occurred;
2061 *              0 - balancing for higher levels needed;
2062 *             -1 - no balancing for higher levels needed;
2063 *             -2 - no disk space.
2064 */
2065static int check_balance(int mode,
2066                         struct tree_balance *tb,
2067                         int h,
2068                         int inum,
2069                         int pos_in_item,
2070                         struct item_head *ins_ih, const void *data)
2071{
2072        struct virtual_node *vn;
2073
2074        vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
2075        vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
2076        vn->vn_mode = mode;
2077        vn->vn_affected_item_num = inum;
2078        vn->vn_pos_in_item = pos_in_item;
2079        vn->vn_ins_ih = ins_ih;
2080        vn->vn_data = data;
2081
2082        RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
2083               "vs-8255: ins_ih can not be 0 in insert mode");
2084
2085        /* Calculate balance parameters when size of node is increasing. */
2086        if (tb->insert_size[h] > 0)
2087                return ip_check_balance(tb, h);
2088
2089        /* Calculate balance parameters when  size of node is decreasing. */
2090        return dc_check_balance(tb, h);
2091}
2092
2093/* Check whether parent at the path is the really parent of the current node.*/
2094static int get_direct_parent(struct tree_balance *tb, int h)
2095{
2096        struct buffer_head *bh;
2097        struct treepath *path = tb->tb_path;
2098        int position,
2099            path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
2100
2101        /* We are in the root or in the new root. */
2102        if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
2103
2104                RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
2105                       "PAP-8260: invalid offset in the path");
2106
2107                if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
2108                    b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
2109                        /* Root is not changed. */
2110                        PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
2111                        PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
2112                        return CARRY_ON;
2113                }
2114                /* Root is changed and we must recalculate the path. */
2115                return REPEAT_SEARCH;
2116        }
2117
2118        /* Parent in the path is not in the tree. */
2119        if (!B_IS_IN_TREE
2120            (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
2121                return REPEAT_SEARCH;
2122
2123        if ((position =
2124             PATH_OFFSET_POSITION(path,
2125                                  path_offset - 1)) > B_NR_ITEMS(bh))
2126                return REPEAT_SEARCH;
2127
2128        /* Parent in the path is not parent of the current node in the tree. */
2129        if (B_N_CHILD_NUM(bh, position) !=
2130            PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
2131                return REPEAT_SEARCH;
2132
2133        if (buffer_locked(bh)) {
2134                int depth = reiserfs_write_unlock_nested(tb->tb_sb);
2135                __wait_on_buffer(bh);
2136                reiserfs_write_lock_nested(tb->tb_sb, depth);
2137                if (FILESYSTEM_CHANGED_TB(tb))
2138                        return REPEAT_SEARCH;
2139        }
2140
2141        /*
2142         * Parent in the path is unlocked and really parent
2143         * of the current node.
2144         */
2145        return CARRY_ON;
2146}
2147
2148/*
2149 * Using lnum[h] and rnum[h] we should determine what neighbors
2150 * of S[h] we
2151 * need in order to balance S[h], and get them if necessary.
2152 * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
2153 *              CARRY_ON - schedule didn't occur while the function worked;
2154 */
2155static int get_neighbors(struct tree_balance *tb, int h)
2156{
2157        int child_position,
2158            path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
2159        unsigned long son_number;
2160        struct super_block *sb = tb->tb_sb;
2161        struct buffer_head *bh;
2162        int depth;
2163
2164        PROC_INFO_INC(sb, get_neighbors[h]);
2165
2166        if (tb->lnum[h]) {
2167                /* We need left neighbor to balance S[h]. */
2168                PROC_INFO_INC(sb, need_l_neighbor[h]);
2169                bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
2170
2171                RFALSE(bh == tb->FL[h] &&
2172                       !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
2173                       "PAP-8270: invalid position in the parent");
2174
2175                child_position =
2176                    (bh ==
2177                     tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
2178                                                                       FL[h]);
2179                son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
2180                depth = reiserfs_write_unlock_nested(tb->tb_sb);
2181                bh = sb_bread(sb, son_number);
2182                reiserfs_write_lock_nested(tb->tb_sb, depth);
2183                if (!bh)
2184                        return IO_ERROR;
2185                if (FILESYSTEM_CHANGED_TB(tb)) {
2186                        brelse(bh);
2187                        PROC_INFO_INC(sb, get_neighbors_restart[h]);
2188                        return REPEAT_SEARCH;
2189                }
2190
2191                RFALSE(!B_IS_IN_TREE(tb->FL[h]) ||
2192                       child_position > B_NR_ITEMS(tb->FL[h]) ||
2193                       B_N_CHILD_NUM(tb->FL[h], child_position) !=
2194                       bh->b_blocknr, "PAP-8275: invalid parent");
2195                RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child");
2196                RFALSE(!h &&
2197                       B_FREE_SPACE(bh) !=
2198                       MAX_CHILD_SIZE(bh) -
2199                       dc_size(B_N_CHILD(tb->FL[0], child_position)),
2200                       "PAP-8290: invalid child size of left neighbor");
2201
2202                brelse(tb->L[h]);
2203                tb->L[h] = bh;
2204        }
2205
2206        /* We need right neighbor to balance S[path_offset]. */
2207        if (tb->rnum[h]) {
2208                PROC_INFO_INC(sb, need_r_neighbor[h]);
2209                bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
2210
2211                RFALSE(bh == tb->FR[h] &&
2212                       PATH_OFFSET_POSITION(tb->tb_path,
2213                                            path_offset) >=
2214                       B_NR_ITEMS(bh),
2215                       "PAP-8295: invalid position in the parent");
2216
2217                child_position =
2218                    (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
2219                son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
2220                depth = reiserfs_write_unlock_nested(tb->tb_sb);
2221                bh = sb_bread(sb, son_number);
2222                reiserfs_write_lock_nested(tb->tb_sb, depth);
2223                if (!bh)
2224                        return IO_ERROR;
2225                if (FILESYSTEM_CHANGED_TB(tb)) {
2226                        brelse(bh);
2227                        PROC_INFO_INC(sb, get_neighbors_restart[h]);
2228                        return REPEAT_SEARCH;
2229                }
2230                brelse(tb->R[h]);
2231                tb->R[h] = bh;
2232
2233                RFALSE(!h
2234                       && B_FREE_SPACE(bh) !=
2235                       MAX_CHILD_SIZE(bh) -
2236                       dc_size(B_N_CHILD(tb->FR[0], child_position)),
2237                       "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
2238                       B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
2239                       dc_size(B_N_CHILD(tb->FR[0], child_position)));
2240
2241        }
2242        return CARRY_ON;
2243}
2244
2245static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
2246{
2247        int max_num_of_items;
2248        int max_num_of_entries;
2249        unsigned long blocksize = sb->s_blocksize;
2250
2251#define MIN_NAME_LEN 1
2252
2253        max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
2254        max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
2255            (DEH_SIZE + MIN_NAME_LEN);
2256
2257        return sizeof(struct virtual_node) +
2258            max(max_num_of_items * sizeof(struct virtual_item),
2259                sizeof(struct virtual_item) + sizeof(struct direntry_uarea) +
2260                (max_num_of_entries - 1) * sizeof(__u16));
2261}
2262
2263/*
2264 * maybe we should fail balancing we are going to perform when kmalloc
2265 * fails several times. But now it will loop until kmalloc gets
2266 * required memory
2267 */
2268static int get_mem_for_virtual_node(struct tree_balance *tb)
2269{
2270        int check_fs = 0;
2271        int size;
2272        char *buf;
2273
2274        size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
2275
2276        /* we have to allocate more memory for virtual node */
2277        if (size > tb->vn_buf_size) {
2278                if (tb->vn_buf) {
2279                        /* free memory allocated before */
2280                        kfree(tb->vn_buf);
2281                        /* this is not needed if kfree is atomic */
2282                        check_fs = 1;
2283                }
2284
2285                /* virtual node requires now more memory */
2286                tb->vn_buf_size = size;
2287
2288                /* get memory for virtual item */
2289                buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
2290                if (!buf) {
2291                        /*
2292                         * getting memory with GFP_KERNEL priority may involve
2293                         * balancing now (due to indirect_to_direct conversion
2294                         * on dcache shrinking). So, release path and collected
2295                         * resources here
2296                         */
2297                        free_buffers_in_tb(tb);
2298                        buf = kmalloc(size, GFP_NOFS);
2299                        if (!buf) {
2300                                tb->vn_buf_size = 0;
2301                        }
2302                        tb->vn_buf = buf;
2303                        schedule();
2304                        return REPEAT_SEARCH;
2305                }
2306
2307                tb->vn_buf = buf;
2308        }
2309
2310        if (check_fs && FILESYSTEM_CHANGED_TB(tb))
2311                return REPEAT_SEARCH;
2312
2313        return CARRY_ON;
2314}
2315
2316#ifdef CONFIG_REISERFS_CHECK
2317static void tb_buffer_sanity_check(struct super_block *sb,
2318                                   struct buffer_head *bh,
2319                                   const char *descr, int level)
2320{
2321        if (bh) {
2322                if (atomic_read(&(bh->b_count)) <= 0)
2323
2324                        reiserfs_panic(sb, "jmacd-1", "negative or zero "
2325                                       "reference counter for buffer %s[%d] "
2326                                       "(%b)", descr, level, bh);
2327
2328                if (!buffer_uptodate(bh))
2329                        reiserfs_panic(sb, "jmacd-2", "buffer is not up "
2330                                       "to date %s[%d] (%b)",
2331                                       descr, level, bh);
2332
2333                if (!B_IS_IN_TREE(bh))
2334                        reiserfs_panic(sb, "jmacd-3", "buffer is not "
2335                                       "in tree %s[%d] (%b)",
2336                                       descr, level, bh);
2337
2338                if (bh->b_bdev != sb->s_bdev)
2339                        reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
2340                                       "device %s[%d] (%b)",
2341                                       descr, level, bh);
2342
2343                if (bh->b_size != sb->s_blocksize)
2344                        reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
2345                                       "blocksize %s[%d] (%b)",
2346                                       descr, level, bh);
2347
2348                if (bh->b_blocknr > SB_BLOCK_COUNT(sb))
2349                        reiserfs_panic(sb, "jmacd-6", "buffer block "
2350                                       "number too high %s[%d] (%b)",
2351                                       descr, level, bh);
2352        }
2353}
2354#else
2355static void tb_buffer_sanity_check(struct super_block *sb,
2356                                   struct buffer_head *bh,
2357                                   const char *descr, int level)
2358{;
2359}
2360#endif
2361
2362static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
2363{
2364        return reiserfs_prepare_for_journal(s, bh, 0);
2365}
2366
2367static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
2368{
2369        struct buffer_head *locked;
2370#ifdef CONFIG_REISERFS_CHECK
2371        int repeat_counter = 0;
2372#endif
2373        int i;
2374
2375        do {
2376
2377                locked = NULL;
2378
2379                for (i = tb->tb_path->path_length;
2380                     !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
2381                        if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
2382                                /*
2383                                 * if I understand correctly, we can only
2384                                 * be sure the last buffer in the path is
2385                                 * in the tree --clm
2386                                 */
2387#ifdef CONFIG_REISERFS_CHECK
2388                                if (PATH_PLAST_BUFFER(tb->tb_path) ==
2389                                    PATH_OFFSET_PBUFFER(tb->tb_path, i))
2390                                        tb_buffer_sanity_check(tb->tb_sb,
2391                                                               PATH_OFFSET_PBUFFER
2392                                                               (tb->tb_path,
2393                                                                i), "S",
2394                                                               tb->tb_path->
2395                                                               path_length - i);
2396#endif
2397                                if (!clear_all_dirty_bits(tb->tb_sb,
2398                                                          PATH_OFFSET_PBUFFER
2399                                                          (tb->tb_path,
2400                                                           i))) {
2401                                        locked =
2402                                            PATH_OFFSET_PBUFFER(tb->tb_path,
2403                                                                i);
2404                                }
2405                        }
2406                }
2407
2408                for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
2409                     i++) {
2410
2411                        if (tb->lnum[i]) {
2412
2413                                if (tb->L[i]) {
2414                                        tb_buffer_sanity_check(tb->tb_sb,
2415                                                               tb->L[i],
2416                                                               "L", i);
2417                                        if (!clear_all_dirty_bits
2418                                            (tb->tb_sb, tb->L[i]))
2419                                                locked = tb->L[i];
2420                                }
2421
2422                                if (!locked && tb->FL[i]) {
2423                                        tb_buffer_sanity_check(tb->tb_sb,
2424                                                               tb->FL[i],
2425                                                               "FL", i);
2426                                        if (!clear_all_dirty_bits
2427                                            (tb->tb_sb, tb->FL[i]))
2428                                                locked = tb->FL[i];
2429                                }
2430
2431                                if (!locked && tb->CFL[i]) {
2432                                        tb_buffer_sanity_check(tb->tb_sb,
2433                                                               tb->CFL[i],
2434                                                               "CFL", i);
2435                                        if (!clear_all_dirty_bits
2436                                            (tb->tb_sb, tb->CFL[i]))
2437                                                locked = tb->CFL[i];
2438                                }
2439
2440                        }
2441
2442                        if (!locked && (tb->rnum[i])) {
2443
2444                                if (tb->R[i]) {
2445                                        tb_buffer_sanity_check(tb->tb_sb,
2446                                                               tb->R[i],
2447                                                               "R", i);
2448                                        if (!clear_all_dirty_bits
2449                                            (tb->tb_sb, tb->R[i]))
2450                                                locked = tb->R[i];
2451                                }
2452
2453                                if (!locked && tb->FR[i]) {
2454                                        tb_buffer_sanity_check(tb->tb_sb,
2455                                                               tb->FR[i],
2456                                                               "FR", i);
2457                                        if (!clear_all_dirty_bits
2458                                            (tb->tb_sb, tb->FR[i]))
2459                                                locked = tb->FR[i];
2460                                }
2461
2462                                if (!locked && tb->CFR[i]) {
2463                                        tb_buffer_sanity_check(tb->tb_sb,
2464                                                               tb->CFR[i],
2465                                                               "CFR", i);
2466                                        if (!clear_all_dirty_bits
2467                                            (tb->tb_sb, tb->CFR[i]))
2468                                                locked = tb->CFR[i];
2469                                }
2470                        }
2471                }
2472
2473                /*
2474                 * as far as I can tell, this is not required.  The FEB list
2475                 * seems to be full of newly allocated nodes, which will
2476                 * never be locked, dirty, or anything else.
2477                 * To be safe, I'm putting in the checks and waits in.
2478                 * For the moment, they are needed to keep the code in
2479                 * journal.c from complaining about the buffer.
2480                 * That code is inside CONFIG_REISERFS_CHECK as well.  --clm
2481                 */
2482                for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
2483                        if (tb->FEB[i]) {
2484                                if (!clear_all_dirty_bits
2485                                    (tb->tb_sb, tb->FEB[i]))
2486                                        locked = tb->FEB[i];
2487                        }
2488                }
2489
2490                if (locked) {
2491                        int depth;
2492#ifdef CONFIG_REISERFS_CHECK
2493                        repeat_counter++;
2494                        if ((repeat_counter % 10000) == 0) {
2495                                reiserfs_warning(tb->tb_sb, "reiserfs-8200",
2496                                                 "too many iterations waiting "
2497                                                 "for buffer to unlock "
2498                                                 "(%b)", locked);
2499
2500                                /* Don't loop forever.  Try to recover from possible error. */
2501
2502                                return (FILESYSTEM_CHANGED_TB(tb)) ?
2503                                    REPEAT_SEARCH : CARRY_ON;
2504                        }
2505#endif
2506                        depth = reiserfs_write_unlock_nested(tb->tb_sb);
2507                        __wait_on_buffer(locked);
2508                        reiserfs_write_lock_nested(tb->tb_sb, depth);
2509                        if (FILESYSTEM_CHANGED_TB(tb))
2510                                return REPEAT_SEARCH;
2511                }
2512
2513        } while (locked);
2514
2515        return CARRY_ON;
2516}
2517
2518/*
2519 * Prepare for balancing, that is
2520 *      get all necessary parents, and neighbors;
2521 *      analyze what and where should be moved;
2522 *      get sufficient number of new nodes;
2523 * Balancing will start only after all resources will be collected at a time.
2524 *
2525 * When ported to SMP kernels, only at the last moment after all needed nodes
2526 * are collected in cache, will the resources be locked using the usual
2527 * textbook ordered lock acquisition algorithms.  Note that ensuring that
2528 * this code neither write locks what it does not need to write lock nor locks
2529 * out of order will be a pain in the butt that could have been avoided.
2530 * Grumble grumble. -Hans
2531 *
2532 * fix is meant in the sense of render unchanging
2533 *
2534 * Latency might be improved by first gathering a list of what buffers
2535 * are needed and then getting as many of them in parallel as possible? -Hans
2536 *
2537 * Parameters:
2538 *      op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
2539 *      tb      tree_balance structure;
2540 *      inum    item number in S[h];
2541 *      pos_in_item - comment this if you can
2542 *      ins_ih  item head of item being inserted
2543 *      data    inserted item or data to be pasted
2544 * Returns:     1 - schedule occurred while the function worked;
2545 *              0 - schedule didn't occur while the function worked;
2546 *             -1 - if no_disk_space
2547 */
2548
2549int fix_nodes(int op_mode, struct tree_balance *tb,
2550              struct item_head *ins_ih, const void *data)
2551{
2552        int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
2553        int pos_in_item;
2554
2555        /*
2556         * we set wait_tb_buffers_run when we have to restore any dirty
2557         * bits cleared during wait_tb_buffers_run
2558         */
2559        int wait_tb_buffers_run = 0;
2560        struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2561
2562        ++REISERFS_SB(tb->tb_sb)->s_fix_nodes;
2563
2564        pos_in_item = tb->tb_path->pos_in_item;
2565
2566        tb->fs_gen = get_generation(tb->tb_sb);
2567
2568        /*
2569         * we prepare and log the super here so it will already be in the
2570         * transaction when do_balance needs to change it.
2571         * This way do_balance won't have to schedule when trying to prepare
2572         * the super for logging
2573         */
2574        reiserfs_prepare_for_journal(tb->tb_sb,
2575                                     SB_BUFFER_WITH_SB(tb->tb_sb), 1);
2576        journal_mark_dirty(tb->transaction_handle,
2577                           SB_BUFFER_WITH_SB(tb->tb_sb));
2578        if (FILESYSTEM_CHANGED_TB(tb))
2579                return REPEAT_SEARCH;
2580
2581        /* if it possible in indirect_to_direct conversion */
2582        if (buffer_locked(tbS0)) {
2583                int depth = reiserfs_write_unlock_nested(tb->tb_sb);
2584                __wait_on_buffer(tbS0);
2585                reiserfs_write_lock_nested(tb->tb_sb, depth);
2586                if (FILESYSTEM_CHANGED_TB(tb))
2587                        return REPEAT_SEARCH;
2588        }
2589#ifdef CONFIG_REISERFS_CHECK
2590        if (REISERFS_SB(tb->tb_sb)->cur_tb) {
2591                print_cur_tb("fix_nodes");
2592                reiserfs_panic(tb->tb_sb, "PAP-8305",
2593                               "there is pending do_balance");
2594        }
2595
2596        if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0))
2597                reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is "
2598                               "not uptodate at the beginning of fix_nodes "
2599                               "or not in tree (mode %c)",
2600                               tbS0, tbS0, op_mode);
2601
2602        /* Check parameters. */
2603        switch (op_mode) {
2604        case M_INSERT:
2605                if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0))
2606                        reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect "
2607                                       "item number %d (in S0 - %d) in case "
2608                                       "of insert", item_num,
2609                                       B_NR_ITEMS(tbS0));
2610                break;
2611        case M_PASTE:
2612        case M_DELETE:
2613        case M_CUT:
2614                if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) {
2615                        print_block(tbS0, 0, -1, -1);
2616                        reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect "
2617                                       "item number(%d); mode = %c "
2618                                       "insert_size = %d",
2619                                       item_num, op_mode,
2620                                       tb->insert_size[0]);
2621                }
2622                break;
2623        default:
2624                reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode "
2625                               "of operation");
2626        }
2627#endif
2628
2629        if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
2630                /* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */
2631                return REPEAT_SEARCH;
2632
2633        /* Starting from the leaf level; for all levels h of the tree. */
2634        for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) {
2635                ret = get_direct_parent(tb, h);
2636                if (ret != CARRY_ON)
2637                        goto repeat;
2638
2639                ret = check_balance(op_mode, tb, h, item_num,
2640                                    pos_in_item, ins_ih, data);
2641                if (ret != CARRY_ON) {
2642                        if (ret == NO_BALANCING_NEEDED) {
2643                                /* No balancing for higher levels needed. */
2644                                ret = get_neighbors(tb, h);
2645                                if (ret != CARRY_ON)
2646                                        goto repeat;
2647                                if (h != MAX_HEIGHT - 1)
2648                                        tb->insert_size[h + 1] = 0;
2649                                /*
2650                                 * ok, analysis and resource gathering
2651                                 * are complete
2652                                 */
2653                                break;
2654                        }
2655                        goto repeat;
2656                }
2657
2658                ret = get_neighbors(tb, h);
2659                if (ret != CARRY_ON)
2660                        goto repeat;
2661
2662                /*
2663                 * No disk space, or schedule occurred and analysis may be
2664                 * invalid and needs to be redone.
2665                 */
2666                ret = get_empty_nodes(tb, h);
2667                if (ret != CARRY_ON)
2668                        goto repeat;
2669
2670                /*
2671                 * We have a positive insert size but no nodes exist on this
2672                 * level, this means that we are creating a new root.
2673                 */
2674                if (!PATH_H_PBUFFER(tb->tb_path, h)) {
2675
2676                        RFALSE(tb->blknum[h] != 1,
2677                               "PAP-8350: creating new empty root");
2678
2679                        if (h < MAX_HEIGHT - 1)
2680                                tb->insert_size[h + 1] = 0;
2681                } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
2682                        /*
2683                         * The tree needs to be grown, so this node S[h]
2684                         * which is the root node is split into two nodes,
2685                         * and a new node (S[h+1]) will be created to
2686                         * become the root node.
2687                         */
2688                        if (tb->blknum[h] > 1) {
2689
2690                                RFALSE(h == MAX_HEIGHT - 1,
2691                                       "PAP-8355: attempt to create too high of a tree");
2692
2693                                tb->insert_size[h + 1] =
2694                                    (DC_SIZE +
2695                                     KEY_SIZE) * (tb->blknum[h] - 1) +
2696                                    DC_SIZE;
2697                        } else if (h < MAX_HEIGHT - 1)
2698                                tb->insert_size[h + 1] = 0;
2699                } else
2700                        tb->insert_size[h + 1] =
2701                            (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1);
2702        }
2703
2704        ret = wait_tb_buffers_until_unlocked(tb);
2705        if (ret == CARRY_ON) {
2706                if (FILESYSTEM_CHANGED_TB(tb)) {
2707                        wait_tb_buffers_run = 1;
2708                        ret = REPEAT_SEARCH;
2709                        goto repeat;
2710                } else {
2711                        return CARRY_ON;
2712                }
2713        } else {
2714                wait_tb_buffers_run = 1;
2715                goto repeat;
2716        }
2717
2718repeat:
2719        /*
2720         * fix_nodes was unable to perform its calculation due to
2721         * filesystem got changed under us, lack of free disk space or i/o
2722         * failure. If the first is the case - the search will be
2723         * repeated. For now - free all resources acquired so far except
2724         * for the new allocated nodes
2725         */
2726        {
2727                int i;
2728
2729                /* Release path buffers. */
2730                if (wait_tb_buffers_run) {
2731                        pathrelse_and_restore(tb->tb_sb, tb->tb_path);
2732                } else {
2733                        pathrelse(tb->tb_path);
2734                }
2735                /* brelse all resources collected for balancing */
2736                for (i = 0; i < MAX_HEIGHT; i++) {
2737                        if (wait_tb_buffers_run) {
2738                                reiserfs_restore_prepared_buffer(tb->tb_sb,
2739                                                                 tb->L[i]);
2740                                reiserfs_restore_prepared_buffer(tb->tb_sb,
2741                                                                 tb->R[i]);
2742                                reiserfs_restore_prepared_buffer(tb->tb_sb,
2743                                                                 tb->FL[i]);
2744                                reiserfs_restore_prepared_buffer(tb->tb_sb,
2745                                                                 tb->FR[i]);
2746                                reiserfs_restore_prepared_buffer(tb->tb_sb,
2747                                                                 tb->
2748                                                                 CFL[i]);
2749                                reiserfs_restore_prepared_buffer(tb->tb_sb,
2750                                                                 tb->
2751                                                                 CFR[i]);
2752                        }
2753
2754                        brelse(tb->L[i]);
2755                        brelse(tb->R[i]);
2756                        brelse(tb->FL[i]);
2757                        brelse(tb->FR[i]);
2758                        brelse(tb->CFL[i]);
2759                        brelse(tb->CFR[i]);
2760
2761                        tb->L[i] = NULL;
2762                        tb->R[i] = NULL;
2763                        tb->FL[i] = NULL;
2764                        tb->FR[i] = NULL;
2765                        tb->CFL[i] = NULL;
2766                        tb->CFR[i] = NULL;
2767                }
2768
2769                if (wait_tb_buffers_run) {
2770                        for (i = 0; i < MAX_FEB_SIZE; i++) {
2771                                if (tb->FEB[i])
2772                                        reiserfs_restore_prepared_buffer
2773                                            (tb->tb_sb, tb->FEB[i]);
2774                        }
2775                }
2776                return ret;
2777        }
2778
2779}
2780
2781void unfix_nodes(struct tree_balance *tb)
2782{
2783        int i;
2784
2785        /* Release path buffers. */
2786        pathrelse_and_restore(tb->tb_sb, tb->tb_path);
2787
2788        /* brelse all resources collected for balancing */
2789        for (i = 0; i < MAX_HEIGHT; i++) {
2790                reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
2791                reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
2792                reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
2793                reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
2794                reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
2795                reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
2796
2797                brelse(tb->L[i]);
2798                brelse(tb->R[i]);
2799                brelse(tb->FL[i]);
2800                brelse(tb->FR[i]);
2801                brelse(tb->CFL[i]);
2802                brelse(tb->CFR[i]);
2803        }
2804
2805        /* deal with list of allocated (used and unused) nodes */
2806        for (i = 0; i < MAX_FEB_SIZE; i++) {
2807                if (tb->FEB[i]) {
2808                        b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
2809                        /*
2810                         * de-allocated block which was not used by
2811                         * balancing and bforget about buffer for it
2812                         */
2813                        brelse(tb->FEB[i]);
2814                        reiserfs_free_block(tb->transaction_handle, NULL,
2815                                            blocknr, 0);
2816                }
2817                if (tb->used[i]) {
2818                        /* release used as new nodes including a new root */
2819                        brelse(tb->used[i]);
2820                }
2821        }
2822
2823        kfree(tb->vn_buf);
2824
2825}
2826