linux/fs/xfs/xfs_extent_busy.c
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"

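/*
 * Insert a busy extent for the given AG range into the per-AG rbtree and
 * onto the transaction's busy list, so the blocks cannot be reallocated
 * and reused before the freeing transaction reaches the log.
 */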
void
xfs_extent_busy_insert(
        struct xfs_trans        *tp,
        xfs_agnumber_t          agno,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        unsigned int            flags)
{
        struct xfs_extent_busy  *new;
        struct xfs_extent_busy  *busyp;
        struct xfs_perag        *pag;
        struct rb_node          **rbp;
        struct rb_node          *parent = NULL;

        new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL);
        if (!new) {
                /*
                 * No memory!  Since it is now not possible to track the free
                 * block, make this a synchronous transaction to ensure that
                 * the block is not reused before this transaction commits.
                 */
                trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
                xfs_trans_set_sync(tp);
                return;
        }

        new->agno = agno;
        new->bno = bno;
        new->length = len;
        INIT_LIST_HEAD(&new->list);
        new->flags = flags;

        /* trace before insert to be able to see failed inserts */
        trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);

        pag = xfs_perag_get(tp->t_mountp, new->agno);
        spin_lock(&pag->pagb_lock);
        rbp = &pag->pagb_tree.rb_node;
        while (*rbp) {
                parent = *rbp;
                busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

                if (new->bno < busyp->bno) {
                        rbp = &(*rbp)->rb_left;
                        ASSERT(new->bno + new->length <= busyp->bno);
                } else if (new->bno > busyp->bno) {
                        rbp = &(*rbp)->rb_right;
                        ASSERT(bno >= busyp->bno + busyp->length);
                } else {
                        ASSERT(0);
                }
        }

        rb_link_node(&new->rb_node, parent, rbp);
        rb_insert_color(&new->rb_node, &pag->pagb_tree);

        list_add(&new->list, &tp->t_busy);
        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
}
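
/*
 * Hedged usage sketch (hypothetical caller, not from this file): a
 * transaction that frees an extent is expected to insert a matching busy
 * extent so the blocks stay off-limits until commit.  The helper below is
 * illustrative only; the AG conversion uses the standard XFS_FSB_TO_AGNO/
 * XFS_FSB_TO_AGBNO macros.
 */
#if 0   /* illustrative only */
static void
example_free_extent(
        struct xfs_trans        *tp,
        xfs_fsblock_t           fsbno,
        xfs_extlen_t            len)
{
        struct xfs_mount        *mp = tp->t_mountp;
        xfs_agnumber_t          agno = XFS_FSB_TO_AGNO(mp, fsbno);
        xfs_agblock_t           agbno = XFS_FSB_TO_AGBNO(mp, fsbno);

        /* ... log the free space btree updates for [agbno, agbno + len) ... */

        /* track the extent as busy until the transaction hits the disk */
        xfs_extent_busy_insert(tp, agno, agbno, len, 0);
}
#endif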

/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  xfs_extent_busy_search() takes the per-AG busy extent tree
 * lock itself, so the caller must not already hold it.  This function
 * returns 0 for no overlapping busy extent, -1 for an overlapping but not
 * exact busy extent, and 1 for an exact match.  This is done so that a
 * non-zero return indicates an overlap that will require a synchronous
 * transaction, while still letting the caller distinguish between a
 * partial and an exact match.
 */
int
xfs_extent_busy_search(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno,
        xfs_agblock_t           bno,
        xfs_extlen_t            len)
{
        struct xfs_perag        *pag;
        struct rb_node          *rbp;
        struct xfs_extent_busy  *busyp;
        int                     match = 0;

        pag = xfs_perag_get(mp, agno);
        spin_lock(&pag->pagb_lock);

        rbp = pag->pagb_tree.rb_node;

        /* find closest start bno overlap */
        while (rbp) {
                busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
                if (bno < busyp->bno) {
                        /* may overlap, but exact start block is lower */
                        if (bno + len > busyp->bno)
                                match = -1;
                        rbp = rbp->rb_left;
                } else if (bno > busyp->bno) {
                        /* may overlap, but exact start block is higher */
                        if (bno < busyp->bno + busyp->length)
                                match = -1;
                        rbp = rbp->rb_right;
                } else {
                        /* bno matches busyp, length determines exact match */
                        match = (busyp->length == len) ? 1 : -1;
                        break;
                }
        }
        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
        return match;
}
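
/*
 * Hedged usage sketch (hypothetical caller, not from this file): the
 * tri-state return maps naturally onto "any overlap means the blocks are
 * not yet safe; an exact match means the whole extent is still in flight".
 */
#if 0   /* illustrative only */
static bool
example_extent_is_free_to_use(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno,
        xfs_agblock_t           bno,
        xfs_extlen_t            len)
{
        switch (xfs_extent_busy_search(mp, agno, bno, len)) {
        case 0:         /* no overlapping busy extent */
                return true;
        case 1:         /* exact match: the extent itself is still busy */
        case -1:        /* partial overlap with a busy extent */
        default:
                return false;   /* caller must force the log and retry */
        }
}
#endif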

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        struct xfs_extent_busy  *busyp,
        xfs_agblock_t           fbno,
        xfs_extlen_t            flen,
        bool                    userdata) __releases(&pag->pagb_lock)
                                          __acquires(&pag->pagb_lock)
{
        xfs_agblock_t           fend = fbno + flen;
        xfs_agblock_t           bbno = busyp->bno;
        xfs_agblock_t           bend = bbno + busyp->length;

        /*
         * This extent is currently being discarded.  Give the thread
         * performing the discard a chance to mark the extent unbusy
         * and retry.
         */
        if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
                spin_unlock(&pag->pagb_lock);
                delay(1);
                spin_lock(&pag->pagb_lock);
                return false;
        }

        /*
         * If there is a busy extent overlapping a user allocation, we have
         * no choice but to force the log and retry the search.
         *
         * Fortunately this does not happen during normal operation, but
         * only if the filesystem is very low on space and has to dip into
         * the AGFL for normal allocations.
         */
        if (userdata)
                goto out_force_log;

        if (bbno < fbno && bend > fend) {
                /*
                 * Case 1:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +---------+
                 *        fbno   fend
                 */

                /*
                 * We would have to split the busy extent to be able to track
                 * it correctly, which we cannot do because we would have to
                 * modify the list of busy extents attached to the transaction
                 * or CIL context, which is immutable.
                 *
                 * Force out the log to clear the busy extent and retry the
                 * search.
                 */
                goto out_force_log;
        } else if (bbno >= fbno && bend <= fend) {
                /*
                 * Case 2:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *    +-----------------+
                 *    fbno           fend
                 *
                 * Case 3:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *    +--------------------------+
                 *    fbno                    fend
                 *
                 * Case 4:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +--------------------------+
                 *    fbno                    fend
                 *
                 * Case 5:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +-----------------------------------+
                 *    fbno                             fend
                 *
                 */

                /*
                 * The busy extent is fully covered by the extent we are
                 * allocating, and can simply be removed from the rbtree.
                 * However we cannot remove it from the immutable list
                 * tracking busy extents in the transaction or CIL context,
                 * so set the length to zero to mark it invalid.
                 *
                 * We also need to restart the busy extent search from the
                 * tree root, because erasing the node can rearrange the
                 * tree topology.
                 */
                rb_erase(&busyp->rb_node, &pag->pagb_tree);
                busyp->length = 0;
                return false;
        } else if (fend < bend) {
                /*
                 * Case 6:
                 *              bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *             +---------+
                 *             fbno   fend
                 *
                 * Case 7:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +------------------+
                 *    fbno            fend
                 *
                 */
                busyp->bno = fend;
        } else if (bbno < fbno) {
                /*
                 * Case 8:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +-------------+
                 *        fbno       fend
                 *
                 * Case 9:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +----------------------+
                 *        fbno                fend
                 */
                busyp->length = fbno - busyp->bno;
        } else {
                ASSERT(0);
        }

        trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
        return true;

out_force_log:
        spin_unlock(&pag->pagb_lock);
        xfs_log_force(mp, XFS_LOG_SYNC);
        trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
        spin_lock(&pag->pagb_lock);
        return false;
}
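
/*
 * Worked example for the update logic above (editorial illustration, not
 * from the original source): with a busy extent at bbno = 10, length = 20
 * (so bend = 30) and a found free extent fbno = 20, flen = 20 (fend = 40),
 * we hit case 9.  The tail of the busy extent is being reused, so the entry
 * is trimmed in place to busyp->length = fbno - bbno = 10, leaving [10, 20)
 * busy while [20, 40) is handed to the allocator.
 */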


/*
 * For a given extent [fbno, flen], make sure we can reuse it safely.
 */
void
xfs_extent_busy_reuse(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno,
        xfs_agblock_t           fbno,
        xfs_extlen_t            flen,
        bool                    userdata)
{
        struct xfs_perag        *pag;
        struct rb_node          *rbp;

        ASSERT(flen > 0);

        pag = xfs_perag_get(mp, agno);
        spin_lock(&pag->pagb_lock);
restart:
        rbp = pag->pagb_tree.rb_node;
        while (rbp) {
                struct xfs_extent_busy *busyp =
                        rb_entry(rbp, struct xfs_extent_busy, rb_node);
                xfs_agblock_t   bbno = busyp->bno;
                xfs_agblock_t   bend = bbno + busyp->length;

                if (fbno + flen <= bbno) {
                        rbp = rbp->rb_left;
                        continue;
                } else if (fbno >= bend) {
                        rbp = rbp->rb_right;
                        continue;
                }

                if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
                                                  userdata))
                        goto restart;
        }
        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
}
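
/*
 * Hedged call-site sketch (hypothetical, not from this file): an allocation
 * path about to reuse a candidate extent for metadata would first make the
 * overlapped busy ranges reusable, forcing the log where that is not
 * possible.
 */
#if 0   /* illustrative only; args is a struct xfs_alloc_arg */
        xfs_extent_busy_reuse(args->mp, args->agno, fbno, flen,
                              args->userdata);
#endif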

/*
 * For a given extent [fbno, flen], search the busy extent list to find a
 * subset of the extent that is not busy.  If *rlen is smaller than
 * args->minlen, no suitable extent could be found, and the higher level
 * code needs to force out the log and retry the allocation.
 */
void
xfs_extent_busy_trim(
        struct xfs_alloc_arg    *args,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        xfs_agblock_t           *rbno,
        xfs_extlen_t            *rlen)
{
        xfs_agblock_t           fbno;
        xfs_extlen_t            flen;
        struct rb_node          *rbp;

        ASSERT(len > 0);

        spin_lock(&args->pag->pagb_lock);
restart:
        fbno = bno;
        flen = len;
        rbp = args->pag->pagb_tree.rb_node;
        while (rbp && flen >= args->minlen) {
                struct xfs_extent_busy *busyp =
                        rb_entry(rbp, struct xfs_extent_busy, rb_node);
                xfs_agblock_t   fend = fbno + flen;
                xfs_agblock_t   bbno = busyp->bno;
                xfs_agblock_t   bend = bbno + busyp->length;

                if (fend <= bbno) {
                        rbp = rbp->rb_left;
                        continue;
                } else if (fbno >= bend) {
                        rbp = rbp->rb_right;
                        continue;
                }

                /*
                 * If this is a metadata allocation, try to reuse the busy
                 * extent instead of trimming the allocation.
                 */
                if (!args->userdata &&
                    !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
                        if (!xfs_extent_busy_update_extent(args->mp, args->pag,
                                                          busyp, fbno, flen,
                                                          false))
                                goto restart;
                        continue;
                }

                if (bbno <= fbno) {
                        /* start overlap */

                        /*
                         * Case 1:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +---------+
                         *        fbno   fend
                         *
                         * Case 2:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +-------------+
                         *    fbno       fend
                         *
                         * Case 3:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +-------------+
                         *        fbno       fend
                         *
                         * Case 4:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +-----------------+
                         *    fbno           fend
                         *
                         * No unbusy region in extent, return failure.
                         */
                        if (fend <= bend)
                                goto fail;

                        /*
                         * Case 5:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +----------------------+
                         *        fbno                fend
                         *
                         * Case 6:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +--------------------------+
                         *    fbno                    fend
                         *
                         * Needs to be trimmed to:
                         *                       +-------+
                         *                       fbno fend
                         */
                        fbno = bend;
                } else if (bend >= fend) {
                        /* end overlap */

                        /*
                         * Case 7:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +------------------+
                         *    fbno            fend
                         *
                         * Case 8:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +--------------------------+
                         *    fbno                    fend
                         *
                         * Needs to be trimmed to:
                         *    +-------+
                         *    fbno fend
                         */
                        fend = bbno;
                } else {
                        /* middle overlap */

                        /*
                         * Case 9:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +-----------------------------------+
                         *    fbno                             fend
                         *
                         * Can be trimmed to:
                         *    +-------+        OR         +-------+
                         *    fbno fend                   fbno fend
                         *
                         * Backward allocation leads to significant
                         * fragmentation of directories, which degrades
                         * directory performance, therefore we always want to
                         * choose the option that produces forward allocation
                         * patterns.
                         * Preferring the lower bno extent will make the next
                         * request use "fend" as the start of the next
                         * allocation;  if the segment is no longer busy at
                         * that point, we'll get a contiguous allocation, but
                         * even if it is still busy, we will get a forward
                         * allocation.
                         * We try to avoid choosing the segment at "bend",
                         * because that can lead to the next allocation
                         * taking the segment at "fbno", which would be a
                         * backward allocation.  We only use the segment at
                         * "fbno" if it is much larger than the current
                         * requested size, because in that case there's a
                         * good chance subsequent allocations will be
                         * contiguous.
                         */
                        if (bbno - fbno >= args->maxlen) {
                                /* left candidate fits perfectly */
                                fend = bbno;
                        } else if (fend - bend >= args->maxlen * 4) {
                                /* right candidate has enough free space */
                                fbno = bend;
                        } else if (bbno - fbno >= args->minlen) {
                                /* left candidate fits minimum requirement */
                                fend = bbno;
                        } else {
                                goto fail;
                        }
                }

                flen = fend - fbno;
        }
        spin_unlock(&args->pag->pagb_lock);

        if (fbno != bno || flen != len) {
                trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
                                          fbno, flen);
        }
        *rbno = fbno;
        *rlen = flen;
        return;
fail:
        /*
         * Return a zero extent length as a failure indication.  All callers
         * re-check if the trimmed extent satisfies the minlen requirement.
         */
        spin_unlock(&args->pag->pagb_lock);
        trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
        *rbno = fbno;
        *rlen = 0;
}
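
/*
 * Hedged usage sketch (hypothetical caller, not from this file): trim a
 * candidate extent against the busy tree and fall back to a log force when
 * nothing of at least minlen survives.
 */
#if 0   /* illustrative only */
        xfs_agblock_t   tbno;
        xfs_extlen_t    tlen;

        xfs_extent_busy_trim(args, bno, len, &tbno, &tlen);
        if (tlen < args->minlen) {
                /* no usable subset: flush busy extents and try again */
                xfs_log_force(args->mp, XFS_LOG_SYNC);
                /* ... restart the allocation search ... */
        }
#endif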

STATIC void
xfs_extent_busy_clear_one(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        struct xfs_extent_busy  *busyp)
{
        if (busyp->length) {
                trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
                                                busyp->length);
                rb_erase(&busyp->rb_node, &pag->pagb_tree);
        }

        list_del_init(&busyp->list);
        kmem_free(busyp);
}

/*
 * Remove all extents on the passed-in list from the busy extents tree.
 * If do_discard is set, skip extents that need to be discarded, and mark
 * these as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
        struct xfs_mount        *mp,
        struct list_head        *list,
        bool                    do_discard)
{
        struct xfs_extent_busy  *busyp, *n;
        struct xfs_perag        *pag = NULL;
        xfs_agnumber_t          agno = NULLAGNUMBER;

        list_for_each_entry_safe(busyp, n, list, list) {
                if (busyp->agno != agno) {
                        if (pag) {
                                spin_unlock(&pag->pagb_lock);
                                xfs_perag_put(pag);
                        }
                        pag = xfs_perag_get(mp, busyp->agno);
                        spin_lock(&pag->pagb_lock);
                        agno = busyp->agno;
                }

                if (do_discard && busyp->length &&
                    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
                        busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
                else
                        xfs_extent_busy_clear_one(mp, pag, busyp);
        }

        if (pag) {
                spin_unlock(&pag->pagb_lock);
                xfs_perag_put(pag);
        }
}
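
/*
 * Hedged usage sketch (hypothetical, not from this file): once a log
 * checkpoint containing the freeing transactions is on stable storage,
 * the busy extents it tracked can be cleared in one pass.
 */
#if 0   /* illustrative only; busy_list was collected at commit time */
        xfs_extent_busy_clear(mp, busy_list, false);
#endif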

/*
 * Callback for list_sort to sort busy extents by the AG they reside in.
 */
int
xfs_extent_busy_ag_cmp(
        void                    *priv,
        struct list_head        *a,
        struct list_head        *b)
{
        return container_of(a, struct xfs_extent_busy, list)->agno -
                container_of(b, struct xfs_extent_busy, list)->agno;
}

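/*
 * Hedged usage sketch (hypothetical, not from this file): sorting a busy
 * extent list by AG with the comparator above lets a consumer walk the
 * list while taking each per-AG lock only once.
 */
#if 0   /* illustrative only; list_sort() comes from <linux/list_sort.h> */
        list_sort(NULL, busy_list, xfs_extent_busy_ag_cmp);
#endif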