linux/fs/xfs/xfs_extent_busy.c
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc.h"
#include "xfs_inode.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"

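/*
 * Add a busy extent for [bno, bno + len) in AG agno to the per-AG rbtree
 * and to the transaction's busy list.  If the tracking structure cannot be
 * allocated, fall back to making the transaction synchronous so the freed
 * blocks cannot be reused before the transaction commits.
 */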
void
xfs_extent_busy_insert(
        struct xfs_trans        *tp,
        xfs_agnumber_t          agno,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        unsigned int            flags)
{
        struct xfs_extent_busy  *new;
        struct xfs_extent_busy  *busyp;
        struct xfs_perag        *pag;
        struct rb_node          **rbp;
        struct rb_node          *parent = NULL;

        new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL);
        if (!new) {
                /*
                 * No memory!  Since it is now not possible to track the free
                 * block, make this a synchronous transaction to ensure that
                 * the block is not reused before this transaction commits.
                 */
                trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
                xfs_trans_set_sync(tp);
                return;
        }

        new->agno = agno;
        new->bno = bno;
        new->length = len;
        INIT_LIST_HEAD(&new->list);
        new->flags = flags;

        /* trace before insert to be able to see failed inserts */
        trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);

        pag = xfs_perag_get(tp->t_mountp, new->agno);
        spin_lock(&pag->pagb_lock);
        rbp = &pag->pagb_tree.rb_node;
        while (*rbp) {
                parent = *rbp;
                busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

                if (new->bno < busyp->bno) {
                        rbp = &(*rbp)->rb_left;
                        ASSERT(new->bno + new->length <= busyp->bno);
                } else if (new->bno > busyp->bno) {
                        rbp = &(*rbp)->rb_right;
                        ASSERT(bno >= busyp->bno + busyp->length);
                } else {
                        ASSERT(0);
                }
        }

        rb_link_node(&new->rb_node, parent, rbp);
        rb_insert_color(&new->rb_node, &pag->pagb_tree);

        list_add(&new->list, &tp->t_busy);
        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
}

/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  xfs_extent_busy_search() takes the busy extent tree lock
 * internally, so the caller must not already hold it.  This function
 * returns 0 for no overlapping busy extent, -1 for an overlapping but not
 * exact busy extent, and 1 for an exact match.  This is done so that a
 * non-zero return indicates an overlap that will require a synchronous
 * transaction, while the return value can still be used to distinguish
 * between a partial and an exact match.
 */
int
xfs_extent_busy_search(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno,
        xfs_agblock_t           bno,
        xfs_extlen_t            len)
{
        struct xfs_perag        *pag;
        struct rb_node          *rbp;
        struct xfs_extent_busy  *busyp;
        int                     match = 0;

        pag = xfs_perag_get(mp, agno);
        spin_lock(&pag->pagb_lock);

        rbp = pag->pagb_tree.rb_node;

        /* find closest start bno overlap */
        while (rbp) {
                busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
                if (bno < busyp->bno) {
                        /* may overlap, but exact start block is lower */
                        if (bno + len > busyp->bno)
                                match = -1;
                        rbp = rbp->rb_left;
                } else if (bno > busyp->bno) {
                        /* may overlap, but exact start block is higher */
                        if (bno < busyp->bno + busyp->length)
                                match = -1;
                        rbp = rbp->rb_right;
                } else {
                        /* bno matches busyp, length determines exact match */
                        match = (busyp->length == len) ? 1 : -1;
                        break;
                }
        }
        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
        return match;
}
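
/*
 * Illustrative use (hypothetical caller, not part of this file): a non-zero
 * return means [bno, bno + len) overlaps a busy extent, so per the comment
 * above, a transaction that wants to reuse those blocks has to commit
 * synchronously, e.g.:
 *
 *        if (xfs_extent_busy_search(mp, agno, bno, len))
 *                xfs_trans_set_sync(tp);
 */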

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        struct xfs_extent_busy  *busyp,
        xfs_agblock_t           fbno,
        xfs_extlen_t            flen,
        bool                    userdata)
{
        xfs_agblock_t           fend = fbno + flen;
        xfs_agblock_t           bbno = busyp->bno;
        xfs_agblock_t           bend = bbno + busyp->length;

        /*
         * This extent is currently being discarded.  Give the thread
         * performing the discard a chance to mark the extent unbusy
         * and retry.
         */
        if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
                spin_unlock(&pag->pagb_lock);
                delay(1);
                spin_lock(&pag->pagb_lock);
                return false;
        }

        /*
         * If there is a busy extent overlapping a user allocation, we have
         * no choice but to force the log and retry the search.
         *
         * Fortunately this does not happen during normal operation, but
         * only if the filesystem is very low on space and has to dip into
         * the AGFL for normal allocations.
         */
        if (userdata)
                goto out_force_log;

        if (bbno < fbno && bend > fend) {
                /*
                 * Case 1:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +---------+
                 *        fbno   fend
                 */

                /*
                 * We would have to split the busy extent to be able to track
                 * it correctly, which we cannot do because we would have to
                 * modify the list of busy extents attached to the transaction
                 * or CIL context, which is immutable.
                 *
                 * Force out the log to clear the busy extent and retry the
                 * search.
                 */
                goto out_force_log;
        } else if (bbno >= fbno && bend <= fend) {
                /*
                 * Case 2:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *    +-----------------+
                 *    fbno           fend
                 *
                 * Case 3:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *    +--------------------------+
                 *    fbno                    fend
                 *
                 * Case 4:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +--------------------------+
                 *    fbno                    fend
                 *
                 * Case 5:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +-----------------------------------+
                 *    fbno                             fend
                 *
                 */

                /*
                 * The busy extent is fully covered by the extent we are
                 * allocating, and can simply be removed from the rbtree.
                 * However we cannot remove it from the immutable list
                 * tracking busy extents in the transaction or CIL context,
                 * so set the length to zero to mark it invalid.
                 *
                 * We also need to restart the busy extent search from the
                 * tree root, because erasing the node can rearrange the
                 * tree topology.
                 */
                rb_erase(&busyp->rb_node, &pag->pagb_tree);
                busyp->length = 0;
                return false;
        } else if (fend < bend) {
                /*
                 * Case 6:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *             +---------+
                 *             fbno   fend
                 *
                 * Case 7:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +------------------+
                 *    fbno            fend
                 *
                 */
                busyp->bno = fend;
        } else if (bbno < fbno) {
                /*
                 * Case 8:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +-------------+
                 *        fbno       fend
                 *
                 * Case 9:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +----------------------+
                 *        fbno                fend
                 */
                busyp->length = fbno - busyp->bno;
        } else {
                ASSERT(0);
        }

        trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
        return true;

out_force_log:
        spin_unlock(&pag->pagb_lock);
        xfs_log_force(mp, XFS_LOG_SYNC);
        trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
        spin_lock(&pag->pagb_lock);
        return false;
}


/*
 * For a given extent [fbno, flen], make sure we can reuse it safely by
 * updating or removing any overlapping busy extents, forcing the log when
 * that is not possible.
 */
void
xfs_extent_busy_reuse(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno,
        xfs_agblock_t           fbno,
        xfs_extlen_t            flen,
        bool                    userdata)
{
        struct xfs_perag        *pag;
        struct rb_node          *rbp;

        ASSERT(flen > 0);

        pag = xfs_perag_get(mp, agno);
        spin_lock(&pag->pagb_lock);
restart:
        rbp = pag->pagb_tree.rb_node;
        while (rbp) {
                struct xfs_extent_busy *busyp =
                        rb_entry(rbp, struct xfs_extent_busy, rb_node);
                xfs_agblock_t   bbno = busyp->bno;
                xfs_agblock_t   bend = bbno + busyp->length;

                if (fbno + flen <= bbno) {
                        rbp = rbp->rb_left;
                        continue;
                } else if (fbno >= bend) {
                        rbp = rbp->rb_right;
                        continue;
                }

                if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
                                                  userdata))
                        goto restart;
        }
        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
}

/*
 * For a given extent [fbno, flen], search the busy extent list to find a
 * subset of the extent that is not busy.  If *rlen is smaller than
 * args->minlen, no suitable extent could be found, and the higher level
 * code needs to force out the log and retry the allocation.
 */
void
xfs_extent_busy_trim(
        struct xfs_alloc_arg    *args,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        xfs_agblock_t           *rbno,
        xfs_extlen_t            *rlen)
{
        xfs_agblock_t           fbno;
        xfs_extlen_t            flen;
        struct rb_node          *rbp;

        ASSERT(len > 0);

        spin_lock(&args->pag->pagb_lock);
restart:
        fbno = bno;
        flen = len;
        rbp = args->pag->pagb_tree.rb_node;
        while (rbp && flen >= args->minlen) {
                struct xfs_extent_busy *busyp =
                        rb_entry(rbp, struct xfs_extent_busy, rb_node);
                xfs_agblock_t   fend = fbno + flen;
                xfs_agblock_t   bbno = busyp->bno;
                xfs_agblock_t   bend = bbno + busyp->length;

                if (fend <= bbno) {
                        rbp = rbp->rb_left;
                        continue;
                } else if (fbno >= bend) {
                        rbp = rbp->rb_right;
                        continue;
                }

                /*
                 * If this is a metadata allocation, try to reuse the busy
                 * extent instead of trimming the allocation.
                 */
                if (!args->userdata &&
                    !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
                        if (!xfs_extent_busy_update_extent(args->mp, args->pag,
                                                          busyp, fbno, flen,
                                                          false))
                                goto restart;
                        continue;
                }

                if (bbno <= fbno) {
                        /* start overlap */

                        /*
                         * Case 1:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +---------+
                         *        fbno   fend
                         *
                         * Case 2:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +-------------+
                         *    fbno       fend
                         *
                         * Case 3:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +-------------+
                         *        fbno       fend
                         *
                         * Case 4:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +-----------------+
                         *    fbno           fend
                         *
                         * No unbusy region in extent, return failure.
                         */
                        if (fend <= bend)
                                goto fail;

                        /*
                         * Case 5:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +----------------------+
                         *        fbno                fend
                         *
                         * Case 6:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +--------------------------+
                         *    fbno                    fend
                         *
                         * Needs to be trimmed to:
                         *                       +-------+
                         *                       fbno fend
                         */
                        fbno = bend;
                } else if (bend >= fend) {
                        /* end overlap */

                        /*
                         * Case 7:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +------------------+
                         *    fbno            fend
                         *
                         * Case 8:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +--------------------------+
                         *    fbno                    fend
                         *
                         * Needs to be trimmed to:
                         *    +-------+
                         *    fbno fend
                         */
                        fend = bbno;
                } else {
                        /* middle overlap */

                        /*
                         * Case 9:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +-----------------------------------+
                         *    fbno                             fend
                         *
                         * Can be trimmed to:
                         *    +-------+        OR         +-------+
                         *    fbno fend                   fbno fend
                         *
                         * Backward allocation leads to significant
                         * fragmentation of directories, which degrades
                         * directory performance, so we always want to
                         * choose the option that produces forward allocation
                         * patterns.
                         *
                         * Preferring the lower bno extent will make the next
                         * request use "fend" as the start of the next
                         * allocation; if the segment is no longer busy at
                         * that point, we'll get a contiguous allocation, but
                         * even if it is still busy, we will get a forward
                         * allocation.
                         *
                         * We try to avoid choosing the segment at "bend",
                         * because that can lead to the next allocation
                         * taking the segment at "fbno", which would be a
                         * backward allocation.  We only use the segment at
                         * "fbno" if it is much larger than the current
                         * requested size, because in that case there's a
                         * good chance subsequent allocations will be
                         * contiguous.
                         */
                        if (bbno - fbno >= args->maxlen) {
                                /* left candidate fits perfectly */
                                fend = bbno;
                        } else if (fend - bend >= args->maxlen * 4) {
                                /* right candidate has enough free space */
                                fbno = bend;
                        } else if (bbno - fbno >= args->minlen) {
                                /* left candidate fits minimum requirement */
                                fend = bbno;
                        } else {
                                goto fail;
                        }
                }

                flen = fend - fbno;
        }
        spin_unlock(&args->pag->pagb_lock);

        if (fbno != bno || flen != len) {
                trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
                                          fbno, flen);
        }
        *rbno = fbno;
        *rlen = flen;
        return;
fail:
        /*
         * Return a zero extent length as failure indication.  All callers
         * re-check if the trimmed extent satisfies the minlen requirement.
         */
        spin_unlock(&args->pag->pagb_lock);
        trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
        *rbno = fbno;
        *rlen = 0;
}
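
/*
 * Illustrative caller pattern (hypothetical, not part of this file): per the
 * comment above xfs_extent_busy_trim(), if the trimmed length no longer
 * satisfies args->minlen, the caller is expected to force the log so the
 * busy extents are cleared, and then retry the allocation, e.g.:
 *
 *        xfs_extent_busy_trim(args, bno, len, &tbno, &tlen);
 *        if (tlen < args->minlen) {
 *                xfs_log_force(args->mp, XFS_LOG_SYNC);
 *                ... retry the allocation ...
 *        }
 */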
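
/*
 * Remove a single busy extent from the rbtree (if it is still tracked
 * there) and from the transaction/CIL busy list, then free it.
 */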
STATIC void
xfs_extent_busy_clear_one(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        struct xfs_extent_busy  *busyp)
{
        if (busyp->length) {
                trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
                                                busyp->length);
                rb_erase(&busyp->rb_node, &pag->pagb_tree);
        }

        list_del_init(&busyp->list);
        kmem_free(busyp);
}

/*
 * Remove all extents on the passed in list from the busy extents tree.
 * If do_discard is set, skip extents that need to be discarded, and mark
 * these as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
        struct xfs_mount        *mp,
        struct list_head        *list,
        bool                    do_discard)
{
        struct xfs_extent_busy  *busyp, *n;
        struct xfs_perag        *pag = NULL;
        xfs_agnumber_t          agno = NULLAGNUMBER;

        list_for_each_entry_safe(busyp, n, list, list) {
                if (busyp->agno != agno) {
                        if (pag) {
                                spin_unlock(&pag->pagb_lock);
                                xfs_perag_put(pag);
                        }
                        pag = xfs_perag_get(mp, busyp->agno);
                        spin_lock(&pag->pagb_lock);
                        agno = busyp->agno;
                }

                if (do_discard && busyp->length &&
                    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
                        busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
                else
                        xfs_extent_busy_clear_one(mp, pag, busyp);
        }

        if (pag) {
                spin_unlock(&pag->pagb_lock);
                xfs_perag_put(pag);
        }
}

/*
 * Callback for list_sort to sort busy extents by the AG they reside in.
 */
int
xfs_extent_busy_ag_cmp(
        void                    *priv,
        struct list_head        *a,
        struct list_head        *b)
{
        return container_of(a, struct xfs_extent_busy, list)->agno -
                container_of(b, struct xfs_extent_busy, list)->agno;
}
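
/*
 * Illustrative use (hypothetical, not part of this file): pass the
 * comparison callback above to list_sort() to group a busy extent list
 * by AG before processing it, e.g.:
 *
 *        list_sort(NULL, &busy_list, xfs_extent_busy_ag_cmp);
 */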