linux/fs/xfs/xfs_extent_busy.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"

void
xfs_extent_busy_insert(
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	unsigned int		flags)
{
	struct xfs_extent_busy	*new;
	struct xfs_extent_busy	*busyp;
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent = NULL;

	new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP);
	new->agno = agno;
	new->bno = bno;
	new->length = len;
	INIT_LIST_HEAD(&new->list);
	new->flags = flags;

	/* trace before insert to be able to see failed inserts */
	trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);

	pag = xfs_perag_get(tp->t_mountp, new->agno);
	spin_lock(&pag->pagb_lock);
	rbp = &pag->pagb_tree.rb_node;
	while (*rbp) {
		parent = *rbp;
		busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

		if (new->bno < busyp->bno) {
			rbp = &(*rbp)->rb_left;
			ASSERT(new->bno + new->length <= busyp->bno);
		} else if (new->bno > busyp->bno) {
			rbp = &(*rbp)->rb_right;
			ASSERT(bno >= busyp->bno + busyp->length);
		} else {
			ASSERT(0);
		}
	}

	rb_link_node(&new->rb_node, parent, rbp);
	rb_insert_color(&new->rb_node, &pag->pagb_tree);

	list_add(&new->list, &tp->t_busy);
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}
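
/*
 * Usage sketch (illustrative, not part of the original file): a caller
 * that has just freed an extent in a transaction records it as busy so
 * the space cannot be reallocated and overwritten before the free is
 * committed to disk:
 *
 *	// after freeing [bno, bno + len) in AG "agno" under "tp"
 *	xfs_extent_busy_insert(tp, agno, bno, len, 0);
 *
 * Callers whose blocks must never be discarded (e.g. metadata that may
 * be reused immediately) would pass XFS_EXTENT_BUSY_SKIP_DISCARD
 * instead of 0.
 */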

/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  The busy extent tree lock is taken internally, so callers do
 * not need to hold it.  This function returns 0 for no overlapping busy
 * extent, -1 for an overlapping but not exact busy extent, and 1 for an
 * exact match.  A non-zero return therefore indicates an overlap that will
 * require a synchronous transaction, while the sign still distinguishes a
 * partial from an exact match.
 */
int
xfs_extent_busy_search(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	xfs_extlen_t		len)
{
	struct xfs_perag	*pag;
	struct rb_node		*rbp;
	struct xfs_extent_busy	*busyp;
	int			match = 0;

	pag = xfs_perag_get(mp, agno);
	spin_lock(&pag->pagb_lock);

	rbp = pag->pagb_tree.rb_node;

	/* find closest start bno overlap */
	while (rbp) {
		busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
		if (bno < busyp->bno) {
			/* may overlap, but exact start block is lower */
			if (bno + len > busyp->bno)
				match = -1;
			rbp = rbp->rb_left;
		} else if (bno > busyp->bno) {
			/* may overlap, but exact start block is higher */
			if (bno < busyp->bno + busyp->length)
				match = -1;
			rbp = rbp->rb_right;
		} else {
			/* bno matches busyp, length determines exact match */
			match = (busyp->length == len) ? 1 : -1;
			break;
		}
	}
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
	return match;
}
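
/*
 * Interpretation sketch (hypothetical caller, for illustration only):
 * the tri-state return folds "any overlap" into a simple non-zero test
 * while still distinguishing exact matches:
 *
 *	switch (xfs_extent_busy_search(mp, agno, bno, len)) {
 *	case 0:		// no busy extent overlaps [bno, bno + len)
 *		break;
 *	case -1:	// partial overlap with one or more busy extents
 *	case 1:		// the range is exactly one busy extent
 *		// reuse requires a synchronous transaction
 *		break;
 *	}
 */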

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_extent_busy	*busyp,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata) __releases(&pag->pagb_lock)
					  __acquires(&pag->pagb_lock)
{
	xfs_agblock_t		fend = fbno + flen;
	xfs_agblock_t		bbno = busyp->bno;
	xfs_agblock_t		bend = bbno + busyp->length;

	/*
	 * This extent is currently being discarded.  Give the thread
	 * performing the discard a chance to mark the extent unbusy
	 * and retry.
	 */
	if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
		spin_unlock(&pag->pagb_lock);
		delay(1);
		spin_lock(&pag->pagb_lock);
		return false;
	}

	/*
	 * If there is a busy extent overlapping a user allocation, we have
	 * no choice but to force the log and retry the search.
	 *
	 * Fortunately this does not happen during normal operation, but
	 * only if the filesystem is very low on space and has to dip into
	 * the AGFL for normal allocations.
	 */
	if (userdata)
		goto out_force_log;

	if (bbno < fbno && bend > fend) {
		/*
		 * Case 1:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +---------+
		 *        fbno   fend
		 */

		/*
		 * We would have to split the busy extent to track it
		 * correctly, which we cannot do because we would have to
		 * modify the list of busy extents attached to the
		 * transaction or CIL context, which is immutable.
		 *
		 * Force out the log to clear the busy extent and retry the
		 * search.
		 */
		goto out_force_log;
	} else if (bbno >= fbno && bend <= fend) {
		/*
		 * Case 2:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +-----------------+
		 *    fbno           fend
		 *
		 * Case 3:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 4:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 5:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +-----------------------------------+
		 *    fbno                             fend
		 *
		 */

		/*
		 * The busy extent is fully covered by the extent we are
		 * allocating, and can simply be removed from the rbtree.
		 * However we cannot remove it from the immutable list
		 * tracking busy extents in the transaction or CIL context,
		 * so set the length to zero to mark it invalid.
		 *
		 * We also need to restart the busy extent search from the
		 * tree root, because erasing the node can rearrange the
		 * tree topology.
		 */
		rb_erase(&busyp->rb_node, &pag->pagb_tree);
		busyp->length = 0;
		return false;
	} else if (fend < bend) {
		/*
		 * Case 6:
		 *              bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *             +---------+
		 *             fbno   fend
		 *
		 * Case 7:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +------------------+
		 *    fbno            fend
		 *
		 */
		busyp->bno = fend;
		busyp->length = bend - fend;
	} else if (bbno < fbno) {
		/*
		 * Case 8:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +-------------+
		 *        fbno       fend
		 *
		 * Case 9:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +----------------------+
		 *        fbno                fend
		 */
		busyp->length = fbno - busyp->bno;
	} else {
		ASSERT(0);
	}

	trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
	return true;

out_force_log:
	spin_unlock(&pag->pagb_lock);
	xfs_log_force(mp, XFS_LOG_SYNC);
	trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
	spin_lock(&pag->pagb_lock);
	return false;
}
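
/*
 * Worked example (numbers assumed for illustration): let the busy extent
 * span [100, 120), i.e. busyp->bno = 100 and busyp->length = 20.
 *
 *	- Reusing [95, 110) is case 7: the front is trimmed, so
 *	  busyp->bno = 110 and busyp->length = 10, leaving [110, 120) busy.
 *	- Reusing [110, 125) is case 9: the tail is trimmed, so
 *	  busyp->length = 10, leaving [100, 110) busy.
 *	- Reusing [95, 125) is cases 2-5: the node is erased and its length
 *	  zeroed, and the caller restarts from the tree root.
 *	- Reusing [105, 115) is case 1: a split would be required, so the
 *	  log is forced and the search retried instead.
 */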


/*
 * For a given extent [fbno, flen], make sure we can reuse it safely.
 */
void
xfs_extent_busy_reuse(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata)
{
	struct xfs_perag	*pag;
	struct rb_node		*rbp;

	ASSERT(flen > 0);

	pag = xfs_perag_get(mp, agno);
	spin_lock(&pag->pagb_lock);
restart:
	rbp = pag->pagb_tree.rb_node;
	while (rbp) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fbno + flen <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
						  userdata))
			goto restart;
	}
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}
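
/*
 * Usage sketch (illustrative; the argument names are assumed): an
 * allocator that has settled on handing out [agbno, agbno + len) calls
 * this before the blocks leave the free space btrees, blocking or
 * forcing the log as needed:
 *
 *	xfs_extent_busy_reuse(args->mp, args->agno, args->agbno, args->len,
 *			      xfs_alloc_is_userdata(args->datatype));
 *
 * On return, no busy extent overlaps the range any more.
 */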

/*
 * For a given extent [fbno, flen], search the busy extent tree to find a
 * subset of the extent that is not busy.  If *len is smaller than
 * args->minlen, no suitable extent could be found, and the higher level
 * code needs to force out the log and retry the allocation.
 *
 * Return the current busy generation for the AG if the extent is busy. This
 * value can be used to wait for at least one of the currently busy extents
 * to be cleared. Note that the busy tree is not guaranteed to be empty after
 * a wakeup for that generation. The state of a specific extent must always
 * be confirmed with another call to xfs_extent_busy_trim() before it can be
 * used.
 */
bool
xfs_extent_busy_trim(
	struct xfs_alloc_arg	*args,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	unsigned		*busy_gen)
{
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	struct rb_node		*rbp;
	bool			ret = false;

	ASSERT(*len > 0);

	spin_lock(&args->pag->pagb_lock);
restart:
	fbno = *bno;
	flen = *len;
	rbp = args->pag->pagb_tree.rb_node;
	while (rbp && flen >= args->minlen) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	fend = fbno + flen;
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fend <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		/*
		 * If this is a metadata allocation, try to reuse the busy
		 * extent instead of trimming the allocation.
		 */
		if (!xfs_alloc_is_userdata(args->datatype) &&
		    !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
			if (!xfs_extent_busy_update_extent(args->mp, args->pag,
							  busyp, fbno, flen,
							  false))
				goto restart;
			continue;
		}

		if (bbno <= fbno) {
			/* start overlap */

			/*
			 * Case 1:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +---------+
			 *        fbno   fend
			 *
			 * Case 2:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-------------+
			 *    fbno       fend
			 *
			 * Case 3:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +-------------+
			 *        fbno       fend
			 *
			 * Case 4:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-----------------+
			 *    fbno           fend
			 *
			 * No unbusy region in extent, return failure.
			 */
			if (fend <= bend)
				goto fail;

			/*
			 * Case 5:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +----------------------+
			 *        fbno                fend
			 *
			 * Case 6:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *                       +-------+
			 *                       fbno fend
			 */
			fbno = bend;
		} else if (bend >= fend) {
			/* end overlap */

			/*
			 * Case 7:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +------------------+
			 *    fbno            fend
			 *
			 * Case 8:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *    +-------+
			 *    fbno fend
			 */
			fend = bbno;
		} else {
			/* middle overlap */

			/*
			 * Case 9:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +-----------------------------------+
			 *    fbno                             fend
			 *
			 * Can be trimmed to:
			 *    +-------+        OR         +-------+
			 *    fbno fend                   fbno fend
			 *
			 * Backward allocation leads to significant
			 * fragmentation of directories, which degrades
			 * directory performance.  We therefore always want
			 * to choose the option that produces forward
			 * allocation patterns.
			 *
			 * Preferring the lower bno extent will make the next
			 * request use "fend" as the start of the next
			 * allocation; if the segment is no longer busy at
			 * that point, we'll get a contiguous allocation, but
			 * even if it is still busy, we will get a forward
			 * allocation.
			 *
			 * We try to avoid choosing the segment at "bend",
			 * because that can lead to the next allocation
			 * taking the segment at "fbno", which would be a
			 * backward allocation.  We only use the segment at
			 * "fbno" if it is much larger than the current
			 * requested size, because in that case there's a
			 * good chance subsequent allocations will be
			 * contiguous.
			 */
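
			/*
			 * Worked example (numbers assumed): free extent
			 * [0, 100), busy extent [40, 60), args->maxlen = 30,
			 * args->minlen = 10.  The left candidate [0, 40) is
			 * 40 blocks, which is >= maxlen, so it is chosen and
			 * fend becomes 40.  With maxlen = 50 instead, the
			 * left candidate is too small and the right one
			 * [60, 100) is only 40 < 4 * maxlen blocks, so the
			 * left candidate still wins via the minlen test.
			 */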
			if (bbno - fbno >= args->maxlen) {
				/* left candidate fits perfectly */
				fend = bbno;
			} else if (fend - bend >= args->maxlen * 4) {
				/* right candidate has enough free space */
				fbno = bend;
			} else if (bbno - fbno >= args->minlen) {
				/* left candidate fits minimum requirement */
				fend = bbno;
			} else {
				goto fail;
			}
		}

		flen = fend - fbno;
	}
out:

	if (fbno != *bno || flen != *len) {
		trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
					  fbno, flen);
		*bno = fbno;
		*len = flen;
		*busy_gen = args->pag->pagb_gen;
		ret = true;
	}
	spin_unlock(&args->pag->pagb_lock);
	return ret;
fail:
	/*
	 * Return a zero extent length as a failure indication.  All callers
	 * re-check if the trimmed extent satisfies the minlen requirement.
	 */
	flen = 0;
	goto out;
}
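
/*
 * Retry sketch (illustrative, simplified from how allocation paths use
 * this interface): trim the candidate extent against busy extents and,
 * if nothing of at least args->minlen remains, wait on the busy
 * generation before retrying the search:
 *
 *	unsigned	busy_gen;
 *	bool		busy;
 *
 *	busy = xfs_extent_busy_trim(args, &bno, &len, &busy_gen);
 *	if (busy && len < args->minlen) {
 *		xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
 *		// ...restart the search...
 *	}
 */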

STATIC void
xfs_extent_busy_clear_one(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_extent_busy	*busyp)
{
	if (busyp->length) {
		trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
						busyp->length);
		rb_erase(&busyp->rb_node, &pag->pagb_tree);
	}

	list_del_init(&busyp->list);
	kmem_free(busyp);
}

static void
xfs_extent_busy_put_pag(
	struct xfs_perag	*pag,
	bool			wakeup)
		__releases(pag->pagb_lock)
{
	if (wakeup) {
		pag->pagb_gen++;
		wake_up_all(&pag->pagb_wait);
	}

	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}

/*
 * Remove all extents on the passed-in list from the busy extents tree.
 * If do_discard is set, skip extents that need to be discarded and mark
 * them as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
	struct xfs_mount	*mp,
	struct list_head	*list,
	bool			do_discard)
{
	struct xfs_extent_busy	*busyp, *n;
	struct xfs_perag	*pag = NULL;
	xfs_agnumber_t		agno = NULLAGNUMBER;
	bool			wakeup = false;

	list_for_each_entry_safe(busyp, n, list, list) {
		if (busyp->agno != agno) {
			if (pag)
				xfs_extent_busy_put_pag(pag, wakeup);
			agno = busyp->agno;
			pag = xfs_perag_get(mp, agno);
			spin_lock(&pag->pagb_lock);
			wakeup = false;
		}

		if (do_discard && busyp->length &&
		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
			busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
		} else {
			xfs_extent_busy_clear_one(mp, pag, busyp);
			wakeup = true;
		}
	}

	if (pag)
		xfs_extent_busy_put_pag(pag, wakeup);
}
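
/*
 * Usage sketch (illustrative): when a transaction's changes reach stable
 * storage, its accumulated busy extents can finally be released:
 *
 *	xfs_extent_busy_clear(mp, &tp->t_busy, false);
 *
 * The log completion path passes true for do_discard when online discard
 * is enabled, so discardable extents stay busy until the discard finishes
 * and are only cleared for good afterwards.
 */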

/*
 * Flush out all busy extents for this AG.
 */
void
xfs_extent_busy_flush(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	unsigned		busy_gen)
{
	DEFINE_WAIT		(wait);
	int			error;

	error = xfs_log_force(mp, XFS_LOG_SYNC);
	if (error)
		return;

	do {
		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
		if (busy_gen != READ_ONCE(pag->pagb_gen))
			break;
		schedule();
	} while (1);

	finish_wait(&pag->pagb_wait, &wait);
}

void
xfs_extent_busy_wait_all(
	struct xfs_mount	*mp)
{
	DEFINE_WAIT		(wait);
	xfs_agnumber_t		agno;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		struct xfs_perag *pag = xfs_perag_get(mp, agno);

		do {
			prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
			if (RB_EMPTY_ROOT(&pag->pagb_tree))
				break;
			schedule();
		} while (1);
		finish_wait(&pag->pagb_wait, &wait);

		xfs_perag_put(pag);
	}
}
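
/*
 * Usage sketch (illustrative): a caller that must be certain no busy
 * extents remain in any AG, e.g. unmount before tearing down per-AG
 * structures, drains them all:
 *
 *	xfs_extent_busy_wait_all(mp);
 */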

/*
 * Callback for list_sort to sort busy extents by the AG they reside in.
 */
int
xfs_extent_busy_ag_cmp(
	void			*priv,
	struct list_head	*l1,
	struct list_head	*l2)
{
	struct xfs_extent_busy	*b1 =
		container_of(l1, struct xfs_extent_busy, list);
	struct xfs_extent_busy	*b2 =
		container_of(l2, struct xfs_extent_busy, list);
	s32 diff;

	diff = b1->agno - b2->agno;
	if (!diff)
		diff = b1->bno - b2->bno;
	return diff;
}
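
/*
 * Usage sketch (illustrative): callers that walk busy extents spanning
 * several AGs sort the list first, so each AG's extents are processed
 * under a single perag lookup:
 *
 *	list_sort(NULL, &busy_list, xfs_extent_busy_ag_cmp);
 */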