LXR linux/fs/xfs/xfs

   1/*
   2 * Copyright (c) 2006-2007 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_bmap_btree.h"
  20#include "xfs_inum.h"
  21#include "xfs_dinode.h"
  22#include "xfs_inode.h"
  23#include "xfs_ag.h"
  24#include "xfs_log.h"
  25#include "xfs_trans.h"
  26#include "xfs_sb.h"
  27#include "xfs_mount.h"
  28#include "xfs_bmap.h"
  29#include "xfs_alloc.h"
  30#include "xfs_utils.h"
  31#include "xfs_mru_cache.h"
  32#include "xfs_filestream.h"
  33#include "xfs_trace.h"
  34
  35#ifdef XFS_FILESTREAMS_TRACE
  36
  37ktrace_t *xfs_filestreams_trace_buf;
  38
  39STATIC void
  40xfs_filestreams_trace(
  41        xfs_mount_t     *mp,    /* mount point */
  42        int             type,   /* type of trace */
  43        const char      *func,  /* source function */
  44        int             line,   /* source line number */
  45        __psunsigned_t  arg0,
  46        __psunsigned_t  arg1,
  47        __psunsigned_t  arg2,
  48        __psunsigned_t  arg3,
  49        __psunsigned_t  arg4,
  50        __psunsigned_t  arg5)
  51{
  52        ktrace_enter(xfs_filestreams_trace_buf,
  53                (void *)(__psint_t)(type | (line << 16)),
  54                (void *)func,
  55                (void *)(__psunsigned_t)current_pid(),
  56                (void *)mp,
  57                (void *)(__psunsigned_t)arg0,
  58                (void *)(__psunsigned_t)arg1,
  59                (void *)(__psunsigned_t)arg2,
  60                (void *)(__psunsigned_t)arg3,
  61                (void *)(__psunsigned_t)arg4,
  62                (void *)(__psunsigned_t)arg5,
  63                NULL, NULL, NULL, NULL, NULL, NULL);
  64}
  65
  66#define TRACE0(mp,t)                    TRACE6(mp,t,0,0,0,0,0,0)
  67#define TRACE1(mp,t,a0)                 TRACE6(mp,t,a0,0,0,0,0,0)
  68#define TRACE2(mp,t,a0,a1)              TRACE6(mp,t,a0,a1,0,0,0,0)
  69#define TRACE3(mp,t,a0,a1,a2)           TRACE6(mp,t,a0,a1,a2,0,0,0)
  70#define TRACE4(mp,t,a0,a1,a2,a3)        TRACE6(mp,t,a0,a1,a2,a3,0,0)
  71#define TRACE5(mp,t,a0,a1,a2,a3,a4)     TRACE6(mp,t,a0,a1,a2,a3,a4,0)
  72#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
  73        xfs_filestreams_trace(mp, t, __func__, __LINE__, \
  74                                (__psunsigned_t)a0, (__psunsigned_t)a1, \
  75                                (__psunsigned_t)a2, (__psunsigned_t)a3, \
  76                                (__psunsigned_t)a4, (__psunsigned_t)a5)
  77
  78#define TRACE_AG_SCAN(mp, ag, ag2) \
  79                TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2);
  80#define TRACE_AG_PICK1(mp, max_ag, maxfree) \
  81                TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree);
  82#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \
  83                TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \
  84                         cnt, free, scan, flag)
  85#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \
  86                TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2)
  87#define TRACE_FREE(mp, ip, pip, ag, cnt) \
  88                TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt)
  89#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \
  90                TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt)
  91#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \
  92                TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt)
  93#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \
  94                TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt)
  95#define TRACE_ORPHAN(mp, ip, ag) \
  96                TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag);
  97
  98
  99#else
 100#define TRACE_AG_SCAN(mp, ag, ag2)
 101#define TRACE_AG_PICK1(mp, max_ag, maxfree)
 102#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag)
 103#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2)
 104#define TRACE_FREE(mp, ip, pip, ag, cnt)
 105#define TRACE_LOOKUP(mp, ip, pip, ag, cnt)
 106#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt)
 107#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt)
 108#define TRACE_ORPHAN(mp, ip, ag)
 109#endif
 110
 111static kmem_zone_t *item_zone;
 112
 113/*
 114 * Structure for associating a file or a directory with an allocation group.
 115 * The parent directory pointer is only needed for files, but since there will
 116 * generally be vastly more files than directories in the cache, using the same
 117 * data structure simplifies the code with very little memory overhead.
 118 */
 119typedef struct fstrm_item
 120{
 121        xfs_agnumber_t  ag;     /* AG currently in use for the file/directory. */
 122        xfs_inode_t     *ip;    /* inode self-pointer. */
 123        xfs_inode_t     *pip;   /* Parent directory inode pointer. */
 124} fstrm_item_t;
 125
 126/*
 127 * Allocation group filestream associations are tracked with per-ag atomic
 128 * counters.  These counters allow _xfs_filestream_pick_ag() to tell whether a
 129 * particular AG already has active filestreams associated with it. The mount
 130 * point's m_peraglock is used to protect these counters from per-ag array
 131 * re-allocation during a growfs operation.  When xfs_growfs_data_private() is
 132 * about to reallocate the array, it calls xfs_filestream_flush() with the
 133 * m_peraglock held in write mode.
 134 *
 135 * Since xfs_mru_cache_flush() guarantees that all the free functions for all
 136 * the cache elements have finished executing before it returns, it's safe for
 137 * the free functions to use the atomic counters without m_peraglock protection.
 138 * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
 139 * whether it was called with the m_peraglock held in read mode, write mode or
 140 * not held at all.  The race condition this addresses is the following:
 141 *
 142 *  - The work queue scheduler fires and pulls a filestream directory cache
 143 *    element off the LRU end of the cache for deletion, then gets pre-empted.
 144 *  - A growfs operation grabs the m_peraglock in write mode, flushes all the
 145 *    remaining items from the cache and reallocates the mount point's per-ag
 146 *    array, resetting all the counters to zero.
 147 *  - The work queue thread resumes and calls the free function for the element
 148 *    it started cleaning up earlier.  In the process it decrements the
 149 *    filestreams counter for an AG that now has no references.
 150 *
 151 * With a shrinkfs feature, the above scenario could panic the system.
 152 *
 153 * All other uses of the following macros should be protected by either the
 154 * m_peraglock held in read mode, or the cache's internal locking exposed by the
 155 * interval between a call to xfs_mru_cache_lookup() and a call to
 156 * xfs_mru_cache_done().  In addition, the m_peraglock must be held in read mode
 157 * when new elements are added to the cache.
 158 *
 159 * Combined, these locking rules ensure that no associations will ever exist in
 160 * the cache that reference per-ag array elements that have since been
 161 * reallocated.
 162 */
 163static int
 164xfs_filestream_peek_ag(
 165        xfs_mount_t     *mp,
 166        xfs_agnumber_t  agno)
 167{
 168        struct xfs_perag *pag;
 169        int             ret;
 170
 171        pag = xfs_perag_get(mp, agno);
 172        ret = atomic_read(&pag->pagf_fstrms);
 173        xfs_perag_put(pag);
 174        return ret;
 175}
 176
 177static int
 178xfs_filestream_get_ag(
 179        xfs_mount_t     *mp,
 180        xfs_agnumber_t  agno)
 181{
 182        struct xfs_perag *pag;
 183        int             ret;
 184
 185        pag = xfs_perag_get(mp, agno);
 186        ret = atomic_inc_return(&pag->pagf_fstrms);
 187        xfs_perag_put(pag);
 188        return ret;
 189}
 190
 191static void
 192xfs_filestream_put_ag(
 193        xfs_mount_t     *mp,
 194        xfs_agnumber_t  agno)
 195{
 196        struct xfs_perag *pag;
 197
 198        pag = xfs_perag_get(mp, agno);
 199        atomic_dec(&pag->pagf_fstrms);
 200        xfs_perag_put(pag);
 201}
 202
 203/*
 204 * Scan the AGs starting at startag looking for an AG that isn't in use and has
 205 * at least minlen blocks free.
 206 */
 207static int
 208_xfs_filestream_pick_ag(
 209        xfs_mount_t     *mp,
 210        xfs_agnumber_t  startag,
 211        xfs_agnumber_t  *agp,
 212        int             flags,
 213        xfs_extlen_t    minlen)
 214{
 215        int             streams, max_streams;
 216        int             err, trylock, nscan;
 217        xfs_extlen_t    longest, free, minfree, maxfree = 0;
 218        xfs_agnumber_t  ag, max_ag = NULLAGNUMBER;
 219        struct xfs_perag *pag;
 220
 221        /* 2% of an AG's blocks must be free for it to be chosen. */
 222        minfree = mp->m_sb.sb_agblocks / 50;
 223
 224        ag = startag;
 225        *agp = NULLAGNUMBER;
 226
 227        /* For the first pass, don't sleep trying to init the per-AG. */
 228        trylock = XFS_ALLOC_FLAG_TRYLOCK;
 229
 230        for (nscan = 0; 1; nscan++) {
 231                pag = xfs_perag_get(mp, ag);
 232                TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms));
 233
 234                if (!pag->pagf_init) {
 235                        err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
 236                        if (err && !trylock) {
 237                                xfs_perag_put(pag);
 238                                return err;
 239                        }
 240                }
 241
 242                /* Might fail sometimes during the 1st pass with trylock set. */
 243                if (!pag->pagf_init)
 244                        goto next_ag;
 245
 246                /* Keep track of the AG with the most free blocks. */
 247                if (pag->pagf_freeblks > maxfree) {
 248                        maxfree = pag->pagf_freeblks;
 249                        max_streams = atomic_read(&pag->pagf_fstrms);
 250                        max_ag = ag;
 251                }
 252
 253                /*
 254                 * The AG reference count does two things: it enforces mutual
 255                 * exclusion when examining the suitability of an AG in this
 256                 * loop, and it guards against two filestreams being established
 257                 * in the same AG as each other.
 258                 */
 259                if (xfs_filestream_get_ag(mp, ag) > 1) {
 260                        xfs_filestream_put_ag(mp, ag);
 261                        goto next_ag;
 262                }
 263
 264                longest = xfs_alloc_longest_free_extent(mp, pag);
 265                if (((minlen && longest >= minlen) ||
 266                     (!minlen && pag->pagf_freeblks >= minfree)) &&
 267                    (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
 268                     (flags & XFS_PICK_LOWSPACE))) {
 269
 270                        /* Break out, retaining the reference on the AG. */
 271                        free = pag->pagf_freeblks;
 272                        streams = atomic_read(&pag->pagf_fstrms);
 273                        xfs_perag_put(pag);
 274                        *agp = ag;
 275                        break;
 276                }
 277
 278                /* Drop the reference on this AG, it's not usable. */
 279                xfs_filestream_put_ag(mp, ag);
 280next_ag:
 281                xfs_perag_put(pag);
 282                /* Move to the next AG, wrapping to AG 0 if necessary. */
 283                if (++ag >= mp->m_sb.sb_agcount)
 284                        ag = 0;
 285
 286                /* If a full pass of the AGs hasn't been done yet, continue. */
 287                if (ag != startag)
 288                        continue;
 289
 290                /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */
 291                if (trylock != 0) {
 292                        trylock = 0;
 293                        continue;
 294                }
 295
 296                /* Finally, if lowspace wasn't set, set it for the 3rd pass. */
 297                if (!(flags & XFS_PICK_LOWSPACE)) {
 298                        flags |= XFS_PICK_LOWSPACE;
 299                        continue;
 300                }
 301
 302                /*
 303                 * Take the AG with the most free space, regardless of whether
 304                 * it's already in use by another filestream.
 305                 */
 306                if (max_ag != NULLAGNUMBER) {
 307                        xfs_filestream_get_ag(mp, max_ag);
 308                        TRACE_AG_PICK1(mp, max_ag, maxfree);
 309                        streams = max_streams;
 310                        free = maxfree;
 311                        *agp = max_ag;
 312                        break;
 313                }
 314
 315                /* take AG 0 if none matched */
 316                TRACE_AG_PICK1(mp, max_ag, maxfree);
 317                *agp = 0;
 318                return 0;
 319        }
 320
 321        TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
 322
 323        return 0;
 324}
 325
 326/*
 327 * Set the allocation group number for a file or a directory, updating inode
 328 * references and per-AG references as appropriate.
 329 */
 330static int
 331_xfs_filestream_update_ag(
 332        xfs_inode_t     *ip,
 333        xfs_inode_t     *pip,
 334        xfs_agnumber_t  ag)
 335{
 336        int             err = 0;
 337        xfs_mount_t     *mp;
 338        xfs_mru_cache_t *cache;
 339        fstrm_item_t    *item;
 340        xfs_agnumber_t  old_ag;
 341        xfs_inode_t     *old_pip;
 342
 343        /*
 344         * Either ip is a regular file and pip is a directory, or ip is a
 345         * directory and pip is NULL.
 346         */
 347        ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip &&
 348                       (pip->i_d.di_mode & S_IFDIR)) ||
 349                      ((ip->i_d.di_mode & S_IFDIR) && !pip)));
 350
 351        mp = ip->i_mount;
 352        cache = mp->m_filestream;
 353
 354        item = xfs_mru_cache_lookup(cache, ip->i_ino);
 355        if (item) {
 356                ASSERT(item->ip == ip);
 357                old_ag = item->ag;
 358                item->ag = ag;
 359                old_pip = item->pip;
 360                item->pip = pip;
 361                xfs_mru_cache_done(cache);
 362
 363                /*
 364                 * If the AG has changed, drop the old ref and take a new one,
 365                 * effectively transferring the reference from old to new AG.
 366                 */
 367                if (ag != old_ag) {
 368                        xfs_filestream_put_ag(mp, old_ag);
 369                        xfs_filestream_get_ag(mp, ag);
 370                }
 371
 372                /*
 373                 * If ip is a file and its pip has changed, drop the old ref and
 374                 * take a new one.
 375                 */
 376                if (pip && pip != old_pip) {
 377                        IRELE(old_pip);
 378                        IHOLD(pip);
 379                }
 380
 381                TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag),
 382                                ag, xfs_filestream_peek_ag(mp, ag));
 383                return 0;
 384        }
 385
 386        item = kmem_zone_zalloc(item_zone, KM_MAYFAIL);
 387        if (!item)
 388                return ENOMEM;
 389
 390        item->ag = ag;
 391        item->ip = ip;
 392        item->pip = pip;
 393
 394        err = xfs_mru_cache_insert(cache, ip->i_ino, item);
 395        if (err) {
 396                kmem_zone_free(item_zone, item);
 397                return err;
 398        }
 399
 400        /* Take a reference on the AG. */
 401        xfs_filestream_get_ag(mp, ag);
 402
 403        /*
 404         * Take a reference on the inode itself regardless of whether it's a
 405         * regular file or a directory.
 406         */
 407        IHOLD(ip);
 408
 409        /*
 410         * In the case of a regular file, take a reference on the parent inode
 411         * as well to ensure it remains in-core.
 412         */
 413        if (pip)
 414                IHOLD(pip);
 415
 416        TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag),
 417                        ag, xfs_filestream_peek_ag(mp, ag));
 418
 419        return 0;
 420}
 421
 422/* xfs_fstrm_free_func(): callback for freeing cached stream items. */
 423STATIC void
 424xfs_fstrm_free_func(
 425        unsigned long   ino,
 426        void            *data)
 427{
 428        fstrm_item_t    *item  = (fstrm_item_t *)data;
 429        xfs_inode_t     *ip = item->ip;
 430
 431        ASSERT(ip->i_ino == ino);
 432
 433        xfs_iflags_clear(ip, XFS_IFILESTREAM);
 434
 435        /* Drop the reference taken on the AG when the item was added. */
 436        xfs_filestream_put_ag(ip->i_mount, item->ag);
 437
 438        TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
 439                xfs_filestream_peek_ag(ip->i_mount, item->ag));
 440
 441        /*
 442         * _xfs_filestream_update_ag() always takes a reference on the inode
 443         * itself, whether it's a file or a directory.  Release it here.
 444         * This can result in the inode being freed and so we must
 445         * not hold any inode locks when freeing filesstreams objects
 446         * otherwise we can deadlock here.
 447         */
 448        IRELE(ip);
 449
 450        /*
 451         * In the case of a regular file, _xfs_filestream_update_ag() also
 452         * takes a ref on the parent inode to keep it in-core.  Release that
 453         * too.
 454         */
 455        if (item->pip)
 456                IRELE(item->pip);
 457
 458        /* Finally, free the memory allocated for the item. */
 459        kmem_zone_free(item_zone, item);
 460}
 461
 462/*
 463 * xfs_filestream_init() is called at xfs initialisation time to set up the
 464 * memory zone that will be used for filestream data structure allocation.
 465 */
 466int
 467xfs_filestream_init(void)
 468{
 469        item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
 470        if (!item_zone)
 471                return -ENOMEM;
 472
 473        return 0;
 474}
 475
 476/*
 477 * xfs_filestream_uninit() is called at xfs termination time to destroy the
 478 * memory zone that was used for filestream data structure allocation.
 479 */
 480void
 481xfs_filestream_uninit(void)
 482{
 483        kmem_zone_destroy(item_zone);
 484}
 485
 486/*
 487 * xfs_filestream_mount() is called when a file system is mounted with the
 488 * filestream option.  It is responsible for allocating the data structures
 489 * needed to track the new file system's file streams.
 490 */
 491int
 492xfs_filestream_mount(
 493        xfs_mount_t     *mp)
 494{
 495        int             err;
 496        unsigned int    lifetime, grp_count;
 497
 498        /*
 499         * The filestream timer tunable is currently fixed within the range of
 500         * one second to four minutes, with five seconds being the default.  The
 501         * group count is somewhat arbitrary, but it'd be nice to adhere to the
 502         * timer tunable to within about 10 percent.  This requires at least 10
 503         * groups.
 504         */
 505        lifetime  = xfs_fstrm_centisecs * 10;
 506        grp_count = 10;
 507
 508        err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count,
 509                             xfs_fstrm_free_func);
 510
 511        return err;
 512}
 513
 514/*
 515 * xfs_filestream_unmount() is called when a file system that was mounted with
 516 * the filestream option is unmounted.  It drains the data structures created
 517 * to track the file system's file streams and frees all the memory that was
 518 * allocated.
 519 */
 520void
 521xfs_filestream_unmount(
 522        xfs_mount_t     *mp)
 523{
 524        xfs_mru_cache_destroy(mp->m_filestream);
 525}
 526
 527/*
 528 * Return the AG of the filestream the file or directory belongs to, or
 529 * NULLAGNUMBER otherwise.
 530 */
 531xfs_agnumber_t
 532xfs_filestream_lookup_ag(
 533        xfs_inode_t     *ip)
 534{
 535        xfs_mru_cache_t *cache;
 536        fstrm_item_t    *item;
 537        xfs_agnumber_t  ag;
 538        int             ref;
 539
 540        if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) {
 541                ASSERT(0);
 542                return NULLAGNUMBER;
 543        }
 544
 545        cache = ip->i_mount->m_filestream;
 546        item = xfs_mru_cache_lookup(cache, ip->i_ino);
 547        if (!item) {
 548                TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0);
 549                return NULLAGNUMBER;
 550        }
 551
 552        ASSERT(ip == item->ip);
 553        ag = item->ag;
 554        ref = xfs_filestream_peek_ag(ip->i_mount, ag);
 555        xfs_mru_cache_done(cache);
 556
 557        TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref);
 558        return ag;
 559}
 560
 561/*
 562 * xfs_filestream_associate() should only be called to associate a regular file
 563 * with its parent directory.  Calling it with a child directory isn't
 564 * appropriate because filestreams don't apply to entire directory hierarchies.
 565 * Creating a file in a child directory of an existing filestream directory
 566 * starts a new filestream with its own allocation group association.
 567 *
 568 * Returns < 0 on error, 0 if successful association occurred, > 0 if
 569 * we failed to get an association because of locking issues.
 570 */
 571int
 572xfs_filestream_associate(
 573        xfs_inode_t     *pip,
 574        xfs_inode_t     *ip)
 575{
 576        xfs_mount_t     *mp;
 577        xfs_mru_cache_t *cache;
 578        fstrm_item_t    *item;
 579        xfs_agnumber_t  ag, rotorstep, startag;
 580        int             err = 0;
 581
 582        ASSERT(pip->i_d.di_mode & S_IFDIR);
 583        ASSERT(ip->i_d.di_mode & S_IFREG);
 584        if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG))
 585                return -EINVAL;
 586
 587        mp = pip->i_mount;
 588        cache = mp->m_filestream;
 589
 590        /*
 591         * We have a problem, Houston.
 592         *
 593         * Taking the iolock here violates inode locking order - we already
 594         * hold the ilock. Hence if we block getting this lock we may never
 595         * wake. Unfortunately, that means if we can't get the lock, we're
 596         * screwed in terms of getting a stream association - we can't spin
 597         * waiting for the lock because someone else is waiting on the lock we
 598         * hold and we cannot drop that as we are in a transaction here.
 599         *
 600         * Lucky for us, this inversion is not a problem because it's a
 601         * directory inode that we are trying to lock here.
 602         *
 603         * So, if we can't get the iolock without sleeping then just give up
 604         */
 605        if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
 606                return 1;
 607
 608        /* If the parent directory is already in the cache, use its AG. */
 609        item = xfs_mru_cache_lookup(cache, pip->i_ino);
 610        if (item) {
 611                ASSERT(item->ip == pip);
 612                ag = item->ag;
 613                xfs_mru_cache_done(cache);
 614
 615                TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag));
 616                err = _xfs_filestream_update_ag(ip, pip, ag);
 617
 618                goto exit;
 619        }
 620
 621        /*
 622         * Set the starting AG using the rotor for inode32, otherwise
 623         * use the directory inode's AG.
 624         */
 625        if (mp->m_flags & XFS_MOUNT_32BITINODES) {
 626                rotorstep = xfs_rotorstep;
 627                startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
 628                mp->m_agfrotor = (mp->m_agfrotor + 1) %
 629                                 (mp->m_sb.sb_agcount * rotorstep);
 630        } else
 631                startag = XFS_INO_TO_AGNO(mp, pip->i_ino);
 632
 633        /* Pick a new AG for the parent inode starting at startag. */
 634        err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0);
 635        if (err || ag == NULLAGNUMBER)
 636                goto exit_did_pick;
 637
 638        /* Associate the parent inode with the AG. */
 639        err = _xfs_filestream_update_ag(pip, NULL, ag);
 640        if (err)
 641                goto exit_did_pick;
 642
 643        /* Associate the file inode with the AG. */
 644        err = _xfs_filestream_update_ag(ip, pip, ag);
 645        if (err)
 646                goto exit_did_pick;
 647
 648        TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag));
 649
 650exit_did_pick:
 651        /*
 652         * If _xfs_filestream_pick_ag() returned a valid AG, remove the
 653         * reference it took on it, since the file and directory will have taken
 654         * their own now if they were successfully cached.
 655         */
 656        if (ag != NULLAGNUMBER)
 657                xfs_filestream_put_ag(mp, ag);
 658
 659exit:
 660        xfs_iunlock(pip, XFS_IOLOCK_EXCL);
 661        return -err;
 662}
 663
 664/*
 665 * Pick a new allocation group for the current file and its file stream.  This
 666 * function is called by xfs_bmap_filestreams() with the mount point's per-ag
 667 * lock held.
 668 */
 669int
 670xfs_filestream_new_ag(
 671        xfs_bmalloca_t  *ap,
 672        xfs_agnumber_t  *agp)
 673{
 674        int             flags, err;
 675        xfs_inode_t     *ip, *pip = NULL;
 676        xfs_mount_t     *mp;
 677        xfs_mru_cache_t *cache;
 678        xfs_extlen_t    minlen;
 679        fstrm_item_t    *dir, *file;
 680        xfs_agnumber_t  ag = NULLAGNUMBER;
 681
 682        ip = ap->ip;
 683        mp = ip->i_mount;
 684        cache = mp->m_filestream;
 685        minlen = ap->alen;
 686        *agp = NULLAGNUMBER;
 687
 688        /*
 689         * Look for the file in the cache, removing it if it's found.  Doing
 690         * this allows it to be held across the dir lookup that follows.
 691         */
 692        file = xfs_mru_cache_remove(cache, ip->i_ino);
 693        if (file) {
 694                ASSERT(ip == file->ip);
 695
 696                /* Save the file's parent inode and old AG number for later. */
 697                pip = file->pip;
 698                ag = file->ag;
 699
 700                /* Look for the file's directory in the cache. */
 701                dir = xfs_mru_cache_lookup(cache, pip->i_ino);
 702                if (dir) {
 703                        ASSERT(pip == dir->ip);
 704
 705                        /*
 706                         * If the directory has already moved on to a new AG,
 707                         * use that AG as the new AG for the file. Don't
 708                         * forget to twiddle the AG refcounts to match the
 709                         * movement.
 710                         */
 711                        if (dir->ag != file->ag) {
 712                                xfs_filestream_put_ag(mp, file->ag);
 713                                xfs_filestream_get_ag(mp, dir->ag);
 714                                *agp = file->ag = dir->ag;
 715                        }
 716
 717                        xfs_mru_cache_done(cache);
 718                }
 719
 720                /*
 721                 * Put the file back in the cache.  If this fails, the free
 722                 * function needs to be called to tidy up in the same way as if
 723                 * the item had simply expired from the cache.
 724                 */
 725                err = xfs_mru_cache_insert(cache, ip->i_ino, file);
 726                if (err) {
 727                        xfs_fstrm_free_func(ip->i_ino, file);
 728                        return err;
 729                }
 730
 731                /*
 732                 * If the file's AG was moved to the directory's new AG, there's
 733                 * nothing more to be done.
 734                 */
 735                if (*agp != NULLAGNUMBER) {
 736                        TRACE_MOVEAG(mp, ip, pip,
 737                                        ag, xfs_filestream_peek_ag(mp, ag),
 738                                        *agp, xfs_filestream_peek_ag(mp, *agp));
 739                        return 0;
 740                }
 741        }
 742
 743        /*
 744         * If the file's parent directory is known, take its iolock in exclusive
 745         * mode to prevent two sibling files from racing each other to migrate
 746         * themselves and their parent to different AGs.
 747         *
 748         * Note that we lock the parent directory iolock inside the child
 749         * iolock here.  That's fine as we never hold both parent and child
 750         * iolock in any other place.  This is different from the ilock,
 751         * which requires locking of the child after the parent for namespace
 752         * operations.
 753         */
 754        if (pip)
 755                xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
 756
 757        /*
 758         * A new AG needs to be found for the file.  If the file's parent
 759         * directory is also known, it will be moved to the new AG as well to
 760         * ensure that files created inside it in future use the new AG.
 761         */
 762        ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
 763        flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
 764                (ap->low ? XFS_PICK_LOWSPACE : 0);
 765
 766        err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen);
 767        if (err || *agp == NULLAGNUMBER)
 768                goto exit;
 769
 770        /*
 771         * If the file wasn't found in the file cache, then its parent directory
 772         * inode isn't known.  For this to have happened, the file must either
 773         * be pre-existing, or it was created long enough ago that its cache
 774         * entry has expired.  This isn't the sort of usage that the filestreams
 775         * allocator is trying to optimise, so there's no point trying to track
 776         * its new AG somehow in the filestream data structures.
 777         */
 778        if (!pip) {
 779                TRACE_ORPHAN(mp, ip, *agp);
 780                goto exit;
 781        }
 782
 783        /* Associate the parent inode with the AG. */
 784        err = _xfs_filestream_update_ag(pip, NULL, *agp);
 785        if (err)
 786                goto exit;
 787
 788        /* Associate the file inode with the AG. */
 789        err = _xfs_filestream_update_ag(ip, pip, *agp);
 790        if (err)
 791                goto exit;
 792
 793        TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0,
 794                        *agp, xfs_filestream_peek_ag(mp, *agp));
 795
 796exit:
 797        /*
 798         * If _xfs_filestream_pick_ag() returned a valid AG, remove the
 799         * reference it took on it, since the file and directory will have taken
 800         * their own now if they were successfully cached.
 801         */
 802        if (*agp != NULLAGNUMBER)
 803                xfs_filestream_put_ag(mp, *agp);
 804        else
 805                *agp = 0;
 806
 807        if (pip)
 808                xfs_iunlock(pip, XFS_IOLOCK_EXCL);
 809
 810        return err;
 811}
 812
 813/*
 814 * Remove an association between an inode and a filestream object.
 815 * Typically this is done on last close of an unlinked file.
 816 */
 817void
 818xfs_filestream_deassociate(
 819        xfs_inode_t     *ip)
 820{
 821        xfs_mru_cache_t *cache = ip->i_mount->m_filestream;
 822
 823        xfs_mru_cache_delete(cache, ip->i_ino);
 824}
 825