linux/fs/notify/inode_mark.c
/*
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * fsnotify inode mark locking, lifetime and refcounting
 *
 * REFCNT:
 * The mark->refcnt tells how many "things" in the kernel currently are
 * referencing this object.  The object typically will live inside the kernel
 * with a refcnt of 2, one for each list it is on (i_list, g_list).  Any task
 * which can find this object holding the appropriate locks can take a reference,
 * and the object itself is guaranteed to survive until the reference is dropped.
 *
 * LOCKING:
 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
 * be taken in order as follows:
 *
 * entry->lock
 * group->mark_lock
 * inode->i_lock
 *
 * entry->lock protects 2 things, entry->group and entry->inode.  You must hold
 * that lock to dereference either of these things (they could be NULL even with
 * the lock held).
 *
 * group->mark_lock protects the mark_entries list anchored inside a given group
 * and each entry is hooked via the g_list.  It also loosely protects the
 * free_g_list, which when used is anchored by a private list on the stack of the
 * task which held the group->mark_lock.
 *
 * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
 * given inode and each entry is hooked via the i_list (and, loosely, the
 * free_i_list).
 *
 *
 * LIFETIME:
 * Inode marks survive between when they are added to an inode and when their
 * refcnt==0.
 *
 * The inode mark can be cleared for a number of different reasons including:
 * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
 * - The inode is being evicted from cache. (fsnotify_inode_delete)
 * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
 * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark_by_entry)
 * - The fsnotify_group associated with the mark is going away and all such marks
 *   need to be cleaned up. (fsnotify_clear_marks_by_group)
 *
 * Worst case we are given an inode and need to clean up all the marks on that
 * inode.  We take i_lock and walk the i_fsnotify_mark_entries safely.  For each
 * mark on the list we take a reference (so the mark can't disappear under us).
 * We remove that mark from the inode's list of marks and we add this mark to a
 * private list anchored on the stack using i_free_list.  At this point nothing
 * can find the mark through the inode's list of marks any more.
 *
 * We can safely and locklessly run the private list on the stack of everything
 * we just detached from the original inode.  For each mark on the private list
 * we grab the mark->lock and can thus dereference mark->group and mark->inode.
 * If we see the group and inode are not NULL we take those locks.  Now holding
 * all 3 locks we can completely stop the mark from being found by other tasks
 * in the future.  Remember, 10 things might already be referencing this mark,
 * but they had better be holding a reference.  We drop the reference we took
 * before we unhooked the mark from the inode.  When the refcnt hits 0 we can
 * free the mark.
 *
 * Freeing by group works very similarly, except we use the free_g_list.
 *
 * This has the very interesting property of being able to run concurrently
 * with any (or all) of the other teardown directions.
 */
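
/*
 * A minimal illustrative sketch of the required lock nesting described
 * above (not real code in this file; the real users are fsnotify_add_mark()
 * and fsnotify_destroy_mark_by_entry() below):
 *
 *      spin_lock(&entry->lock);
 *      spin_lock(&group->mark_lock);
 *      spin_lock(&inode->i_lock);
 *      ... entry->group, entry->inode and both lists are now stable ...
 *      spin_unlock(&inode->i_lock);
 *      spin_unlock(&group->mark_lock);
 *      spin_unlock(&entry->lock);
 */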

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/writeback.h> /* for inode_lock */

#include <asm/atomic.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

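/*
 * Take a reference on a mark.  The caller must already know the mark cannot
 * go away, i.e. it found the mark under one of the locks described above or
 * it already holds a reference of its own.
 */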
void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
{
        atomic_inc(&entry->refcnt);
}

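/*
 * Drop a reference; the final put frees the mark through the backend's
 * free_mark() callback supplied to fsnotify_init_mark().
 */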
void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
{
        if (atomic_dec_and_test(&entry->refcnt))
                entry->free_mark(entry);
}

/*
 * Recalculate the mask of events relevant to a given inode.  The caller must
 * hold inode->i_lock.  The new mask is simply the union of the masks of all
 * marks attached to the inode: e.g. one group watching FS_MODIFY and another
 * watching FS_DELETE leave i_fsnotify_mask == (FS_MODIFY | FS_DELETE).
 */
static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
{
        struct fsnotify_mark_entry *entry;
        struct hlist_node *pos;
        __u32 new_mask = 0;

        assert_spin_locked(&inode->i_lock);

        hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list)
                new_mask |= entry->mask;
        inode->i_fsnotify_mask = new_mask;
}

/*
 * Recalculate inode->i_fsnotify_mask, the mask of all FS_* event types that
 * any listener is interested in hearing about for this inode, and update the
 * flags on the inode's child dentries to match.
 */
void fsnotify_recalc_inode_mask(struct inode *inode)
{
        spin_lock(&inode->i_lock);
        fsnotify_recalc_inode_mask_locked(inode);
        spin_unlock(&inode->i_lock);

        __fsnotify_update_child_dentry_flags(inode);
}

/*
 * Any time a mark is getting freed we end up here.
 * The caller had better be holding a reference to this mark so we don't
 * actually do the final put under the entry->lock.
 */
void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
{
        struct fsnotify_group *group;
        struct inode *inode;

        spin_lock(&entry->lock);

        group = entry->group;
        inode = entry->inode;

        BUG_ON(group && !inode);
        BUG_ON(!group && inode);

        /* if !group something else already marked this to die */
        if (!group) {
                spin_unlock(&entry->lock);
                return;
        }

        /* 1 from caller and 1 for being on i_list/g_list */
        BUG_ON(atomic_read(&entry->refcnt) < 2);

        spin_lock(&group->mark_lock);
        spin_lock(&inode->i_lock);

        hlist_del_init(&entry->i_list);
        entry->inode = NULL;

        list_del_init(&entry->g_list);
        entry->group = NULL;

        fsnotify_put_mark(entry); /* for i_list and g_list */

        /*
         * this mark is now off the inode->i_fsnotify_mark_entries list and we
         * hold the inode->i_lock, so this is the perfect time to update the
         * inode->i_fsnotify_mask
         */
        fsnotify_recalc_inode_mask_locked(inode);

        spin_unlock(&inode->i_lock);
        spin_unlock(&group->mark_lock);
        spin_unlock(&entry->lock);

        /*
         * Some groups like to know that marks are being freed.  This is a
         * callback to the group function to let it know that this entry
         * is being freed.
         */
        if (group->ops->freeing_mark)
                group->ops->freeing_mark(entry, group);

        /*
         * __fsnotify_update_child_dentry_flags(inode);
         *
         * I really want to call that, but we can't, we have no idea if the
         * inode still exists the second we drop the entry->lock.
         *
         * The next time an event arrives at this inode from one of its
         * children __fsnotify_parent will see that the inode doesn't care
         * about its children and will update all of these flags then.  So
         * really this is just a lazy update (and could be a perf win...)
         */

        iput(inode);

        /*
         * it's possible that this group tried to destroy itself, but this
         * mark was simultaneously being freed via the inode.  If that's the
         * case, we finish freeing the group here.
         */
        if (unlikely(atomic_dec_and_test(&group->num_marks)))
                fsnotify_final_destroy_group(group);
}
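
/*
 * Illustrative caller pattern (a sketch only; the teardown loops below are
 * the real users): pin the mark with your own reference across the destroy
 * so the final put never happens under entry->lock:
 *
 *      fsnotify_get_mark(entry);
 *      fsnotify_destroy_mark_by_entry(entry);
 *      fsnotify_put_mark(entry);
 */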

/*
 * Given a group, destroy all of the marks associated with that group.
 */
void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
{
        struct fsnotify_mark_entry *lentry, *entry;
        LIST_HEAD(free_list);

        spin_lock(&group->mark_lock);
        list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
                list_add(&entry->free_g_list, &free_list);
                list_del_init(&entry->g_list);
                fsnotify_get_mark(entry);
        }
        spin_unlock(&group->mark_lock);
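
        /*
         * Nothing new can find these marks via the group any more.  The
         * reference taken above keeps each mark alive while we destroy it;
         * our put below may then be the final one.
         */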
        list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
                fsnotify_destroy_mark_by_entry(entry);
                fsnotify_put_mark(entry);
        }
}

/*
 * Given an inode, destroy all of the marks associated with that inode.
 */
void fsnotify_clear_marks_by_inode(struct inode *inode)
{
        struct fsnotify_mark_entry *entry, *lentry;
        struct hlist_node *pos, *n;
        LIST_HEAD(free_list);

        spin_lock(&inode->i_lock);
        hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) {
                list_add(&entry->free_i_list, &free_list);
                hlist_del_init(&entry->i_list);
                fsnotify_get_mark(entry);
        }
        spin_unlock(&inode->i_lock);

        list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) {
                fsnotify_destroy_mark_by_entry(entry);
                fsnotify_put_mark(entry);
        }
}

/*
 * Given a group and inode, find the mark associated with that combination.
 * If found, take a reference to that mark and return it; else return NULL.
 */
struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group,
                                                     struct inode *inode)
{
        struct fsnotify_mark_entry *entry;
        struct hlist_node *pos;

        assert_spin_locked(&inode->i_lock);

        hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) {
                if (entry->group == group) {
                        fsnotify_get_mark(entry);
                        return entry;
                }
        }
        return NULL;
}
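
/*
 * A sketch of the expected caller pattern (illustrative, not part of this
 * file): the caller supplies the i_lock the lookup asserts, and must drop
 * the returned reference when done:
 *
 *      spin_lock(&inode->i_lock);
 *      entry = fsnotify_find_mark_entry(group, inode);
 *      spin_unlock(&inode->i_lock);
 *      if (entry) {
 *              ... use entry ...
 *              fsnotify_put_mark(entry);
 *      }
 */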

/*
 * Nothing fancy, just initialize lists and locks and counters.
 */
void fsnotify_init_mark(struct fsnotify_mark_entry *entry,
                        void (*free_mark)(struct fsnotify_mark_entry *entry))
{
        spin_lock_init(&entry->lock);
        atomic_set(&entry->refcnt, 1);
        INIT_HLIST_NODE(&entry->i_list);
        entry->group = NULL;
        entry->mask = 0;
        entry->inode = NULL;
        entry->free_mark = free_mark;
}

/*
 * Attach an initialized mark entry to a given group and inode.
 * The fsnotify backend uses these marks to determine which event types
 * should be delivered to which group for which inodes.
 */
int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
                      struct fsnotify_group *group, struct inode *inode)
{
        struct fsnotify_mark_entry *lentry;
        int ret = 0;

        inode = igrab(inode);
        if (unlikely(!inode))
                return -EINVAL;

        /*
         * LOCKING ORDER!!!!
         * entry->lock
         * group->mark_lock
         * inode->i_lock
         */
        spin_lock(&entry->lock);
        spin_lock(&group->mark_lock);
        spin_lock(&inode->i_lock);

        lentry = fsnotify_find_mark_entry(group, inode);
        if (!lentry) {
                entry->group = group;
                entry->inode = inode;

                hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries);
                list_add(&entry->g_list, &group->mark_entries);

                fsnotify_get_mark(entry); /* for i_list and g_list */

                atomic_inc(&group->num_marks);

                fsnotify_recalc_inode_mask_locked(inode);
        }

        spin_unlock(&inode->i_lock);
        spin_unlock(&group->mark_lock);
        spin_unlock(&entry->lock);

        if (lentry) {
                ret = -EEXIST;
                iput(inode);
                fsnotify_put_mark(lentry);
        } else {
                __fsnotify_update_child_dentry_flags(inode);
        }

        return ret;
}
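
/*
 * Illustrative backend usage (a hedged sketch; my_mark_cache and
 * my_free_mark are hypothetical backend-side names, not part of this file):
 *
 *      entry = kmem_cache_alloc(my_mark_cache, GFP_KERNEL);
 *      if (!entry)
 *              return -ENOMEM;
 *      fsnotify_init_mark(entry, my_free_mark);
 *      entry->mask = FS_MODIFY | FS_DELETE;
 *      ret = fsnotify_add_mark(entry, group, inode);
 *      if (ret == -EEXIST) {
 *              ... a mark from this group already exists on the inode;
 *                  find it under i_lock and update it instead ...
 *              fsnotify_put_mark(entry);
 *      }
 */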

/**
 * fsnotify_unmount_inodes - an sb is unmounting.  Handle any watched inodes.
 * @list: list of inodes being unmounted (sb->s_inodes)
 *
 * Called with inode_lock held, protecting the unmounting super block's list
 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at
 * bay.  We temporarily drop inode_lock, however, and CAN block.
 */
void fsnotify_unmount_inodes(struct list_head *list)
{
        struct inode *inode, *next_i, *need_iput = NULL;

        list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
                struct inode *need_iput_tmp;

                /*
                 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
                 * I_WILL_FREE, or I_NEW which is fine because by that point
                 * the inode cannot have any associated watches.
                 */
                if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
                        continue;

                /*
                 * If i_count is zero, the inode cannot have any watches and
                 * doing an __iget/iput with MS_ACTIVE clear would actually
                 * evict all inodes with zero i_count from icache which is
                 * unnecessarily violent and may in fact be illegal to do.
                 */
                if (!atomic_read(&inode->i_count))
                        continue;

                need_iput_tmp = need_iput;
                need_iput = NULL;

                /* In case fsnotify_inode_delete() drops a reference. */
                if (inode != need_iput_tmp)
                        __iget(inode);
                else
                        need_iput_tmp = NULL;

                /* In case the dropping of a reference would nuke next_i. */
                if ((&next_i->i_sb_list != list) &&
                    atomic_read(&next_i->i_count) &&
                    !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) {
                        __iget(next_i);
                        need_iput = next_i;
                }

                /*
                 * We can safely drop inode_lock here because we hold
                 * references on both inode and next_i.  Also no new inodes
                 * will be added since the umount has begun.  Finally,
                 * iprune_mutex keeps shrink_icache_memory() away.
                 */
                spin_unlock(&inode_lock);

                if (need_iput_tmp)
                        iput(need_iput_tmp);

                /* for each watch, send FS_UNMOUNT and then remove it */
                fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);

                fsnotify_inode_delete(inode);

                iput(inode);

                spin_lock(&inode_lock);
        }
}