linux/kernel/watch_queue.c
// SPDX-License-Identifier: GPL-2.0
/* Watch queue and general notification mechanism, built on pipes
 *
 * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * See Documentation/watch_queue.rst
 */

#define pr_fmt(fmt) "watchq: " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/printk.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/poll.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/file.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/sched/signal.h>
#include <linux/watch_queue.h>
#include <linux/pipe_fs_i.h>

MODULE_DESCRIPTION("Watch queue");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");

#define WATCH_QUEUE_NOTE_SIZE 128
#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE)

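/*
 * For illustration, assuming a 4 KiB PAGE_SIZE: there are 4096 / 128 = 32
 * notes per page, so a queue sized for 256 notifications is backed by 8
 * pages.  Note n lives in page n / 32 at byte offset (n % 32) * 128, which
 * is how post_one_notification() locates its slot below.
 */
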
static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
                                         struct pipe_buffer *buf)
{
        struct watch_queue *wqueue = (struct watch_queue *)buf->private;
        struct page *page;
        unsigned int bit;

        /* We need to work out which note within the page this refers to, but
         * the note might have been maximum size, so merely ANDing the offset
         * off doesn't work.  OTOH, the note must've been more than zero size.
         */
        bit = buf->offset + buf->len;
        if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0)
                bit -= WATCH_QUEUE_NOTE_SIZE;
        bit /= WATCH_QUEUE_NOTE_SIZE;

        page = buf->page;
        bit += page->index;

        set_bit(bit, wqueue->notes_bitmap);

        /* Drop the page reference taken when the note was posted. */
        generic_pipe_buf_release(pipe, buf);
}

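/*
 * Worked example of the slot recovery above, with 128-byte notes: a 40-byte
 * notification written at offset 384 gives buf->offset + buf->len = 424,
 * which is not a multiple of 128 and divides down to note 3 within the page.
 * A note that ends exactly on the 128-byte boundary would give 512, so one
 * note size is subtracted before dividing, again yielding note 3.
 */
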
// No try_steal function => no stealing
#define watch_queue_pipe_buf_try_steal NULL

/* New data written to a pipe may be appended to a buffer with this type. */
static const struct pipe_buf_operations watch_queue_pipe_buf_ops = {
        .release        = watch_queue_pipe_buf_release,
        .try_steal      = watch_queue_pipe_buf_try_steal,
        .get            = generic_pipe_buf_get,
};

/*
 * Post a notification to a watch queue.
 */
static bool post_one_notification(struct watch_queue *wqueue,
                                  struct watch_notification *n)
{
        void *p;
        struct pipe_inode_info *pipe = wqueue->pipe;
        struct pipe_buffer *buf;
        struct page *page;
        unsigned int head, tail, mask, note, offset, len;
        bool done = false;

        if (!pipe)
                return false;

        spin_lock_irq(&pipe->rd_wait.lock);

        if (wqueue->defunct)
                goto out;

        mask = pipe->ring_size - 1;
        head = pipe->head;
        tail = pipe->tail;
        if (pipe_full(head, tail, pipe->ring_size))
                goto lost;

        note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes);
        if (note >= wqueue->nr_notes)
                goto lost;

        page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE];
        offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE;
        get_page(page);
        len = n->info & WATCH_INFO_LENGTH;
        p = kmap_atomic(page);
        memcpy(p + offset, n, len);
        kunmap_atomic(p);

        buf = &pipe->bufs[head & mask];
        buf->page = page;
        buf->private = (unsigned long)wqueue;
        buf->ops = &watch_queue_pipe_buf_ops;
        buf->offset = offset;
        buf->len = len;
        buf->flags = PIPE_BUF_FLAG_WHOLE;
        pipe->head = head + 1;

        if (!test_and_clear_bit(note, wqueue->notes_bitmap)) {
                spin_unlock_irq(&pipe->rd_wait.lock);
                BUG();
        }
        wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
        done = true;

out:
        spin_unlock_irq(&pipe->rd_wait.lock);
        if (done)
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        return done;

lost:
        buf = &pipe->bufs[(head - 1) & mask];
        buf->flags |= PIPE_BUF_FLAG_LOSS;
        goto out;
}

/*
 * Apply filter rules to a notification.
 */
static bool filter_watch_notification(const struct watch_filter *wf,
                                      const struct watch_notification *n)
{
        const struct watch_type_filter *wt;
        unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8;
        unsigned int st_index = n->subtype / st_bits;
        unsigned int st_bit = 1U << (n->subtype % st_bits);
        int i;

        if (!test_bit(n->type, wf->type_filter))
                return false;

        for (i = 0; i < wf->nr_filters; i++) {
                wt = &wf->filters[i];
                if (n->type == wt->type &&
                    (wt->subtype_filter[st_index] & st_bit) &&
                    (n->info & wt->info_mask) == wt->info_filter)
                        return true;
        }

        return false; /* If there is a filter, the default is to reject. */
}
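
/*
 * Worked example of the subtype check above, assuming 32-bit subtype_filter
 * words: a notification with subtype 3 selects st_index 0 and st_bit 0x8, so
 * it matches a filter entry for its type only if bit 3 of
 * wt->subtype_filter[0] is set and the masked info bits agree.
 */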

/**
 * __post_watch_notification - Post an event notification
 * @wlist: The watch list to post the event to.
 * @n: The notification record to post.
 * @cred: The creds of the process that triggered the notification.
 * @id: The ID to match on the watch.
 *
 * Post a notification of an event into a set of watch queues and let the users
 * know.
 *
 * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and
 * should be in units of sizeof(*n).
 */
void __post_watch_notification(struct watch_list *wlist,
                               struct watch_notification *n,
                               const struct cred *cred,
                               u64 id)
{
        const struct watch_filter *wf;
        struct watch_queue *wqueue;
        struct watch *watch;

        if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) {
                WARN_ON(1);
                return;
        }

        rcu_read_lock();

        hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) {
                if (watch->id != id)
                        continue;
                n->info &= ~WATCH_INFO_ID;
                n->info |= watch->info_id;

                wqueue = rcu_dereference(watch->queue);
                wf = rcu_dereference(wqueue->filter);
                if (wf && !filter_watch_notification(wf, n))
                        continue;

                if (security_post_notification(watch->cred, cred, n) < 0)
                        continue;

                post_one_notification(wqueue, n);
        }

        rcu_read_unlock();
}
EXPORT_SYMBOL(__post_watch_notification);
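
/*
 * Sketch of a kernel-side poster (hypothetical caller; WATCH_TYPE_FOO and
 * obj->watchers are illustrative names only):
 *
 *      struct watch_notification n;
 *
 *      memset(&n, 0, sizeof(n));
 *      n.type = WATCH_TYPE_FOO;
 *      n.info = watch_sizeof(n);
 *      post_watch_notification(obj->watchers, &n, current_cred(), 0);
 *
 * post_watch_notification() is the wrapper declared alongside this API in
 * <linux/watch_queue.h>; it only calls __post_watch_notification() when the
 * object actually has a watch list.
 */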

/*
 * Allocate sufficient pages to preallocate space for the requested number of
 * notifications.
 */
long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
{
        struct watch_queue *wqueue = pipe->watch_queue;
        struct page **pages;
        unsigned long *bitmap;
        unsigned long user_bufs;
        unsigned int bmsize;
        int ret, i, nr_pages;

        if (!wqueue)
                return -ENODEV;
        if (wqueue->notes)
                return -EBUSY;

        if (nr_notes < 1 ||
            nr_notes > 512) /* TODO: choose a better hard limit */
                return -EINVAL;

        nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1);
        nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE;
        user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages);

        if (nr_pages > pipe->max_usage &&
            (too_many_pipe_buffers_hard(user_bufs) ||
             too_many_pipe_buffers_soft(user_bufs)) &&
            pipe_is_unprivileged_user()) {
                ret = -EPERM;
                goto error;
        }

        ret = pipe_resize_ring(pipe, nr_notes);
        if (ret < 0)
                goto error;

        ret = -ENOMEM;
        pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
        if (!pages)
                goto error;

        for (i = 0; i < nr_pages; i++) {
                pages[i] = alloc_page(GFP_KERNEL);
                if (!pages[i])
                        goto error_p;
                pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE;
        }

        bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG;
        bmsize *= sizeof(unsigned long);
        bitmap = kmalloc(bmsize, GFP_KERNEL);
        if (!bitmap)
                goto error_p;

        memset(bitmap, 0xff, bmsize);
        wqueue->notes = pages;
        wqueue->notes_bitmap = bitmap;
        wqueue->nr_pages = nr_pages;
        wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
        return 0;

error_p:
        for (i = 0; i < nr_pages; i++)
                if (pages[i])
                        __free_page(pages[i]);
        kfree(pages);
error:
        (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted);
        return ret;
}
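
/*
 * Userspace asks for this preallocation on a notification pipe, roughly as
 * described in Documentation/watch_queue.rst (sketch; error handling
 * omitted):
 *
 *      int fds[2];
 *
 *      pipe2(fds, O_NOTIFICATION_PIPE);
 *      ioctl(fds[0], IOC_WATCH_QUEUE_SET_SIZE, 256);
 *
 * The read side fd is then handed to whatever interface attaches a watch
 * (for example keyctl(KEYCTL_WATCH_KEY, ...)) and read() returns whole
 * notification records.
 */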

/*
 * Set the filter on a watch queue.
 */
long watch_queue_set_filter(struct pipe_inode_info *pipe,
                            struct watch_notification_filter __user *_filter)
{
        struct watch_notification_type_filter *tf;
        struct watch_notification_filter filter;
        struct watch_type_filter *q;
        struct watch_filter *wfilter;
        struct watch_queue *wqueue = pipe->watch_queue;
        int ret, nr_filter = 0, i;

        if (!wqueue)
                return -ENODEV;

        if (!_filter) {
                /* Remove the old filter */
                wfilter = NULL;
                goto set;
        }

        /* Grab the user's filter specification */
        if (copy_from_user(&filter, _filter, sizeof(filter)) != 0)
                return -EFAULT;
        if (filter.nr_filters == 0 ||
            filter.nr_filters > 16 ||
            filter.__reserved != 0)
                return -EINVAL;

        tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf));
        if (IS_ERR(tf))
                return PTR_ERR(tf);

        ret = -EINVAL;
        for (i = 0; i < filter.nr_filters; i++) {
                if ((tf[i].info_filter & ~tf[i].info_mask) ||
                    tf[i].info_mask & WATCH_INFO_LENGTH)
                        goto err_filter;
                /* Ignore any unknown types */
                if (tf[i].type >= sizeof(wfilter->type_filter) * 8)
                        continue;
                nr_filter++;
        }

        /* Now we need to build the internal filter from only the relevant
         * user-specified filters.
         */
        ret = -ENOMEM;
        wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL);
        if (!wfilter)
                goto err_filter;
        wfilter->nr_filters = nr_filter;

        q = wfilter->filters;
        for (i = 0; i < filter.nr_filters; i++) {
                if (tf[i].type >= sizeof(wfilter->type_filter) * 8)
                        continue;

                q->type                 = tf[i].type;
                q->info_filter          = tf[i].info_filter;
                q->info_mask            = tf[i].info_mask;
                q->subtype_filter[0]    = tf[i].subtype_filter[0];
                __set_bit(q->type, wfilter->type_filter);
                q++;
        }

        kfree(tf);
set:
        pipe_lock(pipe);
        wfilter = rcu_replace_pointer(wqueue->filter, wfilter,
                                      lockdep_is_held(&pipe->mutex));
        pipe_unlock(pipe);
        if (wfilter)
                kfree_rcu(wfilter, rcu);
        return 0;

err_filter:
        kfree(tf);
        return ret;
}
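
/*
 * A corresponding userspace filter might be built like this (sketch, using
 * the uapi types from <linux/watch_queue.h>; error handling omitted):
 *
 *      struct watch_notification_filter *f;
 *
 *      f = calloc(1, sizeof(*f) +
 *                 sizeof(struct watch_notification_type_filter));
 *      f->nr_filters = 1;
 *      f->filters[0].type = WATCH_TYPE_KEY_NOTIFY;
 *      f->filters[0].subtype_filter[0] = ~0U;
 *      ioctl(fds[0], IOC_WATCH_QUEUE_SET_FILTER, f);
 *
 * Note that only subtype_filter[0] is copied into the kernel-side filter
 * above, so only the first 32 subtypes of a type can currently be matched.
 */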

static void __put_watch_queue(struct kref *kref)
{
        struct watch_queue *wqueue =
                container_of(kref, struct watch_queue, usage);
        struct watch_filter *wfilter;
        int i;

        for (i = 0; i < wqueue->nr_pages; i++)
                __free_page(wqueue->notes[i]);
        kfree(wqueue->notes);
        kfree(wqueue->notes_bitmap);

        wfilter = rcu_access_pointer(wqueue->filter);
        if (wfilter)
                kfree_rcu(wfilter, rcu);
        kfree_rcu(wqueue, rcu);
}

/**
 * put_watch_queue - Dispose of a ref on a watchqueue.
 * @wqueue: The watch queue to unref.
 */
void put_watch_queue(struct watch_queue *wqueue)
{
        kref_put(&wqueue->usage, __put_watch_queue);
}
EXPORT_SYMBOL(put_watch_queue);

static void free_watch(struct rcu_head *rcu)
{
        struct watch *watch = container_of(rcu, struct watch, rcu);

        put_watch_queue(rcu_access_pointer(watch->queue));
        atomic_dec(&watch->cred->user->nr_watches);
        put_cred(watch->cred);
        kfree(watch);
}

static void __put_watch(struct kref *kref)
{
        struct watch *watch = container_of(kref, struct watch, usage);

        call_rcu(&watch->rcu, free_watch);
}

/*
 * Discard a watch.
 */
static void put_watch(struct watch *watch)
{
        kref_put(&watch->usage, __put_watch);
}

/**
 * init_watch - Initialise a watch
 * @watch: The watch to initialise.
 * @wqueue: The queue to assign.
 *
 * Initialise a watch and set the watch queue.
 */
void init_watch(struct watch *watch, struct watch_queue *wqueue)
{
        kref_init(&watch->usage);
        INIT_HLIST_NODE(&watch->list_node);
        INIT_HLIST_NODE(&watch->queue_node);
        rcu_assign_pointer(watch->queue, wqueue);
}

/**
 * add_watch_to_object - Add a watch on an object to a watch list
 * @watch: The watch to add
 * @wlist: The watch list to add to
 *
 * @watch->queue must have been set (typically via init_watch()) to point to
 * the queue that notifications should be posted to.  This function attaches
 * the watch to @wlist and takes a reference on the current task's
 * credentials for it.
 *
 * The caller must pin the queue and the list both and must hold the list
 * locked against racing watch additions/removals.
 */
int add_watch_to_object(struct watch *watch, struct watch_list *wlist)
{
        struct watch_queue *wqueue = rcu_access_pointer(watch->queue);
        struct watch *w;

        hlist_for_each_entry(w, &wlist->watchers, list_node) {
                struct watch_queue *wq = rcu_access_pointer(w->queue);
                if (wqueue == wq && watch->id == w->id)
                        return -EBUSY;
        }

        watch->cred = get_current_cred();
        rcu_assign_pointer(watch->watch_list, wlist);

        if (atomic_inc_return(&watch->cred->user->nr_watches) >
            task_rlimit(current, RLIMIT_NOFILE)) {
                atomic_dec(&watch->cred->user->nr_watches);
                put_cred(watch->cred);
                return -EAGAIN;
        }

        spin_lock_bh(&wqueue->lock);
        kref_get(&wqueue->usage);
        kref_get(&watch->usage);
        hlist_add_head(&watch->queue_node, &wqueue->watches);
        spin_unlock_bh(&wqueue->lock);

        hlist_add_head(&watch->list_node, &wlist->watchers);
        return 0;
}
EXPORT_SYMBOL(add_watch_to_object);
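
/*
 * Sketch of how a watching subsystem might wire a watch up (obj, notify_id
 * and the surrounding error handling are hypothetical):
 *
 *      struct watch *watch;
 *
 *      watch = kzalloc(sizeof(*watch), GFP_KERNEL);
 *      if (!watch)
 *              return -ENOMEM;
 *
 *      init_watch(watch, wqueue);
 *      watch->id = obj->id;
 *      watch->info_id = (u32)notify_id << WATCH_INFO_ID__SHIFT;
 *
 *      spin_lock(&obj->watch_list.lock);
 *      ret = add_watch_to_object(watch, &obj->watch_list);
 *      spin_unlock(&obj->watch_list.lock);
 *
 * The ID set here is what __post_watch_notification() matches against, and
 * the info_id is OR'd into each notification delivered for this watch.
 */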

/**
 * remove_watch_from_object - Remove a watch or all watches from an object.
 * @wlist: The watch list to remove from
 * @wq: The watch queue of interest (ignored if @all is true)
 * @id: The ID of the watch to remove (ignored if @all is true)
 * @all: True to remove all watches from the object
 *
 * Remove a specific watch or all watches from an object.  A notification is
 * sent to the watcher to tell them that this happened.
 */
int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq,
                             u64 id, bool all)
{
        struct watch_notification_removal n;
        struct watch_queue *wqueue;
        struct watch *watch;
        int ret = -EBADSLT;

        rcu_read_lock();

again:
        spin_lock(&wlist->lock);
        hlist_for_each_entry(watch, &wlist->watchers, list_node) {
                if (all ||
                    (watch->id == id && rcu_access_pointer(watch->queue) == wq))
                        goto found;
        }
        spin_unlock(&wlist->lock);
        goto out;

found:
        ret = 0;
        hlist_del_init_rcu(&watch->list_node);
        rcu_assign_pointer(watch->watch_list, NULL);
        spin_unlock(&wlist->lock);

        /* We now own the reference on watch that used to belong to wlist. */

        n.watch.type = WATCH_TYPE_META;
        n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION;
        n.watch.info = watch->info_id | watch_sizeof(n.watch);
        n.id = id;
        if (id != 0)
                n.watch.info = watch->info_id | watch_sizeof(n);

        wqueue = rcu_dereference(watch->queue);

        /* We don't need the watch list lock for the next bit as RCU is
         * protecting *wqueue from deallocation.
         */
        if (wqueue) {
                post_one_notification(wqueue, &n.watch);

                spin_lock_bh(&wqueue->lock);

                if (!hlist_unhashed(&watch->queue_node)) {
                        hlist_del_init_rcu(&watch->queue_node);
                        put_watch(watch);
                }

                spin_unlock_bh(&wqueue->lock);
        }

        if (wlist->release_watch) {
                void (*release_watch)(struct watch *);

                release_watch = wlist->release_watch;
                rcu_read_unlock();
                (*release_watch)(watch);
                rcu_read_lock();
        }
        put_watch(watch);

        if (all && !hlist_empty(&wlist->watchers))
                goto again;
out:
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(remove_watch_from_object);
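
/*
 * The removal notification posted above is the last message a reader sees
 * for the watch: a record of type WATCH_TYPE_META, subtype
 * WATCH_META_REMOVAL_NOTIFICATION, carrying the watch's info_id and, when a
 * non-zero @id was supplied, the 64-bit id in the longer
 * watch_notification_removal form.
 */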

/*
 * Remove all the watches that are contributory to a queue.  This has the
 * potential to race with removal of the watches by the destruction of the
 * objects being watched or with the distribution of notifications.
 */
void watch_queue_clear(struct watch_queue *wqueue)
{
        struct watch_list *wlist;
        struct watch *watch;
        bool release;

        rcu_read_lock();
        spin_lock_bh(&wqueue->lock);

        /* Prevent new additions and prevent notifications from happening */
        wqueue->defunct = true;

        while (!hlist_empty(&wqueue->watches)) {
                watch = hlist_entry(wqueue->watches.first, struct watch, queue_node);
                hlist_del_init_rcu(&watch->queue_node);
                /* We now own a ref on the watch. */
                spin_unlock_bh(&wqueue->lock);

                /* We can't do the next bit under the queue lock as we need to
                 * get the list lock - which would cause a deadlock if someone
                 * was removing from the opposite direction at the same time or
                 * posting a notification.
                 */
                wlist = rcu_dereference(watch->watch_list);
                if (wlist) {
                        void (*release_watch)(struct watch *);

                        spin_lock(&wlist->lock);

                        release = !hlist_unhashed(&watch->list_node);
                        if (release) {
                                hlist_del_init_rcu(&watch->list_node);
                                rcu_assign_pointer(watch->watch_list, NULL);

                                /* We now own a second ref on the watch. */
                        }

                        release_watch = wlist->release_watch;
                        spin_unlock(&wlist->lock);

                        if (release) {
                                if (release_watch) {
                                        rcu_read_unlock();
                                        /* This might need to call dput(), so
                                         * we have to drop all the locks.
                                         */
                                        (*release_watch)(watch);
                                        rcu_read_lock();
                                }
                                put_watch(watch);
                        }
                }

                put_watch(watch);
                spin_lock_bh(&wqueue->lock);
        }

        spin_unlock_bh(&wqueue->lock);
        rcu_read_unlock();
}

/**
 * get_watch_queue - Get a watch queue from its file descriptor.
 * @fd: The fd to query.
 */
struct watch_queue *get_watch_queue(int fd)
{
        struct pipe_inode_info *pipe;
        struct watch_queue *wqueue = ERR_PTR(-EINVAL);
        struct fd f;

        f = fdget(fd);
        if (f.file) {
                pipe = get_pipe_info(f.file, false);
                if (pipe && pipe->watch_queue) {
                        wqueue = pipe->watch_queue;
                        kref_get(&wqueue->usage);
                }
                fdput(f);
        }

        return wqueue;
}
EXPORT_SYMBOL(get_watch_queue);

/*
 * Initialise a watch queue
 */
int watch_queue_init(struct pipe_inode_info *pipe)
{
        struct watch_queue *wqueue;

        wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL);
        if (!wqueue)
                return -ENOMEM;

        wqueue->pipe = pipe;
        kref_init(&wqueue->usage);
        spin_lock_init(&wqueue->lock);
        INIT_HLIST_HEAD(&wqueue->watches);

        pipe->watch_queue = wqueue;
        return 0;
}

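/*
 * End to end, a minimal userspace reader of the notification pipe might look
 * like the sketch below (see samples/watch_queue/ in the kernel source for a
 * fuller example; short reads and error handling are ignored here):
 *
 *      char buf[4096];
 *      ssize_t size = read(fds[0], buf, sizeof(buf));
 *      char *p = buf, *end = buf + size;
 *
 *      while (p < end) {
 *              struct watch_notification *n = (struct watch_notification *)p;
 *              size_t len = (n->info & WATCH_INFO_LENGTH) >>
 *                      WATCH_INFO_LENGTH__SHIFT;
 *
 *              if (len == 0)
 *                      break;
 *              handle(n);      // hypothetical dispatch on n->type / n->subtype
 *              p += len;
 *      }
 */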