linux/fs/notify/fanotify/fanotify_user.c
   1// SPDX-License-Identifier: GPL-2.0
   2#include <linux/fanotify.h>
   3#include <linux/fcntl.h>
   4#include <linux/fdtable.h>
   5#include <linux/file.h>
   6#include <linux/fs.h>
   7#include <linux/anon_inodes.h>
   8#include <linux/fsnotify_backend.h>
   9#include <linux/init.h>
  10#include <linux/mount.h>
  11#include <linux/namei.h>
  12#include <linux/poll.h>
  13#include <linux/security.h>
  14#include <linux/syscalls.h>
  15#include <linux/slab.h>
  16#include <linux/types.h>
  17#include <linux/uaccess.h>
  18#include <linux/compat.h>
  19#include <linux/sched/signal.h>
  20#include <linux/memcontrol.h>
  21#include <linux/statfs.h>
  22#include <linux/exportfs.h>
  23
  24#include <asm/ioctls.h>
  25
  26#include "../../mount.h"
  27#include "../fdinfo.h"
  28#include "fanotify.h"
  29
  30#define FANOTIFY_DEFAULT_MAX_EVENTS     16384
  31#define FANOTIFY_OLD_DEFAULT_MAX_MARKS  8192
  32#define FANOTIFY_DEFAULT_MAX_GROUPS     128
  33
  34/*
   35 * The legacy fanotify marks limit (8192) is per group and we introduced a tunable
  36 * limit of marks per user, similar to inotify.  Effectively, the legacy limit
  37 * of fanotify marks per user is <max marks per group> * <max groups per user>.
  38 * This default limit (1M) also happens to match the increased limit of inotify
  39 * max_user_watches since v5.10.
  40 */
  41#define FANOTIFY_DEFAULT_MAX_USER_MARKS \
  42        (FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)
  43
  44/*
  45 * Most of the memory cost of adding an inode mark is pinning the marked inode.
  46 * The size of the filesystem inode struct is not uniform across filesystems,
  47 * so double the size of a VFS inode is used as a conservative approximation.
  48 */
  49#define INODE_MARK_COST (2 * sizeof(struct inode))
  50
  51/* configurable via /proc/sys/fs/fanotify/ */
  52static int fanotify_max_queued_events __read_mostly;
  53
  54#ifdef CONFIG_SYSCTL
  55
  56#include <linux/sysctl.h>
  57
  58static long ft_zero = 0;
  59static long ft_int_max = INT_MAX;
  60
  61struct ctl_table fanotify_table[] = {
  62        {
  63                .procname       = "max_user_groups",
  64                .data   = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
  65                .maxlen         = sizeof(long),
  66                .mode           = 0644,
  67                .proc_handler   = proc_doulongvec_minmax,
  68                .extra1         = &ft_zero,
  69                .extra2         = &ft_int_max,
  70        },
  71        {
  72                .procname       = "max_user_marks",
  73                .data   = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
  74                .maxlen         = sizeof(long),
  75                .mode           = 0644,
  76                .proc_handler   = proc_doulongvec_minmax,
  77                .extra1         = &ft_zero,
  78                .extra2         = &ft_int_max,
  79        },
  80        {
  81                .procname       = "max_queued_events",
  82                .data           = &fanotify_max_queued_events,
  83                .maxlen         = sizeof(int),
  84                .mode           = 0644,
  85                .proc_handler   = proc_dointvec_minmax,
  86                .extra1         = SYSCTL_ZERO
  87        },
  88        { }
  89};
  90#endif /* CONFIG_SYSCTL */
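
     /*
      * The table above is exposed as /proc/sys/fs/fanotify/max_user_groups,
      * /proc/sys/fs/fanotify/max_user_marks and
      * /proc/sys/fs/fanotify/max_queued_events.  Illustrative tuning from a
      * shell (the value 32768 is only an example):
      *
      *   # echo 32768 > /proc/sys/fs/fanotify/max_queued_events
      */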
  91
  92/*
  93 * All flags that may be specified in parameter event_f_flags of fanotify_init.
  94 *
  95 * Internal and external open flags are stored together in field f_flags of
  96 * struct file. Only external open flags shall be allowed in event_f_flags.
  97 * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
  98 * excluded.
  99 */
 100#define FANOTIFY_INIT_ALL_EVENT_F_BITS                          ( \
 101                O_ACCMODE       | O_APPEND      | O_NONBLOCK    | \
 102                __O_SYNC        | O_DSYNC       | O_CLOEXEC     | \
 103                O_LARGEFILE     | O_NOATIME     )
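
     /*
      * For illustration, a userspace caller is expected to pass only bits
      * from the set above in event_f_flags, e.g.:
      *
      *   int fan_fd = fanotify_init(FAN_CLASS_NOTIF | FAN_CLOEXEC,
      *                              O_RDONLY | O_CLOEXEC | O_LARGEFILE);
      *
      * Any other event_f_flags bit is rejected with -EINVAL by
      * fanotify_init() below.
      */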
 104
 105extern const struct fsnotify_ops fanotify_fsnotify_ops;
 106
 107struct kmem_cache *fanotify_mark_cache __read_mostly;
 108struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
 109struct kmem_cache *fanotify_path_event_cachep __read_mostly;
 110struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
 111
 112#define FANOTIFY_EVENT_ALIGN 4
 113#define FANOTIFY_FID_INFO_HDR_LEN \
 114        (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
 115#define FANOTIFY_PIDFD_INFO_HDR_LEN \
 116        sizeof(struct fanotify_event_info_pidfd)
 117
 118static int fanotify_fid_info_len(int fh_len, int name_len)
 119{
 120        int info_len = fh_len;
 121
 122        if (name_len)
 123                info_len += name_len + 1;
 124
 125        return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
 126                       FANOTIFY_EVENT_ALIGN);
 127}
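
     /*
      * Worked example, assuming a build where FANOTIFY_FID_INFO_HDR_LEN is
      * 20 bytes (12-byte fanotify_event_info_fid plus 8-byte file_handle):
      * for fh_len == 8 and name_len == 5 the returned record length is
      * roundup(20 + 8 + 5 + 1, FANOTIFY_EVENT_ALIGN) == 36 bytes.
      */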
 128
 129static int fanotify_event_info_len(unsigned int info_mode,
 130                                   struct fanotify_event *event)
 131{
 132        struct fanotify_info *info = fanotify_event_info(event);
 133        int dir_fh_len = fanotify_event_dir_fh_len(event);
 134        int fh_len = fanotify_event_object_fh_len(event);
 135        int info_len = 0;
 136        int dot_len = 0;
 137
 138        if (dir_fh_len) {
 139                info_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
 140        } else if ((info_mode & FAN_REPORT_NAME) &&
 141                   (event->mask & FAN_ONDIR)) {
 142                /*
 143                 * With group flag FAN_REPORT_NAME, if name was not recorded in
 144                 * event on a directory, we will report the name ".".
 145                 */
 146                dot_len = 1;
 147        }
 148
 149        if (info_mode & FAN_REPORT_PIDFD)
 150                info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
 151
 152        if (fh_len)
 153                info_len += fanotify_fid_info_len(fh_len, dot_len);
 154
 155        return info_len;
 156}
 157
 158/*
  159 * Remove a hashed event from the merge hash table.
 160 */
 161static void fanotify_unhash_event(struct fsnotify_group *group,
 162                                  struct fanotify_event *event)
 163{
 164        assert_spin_locked(&group->notification_lock);
 165
 166        pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
 167                 group, event, fanotify_event_hash_bucket(group, event));
 168
 169        if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
 170                return;
 171
 172        hlist_del_init(&event->merge_list);
 173}
 174
 175/*
  176 * Get a fanotify notification event if one exists and is small
  177 * enough to fit in "count". Return an error pointer if the count
  178 * is not large enough. When a permission event is dequeued, its state is
 179 * updated accordingly.
 180 */
 181static struct fanotify_event *get_one_event(struct fsnotify_group *group,
 182                                            size_t count)
 183{
 184        size_t event_size = FAN_EVENT_METADATA_LEN;
 185        struct fanotify_event *event = NULL;
 186        struct fsnotify_event *fsn_event;
 187        unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
 188
 189        pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
 190
 191        spin_lock(&group->notification_lock);
 192        fsn_event = fsnotify_peek_first_event(group);
 193        if (!fsn_event)
 194                goto out;
 195
 196        event = FANOTIFY_E(fsn_event);
 197        if (info_mode)
 198                event_size += fanotify_event_info_len(info_mode, event);
 199
 200        if (event_size > count) {
 201                event = ERR_PTR(-EINVAL);
 202                goto out;
 203        }
 204
 205        /*
 206         * Held the notification_lock the whole time, so this is the
 207         * same event we peeked above.
 208         */
 209        fsnotify_remove_first_event(group);
 210        if (fanotify_is_perm_event(event->mask))
 211                FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
 212        if (fanotify_is_hashed_event(event->mask))
 213                fanotify_unhash_event(group, event);
 214out:
 215        spin_unlock(&group->notification_lock);
 216        return event;
 217}
 218
 219static int create_fd(struct fsnotify_group *group, struct path *path,
 220                     struct file **file)
 221{
 222        int client_fd;
 223        struct file *new_file;
 224
 225        client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
 226        if (client_fd < 0)
 227                return client_fd;
 228
  229        /*
  230         * We need a new file handle for the userspace program so it can
  231         * read the file even if it was originally opened O_WRONLY.
  232         */
 233        new_file = dentry_open(path,
 234                               group->fanotify_data.f_flags | FMODE_NONOTIFY,
 235                               current_cred());
 236        if (IS_ERR(new_file)) {
 237                /*
  238                 * We still send an event even if we can't open the file.  This
  239                 * can happen when, say, tasks are gone and we try to open their
  240                 * /proc files, or we try to open a WRONLY file like in sysfs.
  241                 * We just send the errno to userspace since there isn't much
  242                 * else we can do.
 243                 */
 244                put_unused_fd(client_fd);
 245                client_fd = PTR_ERR(new_file);
 246        } else {
 247                *file = new_file;
 248        }
 249
 250        return client_fd;
 251}
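
     /*
      * The descriptor handed to the listener is opened with FMODE_NONOTIFY,
      * so operations on it do not generate further fanotify events and feed
      * back into the notification queue.
      */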
 252
 253/*
  254 * Finish processing of a permission event by setting it to ANSWERED state
  255 * and dropping group->notification_lock.
 256 */
 257static void finish_permission_event(struct fsnotify_group *group,
 258                                    struct fanotify_perm_event *event,
 259                                    unsigned int response)
 260                                    __releases(&group->notification_lock)
 261{
 262        bool destroy = false;
 263
 264        assert_spin_locked(&group->notification_lock);
 265        event->response = response;
 266        if (event->state == FAN_EVENT_CANCELED)
 267                destroy = true;
 268        else
 269                event->state = FAN_EVENT_ANSWERED;
 270        spin_unlock(&group->notification_lock);
 271        if (destroy)
 272                fsnotify_destroy_event(group, &event->fae.fse);
 273}
 274
 275static int process_access_response(struct fsnotify_group *group,
 276                                   struct fanotify_response *response_struct)
 277{
 278        struct fanotify_perm_event *event;
 279        int fd = response_struct->fd;
 280        int response = response_struct->response;
 281
 282        pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
 283                 fd, response);
 284        /*
  285         * Make sure the response is valid; if it is invalid we do nothing and
  286         * either userspace can send a valid response or we will clean it up
  287         * after the timeout.
 288         */
 289        switch (response & ~FAN_AUDIT) {
 290        case FAN_ALLOW:
 291        case FAN_DENY:
 292                break;
 293        default:
 294                return -EINVAL;
 295        }
 296
 297        if (fd < 0)
 298                return -EINVAL;
 299
 300        if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
 301                return -EINVAL;
 302
 303        spin_lock(&group->notification_lock);
 304        list_for_each_entry(event, &group->fanotify_data.access_list,
 305                            fae.fse.list) {
 306                if (event->fd != fd)
 307                        continue;
 308
 309                list_del_init(&event->fae.fse.list);
 310                finish_permission_event(group, event, response);
 311                wake_up(&group->fanotify_data.access_waitq);
 312                return 0;
 313        }
 314        spin_unlock(&group->notification_lock);
 315
 316        return -ENOENT;
 317}
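
     /*
      * The userspace half of this handshake is a plain write() of struct
      * fanotify_response to the fanotify fd.  Illustrative sketch, where
      * "metadata" is the permission event the listener just read:
      *
      *   struct fanotify_response resp = {
      *           .fd = metadata->fd,
      *           .response = FAN_ALLOW,
      *   };
      *   write(fan_fd, &resp, sizeof(resp));
      *
      * .response may instead be FAN_DENY, optionally OR-ed with FAN_AUDIT
      * if the group was created with FAN_ENABLE_AUDIT.
      */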
 318
 319static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
 320                                 int info_type, const char *name,
 321                                 size_t name_len,
 322                                 char __user *buf, size_t count)
 323{
 324        struct fanotify_event_info_fid info = { };
 325        struct file_handle handle = { };
 326        unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
 327        size_t fh_len = fh ? fh->len : 0;
 328        size_t info_len = fanotify_fid_info_len(fh_len, name_len);
 329        size_t len = info_len;
 330
 331        pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
 332                 __func__, fh_len, name_len, info_len, count);
 333
 334        if (!fh_len)
 335                return 0;
 336
 337        if (WARN_ON_ONCE(len < sizeof(info) || len > count))
 338                return -EFAULT;
 339
 340        /*
 341         * Copy event info fid header followed by variable sized file handle
 342         * and optionally followed by variable sized filename.
 343         */
 344        switch (info_type) {
 345        case FAN_EVENT_INFO_TYPE_FID:
 346        case FAN_EVENT_INFO_TYPE_DFID:
 347                if (WARN_ON_ONCE(name_len))
 348                        return -EFAULT;
 349                break;
 350        case FAN_EVENT_INFO_TYPE_DFID_NAME:
 351                if (WARN_ON_ONCE(!name || !name_len))
 352                        return -EFAULT;
 353                break;
 354        default:
 355                return -EFAULT;
 356        }
 357
 358        info.hdr.info_type = info_type;
 359        info.hdr.len = len;
 360        info.fsid = *fsid;
 361        if (copy_to_user(buf, &info, sizeof(info)))
 362                return -EFAULT;
 363
 364        buf += sizeof(info);
 365        len -= sizeof(info);
 366        if (WARN_ON_ONCE(len < sizeof(handle)))
 367                return -EFAULT;
 368
 369        handle.handle_type = fh->type;
 370        handle.handle_bytes = fh_len;
 371        if (copy_to_user(buf, &handle, sizeof(handle)))
 372                return -EFAULT;
 373
 374        buf += sizeof(handle);
 375        len -= sizeof(handle);
 376        if (WARN_ON_ONCE(len < fh_len))
 377                return -EFAULT;
 378
 379        /*
 380         * For an inline fh and inline file name, copy through stack to exclude
 381         * the copy from usercopy hardening protections.
 382         */
 383        fh_buf = fanotify_fh_buf(fh);
 384        if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
 385                memcpy(bounce, fh_buf, fh_len);
 386                fh_buf = bounce;
 387        }
 388        if (copy_to_user(buf, fh_buf, fh_len))
 389                return -EFAULT;
 390
 391        buf += fh_len;
 392        len -= fh_len;
 393
 394        if (name_len) {
 395                /* Copy the filename with terminating null */
 396                name_len++;
 397                if (WARN_ON_ONCE(len < name_len))
 398                        return -EFAULT;
 399
 400                if (copy_to_user(buf, name, name_len))
 401                        return -EFAULT;
 402
 403                buf += name_len;
 404                len -= name_len;
 405        }
 406
 407        /* Pad with 0's */
 408        WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
 409        if (len > 0 && clear_user(buf, len))
 410                return -EFAULT;
 411
 412        return info_len;
 413}
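
     /*
      * The record written above is laid out as:
      *
      *   struct fanotify_event_info_fid   (info_type, len, fsid)
      *   struct file_handle               (handle_bytes, handle_type)
      *   file handle bytes                (fh_len)
      *   null-terminated name, if any     (name_len + 1)
      *   zero padding                     (up to FANOTIFY_EVENT_ALIGN - 1)
      *
      * info.hdr.len covers the whole record, padding included.
      */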
 414
 415static int copy_pidfd_info_to_user(int pidfd,
 416                                   char __user *buf,
 417                                   size_t count)
 418{
 419        struct fanotify_event_info_pidfd info = { };
 420        size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
 421
 422        if (WARN_ON_ONCE(info_len > count))
 423                return -EFAULT;
 424
 425        info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
 426        info.hdr.len = info_len;
 427        info.pidfd = pidfd;
 428
 429        if (copy_to_user(buf, &info, info_len))
 430                return -EFAULT;
 431
 432        return info_len;
 433}
 434
 435static int copy_info_records_to_user(struct fanotify_event *event,
 436                                     struct fanotify_info *info,
 437                                     unsigned int info_mode, int pidfd,
 438                                     char __user *buf, size_t count)
 439{
 440        int ret, total_bytes = 0, info_type = 0;
 441        unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
 442        unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
 443
 444        /*
 445         * Event info records order is as follows: dir fid + name, child fid.
 446         */
 447        if (fanotify_event_dir_fh_len(event)) {
 448                info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
 449                                             FAN_EVENT_INFO_TYPE_DFID;
 450                ret = copy_fid_info_to_user(fanotify_event_fsid(event),
 451                                            fanotify_info_dir_fh(info),
 452                                            info_type,
 453                                            fanotify_info_name(info),
 454                                            info->name_len, buf, count);
 455                if (ret < 0)
 456                        return ret;
 457
 458                buf += ret;
 459                count -= ret;
 460                total_bytes += ret;
 461        }
 462
 463        if (fanotify_event_object_fh_len(event)) {
 464                const char *dot = NULL;
 465                int dot_len = 0;
 466
 467                if (fid_mode == FAN_REPORT_FID || info_type) {
 468                        /*
 469                         * With only group flag FAN_REPORT_FID only type FID is
 470                         * reported. Second info record type is always FID.
 471                         */
 472                        info_type = FAN_EVENT_INFO_TYPE_FID;
 473                } else if ((fid_mode & FAN_REPORT_NAME) &&
 474                           (event->mask & FAN_ONDIR)) {
 475                        /*
 476                         * With group flag FAN_REPORT_NAME, if name was not
 477                         * recorded in an event on a directory, report the name
 478                         * "." with info type DFID_NAME.
 479                         */
 480                        info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
 481                        dot = ".";
 482                        dot_len = 1;
 483                } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
 484                           (event->mask & FAN_ONDIR)) {
 485                        /*
 486                         * With group flag FAN_REPORT_DIR_FID, a single info
 487                         * record has type DFID for directory entry modification
 488                         * event and for event on a directory.
 489                         */
 490                        info_type = FAN_EVENT_INFO_TYPE_DFID;
 491                } else {
 492                        /*
 493                         * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
 494                         * a single info record has type FID for event on a
 495                         * non-directory, when there is no directory to report.
 496                         * For example, on FAN_DELETE_SELF event.
 497                         */
 498                        info_type = FAN_EVENT_INFO_TYPE_FID;
 499                }
 500
 501                ret = copy_fid_info_to_user(fanotify_event_fsid(event),
 502                                            fanotify_event_object_fh(event),
 503                                            info_type, dot, dot_len,
 504                                            buf, count);
 505                if (ret < 0)
 506                        return ret;
 507
 508                buf += ret;
 509                count -= ret;
 510                total_bytes += ret;
 511        }
 512
 513        if (pidfd_mode) {
 514                ret = copy_pidfd_info_to_user(pidfd, buf, count);
 515                if (ret < 0)
 516                        return ret;
 517
 518                buf += ret;
 519                count -= ret;
 520                total_bytes += ret;
 521        }
 522
 523        return total_bytes;
 524}
 525
 526static ssize_t copy_event_to_user(struct fsnotify_group *group,
 527                                  struct fanotify_event *event,
 528                                  char __user *buf, size_t count)
 529{
 530        struct fanotify_event_metadata metadata;
 531        struct path *path = fanotify_event_path(event);
 532        struct fanotify_info *info = fanotify_event_info(event);
 533        unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
 534        unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
 535        struct file *f = NULL;
 536        int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD;
 537
 538        pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 539
 540        metadata.event_len = FAN_EVENT_METADATA_LEN +
 541                                fanotify_event_info_len(info_mode, event);
 542        metadata.metadata_len = FAN_EVENT_METADATA_LEN;
 543        metadata.vers = FANOTIFY_METADATA_VERSION;
 544        metadata.reserved = 0;
 545        metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
 546        metadata.pid = pid_vnr(event->pid);
 547        /*
 548         * For an unprivileged listener, event->pid can be used to identify the
 549         * events generated by the listener process itself, without disclosing
 550         * the pids of other processes.
 551         */
 552        if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
 553            task_tgid(current) != event->pid)
 554                metadata.pid = 0;
 555
 556        /*
 557         * For now, fid mode is required for an unprivileged listener and
 558         * fid mode does not report fd in events.  Keep this check anyway
 559         * for safety in case fid mode requirement is relaxed in the future
 560         * to allow unprivileged listener to get events with no fd and no fid.
 561         */
 562        if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
 563            path && path->mnt && path->dentry) {
 564                fd = create_fd(group, path, &f);
 565                if (fd < 0)
 566                        return fd;
 567        }
 568        metadata.fd = fd;
 569
 570        if (pidfd_mode) {
 571                /*
 572                 * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
  573                 * exclusion is ever lifted. At the time of incorporating pidfd
 574                 * support within fanotify, the pidfd API only supported the
 575                 * creation of pidfds for thread-group leaders.
 576                 */
 577                WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));
 578
 579                /*
 580                 * The PIDTYPE_TGID check for an event->pid is performed
 581                 * preemptively in an attempt to catch out cases where the event
 582                 * listener reads events after the event generating process has
 583                 * already terminated. Report FAN_NOPIDFD to the event listener
 584                 * in those cases, with all other pidfd creation errors being
 585                 * reported as FAN_EPIDFD.
 586                 */
 587                if (metadata.pid == 0 ||
 588                    !pid_has_task(event->pid, PIDTYPE_TGID)) {
 589                        pidfd = FAN_NOPIDFD;
 590                } else {
 591                        pidfd = pidfd_create(event->pid, 0);
 592                        if (pidfd < 0)
 593                                pidfd = FAN_EPIDFD;
 594                }
 595        }
 596
 597        ret = -EFAULT;
 598        /*
 599         * Sanity check copy size in case get_one_event() and
 600         * event_len sizes ever get out of sync.
 601         */
 602        if (WARN_ON_ONCE(metadata.event_len > count))
 603                goto out_close_fd;
 604
 605        if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
 606                goto out_close_fd;
 607
 608        buf += FAN_EVENT_METADATA_LEN;
 609        count -= FAN_EVENT_METADATA_LEN;
 610
 611        if (fanotify_is_perm_event(event->mask))
 612                FANOTIFY_PERM(event)->fd = fd;
 613
 614        if (f)
 615                fd_install(fd, f);
 616
 617        if (info_mode) {
 618                ret = copy_info_records_to_user(event, info, info_mode, pidfd,
 619                                                buf, count);
 620                if (ret < 0)
 621                        goto out_close_fd;
 622        }
 623
 624        return metadata.event_len;
 625
 626out_close_fd:
 627        if (fd != FAN_NOFD) {
 628                put_unused_fd(fd);
 629                fput(f);
 630        }
 631
 632        if (pidfd >= 0)
 633                close_fd(pidfd);
 634
 635        return ret;
 636}
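
     /*
      * Each event copied to the read buffer is thus a struct
      * fanotify_event_metadata optionally followed by fid/dfid/dfid_name
      * and pidfd info records, with metadata.event_len giving the total
      * size of the event including those records.
      */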
 637
  638/* fanotify userspace file descriptor functions */
 639static __poll_t fanotify_poll(struct file *file, poll_table *wait)
 640{
 641        struct fsnotify_group *group = file->private_data;
 642        __poll_t ret = 0;
 643
 644        poll_wait(file, &group->notification_waitq, wait);
 645        spin_lock(&group->notification_lock);
 646        if (!fsnotify_notify_queue_is_empty(group))
 647                ret = EPOLLIN | EPOLLRDNORM;
 648        spin_unlock(&group->notification_lock);
 649
 650        return ret;
 651}
 652
 653static ssize_t fanotify_read(struct file *file, char __user *buf,
 654                             size_t count, loff_t *pos)
 655{
 656        struct fsnotify_group *group;
 657        struct fanotify_event *event;
 658        char __user *start;
 659        int ret;
 660        DEFINE_WAIT_FUNC(wait, woken_wake_function);
 661
 662        start = buf;
 663        group = file->private_data;
 664
 665        pr_debug("%s: group=%p\n", __func__, group);
 666
 667        add_wait_queue(&group->notification_waitq, &wait);
 668        while (1) {
 669                /*
 670                 * User can supply arbitrarily large buffer. Avoid softlockups
 671                 * in case there are lots of available events.
 672                 */
 673                cond_resched();
 674                event = get_one_event(group, count);
 675                if (IS_ERR(event)) {
 676                        ret = PTR_ERR(event);
 677                        break;
 678                }
 679
 680                if (!event) {
 681                        ret = -EAGAIN;
 682                        if (file->f_flags & O_NONBLOCK)
 683                                break;
 684
 685                        ret = -ERESTARTSYS;
 686                        if (signal_pending(current))
 687                                break;
 688
 689                        if (start != buf)
 690                                break;
 691
 692                        wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
 693                        continue;
 694                }
 695
 696                ret = copy_event_to_user(group, event, buf, count);
 697                if (unlikely(ret == -EOPENSTALE)) {
 698                        /*
 699                         * We cannot report events with stale fd so drop it.
 700                         * Setting ret to 0 will continue the event loop and
 701                         * do the right thing if there are no more events to
 702                         * read (i.e. return bytes read, -EAGAIN or wait).
 703                         */
 704                        ret = 0;
 705                }
 706
 707                /*
 708                 * Permission events get queued to wait for response.  Other
 709                 * events can be destroyed now.
 710                 */
 711                if (!fanotify_is_perm_event(event->mask)) {
 712                        fsnotify_destroy_event(group, &event->fse);
 713                } else {
 714                        if (ret <= 0) {
 715                                spin_lock(&group->notification_lock);
 716                                finish_permission_event(group,
 717                                        FANOTIFY_PERM(event), FAN_DENY);
 718                                wake_up(&group->fanotify_data.access_waitq);
 719                        } else {
 720                                spin_lock(&group->notification_lock);
 721                                list_add_tail(&event->fse.list,
 722                                        &group->fanotify_data.access_list);
 723                                spin_unlock(&group->notification_lock);
 724                        }
 725                }
 726                if (ret < 0)
 727                        break;
 728                buf += ret;
 729                count -= ret;
 730        }
 731        remove_wait_queue(&group->notification_waitq, &wait);
 732
 733        if (start != buf && ret != -EFAULT)
 734                ret = buf - start;
 735        return ret;
 736}
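
     /*
      * Illustrative userspace consumer of this read() interface, using the
      * FAN_EVENT_OK()/FAN_EVENT_NEXT() helpers from the uapi header;
      * handle_event() stands in for application logic:
      *
      *   char buf[4096];
      *   ssize_t len = read(fan_fd, buf, sizeof(buf));
      *   struct fanotify_event_metadata *md = (void *)buf;
      *
      *   while (FAN_EVENT_OK(md, len)) {
      *           if (md->vers != FANOTIFY_METADATA_VERSION)
      *                   break;
      *           handle_event(md);
      *           if (md->fd >= 0)
      *                   close(md->fd);
      *           md = FAN_EVENT_NEXT(md, len);
      *   }
      */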
 737
 738static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 739{
 740        struct fanotify_response response = { .fd = -1, .response = -1 };
 741        struct fsnotify_group *group;
 742        int ret;
 743
 744        if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
 745                return -EINVAL;
 746
 747        group = file->private_data;
 748
 749        if (count < sizeof(response))
 750                return -EINVAL;
 751
 752        count = sizeof(response);
 753
 754        pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
 755
 756        if (copy_from_user(&response, buf, count))
 757                return -EFAULT;
 758
 759        ret = process_access_response(group, &response);
 760        if (ret < 0)
 761                count = ret;
 762
 763        return count;
 764}
 765
 766static int fanotify_release(struct inode *ignored, struct file *file)
 767{
 768        struct fsnotify_group *group = file->private_data;
 769        struct fsnotify_event *fsn_event;
 770
 771        /*
  772         * Stop new events from arriving in the notification queue.  Since
  773         * userspace cannot use the fanotify fd anymore, no event can enter or
  774         * leave access_list by now either.
 775         */
 776        fsnotify_group_stop_queueing(group);
 777
 778        /*
 779         * Process all permission events on access_list and notification queue
 780         * and simulate reply from userspace.
 781         */
 782        spin_lock(&group->notification_lock);
 783        while (!list_empty(&group->fanotify_data.access_list)) {
 784                struct fanotify_perm_event *event;
 785
 786                event = list_first_entry(&group->fanotify_data.access_list,
 787                                struct fanotify_perm_event, fae.fse.list);
 788                list_del_init(&event->fae.fse.list);
 789                finish_permission_event(group, event, FAN_ALLOW);
 790                spin_lock(&group->notification_lock);
 791        }
 792
 793        /*
 794         * Destroy all non-permission events. For permission events just
 795         * dequeue them and set the response. They will be freed once the
 796         * response is consumed and fanotify_get_response() returns.
 797         */
 798        while ((fsn_event = fsnotify_remove_first_event(group))) {
 799                struct fanotify_event *event = FANOTIFY_E(fsn_event);
 800
 801                if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
 802                        spin_unlock(&group->notification_lock);
 803                        fsnotify_destroy_event(group, fsn_event);
 804                } else {
 805                        finish_permission_event(group, FANOTIFY_PERM(event),
 806                                                FAN_ALLOW);
 807                }
 808                spin_lock(&group->notification_lock);
 809        }
 810        spin_unlock(&group->notification_lock);
 811
  812        /* Response for all permission events is set, wake up waiters */
 813        wake_up(&group->fanotify_data.access_waitq);
 814
 815        /* matches the fanotify_init->fsnotify_alloc_group */
 816        fsnotify_destroy_group(group);
 817
 818        return 0;
 819}
 820
 821static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 822{
 823        struct fsnotify_group *group;
 824        struct fsnotify_event *fsn_event;
 825        void __user *p;
 826        int ret = -ENOTTY;
 827        size_t send_len = 0;
 828
 829        group = file->private_data;
 830
 831        p = (void __user *) arg;
 832
 833        switch (cmd) {
 834        case FIONREAD:
 835                spin_lock(&group->notification_lock);
 836                list_for_each_entry(fsn_event, &group->notification_list, list)
 837                        send_len += FAN_EVENT_METADATA_LEN;
 838                spin_unlock(&group->notification_lock);
 839                ret = put_user(send_len, (int __user *) p);
 840                break;
 841        }
 842
 843        return ret;
 844}
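
     /*
      * Note that FIONREAD counts FAN_EVENT_METADATA_LEN per queued event and
      * ignores variable size info records, so for fid/pidfd reporting groups
      * it is only a lower bound on what a subsequent read() may return.
      */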
 845
 846static const struct file_operations fanotify_fops = {
 847        .show_fdinfo    = fanotify_show_fdinfo,
 848        .poll           = fanotify_poll,
 849        .read           = fanotify_read,
 850        .write          = fanotify_write,
 851        .fasync         = NULL,
 852        .release        = fanotify_release,
 853        .unlocked_ioctl = fanotify_ioctl,
 854        .compat_ioctl   = compat_ptr_ioctl,
 855        .llseek         = noop_llseek,
 856};
 857
 858static int fanotify_find_path(int dfd, const char __user *filename,
 859                              struct path *path, unsigned int flags, __u64 mask,
 860                              unsigned int obj_type)
 861{
 862        int ret;
 863
 864        pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
 865                 dfd, filename, flags);
 866
 867        if (filename == NULL) {
 868                struct fd f = fdget(dfd);
 869
 870                ret = -EBADF;
 871                if (!f.file)
 872                        goto out;
 873
 874                ret = -ENOTDIR;
 875                if ((flags & FAN_MARK_ONLYDIR) &&
 876                    !(S_ISDIR(file_inode(f.file)->i_mode))) {
 877                        fdput(f);
 878                        goto out;
 879                }
 880
 881                *path = f.file->f_path;
 882                path_get(path);
 883                fdput(f);
 884        } else {
 885                unsigned int lookup_flags = 0;
 886
 887                if (!(flags & FAN_MARK_DONT_FOLLOW))
 888                        lookup_flags |= LOOKUP_FOLLOW;
 889                if (flags & FAN_MARK_ONLYDIR)
 890                        lookup_flags |= LOOKUP_DIRECTORY;
 891
 892                ret = user_path_at(dfd, filename, lookup_flags, path);
 893                if (ret)
 894                        goto out;
 895        }
 896
 897        /* you can only watch an inode if you have read permissions on it */
 898        ret = path_permission(path, MAY_READ);
 899        if (ret) {
 900                path_put(path);
 901                goto out;
 902        }
 903
 904        ret = security_path_notify(path, mask, obj_type);
 905        if (ret)
 906                path_put(path);
 907
 908out:
 909        return ret;
 910}
 911
 912static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
 913                                            __u32 mask, unsigned int flags,
 914                                            __u32 umask, int *destroy)
 915{
 916        __u32 oldmask = 0;
 917
 918        /* umask bits cannot be removed by user */
 919        mask &= ~umask;
 920        spin_lock(&fsn_mark->lock);
 921        if (!(flags & FAN_MARK_IGNORED_MASK)) {
 922                oldmask = fsn_mark->mask;
 923                fsn_mark->mask &= ~mask;
 924        } else {
 925                fsn_mark->ignored_mask &= ~mask;
 926        }
 927        /*
 928         * We need to keep the mark around even if remaining mask cannot
  929         * result in any events (e.g. mask == FAN_ONDIR) to support incremental
 930         * changes to the mask.
 931         * Destroy mark when only umask bits remain.
 932         */
 933        *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
 934        spin_unlock(&fsn_mark->lock);
 935
 936        return mask & oldmask;
 937}
 938
 939static int fanotify_remove_mark(struct fsnotify_group *group,
 940                                fsnotify_connp_t *connp, __u32 mask,
 941                                unsigned int flags, __u32 umask)
 942{
 943        struct fsnotify_mark *fsn_mark = NULL;
 944        __u32 removed;
 945        int destroy_mark;
 946
 947        mutex_lock(&group->mark_mutex);
 948        fsn_mark = fsnotify_find_mark(connp, group);
 949        if (!fsn_mark) {
 950                mutex_unlock(&group->mark_mutex);
 951                return -ENOENT;
 952        }
 953
 954        removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 955                                                 umask, &destroy_mark);
 956        if (removed & fsnotify_conn_mask(fsn_mark->connector))
 957                fsnotify_recalc_mask(fsn_mark->connector);
 958        if (destroy_mark)
 959                fsnotify_detach_mark(fsn_mark);
 960        mutex_unlock(&group->mark_mutex);
 961        if (destroy_mark)
 962                fsnotify_free_mark(fsn_mark);
 963
 964        /* matches the fsnotify_find_mark() */
 965        fsnotify_put_mark(fsn_mark);
 966        return 0;
 967}
 968
 969static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 970                                         struct vfsmount *mnt, __u32 mask,
 971                                         unsigned int flags, __u32 umask)
 972{
 973        return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
 974                                    mask, flags, umask);
 975}
 976
 977static int fanotify_remove_sb_mark(struct fsnotify_group *group,
 978                                   struct super_block *sb, __u32 mask,
 979                                   unsigned int flags, __u32 umask)
 980{
 981        return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask,
 982                                    flags, umask);
 983}
 984
 985static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 986                                      struct inode *inode, __u32 mask,
 987                                      unsigned int flags, __u32 umask)
 988{
 989        return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask,
 990                                    flags, umask);
 991}
 992
 993static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 994                                       __u32 mask,
 995                                       unsigned int flags)
 996{
 997        __u32 oldmask = -1;
 998
 999        spin_lock(&fsn_mark->lock);
1000        if (!(flags & FAN_MARK_IGNORED_MASK)) {
1001                oldmask = fsn_mark->mask;
1002                fsn_mark->mask |= mask;
1003        } else {
1004                fsn_mark->ignored_mask |= mask;
1005                if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
1006                        fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
1007        }
1008        spin_unlock(&fsn_mark->lock);
1009
1010        return mask & ~oldmask;
1011}
1012
1013static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
1014                                                   fsnotify_connp_t *connp,
1015                                                   unsigned int type,
1016                                                   __kernel_fsid_t *fsid)
1017{
1018        struct ucounts *ucounts = group->fanotify_data.ucounts;
1019        struct fsnotify_mark *mark;
1020        int ret;
1021
1022        /*
 1023         * Enforce the per-user marks limit in all containing user namespaces.
 1024         * A group with FAN_UNLIMITED_MARKS does not contribute to the mark count
1025         * in the limited groups account.
1026         */
1027        if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
1028            !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
1029                return ERR_PTR(-ENOSPC);
1030
1031        mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
1032        if (!mark) {
1033                ret = -ENOMEM;
1034                goto out_dec_ucounts;
1035        }
1036
1037        fsnotify_init_mark(mark, group);
1038        ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
1039        if (ret) {
1040                fsnotify_put_mark(mark);
1041                goto out_dec_ucounts;
1042        }
1043
1044        return mark;
1045
1046out_dec_ucounts:
1047        if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
1048                dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
1049        return ERR_PTR(ret);
1050}
1051
1052
1053static int fanotify_add_mark(struct fsnotify_group *group,
1054                             fsnotify_connp_t *connp, unsigned int type,
1055                             __u32 mask, unsigned int flags,
1056                             __kernel_fsid_t *fsid)
1057{
1058        struct fsnotify_mark *fsn_mark;
1059        __u32 added;
1060
1061        mutex_lock(&group->mark_mutex);
1062        fsn_mark = fsnotify_find_mark(connp, group);
1063        if (!fsn_mark) {
1064                fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
1065                if (IS_ERR(fsn_mark)) {
1066                        mutex_unlock(&group->mark_mutex);
1067                        return PTR_ERR(fsn_mark);
1068                }
1069        }
1070        added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
1071        if (added & ~fsnotify_conn_mask(fsn_mark->connector))
1072                fsnotify_recalc_mask(fsn_mark->connector);
1073        mutex_unlock(&group->mark_mutex);
1074
1075        fsnotify_put_mark(fsn_mark);
1076        return 0;
1077}
1078
1079static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
1080                                      struct vfsmount *mnt, __u32 mask,
1081                                      unsigned int flags, __kernel_fsid_t *fsid)
1082{
1083        return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
1084                                 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
1085}
1086
1087static int fanotify_add_sb_mark(struct fsnotify_group *group,
1088                                struct super_block *sb, __u32 mask,
1089                                unsigned int flags, __kernel_fsid_t *fsid)
1090{
1091        return fanotify_add_mark(group, &sb->s_fsnotify_marks,
1092                                 FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
1093}
1094
1095static int fanotify_add_inode_mark(struct fsnotify_group *group,
1096                                   struct inode *inode, __u32 mask,
1097                                   unsigned int flags, __kernel_fsid_t *fsid)
1098{
1099        pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
1100
1101        /*
1102         * If some other task has this inode open for write we should not add
1103         * an ignored mark, unless that ignored mark is supposed to survive
1104         * modification changes anyway.
1105         */
1106        if ((flags & FAN_MARK_IGNORED_MASK) &&
1107            !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
1108            inode_is_open_for_write(inode))
1109                return 0;
1110
1111        return fanotify_add_mark(group, &inode->i_fsnotify_marks,
1112                                 FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
1113}
1114
1115static struct fsnotify_event *fanotify_alloc_overflow_event(void)
1116{
1117        struct fanotify_event *oevent;
1118
1119        oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
1120        if (!oevent)
1121                return NULL;
1122
1123        fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
1124        oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;
1125
1126        return &oevent->fse;
1127}
1128
1129static struct hlist_head *fanotify_alloc_merge_hash(void)
1130{
1131        struct hlist_head *hash;
1132
1133        hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
1134                       GFP_KERNEL_ACCOUNT);
1135        if (!hash)
1136                return NULL;
1137
1138        __hash_init(hash, FANOTIFY_HTABLE_SIZE);
1139
1140        return hash;
1141}
1142
1143/* fanotify syscalls */
1144SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
1145{
1146        struct fsnotify_group *group;
1147        int f_flags, fd;
1148        unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
1149        unsigned int class = flags & FANOTIFY_CLASS_BITS;
1150        unsigned int internal_flags = 0;
1151
1152        pr_debug("%s: flags=%x event_f_flags=%x\n",
1153                 __func__, flags, event_f_flags);
1154
1155        if (!capable(CAP_SYS_ADMIN)) {
1156                /*
 1157                 * An unprivileged user can set up a fanotify group with
1158                 * limited functionality - an unprivileged group is limited to
1159                 * notification events with file handles and it cannot use
1160                 * unlimited queue/marks.
1161                 */
1162                if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
1163                        return -EPERM;
1164
1165                /*
1166                 * Setting the internal flag FANOTIFY_UNPRIV on the group
1167                 * prevents setting mount/filesystem marks on this group and
1168                 * prevents reporting pid and open fd in events.
1169                 */
1170                internal_flags |= FANOTIFY_UNPRIV;
1171        }
1172
1173#ifdef CONFIG_AUDITSYSCALL
1174        if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
1175#else
1176        if (flags & ~FANOTIFY_INIT_FLAGS)
1177#endif
1178                return -EINVAL;
1179
1180        /*
1181         * A pidfd can only be returned for a thread-group leader; thus
1182         * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
1183         * exclusive.
1184         */
1185        if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
1186                return -EINVAL;
1187
1188        if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
1189                return -EINVAL;
1190
1191        switch (event_f_flags & O_ACCMODE) {
1192        case O_RDONLY:
1193        case O_RDWR:
1194        case O_WRONLY:
1195                break;
1196        default:
1197                return -EINVAL;
1198        }
1199
1200        if (fid_mode && class != FAN_CLASS_NOTIF)
1201                return -EINVAL;
1202
1203        /*
1204         * Child name is reported with parent fid so requires dir fid.
1205         * We can report both child fid and dir fid with or without name.
1206         */
1207        if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
1208                return -EINVAL;
1209
1210        f_flags = O_RDWR | FMODE_NONOTIFY;
1211        if (flags & FAN_CLOEXEC)
1212                f_flags |= O_CLOEXEC;
1213        if (flags & FAN_NONBLOCK)
1214                f_flags |= O_NONBLOCK;
1215
1216        /* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
1217        group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
1218        if (IS_ERR(group)) {
1219                return PTR_ERR(group);
1220        }
1221
 1222        /* Enforce the per-user groups limit in all containing user namespaces */
1223        group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
1224                                                  current_euid(),
1225                                                  UCOUNT_FANOTIFY_GROUPS);
1226        if (!group->fanotify_data.ucounts) {
1227                fd = -EMFILE;
1228                goto out_destroy_group;
1229        }
1230
1231        group->fanotify_data.flags = flags | internal_flags;
1232        group->memcg = get_mem_cgroup_from_mm(current->mm);
1233
1234        group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
1235        if (!group->fanotify_data.merge_hash) {
1236                fd = -ENOMEM;
1237                goto out_destroy_group;
1238        }
1239
1240        group->overflow_event = fanotify_alloc_overflow_event();
1241        if (unlikely(!group->overflow_event)) {
1242                fd = -ENOMEM;
1243                goto out_destroy_group;
1244        }
1245
1246        if (force_o_largefile())
1247                event_f_flags |= O_LARGEFILE;
1248        group->fanotify_data.f_flags = event_f_flags;
1249        init_waitqueue_head(&group->fanotify_data.access_waitq);
1250        INIT_LIST_HEAD(&group->fanotify_data.access_list);
1251        switch (class) {
1252        case FAN_CLASS_NOTIF:
1253                group->priority = FS_PRIO_0;
1254                break;
1255        case FAN_CLASS_CONTENT:
1256                group->priority = FS_PRIO_1;
1257                break;
1258        case FAN_CLASS_PRE_CONTENT:
1259                group->priority = FS_PRIO_2;
1260                break;
1261        default:
1262                fd = -EINVAL;
1263                goto out_destroy_group;
1264        }
1265
1266        if (flags & FAN_UNLIMITED_QUEUE) {
1267                fd = -EPERM;
1268                if (!capable(CAP_SYS_ADMIN))
1269                        goto out_destroy_group;
1270                group->max_events = UINT_MAX;
1271        } else {
1272                group->max_events = fanotify_max_queued_events;
1273        }
1274
1275        if (flags & FAN_UNLIMITED_MARKS) {
1276                fd = -EPERM;
1277                if (!capable(CAP_SYS_ADMIN))
1278                        goto out_destroy_group;
1279        }
1280
1281        if (flags & FAN_ENABLE_AUDIT) {
1282                fd = -EPERM;
1283                if (!capable(CAP_AUDIT_WRITE))
1284                        goto out_destroy_group;
1285        }
1286
1287        fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
1288        if (fd < 0)
1289                goto out_destroy_group;
1290
1291        return fd;
1292
1293out_destroy_group:
1294        fsnotify_destroy_group(group);
1295        return fd;
1296}
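
     /*
      * Illustrative unprivileged initialization that satisfies the checks
      * above (a fid reporting mode is mandatory without CAP_SYS_ADMIN and
      * fid modes require FAN_CLASS_NOTIF):
      *
      *   int fan_fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID,
      *                              O_RDONLY | O_CLOEXEC);
      */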
1297
1298/* Check if filesystem can encode a unique fid */
1299static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
1300{
1301        __kernel_fsid_t root_fsid;
1302        int err;
1303
1304        /*
1305         * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
1306         */
1307        err = vfs_get_fsid(path->dentry, fsid);
1308        if (err)
1309                return err;
1310
1311        if (!fsid->val[0] && !fsid->val[1])
1312                return -ENODEV;
1313
1314        /*
1315         * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
1316         * which uses a different fsid than sb root.
1317         */
1318        err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
1319        if (err)
1320                return err;
1321
1322        if (root_fsid.val[0] != fsid->val[0] ||
1323            root_fsid.val[1] != fsid->val[1])
1324                return -EXDEV;
1325
1326        /*
1327         * We need to make sure that the file system supports at least
1328         * encoding a file handle so user can use name_to_handle_at() to
1329         * compare fid returned with event to the file handle of watched
1330         * objects. However, name_to_handle_at() requires that the
1331         * filesystem also supports decoding file handles.
1332         */
1333        if (!path->dentry->d_sb->s_export_op ||
1334            !path->dentry->d_sb->s_export_op->fh_to_dentry)
1335                return -EOPNOTSUPP;
1336
1337        return 0;
1338}
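
     /*
      * Illustrative userspace counterpart: a listener can compare the fid
      * carried in a FAN_EVENT_INFO_TYPE_FID record against a watched object
      * by encoding that object's handle itself ("/watched/path" is only an
      * example):
      *
      *   struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
      *   int mount_id;
      *
      *   fh->handle_bytes = MAX_HANDLE_SZ;
      *   name_to_handle_at(AT_FDCWD, "/watched/path", fh, &mount_id, 0);
      */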
1339
1340static int fanotify_events_supported(struct path *path, __u64 mask)
1341{
1342        /*
1343         * Some filesystems such as 'proc' acquire unusual locks when opening
1344         * files. For them fanotify permission events have high chances of
1345         * deadlocking the system - open done when reporting fanotify event
1346         * blocks on this "unusual" lock while another process holding the lock
1347         * waits for fanotify permission event to be answered. Just disallow
1348         * permission events for such filesystems.
1349         */
1350        if (mask & FANOTIFY_PERM_EVENTS &&
1351            path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
1352                return -EINVAL;
1353        return 0;
1354}
1355
1356static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
1357                            int dfd, const char  __user *pathname)
1358{
1359        struct inode *inode = NULL;
1360        struct vfsmount *mnt = NULL;
1361        struct fsnotify_group *group;
1362        struct fd f;
1363        struct path path;
1364        __kernel_fsid_t __fsid, *fsid = NULL;
1365        u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
1366        unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1367        bool ignored = flags & FAN_MARK_IGNORED_MASK;
1368        unsigned int obj_type, fid_mode;
1369        u32 umask = 0;
1370        int ret;
1371
1372        pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
1373                 __func__, fanotify_fd, flags, dfd, pathname, mask);
1374
1375        /* we only use the lower 32 bits as of right now. */
1376        if (upper_32_bits(mask))
1377                return -EINVAL;
1378
1379        if (flags & ~FANOTIFY_MARK_FLAGS)
1380                return -EINVAL;
1381
1382        switch (mark_type) {
1383        case FAN_MARK_INODE:
1384                obj_type = FSNOTIFY_OBJ_TYPE_INODE;
1385                break;
1386        case FAN_MARK_MOUNT:
1387                obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
1388                break;
1389        case FAN_MARK_FILESYSTEM:
1390                obj_type = FSNOTIFY_OBJ_TYPE_SB;
1391                break;
1392        default:
1393                return -EINVAL;
1394        }
1395
1396        switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
1397        case FAN_MARK_ADD:
1398        case FAN_MARK_REMOVE:
1399                if (!mask)
1400                        return -EINVAL;
1401                break;
1402        case FAN_MARK_FLUSH:
1403                if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
1404                        return -EINVAL;
1405                break;
1406        default:
1407                return -EINVAL;
1408        }
1409
1410        if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
1411                valid_mask |= FANOTIFY_PERM_EVENTS;
1412
1413        if (mask & ~valid_mask)
1414                return -EINVAL;
1415
1416        /* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */
1417        if (ignored)
1418                mask &= ~FANOTIFY_EVENT_FLAGS;
1419
1420        f = fdget(fanotify_fd);
1421        if (unlikely(!f.file))
1422                return -EBADF;
1423
 1424        /* verify that this is indeed a fanotify instance */
1425        ret = -EINVAL;
1426        if (unlikely(f.file->f_op != &fanotify_fops))
1427                goto fput_and_out;
1428        group = f.file->private_data;
1429
1430        /*
 1431         * An unprivileged user is not allowed to set up mount or filesystem
1432         * marks.  This also includes setting up such marks by a group that
1433         * was initialized by an unprivileged user.
1434         */
1435        ret = -EPERM;
1436        if ((!capable(CAP_SYS_ADMIN) ||
1437             FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
1438            mark_type != FAN_MARK_INODE)
1439                goto fput_and_out;
1440
1441        /*
1442         * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
 1443         * allowed to set permission events.
1444         */
1445        ret = -EINVAL;
1446        if (mask & FANOTIFY_PERM_EVENTS &&
1447            group->priority == FS_PRIO_0)
1448                goto fput_and_out;
1449
1450        /*
1451         * Events with data type inode do not carry enough information to report
1452         * event->fd, so we do not allow setting a mask for inode events unless
1453         * the group supports reporting fid.
1454         * Inode events are also not supported on a mount mark, because they do
1455         * not carry enough information (i.e. a path) to be filtered by mount point.
1456         */
1457        fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
1458        if (mask & FANOTIFY_INODE_EVENTS &&
1459            (!fid_mode || mark_type == FAN_MARK_MOUNT))
1460                goto fput_and_out;
1461
1462        if (flags & FAN_MARK_FLUSH) {
1463                ret = 0;
1464                if (mark_type == FAN_MARK_MOUNT)
1465                        fsnotify_clear_vfsmount_marks_by_group(group);
1466                else if (mark_type == FAN_MARK_FILESYSTEM)
1467                        fsnotify_clear_sb_marks_by_group(group);
1468                else
1469                        fsnotify_clear_inode_marks_by_group(group);
1470                goto fput_and_out;
1471        }
1472
1473        ret = fanotify_find_path(dfd, pathname, &path, flags,
1474                        (mask & ALL_FSNOTIFY_EVENTS), obj_type);
1475        if (ret)
1476                goto fput_and_out;
1477
1478        if (flags & FAN_MARK_ADD) {
1479                ret = fanotify_events_supported(&path, mask);
1480                if (ret)
1481                        goto path_put_and_out;
1482        }
1483
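            /*
             * fanotify_test_fid() makes sure the marked filesystem reports a
             * usable fsid and can encode file handles, so that a group in fid
             * mode can later attach fid info to events; otherwise the mark is
             * refused here instead of failing at event delivery time.
             */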
1484        if (fid_mode) {
1485                ret = fanotify_test_fid(&path, &__fsid);
1486                if (ret)
1487                        goto path_put_and_out;
1488
1489                fsid = &__fsid;
1490        }
1491
1492        /* The inode is pinned by the path reference; the group by the fdget() above */
1493        if (mark_type == FAN_MARK_INODE)
1494                inode = path.dentry->d_inode;
1495        else
1496                mnt = path.mnt;
1497
1498        /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
1499        if (mnt || !S_ISDIR(inode->i_mode)) {
1500                mask &= ~FAN_EVENT_ON_CHILD;
1501                umask = FAN_EVENT_ON_CHILD;
1502                /*
1503                 * If group needs to report parent fid, register for getting
1504                 * events with parent/name info for non-directory.
1505                 */
1506                if ((fid_mode & FAN_REPORT_DIR_FID) &&
1507                    (flags & FAN_MARK_ADD) && !ignored)
1508                        mask |= FAN_EVENT_ON_CHILD;
1509        }
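            /*
             * The FAN_EVENT_ON_CHILD bit recorded in umask is handed to the
             * remove helpers below, so that a bit which may only have been set
             * implicitly (for FAN_REPORT_DIR_FID groups) does not by itself
             * keep a mark alive once all explicitly requested events have been
             * removed.
             */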
1510
1511        /* create/update or remove a mark on the requested object */
1512        switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
1513        case FAN_MARK_ADD:
1514                if (mark_type == FAN_MARK_MOUNT)
1515                        ret = fanotify_add_vfsmount_mark(group, mnt, mask,
1516                                                         flags, fsid);
1517                else if (mark_type == FAN_MARK_FILESYSTEM)
1518                        ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
1519                                                   flags, fsid);
1520                else
1521                        ret = fanotify_add_inode_mark(group, inode, mask,
1522                                                      flags, fsid);
1523                break;
1524        case FAN_MARK_REMOVE:
1525                if (mark_type == FAN_MARK_MOUNT)
1526                        ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
1527                                                            flags, umask);
1528                else if (mark_type == FAN_MARK_FILESYSTEM)
1529                        ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
1530                                                      flags, umask);
1531                else
1532                        ret = fanotify_remove_inode_mark(group, inode, mask,
1533                                                         flags, umask);
1534                break;
1535        default:
1536                ret = -EINVAL;
1537        }
1538
1539path_put_and_out:
1540        path_put(&path);
1541fput_and_out:
1542        fdput(f);
1543        return ret;
1544}
1545
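    /*
     * Putting it together, a minimal userspace sequence that exercises this
     * syscall (a sketch; error handling omitted, the path is an example only):
     *
     *   int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);
     *   fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
     *                 FAN_CREATE | FAN_DELETE | FAN_ONDIR,
     *                 AT_FDCWD, "/mnt/data");
     *
     * FAN_MARK_FILESYSTEM requires CAP_SYS_ADMIN and the directory entry
     * events (FAN_CREATE, FAN_DELETE) require a group with fid reporting,
     * matching the checks in do_fanotify_mark() above.
     */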
1546#ifndef CONFIG_ARCH_SPLIT_ARG64
1547SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
1548                              __u64, mask, int, dfd,
1549                              const char  __user *, pathname)
1550{
1551        return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
1552}
1553#endif
1554
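    /*
     * On 32-bit ABIs the 64-bit mask does not fit in a single syscall
     * argument register, so the mask is passed split into two 32-bit halves
     * (SC_ARG64) and reassembled with SC_VAL64 before calling the common
     * helper above.
     */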
1555#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
1556SYSCALL32_DEFINE6(fanotify_mark,
1557                                int, fanotify_fd, unsigned int, flags,
1558                                SC_ARG64(mask), int, dfd,
1559                                const char  __user *, pathname)
1560{
1561        return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask),
1562                                dfd, pathname);
1563}
1564#endif
1565
1566/*
1567 * fanotify_user_setup - Our initialization function.  Note that we cannot
1568 * return an error because we have compiled-in VFS hooks.  So an (unlikely)
1569 * failure here must result in panic().
1570 */
1571static int __init fanotify_user_setup(void)
1572{
1573        struct sysinfo si;
1574        int max_marks;
1575
1576        si_meminfo(&si);
1577        /*
1578         * Allow up to 1% of addressable memory to be accounted for per-user
1579         * marks, limited to the range [8192, 1048576].  Mount and sb marks are
1580         * a lot cheaper than inode marks, but there is no reason for a user
1581         * to have many of those, so calculate by the cost of inode marks.
1582         */
1583        max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
1584                    INODE_MARK_COST;
1585        max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
1586                                     FANOTIFY_DEFAULT_MAX_USER_MARKS);
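            /*
             * Rough worked example (assuming sizeof(struct inode) is on the
             * order of 600 bytes, i.e. INODE_MARK_COST ~= 1200 bytes): with
             * 4 GiB of lowmem, 1% is ~43 MB, which allows ~35000 marks and
             * lands inside the [8192, 1048576] clamp unchanged.
             */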
1587
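            /*
             * The popcount checks below are there so that adding a new FAN_*
             * init or mark flag fails the build until the flag's handling
             * (and these constants) have been reviewed and updated.
             */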
1588        BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
1589        BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11);
1590        BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
1591
1592        fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
1593                                         SLAB_PANIC|SLAB_ACCOUNT);
1594        fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
1595                                               SLAB_PANIC);
1596        fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
1597                                                SLAB_PANIC);
1598        if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
1599                fanotify_perm_event_cachep =
1600                        KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
1601        }
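            /*
             * All of the caches above are created with SLAB_PANIC: consistent
             * with the comment on this function, an allocation failure during
             * setup panics rather than returning an error.
             */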
1602
1603        fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
1604        init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
1605                                        FANOTIFY_DEFAULT_MAX_GROUPS;
1606        init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
1607
1608        return 0;
1609}
1610device_initcall(fanotify_user_setup);
1611