linux/kernel/seccomp.c
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/kernel/seccomp.c
 *
 * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
 *
 * Copyright (C) 2012 Google, Inc.
 * Will Drewry <wad@chromium.org>
 *
 * This defines a simple but solid secure-computing facility.
 *
 * Mode 1 uses a fixed list of allowed system calls.
 * Mode 2 allows user-defined system call filters in the form
 *        of Berkeley Packet Filters/Linux Socket Filters.
 */
#define pr_fmt(fmt) "seccomp: " fmt

#include <linux/refcount.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/coredump.h>
#include <linux/kmemleak.h>
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/sysctl.h>

#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
#endif

#ifdef CONFIG_SECCOMP_FILTER
#include <linux/file.h>
#include <linux/filter.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/capability.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>
#include <linux/anon_inodes.h>
#include <linux/lockdep.h>

/*
 * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
 * wrong direction flag in the ioctl number. This is the broken one,
 * which the kernel needs to keep supporting until all userspaces stop
 * using the wrong command number.
 */
#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR  SECCOMP_IOR(2, __u64)

enum notify_state {
        SECCOMP_NOTIFY_INIT,
        SECCOMP_NOTIFY_SENT,
        SECCOMP_NOTIFY_REPLIED,
};

struct seccomp_knotif {
        /* The task whose filter triggered the notification */
        struct task_struct *task;

        /* The "cookie" for this request; this is unique for this filter. */
        u64 id;

        /*
         * The seccomp data. This pointer is valid the entire time this
         * notification is active, since it comes from __seccomp_filter which
         * eclipses the entire lifecycle here.
         */
        const struct seccomp_data *data;

        /*
         * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a
         * struct seccomp_knotif is created and starts out in INIT. Once the
         * handler reads the notification off of an FD, it transitions to SENT.
         * If a signal is received the state transitions back to INIT and
         * another message is sent. When the userspace handler replies, state
         * transitions to REPLIED.
         */
        enum notify_state state;

        /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
        int error;
        long val;
        u32 flags;

        /*
         * Signals when this has changed states, such as the listener
         * dying, a new seccomp addfd message, or changing to REPLIED
         */
        struct completion ready;

        struct list_head list;

        /* outstanding addfd requests */
        struct list_head addfd;
};
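
/*
 * Illustrative sketch (not upstream kernel text) of the lifecycle
 * described in struct seccomp_knotif above:
 *
 *        SECCOMP_IOCTL_NOTIF_RECV        SECCOMP_IOCTL_NOTIF_SEND
 *   INIT ------------------------> SENT ------------------------> REPLIED
 *    ^                              |
 *    +------------------------------+
 *      notification re-queued, e.g. after a signal to the target or
 *      a failed copy to the reader
 */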

/**
 * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
 *
 * @file: A reference to the file to install in the other task
 * @fd: The fd number to install it at. If the fd number is -1, it means the
 *      installing process should allocate the fd as normal.
 * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
 *         is allowed.
 * @ret: The return value of the installing process. It is set to the fd
 *       number upon success (>= 0).
 * @completion: Indicates that the installing process has completed fd
 *              installation, or gone away (either due to successful
 *              reply, or signal)
 */
struct seccomp_kaddfd {
        struct file *file;
        int fd;
        unsigned int flags;

        /* To only be set on reply */
        int ret;
        struct completion completion;
        struct list_head list;
};

/**
 * struct notification - container for seccomp userspace notifications. Since
 * most seccomp filters will not have notification listeners attached and this
 * structure is fairly large, we store the notification-specific stuff in a
 * separate structure.
 *
 * @request: A semaphore that users of this notification can wait on for
 *           changes. Actual reads and writes are still controlled with
 *           filter->notify_lock.
 * @next_id: The id of the next request.
 * @notifications: A list of struct seccomp_knotif elements.
 */
struct notification {
        struct semaphore request;
        u64 next_id;
        struct list_head notifications;
};

/**
 * struct seccomp_filter - container for seccomp BPF programs
 *
 * @refs: Reference count to manage the object lifetime.
 *        A filter's reference count is incremented for each directly
 *        attached task, once for the dependent filter, and if
 *        requested for the user notifier. When @refs reaches zero,
 *        the filter can be freed.
 * @users: A filter's @users count is incremented for each directly
 *         attached task (filter installation, fork(), thread_sync),
 *         and once for the dependent filter (tracked in filter->prev).
 *         When it reaches zero it indicates that no direct or indirect
 *         users of that filter exist. No new tasks can get associated with
 *         this filter after reaching 0. The @users count is always less
 *         than or equal to @refs. Hence, reaching 0 for @users does not mean
 *         the filter can be freed.
 * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
 * @prev: points to a previously installed, or inherited, filter
 * @prog: the BPF program to evaluate
 * @notif: the struct that holds all notification related information
 * @notify_lock: A lock for all notification-related accesses.
 * @wqh: A wait queue for poll if a notifier is in use.
 *
 * seccomp_filter objects are organized in a tree linked via the @prev
 * pointer.  For any task, it appears to be a singly-linked list starting
 * with current->seccomp.filter, the most recently attached or inherited filter.
 * However, multiple filters may share a @prev node, by way of fork(), which
 * results in a unidirectional tree existing in memory.  This is similar to
 * how namespaces work.
 *
 * seccomp_filter objects should never be modified after being attached
 * to a task_struct (other than @refs).
 */
struct seccomp_filter {
        refcount_t refs;
        refcount_t users;
        bool log;
        struct seccomp_filter *prev;
        struct bpf_prog *prog;
        struct notification *notif;
        struct mutex notify_lock;
        wait_queue_head_t wqh;
};
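
/*
 * Illustrative sketch (not upstream kernel text): two threads fork from a
 * task that had filter A attached and then each attach their own filter.
 * Both share A via @prev, forming the tree described above:
 *
 *                A           (oldest filter)
 *              /   \
 *             B     C        (each thread's current->seccomp.filter)
 *
 * Thread 1 evaluates B then A; thread 2 evaluates C then A.
 */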

/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
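
/*
 * Worked out (not upstream kernel text): struct sock_filter is 8 bytes
 * (u16 code, u8 jt, u8 jf, u32 k), so this limit allows
 * (1 << 18) / 8 = 32768 classic BPF instructions along any path,
 * including the per-filter penalty added in seccomp_attach_filter().
 */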

/*
 * Endianness is explicitly ignored and left for BPF program authors to manage
 * as per the specific architecture.
 */
static void populate_seccomp_data(struct seccomp_data *sd)
{
        /*
         * Instead of using current_pt_reg(), we're already doing the work
         * to safely fetch "current", so just use "task" everywhere below.
         */
        struct task_struct *task = current;
        struct pt_regs *regs = task_pt_regs(task);
        unsigned long args[6];

        sd->nr = syscall_get_nr(task, regs);
        sd->arch = syscall_get_arch(task);
        syscall_get_arguments(task, regs, args);
        sd->args[0] = args[0];
        sd->args[1] = args[1];
        sd->args[2] = args[2];
        sd->args[3] = args[3];
        sd->args[4] = args[4];
        sd->args[5] = args[5];
        sd->instruction_pointer = KSTK_EIP(task);
}

/**
 *      seccomp_check_filter - verify seccomp filter code
 *      @filter: filter to verify
 *      @flen: length of filter
 *
 * Takes a previously checked filter (by bpf_check_classic) and
 * redirects all filter code that loads struct sk_buff data
 * and related data through seccomp_bpf_load.  It also
 * enforces length and alignment checking of those loads.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
{
        int pc;
        for (pc = 0; pc < flen; pc++) {
                struct sock_filter *ftest = &filter[pc];
                u16 code = ftest->code;
                u32 k = ftest->k;

                switch (code) {
                case BPF_LD | BPF_W | BPF_ABS:
                        ftest->code = BPF_LDX | BPF_W | BPF_ABS;
                        /* 32-bit aligned and not out of bounds. */
                        if (k >= sizeof(struct seccomp_data) || k & 3)
                                return -EINVAL;
                        continue;
                case BPF_LD | BPF_W | BPF_LEN:
                        ftest->code = BPF_LD | BPF_IMM;
                        ftest->k = sizeof(struct seccomp_data);
                        continue;
                case BPF_LDX | BPF_W | BPF_LEN:
                        ftest->code = BPF_LDX | BPF_IMM;
                        ftest->k = sizeof(struct seccomp_data);
                        continue;
                /* Explicitly include allowed calls. */
                case BPF_RET | BPF_K:
                case BPF_RET | BPF_A:
                case BPF_ALU | BPF_ADD | BPF_K:
                case BPF_ALU | BPF_ADD | BPF_X:
                case BPF_ALU | BPF_SUB | BPF_K:
                case BPF_ALU | BPF_SUB | BPF_X:
                case BPF_ALU | BPF_MUL | BPF_K:
                case BPF_ALU | BPF_MUL | BPF_X:
                case BPF_ALU | BPF_DIV | BPF_K:
                case BPF_ALU | BPF_DIV | BPF_X:
                case BPF_ALU | BPF_AND | BPF_K:
                case BPF_ALU | BPF_AND | BPF_X:
                case BPF_ALU | BPF_OR | BPF_K:
                case BPF_ALU | BPF_OR | BPF_X:
                case BPF_ALU | BPF_XOR | BPF_K:
                case BPF_ALU | BPF_XOR | BPF_X:
                case BPF_ALU | BPF_LSH | BPF_K:
                case BPF_ALU | BPF_LSH | BPF_X:
                case BPF_ALU | BPF_RSH | BPF_K:
                case BPF_ALU | BPF_RSH | BPF_X:
                case BPF_ALU | BPF_NEG:
                case BPF_LD | BPF_IMM:
                case BPF_LDX | BPF_IMM:
                case BPF_MISC | BPF_TAX:
                case BPF_MISC | BPF_TXA:
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                case BPF_ST:
                case BPF_STX:
                case BPF_JMP | BPF_JA:
                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                        continue;
                default:
                        return -EINVAL;
                }
        }
        return 0;
}
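
/*
 * Illustrative userspace sketch (not kernel code): a minimal filter that
 * passes the checks above.  It performs one 32-bit aligned load within
 * struct seccomp_data and then allows the syscall:
 *
 *      struct sock_filter insns[] = {
 *              BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *                       offsetof(struct seccomp_data, arch)),
 *              BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 *      };
 *      struct sock_fprog fprog = {
 *              .len = sizeof(insns) / sizeof(insns[0]),
 *              .filter = insns,
 *      };
 *      prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *      prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);
 *
 * An unaligned or out-of-bounds BPF_LD|BPF_W|BPF_ABS offset would instead
 * be rejected here with -EINVAL.
 */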

/**
 * seccomp_run_filters - evaluates all seccomp filters against @sd
 * @sd: optional seccomp data to be passed to filters
 * @match: stores struct seccomp_filter that resulted in the return value,
 *         unless filter returned SECCOMP_RET_ALLOW, in which case it will
 *         be unchanged.
 *
 * Returns valid seccomp BPF response codes.
 */
#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
static u32 seccomp_run_filters(const struct seccomp_data *sd,
                               struct seccomp_filter **match)
{
        u32 ret = SECCOMP_RET_ALLOW;
        /* Make sure cross-thread synced filter points somewhere sane. */
        struct seccomp_filter *f =
                        READ_ONCE(current->seccomp.filter);

        /* Ensure unexpected behavior doesn't result in failing open. */
        if (WARN_ON(f == NULL))
                return SECCOMP_RET_KILL_PROCESS;

        /*
         * All filters in the list are evaluated and the lowest BPF return
         * value always takes priority (ignoring the DATA).
         */
        for (; f; f = f->prev) {
                u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd);

                if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
                        ret = cur_ret;
                        *match = f;
                }
        }
        return ret;
}
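
/*
 * Illustrative note (not upstream kernel text): the precedence falls out
 * of the signed comparison in ACTION_ONLY().  With the uapi action values
 *
 *      SECCOMP_RET_KILL_PROCESS = 0x80000000U (negative as s32: ranks first)
 *      SECCOMP_RET_KILL_THREAD  = 0x00000000U
 *      SECCOMP_RET_TRAP         = 0x00030000U
 *      SECCOMP_RET_ERRNO        = 0x00050000U
 *      SECCOMP_RET_USER_NOTIF   = 0x7fc00000U
 *      SECCOMP_RET_TRACE        = 0x7ff00000U
 *      SECCOMP_RET_LOG          = 0x7ffc0000U
 *      SECCOMP_RET_ALLOW        = 0x7fff0000U
 *
 * a KILL_PROCESS returned by any filter in the chain overrides an ALLOW
 * returned by every other filter.
 */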
#endif /* CONFIG_SECCOMP_FILTER */

static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
{
        assert_spin_locked(&current->sighand->siglock);

        if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
                return false;

        return true;
}

void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }

static inline void seccomp_assign_mode(struct task_struct *task,
                                       unsigned long seccomp_mode,
                                       unsigned long flags)
{
        assert_spin_locked(&task->sighand->siglock);

        task->seccomp.mode = seccomp_mode;
        /*
         * Make sure TIF_SECCOMP cannot be set before the mode (and
         * filter) is set.
         */
        smp_mb__before_atomic();
        /* Assume default seccomp processes want spec flaw mitigation. */
        if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
                arch_seccomp_spec_mitigate(task);
        set_tsk_thread_flag(task, TIF_SECCOMP);
}

#ifdef CONFIG_SECCOMP_FILTER
/* Returns 1 if the parent is an ancestor of the child. */
static int is_ancestor(struct seccomp_filter *parent,
                       struct seccomp_filter *child)
{
        /* NULL is the root ancestor. */
        if (parent == NULL)
                return 1;
        for (; child; child = child->prev)
                if (child == parent)
                        return 1;
        return 0;
}

/**
 * seccomp_can_sync_threads: checks if all threads can be synchronized
 *
 * Expects sighand and cred_guard_mutex locks to be held.
 *
 * Returns 0 on success, -ve on error, or the pid of a thread which was
 * either not in the correct seccomp mode or did not have an ancestral
 * seccomp filter.
 */
static inline pid_t seccomp_can_sync_threads(void)
{
        struct task_struct *thread, *caller;

        BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
        assert_spin_locked(&current->sighand->siglock);

        /* Validate all threads being eligible for synchronization. */
        caller = current;
        for_each_thread(caller, thread) {
                pid_t failed;

                /* Skip current, since it is initiating the sync. */
                if (thread == caller)
                        continue;

                if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
                    (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
                     is_ancestor(thread->seccomp.filter,
                                 caller->seccomp.filter)))
                        continue;

                /* Return the first thread that cannot be synchronized. */
                failed = task_pid_vnr(thread);
                /* If the pid cannot be resolved, then return -ESRCH */
                if (WARN_ON(failed == 0))
                        failed = -ESRCH;
                return failed;
        }

        return 0;
}

static inline void seccomp_filter_free(struct seccomp_filter *filter)
{
        if (filter) {
                bpf_prog_destroy(filter->prog);
                kfree(filter);
        }
}

static void __seccomp_filter_orphan(struct seccomp_filter *orig)
{
        while (orig && refcount_dec_and_test(&orig->users)) {
                if (waitqueue_active(&orig->wqh))
                        wake_up_poll(&orig->wqh, EPOLLHUP);
                orig = orig->prev;
        }
}

static void __put_seccomp_filter(struct seccomp_filter *orig)
{
        /* Clean up single-reference branches iteratively. */
        while (orig && refcount_dec_and_test(&orig->refs)) {
                struct seccomp_filter *freeme = orig;
                orig = orig->prev;
                seccomp_filter_free(freeme);
        }
}

static void __seccomp_filter_release(struct seccomp_filter *orig)
{
        /* Notify about any unused filters in the task's former filter tree. */
        __seccomp_filter_orphan(orig);
        /* Finally drop all references to the task's former tree. */
        __put_seccomp_filter(orig);
}

/**
 * seccomp_filter_release - Detach the task from its filter tree,
 *                          drop its reference count, and notify
 *                          about unused filters
 *
 * This function should only be called when the task is exiting as
 * it detaches the task from its filter tree. Because the task is
 * exiting, the READ_ONCE() and barriers that would otherwise be
 * required are not needed here.
 */
void seccomp_filter_release(struct task_struct *tsk)
{
        struct seccomp_filter *orig = tsk->seccomp.filter;

        /* Detach task from its filter tree. */
        tsk->seccomp.filter = NULL;
        __seccomp_filter_release(orig);
}

/**
 * seccomp_sync_threads: sets all threads to use current's filter
 *
 * Expects sighand and cred_guard_mutex locks to be held, and for
 * seccomp_can_sync_threads() to have returned success already
 * without dropping the locks.
 */
static inline void seccomp_sync_threads(unsigned long flags)
{
        struct task_struct *thread, *caller;

        BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
        assert_spin_locked(&current->sighand->siglock);

        /* Synchronize all threads. */
        caller = current;
        for_each_thread(caller, thread) {
                /* Skip current, since it needs no changes. */
                if (thread == caller)
                        continue;

                /* Get a task reference for the new leaf node. */
                get_seccomp_filter(caller);

                /*
                 * Drop the task reference to the shared ancestor since
                 * current's path will hold a reference.  (This also
                 * allows a put before the assignment.)
                 */
                __seccomp_filter_release(thread->seccomp.filter);

                /* Make our new filter tree visible. */
                smp_store_release(&thread->seccomp.filter,
                                  caller->seccomp.filter);
                atomic_set(&thread->seccomp.filter_count,
                           atomic_read(&caller->seccomp.filter_count));

                /*
                 * Don't let an unprivileged task work around
                 * the no_new_privs restriction by creating
                 * a thread that sets it up, enters seccomp,
                 * then dies.
                 */
                if (task_no_new_privs(caller))
                        task_set_no_new_privs(thread);

                /*
                 * Opt the other thread into seccomp if needed.
                 * As threads are considered to be trust-realm
                 * equivalent (see ptrace_may_access), it is safe to
                 * allow one thread to transition the other.
                 */
                if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
                        seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
                                            flags);
        }
}

/**
 * seccomp_prepare_filter: Prepares a seccomp filter for use.
 * @fprog: BPF program to install
 *
 * Returns filter on success or an ERR_PTR on failure.
 */
static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
        struct seccomp_filter *sfilter;
        int ret;
        const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);

        if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
                return ERR_PTR(-EINVAL);

        BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));

        /*
         * Installing a seccomp filter requires that the task has
         * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
         * This avoids scenarios where unprivileged tasks can affect the
         * behavior of privileged children.
         */
        if (!task_no_new_privs(current) &&
                        !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
                return ERR_PTR(-EACCES);

        /* Allocate a new seccomp_filter */
        sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
        if (!sfilter)
                return ERR_PTR(-ENOMEM);

        mutex_init(&sfilter->notify_lock);
        ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
                                        seccomp_check_filter, save_orig);
        if (ret < 0) {
                kfree(sfilter);
                return ERR_PTR(ret);
        }

        refcount_set(&sfilter->refs, 1);
        refcount_set(&sfilter->users, 1);
        init_waitqueue_head(&sfilter->wqh);

        return sfilter;
}

/**
 * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
 * @user_filter: pointer to the user data containing a sock_fprog.
 *
 * Returns the prepared filter on success or an ERR_PTR on failure.
 */
static struct seccomp_filter *
seccomp_prepare_user_filter(const char __user *user_filter)
{
        struct sock_fprog fprog;
        struct seccomp_filter *filter = ERR_PTR(-EFAULT);

#ifdef CONFIG_COMPAT
        if (in_compat_syscall()) {
                struct compat_sock_fprog fprog32;
                if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
                        goto out;
                fprog.len = fprog32.len;
                fprog.filter = compat_ptr(fprog32.filter);
        } else /* falls through to the if below. */
#endif
        if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
                goto out;
        filter = seccomp_prepare_filter(&fprog);
out:
        return filter;
}

/**
 * seccomp_attach_filter: validate and attach filter
 * @flags:  flags to change filter behavior
 * @filter: seccomp filter to add to the current process
 *
 * Caller must be holding current->sighand->siglock lock.
 *
 * Returns 0 on success, -ve on error, or
 *   - in TSYNC mode: the pid of a thread which was either not in the correct
 *     seccomp mode or did not have an ancestral seccomp filter
 *   - in NEW_LISTENER mode: the fd of the new listener
 */
static long seccomp_attach_filter(unsigned int flags,
                                  struct seccomp_filter *filter)
{
        unsigned long total_insns;
        struct seccomp_filter *walker;

        assert_spin_locked(&current->sighand->siglock);

        /* Validate resulting filter length. */
        total_insns = filter->prog->len;
        for (walker = current->seccomp.filter; walker; walker = walker->prev)
                total_insns += walker->prog->len + 4;  /* 4 instr penalty */
        if (total_insns > MAX_INSNS_PER_PATH)
                return -ENOMEM;

        /* If thread sync has been requested, check that it is possible. */
        if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
                int ret;

                ret = seccomp_can_sync_threads();
                if (ret) {
                        if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
                                return -ESRCH;
                        else
                                return ret;
                }
        }

        /* Set log flag, if present. */
        if (flags & SECCOMP_FILTER_FLAG_LOG)
                filter->log = true;

        /*
         * If there is an existing filter, make it the prev and don't drop its
         * task reference.
         */
        filter->prev = current->seccomp.filter;
        current->seccomp.filter = filter;
        atomic_inc(&current->seccomp.filter_count);

        /* Now that the new filter is in place, synchronize to all threads. */
        if (flags & SECCOMP_FILTER_FLAG_TSYNC)
                seccomp_sync_threads(flags);

        return 0;
}
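
/*
 * Worked example (not upstream kernel text): with filters of 10 and 20
 * instructions already attached, attaching a 30-instruction filter is
 * charged 30 + (10 + 4) + (20 + 4) = 68 instructions against
 * MAX_INSNS_PER_PATH.
 */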

static void __get_seccomp_filter(struct seccomp_filter *filter)
{
        refcount_inc(&filter->refs);
}

/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
        struct seccomp_filter *orig = tsk->seccomp.filter;
        if (!orig)
                return;
        __get_seccomp_filter(orig);
        refcount_inc(&orig->users);
}

static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason)
{
        clear_siginfo(info);
        info->si_signo = SIGSYS;
        info->si_code = SYS_SECCOMP;
        info->si_call_addr = (void __user *)KSTK_EIP(current);
        info->si_errno = reason;
        info->si_arch = syscall_get_arch(current);
        info->si_syscall = syscall;
}

/**
 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
 */
static void seccomp_send_sigsys(int syscall, int reason)
{
        struct kernel_siginfo info;
        seccomp_init_siginfo(&info, syscall, reason);
        force_sig_info(&info);
}
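
/*
 * Illustrative userspace sketch (not kernel code): handling the SIGSYS
 * raised for SECCOMP_RET_TRAP.  seccomp_init_siginfo() above fills in
 * si_syscall (the syscall number), si_errno (the 16 bits of filter data),
 * si_arch and si_call_addr, which arrive in the handler's siginfo_t:
 *
 *      static void sigsys_handler(int sig, siginfo_t *info, void *uc)
 *      {
 *              emulate_syscall(info->si_syscall, info->si_errno);
 *      }
 *
 *      struct sigaction act = {
 *              .sa_sigaction = sigsys_handler,
 *              .sa_flags = SA_SIGINFO,
 *      };
 *      sigaction(SIGSYS, &act, NULL);
 *
 * emulate_syscall() is a hypothetical application helper.
 */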
#endif  /* CONFIG_SECCOMP_FILTER */

/* For use with seccomp_actions_logged */
#define SECCOMP_LOG_KILL_PROCESS        (1 << 0)
#define SECCOMP_LOG_KILL_THREAD         (1 << 1)
#define SECCOMP_LOG_TRAP                (1 << 2)
#define SECCOMP_LOG_ERRNO               (1 << 3)
#define SECCOMP_LOG_TRACE               (1 << 4)
#define SECCOMP_LOG_LOG                 (1 << 5)
#define SECCOMP_LOG_ALLOW               (1 << 6)
#define SECCOMP_LOG_USER_NOTIF          (1 << 7)

static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
                                    SECCOMP_LOG_KILL_THREAD  |
                                    SECCOMP_LOG_TRAP  |
                                    SECCOMP_LOG_ERRNO |
                                    SECCOMP_LOG_USER_NOTIF |
                                    SECCOMP_LOG_TRACE |
                                    SECCOMP_LOG_LOG;
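
/*
 * Illustrative note (not upstream kernel text): this mask is exposed as a
 * space-separated list of action names via
 * /proc/sys/kernel/seccomp/actions_logged; e.g. an administrator could
 * silence RET_ERRNO logging with something like:
 *
 *      echo "kill_process kill_thread trap trace log user_notif" \
 *              > /proc/sys/kernel/seccomp/actions_logged
 */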

static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
                               bool requested)
{
        bool log = false;

        switch (action) {
        case SECCOMP_RET_ALLOW:
                break;
        case SECCOMP_RET_TRAP:
                log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
                break;
        case SECCOMP_RET_ERRNO:
                log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
                break;
        case SECCOMP_RET_TRACE:
                log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
                break;
        case SECCOMP_RET_USER_NOTIF:
                log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF;
                break;
        case SECCOMP_RET_LOG:
                log = seccomp_actions_logged & SECCOMP_LOG_LOG;
                break;
        case SECCOMP_RET_KILL_THREAD:
                log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
                break;
        case SECCOMP_RET_KILL_PROCESS:
        default:
                log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
        }

        /*
         * Emit an audit message when the action is RET_KILL_*, RET_LOG, or the
         * FILTER_FLAG_LOG bit was set. The admin has the ability to silence
         * any action from being logged by removing the action name from the
         * seccomp_actions_logged sysctl.
         */
        if (!log)
                return;

        audit_seccomp(syscall, signr, action);
}

/*
 * Secure computing mode 1 allows only read/write/exit/sigreturn.
 * To be fully secure this must be combined with rlimit
 * to limit the stack allocations too.
 */
static const int mode1_syscalls[] = {
        __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
        -1, /* negative terminated */
};
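
/*
 * Illustrative userspace sketch (not kernel code): entering mode 1.  After
 * this call only the syscalls listed above may be made; anything else
 * kills the thread:
 *
 *      prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
 */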

static void __secure_computing_strict(int this_syscall)
{
        const int *allowed_syscalls = mode1_syscalls;
#ifdef CONFIG_COMPAT
        if (in_compat_syscall())
                allowed_syscalls = get_compat_mode1_syscalls();
#endif
        do {
                if (*allowed_syscalls == this_syscall)
                        return;
        } while (*++allowed_syscalls != -1);

#ifdef SECCOMP_DEBUG
        dump_stack();
#endif
        seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
        do_exit(SIGKILL);
}

#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
void secure_computing_strict(int this_syscall)
{
        int mode = current->seccomp.mode;

        if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
            unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
                return;

        if (mode == SECCOMP_MODE_DISABLED)
                return;
        else if (mode == SECCOMP_MODE_STRICT)
                __secure_computing_strict(this_syscall);
        else
                BUG();
}
#else

#ifdef CONFIG_SECCOMP_FILTER
static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
{
        /*
         * Note: overflow is ok here, the id just needs to be unique per
         * filter.
         */
        lockdep_assert_held(&filter->notify_lock);
        return filter->notif->next_id++;
}

static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
{
        /*
         * Remove the notification, and reset the list pointers, indicating
         * that it has been handled.
         */
        list_del_init(&addfd->list);
        addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
        complete(&addfd->completion);
}

static int seccomp_do_user_notification(int this_syscall,
                                        struct seccomp_filter *match,
                                        const struct seccomp_data *sd)
{
        int err;
        u32 flags = 0;
        long ret = 0;
        struct seccomp_knotif n = {};
        struct seccomp_kaddfd *addfd, *tmp;

        mutex_lock(&match->notify_lock);
        err = -ENOSYS;
        if (!match->notif)
                goto out;

        n.task = current;
        n.state = SECCOMP_NOTIFY_INIT;
        n.data = sd;
        n.id = seccomp_next_notify_id(match);
        init_completion(&n.ready);
        list_add(&n.list, &match->notif->notifications);
        INIT_LIST_HEAD(&n.addfd);

        up(&match->notif->request);
        wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
        mutex_unlock(&match->notify_lock);

        /*
         * This is where we wait for a reply from userspace.
         */
wait:
        err = wait_for_completion_interruptible(&n.ready);
        mutex_lock(&match->notify_lock);
        if (err == 0) {
                /* Check if we were woken up by an addfd message */
                addfd = list_first_entry_or_null(&n.addfd,
                                                 struct seccomp_kaddfd, list);
                if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
                        seccomp_handle_addfd(addfd);
                        mutex_unlock(&match->notify_lock);
                        goto wait;
                }
                ret = n.val;
                err = n.error;
                flags = n.flags;
        }

        /* If there were any pending addfd calls, clear them out */
        list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
                /* The process went away before we got a chance to handle it */
                addfd->ret = -ESRCH;
                list_del_init(&addfd->list);
                complete(&addfd->completion);
        }

        /*
         * Note that it's possible the listener died in between the time when
         * we were notified of a response (or a signal) and when we were able to
         * re-acquire the lock, so only delete from the list if the
         * notification actually exists.
         *
         * Also note that this test is only valid because there's no way to
         * *reattach* to a notifier right now. If one is added, we'll need to
         * keep track of the notif itself and make sure they match here.
         */
        if (match->notif)
                list_del(&n.list);
out:
        mutex_unlock(&match->notify_lock);

        /* Userspace requests to continue the syscall. */
        if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
                return 0;

        syscall_set_return_value(current, current_pt_regs(),
                                 err, ret);
        return -1;
}
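
/*
 * Illustrative userspace sketch (not kernel code): the supervisor side of
 * the handshake above, looping on a listener fd obtained with
 * SECCOMP_FILTER_FLAG_NEW_LISTENER.  This sketch denies every request:
 *
 *      struct seccomp_notif req;
 *      struct seccomp_notif_resp resp;
 *
 *      for (;;) {
 *              memset(&req, 0, sizeof(req));
 *              if (ioctl(listener_fd, SECCOMP_IOCTL_NOTIF_RECV, &req) < 0)
 *                      continue;
 *              memset(&resp, 0, sizeof(resp));
 *              resp.id = req.id;
 *              resp.error = -EPERM;
 *              ioctl(listener_fd, SECCOMP_IOCTL_NOTIF_SEND, &resp);
 *      }
 *
 * Zeroing the seccomp_notif before RECV matters: check_zeroed_user() in
 * seccomp_notify_recv() rejects a non-zeroed buffer with -EINVAL.
 */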

static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
                            const bool recheck_after_trace)
{
        u32 filter_ret, action;
        struct seccomp_filter *match = NULL;
        int data;
        struct seccomp_data sd_local;

        /*
         * Make sure that any changes to mode from another thread have
         * been seen after TIF_SECCOMP was seen.
         */
        rmb();

        if (!sd) {
                populate_seccomp_data(&sd_local);
                sd = &sd_local;
        }

        filter_ret = seccomp_run_filters(sd, &match);
        data = filter_ret & SECCOMP_RET_DATA;
        action = filter_ret & SECCOMP_RET_ACTION_FULL;

        switch (action) {
        case SECCOMP_RET_ERRNO:
                /* Set low-order bits as an errno, capped at MAX_ERRNO. */
                if (data > MAX_ERRNO)
                        data = MAX_ERRNO;
                syscall_set_return_value(current, current_pt_regs(),
                                         -data, 0);
                goto skip;

        case SECCOMP_RET_TRAP:
                /* Show the handler the original registers. */
                syscall_rollback(current, current_pt_regs());
                /* Let the filter pass back 16 bits of data. */
                seccomp_send_sigsys(this_syscall, data);
                goto skip;

        case SECCOMP_RET_TRACE:
                /* We've been put in this state by the ptracer already. */
                if (recheck_after_trace)
                        return 0;

                /* ENOSYS these calls if there is no tracer attached. */
                if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
                        syscall_set_return_value(current,
                                                 current_pt_regs(),
                                                 -ENOSYS, 0);
                        goto skip;
                }

                /* Allow the BPF to provide the event message */
                ptrace_event(PTRACE_EVENT_SECCOMP, data);
                /*
                 * The delivery of a fatal signal during event
                 * notification may silently skip tracer notification,
                 * which could leave us with a potentially unmodified
                 * syscall that the tracer would have liked to have
                 * changed. Since the process is about to die, we just
                 * force the syscall to be skipped and let the signal
                 * kill the process and correctly handle any tracer exit
                 * notifications.
                 */
                if (fatal_signal_pending(current))
                        goto skip;
                /* Check if the tracer forced the syscall to be skipped. */
                this_syscall = syscall_get_nr(current, current_pt_regs());
                if (this_syscall < 0)
                        goto skip;

                /*
                 * Recheck the syscall, since it may have changed. This
                 * intentionally uses a NULL struct seccomp_data to force
                 * a reload of all registers. This does not goto skip since
                 * a skip would have already been reported.
                 */
                if (__seccomp_filter(this_syscall, NULL, true))
                        return -1;

                return 0;

        case SECCOMP_RET_USER_NOTIF:
                if (seccomp_do_user_notification(this_syscall, match, sd))
                        goto skip;

                return 0;

        case SECCOMP_RET_LOG:
                seccomp_log(this_syscall, 0, action, true);
                return 0;

        case SECCOMP_RET_ALLOW:
                /*
                 * Note that the "match" filter will always be NULL for
                 * this action since SECCOMP_RET_ALLOW is the starting
                 * state in seccomp_run_filters().
                 */
                return 0;

        case SECCOMP_RET_KILL_THREAD:
        case SECCOMP_RET_KILL_PROCESS:
        default:
                seccomp_log(this_syscall, SIGSYS, action, true);
                /* Dump core only if this is the last remaining thread. */
                if (action != SECCOMP_RET_KILL_THREAD ||
                    get_nr_threads(current) == 1) {
                        kernel_siginfo_t info;

                        /* Show the original registers in the dump. */
                        syscall_rollback(current, current_pt_regs());
                        /* Trigger a manual coredump since do_exit skips it. */
                        seccomp_init_siginfo(&info, this_syscall, data);
                        do_coredump(&info);
                }
                if (action == SECCOMP_RET_KILL_THREAD)
                        do_exit(SIGSYS);
                else
                        do_group_exit(SIGSYS);
        }

        unreachable();

skip:
        seccomp_log(this_syscall, 0, action, match ? match->log : false);
        return -1;
}
#else
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
                            const bool recheck_after_trace)
{
        BUG();
}
#endif

int __secure_computing(const struct seccomp_data *sd)
{
        int mode = current->seccomp.mode;
        int this_syscall;

        if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
            unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
                return 0;

        this_syscall = sd ? sd->nr :
                syscall_get_nr(current, current_pt_regs());

        switch (mode) {
        case SECCOMP_MODE_STRICT:
                __secure_computing_strict(this_syscall);  /* may call do_exit */
                return 0;
        case SECCOMP_MODE_FILTER:
                return __seccomp_filter(this_syscall, sd, false);
        default:
                BUG();
        }
}
#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */

long prctl_get_seccomp(void)
{
        return current->seccomp.mode;
}

/**
 * seccomp_set_mode_strict: internal function for setting strict seccomp
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
static long seccomp_set_mode_strict(void)
{
        const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
        long ret = -EINVAL;

        spin_lock_irq(&current->sighand->siglock);

        if (!seccomp_may_assign_mode(seccomp_mode))
                goto out;

#ifdef TIF_NOTSC
        disable_TSC();
#endif
        seccomp_assign_mode(current, seccomp_mode, 0);
        ret = 0;

out:
        spin_unlock_irq(&current->sighand->siglock);

        return ret;
}

#ifdef CONFIG_SECCOMP_FILTER
static void seccomp_notify_free(struct seccomp_filter *filter)
{
        kfree(filter->notif);
        filter->notif = NULL;
}

static void seccomp_notify_detach(struct seccomp_filter *filter)
{
        struct seccomp_knotif *knotif;

        if (!filter)
                return;

        mutex_lock(&filter->notify_lock);

        /*
         * If this file is being closed because e.g. the task who owned it
         * died, let's wake everyone up who was waiting on us.
         */
        list_for_each_entry(knotif, &filter->notif->notifications, list) {
                if (knotif->state == SECCOMP_NOTIFY_REPLIED)
                        continue;

                knotif->state = SECCOMP_NOTIFY_REPLIED;
                knotif->error = -ENOSYS;
                knotif->val = 0;

                /*
                 * We do not need to wake up any pending addfd messages, as
                 * the notifier will do that for us, as this just looks
                 * like a standard reply.
                 */
                complete(&knotif->ready);
        }

        seccomp_notify_free(filter);
        mutex_unlock(&filter->notify_lock);
}

static int seccomp_notify_release(struct inode *inode, struct file *file)
{
        struct seccomp_filter *filter = file->private_data;

        seccomp_notify_detach(filter);
        __put_seccomp_filter(filter);
        return 0;
}

/* must be called with notif_lock held */
static inline struct seccomp_knotif *
find_notification(struct seccomp_filter *filter, u64 id)
{
        struct seccomp_knotif *cur;

        lockdep_assert_held(&filter->notify_lock);

        list_for_each_entry(cur, &filter->notif->notifications, list) {
                if (cur->id == id)
                        return cur;
        }

        return NULL;
}

static long seccomp_notify_recv(struct seccomp_filter *filter,
                                void __user *buf)
{
        struct seccomp_knotif *knotif = NULL, *cur;
        struct seccomp_notif unotif;
        ssize_t ret;

        /* Verify that we're not given garbage to keep struct extensible. */
        ret = check_zeroed_user(buf, sizeof(unotif));
        if (ret < 0)
                return ret;
        if (!ret)
                return -EINVAL;

        memset(&unotif, 0, sizeof(unotif));

        ret = down_interruptible(&filter->notif->request);
        if (ret < 0)
                return ret;

        mutex_lock(&filter->notify_lock);
        list_for_each_entry(cur, &filter->notif->notifications, list) {
                if (cur->state == SECCOMP_NOTIFY_INIT) {
                        knotif = cur;
                        break;
                }
        }

        /*
         * If we didn't find a notification, it could be that the task was
         * interrupted by a fatal signal between the time we were woken and
         * when we were able to acquire the notify_lock mutex.
         */
        if (!knotif) {
                ret = -ENOENT;
                goto out;
        }

        unotif.id = knotif->id;
        unotif.pid = task_pid_vnr(knotif->task);
        unotif.data = *(knotif->data);

        knotif->state = SECCOMP_NOTIFY_SENT;
        wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM);
        ret = 0;
out:
        mutex_unlock(&filter->notify_lock);

        if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
                ret = -EFAULT;

                /*
                 * Userspace screwed up. To make sure that we keep this
                 * notification alive, let's reset it back to INIT. It
                 * may have died when we released the lock, so we need to make
                 * sure it's still around.
                 */
                mutex_lock(&filter->notify_lock);
                knotif = find_notification(filter, unotif.id);
                if (knotif) {
                        knotif->state = SECCOMP_NOTIFY_INIT;
                        up(&filter->notif->request);
                }
                mutex_unlock(&filter->notify_lock);
        }

        return ret;
}

static long seccomp_notify_send(struct seccomp_filter *filter,
                                void __user *buf)
{
        struct seccomp_notif_resp resp = {};
        struct seccomp_knotif *knotif;
        long ret;

        if (copy_from_user(&resp, buf, sizeof(resp)))
                return -EFAULT;

        if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE)
                return -EINVAL;

        if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) &&
            (resp.error || resp.val))
                return -EINVAL;

        ret = mutex_lock_interruptible(&filter->notify_lock);
        if (ret < 0)
                return ret;

        knotif = find_notification(filter, resp.id);
        if (!knotif) {
                ret = -ENOENT;
                goto out;
        }

        /* Allow exactly one reply. */
        if (knotif->state != SECCOMP_NOTIFY_SENT) {
                ret = -EINPROGRESS;
                goto out;
        }

        ret = 0;
        knotif->state = SECCOMP_NOTIFY_REPLIED;
        knotif->error = resp.error;
        knotif->val = resp.val;
        knotif->flags = resp.flags;
        complete(&knotif->ready);
out:
        mutex_unlock(&filter->notify_lock);
        return ret;
}

static long seccomp_notify_id_valid(struct seccomp_filter *filter,
                                    void __user *buf)
{
        struct seccomp_knotif *knotif;
        u64 id;
        long ret;

        if (copy_from_user(&id, buf, sizeof(id)))
                return -EFAULT;

        ret = mutex_lock_interruptible(&filter->notify_lock);
        if (ret < 0)
                return ret;

        knotif = find_notification(filter, id);
        if (knotif && knotif->state == SECCOMP_NOTIFY_SENT)
                ret = 0;
        else
                ret = -ENOENT;

        mutex_unlock(&filter->notify_lock);
        return ret;
}

static long seccomp_notify_addfd(struct seccomp_filter *filter,
                                 struct seccomp_notif_addfd __user *uaddfd,
                                 unsigned int size)
{
        struct seccomp_notif_addfd addfd;
        struct seccomp_knotif *knotif;
        struct seccomp_kaddfd kaddfd;
        int ret;

        BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0);
        BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST);

        if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE)
                return -EINVAL;

        ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
        if (ret)
                return ret;

        if (addfd.newfd_flags & ~O_CLOEXEC)
                return -EINVAL;

        if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD)
                return -EINVAL;

        if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
                return -EINVAL;

        kaddfd.file = fget(addfd.srcfd);
        if (!kaddfd.file)
                return -EBADF;

        kaddfd.flags = addfd.newfd_flags;
        kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
                    addfd.newfd : -1;
        init_completion(&kaddfd.completion);

        ret = mutex_lock_interruptible(&filter->notify_lock);
        if (ret < 0)
                goto out;

        knotif = find_notification(filter, addfd.id);
        if (!knotif) {
                ret = -ENOENT;
                goto out_unlock;
        }

        /*
         * We do not want to allow for FD injection to occur before the
         * notification has been picked up by a userspace handler, or after
         * the notification has been replied to.
         */
        if (knotif->state != SECCOMP_NOTIFY_SENT) {
                ret = -EINPROGRESS;
                goto out_unlock;
        }

        list_add(&kaddfd.list, &knotif->addfd);
        complete(&knotif->ready);
        mutex_unlock(&filter->notify_lock);

        /* Now we wait for it to be processed or be interrupted */
        ret = wait_for_completion_interruptible(&kaddfd.completion);
        if (ret == 0) {
                /*
                 * We had a successful completion. The other side has already
                 * removed us from the addfd queue, and
                 * wait_for_completion_interruptible has a memory barrier upon
                 * success that lets us read this value directly without
                 * locking.
                 */
                ret = kaddfd.ret;
                goto out;
        }

        mutex_lock(&filter->notify_lock);
        /*
         * Even though we were woken up by a signal and not a successful
         * completion, a completion may have happened in the meantime.
         *
         * We need to check again if the addfd request has been handled,
         * and if not, we will remove it from the queue.
         */
        if (list_empty(&kaddfd.list))
                ret = kaddfd.ret;
        else
                list_del(&kaddfd.list);

out_unlock:
        mutex_unlock(&filter->notify_lock);
out:
        fput(kaddfd.file);

        return ret;
}
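
/*
 * Illustrative userspace sketch (not kernel code): a supervisor injecting
 * a file descriptor while the notification is still in SENT state, e.g.
 * to emulate a target's openat().  fd_opened_by_supervisor is a
 * hypothetical fd the supervisor opened on the target's behalf:
 *
 *      struct seccomp_notif_addfd addfd = {
 *              .id = req.id,
 *              .srcfd = fd_opened_by_supervisor,
 *              .newfd = 0,
 *              .newfd_flags = O_CLOEXEC,
 *              .flags = 0,
 *      };
 *      int installed = ioctl(listener_fd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
 *
 * With .flags == 0 the target allocates the lowest free fd (kaddfd.fd is
 * -1 above); SECCOMP_ADDFD_FLAG_SETFD with a chosen .newfd replaces that
 * descriptor instead, dup2()-style.
 */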
1412
1413static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
1414                                 unsigned long arg)
1415{
1416        struct seccomp_filter *filter = file->private_data;
1417        void __user *buf = (void __user *)arg;
1418
1419        /* Fixed-size ioctls */
1420        switch (cmd) {
1421        case SECCOMP_IOCTL_NOTIF_RECV:
1422                return seccomp_notify_recv(filter, buf);
1423        case SECCOMP_IOCTL_NOTIF_SEND:
1424                return seccomp_notify_send(filter, buf);
1425        case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
1426        case SECCOMP_IOCTL_NOTIF_ID_VALID:
1427                return seccomp_notify_id_valid(filter, buf);
1428        }
1429
1430        /* Extensible Argument ioctls */
1431#define EA_IOCTL(cmd)   ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK))
1432        switch (EA_IOCTL(cmd)) {
1433        case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD):
1434                return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd));
1435        default:
1436                return -EINVAL;
1437        }
1438}
1439
1440static __poll_t seccomp_notify_poll(struct file *file,
1441                                    struct poll_table_struct *poll_tab)
1442{
1443        struct seccomp_filter *filter = file->private_data;
1444        __poll_t ret = 0;
1445        struct seccomp_knotif *cur;
1446
1447        poll_wait(file, &filter->wqh, poll_tab);
1448
1449        if (mutex_lock_interruptible(&filter->notify_lock) < 0)
1450                return EPOLLERR;
1451
1452        list_for_each_entry(cur, &filter->notif->notifications, list) {
1453                if (cur->state == SECCOMP_NOTIFY_INIT)
1454                        ret |= EPOLLIN | EPOLLRDNORM;
1455                if (cur->state == SECCOMP_NOTIFY_SENT)
1456                        ret |= EPOLLOUT | EPOLLWRNORM;
1457                if ((ret & EPOLLIN) && (ret & EPOLLOUT))
1458                        break;
1459        }
1460
1461        mutex_unlock(&filter->notify_lock);
1462
1463        if (refcount_read(&filter->users) == 0)
1464                ret |= EPOLLHUP;
1465
1466        return ret;
1467}
1468
1469static const struct file_operations seccomp_notify_ops = {
1470        .poll = seccomp_notify_poll,
1471        .release = seccomp_notify_release,
1472        .unlocked_ioctl = seccomp_notify_ioctl,
1473        .compat_ioctl = seccomp_notify_ioctl,
1474};
1475
1476static struct file *init_listener(struct seccomp_filter *filter)
1477{
1478        struct file *ret;
1479
1480        ret = ERR_PTR(-ENOMEM);
1481        filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
1482        if (!filter->notif)
1483                goto out;
1484
1485        sema_init(&filter->notif->request, 0);
1486        filter->notif->next_id = get_random_u64();
1487        INIT_LIST_HEAD(&filter->notif->notifications);
1488
1489        ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
1490                                 filter, O_RDWR);
1491        if (IS_ERR(ret))
1492                goto out_notif;
1493
1494        /* The new file now holds a reference to the filter. */
1495        __get_seccomp_filter(filter);
1496
1497out_notif:
1498        if (IS_ERR(ret))
1499                seccomp_notify_free(filter);
1500out:
1501        return ret;
1502}
1503
1504/*
1505 * Does @new_child have a listener while an ancestor also has a listener?
1506 * If so, we'll want to reject this filter.
1507 * This only has to be tested for the current process, even in the TSYNC case,
1508 * because TSYNC installs @new_child with the same parent on all threads.
1509 * Note that @new_child is not hooked up to its parent at this point yet, so
1510 * we use current->seccomp.filter.
1511 */
1512static bool has_duplicate_listener(struct seccomp_filter *new_child)
1513{
1514        struct seccomp_filter *cur;
1515
1516        /* must be protected against concurrent TSYNC */
1517        lockdep_assert_held(&current->sighand->siglock);
1518
1519        if (!new_child->notif)
1520                return false;
1521        for (cur = current->seccomp.filter; cur; cur = cur->prev) {
1522                if (cur->notif)
1523                        return true;
1524        }
1525
1526        return false;
1527}
1528
1529/**
1530 * seccomp_set_mode_filter: internal function for setting seccomp filter
1531 * @flags:  flags to change filter behavior
1532 * @filter: struct sock_fprog containing filter
1533 *
1534 * This function may be called repeatedly to install additional filters.
1535 * Every filter successfully installed will be evaluated (most recently
1536 * installed first) for each system call the task makes.
1537 *
1538 * Once current->seccomp.mode is non-zero, it may not be changed.
1539 *
1540 * Returns 0 on success (or the new listener fd with NEW_LISTENER), or a negative error code on failure.
1541 */
1542static long seccomp_set_mode_filter(unsigned int flags,
1543                                    const char __user *filter)
1544{
1545        const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
1546        struct seccomp_filter *prepared = NULL;
1547        long ret = -EINVAL;
1548        int listener = -1;
1549        struct file *listener_f = NULL;
1550
1551        /* Validate flags. */
1552        if (flags & ~SECCOMP_FILTER_FLAG_MASK)
1553                return -EINVAL;
1554
1555        /*
1556         * In the successful case, NEW_LISTENER returns the new listener fd.
1557         * But in the failure case, TSYNC returns the pid of the thread that
1558         * could not be synced; combining the two flags leaves no way to tell
1559         * whether the call succeeded or failed. So, disallow this combination
1560         * unless the user explicitly requested no errors from TSYNC.
1561         */
1562        if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
1563            (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) &&
1564            ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0))
1565                return -EINVAL;
1566
1567        /* Prepare the new filter before holding any locks. */
1568        prepared = seccomp_prepare_user_filter(filter);
1569        if (IS_ERR(prepared))
1570                return PTR_ERR(prepared);
1571
1572        if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
1573                listener = get_unused_fd_flags(O_CLOEXEC);
1574                if (listener < 0) {
1575                        ret = listener;
1576                        goto out_free;
1577                }
1578
1579                listener_f = init_listener(prepared);
1580                if (IS_ERR(listener_f)) {
1581                        put_unused_fd(listener);
1582                        ret = PTR_ERR(listener_f);
1583                        goto out_free;
1584                }
1585        }
1586
1587        /*
1588         * Make sure we cannot change seccomp or nnp state via TSYNC
1589         * while another thread is in the middle of calling exec.
1590         */
1591        if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
1592            mutex_lock_killable(&current->signal->cred_guard_mutex))
1593                goto out_put_fd;
1594
1595        spin_lock_irq(&current->sighand->siglock);
1596
1597        if (!seccomp_may_assign_mode(seccomp_mode))
1598                goto out;
1599
1600        if (has_duplicate_listener(prepared)) {
1601                ret = -EBUSY;
1602                goto out;
1603        }
1604
1605        ret = seccomp_attach_filter(flags, prepared);
1606        if (ret)
1607                goto out;
1608        /* Do not free the successfully attached filter. */
1609        prepared = NULL;
1610
1611        seccomp_assign_mode(current, seccomp_mode, flags);
1612out:
1613        spin_unlock_irq(&current->sighand->siglock);
1614        if (flags & SECCOMP_FILTER_FLAG_TSYNC)
1615                mutex_unlock(&current->signal->cred_guard_mutex);
1616out_put_fd:
1617        if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
1618                if (ret) {
1619                        listener_f->private_data = NULL;
1620                        fput(listener_f);
1621                        put_unused_fd(listener);
1622                        seccomp_notify_detach(prepared);
1623                } else {
1624                        fd_install(listener, listener_f);
1625                        ret = listener;
1626                }
1627        }
1628out_free:
1629        seccomp_filter_free(prepared);
1630        return ret;
1631}
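
/*
 * Example (illustrative userspace sketch, not part of the kernel):
 * installing a filter through seccomp(2). This cBPF program denies
 * getpid() with EPERM and allows everything else (a production filter
 * should also check seccomp_data.arch); PR_SET_NO_NEW_PRIVS is required
 * first unless the caller has CAP_SYS_ADMIN. Needs <linux/filter.h>,
 * <linux/seccomp.h>, <sys/prctl.h>, <sys/syscall.h> and <stddef.h>:
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *			 offsetof(struct seccomp_data, nr)),
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 *	};
 *	struct sock_fprog prog = {
 *		.len = sizeof(insns) / sizeof(insns[0]),
 *		.filter = insns,
 *	};
 *
 *	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
 *		return -1;
 *	if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog))
 *		return -1;
 *
 * Passing SECCOMP_FILTER_FLAG_NEW_LISTENER as the flags argument makes a
 * successful call return the listener fd used by the notify ioctls above.
 */
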
1632#else
1633static inline long seccomp_set_mode_filter(unsigned int flags,
1634                                           const char __user *filter)
1635{
1636        return -EINVAL;
1637}
1638#endif
1639
1640static long seccomp_get_action_avail(const char __user *uaction)
1641{
1642        u32 action;
1643
1644        if (copy_from_user(&action, uaction, sizeof(action)))
1645                return -EFAULT;
1646
1647        switch (action) {
1648        case SECCOMP_RET_KILL_PROCESS:
1649        case SECCOMP_RET_KILL_THREAD:
1650        case SECCOMP_RET_TRAP:
1651        case SECCOMP_RET_ERRNO:
1652        case SECCOMP_RET_USER_NOTIF:
1653        case SECCOMP_RET_TRACE:
1654        case SECCOMP_RET_LOG:
1655        case SECCOMP_RET_ALLOW:
1656                break;
1657        default:
1658                return -EOPNOTSUPP;
1659        }
1660
1661        return 0;
1662}
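
/*
 * Example (illustrative userspace sketch, not part of the kernel):
 * probing whether the running kernel knows a given action before relying
 * on it, e.g. SECCOMP_RET_USER_NOTIF:
 *
 *	__u32 action = SECCOMP_RET_USER_NOTIF;
 *
 *	if (syscall(__NR_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action) == 0)
 *		;			// action is supported
 *	else if (errno == EOPNOTSUPP)
 *		;			// older kernel: fall back
 */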
1663
1664static long seccomp_get_notif_sizes(void __user *usizes)
1665{
1666        struct seccomp_notif_sizes sizes = {
1667                .seccomp_notif = sizeof(struct seccomp_notif),
1668                .seccomp_notif_resp = sizeof(struct seccomp_notif_resp),
1669                .seccomp_data = sizeof(struct seccomp_data),
1670        };
1671
1672        if (copy_to_user(usizes, &sizes, sizeof(sizes)))
1673                return -EFAULT;
1674
1675        return 0;
1676}
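
/*
 * Example (illustrative userspace sketch, not part of the kernel):
 * catching a mismatch between the kernel's notification structures and
 * the headers the supervisor was built against:
 *
 *	struct seccomp_notif_sizes sizes;
 *
 *	if (syscall(__NR_seccomp, SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0)
 *		return -1;
 *	if (sizes.seccomp_notif != sizeof(struct seccomp_notif) ||
 *	    sizes.seccomp_notif_resp != sizeof(struct seccomp_notif_resp))
 *		return -1;		// headers and kernel disagree
 */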
1677
1678/* Common entry point for both prctl and syscall. */
1679static long do_seccomp(unsigned int op, unsigned int flags,
1680                       void __user *uargs)
1681{
1682        switch (op) {
1683        case SECCOMP_SET_MODE_STRICT:
1684                if (flags != 0 || uargs != NULL)
1685                        return -EINVAL;
1686                return seccomp_set_mode_strict();
1687        case SECCOMP_SET_MODE_FILTER:
1688                return seccomp_set_mode_filter(flags, uargs);
1689        case SECCOMP_GET_ACTION_AVAIL:
1690                if (flags != 0)
1691                        return -EINVAL;
1692
1693                return seccomp_get_action_avail(uargs);
1694        case SECCOMP_GET_NOTIF_SIZES:
1695                if (flags != 0)
1696                        return -EINVAL;
1697
1698                return seccomp_get_notif_sizes(uargs);
1699        default:
1700                return -EINVAL;
1701        }
1702}
1703
1704SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
1705                         void __user *, uargs)
1706{
1707        return do_seccomp(op, flags, uargs);
1708}
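
/*
 * Example (illustrative, not part of the kernel): C libraries have
 * historically not shipped a seccomp(2) wrapper, so userspace commonly
 * defines a thin one around syscall(2):
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int seccomp(unsigned int op, unsigned int flags, void *args)
 *	{
 *		return syscall(__NR_seccomp, op, flags, args);
 *	}
 */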
1709
1710/**
1711 * prctl_set_seccomp: configures current->seccomp.mode
1712 * @seccomp_mode: requested mode to use
1713 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
1714 *
1715 * Returns 0 on success or a negative error code on failure.
1716 */
1717long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
1718{
1719        unsigned int op;
1720        void __user *uargs;
1721
1722        switch (seccomp_mode) {
1723        case SECCOMP_MODE_STRICT:
1724                op = SECCOMP_SET_MODE_STRICT;
1725                /*
1726                 * Setting strict mode through prctl has always ignored the
1727                 * filter argument, so force it to NULL here to pass the
1728                 * uargs check in do_seccomp().
1729                 */
1730                uargs = NULL;
1731                break;
1732        case SECCOMP_MODE_FILTER:
1733                op = SECCOMP_SET_MODE_FILTER;
1734                uargs = filter;
1735                break;
1736        default:
1737                return -EINVAL;
1738        }
1739
1740        /* prctl interface doesn't have flags, so they are always zero. */
1741        return do_seccomp(op, 0, uargs);
1742}
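
/*
 * Example (illustrative userspace sketch, not part of the kernel): the
 * legacy prctl interface. Strict mode leaves only read(2), write(2),
 * _exit(2) and sigreturn(2) usable afterwards; filter mode behaves like
 * seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog) but cannot pass flags:
 *
 *	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) < 0)
 *		return -1;
 *
 * or, with a struct sock_fprog prepared as in the earlier example:
 *
 *	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
 *		return -1;
 */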
1743
1744#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
1745static struct seccomp_filter *get_nth_filter(struct task_struct *task,
1746                                             unsigned long filter_off)
1747{
1748        struct seccomp_filter *orig, *filter;
1749        unsigned long count;
1750
1751        /*
1752         * Note: this is only correct because the caller is expected to be the
1753         * (ptrace) tracer of the task; otherwise lock_task_sighand() is needed.
1754         */
1755        spin_lock_irq(&task->sighand->siglock);
1756
1757        if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
1758                spin_unlock_irq(&task->sighand->siglock);
1759                return ERR_PTR(-EINVAL);
1760        }
1761
1762        orig = task->seccomp.filter;
1763        __get_seccomp_filter(orig);
1764        spin_unlock_irq(&task->sighand->siglock);
1765
1766        count = 0;
1767        for (filter = orig; filter; filter = filter->prev)
1768                count++;
1769
1770        if (filter_off >= count) {
1771                filter = ERR_PTR(-ENOENT);
1772                goto out;
1773        }
1774
1775        count -= filter_off;
1776        for (filter = orig; filter && count > 1; filter = filter->prev)
1777                count--;
1778
1779        if (WARN_ON(count != 1 || !filter)) {
1780                filter = ERR_PTR(-ENOENT);
1781                goto out;
1782        }
1783
1784        __get_seccomp_filter(filter);
1785
1786out:
1787        __put_seccomp_filter(orig);
1788        return filter;
1789}
1790
1791long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
1792                        void __user *data)
1793{
1794        struct seccomp_filter *filter;
1795        struct sock_fprog_kern *fprog;
1796        long ret;
1797
1798        if (!capable(CAP_SYS_ADMIN) ||
1799            current->seccomp.mode != SECCOMP_MODE_DISABLED) {
1800                return -EACCES;
1801        }
1802
1803        filter = get_nth_filter(task, filter_off);
1804        if (IS_ERR(filter))
1805                return PTR_ERR(filter);
1806
1807        fprog = filter->prog->orig_prog;
1808        if (!fprog) {
1809                /* This must be a new non-cBPF filter, since we save
1810                 * every cBPF filter's orig_prog above when
1811                 * CONFIG_CHECKPOINT_RESTORE is enabled.
1812                 */
1813                ret = -EMEDIUMTYPE;
1814                goto out;
1815        }
1816
1817        ret = fprog->len;
1818        if (!data)
1819                goto out;
1820
1821        if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
1822                ret = -EFAULT;
1823
1824out:
1825        __put_seccomp_filter(filter);
1826        return ret;
1827}
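
/*
 * Example (illustrative userspace sketch, not part of the kernel): a
 * checkpoint/restore-style dump of one of a tracee's filters. The caller
 * must have CAP_SYS_ADMIN and must not be under seccomp itself; "pid" is
 * assumed to be an attached, stopped tracee, and the ptrace addr argument
 * is the filter_off index (see get_nth_filter() above for the ordering):
 *
 *	long cnt = ptrace(PTRACE_SECCOMP_GET_FILTER, pid, 0, NULL);
 *	struct sock_filter *insns;
 *
 *	if (cnt < 0)
 *		return -1;
 *	insns = calloc(cnt, sizeof(*insns));
 *	if (!insns)
 *		return -1;
 *	if (ptrace(PTRACE_SECCOMP_GET_FILTER, pid, 0, insns) != cnt)
 *		return -1;
 *	// insns[0..cnt-1] now holds the cBPF program
 */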
1828
1829long seccomp_get_metadata(struct task_struct *task,
1830                          unsigned long size, void __user *data)
1831{
1832        long ret;
1833        struct seccomp_filter *filter;
1834        struct seccomp_metadata kmd = {};
1835
1836        if (!capable(CAP_SYS_ADMIN) ||
1837            current->seccomp.mode != SECCOMP_MODE_DISABLED) {
1838                return -EACCES;
1839        }
1840
1841        size = min_t(unsigned long, size, sizeof(kmd));
1842
1843        if (size < sizeof(kmd.filter_off))
1844                return -EINVAL;
1845
1846        if (copy_from_user(&kmd.filter_off, data, sizeof(kmd.filter_off)))
1847                return -EFAULT;
1848
1849        filter = get_nth_filter(task, kmd.filter_off);
1850        if (IS_ERR(filter))
1851                return PTR_ERR(filter);
1852
1853        if (filter->log)
1854                kmd.flags |= SECCOMP_FILTER_FLAG_LOG;
1855
1856        ret = size;
1857        if (copy_to_user(data, &kmd, size))
1858                ret = -EFAULT;
1859
1860        __put_seccomp_filter(filter);
1861        return ret;
1862}
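
/*
 * Example (illustrative userspace sketch, not part of the kernel):
 * reading a filter's metadata; the ptrace addr argument carries the size
 * of the structure, data points at it, and filter_off is the input:
 *
 *	struct seccomp_metadata md = { .filter_off = 0 };
 *
 *	if (ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md) < 0)
 *		return -1;
 *	if (md.flags & SECCOMP_FILTER_FLAG_LOG)
 *		;			// installed with FLAG_LOG
 */
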
1863#endif
1864
1865#ifdef CONFIG_SYSCTL
1866
1867/* Human-readable action names for friendly sysctl interaction */
1868#define SECCOMP_RET_KILL_PROCESS_NAME   "kill_process"
1869#define SECCOMP_RET_KILL_THREAD_NAME    "kill_thread"
1870#define SECCOMP_RET_TRAP_NAME           "trap"
1871#define SECCOMP_RET_ERRNO_NAME          "errno"
1872#define SECCOMP_RET_USER_NOTIF_NAME     "user_notif"
1873#define SECCOMP_RET_TRACE_NAME          "trace"
1874#define SECCOMP_RET_LOG_NAME            "log"
1875#define SECCOMP_RET_ALLOW_NAME          "allow"
1876
1877static const char seccomp_actions_avail[] =
1878                                SECCOMP_RET_KILL_PROCESS_NAME   " "
1879                                SECCOMP_RET_KILL_THREAD_NAME    " "
1880                                SECCOMP_RET_TRAP_NAME           " "
1881                                SECCOMP_RET_ERRNO_NAME          " "
1882                                SECCOMP_RET_USER_NOTIF_NAME     " "
1883                                SECCOMP_RET_TRACE_NAME          " "
1884                                SECCOMP_RET_LOG_NAME            " "
1885                                SECCOMP_RET_ALLOW_NAME;
1886
1887struct seccomp_log_name {
1888        u32             log;
1889        const char      *name;
1890};
1891
1892static const struct seccomp_log_name seccomp_log_names[] = {
1893        { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
1894        { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
1895        { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
1896        { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
1897        { SECCOMP_LOG_USER_NOTIF, SECCOMP_RET_USER_NOTIF_NAME },
1898        { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
1899        { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
1900        { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
1901        { }
1902};
1903
1904static bool seccomp_names_from_actions_logged(char *names, size_t size,
1905                                              u32 actions_logged,
1906                                              const char *sep)
1907{
1908        const struct seccomp_log_name *cur;
1909        bool append_sep = false;
1910
1911        for (cur = seccomp_log_names; cur->name && size; cur++) {
1912                ssize_t ret;
1913
1914                if (!(actions_logged & cur->log))
1915                        continue;
1916
1917                if (append_sep) {
1918                        ret = strscpy(names, sep, size);
1919                        if (ret < 0)
1920                                return false;
1921
1922                        names += ret;
1923                        size -= ret;
1924                } else
1925                        append_sep = true;
1926
1927                ret = strscpy(names, cur->name, size);
1928                if (ret < 0)
1929                        return false;
1930
1931                names += ret;
1932                size -= ret;
1933        }
1934
1935        return true;
1936}
1937
1938static bool seccomp_action_logged_from_name(u32 *action_logged,
1939                                            const char *name)
1940{
1941        const struct seccomp_log_name *cur;
1942
1943        for (cur = seccomp_log_names; cur->name; cur++) {
1944                if (!strcmp(cur->name, name)) {
1945                        *action_logged = cur->log;
1946                        return true;
1947                }
1948        }
1949
1950        return false;
1951}
1952
1953static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
1954{
1955        char *name;
1956
1957        *actions_logged = 0;
1958        while ((name = strsep(&names, " ")) && *name) {
1959                u32 action_logged = 0;
1960
1961                if (!seccomp_action_logged_from_name(&action_logged, name))
1962                        return false;
1963
1964                *actions_logged |= action_logged;
1965        }
1966
1967        return true;
1968}
1969
1970static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
1971                               size_t *lenp, loff_t *ppos)
1972{
1973        char names[sizeof(seccomp_actions_avail)];
1974        struct ctl_table table;
1975
1976        memset(names, 0, sizeof(names));
1977
1978        if (!seccomp_names_from_actions_logged(names, sizeof(names),
1979                                               seccomp_actions_logged, " "))
1980                return -EINVAL;
1981
1982        table = *ro_table;
1983        table.data = names;
1984        table.maxlen = sizeof(names);
1985        return proc_dostring(&table, 0, buffer, lenp, ppos);
1986}
1987
1988static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
1989                                size_t *lenp, loff_t *ppos, u32 *actions_logged)
1990{
1991        char names[sizeof(seccomp_actions_avail)];
1992        struct ctl_table table;
1993        int ret;
1994
1995        if (!capable(CAP_SYS_ADMIN))
1996                return -EPERM;
1997
1998        memset(names, 0, sizeof(names));
1999
2000        table = *ro_table;
2001        table.data = names;
2002        table.maxlen = sizeof(names);
2003        ret = proc_dostring(&table, 1, buffer, lenp, ppos);
2004        if (ret)
2005                return ret;
2006
2007        if (!seccomp_actions_logged_from_names(actions_logged, table.data))
2008                return -EINVAL;
2009
2010        if (*actions_logged & SECCOMP_LOG_ALLOW)
2011                return -EINVAL;
2012
2013        seccomp_actions_logged = *actions_logged;
2014        return 0;
2015}
2016
2017static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
2018                                 int ret)
2019{
2020        char names[sizeof(seccomp_actions_avail)];
2021        char old_names[sizeof(seccomp_actions_avail)];
2022        const char *new = names;
2023        const char *old = old_names;
2024
2025        if (!audit_enabled)
2026                return;
2027
2028        memset(names, 0, sizeof(names));
2029        memset(old_names, 0, sizeof(old_names));
2030
2031        if (ret)
2032                new = "?";
2033        else if (!actions_logged)
2034                new = "(none)";
2035        else if (!seccomp_names_from_actions_logged(names, sizeof(names),
2036                                                    actions_logged, ","))
2037                new = "?";
2038
2039        if (!old_actions_logged)
2040                old = "(none)";
2041        else if (!seccomp_names_from_actions_logged(old_names,
2042                                                    sizeof(old_names),
2043                                                    old_actions_logged, ","))
2044                old = "?";
2045
2046        audit_seccomp_actions_logged(new, old, !ret);
2047}
2048
2049static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
2050                                          void *buffer, size_t *lenp,
2051                                          loff_t *ppos)
2052{
2053        int ret;
2054
2055        if (write) {
2056                u32 actions_logged = 0;
2057                u32 old_actions_logged = seccomp_actions_logged;
2058
2059                ret = write_actions_logged(ro_table, buffer, lenp, ppos,
2060                                           &actions_logged);
2061                audit_actions_logged(actions_logged, old_actions_logged, ret);
2062        } else
2063                ret = read_actions_logged(ro_table, buffer, lenp, ppos);
2064
2065        return ret;
2066}
2067
2068static struct ctl_path seccomp_sysctl_path[] = {
2069        { .procname = "kernel", },
2070        { .procname = "seccomp", },
2071        { }
2072};
2073
2074static struct ctl_table seccomp_sysctl_table[] = {
2075        {
2076                .procname       = "actions_avail",
2077                .data           = (void *) &seccomp_actions_avail,
2078                .maxlen         = sizeof(seccomp_actions_avail),
2079                .mode           = 0444,
2080                .proc_handler   = proc_dostring,
2081        },
2082        {
2083                .procname       = "actions_logged",
2084                .mode           = 0644,
2085                .proc_handler   = seccomp_actions_logged_handler,
2086        },
2087        { }
2088};
2089
2090static int __init seccomp_sysctl_init(void)
2091{
2092        struct ctl_table_header *hdr;
2093
2094        hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
2095        if (!hdr)
2096                pr_warn("sysctl registration failed\n");
2097        else
2098                kmemleak_not_leak(hdr);
2099
2100        return 0;
2101}
2102
2103device_initcall(seccomp_sysctl_init)
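
/*
 * Example (illustrative userspace sketch, not part of the kernel): the
 * table above surfaces as /proc/sys/kernel/seccomp/. Reading
 * actions_avail yields the space-separated list built from
 * seccomp_actions_avail[]:
 *
 *	char buf[128] = { 0 };
 *	int fd = open("/proc/sys/kernel/seccomp/actions_avail", O_RDONLY);
 *
 *	if (fd >= 0 && read(fd, buf, sizeof(buf) - 1) > 0)
 *		;	// "kill_process kill_thread trap errno ... allow"
 *
 * Writing a subset of those names (anything but "allow") to
 * actions_logged selects which actions are logged; that write requires
 * CAP_SYS_ADMIN.
 */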
2104
2105#endif /* CONFIG_SYSCTL */
2106