/*
 * linux/kernel/seccomp.c
 *
 * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
 *
 * Copyright (C) 2012 Google, Inc.
 * Will Drewry <wad@chromium.org>
 *
 * This defines a simple but solid secure-computing facility.
 *
 * Mode 1 uses a fixed list of allowed system calls.
 * Mode 2 allows user-defined system call filters in the form
 *        of Berkeley Packet Filters/Linux Socket Filters.
 */

#include <linux/atomic.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/seccomp.h>

/* #define SECCOMP_DEBUG 1 */

#ifdef CONFIG_SECCOMP_FILTER
#include <asm/syscall.h>
#include <linux/filter.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>

/**
 * struct seccomp_filter - container for seccomp BPF programs
 *
 * @usage: reference count to manage the object lifetime.
 *         get/put helpers should be used when accessing an instance
 *         outside of a lifetime-guarded section.  In general, this
 *         is only needed for handling filters shared across tasks.
 * @prev: points to a previously installed, or inherited, filter
 * @prog: the BPF program to evaluate
 *
 * seccomp_filter objects are organized in a tree linked via the @prev
 * pointer.  For any task, it appears to be a singly-linked list starting
 * with current->seccomp.filter, the most recently attached or inherited filter.
 * However, multiple filters may share a @prev node, by way of fork(), which
 * results in a unidirectional tree existing in memory.  This is similar to
 * how namespaces work.
 *
 * seccomp_filter objects should never be modified after being attached
 * to a task_struct (other than @usage).
 */
struct seccomp_filter {
        atomic_t usage;
        struct seccomp_filter *prev;
        struct sk_filter *prog;
};
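
/*
 * Illustrative layout (assumed scenario, for documentation only): if task A
 * installs filters F1 then F2 and forks task B, which then installs F3, the
 * two per-task lists share nodes by way of @prev:
 *
 *	A: seccomp.filter -> F2 -> F1
 *	B: seccomp.filter -> F3 -> F2 -> F1
 *
 * Each node is refcounted via @usage, so F2 and F1 are only freed once both
 * tasks have dropped their references.
 */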

/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
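
/*
 * Worked out: sizeof(struct sock_filter) is 8 bytes, so the budget above is
 * (1 << 18) / 8 = 32768 classic BPF instructions along any path.
 */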

/*
 * Endianness is explicitly ignored and left for BPF program authors to manage
 * as per the specific architecture.
 */
static void populate_seccomp_data(struct seccomp_data *sd)
{
        struct task_struct *task = current;
        struct pt_regs *regs = task_pt_regs(task);
        unsigned long args[6];

        sd->nr = syscall_get_nr(task, regs);
        sd->arch = syscall_get_arch();
        syscall_get_arguments(task, regs, 0, 6, args);
        sd->args[0] = args[0];
        sd->args[1] = args[1];
        sd->args[2] = args[2];
        sd->args[3] = args[3];
        sd->args[4] = args[4];
        sd->args[5] = args[5];
        sd->instruction_pointer = KSTK_EIP(task);
}

/**
 * seccomp_check_filter - verify seccomp filter code
 * @filter: filter to verify
 * @flen: length of filter
 *
 * Takes a previously checked filter (by sk_chk_filter) and rewrites all
 * filter code that would load struct sk_buff data into loads from struct
 * seccomp_data.  It also enforces length and alignment checking of those
 * loads.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
{
        int pc;
        for (pc = 0; pc < flen; pc++) {
                struct sock_filter *ftest = &filter[pc];
                u16 code = ftest->code;
                u32 k = ftest->k;

                switch (code) {
                case BPF_LD | BPF_W | BPF_ABS:
                        ftest->code = BPF_LDX | BPF_W | BPF_ABS;
                        /* 32-bit aligned and not out of bounds. */
                        if (k >= sizeof(struct seccomp_data) || k & 3)
                                return -EINVAL;
                        continue;
                case BPF_LD | BPF_W | BPF_LEN:
                        ftest->code = BPF_LD | BPF_IMM;
                        ftest->k = sizeof(struct seccomp_data);
                        continue;
                case BPF_LDX | BPF_W | BPF_LEN:
                        ftest->code = BPF_LDX | BPF_IMM;
                        ftest->k = sizeof(struct seccomp_data);
                        continue;
                /* Explicitly include allowed calls. */
                case BPF_RET | BPF_K:
                case BPF_RET | BPF_A:
                case BPF_ALU | BPF_ADD | BPF_K:
                case BPF_ALU | BPF_ADD | BPF_X:
                case BPF_ALU | BPF_SUB | BPF_K:
                case BPF_ALU | BPF_SUB | BPF_X:
                case BPF_ALU | BPF_MUL | BPF_K:
                case BPF_ALU | BPF_MUL | BPF_X:
                case BPF_ALU | BPF_DIV | BPF_K:
                case BPF_ALU | BPF_DIV | BPF_X:
                case BPF_ALU | BPF_AND | BPF_K:
                case BPF_ALU | BPF_AND | BPF_X:
                case BPF_ALU | BPF_OR | BPF_K:
                case BPF_ALU | BPF_OR | BPF_X:
                case BPF_ALU | BPF_XOR | BPF_K:
                case BPF_ALU | BPF_XOR | BPF_X:
                case BPF_ALU | BPF_LSH | BPF_K:
                case BPF_ALU | BPF_LSH | BPF_X:
                case BPF_ALU | BPF_RSH | BPF_K:
                case BPF_ALU | BPF_RSH | BPF_X:
                case BPF_ALU | BPF_NEG:
                case BPF_LD | BPF_IMM:
                case BPF_LDX | BPF_IMM:
                case BPF_MISC | BPF_TAX:
                case BPF_MISC | BPF_TXA:
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                case BPF_ST:
                case BPF_STX:
                case BPF_JMP | BPF_JA:
                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                        continue;
                default:
                        return -EINVAL;
                }
        }
        return 0;
}
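
/*
 * Illustrative sketch (assumed example, not part of this file): a classic
 * BPF instruction that loads the syscall number,
 *
 *	struct sock_filter insn =
 *		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *			 offsetof(struct seccomp_data, nr));
 *
 * passes the bounds/alignment check above and comes out with insn.code
 * rewritten to the seccomp-private BPF_LDX | BPF_W | BPF_ABS encoding,
 * which the converter treats as a load from the seccomp_data context
 * rather than from a socket buffer.
 */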

/**
 * seccomp_run_filters - evaluates all seccomp filters against @syscall
 * @syscall: number of the current system call
 *
 * Returns valid seccomp BPF response codes.
 */
static u32 seccomp_run_filters(int syscall)
{
        struct seccomp_filter *f;
        struct seccomp_data sd;
        u32 ret = SECCOMP_RET_ALLOW;

        /* Ensure unexpected behavior doesn't result in failing open. */
        if (WARN_ON(current->seccomp.filter == NULL))
                return SECCOMP_RET_KILL;

        populate_seccomp_data(&sd);

        /*
         * All filters in the list are evaluated and the lowest BPF return
         * value always takes priority (ignoring the DATA).
         */
        for (f = current->seccomp.filter; f; f = f->prev) {
                u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd);

                if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
                        ret = cur_ret;
        }
        return ret;
}
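
/*
 * For reference (action values from include/uapi/linux/seccomp.h), the
 * ordering that makes "lowest return value wins" meaningful:
 *
 *	SECCOMP_RET_KILL  (0x00000000) < SECCOMP_RET_TRAP  (0x00030000) <
 *	SECCOMP_RET_ERRNO (0x00050000) < SECCOMP_RET_TRACE (0x7ff00000) <
 *	SECCOMP_RET_ALLOW (0x7fff0000)
 */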

/**
 * seccomp_attach_filter: Attaches a seccomp filter to current.
 * @fprog: BPF program to install
 *
 * Returns 0 on success or an errno on failure.
 */
static long seccomp_attach_filter(struct sock_fprog *fprog)
{
        struct seccomp_filter *filter;
        unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
        unsigned long total_insns = fprog->len;
        struct sock_filter *fp;
        int new_len;
        long ret;

        if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
                return -EINVAL;

        for (filter = current->seccomp.filter; filter; filter = filter->prev)
                total_insns += filter->prog->len + 4;  /* include a 4 instr penalty */
        if (total_insns > MAX_INSNS_PER_PATH)
                return -ENOMEM;

        /*
         * Installing a seccomp filter requires that the task has
         * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
         * This avoids scenarios where unprivileged tasks can affect the
         * behavior of privileged children.
         */
        if (!current->no_new_privs &&
            security_capable_noaudit(current_cred(), current_user_ns(),
                                     CAP_SYS_ADMIN) != 0)
                return -EACCES;

        fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
        if (!fp)
                return -ENOMEM;

        /* Copy the instructions from fprog. */
        ret = -EFAULT;
        if (copy_from_user(fp, fprog->filter, fp_size))
                goto free_prog;

        /* Check and rewrite the fprog via the skb checker */
        ret = sk_chk_filter(fp, fprog->len);
        if (ret)
                goto free_prog;

        /* Check and rewrite the fprog for seccomp use */
        ret = seccomp_check_filter(fp, fprog->len);
        if (ret)
                goto free_prog;
        /*
         * Convert 'sock_filter' insns to 'sock_filter_int' insns.  This
         * first pass, with a NULL destination, only computes the length
         * of the converted program.
         */
        ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
        if (ret)
                goto free_prog;

        /* Allocate a new seccomp_filter */
        ret = -ENOMEM;
        filter = kzalloc(sizeof(struct seccomp_filter),
                         GFP_KERNEL|__GFP_NOWARN);
        if (!filter)
                goto free_prog;

        filter->prog = kzalloc(sk_filter_size(new_len),
                               GFP_KERNEL|__GFP_NOWARN);
        if (!filter->prog)
                goto free_filter;

        ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
        if (ret)
                goto free_filter_prog;
        kfree(fp);

        atomic_set(&filter->usage, 1);
        filter->prog->len = new_len;

        sk_filter_select_runtime(filter->prog);

        /*
         * If there is an existing filter, make it the prev and don't drop its
         * task reference.
         */
        filter->prev = current->seccomp.filter;
        current->seccomp.filter = filter;
        return 0;

free_filter_prog:
        kfree(filter->prog);
free_filter:
        kfree(filter);
free_prog:
        kfree(fp);
        return ret;
}

/**
 * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
 * @user_filter: pointer to the user data containing a sock_fprog.
 *
 * Returns 0 on success and non-zero otherwise.
 */
static long seccomp_attach_user_filter(char __user *user_filter)
{
        struct sock_fprog fprog;
        long ret = -EFAULT;

#ifdef CONFIG_COMPAT
        if (is_compat_task()) {
                struct compat_sock_fprog fprog32;
                if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
                        goto out;
                fprog.len = fprog32.len;
                fprog.filter = compat_ptr(fprog32.filter);
        } else /* falls through to the if below. */
#endif
        if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
                goto out;
        ret = seccomp_attach_filter(&fprog);
out:
        return ret;
}
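
/*
 * Illustrative sketch (assumed example, not part of this file): userspace
 * reaches the code above through prctl().  PR_SET_NO_NEW_PRIVS avoids the
 * CAP_SYS_ADMIN requirement; error handling is elided, and a production
 * filter should also check seccomp_data->arch first.
 *
 *	#include <errno.h>
 *	#include <stddef.h>
 *	#include <sys/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <linux/filter.h>
 *	#include <linux/seccomp.h>
 *
 *	struct sock_filter insns[] = {
 *		// A = seccomp_data->nr
 *		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *			 offsetof(struct seccomp_data, nr)),
 *		// getpid() fails with EPERM; everything else is allowed
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 *	};
 *	struct sock_fprog fprog = {
 *		.len = sizeof(insns) / sizeof(insns[0]),
 *		.filter = insns,
 *	};
 *
 *	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *	prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);
 */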

/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
        struct seccomp_filter *orig = tsk->seccomp.filter;
        if (!orig)
                return;
        /* Reference count is bounded by the number of total processes. */
        atomic_inc(&orig->usage);
}

/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
        struct seccomp_filter *orig = tsk->seccomp.filter;
        /* Clean up single-reference branches iteratively. */
        while (orig && atomic_dec_and_test(&orig->usage)) {
                struct seccomp_filter *freeme = orig;
                orig = orig->prev;
                sk_filter_free(freeme->prog);
                kfree(freeme);
        }
}

/**
 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
 */
static void seccomp_send_sigsys(int syscall, int reason)
{
        struct siginfo info;
        memset(&info, 0, sizeof(info));
        info.si_signo = SIGSYS;
        info.si_code = SYS_SECCOMP;
        info.si_call_addr = (void __user *)KSTK_EIP(current);
        info.si_errno = reason;
        info.si_arch = syscall_get_arch();
        info.si_syscall = syscall;
        force_sig_info(SIGSYS, &info, current);
}
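
/*
 * Illustrative sketch (assumed example, not part of this file): a userspace
 * SIGSYS handler, registered with sigaction() and SA_SIGINFO and assuming a
 * libc that exposes the sigsys siginfo fields, can recover what was filled
 * in above:
 *
 *	static void sigsys_handler(int sig, siginfo_t *info, void *ucontext)
 *	{
 *		if (info->si_code != SYS_SECCOMP)
 *			return;
 *		// info->si_syscall: number of the trapped system call
 *		// info->si_errno:   the filter's SECCOMP_RET_DATA bits
 *		// info->si_arch:    AUDIT_ARCH_* of the calling convention
 *	}
 */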
#endif  /* CONFIG_SECCOMP_FILTER */

/*
 * Secure computing mode 1 allows only read/write/exit/sigreturn.
 * To be fully secure this must be combined with rlimit
 * to limit the stack allocations too.
 */
static int mode1_syscalls[] = {
        __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
        0, /* null terminated */
};

#ifdef CONFIG_COMPAT
static int mode1_syscalls_32[] = {
        __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
        0, /* null terminated */
};
#endif
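
/*
 * Illustrative sketch (assumed example, not part of this file): strict mode
 * takes no filter argument; after the call, any syscall outside the lists
 * above terminates the task with SIGKILL.
 *
 *	#include <sys/prctl.h>
 *	#include <linux/seccomp.h>
 *
 *	prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
 *	// only read()/write() on already-open fds, _exit() and
 *	// sigreturn remain usable from here on
 */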

int __secure_computing(int this_syscall)
{
        int mode = current->seccomp.mode;
        int exit_sig = 0;
        int *syscall;
        u32 ret;

        switch (mode) {
        case SECCOMP_MODE_STRICT:
                syscall = mode1_syscalls;
#ifdef CONFIG_COMPAT
                if (is_compat_task())
                        syscall = mode1_syscalls_32;
#endif
                do {
                        if (*syscall == this_syscall)
                                return 0;
                } while (*++syscall);
                exit_sig = SIGKILL;
                ret = SECCOMP_RET_KILL;
                break;
#ifdef CONFIG_SECCOMP_FILTER
        case SECCOMP_MODE_FILTER: {
                int data;
                struct pt_regs *regs = task_pt_regs(current);
                ret = seccomp_run_filters(this_syscall);
                data = ret & SECCOMP_RET_DATA;
                ret &= SECCOMP_RET_ACTION;
                switch (ret) {
                case SECCOMP_RET_ERRNO:
                        /* Set the low-order 16 bits as an errno. */
                        syscall_set_return_value(current, regs,
                                                 -data, 0);
                        goto skip;
                case SECCOMP_RET_TRAP:
                        /* Show the handler the original registers. */
                        syscall_rollback(current, regs);
                        /* Let the filter pass back 16 bits of data. */
                        seccomp_send_sigsys(this_syscall, data);
                        goto skip;
                case SECCOMP_RET_TRACE:
                        /* Skip these calls if there is no tracer. */
                        if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
                                syscall_set_return_value(current, regs,
                                                         -ENOSYS, 0);
                                goto skip;
                        }
                        /* Allow the BPF to provide the event message */
                        ptrace_event(PTRACE_EVENT_SECCOMP, data);
                        /*
                         * The delivery of a fatal signal during event
                         * notification may silently skip tracer notification.
                         * Terminating the task now avoids executing a system
                         * call that may not be intended.
                         */
                        if (fatal_signal_pending(current))
                                break;
                        if (syscall_get_nr(current, regs) < 0)
                                goto skip;  /* Explicit request to skip. */

                        return 0;
                case SECCOMP_RET_ALLOW:
                        return 0;
                case SECCOMP_RET_KILL:
                default:
                        break;
                }
                exit_sig = SIGSYS;
                break;
        }
#endif
        default:
                BUG();
        }

#ifdef SECCOMP_DEBUG
        dump_stack();
#endif
        audit_seccomp(this_syscall, exit_sig, ret);
        do_exit(exit_sig);
#ifdef CONFIG_SECCOMP_FILTER
skip:
        audit_seccomp(this_syscall, exit_sig, ret);
#endif
        return -1;
}

long prctl_get_seccomp(void)
{
        return current->seccomp.mode;
}

/**
 * prctl_set_seccomp: configures current->seccomp.mode
 * @seccomp_mode: requested mode to use
 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
 *
 * This function may be called repeatedly with a @seccomp_mode of
 * SECCOMP_MODE_FILTER to install additional filters.  Every filter
 * successfully installed will be evaluated (in reverse order) for each system
 * call the task makes.
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
        long ret = -EINVAL;

        if (current->seccomp.mode &&
            current->seccomp.mode != seccomp_mode)
                goto out;

        switch (seccomp_mode) {
        case SECCOMP_MODE_STRICT:
                ret = 0;
#ifdef TIF_NOTSC
                disable_TSC();
#endif
                break;
#ifdef CONFIG_SECCOMP_FILTER
        case SECCOMP_MODE_FILTER:
                ret = seccomp_attach_user_filter(filter);
                if (ret)
                        goto out;
                break;
#endif
        default:
                goto out;
        }

        current->seccomp.mode = seccomp_mode;
        set_thread_flag(TIF_SECCOMP);
out:
        return ret;
}