linux/tools/testing/selftests/seccomp/seccomp_bpf.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
   4 *
   5 * Test code for seccomp bpf.
   6 */
   7
   8#define _GNU_SOURCE
   9#include <sys/types.h>
  10
  11/*
  12 * glibc 2.26 and later have SIGSYS in siginfo_t. Before that,
  13 * we need to use the kernel's siginfo.h file and trick glibc
  14 * into accepting it.
  15 */
  16#if !__GLIBC_PREREQ(2, 26)
  17# include <asm/siginfo.h>
  18# define __have_siginfo_t 1
  19# define __have_sigval_t 1
  20# define __have_sigevent_t 1
  21#endif
  22
  23#include <errno.h>
  24#include <linux/filter.h>
  25#include <sys/prctl.h>
  26#include <sys/ptrace.h>
  27#include <sys/user.h>
  28#include <linux/prctl.h>
  29#include <linux/ptrace.h>
  30#include <linux/seccomp.h>
  31#include <pthread.h>
  32#include <semaphore.h>
  33#include <signal.h>
  34#include <stddef.h>
  35#include <stdbool.h>
  36#include <string.h>
  37#include <time.h>
  38#include <limits.h>
  39#include <linux/elf.h>
  40#include <sys/uio.h>
  41#include <sys/utsname.h>
  42#include <sys/fcntl.h>
  43#include <sys/mman.h>
  44#include <sys/times.h>
  45#include <sys/socket.h>
  46#include <sys/ioctl.h>
  47#include <linux/kcmp.h>
  48#include <sys/resource.h>
  49
  50#include <unistd.h>
  51#include <sys/syscall.h>
  52#include <poll.h>
  53
  54#include "../kselftest_harness.h"
  55#include "../clone3/clone3_selftests.h"
  56
  57/* Attempt to de-conflict with the selftests tree. */
  58#ifndef SKIP
  59#define SKIP(s, ...)    XFAIL(s, ##__VA_ARGS__)
  60#endif
  61
  62#ifndef PR_SET_PTRACER
  63# define PR_SET_PTRACER 0x59616d61
  64#endif
  65
  66#ifndef PR_SET_NO_NEW_PRIVS
  67#define PR_SET_NO_NEW_PRIVS 38
  68#define PR_GET_NO_NEW_PRIVS 39
  69#endif
  70
  71#ifndef PR_SECCOMP_EXT
  72#define PR_SECCOMP_EXT 43
  73#endif
  74
  75#ifndef SECCOMP_EXT_ACT
  76#define SECCOMP_EXT_ACT 1
  77#endif
  78
  79#ifndef SECCOMP_EXT_ACT_TSYNC
  80#define SECCOMP_EXT_ACT_TSYNC 1
  81#endif
  82
  83#ifndef SECCOMP_MODE_STRICT
  84#define SECCOMP_MODE_STRICT 1
  85#endif
  86
  87#ifndef SECCOMP_MODE_FILTER
  88#define SECCOMP_MODE_FILTER 2
  89#endif
  90
  91#ifndef SECCOMP_RET_ALLOW
  92struct seccomp_data {
  93        int nr;
  94        __u32 arch;
  95        __u64 instruction_pointer;
  96        __u64 args[6];
  97};
  98#endif
  99
 100#ifndef SECCOMP_RET_KILL_PROCESS
 101#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
 102#define SECCOMP_RET_KILL_THREAD  0x00000000U /* kill the thread */
 103#endif
 104#ifndef SECCOMP_RET_KILL
 105#define SECCOMP_RET_KILL         SECCOMP_RET_KILL_THREAD
 106#define SECCOMP_RET_TRAP         0x00030000U /* disallow and force a SIGSYS */
 107#define SECCOMP_RET_ERRNO        0x00050000U /* returns an errno */
 108#define SECCOMP_RET_TRACE        0x7ff00000U /* pass to a tracer or disallow */
 109#define SECCOMP_RET_ALLOW        0x7fff0000U /* allow */
 110#endif
 111#ifndef SECCOMP_RET_LOG
 112#define SECCOMP_RET_LOG          0x7ffc0000U /* allow after logging */
 113#endif
 114
 115#ifndef __NR_seccomp
 116# if defined(__i386__)
 117#  define __NR_seccomp 354
 118# elif defined(__x86_64__)
 119#  define __NR_seccomp 317
 120# elif defined(__arm__)
 121#  define __NR_seccomp 383
 122# elif defined(__aarch64__)
 123#  define __NR_seccomp 277
 124# elif defined(__riscv)
 125#  define __NR_seccomp 277
 126# elif defined(__csky__)
 127#  define __NR_seccomp 277
 128# elif defined(__hppa__)
 129#  define __NR_seccomp 338
 130# elif defined(__powerpc__)
 131#  define __NR_seccomp 358
 132# elif defined(__s390__)
 133#  define __NR_seccomp 348
 134# elif defined(__xtensa__)
 135#  define __NR_seccomp 337
 136# elif defined(__sh__)
 137#  define __NR_seccomp 372
 138# else
 139#  warning "seccomp syscall number unknown for this architecture"
 140#  define __NR_seccomp 0xffff
 141# endif
 142#endif
 143
 144#ifndef SECCOMP_SET_MODE_STRICT
 145#define SECCOMP_SET_MODE_STRICT 0
 146#endif
 147
 148#ifndef SECCOMP_SET_MODE_FILTER
 149#define SECCOMP_SET_MODE_FILTER 1
 150#endif
 151
 152#ifndef SECCOMP_GET_ACTION_AVAIL
 153#define SECCOMP_GET_ACTION_AVAIL 2
 154#endif
 155
 156#ifndef SECCOMP_GET_NOTIF_SIZES
 157#define SECCOMP_GET_NOTIF_SIZES 3
 158#endif
 159
 160#ifndef SECCOMP_FILTER_FLAG_TSYNC
 161#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
 162#endif
 163
 164#ifndef SECCOMP_FILTER_FLAG_LOG
 165#define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
 166#endif
 167
 168#ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
 169#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
 170#endif
 171
 172#ifndef PTRACE_SECCOMP_GET_METADATA
 173#define PTRACE_SECCOMP_GET_METADATA     0x420d
 174
 175struct seccomp_metadata {
 176        __u64 filter_off;       /* Input: which filter */
 177        __u64 flags;             /* Output: filter's flags */
 178};
 179#endif
 180
 181#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
 182#define SECCOMP_FILTER_FLAG_NEW_LISTENER        (1UL << 3)
 183#endif
 184
 185#ifndef SECCOMP_RET_USER_NOTIF
 186#define SECCOMP_RET_USER_NOTIF 0x7fc00000U
 187
 188#define SECCOMP_IOC_MAGIC               '!'
 189#define SECCOMP_IO(nr)                  _IO(SECCOMP_IOC_MAGIC, nr)
 190#define SECCOMP_IOR(nr, type)           _IOR(SECCOMP_IOC_MAGIC, nr, type)
 191#define SECCOMP_IOW(nr, type)           _IOW(SECCOMP_IOC_MAGIC, nr, type)
 192#define SECCOMP_IOWR(nr, type)          _IOWR(SECCOMP_IOC_MAGIC, nr, type)
 193
 194/* Flags for seccomp notification fd ioctl. */
 195#define SECCOMP_IOCTL_NOTIF_RECV        SECCOMP_IOWR(0, struct seccomp_notif)
 196#define SECCOMP_IOCTL_NOTIF_SEND        SECCOMP_IOWR(1, \
 197                                                struct seccomp_notif_resp)
 198#define SECCOMP_IOCTL_NOTIF_ID_VALID    SECCOMP_IOW(2, __u64)
 199
 200struct seccomp_notif {
 201        __u64 id;
 202        __u32 pid;
 203        __u32 flags;
 204        struct seccomp_data data;
 205};
 206
 207struct seccomp_notif_resp {
 208        __u64 id;
 209        __s64 val;
 210        __s32 error;
 211        __u32 flags;
 212};
 213
 214struct seccomp_notif_sizes {
 215        __u16 seccomp_notif;
 216        __u16 seccomp_notif_resp;
 217        __u16 seccomp_data;
 218};
 219#endif
 220
 221#ifndef SECCOMP_IOCTL_NOTIF_ADDFD
 222/* On success, the return value is the remote process's added fd number */
 223#define SECCOMP_IOCTL_NOTIF_ADDFD       SECCOMP_IOW(3,  \
 224                                                struct seccomp_notif_addfd)
 225
 226/* valid flags for seccomp_notif_addfd */
 227#define SECCOMP_ADDFD_FLAG_SETFD        (1UL << 0) /* Specify remote fd */
 228
 229struct seccomp_notif_addfd {
 230        __u64 id;
 231        __u32 flags;
 232        __u32 srcfd;
 233        __u32 newfd;
 234        __u32 newfd_flags;
 235};
 236#endif
 237
 238#ifndef SECCOMP_ADDFD_FLAG_SEND
 239#define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */
 240#endif
 241
 242struct seccomp_notif_addfd_small {
 243        __u64 id;
 244        char weird[4];
 245};
 246#define SECCOMP_IOCTL_NOTIF_ADDFD_SMALL \
 247        SECCOMP_IOW(3, struct seccomp_notif_addfd_small)
 248
 249struct seccomp_notif_addfd_big {
 250        union {
 251                struct seccomp_notif_addfd addfd;
 252                char buf[sizeof(struct seccomp_notif_addfd) + 8];
 253        };
 254};
 255#define SECCOMP_IOCTL_NOTIF_ADDFD_BIG   \
 256        SECCOMP_IOWR(3, struct seccomp_notif_addfd_big)
 257
 258#ifndef PTRACE_EVENTMSG_SYSCALL_ENTRY
 259#define PTRACE_EVENTMSG_SYSCALL_ENTRY   1
 260#define PTRACE_EVENTMSG_SYSCALL_EXIT    2
 261#endif
 262
 263#ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE
 264#define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
 265#endif
 266
 267#ifndef SECCOMP_FILTER_FLAG_TSYNC_ESRCH
 268#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
 269#endif
 270
 271#ifndef seccomp
 272int seccomp(unsigned int op, unsigned int flags, void *args)
 273{
 274        errno = 0;
 275        return syscall(__NR_seccomp, op, flags, args);
 276}
 277#endif
 278
 279#if __BYTE_ORDER == __LITTLE_ENDIAN
 280#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
 281#elif __BYTE_ORDER == __BIG_ENDIAN
 282#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]) + sizeof(__u32))
 283#else
 284#error "wut? Unknown __BYTE_ORDER?!"
 285#endif
 286
 287#define SIBLING_EXIT_UNKILLED   0xbadbeef
 288#define SIBLING_EXIT_FAILURE    0xbadface
 289#define SIBLING_EXIT_NEWPRIVS   0xbadfeed
 290
 291static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
 292{
 293#ifdef __NR_kcmp
 294        errno = 0;
 295        return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
 296#else
 297        errno = ENOSYS;
 298        return -1;
 299#endif
 300}
 301
 302/* Have TH_LOG report actual location filecmp() is used. */
 303#define filecmp(pid1, pid2, fd1, fd2)   ({              \
 304        int _ret;                                       \
 305                                                        \
 306        _ret = __filecmp(pid1, pid2, fd1, fd2);         \
 307        if (_ret != 0) {                                \
 308                if (_ret < 0 && errno == ENOSYS) {      \
 309                        TH_LOG("kcmp() syscall missing (test is less accurate)");\
 310                        _ret = 0;                       \
 311                }                                       \
 312        }                                               \
 313        _ret; })
 314
 315TEST(kcmp)
 316{
 317        int ret;
 318
 319        ret = __filecmp(getpid(), getpid(), 1, 1);
 320        EXPECT_EQ(ret, 0);
 321        if (ret != 0 && errno == ENOSYS)
 322                SKIP(return, "Kernel does not support kcmp() (missing CONFIG_KCMP?)");
 323}
 324
 325TEST(mode_strict_support)
 326{
 327        long ret;
 328
 329        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
 330        ASSERT_EQ(0, ret) {
 331                TH_LOG("Kernel does not support CONFIG_SECCOMP");
 332        }
 333        syscall(__NR_exit, 0);
 334}
 335
 336TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL)
 337{
 338        long ret;
 339
 340        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
 341        ASSERT_EQ(0, ret) {
 342                TH_LOG("Kernel does not support CONFIG_SECCOMP");
 343        }
 344        syscall(__NR_prctl, PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
 345                NULL, NULL, NULL);
 346        EXPECT_FALSE(true) {
 347                TH_LOG("Unreachable!");
 348        }
 349}
 350
 351/* Note! This doesn't test no new privs behavior */
 352TEST(no_new_privs_support)
 353{
 354        long ret;
 355
 356        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 357        EXPECT_EQ(0, ret) {
 358                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
 359        }
 360}
 361
 362/* Tests kernel support by checking for a copy_from_user() fault on NULL. */
 363TEST(mode_filter_support)
 364{
 365        long ret;
 366
 367        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
 368        ASSERT_EQ(0, ret) {
 369                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
 370        }
 371        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, NULL, NULL);
 372        EXPECT_EQ(-1, ret);
 373        EXPECT_EQ(EFAULT, errno) {
 374                TH_LOG("Kernel does not support CONFIG_SECCOMP_FILTER!");
 375        }
 376}
 377
 378TEST(mode_filter_without_nnp)
 379{
 380        struct sock_filter filter[] = {
 381                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 382        };
 383        struct sock_fprog prog = {
 384                .len = (unsigned short)ARRAY_SIZE(filter),
 385                .filter = filter,
 386        };
 387        long ret;
 388
 389        ret = prctl(PR_GET_NO_NEW_PRIVS, 0, NULL, 0, 0);
 390        ASSERT_LE(0, ret) {
 391                TH_LOG("Expected 0 or unsupported for NO_NEW_PRIVS");
 392        }
 393        errno = 0;
 394        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
 395        /* Succeeds with CAP_SYS_ADMIN, fails without */
 396        /* TODO(wad) check caps not euid */
 397        if (geteuid()) {
 398                EXPECT_EQ(-1, ret);
 399                EXPECT_EQ(EACCES, errno);
 400        } else {
 401                EXPECT_EQ(0, ret);
 402        }
 403}
 404
 405#define MAX_INSNS_PER_PATH 32768
 406
 407TEST(filter_size_limits)
 408{
 409        int i;
 410        int count = BPF_MAXINSNS + 1;
 411        struct sock_filter allow[] = {
 412                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 413        };
 414        struct sock_filter *filter;
 415        struct sock_fprog prog = { };
 416        long ret;
 417
 418        filter = calloc(count, sizeof(*filter));
 419        ASSERT_NE(NULL, filter);
 420
 421        for (i = 0; i < count; i++)
 422                filter[i] = allow[0];
 423
 424        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 425        ASSERT_EQ(0, ret);
 426
 427        prog.filter = filter;
 428        prog.len = count;
 429
 430        /* Too many filter instructions in a single filter. */
 431        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
 432        ASSERT_NE(0, ret) {
 433                TH_LOG("Installing %d insn filter was allowed", prog.len);
 434        }
 435
 436        /* One less is okay, though. */
 437        prog.len -= 1;
 438        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
 439        ASSERT_EQ(0, ret) {
 440                TH_LOG("Installing %d insn filter wasn't allowed", prog.len);
 441        }
 442}
 443
 444TEST(filter_chain_limits)
 445{
 446        int i;
 447        int count = BPF_MAXINSNS;
 448        struct sock_filter allow[] = {
 449                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 450        };
 451        struct sock_filter *filter;
 452        struct sock_fprog prog = { };
 453        long ret;
 454
 455        filter = calloc(count, sizeof(*filter));
 456        ASSERT_NE(NULL, filter);
 457
 458        for (i = 0; i < count; i++)
 459                filter[i] = allow[0];
 460
 461        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 462        ASSERT_EQ(0, ret);
 463
 464        prog.filter = filter;
 465        prog.len = 1;
 466
 467        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
 468        ASSERT_EQ(0, ret);
 469
 470        prog.len = count;
 471
 472        /* Too many total filter instructions. */
 473        for (i = 0; i < MAX_INSNS_PER_PATH; i++) {
 474                ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
 475                if (ret != 0)
 476                        break;
 477        }
 478        ASSERT_NE(0, ret) {
 479                TH_LOG("Allowed %d %d-insn filters (total with penalties:%d)",
 480                       i, count, i * (count + 4));
 481        }
 482}
 483
 484TEST(mode_filter_cannot_move_to_strict)
 485{
 486        struct sock_filter filter[] = {
 487                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 488        };
 489        struct sock_fprog prog = {
 490                .len = (unsigned short)ARRAY_SIZE(filter),
 491                .filter = filter,
 492        };
 493        long ret;
 494
 495        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 496        ASSERT_EQ(0, ret);
 497
 498        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
 499        ASSERT_EQ(0, ret);
 500
 501        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, 0, 0);
 502        EXPECT_EQ(-1, ret);
 503        EXPECT_EQ(EINVAL, errno);
 504}
 505
 506
 507TEST(mode_filter_get_seccomp)
 508{
 509        struct sock_filter filter[] = {
 510                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 511        };
 512        struct sock_fprog prog = {
 513                .len = (unsigned short)ARRAY_SIZE(filter),
 514                .filter = filter,
 515        };
 516        long ret;
 517
 518        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 519        ASSERT_EQ(0, ret);
 520
 521        ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
 522        EXPECT_EQ(0, ret);
 523
 524        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
 525        ASSERT_EQ(0, ret);
 526
 527        ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
 528        EXPECT_EQ(2, ret);
 529}
 530
 531
 532TEST(ALLOW_all)
 533{
 534        struct sock_filter filter[] = {
 535                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 536        };
 537        struct sock_fprog prog = {
 538                .len = (unsigned short)ARRAY_SIZE(filter),
 539                .filter = filter,
 540        };
 541        long ret;
 542
 543        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 544        ASSERT_EQ(0, ret);
 545
 546        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 547        ASSERT_EQ(0, ret);
 548}
 549
 550TEST(empty_prog)
 551{
 552        struct sock_filter filter[] = {
 553        };
 554        struct sock_fprog prog = {
 555                .len = (unsigned short)ARRAY_SIZE(filter),
 556                .filter = filter,
 557        };
 558        long ret;
 559
 560        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 561        ASSERT_EQ(0, ret);
 562
 563        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 564        EXPECT_EQ(-1, ret);
 565        EXPECT_EQ(EINVAL, errno);
 566}
 567
 568TEST(log_all)
 569{
 570        struct sock_filter filter[] = {
 571                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
 572        };
 573        struct sock_fprog prog = {
 574                .len = (unsigned short)ARRAY_SIZE(filter),
 575                .filter = filter,
 576        };
 577        long ret;
 578        pid_t parent = getppid();
 579
 580        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 581        ASSERT_EQ(0, ret);
 582
 583        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 584        ASSERT_EQ(0, ret);
 585
 586        /* getppid() should succeed and be logged (no check for logging) */
 587        EXPECT_EQ(parent, syscall(__NR_getppid));
 588}
 589
 590TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
 591{
 592        struct sock_filter filter[] = {
 593                BPF_STMT(BPF_RET|BPF_K, 0x10000000U),
 594        };
 595        struct sock_fprog prog = {
 596                .len = (unsigned short)ARRAY_SIZE(filter),
 597                .filter = filter,
 598        };
 599        long ret;
 600
 601        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 602        ASSERT_EQ(0, ret);
 603
 604        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 605        ASSERT_EQ(0, ret);
 606        EXPECT_EQ(0, syscall(__NR_getpid)) {
 607                TH_LOG("getpid() shouldn't ever return");
 608        }
 609}
 610
 611/* return code >= 0x80000000 is unused. */
 612TEST_SIGNAL(unknown_ret_is_kill_above_allow, SIGSYS)
 613{
 614        struct sock_filter filter[] = {
 615                BPF_STMT(BPF_RET|BPF_K, 0x90000000U),
 616        };
 617        struct sock_fprog prog = {
 618                .len = (unsigned short)ARRAY_SIZE(filter),
 619                .filter = filter,
 620        };
 621        long ret;
 622
 623        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 624        ASSERT_EQ(0, ret);
 625
 626        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 627        ASSERT_EQ(0, ret);
 628        EXPECT_EQ(0, syscall(__NR_getpid)) {
 629                TH_LOG("getpid() shouldn't ever return");
 630        }
 631}
 632
 633TEST_SIGNAL(KILL_all, SIGSYS)
 634{
 635        struct sock_filter filter[] = {
 636                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
 637        };
 638        struct sock_fprog prog = {
 639                .len = (unsigned short)ARRAY_SIZE(filter),
 640                .filter = filter,
 641        };
 642        long ret;
 643
 644        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 645        ASSERT_EQ(0, ret);
 646
 647        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 648        ASSERT_EQ(0, ret);
 649}
 650
 651TEST_SIGNAL(KILL_one, SIGSYS)
 652{
 653        struct sock_filter filter[] = {
 654                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
 655                        offsetof(struct seccomp_data, nr)),
 656                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
 657                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
 658                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 659        };
 660        struct sock_fprog prog = {
 661                .len = (unsigned short)ARRAY_SIZE(filter),
 662                .filter = filter,
 663        };
 664        long ret;
 665        pid_t parent = getppid();
 666
 667        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 668        ASSERT_EQ(0, ret);
 669
 670        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 671        ASSERT_EQ(0, ret);
 672
 673        EXPECT_EQ(parent, syscall(__NR_getppid));
 674        /* getpid() should never return. */
 675        EXPECT_EQ(0, syscall(__NR_getpid));
 676}
 677
 678TEST_SIGNAL(KILL_one_arg_one, SIGSYS)
 679{
 680        void *fatal_address;
 681        struct sock_filter filter[] = {
 682                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
 683                        offsetof(struct seccomp_data, nr)),
 684                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_times, 1, 0),
 685                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 686                /* Only both with lower 32-bit for now. */
 687                BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(0)),
 688                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K,
 689                        (unsigned long)&fatal_address, 0, 1),
 690                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
 691                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 692        };
 693        struct sock_fprog prog = {
 694                .len = (unsigned short)ARRAY_SIZE(filter),
 695                .filter = filter,
 696        };
 697        long ret;
 698        pid_t parent = getppid();
 699        struct tms timebuf;
 700        clock_t clock = times(&timebuf);
 701
 702        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 703        ASSERT_EQ(0, ret);
 704
 705        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 706        ASSERT_EQ(0, ret);
 707
 708        EXPECT_EQ(parent, syscall(__NR_getppid));
 709        EXPECT_LE(clock, syscall(__NR_times, &timebuf));
 710        /* times() should never return. */
 711        EXPECT_EQ(0, syscall(__NR_times, &fatal_address));
 712}
 713
 714TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
 715{
 716#ifndef __NR_mmap2
 717        int sysno = __NR_mmap;
 718#else
 719        int sysno = __NR_mmap2;
 720#endif
 721        struct sock_filter filter[] = {
 722                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
 723                        offsetof(struct seccomp_data, nr)),
 724                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, sysno, 1, 0),
 725                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 726                /* Only both with lower 32-bit for now. */
 727                BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(5)),
 728                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1),
 729                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
 730                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 731        };
 732        struct sock_fprog prog = {
 733                .len = (unsigned short)ARRAY_SIZE(filter),
 734                .filter = filter,
 735        };
 736        long ret;
 737        pid_t parent = getppid();
 738        int fd;
 739        void *map1, *map2;
 740        int page_size = sysconf(_SC_PAGESIZE);
 741
 742        ASSERT_LT(0, page_size);
 743
 744        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 745        ASSERT_EQ(0, ret);
 746
 747        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 748        ASSERT_EQ(0, ret);
 749
 750        fd = open("/dev/zero", O_RDONLY);
 751        ASSERT_NE(-1, fd);
 752
 753        EXPECT_EQ(parent, syscall(__NR_getppid));
 754        map1 = (void *)syscall(sysno,
 755                NULL, page_size, PROT_READ, MAP_PRIVATE, fd, page_size);
 756        EXPECT_NE(MAP_FAILED, map1);
 757        /* mmap2() should never return. */
 758        map2 = (void *)syscall(sysno,
 759                 NULL, page_size, PROT_READ, MAP_PRIVATE, fd, 0x0C0FFEE);
 760        EXPECT_EQ(MAP_FAILED, map2);
 761
 762        /* The test failed, so clean up the resources. */
 763        munmap(map1, page_size);
 764        munmap(map2, page_size);
 765        close(fd);
 766}
 767
 768/* This is a thread task to die via seccomp filter violation. */
 769void *kill_thread(void *data)
 770{
 771        bool die = (bool)data;
 772
 773        if (die) {
 774                prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
 775                return (void *)SIBLING_EXIT_FAILURE;
 776        }
 777
 778        return (void *)SIBLING_EXIT_UNKILLED;
 779}
 780
 781enum kill_t {
 782        KILL_THREAD,
 783        KILL_PROCESS,
 784        RET_UNKNOWN
 785};
 786
 787/* Prepare a thread that will kill itself or both of us. */
 788void kill_thread_or_group(struct __test_metadata *_metadata,
 789                          enum kill_t kill_how)
 790{
 791        pthread_t thread;
 792        void *status;
 793        /* Kill only when calling __NR_prctl. */
 794        struct sock_filter filter_thread[] = {
 795                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
 796                        offsetof(struct seccomp_data, nr)),
 797                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
 798                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
 799                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 800        };
 801        struct sock_fprog prog_thread = {
 802                .len = (unsigned short)ARRAY_SIZE(filter_thread),
 803                .filter = filter_thread,
 804        };
 805        int kill = kill_how == KILL_PROCESS ? SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAAA;
 806        struct sock_filter filter_process[] = {
 807                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
 808                        offsetof(struct seccomp_data, nr)),
 809                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
 810                BPF_STMT(BPF_RET|BPF_K, kill),
 811                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 812        };
 813        struct sock_fprog prog_process = {
 814                .len = (unsigned short)ARRAY_SIZE(filter_process),
 815                .filter = filter_process,
 816        };
 817
 818        ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
 819                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
 820        }
 821
 822        ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
 823                             kill_how == KILL_THREAD ? &prog_thread
 824                                                     : &prog_process));
 825
 826        /*
 827         * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
 828         * flag cannot be downgraded by a new filter.
 829         */
 830        if (kill_how == KILL_PROCESS)
 831                ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
 832
 833        /* Start a thread that will exit immediately. */
 834        ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
 835        ASSERT_EQ(0, pthread_join(thread, &status));
 836        ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);
 837
 838        /* Start a thread that will die immediately. */
 839        ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
 840        ASSERT_EQ(0, pthread_join(thread, &status));
 841        ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);
 842
 843        /*
 844         * If we get here, only the spawned thread died. Let the parent know
 845         * the whole process didn't die (i.e. this thread, the spawner,
 846         * stayed running).
 847         */
 848        exit(42);
 849}
 850
 851TEST(KILL_thread)
 852{
 853        int status;
 854        pid_t child_pid;
 855
 856        child_pid = fork();
 857        ASSERT_LE(0, child_pid);
 858        if (child_pid == 0) {
 859                kill_thread_or_group(_metadata, KILL_THREAD);
 860                _exit(38);
 861        }
 862
 863        ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
 864
 865        /* If only the thread was killed, we'll see exit 42. */
 866        ASSERT_TRUE(WIFEXITED(status));
 867        ASSERT_EQ(42, WEXITSTATUS(status));
 868}
 869
 870TEST(KILL_process)
 871{
 872        int status;
 873        pid_t child_pid;
 874
 875        child_pid = fork();
 876        ASSERT_LE(0, child_pid);
 877        if (child_pid == 0) {
 878                kill_thread_or_group(_metadata, KILL_PROCESS);
 879                _exit(38);
 880        }
 881
 882        ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
 883
 884        /* If the entire process was killed, we'll see SIGSYS. */
 885        ASSERT_TRUE(WIFSIGNALED(status));
 886        ASSERT_EQ(SIGSYS, WTERMSIG(status));
 887}
 888
 889TEST(KILL_unknown)
 890{
 891        int status;
 892        pid_t child_pid;
 893
 894        child_pid = fork();
 895        ASSERT_LE(0, child_pid);
 896        if (child_pid == 0) {
 897                kill_thread_or_group(_metadata, RET_UNKNOWN);
 898                _exit(38);
 899        }
 900
 901        ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
 902
 903        /* If the entire process was killed, we'll see SIGSYS. */
 904        EXPECT_TRUE(WIFSIGNALED(status)) {
 905                TH_LOG("Unknown SECCOMP_RET is only killing the thread?");
 906        }
 907        ASSERT_EQ(SIGSYS, WTERMSIG(status));
 908}
 909
 910/* TODO(wad) add 64-bit versus 32-bit arg tests. */
 911TEST(arg_out_of_range)
 912{
 913        struct sock_filter filter[] = {
 914                BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
 915                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 916        };
 917        struct sock_fprog prog = {
 918                .len = (unsigned short)ARRAY_SIZE(filter),
 919                .filter = filter,
 920        };
 921        long ret;
 922
 923        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 924        ASSERT_EQ(0, ret);
 925
 926        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
 927        EXPECT_EQ(-1, ret);
 928        EXPECT_EQ(EINVAL, errno);
 929}
 930
 931#define ERRNO_FILTER(name, errno)                                       \
 932        struct sock_filter _read_filter_##name[] = {                    \
 933                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,                          \
 934                        offsetof(struct seccomp_data, nr)),             \
 935                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),       \
 936                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | errno),     \
 937                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),             \
 938        };                                                              \
 939        struct sock_fprog prog_##name = {                               \
 940                .len = (unsigned short)ARRAY_SIZE(_read_filter_##name), \
 941                .filter = _read_filter_##name,                          \
 942        }
 943
 944/* Make sure basic errno values are correctly passed through a filter. */
 945TEST(ERRNO_valid)
 946{
 947        ERRNO_FILTER(valid, E2BIG);
 948        long ret;
 949        pid_t parent = getppid();
 950
 951        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 952        ASSERT_EQ(0, ret);
 953
 954        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
 955        ASSERT_EQ(0, ret);
 956
 957        EXPECT_EQ(parent, syscall(__NR_getppid));
 958        EXPECT_EQ(-1, read(0, NULL, 0));
 959        EXPECT_EQ(E2BIG, errno);
 960}
 961
 962/* Make sure an errno of zero is correctly handled by the arch code. */
 963TEST(ERRNO_zero)
 964{
 965        ERRNO_FILTER(zero, 0);
 966        long ret;
 967        pid_t parent = getppid();
 968
 969        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 970        ASSERT_EQ(0, ret);
 971
 972        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
 973        ASSERT_EQ(0, ret);
 974
 975        EXPECT_EQ(parent, syscall(__NR_getppid));
 976        /* "errno" of 0 is ok. */
 977        EXPECT_EQ(0, read(0, NULL, 0));
 978}
 979
 980/*
 981 * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
 982 * This tests that the errno value gets capped correctly, fixed by
 983 * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
 984 */
 985TEST(ERRNO_capped)
 986{
 987        ERRNO_FILTER(capped, 4096);
 988        long ret;
 989        pid_t parent = getppid();
 990
 991        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 992        ASSERT_EQ(0, ret);
 993
 994        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
 995        ASSERT_EQ(0, ret);
 996
 997        EXPECT_EQ(parent, syscall(__NR_getppid));
 998        EXPECT_EQ(-1, read(0, NULL, 0));
 999        EXPECT_EQ(4095, errno);
1000}
1001
1002/*
1003 * Filters are processed in reverse order: last applied is executed first.
1004 * Since only the SECCOMP_RET_ACTION mask is tested for return values, the
1005 * SECCOMP_RET_DATA mask results will follow the most recently applied
1006 * matching filter return (and not the lowest or highest value).
1007 */
1008TEST(ERRNO_order)
1009{
1010        ERRNO_FILTER(first,  11);
1011        ERRNO_FILTER(second, 13);
1012        ERRNO_FILTER(third,  12);
1013        long ret;
1014        pid_t parent = getppid();
1015
1016        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1017        ASSERT_EQ(0, ret);
1018
1019        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
1020        ASSERT_EQ(0, ret);
1021
1022        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
1023        ASSERT_EQ(0, ret);
1024
1025        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
1026        ASSERT_EQ(0, ret);
1027
1028        EXPECT_EQ(parent, syscall(__NR_getppid));
1029        EXPECT_EQ(-1, read(0, NULL, 0));
1030        EXPECT_EQ(12, errno);
1031}
1032
/* Per-test state for the TRAP tests. */
FIXTURE(TRAP) {
        /* Filter program; its instruction array is heap-allocated in setup. */
        struct sock_fprog prog;
};
1036
1037FIXTURE_SETUP(TRAP)
1038{
1039        struct sock_filter filter[] = {
1040                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1041                        offsetof(struct seccomp_data, nr)),
1042                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
1043                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1044                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1045        };
1046
1047        memset(&self->prog, 0, sizeof(self->prog));
1048        self->prog.filter = malloc(sizeof(filter));
1049        ASSERT_NE(NULL, self->prog.filter);
1050        memcpy(self->prog.filter, filter, sizeof(filter));
1051        self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1052}
1053
1054FIXTURE_TEARDOWN(TRAP)
1055{
1056        if (self->prog.filter)
1057                free(self->prog.filter);
1058}
1059
1060TEST_F_SIGNAL(TRAP, dfl, SIGSYS)
1061{
1062        long ret;
1063
1064        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1065        ASSERT_EQ(0, ret);
1066
1067        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1068        ASSERT_EQ(0, ret);
1069        syscall(__NR_getpid);
1070}
1071
1072/* Ensure that SIGSYS overrides SIG_IGN */
1073TEST_F_SIGNAL(TRAP, ign, SIGSYS)
1074{
1075        long ret;
1076
1077        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1078        ASSERT_EQ(0, ret);
1079
1080        signal(SIGSYS, SIG_IGN);
1081
1082        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1083        ASSERT_EQ(0, ret);
1084        syscall(__NR_getpid);
1085}
1086
/* Snapshot of the most recent signal seen by TRAP_action(). */
static siginfo_t TRAP_info;
static volatile int TRAP_nr;

/*
 * SA_SIGINFO-style handler: record the signal number and a copy of the
 * siginfo so the test body can inspect them afterwards. The context
 * argument is not used.
 */
static void TRAP_action(int nr, siginfo_t *info, void *void_context)
{
        (void)void_context;

        TRAP_info = *info;
        TRAP_nr = nr;
}
1094
/*
 * Install TRAP_action() as the SIGSYS handler, apply the TRAP filter,
 * issue getpid(), and check that the handler ran and that the recorded
 * siginfo carries the syscall number, a non-zero arch and a non-zero
 * call address.
 */
TEST_F(TRAP, handler)
{
        int ret, test;
        struct sigaction act;
        sigset_t mask;

        memset(&act, 0, sizeof(act));
        sigemptyset(&mask);
        sigaddset(&mask, SIGSYS);

        /* Three-argument handler so that siginfo is delivered. */
        act.sa_sigaction = &TRAP_action;
        act.sa_flags = SA_SIGINFO;
        ret = sigaction(SIGSYS, &act, NULL);
        ASSERT_EQ(0, ret) {
                TH_LOG("sigaction failed");
        }
        /* Make sure SIGSYS is not blocked in this process. */
        ret = sigprocmask(SIG_UNBLOCK, &mask, NULL);
        ASSERT_EQ(0, ret) {
                TH_LOG("sigprocmask failed");
        }

        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
        ASSERT_EQ(0, ret);
        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
        ASSERT_EQ(0, ret);
        /* Clear the handler's record so stale data cannot satisfy the checks. */
        TRAP_nr = 0;
        memset(&TRAP_info, 0, sizeof(TRAP_info));
        /* Expect the registers to be rolled back. (nr = error) may vary
         * based on arch. */
        ret = syscall(__NR_getpid);
        /* Silence gcc warning about volatile. */
        test = TRAP_nr;
        EXPECT_EQ(SIGSYS, test);
        /*
         * Local overlay for the three SIGSYS fields checked below. It is
         * laid over TRAP_info starting at si_call_addr when the libc
         * headers provide the si_syscall accessor, and at si_pid
         * otherwise.
         * NOTE(review): presumably this mirrors the kernel's SIGSYS
         * siginfo layout for libcs lacking the accessors - confirm.
         */
        struct local_sigsys {
                void *_call_addr;       /* calling user insn */
                int _syscall;           /* triggering system call number */
                unsigned int _arch;     /* AUDIT_ARCH_* of syscall */
        } *sigsys = (struct local_sigsys *)
#ifdef si_syscall
                &(TRAP_info.si_call_addr);
#else
                &TRAP_info.si_pid;
#endif
        EXPECT_EQ(__NR_getpid, sigsys->_syscall);
        /* Make sure arch is non-zero. */
        EXPECT_NE(0, sigsys->_arch);
        EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
}
1143
/*
 * Per-test state for the precedence tests: one filter program per seccomp
 * return action. Each program's instruction array is heap-allocated in
 * setup and released in teardown.
 */
FIXTURE(precedence) {
        struct sock_fprog allow;        /* always SECCOMP_RET_ALLOW */
        struct sock_fprog log;          /* SECCOMP_RET_LOG on getpid */
        struct sock_fprog trace;        /* SECCOMP_RET_TRACE on getpid */
        struct sock_fprog error;        /* SECCOMP_RET_ERRNO on getpid */
        struct sock_fprog trap;         /* SECCOMP_RET_TRAP on getpid */
        struct sock_fprog kill;         /* SECCOMP_RET_KILL on getpid */
};
1152
1153FIXTURE_SETUP(precedence)
1154{
1155        struct sock_filter allow_insns[] = {
1156                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1157        };
1158        struct sock_filter log_insns[] = {
1159                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1160                        offsetof(struct seccomp_data, nr)),
1161                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1162                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1163                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
1164        };
1165        struct sock_filter trace_insns[] = {
1166                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1167                        offsetof(struct seccomp_data, nr)),
1168                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1169                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1170                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE),
1171        };
1172        struct sock_filter error_insns[] = {
1173                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1174                        offsetof(struct seccomp_data, nr)),
1175                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1176                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1177                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO),
1178        };
1179        struct sock_filter trap_insns[] = {
1180                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1181                        offsetof(struct seccomp_data, nr)),
1182                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1183                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1184                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1185        };
1186        struct sock_filter kill_insns[] = {
1187                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1188                        offsetof(struct seccomp_data, nr)),
1189                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1190                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1191                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
1192        };
1193
1194        memset(self, 0, sizeof(*self));
1195#define FILTER_ALLOC(_x) \
1196        self->_x.filter = malloc(sizeof(_x##_insns)); \
1197        ASSERT_NE(NULL, self->_x.filter); \
1198        memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
1199        self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
1200        FILTER_ALLOC(allow);
1201        FILTER_ALLOC(log);
1202        FILTER_ALLOC(trace);
1203        FILTER_ALLOC(error);
1204        FILTER_ALLOC(trap);
1205        FILTER_ALLOC(kill);
1206}
1207
1208FIXTURE_TEARDOWN(precedence)
1209{
1210#define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter)
1211        FILTER_FREE(allow);
1212        FILTER_FREE(log);
1213        FILTER_FREE(trace);
1214        FILTER_FREE(error);
1215        FILTER_FREE(trap);
1216        FILTER_FREE(kill);
1217}
1218
1219TEST_F(precedence, allow_ok)
1220{
1221        pid_t parent, res = 0;
1222        long ret;
1223
1224        parent = getppid();
1225        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1226        ASSERT_EQ(0, ret);
1227
1228        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1229        ASSERT_EQ(0, ret);
1230        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1231        ASSERT_EQ(0, ret);
1232        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1233        ASSERT_EQ(0, ret);
1234        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1235        ASSERT_EQ(0, ret);
1236        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1237        ASSERT_EQ(0, ret);
1238        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1239        ASSERT_EQ(0, ret);
1240        /* Should work just fine. */
1241        res = syscall(__NR_getppid);
1242        EXPECT_EQ(parent, res);
1243}
1244
1245TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS)
1246{
1247        pid_t parent, res = 0;
1248        long ret;
1249
1250        parent = getppid();
1251        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1252        ASSERT_EQ(0, ret);
1253
1254        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1255        ASSERT_EQ(0, ret);
1256        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1257        ASSERT_EQ(0, ret);
1258        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1259        ASSERT_EQ(0, ret);
1260        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1261        ASSERT_EQ(0, ret);
1262        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1263        ASSERT_EQ(0, ret);
1264        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1265        ASSERT_EQ(0, ret);
1266        /* Should work just fine. */
1267        res = syscall(__NR_getppid);
1268        EXPECT_EQ(parent, res);
1269        /* getpid() should never return. */
1270        res = syscall(__NR_getpid);
1271        EXPECT_EQ(0, res);
1272}
1273
1274TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS)
1275{
1276        pid_t parent;
1277        long ret;
1278
1279        parent = getppid();
1280        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1281        ASSERT_EQ(0, ret);
1282
1283        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1284        ASSERT_EQ(0, ret);
1285        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1286        ASSERT_EQ(0, ret);
1287        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1288        ASSERT_EQ(0, ret);
1289        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1290        ASSERT_EQ(0, ret);
1291        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1292        ASSERT_EQ(0, ret);
1293        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1294        ASSERT_EQ(0, ret);
1295        /* Should work just fine. */
1296        EXPECT_EQ(parent, syscall(__NR_getppid));
1297        /* getpid() should never return. */
1298        EXPECT_EQ(0, syscall(__NR_getpid));
1299}
1300
1301TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS)
1302{
1303        pid_t parent;
1304        long ret;
1305
1306        parent = getppid();
1307        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1308        ASSERT_EQ(0, ret);
1309
1310        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1311        ASSERT_EQ(0, ret);
1312        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1313        ASSERT_EQ(0, ret);
1314        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1315        ASSERT_EQ(0, ret);
1316        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1317        ASSERT_EQ(0, ret);
1318        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1319        ASSERT_EQ(0, ret);
1320        /* Should work just fine. */
1321        EXPECT_EQ(parent, syscall(__NR_getppid));
1322        /* getpid() should never return. */
1323        EXPECT_EQ(0, syscall(__NR_getpid));
1324}
1325
1326TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS)
1327{
1328        pid_t parent;
1329        long ret;
1330
1331        parent = getppid();
1332        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1333        ASSERT_EQ(0, ret);
1334
1335        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1336        ASSERT_EQ(0, ret);
1337        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1338        ASSERT_EQ(0, ret);
1339        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1340        ASSERT_EQ(0, ret);
1341        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1342        ASSERT_EQ(0, ret);
1343        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1344        ASSERT_EQ(0, ret);
1345        /* Should work just fine. */
1346        EXPECT_EQ(parent, syscall(__NR_getppid));
1347        /* getpid() should never return. */
1348        EXPECT_EQ(0, syscall(__NR_getpid));
1349}
1350
1351TEST_F(precedence, errno_is_third)
1352{
1353        pid_t parent;
1354        long ret;
1355
1356        parent = getppid();
1357        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1358        ASSERT_EQ(0, ret);
1359
1360        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1361        ASSERT_EQ(0, ret);
1362        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1363        ASSERT_EQ(0, ret);
1364        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1365        ASSERT_EQ(0, ret);
1366        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1367        ASSERT_EQ(0, ret);
1368        /* Should work just fine. */
1369        EXPECT_EQ(parent, syscall(__NR_getppid));
1370        EXPECT_EQ(0, syscall(__NR_getpid));
1371}
1372
1373TEST_F(precedence, errno_is_third_in_any_order)
1374{
1375        pid_t parent;
1376        long ret;
1377
1378        parent = getppid();
1379        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1380        ASSERT_EQ(0, ret);
1381
1382        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1383        ASSERT_EQ(0, ret);
1384        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1385        ASSERT_EQ(0, ret);
1386        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1387        ASSERT_EQ(0, ret);
1388        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1389        ASSERT_EQ(0, ret);
1390        /* Should work just fine. */
1391        EXPECT_EQ(parent, syscall(__NR_getppid));
1392        EXPECT_EQ(0, syscall(__NR_getpid));
1393}
1394
1395TEST_F(precedence, trace_is_fourth)
1396{
1397        pid_t parent;
1398        long ret;
1399
1400        parent = getppid();
1401        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1402        ASSERT_EQ(0, ret);
1403
1404        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1405        ASSERT_EQ(0, ret);
1406        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1407        ASSERT_EQ(0, ret);
1408        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1409        ASSERT_EQ(0, ret);
1410        /* Should work just fine. */
1411        EXPECT_EQ(parent, syscall(__NR_getppid));
1412        /* No ptracer */
1413        EXPECT_EQ(-1, syscall(__NR_getpid));
1414}
1415
1416TEST_F(precedence, trace_is_fourth_in_any_order)
1417{
1418        pid_t parent;
1419        long ret;
1420
1421        parent = getppid();
1422        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1423        ASSERT_EQ(0, ret);
1424
1425        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1426        ASSERT_EQ(0, ret);
1427        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1428        ASSERT_EQ(0, ret);
1429        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1430        ASSERT_EQ(0, ret);
1431        /* Should work just fine. */
1432        EXPECT_EQ(parent, syscall(__NR_getppid));
1433        /* No ptracer */
1434        EXPECT_EQ(-1, syscall(__NR_getpid));
1435}
1436
1437TEST_F(precedence, log_is_fifth)
1438{
1439        pid_t mypid, parent;
1440        long ret;
1441
1442        mypid = getpid();
1443        parent = getppid();
1444        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1445        ASSERT_EQ(0, ret);
1446
1447        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1448        ASSERT_EQ(0, ret);
1449        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1450        ASSERT_EQ(0, ret);
1451        /* Should work just fine. */
1452        EXPECT_EQ(parent, syscall(__NR_getppid));
1453        /* Should also work just fine */
1454        EXPECT_EQ(mypid, syscall(__NR_getpid));
1455}
1456
1457TEST_F(precedence, log_is_fifth_in_any_order)
1458{
1459        pid_t mypid, parent;
1460        long ret;
1461
1462        mypid = getpid();
1463        parent = getppid();
1464        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1465        ASSERT_EQ(0, ret);
1466
1467        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1468        ASSERT_EQ(0, ret);
1469        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1470        ASSERT_EQ(0, ret);
1471        /* Should work just fine. */
1472        EXPECT_EQ(parent, syscall(__NR_getppid));
1473        /* Should also work just fine */
1474        EXPECT_EQ(mypid, syscall(__NR_getpid));
1475}
1476
/* Fallback for headers that do not provide the seccomp ptrace option. */
#ifndef PTRACE_O_TRACESECCOMP
#define PTRACE_O_TRACESECCOMP   0x00000080
#endif

/* Catch the Ubuntu 12.04 value error. */
#if PTRACE_EVENT_SECCOMP != 7
#undef PTRACE_EVENT_SECCOMP
#endif

#ifndef PTRACE_EVENT_SECCOMP
#define PTRACE_EVENT_SECCOMP 7
#endif

/*
 * True when the event code held in the bits above bit 15 of a wait status
 * is PTRACE_EVENT_SECCOMP. The argument is parenthesised so that
 * expressions using low-precedence operators (for example "a & b") are
 * shifted as a whole.
 */
#define IS_SECCOMP_EVENT(status) (((status) >> 16) == PTRACE_EVENT_SECCOMP)
/*
 * Run flag for the tracer loop in start_tracer(). It is cleared from a
 * signal handler, so it is declared volatile sig_atomic_t; assigning
 * true/false to it and testing it in a condition work unchanged.
 */
volatile sig_atomic_t tracer_running;

/* Signal handler: ask the tracer loop to stop. @sig is not used. */
void tracer_stop(int sig)
{
        (void)sig;
        tracer_running = false;
}
1496
/*
 * Callback invoked by start_tracer() for each tracee stop it handles.
 * It receives the harness metadata, the tracee's pid, the wait status of
 * the stop, and the opaque @args pointer that was given to start_tracer().
 */
typedef void tracer_func_t(struct __test_metadata *_metadata,
                           pid_t tracee, int status, void *args);
1499
1500void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
1501            tracer_func_t tracer_func, void *args, bool ptrace_syscall)
1502{
1503        int ret = -1;
1504        struct sigaction action = {
1505                .sa_handler = tracer_stop,
1506        };
1507
1508        /* Allow external shutdown. */
1509        tracer_running = true;
1510        ASSERT_EQ(0, sigaction(SIGUSR1, &action, NULL));
1511
1512        errno = 0;
1513        while (ret == -1 && errno != EINVAL)
1514                ret = ptrace(PTRACE_ATTACH, tracee, NULL, 0);
1515        ASSERT_EQ(0, ret) {
1516                kill(tracee, SIGKILL);
1517        }
1518        /* Wait for attach stop */
1519        wait(NULL);
1520
1521        ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, ptrace_syscall ?
1522                                                      PTRACE_O_TRACESYSGOOD :
1523                                                      PTRACE_O_TRACESECCOMP);
1524        ASSERT_EQ(0, ret) {
1525                TH_LOG("Failed to set PTRACE_O_TRACESECCOMP");
1526                kill(tracee, SIGKILL);
1527        }
1528        ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1529                     tracee, NULL, 0);
1530        ASSERT_EQ(0, ret);
1531
1532        /* Unblock the tracee */
1533        ASSERT_EQ(1, write(fd, "A", 1));
1534        ASSERT_EQ(0, close(fd));
1535
1536        /* Run until we're shut down. Must assert to stop execution. */
1537        while (tracer_running) {
1538                int status;
1539
1540                if (wait(&status) != tracee)
1541                        continue;
1542                if (WIFSIGNALED(status) || WIFEXITED(status))
1543                        /* Child is dead. Time to go. */
1544                        return;
1545
1546                /* Check if this is a seccomp event. */
1547                ASSERT_EQ(!ptrace_syscall, IS_SECCOMP_EVENT(status));
1548
1549                tracer_func(_metadata, tracee, status, args);
1550
1551                ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1552                             tracee, NULL, 0);
1553                ASSERT_EQ(0, ret);
1554        }
1555        /* Directly report the status of our test harness results. */
1556        syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE);
1557}
1558
1559/* Common tracer setup/teardown functions. */
1560void cont_handler(int num)
1561{ }
1562pid_t setup_trace_fixture(struct __test_metadata *_metadata,
1563                          tracer_func_t func, void *args, bool ptrace_syscall)
1564{
1565        char sync;
1566        int pipefd[2];
1567        pid_t tracer_pid;
1568        pid_t tracee = getpid();
1569
1570        /* Setup a pipe for clean synchronization. */
1571        ASSERT_EQ(0, pipe(pipefd));
1572
1573        /* Fork a child which we'll promote to tracer */
1574        tracer_pid = fork();
1575        ASSERT_LE(0, tracer_pid);
1576        signal(SIGALRM, cont_handler);
1577        if (tracer_pid == 0) {
1578                close(pipefd[0]);
1579                start_tracer(_metadata, pipefd[1], tracee, func, args,
1580                             ptrace_syscall);
1581                syscall(__NR_exit, 0);
1582        }
1583        close(pipefd[1]);
1584        prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0);
1585        read(pipefd[0], &sync, 1);
1586        close(pipefd[0]);
1587
1588        return tracer_pid;
1589}
1590
1591void teardown_trace_fixture(struct __test_metadata *_metadata,
1592                            pid_t tracer)
1593{
1594        if (tracer) {
1595                int status;
1596                /*
1597                 * Extract the exit code from the other process and
1598                 * adopt it for ourselves in case its asserts failed.
1599                 */
1600                ASSERT_EQ(0, kill(tracer, SIGUSR1));
1601                ASSERT_EQ(tracer, waitpid(tracer, &status, 0));
1602                if (WEXITSTATUS(status))
1603                        _metadata->passed = 0;
1604        }
1605}
1606
/* "poke" tracer arguments and function. */
/* Argument bundle passed to tracer_poke() through its void *args. */
struct tracer_args_poke_t {
        /* Address in the tracee that tracer_poke() writes 0x1001 to. */
        unsigned long poke_addr;
};
1611
1612void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status,
1613                 void *args)
1614{
1615        int ret;
1616        unsigned long msg;
1617        struct tracer_args_poke_t *info = (struct tracer_args_poke_t *)args;
1618
1619        ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1620        EXPECT_EQ(0, ret);
1621        /* If this fails, don't try to recover. */
1622        ASSERT_EQ(0x1001, msg) {
1623                kill(tracee, SIGKILL);
1624        }
1625        /*
1626         * Poke in the message.
1627         * Registers are not touched to try to keep this relatively arch
1628         * agnostic.
1629         */
1630        ret = ptrace(PTRACE_POKEDATA, tracee, info->poke_addr, 0x1001);
1631        EXPECT_EQ(0, ret);
1632}
1633
/* Per-test state for the TRACE_poke tests. */
FIXTURE(TRACE_poke) {
        /* Filter program; its instruction array is heap-allocated in setup. */
        struct sock_fprog prog;
        /* Pid of the tracer forked by setup_trace_fixture(). */
        pid_t tracer;
        /* Word the tracer overwrites with 0x1001; starts at 0. */
        long poked;
        /* Arguments handed to tracer_poke(); holds the address of poked. */
        struct tracer_args_poke_t tracer_args;
};
1640
1641FIXTURE_SETUP(TRACE_poke)
1642{
1643        struct sock_filter filter[] = {
1644                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1645                        offsetof(struct seccomp_data, nr)),
1646                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
1647                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1001),
1648                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1649        };
1650
1651        self->poked = 0;
1652        memset(&self->prog, 0, sizeof(self->prog));
1653        self->prog.filter = malloc(sizeof(filter));
1654        ASSERT_NE(NULL, self->prog.filter);
1655        memcpy(self->prog.filter, filter, sizeof(filter));
1656        self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1657
1658        /* Set up tracer args. */
1659        self->tracer_args.poke_addr = (unsigned long)&self->poked;
1660
1661        /* Launch tracer. */
1662        self->tracer = setup_trace_fixture(_metadata, tracer_poke,
1663                                           &self->tracer_args, false);
1664}
1665
1666FIXTURE_TEARDOWN(TRACE_poke)
1667{
1668        teardown_trace_fixture(_metadata, self->tracer);
1669        if (self->prog.filter)
1670                free(self->prog.filter);
1671}
1672
1673TEST_F(TRACE_poke, read_has_side_effects)
1674{
1675        ssize_t ret;
1676
1677        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1678        ASSERT_EQ(0, ret);
1679
1680        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1681        ASSERT_EQ(0, ret);
1682
1683        EXPECT_EQ(0, self->poked);
1684        ret = read(-1, NULL, 0);
1685        EXPECT_EQ(-1, ret);
1686        EXPECT_EQ(0x1001, self->poked);
1687}
1688
1689TEST_F(TRACE_poke, getpid_runs_normally)
1690{
1691        long ret;
1692
1693        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1694        ASSERT_EQ(0, ret);
1695
1696        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1697        ASSERT_EQ(0, ret);
1698
1699        EXPECT_EQ(0, self->poked);
1700        EXPECT_NE(0, syscall(__NR_getpid));
1701        EXPECT_EQ(0, self->poked);
1702}
1703
1704#if defined(__x86_64__)
1705# define ARCH_REGS              struct user_regs_struct
1706# define SYSCALL_NUM(_regs)     (_regs).orig_rax
1707# define SYSCALL_RET(_regs)     (_regs).rax
1708#elif defined(__i386__)
1709# define ARCH_REGS              struct user_regs_struct
1710# define SYSCALL_NUM(_regs)     (_regs).orig_eax
1711# define SYSCALL_RET(_regs)     (_regs).eax
1712#elif defined(__arm__)
1713# define ARCH_REGS              struct pt_regs
1714# define SYSCALL_NUM(_regs)     (_regs).ARM_r7
1715# ifndef PTRACE_SET_SYSCALL
1716#  define PTRACE_SET_SYSCALL   23
1717# endif
1718# define SYSCALL_NUM_SET(_regs, _nr)    \
1719                EXPECT_EQ(0, ptrace(PTRACE_SET_SYSCALL, tracee, NULL, _nr))
1720# define SYSCALL_RET(_regs)     (_regs).ARM_r0
1721#elif defined(__aarch64__)
1722# define ARCH_REGS              struct user_pt_regs
1723# define SYSCALL_NUM(_regs)     (_regs).regs[8]
1724# ifndef NT_ARM_SYSTEM_CALL
1725#  define NT_ARM_SYSTEM_CALL 0x404
1726# endif
1727# define SYSCALL_NUM_SET(_regs, _nr)                            \
1728        do {                                                    \
1729                struct iovec __v;                               \
1730                typeof(_nr) __nr = (_nr);                       \
1731                __v.iov_base = &__nr;                           \
1732                __v.iov_len = sizeof(__nr);                     \
1733                EXPECT_EQ(0, ptrace(PTRACE_SETREGSET, tracee,   \
1734                                    NT_ARM_SYSTEM_CALL, &__v)); \
1735        } while (0)
1736# define SYSCALL_RET(_regs)     (_regs).regs[0]
1737#elif defined(__riscv) && __riscv_xlen == 64
1738# define ARCH_REGS              struct user_regs_struct
1739# define SYSCALL_NUM(_regs)     (_regs).a7
1740# define SYSCALL_RET(_regs)     (_regs).a0
1741#elif defined(__csky__)
1742# define ARCH_REGS              struct pt_regs
1743#  if defined(__CSKYABIV2__)
1744#   define SYSCALL_NUM(_regs)   (_regs).regs[3]
1745#  else
1746#   define SYSCALL_NUM(_regs)   (_regs).regs[9]
1747#  endif
1748# define SYSCALL_RET(_regs)     (_regs).a0
1749#elif defined(__hppa__)
1750# define ARCH_REGS              struct user_regs_struct
1751# define SYSCALL_NUM(_regs)     (_regs).gr[20]
1752# define SYSCALL_RET(_regs)     (_regs).gr[28]
1753#elif defined(__powerpc__)
1754# define ARCH_REGS              struct pt_regs
1755# define SYSCALL_NUM(_regs)     (_regs).gpr[0]
1756# define SYSCALL_RET(_regs)     (_regs).gpr[3]
1757# define SYSCALL_RET_SET(_regs, _val)                           \
1758        do {                                                    \
1759                typeof(_val) _result = (_val);                  \
1760                if ((_regs.trap & 0xfff0) == 0x3000) {          \
1761                        /*                                      \
1762                         * scv 0 system call uses -ve result    \
1763                         * for error, so no need to adjust.     \
1764                         */                                     \
1765                        SYSCALL_RET(_regs) = _result;           \
1766                } else {                                        \
1767                        /*                                      \
1768                         * A syscall error is signaled by the   \
1769                         * CR0 SO bit and the code is stored as \
1770                         * a positive value.                    \
1771                         */                                     \
1772                        if (_result < 0) {                      \
1773                                SYSCALL_RET(_regs) = -_result;  \
1774                                (_regs).ccr |= 0x10000000;      \
1775                        } else {                                \
1776                                SYSCALL_RET(_regs) = _result;   \
1777                                (_regs).ccr &= ~0x10000000;     \
1778                        }                                       \
1779                }                                               \
1780        } while (0)
1781# define SYSCALL_RET_SET_ON_PTRACE_EXIT
1782#elif defined(__s390__)
1783# define ARCH_REGS              s390_regs
1784# define SYSCALL_NUM(_regs)     (_regs).gprs[2]
1785# define SYSCALL_RET_SET(_regs, _val)                   \
1786                TH_LOG("Can't modify syscall return on this architecture")
1787#elif defined(__mips__)
1788# include <asm/unistd_nr_n32.h>
1789# include <asm/unistd_nr_n64.h>
1790# include <asm/unistd_nr_o32.h>
1791# define ARCH_REGS              struct pt_regs
1792# define SYSCALL_NUM(_regs)                             \
1793        ({                                              \
1794                typeof((_regs).regs[2]) _nr;            \
1795                if ((_regs).regs[2] == __NR_O32_Linux)  \
1796                        _nr = (_regs).regs[4];          \
1797                else                                    \
1798                        _nr = (_regs).regs[2];          \
1799                _nr;                                    \
1800        })
1801# define SYSCALL_NUM_SET(_regs, _nr)                    \
1802        do {                                            \
1803                if ((_regs).regs[2] == __NR_O32_Linux)  \
1804                        (_regs).regs[4] = _nr;          \
1805                else                                    \
1806                        (_regs).regs[2] = _nr;          \
1807        } while (0)
1808# define SYSCALL_RET_SET(_regs, _val)                   \
1809                TH_LOG("Can't modify syscall return on this architecture")
1810#elif defined(__xtensa__)
# define ARCH_REGS              struct user_pt_regs
# define SYSCALL_NUM(_regs)     (_regs).syscall
/*
 * On xtensa the syscall return value is held in register a2 of the
 * current register window. The window is not at a fixed position, so
 * the slot is located relative to windowbase.
 */
#define SYSCALL_RET(_regs)      (_regs).a[(_regs).windowbase * 4 + 2]
1818#elif defined(__sh__)
/* sh: syscall number is read from regs[3], return value from regs[0]. */
# define ARCH_REGS              struct pt_regs
# define SYSCALL_NUM(_regs)     (_regs).regs[3]
# define SYSCALL_RET(_regs)     (_regs).regs[0]
1822#else
1823# error "Do not know how to find your architecture's registers and syscalls"
1824#endif
1825
/*
 * Default syscall-number setter. On most architectures storing into the
 * SYSCALL_NUM() register is sufficient; an architecture that needs more
 * provides its own SYSCALL_NUM_SET() above and this one is skipped.
 */
#if !defined(SYSCALL_NUM_SET)
# define SYSCALL_NUM_SET(_regs, _nr)    do { SYSCALL_NUM(_regs) = (_nr); } while (0)
#endif
1836/*
1837 * Most architectures can change the syscall return value by just
1838 * writing to the SYSCALL_RET register. This is the default if not
1839 * defined above. If an architecture cannot set the return value
1840 * (for example when the syscall and return value register is
1841 * shared), report it with TH_LOG() in an arch-specific definition
1842 * of SYSCALL_RET_SET() above, and leave SYSCALL_RET undefined.
1843 */
1844#if !defined(SYSCALL_RET) && !defined(SYSCALL_RET_SET)
1845# error "One of SYSCALL_RET or SYSCALL_RET_SET is needed for this arch"
1846#endif
1847#ifndef SYSCALL_RET_SET
1848# define SYSCALL_RET_SET(_regs, _val)           \
1849        do {                                    \
1850                SYSCALL_RET(_regs) = (_val);    \
1851        } while (0)
1852#endif
1853
/*
 * EXPECT_SYSCALL_RETURN(val, action) - check the result of a syscall whose
 * return value the tracer is expected to have rewritten.
 *
 * @val:    expected result; a negative value means "-1 with errno == -val".
 * @action: expression performing the syscall.
 *
 * When the architecture provides no SYSCALL_RET (the return value can't be
 * changed), the check is stubbed out to expect a plain -1.
 */
#ifndef SYSCALL_RET
# define EXPECT_SYSCALL_RETURN(val, action)     EXPECT_EQ(-1, action)
#else
# define EXPECT_SYSCALL_RETURN(val, action)             \
        do {                                            \
                /* Start clean so a stale errno can't satisfy the check. */ \
                errno = 0;                              \
                if ((val) < 0) {                        \
                        EXPECT_EQ(-1, action);          \
                        EXPECT_EQ(-(val), errno);       \
                } else {                                \
                        EXPECT_EQ(val, action);         \
                }                                       \
        } while (0)
#endif
1869
/*
 * Phase in which the ptrace tracer rewrites each value: true selects
 * syscall entry, false selects syscall exit. The syscall number is
 * always set on entry. Some architectures (e.g. powerpc) can only set
 * the return value on syscall exit; they announce this by defining
 * SYSCALL_RET_SET_ON_PTRACE_EXIT.
 */
const bool ptrace_entry_set_syscall_nr = true;
#ifdef SYSCALL_RET_SET_ON_PTRACE_EXIT
const bool ptrace_entry_set_syscall_ret = false;
#else
const bool ptrace_entry_set_syscall_ret = true;
#endif
1881
/*
 * Register access helpers. Both expand to a ptrace() call on the
 * variable named "tracee", which must be in scope at the point of use,
 * and evaluate to that call's return value.
 *
 * PTRACE_GETREGS/PTRACE_SETREGS are used where available, which is
 * useful for architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode
 * Linux). Everything else goes through the NT_PRSTATUS regset.
 */
#if defined(__x86_64__) || defined(__i386__) || defined(__mips__)
# define ARCH_GETREGS(_regs)    ptrace(PTRACE_GETREGS, tracee, 0, &(_regs))
# define ARCH_SETREGS(_regs)    ptrace(PTRACE_SETREGS, tracee, 0, &(_regs))
#else
# define ARCH_GETREGS(_regs)    ({                                      \
                struct iovec __v = {                                    \
                        .iov_base = &(_regs),                           \
                        .iov_len = sizeof(_regs),                       \
                };                                                      \
                ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &__v);    \
        })
# define ARCH_SETREGS(_regs)    ({                                      \
                struct iovec __v = {                                    \
                        .iov_base = &(_regs),                           \
                        .iov_len = sizeof(_regs),                       \
                };                                                      \
                ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &__v);    \
        })
#endif
1903
1904/* Architecture-specific syscall fetching routine. */
1905int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
1906{
1907        ARCH_REGS regs;
1908
1909        EXPECT_EQ(0, ARCH_GETREGS(regs)) {
1910                return -1;
1911        }
1912
1913        return SYSCALL_NUM(regs);
1914}
1915
1916/* Architecture-specific syscall changing routine. */
1917void __change_syscall(struct __test_metadata *_metadata,
1918                    pid_t tracee, long *syscall, long *ret)
1919{
1920        ARCH_REGS orig, regs;
1921
1922        /* Do not get/set registers if we have nothing to do. */
1923        if (!syscall && !ret)
1924                return;
1925
1926        EXPECT_EQ(0, ARCH_GETREGS(regs)) {
1927                return;
1928        }
1929        orig = regs;
1930
1931        if (syscall)
1932                SYSCALL_NUM_SET(regs, *syscall);
1933
1934        if (ret)
1935                SYSCALL_RET_SET(regs, *ret);
1936
1937        /* Flush any register changes made. */
1938        if (memcmp(&orig, &regs, sizeof(orig)) != 0)
1939                EXPECT_EQ(0, ARCH_SETREGS(regs));
1940}
1941
/* Replace the tracee's syscall number, leaving the return value alone. */
void change_syscall_nr(struct __test_metadata *_metadata,
                       pid_t tracee, long syscall)
{
        long *keep_ret = NULL;

        __change_syscall(_metadata, tracee, &syscall, keep_ret);
}
1948
/*
 * Force the tracee's syscall return value to @ret. The syscall number
 * is set to -1 in the same update.
 */
void change_syscall_ret(struct __test_metadata *_metadata,
                        pid_t tracee, long ret)
{
        long skipped_nr = -1;

        __change_syscall(_metadata, tracee, &skipped_nr, &ret);
}
1957
1958void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
1959                    int status, void *args)
1960{
1961        int ret;
1962        unsigned long msg;
1963
1964        /* Make sure we got the right message. */
1965        ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1966        EXPECT_EQ(0, ret);
1967
1968        /* Validate and take action on expected syscalls. */
1969        switch (msg) {
1970        case 0x1002:
1971                /* change getpid to getppid. */
1972                EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee));
1973                change_syscall_nr(_metadata, tracee, __NR_getppid);
1974                break;
1975        case 0x1003:
1976                /* skip gettid with valid return code. */
1977                EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee));
1978                change_syscall_ret(_metadata, tracee, 45000);
1979                break;
1980        case 0x1004:
1981                /* skip openat with error. */
1982                EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee));
1983                change_syscall_ret(_metadata, tracee, -ESRCH);
1984                break;
1985        case 0x1005:
1986                /* do nothing (allow getppid) */
1987                EXPECT_EQ(__NR_getppid, get_syscall(_metadata, tracee));
1988                break;
1989        default:
1990                EXPECT_EQ(0, msg) {
1991                        TH_LOG("Unknown PTRACE_GETEVENTMSG: 0x%lx", msg);
1992                        kill(tracee, SIGKILL);
1993                }
1994        }
1995
1996}
1997
1998FIXTURE(TRACE_syscall) {
1999        struct sock_fprog prog;
2000        pid_t tracer, mytid, mypid, parent;
2001        long syscall_nr;
2002};
2003
2004void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
2005                   int status, void *args)
2006{
2007        int ret;
2008        unsigned long msg;
2009        static bool entry;
2010        long syscall_nr_val, syscall_ret_val;
2011        long *syscall_nr = NULL, *syscall_ret = NULL;
2012        FIXTURE_DATA(TRACE_syscall) *self = args;
2013
2014        /*
2015         * The traditional way to tell PTRACE_SYSCALL entry/exit
2016         * is by counting.
2017         */
2018        entry = !entry;
2019
2020        /* Make sure we got an appropriate message. */
2021        ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
2022        EXPECT_EQ(0, ret);
2023        EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
2024                        : PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
2025
2026        /*
2027         * Some architectures only support setting return values during
2028         * syscall exit under ptrace, and on exit the syscall number may
2029         * no longer be available. Therefore, save the initial sycall
2030         * number here, so it can be examined during both entry and exit
2031         * phases.
2032         */
2033        if (entry)
2034                self->syscall_nr = get_syscall(_metadata, tracee);
2035
2036        /*
2037         * Depending on the architecture's syscall setting abilities, we
2038         * pick which things to set during this phase (entry or exit).
2039         */
2040        if (entry == ptrace_entry_set_syscall_nr)
2041                syscall_nr = &syscall_nr_val;
2042        if (entry == ptrace_entry_set_syscall_ret)
2043                syscall_ret = &syscall_ret_val;
2044
2045        /* Now handle the actual rewriting cases. */
2046        switch (self->syscall_nr) {
2047        case __NR_getpid:
2048                syscall_nr_val = __NR_getppid;
2049                /* Never change syscall return for this case. */
2050                syscall_ret = NULL;
2051                break;
2052        case __NR_gettid:
2053                syscall_nr_val = -1;
2054                syscall_ret_val = 45000;
2055                break;
2056        case __NR_openat:
2057                syscall_nr_val = -1;
2058                syscall_ret_val = -ESRCH;
2059                break;
2060        default:
2061                /* Unhandled, do nothing. */
2062                return;
2063        }
2064
2065        __change_syscall(_metadata, tracee, syscall_nr, syscall_ret);
2066}
2067
/* Parameters distinguishing the two flavours of the TRACE_syscall tests. */
FIXTURE_VARIANT(TRACE_syscall) {
        /*
         * All of the SECCOMP_RET_TRACE behaviors can be tested with either
         * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
         * This indicates if we should use SECCOMP_RET_TRACE (false), or
         * ptrace (true).
         */
        bool use_ptrace;
};

/* Variant driven by tracer_ptrace(); setup installs no seccomp filter. */
FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
        .use_ptrace = true,
};

/* Variant driven by tracer_seccomp() through the SECCOMP_RET_TRACE filter. */
FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
        .use_ptrace = false,
};
2085
2086FIXTURE_SETUP(TRACE_syscall)
2087{
2088        struct sock_filter filter[] = {
2089                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2090                        offsetof(struct seccomp_data, nr)),
2091                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
2092                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1002),
2093                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_gettid, 0, 1),
2094                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1003),
2095                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_openat, 0, 1),
2096                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1004),
2097                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2098                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005),
2099                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2100        };
2101        struct sock_fprog prog = {
2102                .len = (unsigned short)ARRAY_SIZE(filter),
2103                .filter = filter,
2104        };
2105        long ret;
2106
2107        /* Prepare some testable syscall results. */
2108        self->mytid = syscall(__NR_gettid);
2109        ASSERT_GT(self->mytid, 0);
2110        ASSERT_NE(self->mytid, 1) {
2111                TH_LOG("Running this test as init is not supported. :)");
2112        }
2113
2114        self->mypid = getpid();
2115        ASSERT_GT(self->mypid, 0);
2116        ASSERT_EQ(self->mytid, self->mypid);
2117
2118        self->parent = getppid();
2119        ASSERT_GT(self->parent, 0);
2120        ASSERT_NE(self->parent, self->mypid);
2121
2122        /* Launch tracer. */
2123        self->tracer = setup_trace_fixture(_metadata,
2124                                           variant->use_ptrace ? tracer_ptrace
2125                                                               : tracer_seccomp,
2126                                           self, variant->use_ptrace);
2127
2128        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2129        ASSERT_EQ(0, ret);
2130
2131        if (variant->use_ptrace)
2132                return;
2133
2134        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2135        ASSERT_EQ(0, ret);
2136}
2137
/* Hand the tracer recorded during setup to teardown_trace_fixture(). */
FIXTURE_TEARDOWN(TRACE_syscall)
{
        teardown_trace_fixture(_metadata, self->tracer);
}
2142
/*
 * Invoking a negative syscall number must fail with -1 and ENOSYS, both
 * for -1 and for another invalid negative number.
 */
TEST(negative_ENOSYS)
{
        /*
         * There should be no difference between an "internal" skip
         * and userspace asking for syscall "-1".
         */
        errno = 0;
        EXPECT_EQ(-1, syscall(-1));
        /* Expected value first, as in the rest of this file. */
        EXPECT_EQ(ENOSYS, errno);
        /* And no difference for "still not valid but not -1". */
        errno = 0;
        EXPECT_EQ(-1, syscall(-101));
        EXPECT_EQ(ENOSYS, errno);
}
2157
/* Re-run the negative_ENOSYS checks with the TRACE_syscall fixture active. */
TEST_F(TRACE_syscall, negative_ENOSYS)
{
        negative_ENOSYS(_metadata);
}
2162
/* A syscall the tracer does not rewrite must behave normally. */
TEST_F(TRACE_syscall, syscall_allowed)
{
        /* getppid works as expected (no changes). */
        EXPECT_EQ(self->parent, syscall(__NR_getppid));
        EXPECT_NE(self->mypid, syscall(__NR_getppid));
}
2169
/* The tracer swaps getpid for getppid, so getpid must return the parent. */
TEST_F(TRACE_syscall, syscall_redirected)
{
        /* getpid has been redirected to getppid as expected. */
        EXPECT_EQ(self->parent, syscall(__NR_getpid));
        EXPECT_NE(self->mypid, syscall(__NR_getpid));
}
2176
/* The tracer replaces openat's result with -ESRCH. */
TEST_F(TRACE_syscall, syscall_errno)
{
        /* Tracer should skip the open syscall, resulting in ESRCH. */
        EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
}
2182
/* The tracer replaces gettid's result with the fixed value 45000. */
TEST_F(TRACE_syscall, syscall_faked)
{
        /* Tracer skips the gettid syscall and stores an altered return value. */
        EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
}
2188
/*
 * A syscall rewritten by the tracer is still subject to seccomp
 * filtering: with an extra "EPERM on getppid" filter, getpid (redirected
 * to getppid) must fail with EPERM.
 */
TEST_F(TRACE_syscall, skip_after)
{
        struct sock_filter filter[] = {
                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
                        offsetof(struct seccomp_data, nr)),
                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog prog = {
                .len = (unsigned short)ARRAY_SIZE(filter),
                .filter = filter,
        };
        long ret;

        /* Install additional "errno on getppid" filter. */
        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
        ASSERT_EQ(0, ret);

        /* Tracer will redirect getpid to getppid, and we should see EPERM. */
        errno = 0;
        EXPECT_EQ(-1, syscall(__NR_getpid));
        EXPECT_EQ(EPERM, errno);
}
2213
/*
 * Same idea as skip_after, but the extra filter returns SECCOMP_RET_KILL
 * for getppid; the test is declared to end with SIGSYS.
 */
TEST_F_SIGNAL(TRACE_syscall, kill_after, SIGSYS)
{
        struct sock_filter filter[] = {
                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
                        offsetof(struct seccomp_data, nr)),
                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog prog = {
                .len = (unsigned short)ARRAY_SIZE(filter),
                .filter = filter,
        };
        long ret;

        /* Install additional "death on getppid" filter. */
        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
        ASSERT_EQ(0, ret);

        /* Tracer will redirect getpid to getppid, and we should die. */
        EXPECT_NE(self->mypid, syscall(__NR_getpid));
}
2236
2237TEST(seccomp_syscall)
2238{
2239        struct sock_filter filter[] = {
2240                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2241        };
2242        struct sock_fprog prog = {
2243                .len = (unsigned short)ARRAY_SIZE(filter),
2244                .filter = filter,
2245        };
2246        long ret;
2247
2248        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2249        ASSERT_EQ(0, ret) {
2250                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2251        }
2252
2253        /* Reject insane operation. */
2254        ret = seccomp(-1, 0, &prog);
2255        ASSERT_NE(ENOSYS, errno) {
2256                TH_LOG("Kernel does not support seccomp syscall!");
2257        }
2258        EXPECT_EQ(EINVAL, errno) {
2259                TH_LOG("Did not reject crazy op value!");
2260        }
2261
2262        /* Reject strict with flags or pointer. */
2263        ret = seccomp(SECCOMP_SET_MODE_STRICT, -1, NULL);
2264        EXPECT_EQ(EINVAL, errno) {
2265                TH_LOG("Did not reject mode strict with flags!");
2266        }
2267        ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, &prog);
2268        EXPECT_EQ(EINVAL, errno) {
2269                TH_LOG("Did not reject mode strict with uargs!");
2270        }
2271
2272        /* Reject insane args for filter. */
2273        ret = seccomp(SECCOMP_SET_MODE_FILTER, -1, &prog);
2274        EXPECT_EQ(EINVAL, errno) {
2275                TH_LOG("Did not reject crazy filter flags!");
2276        }
2277        ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, NULL);
2278        EXPECT_EQ(EFAULT, errno) {
2279                TH_LOG("Did not reject NULL filter!");
2280        }
2281
2282        ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2283        EXPECT_EQ(0, errno) {
2284                TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER: %s",
2285                        strerror(errno));
2286        }
2287}
2288
2289TEST(seccomp_syscall_mode_lock)
2290{
2291        struct sock_filter filter[] = {
2292                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2293        };
2294        struct sock_fprog prog = {
2295                .len = (unsigned short)ARRAY_SIZE(filter),
2296                .filter = filter,
2297        };
2298        long ret;
2299
2300        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2301        ASSERT_EQ(0, ret) {
2302                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2303        }
2304
2305        ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2306        ASSERT_NE(ENOSYS, errno) {
2307                TH_LOG("Kernel does not support seccomp syscall!");
2308        }
2309        EXPECT_EQ(0, ret) {
2310                TH_LOG("Could not install filter!");
2311        }
2312
2313        /* Make sure neither entry point will switch to strict. */
2314        ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
2315        EXPECT_EQ(EINVAL, errno) {
2316                TH_LOG("Switched to mode strict!");
2317        }
2318
2319        ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
2320        EXPECT_EQ(EINVAL, errno) {
2321                TH_LOG("Switched to mode strict!");
2322        }
2323}
2324
2325/*
2326 * Test detection of known and unknown filter flags. Userspace needs to be able
2327 * to check if a filter flag is supported by the current kernel and a good way
2328 * of doing that is by attempting to enter filter mode, with the flag bit in
2329 * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates
2330 * that the flag is valid and EINVAL indicates that the flag is invalid.
2331 */
2332TEST(detect_seccomp_filter_flags)
2333{
2334        unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
2335                                 SECCOMP_FILTER_FLAG_LOG,
2336                                 SECCOMP_FILTER_FLAG_SPEC_ALLOW,
2337                                 SECCOMP_FILTER_FLAG_NEW_LISTENER,
2338                                 SECCOMP_FILTER_FLAG_TSYNC_ESRCH };
2339        unsigned int exclusive[] = {
2340                                SECCOMP_FILTER_FLAG_TSYNC,
2341                                SECCOMP_FILTER_FLAG_NEW_LISTENER };
2342        unsigned int flag, all_flags, exclusive_mask;
2343        int i;
2344        long ret;
2345
2346        /* Test detection of individual known-good filter flags */
2347        for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
2348                int bits = 0;
2349
2350                flag = flags[i];
2351                /* Make sure the flag is a single bit! */
2352                while (flag) {
2353                        if (flag & 0x1)
2354                                bits ++;
2355                        flag >>= 1;
2356                }
2357                ASSERT_EQ(1, bits);
2358                flag = flags[i];
2359
2360                ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2361                ASSERT_NE(ENOSYS, errno) {
2362                        TH_LOG("Kernel does not support seccomp syscall!");
2363                }
2364                EXPECT_EQ(-1, ret);
2365                EXPECT_EQ(EFAULT, errno) {
2366                        TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!",
2367                               flag);
2368                }
2369
2370                all_flags |= flag;
2371        }
2372
2373        /*
2374         * Test detection of all known-good filter flags combined. But
2375         * for the exclusive flags we need to mask them out and try them
2376         * individually for the "all flags" testing.
2377         */
2378        exclusive_mask = 0;
2379        for (i = 0; i < ARRAY_SIZE(exclusive); i++)
2380                exclusive_mask |= exclusive[i];
2381        for (i = 0; i < ARRAY_SIZE(exclusive); i++) {
2382                flag = all_flags & ~exclusive_mask;
2383                flag |= exclusive[i];
2384
2385                ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2386                EXPECT_EQ(-1, ret);
2387                EXPECT_EQ(EFAULT, errno) {
2388                        TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
2389                               flag);
2390                }
2391        }
2392
2393        /* Test detection of an unknown filter flags, without exclusives. */
2394        flag = -1;
2395        flag &= ~exclusive_mask;
2396        ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2397        EXPECT_EQ(-1, ret);
2398        EXPECT_EQ(EINVAL, errno) {
2399                TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!",
2400                       flag);
2401        }
2402
2403        /*
2404         * Test detection of an unknown filter flag that may simply need to be
2405         * added to this test
2406         */
2407        flag = flags[ARRAY_SIZE(flags) - 1] << 1;
2408        ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2409        EXPECT_EQ(-1, ret);
2410        EXPECT_EQ(EINVAL, errno) {
2411                TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?",
2412                       flag);
2413        }
2414}
2415
/*
 * Installing the very first filter with SECCOMP_FILTER_FLAG_TSYNC set
 * must succeed.
 */
TEST(TSYNC_first)
{
        struct sock_filter filter[] = {
                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog prog = {
                .len = (unsigned short)ARRAY_SIZE(filter),
                .filter = filter,
        };
        long ret;

        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
        ASSERT_EQ(0, ret) {
                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
        }

        ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
                      &prog);
        ASSERT_NE(ENOSYS, errno) {
                TH_LOG("Kernel does not support seccomp syscall!");
        }
        EXPECT_EQ(0, ret) {
                TH_LOG("Could not install initial filter with TSYNC!");
        }
}
2441
/* Number of sibling threads used by the TSYNC fixture. */
#define TSYNC_SIBLINGS 2
/* State of one sibling thread, shared between the test and the thread. */
struct tsync_sibling {
        /* Handle filled in by pthread_create(); cleared after a join. */
        pthread_t tid;
        /* gettid() result stored by the thread itself. */
        pid_t system_tid;
        /* Posted by the thread once it has started. */
        sem_t *started;
        /* Condition variable and mutex the thread waits on. */
        pthread_cond_t *cond;
        pthread_mutex_t *mutex;
        /* Non-zero: the thread installs @prog itself before waiting. */
        int diverge;
        /* Number of condition wake-ups to consume before proceeding. */
        int num_waits;
        /* Filter program installed when @diverge is set. */
        struct sock_fprog *prog;
        struct __test_metadata *metadata;
};
2454
/*
 * PTHREAD_JOIN(tid, status) - join a thread and forget its handle.
 *
 * To avoid joining joined threads (which is not allowed by Bionic),
 * make sure we both successfully join and clear the tid to skip a
 * later join attempt during fixture teardown. Any remaining threads
 * will be directly killed during teardown.
 *
 * @tid must be an lvalue; both arguments are parenthesized so that any
 * expression can be passed safely.
 */
#define PTHREAD_JOIN(tid, status)                                       \
        do {                                                            \
                int _rc = pthread_join((tid), (status));                \
                if (_rc) {                                              \
                        TH_LOG("pthread_join of tid %u failed: %d\n",   \
                                (unsigned int)(tid), _rc);              \
                } else {                                                \
                        (tid) = 0;                                      \
                }                                                       \
        } while (0)
2471
2472FIXTURE(TSYNC) {
2473        struct sock_fprog root_prog, apply_prog;
2474        struct tsync_sibling sibling[TSYNC_SIBLINGS];
2475        sem_t started;
2476        pthread_cond_t cond;
2477        pthread_mutex_t mutex;
2478        int sibling_count;
2479};
2480
2481FIXTURE_SETUP(TSYNC)
2482{
2483        struct sock_filter root_filter[] = {
2484                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2485        };
2486        struct sock_filter apply_filter[] = {
2487                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2488                        offsetof(struct seccomp_data, nr)),
2489                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
2490                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2491                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2492        };
2493
2494        memset(&self->root_prog, 0, sizeof(self->root_prog));
2495        memset(&self->apply_prog, 0, sizeof(self->apply_prog));
2496        memset(&self->sibling, 0, sizeof(self->sibling));
2497        self->root_prog.filter = malloc(sizeof(root_filter));
2498        ASSERT_NE(NULL, self->root_prog.filter);
2499        memcpy(self->root_prog.filter, &root_filter, sizeof(root_filter));
2500        self->root_prog.len = (unsigned short)ARRAY_SIZE(root_filter);
2501
2502        self->apply_prog.filter = malloc(sizeof(apply_filter));
2503        ASSERT_NE(NULL, self->apply_prog.filter);
2504        memcpy(self->apply_prog.filter, &apply_filter, sizeof(apply_filter));
2505        self->apply_prog.len = (unsigned short)ARRAY_SIZE(apply_filter);
2506
2507        self->sibling_count = 0;
2508        pthread_mutex_init(&self->mutex, NULL);
2509        pthread_cond_init(&self->cond, NULL);
2510        sem_init(&self->started, 0, 0);
2511        self->sibling[0].tid = 0;
2512        self->sibling[0].cond = &self->cond;
2513        self->sibling[0].started = &self->started;
2514        self->sibling[0].mutex = &self->mutex;
2515        self->sibling[0].diverge = 0;
2516        self->sibling[0].num_waits = 1;
2517        self->sibling[0].prog = &self->root_prog;
2518        self->sibling[0].metadata = _metadata;
2519        self->sibling[1].tid = 0;
2520        self->sibling[1].cond = &self->cond;
2521        self->sibling[1].started = &self->started;
2522        self->sibling[1].mutex = &self->mutex;
2523        self->sibling[1].diverge = 0;
2524        self->sibling[1].prog = &self->root_prog;
2525        self->sibling[1].num_waits = 1;
2526        self->sibling[1].metadata = _metadata;
2527}
2528
2529FIXTURE_TEARDOWN(TSYNC)
2530{
2531        int sib = 0;
2532
2533        if (self->root_prog.filter)
2534                free(self->root_prog.filter);
2535        if (self->apply_prog.filter)
2536                free(self->apply_prog.filter);
2537
2538        for ( ; sib < self->sibling_count; ++sib) {
2539                struct tsync_sibling *s = &self->sibling[sib];
2540
2541                if (!s->tid)
2542                        continue;
2543                /*
2544                 * If a thread is still running, it may be stuck, so hit
2545                 * it over the head really hard.
2546                 */
2547                pthread_kill(s->tid, 9);
2548        }
2549        pthread_mutex_destroy(&self->mutex);
2550        pthread_cond_destroy(&self->cond);
2551        sem_destroy(&self->started);
2552}
2553
2554void *tsync_sibling(void *data)
2555{
2556        long ret = 0;
2557        struct tsync_sibling *me = data;
2558
2559        me->system_tid = syscall(__NR_gettid);
2560
2561        pthread_mutex_lock(me->mutex);
2562        if (me->diverge) {
2563                /* Just re-apply the root prog to fork the tree */
2564                ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
2565                                me->prog, 0, 0);
2566        }
2567        sem_post(me->started);
2568        /* Return outside of started so parent notices failures. */
2569        if (ret) {
2570                pthread_mutex_unlock(me->mutex);
2571                return (void *)SIBLING_EXIT_FAILURE;
2572        }
2573        do {
2574                pthread_cond_wait(me->cond, me->mutex);
2575                me->num_waits = me->num_waits - 1;
2576        } while (me->num_waits);
2577        pthread_mutex_unlock(me->mutex);
2578
2579        ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
2580        if (!ret)
2581                return (void *)SIBLING_EXIT_NEWPRIVS;
2582        read(0, NULL, 0);
2583        return (void *)SIBLING_EXIT_UNKILLED;
2584}
2585
2586void tsync_start_sibling(struct tsync_sibling *sibling)
2587{
2588        pthread_create(&sibling->tid, NULL, tsync_sibling, (void *)sibling);
2589}
2590
2591TEST_F(TSYNC, siblings_fail_prctl)
2592{
2593        long ret;
2594        void *status;
2595        struct sock_filter filter[] = {
2596                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2597                        offsetof(struct seccomp_data, nr)),
2598                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
2599                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EINVAL),
2600                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2601        };
2602        struct sock_fprog prog = {
2603                .len = (unsigned short)ARRAY_SIZE(filter),
2604                .filter = filter,
2605        };
2606
2607        ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2608                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2609        }
2610
2611        /* Check prctl failure detection by requesting sib 0 diverge. */
2612        ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2613        ASSERT_NE(ENOSYS, errno) {
2614                TH_LOG("Kernel does not support seccomp syscall!");
2615        }
2616        ASSERT_EQ(0, ret) {
2617                TH_LOG("setting filter failed");
2618        }
2619
2620        self->sibling[0].diverge = 1;
2621        tsync_start_sibling(&self->sibling[0]);
2622        tsync_start_sibling(&self->sibling[1]);
2623
2624        while (self->sibling_count < TSYNC_SIBLINGS) {
2625                sem_wait(&self->started);
2626                self->sibling_count++;
2627        }
2628
2629        /* Signal the threads to clean up*/
2630        pthread_mutex_lock(&self->mutex);
2631        ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2632                TH_LOG("cond broadcast non-zero");
2633        }
2634        pthread_mutex_unlock(&self->mutex);
2635
2636        /* Ensure diverging sibling failed to call prctl. */
2637        PTHREAD_JOIN(self->sibling[0].tid, &status);
2638        EXPECT_EQ(SIBLING_EXIT_FAILURE, (long)status);
2639        PTHREAD_JOIN(self->sibling[1].tid, &status);
2640        EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2641}
2642
2643TEST_F(TSYNC, two_siblings_with_ancestor)
2644{
2645        long ret;
2646        void *status;
2647
2648        ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2649                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2650        }
2651
2652        ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2653        ASSERT_NE(ENOSYS, errno) {
2654                TH_LOG("Kernel does not support seccomp syscall!");
2655        }
2656        ASSERT_EQ(0, ret) {
2657                TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2658        }
2659        tsync_start_sibling(&self->sibling[0]);
2660        tsync_start_sibling(&self->sibling[1]);
2661
2662        while (self->sibling_count < TSYNC_SIBLINGS) {
2663                sem_wait(&self->started);
2664                self->sibling_count++;
2665        }
2666
2667        ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2668                      &self->apply_prog);
2669        ASSERT_EQ(0, ret) {
2670                TH_LOG("Could install filter on all threads!");
2671        }
2672        /* Tell the siblings to test the policy */
2673        pthread_mutex_lock(&self->mutex);
2674        ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2675                TH_LOG("cond broadcast non-zero");
2676        }
2677        pthread_mutex_unlock(&self->mutex);
2678        /* Ensure they are both killed and don't exit cleanly. */
2679        PTHREAD_JOIN(self->sibling[0].tid, &status);
2680        EXPECT_EQ(0x0, (long)status);
2681        PTHREAD_JOIN(self->sibling[1].tid, &status);
2682        EXPECT_EQ(0x0, (long)status);
2683}
2684
2685TEST_F(TSYNC, two_sibling_want_nnp)
2686{
2687        void *status;
2688
2689        /* start siblings before any prctl() operations */
2690        tsync_start_sibling(&self->sibling[0]);
2691        tsync_start_sibling(&self->sibling[1]);
2692        while (self->sibling_count < TSYNC_SIBLINGS) {
2693                sem_wait(&self->started);
2694                self->sibling_count++;
2695        }
2696
2697        /* Tell the siblings to test no policy */
2698        pthread_mutex_lock(&self->mutex);
2699        ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2700                TH_LOG("cond broadcast non-zero");
2701        }
2702        pthread_mutex_unlock(&self->mutex);
2703
2704        /* Ensure they are both upset about lacking nnp. */
2705        PTHREAD_JOIN(self->sibling[0].tid, &status);
2706        EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2707        PTHREAD_JOIN(self->sibling[1].tid, &status);
2708        EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2709}
2710
2711TEST_F(TSYNC, two_siblings_with_no_filter)
2712{
2713        long ret;
2714        void *status;
2715
2716        /* start siblings before any prctl() operations */
2717        tsync_start_sibling(&self->sibling[0]);
2718        tsync_start_sibling(&self->sibling[1]);
2719        while (self->sibling_count < TSYNC_SIBLINGS) {
2720                sem_wait(&self->started);
2721                self->sibling_count++;
2722        }
2723
2724        ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2725                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2726        }
2727
2728        ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2729                      &self->apply_prog);
2730        ASSERT_NE(ENOSYS, errno) {
2731                TH_LOG("Kernel does not support seccomp syscall!");
2732        }
2733        ASSERT_EQ(0, ret) {
2734                TH_LOG("Could install filter on all threads!");
2735        }
2736
2737        /* Tell the siblings to test the policy */
2738        pthread_mutex_lock(&self->mutex);
2739        ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2740                TH_LOG("cond broadcast non-zero");
2741        }
2742        pthread_mutex_unlock(&self->mutex);
2743
2744        /* Ensure they are both killed and don't exit cleanly. */
2745        PTHREAD_JOIN(self->sibling[0].tid, &status);
2746        EXPECT_EQ(0x0, (long)status);
2747        PTHREAD_JOIN(self->sibling[1].tid, &status);
2748        EXPECT_EQ(0x0, (long)status);
2749}
2750
2751TEST_F(TSYNC, two_siblings_with_one_divergence)
2752{
2753        long ret;
2754        void *status;
2755
2756        ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2757                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2758        }
2759
2760        ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2761        ASSERT_NE(ENOSYS, errno) {
2762                TH_LOG("Kernel does not support seccomp syscall!");
2763        }
2764        ASSERT_EQ(0, ret) {
2765                TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2766        }
2767        self->sibling[0].diverge = 1;
2768        tsync_start_sibling(&self->sibling[0]);
2769        tsync_start_sibling(&self->sibling[1]);
2770
2771        while (self->sibling_count < TSYNC_SIBLINGS) {
2772                sem_wait(&self->started);
2773                self->sibling_count++;
2774        }
2775
2776        ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2777                      &self->apply_prog);
2778        ASSERT_EQ(self->sibling[0].system_tid, ret) {
2779                TH_LOG("Did not fail on diverged sibling.");
2780        }
2781
2782        /* Wake the threads */
2783        pthread_mutex_lock(&self->mutex);
2784        ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2785                TH_LOG("cond broadcast non-zero");
2786        }
2787        pthread_mutex_unlock(&self->mutex);
2788
2789        /* Ensure they are both unkilled. */
2790        PTHREAD_JOIN(self->sibling[0].tid, &status);
2791        EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2792        PTHREAD_JOIN(self->sibling[1].tid, &status);
2793        EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2794}
2795
2796TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err)
2797{
2798        long ret, flags;
2799        void *status;
2800
2801        ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2802                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2803        }
2804
2805        ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2806        ASSERT_NE(ENOSYS, errno) {
2807                TH_LOG("Kernel does not support seccomp syscall!");
2808        }
2809        ASSERT_EQ(0, ret) {
2810                TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2811        }
2812        self->sibling[0].diverge = 1;
2813        tsync_start_sibling(&self->sibling[0]);
2814        tsync_start_sibling(&self->sibling[1]);
2815
2816        while (self->sibling_count < TSYNC_SIBLINGS) {
2817                sem_wait(&self->started);
2818                self->sibling_count++;
2819        }
2820
2821        flags = SECCOMP_FILTER_FLAG_TSYNC | \
2822                SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
2823        ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog);
2824        ASSERT_EQ(ESRCH, errno) {
2825                TH_LOG("Did not return ESRCH for diverged sibling.");
2826        }
2827        ASSERT_EQ(-1, ret) {
2828                TH_LOG("Did not fail on diverged sibling.");
2829        }
2830
2831        /* Wake the threads */
2832        pthread_mutex_lock(&self->mutex);
2833        ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2834                TH_LOG("cond broadcast non-zero");
2835        }
2836        pthread_mutex_unlock(&self->mutex);
2837
2838        /* Ensure they are both unkilled. */
2839        PTHREAD_JOIN(self->sibling[0].tid, &status);
2840        EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2841        PTHREAD_JOIN(self->sibling[1].tid, &status);
2842        EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2843}
2844
2845TEST_F(TSYNC, two_siblings_not_under_filter)
2846{
2847        long ret, sib;
2848        void *status;
2849        struct timespec delay = { .tv_nsec = 100000000 };
2850
2851        ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2852                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2853        }
2854
2855        /*
2856         * Sibling 0 will have its own seccomp policy
2857         * and Sibling 1 will not be under seccomp at
2858         * all. Sibling 1 will enter seccomp and 0
2859         * will cause failure.
2860         */
2861        self->sibling[0].diverge = 1;
2862        tsync_start_sibling(&self->sibling[0]);
2863        tsync_start_sibling(&self->sibling[1]);
2864
2865        while (self->sibling_count < TSYNC_SIBLINGS) {
2866                sem_wait(&self->started);
2867                self->sibling_count++;
2868        }
2869
2870        ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2871        ASSERT_NE(ENOSYS, errno) {
2872                TH_LOG("Kernel does not support seccomp syscall!");
2873        }
2874        ASSERT_EQ(0, ret) {
2875                TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2876        }
2877
2878        ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2879                      &self->apply_prog);
2880        ASSERT_EQ(ret, self->sibling[0].system_tid) {
2881                TH_LOG("Did not fail on diverged sibling.");
2882        }
2883        sib = 1;
2884        if (ret == self->sibling[0].system_tid)
2885                sib = 0;
2886
2887        pthread_mutex_lock(&self->mutex);
2888
2889        /* Increment the other siblings num_waits so we can clean up
2890         * the one we just saw.
2891         */
2892        self->sibling[!sib].num_waits += 1;
2893
2894        /* Signal the thread to clean up*/
2895        ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2896                TH_LOG("cond broadcast non-zero");
2897        }
2898        pthread_mutex_unlock(&self->mutex);
2899        PTHREAD_JOIN(self->sibling[sib].tid, &status);
2900        EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2901        /* Poll for actual task death. pthread_join doesn't guarantee it. */
2902        while (!kill(self->sibling[sib].system_tid, 0))
2903                nanosleep(&delay, NULL);
2904        /* Switch to the remaining sibling */
2905        sib = !sib;
2906
2907        ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2908                      &self->apply_prog);
2909        ASSERT_EQ(0, ret) {
2910                TH_LOG("Expected the remaining sibling to sync");
2911        };
2912
2913        pthread_mutex_lock(&self->mutex);
2914
2915        /* If remaining sibling didn't have a chance to wake up during
2916         * the first broadcast, manually reduce the num_waits now.
2917         */
2918        if (self->sibling[sib].num_waits > 1)
2919                self->sibling[sib].num_waits = 1;
2920        ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2921                TH_LOG("cond broadcast non-zero");
2922        }
2923        pthread_mutex_unlock(&self->mutex);
2924        PTHREAD_JOIN(self->sibling[sib].tid, &status);
2925        EXPECT_EQ(0, (long)status);
2926        /* Poll for actual task death. pthread_join doesn't guarantee it. */
2927        while (!kill(self->sibling[sib].system_tid, 0))
2928                nanosleep(&delay, NULL);
2929
2930        ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2931                      &self->apply_prog);
2932        ASSERT_EQ(0, ret);  /* just us chickens */
2933}
2934
2935/* Make sure restarted syscalls are seen directly as "restart_syscall". */
2936TEST(syscall_restart)
2937{
2938        long ret;
2939        unsigned long msg;
2940        pid_t child_pid;
2941        int pipefd[2];
2942        int status;
2943        siginfo_t info = { };
2944        struct sock_filter filter[] = {
2945                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2946                         offsetof(struct seccomp_data, nr)),
2947
2948#ifdef __NR_sigreturn
2949                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 7, 0),
2950#endif
2951                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 6, 0),
2952                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 5, 0),
2953                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 4, 0),
2954                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_nanosleep, 5, 0),
2955                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_nanosleep, 4, 0),
2956                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0),
2957
2958                /* Allow __NR_write for easy logging. */
2959                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 0, 1),
2960                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2961                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2962                /* The nanosleep jump target. */
2963                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x100),
2964                /* The restart_syscall jump target. */
2965                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x200),
2966        };
2967        struct sock_fprog prog = {
2968                .len = (unsigned short)ARRAY_SIZE(filter),
2969                .filter = filter,
2970        };
2971#if defined(__arm__)
2972        struct utsname utsbuf;
2973#endif
2974
2975        ASSERT_EQ(0, pipe(pipefd));
2976
2977        child_pid = fork();
2978        ASSERT_LE(0, child_pid);
2979        if (child_pid == 0) {
2980                /* Child uses EXPECT not ASSERT to deliver status correctly. */
2981                char buf = ' ';
2982                struct timespec timeout = { };
2983
2984                /* Attach parent as tracer and stop. */
2985                EXPECT_EQ(0, ptrace(PTRACE_TRACEME));
2986                EXPECT_EQ(0, raise(SIGSTOP));
2987
2988                EXPECT_EQ(0, close(pipefd[1]));
2989
2990                EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2991                        TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2992                }
2993
2994                ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2995                EXPECT_EQ(0, ret) {
2996                        TH_LOG("Failed to install filter!");
2997                }
2998
2999                EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
3000                        TH_LOG("Failed to read() sync from parent");
3001                }
3002                EXPECT_EQ('.', buf) {
3003                        TH_LOG("Failed to get sync data from read()");
3004                }
3005
3006                /* Start nanosleep to be interrupted. */
3007                timeout.tv_sec = 1;
3008                errno = 0;
3009                EXPECT_EQ(0, nanosleep(&timeout, NULL)) {
3010                        TH_LOG("Call to nanosleep() failed (errno %d)", errno);
3011                }
3012
3013                /* Read final sync from parent. */
3014                EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
3015                        TH_LOG("Failed final read() from parent");
3016                }
3017                EXPECT_EQ('!', buf) {
3018                        TH_LOG("Failed to get final data from read()");
3019                }
3020
3021                /* Directly report the status of our test harness results. */
3022                syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS
3023                                                     : EXIT_FAILURE);
3024        }
3025        EXPECT_EQ(0, close(pipefd[0]));
3026
3027        /* Attach to child, setup options, and release. */
3028        ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3029        ASSERT_EQ(true, WIFSTOPPED(status));
3030        ASSERT_EQ(0, ptrace(PTRACE_SETOPTIONS, child_pid, NULL,
3031                            PTRACE_O_TRACESECCOMP));
3032        ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3033        ASSERT_EQ(1, write(pipefd[1], ".", 1));
3034
3035        /* Wait for nanosleep() to start. */
3036        ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3037        ASSERT_EQ(true, WIFSTOPPED(status));
3038        ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
3039        ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
3040        ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
3041        ASSERT_EQ(0x100, msg);
3042        ret = get_syscall(_metadata, child_pid);
3043        EXPECT_TRUE(ret == __NR_nanosleep || ret == __NR_clock_nanosleep);
3044
3045        /* Might as well check siginfo for sanity while we're here. */
3046        ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
3047        ASSERT_EQ(SIGTRAP, info.si_signo);
3048        ASSERT_EQ(SIGTRAP | (PTRACE_EVENT_SECCOMP << 8), info.si_code);
3049        EXPECT_EQ(0, info.si_errno);
3050        EXPECT_EQ(getuid(), info.si_uid);
3051        /* Verify signal delivery came from child (seccomp-triggered). */
3052        EXPECT_EQ(child_pid, info.si_pid);
3053
3054        /* Interrupt nanosleep with SIGSTOP (which we'll need to handle). */
3055        ASSERT_EQ(0, kill(child_pid, SIGSTOP));
3056        ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3057        ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3058        ASSERT_EQ(true, WIFSTOPPED(status));
3059        ASSERT_EQ(SIGSTOP, WSTOPSIG(status));
3060        ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
3061        /*
3062         * There is no siginfo on SIGSTOP any more, so we can't verify
3063         * signal delivery came from parent now (getpid() == info.si_pid).
3064         * https://lkml.kernel.org/r/CAGXu5jJaZAOzP1qFz66tYrtbuywqb+UN2SOA1VLHpCCOiYvYeg@mail.gmail.com
3065         * At least verify the SIGSTOP via PTRACE_GETSIGINFO.
3066         */
3067        EXPECT_EQ(SIGSTOP, info.si_signo);
3068
3069        /* Restart nanosleep with SIGCONT, which triggers restart_syscall. */
3070        ASSERT_EQ(0, kill(child_pid, SIGCONT));
3071        ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3072        ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3073        ASSERT_EQ(true, WIFSTOPPED(status));
3074        ASSERT_EQ(SIGCONT, WSTOPSIG(status));
3075        ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3076
3077        /* Wait for restart_syscall() to start. */
3078        ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3079        ASSERT_EQ(true, WIFSTOPPED(status));
3080        ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
3081        ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
3082        ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
3083
3084        ASSERT_EQ(0x200, msg);
3085        ret = get_syscall(_metadata, child_pid);
3086#if defined(__arm__)
3087        /*
3088         * FIXME:
3089         * - native ARM registers do NOT expose true syscall.
3090         * - compat ARM registers on ARM64 DO expose true syscall.
3091         */
3092        ASSERT_EQ(0, uname(&utsbuf));
3093        if (strncmp(utsbuf.machine, "arm", 3) == 0) {
3094                EXPECT_EQ(__NR_nanosleep, ret);
3095        } else
3096#endif
3097        {
3098                EXPECT_EQ(__NR_restart_syscall, ret);
3099        }
3100
3101        /* Write again to end test. */
3102        ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3103        ASSERT_EQ(1, write(pipefd[1], "!", 1));
3104        EXPECT_EQ(0, close(pipefd[1]));
3105
3106        ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3107        if (WIFSIGNALED(status) || WEXITSTATUS(status))
3108                _metadata->passed = 0;
3109}
3110
3111TEST_SIGNAL(filter_flag_log, SIGSYS)
3112{
3113        struct sock_filter allow_filter[] = {
3114                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3115        };
3116        struct sock_filter kill_filter[] = {
3117                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3118                        offsetof(struct seccomp_data, nr)),
3119                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
3120                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
3121                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3122        };
3123        struct sock_fprog allow_prog = {
3124                .len = (unsigned short)ARRAY_SIZE(allow_filter),
3125                .filter = allow_filter,
3126        };
3127        struct sock_fprog kill_prog = {
3128                .len = (unsigned short)ARRAY_SIZE(kill_filter),
3129                .filter = kill_filter,
3130        };
3131        long ret;
3132        pid_t parent = getppid();
3133
3134        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3135        ASSERT_EQ(0, ret);
3136
3137        /* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */
3138        ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG,
3139                      &allow_prog);
3140        ASSERT_NE(ENOSYS, errno) {
3141                TH_LOG("Kernel does not support seccomp syscall!");
3142        }
3143        EXPECT_NE(0, ret) {
3144                TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!");
3145        }
3146        EXPECT_EQ(EINVAL, errno) {
3147                TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!");
3148        }
3149
3150        /* Verify that a simple, permissive filter can be added with no flags */
3151        ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
3152        EXPECT_EQ(0, ret);
3153
3154        /* See if the same filter can be added with the FILTER_FLAG_LOG flag */
3155        ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3156                      &allow_prog);
3157        ASSERT_NE(EINVAL, errno) {
3158                TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!");
3159        }
3160        EXPECT_EQ(0, ret);
3161
3162        /* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */
3163        ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3164                      &kill_prog);
3165        EXPECT_EQ(0, ret);
3166
3167        EXPECT_EQ(parent, syscall(__NR_getppid));
3168        /* getpid() should never return. */
3169        EXPECT_EQ(0, syscall(__NR_getpid));
3170}
3171
3172TEST(get_action_avail)
3173{
3174        __u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP,
3175                            SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
3176                            SECCOMP_RET_LOG,   SECCOMP_RET_ALLOW };
3177        __u32 unknown_action = 0x10000000U;
3178        int i;
3179        long ret;
3180
3181        ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]);
3182        ASSERT_NE(ENOSYS, errno) {
3183                TH_LOG("Kernel does not support seccomp syscall!");
3184        }
3185        ASSERT_NE(EINVAL, errno) {
3186                TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!");
3187        }
3188        EXPECT_EQ(ret, 0);
3189
3190        for (i = 0; i < ARRAY_SIZE(actions); i++) {
3191                ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]);
3192                EXPECT_EQ(ret, 0) {
3193                        TH_LOG("Expected action (0x%X) not available!",
3194                               actions[i]);
3195                }
3196        }
3197
3198        /* Check that an unknown action is handled properly (EOPNOTSUPP) */
3199        ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action);
3200        EXPECT_EQ(ret, -1);
3201        EXPECT_EQ(errno, EOPNOTSUPP);
3202}
3203
3204TEST(get_metadata)
3205{
3206        pid_t pid;
3207        int pipefd[2];
3208        char buf;
3209        struct seccomp_metadata md;
3210        long ret;
3211
3212        /* Only real root can get metadata. */
3213        if (geteuid()) {
3214                SKIP(return, "get_metadata requires real root");
3215                return;
3216        }
3217
3218        ASSERT_EQ(0, pipe(pipefd));
3219
3220        pid = fork();
3221        ASSERT_GE(pid, 0);
3222        if (pid == 0) {
3223                struct sock_filter filter[] = {
3224                        BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3225                };
3226                struct sock_fprog prog = {
3227                        .len = (unsigned short)ARRAY_SIZE(filter),
3228                        .filter = filter,
3229                };
3230
3231                /* one with log, one without */
3232                EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER,
3233                                     SECCOMP_FILTER_FLAG_LOG, &prog));
3234                EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));
3235
3236                EXPECT_EQ(0, close(pipefd[0]));
3237                ASSERT_EQ(1, write(pipefd[1], "1", 1));
3238                ASSERT_EQ(0, close(pipefd[1]));
3239
3240                while (1)
3241                        sleep(100);
3242        }
3243
3244        ASSERT_EQ(0, close(pipefd[1]));
3245        ASSERT_EQ(1, read(pipefd[0], &buf, 1));
3246
3247        ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid));
3248        ASSERT_EQ(pid, waitpid(pid, NULL, 0));
3249
3250        /* Past here must not use ASSERT or child process is never killed. */
3251
3252        md.filter_off = 0;
3253        errno = 0;
3254        ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3255        EXPECT_EQ(sizeof(md), ret) {
3256                if (errno == EINVAL)
3257                        SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)");
3258        }
3259
3260        EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG);
3261        EXPECT_EQ(md.filter_off, 0);
3262
3263        md.filter_off = 1;
3264        ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3265        EXPECT_EQ(sizeof(md), ret);
3266        EXPECT_EQ(md.flags, 0);
3267        EXPECT_EQ(md.filter_off, 1);
3268
3269skip:
3270        ASSERT_EQ(0, kill(pid, SIGKILL));
3271}
3272
3273static int user_notif_syscall(int nr, unsigned int flags)
3274{
3275        struct sock_filter filter[] = {
3276                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3277                        offsetof(struct seccomp_data, nr)),
3278                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
3279                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF),
3280                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3281        };
3282
3283        struct sock_fprog prog = {
3284                .len = (unsigned short)ARRAY_SIZE(filter),
3285                .filter = filter,
3286        };
3287
3288        return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
3289}
3290
3291#define USER_NOTIF_MAGIC INT_MAX
3292TEST(user_notification_basic)
3293{
3294        pid_t pid;
3295        long ret;
3296        int status, listener;
3297        struct seccomp_notif req = {};
3298        struct seccomp_notif_resp resp = {};
3299        struct pollfd pollfd;
3300
3301        struct sock_filter filter[] = {
3302                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3303        };
3304        struct sock_fprog prog = {
3305                .len = (unsigned short)ARRAY_SIZE(filter),
3306                .filter = filter,
3307        };
3308
3309        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3310        ASSERT_EQ(0, ret) {
3311                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3312        }
3313
3314        pid = fork();
3315        ASSERT_GE(pid, 0);
3316
3317        /* Check that we get -ENOSYS with no listener attached */
3318        if (pid == 0) {
3319                if (user_notif_syscall(__NR_getppid, 0) < 0)
3320                        exit(1);
3321                ret = syscall(__NR_getppid);
3322                exit(ret >= 0 || errno != ENOSYS);
3323        }
3324
3325        EXPECT_EQ(waitpid(pid, &status, 0), pid);
3326        EXPECT_EQ(true, WIFEXITED(status));
3327        EXPECT_EQ(0, WEXITSTATUS(status));
3328
3329        /* Add some no-op filters for grins. */
3330        EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3331        EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3332        EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3333        EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3334
3335        /* Check that the basic notification machinery works */
3336        listener = user_notif_syscall(__NR_getppid,
3337                                      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3338        ASSERT_GE(listener, 0);
3339
3340        /* Installing a second listener in the chain should EBUSY */
3341        EXPECT_EQ(user_notif_syscall(__NR_getppid,
3342                                     SECCOMP_FILTER_FLAG_NEW_LISTENER),
3343                  -1);
3344        EXPECT_EQ(errno, EBUSY);
3345
3346        pid = fork();
3347        ASSERT_GE(pid, 0);
3348
3349        if (pid == 0) {
3350                ret = syscall(__NR_getppid);
3351                exit(ret != USER_NOTIF_MAGIC);
3352        }
3353
3354        pollfd.fd = listener;
3355        pollfd.events = POLLIN | POLLOUT;
3356
3357        EXPECT_GT(poll(&pollfd, 1, -1), 0);
3358        EXPECT_EQ(pollfd.revents, POLLIN);
3359
3360        /* Test that we can't pass garbage to the kernel. */
3361        memset(&req, 0, sizeof(req));
3362        req.pid = -1;
3363        errno = 0;
3364        ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
3365        EXPECT_EQ(-1, ret);
3366        EXPECT_EQ(EINVAL, errno);
3367
3368        if (ret) {
3369                req.pid = 0;
3370                EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3371        }
3372
3373        pollfd.fd = listener;
3374        pollfd.events = POLLIN | POLLOUT;
3375
3376        EXPECT_GT(poll(&pollfd, 1, -1), 0);
3377        EXPECT_EQ(pollfd.revents, POLLOUT);
3378
3379        EXPECT_EQ(req.data.nr,  __NR_getppid);
3380
3381        resp.id = req.id;
3382        resp.error = 0;
3383        resp.val = USER_NOTIF_MAGIC;
3384
3385        /* check that we make sure flags == 0 */
3386        resp.flags = 1;
3387        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3388        EXPECT_EQ(errno, EINVAL);
3389
3390        resp.flags = 0;
3391        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3392
3393        EXPECT_EQ(waitpid(pid, &status, 0), pid);
3394        EXPECT_EQ(true, WIFEXITED(status));
3395        EXPECT_EQ(0, WEXITSTATUS(status));
3396}
3397
3398TEST(user_notification_with_tsync)
3399{
3400        int ret;
3401        unsigned int flags;
3402
3403        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3404        ASSERT_EQ(0, ret) {
3405                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3406        }
3407
3408        /* these were exclusive */
3409        flags = SECCOMP_FILTER_FLAG_NEW_LISTENER |
3410                SECCOMP_FILTER_FLAG_TSYNC;
3411        ASSERT_EQ(-1, user_notif_syscall(__NR_getppid, flags));
3412        ASSERT_EQ(EINVAL, errno);
3413
3414        /* but now they're not */
3415        flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
3416        ret = user_notif_syscall(__NR_getppid, flags);
3417        close(ret);
3418        ASSERT_LE(0, ret);
3419}
3420
3421TEST(user_notification_kill_in_middle)
3422{
3423        pid_t pid;
3424        long ret;
3425        int listener;
3426        struct seccomp_notif req = {};
3427        struct seccomp_notif_resp resp = {};
3428
3429        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3430        ASSERT_EQ(0, ret) {
3431                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3432        }
3433
3434        listener = user_notif_syscall(__NR_getppid,
3435                                      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3436        ASSERT_GE(listener, 0);
3437
3438        /*
3439         * Check that nothing bad happens when we kill the task in the middle
3440         * of a syscall.
3441         */
3442        pid = fork();
3443        ASSERT_GE(pid, 0);
3444
3445        if (pid == 0) {
3446                ret = syscall(__NR_getppid);
3447                exit(ret != USER_NOTIF_MAGIC);
3448        }
3449
3450        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3451        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0);
3452
3453        EXPECT_EQ(kill(pid, SIGKILL), 0);
3454        EXPECT_EQ(waitpid(pid, NULL, 0), pid);
3455
3456        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1);
3457
3458        resp.id = req.id;
3459        ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
3460        EXPECT_EQ(ret, -1);
3461        EXPECT_EQ(errno, ENOENT);
3462}
3463
/* File descriptor the signal handler reports on; -1 until a test sets it. */
static int handled = -1;

/*
 * Signal handler that announces its execution by writing a single "c" byte
 * to the descriptor stored in @handled.
 *
 * errno is saved on entry and restored on exit so that the code interrupted
 * by the signal never observes an errno value produced inside the handler.
 *
 * NOTE(review): perror() is not on the async-signal-safe list; it is kept
 * because it only runs on the failure path of this test helper.
 */
static void signal_handler(int signal)
{
        int saved_errno = errno;

        if (write(handled, "c", 1) != 1)
                perror("write from signal");

        errno = saved_errno;
}
3471
3472TEST(user_notification_signal)
3473{
3474        pid_t pid;
3475        long ret;
3476        int status, listener, sk_pair[2];
3477        struct seccomp_notif req = {};
3478        struct seccomp_notif_resp resp = {};
3479        char c;
3480
3481        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3482        ASSERT_EQ(0, ret) {
3483                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3484        }
3485
3486        ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
3487
3488        listener = user_notif_syscall(__NR_gettid,
3489                                      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3490        ASSERT_GE(listener, 0);
3491
3492        pid = fork();
3493        ASSERT_GE(pid, 0);
3494
3495        if (pid == 0) {
3496                close(sk_pair[0]);
3497                handled = sk_pair[1];
3498                if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
3499                        perror("signal");
3500                        exit(1);
3501                }
3502                /*
3503                 * ERESTARTSYS behavior is a bit hard to test, because we need
3504                 * to rely on a signal that has not yet been handled. Let's at
3505                 * least check that the error code gets propagated through, and
3506                 * hope that it doesn't break when there is actually a signal :)
3507                 */
3508                ret = syscall(__NR_gettid);
3509                exit(!(ret == -1 && errno == 512));
3510        }
3511
3512        close(sk_pair[1]);
3513
3514        memset(&req, 0, sizeof(req));
3515        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3516
3517        EXPECT_EQ(kill(pid, SIGUSR1), 0);
3518
3519        /*
3520         * Make sure the signal really is delivered, which means we're not
3521         * stuck in the user notification code any more and the notification
3522         * should be dead.
3523         */
3524        EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
3525
3526        resp.id = req.id;
3527        resp.error = -EPERM;
3528        resp.val = 0;
3529
3530        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3531        EXPECT_EQ(errno, ENOENT);
3532
3533        memset(&req, 0, sizeof(req));
3534        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3535
3536        resp.id = req.id;
3537        resp.error = -512; /* -ERESTARTSYS */
3538        resp.val = 0;
3539
3540        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3541
3542        EXPECT_EQ(waitpid(pid, &status, 0), pid);
3543        EXPECT_EQ(true, WIFEXITED(status));
3544        EXPECT_EQ(0, WEXITSTATUS(status));
3545}
3546
3547TEST(user_notification_closed_listener)
3548{
3549        pid_t pid;
3550        long ret;
3551        int status, listener;
3552
3553        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3554        ASSERT_EQ(0, ret) {
3555                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3556        }
3557
3558        listener = user_notif_syscall(__NR_getppid,
3559                                      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3560        ASSERT_GE(listener, 0);
3561
3562        /*
3563         * Check that we get an ENOSYS when the listener is closed.
3564         */
3565        pid = fork();
3566        ASSERT_GE(pid, 0);
3567        if (pid == 0) {
3568                close(listener);
3569                ret = syscall(__NR_getppid);
3570                exit(ret != -1 && errno != ENOSYS);
3571        }
3572
3573        close(listener);
3574
3575        EXPECT_EQ(waitpid(pid, &status, 0), pid);
3576        EXPECT_EQ(true, WIFEXITED(status));
3577        EXPECT_EQ(0, WEXITSTATUS(status));
3578}
3579
3580/*
3581 * Check that a pid in a child namespace still shows up as valid in ours.
3582 */
3583TEST(user_notification_child_pid_ns)
3584{
3585        pid_t pid;
3586        int status, listener;
3587        struct seccomp_notif req = {};
3588        struct seccomp_notif_resp resp = {};
3589
3590        ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0) {
3591                if (errno == EINVAL)
3592                        SKIP(return, "kernel missing CLONE_NEWUSER support");
3593        };
3594
3595        listener = user_notif_syscall(__NR_getppid,
3596                                      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3597        ASSERT_GE(listener, 0);
3598
3599        pid = fork();
3600        ASSERT_GE(pid, 0);
3601
3602        if (pid == 0)
3603                exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3604
3605        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3606        EXPECT_EQ(req.pid, pid);
3607
3608        resp.id = req.id;
3609        resp.error = 0;
3610        resp.val = USER_NOTIF_MAGIC;
3611
3612        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3613
3614        EXPECT_EQ(waitpid(pid, &status, 0), pid);
3615        EXPECT_EQ(true, WIFEXITED(status));
3616        EXPECT_EQ(0, WEXITSTATUS(status));
3617        close(listener);
3618}
3619
3620/*
3621 * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
3622 * invalid.
3623 */
3624TEST(user_notification_sibling_pid_ns)
3625{
3626        pid_t pid, pid2;
3627        int status, listener;
3628        struct seccomp_notif req = {};
3629        struct seccomp_notif_resp resp = {};
3630
3631        ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0) {
3632                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3633        }
3634
3635        listener = user_notif_syscall(__NR_getppid,
3636                                      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3637        ASSERT_GE(listener, 0);
3638
3639        pid = fork();
3640        ASSERT_GE(pid, 0);
3641
3642        if (pid == 0) {
3643                ASSERT_EQ(unshare(CLONE_NEWPID), 0);
3644
3645                pid2 = fork();
3646                ASSERT_GE(pid2, 0);
3647
3648                if (pid2 == 0)
3649                        exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3650
3651                EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3652                EXPECT_EQ(true, WIFEXITED(status));
3653                EXPECT_EQ(0, WEXITSTATUS(status));
3654                exit(WEXITSTATUS(status));
3655        }
3656
3657        /* Create the sibling ns, and sibling in it. */
3658        ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
3659                if (errno == EPERM)
3660                        SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
3661        }
3662        ASSERT_EQ(errno, 0);
3663
3664        pid2 = fork();
3665        ASSERT_GE(pid2, 0);
3666
3667        if (pid2 == 0) {
3668                ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3669                /*
3670                 * The pid should be 0, i.e. the task is in some namespace that
3671                 * we can't "see".
3672                 */
3673                EXPECT_EQ(req.pid, 0);
3674
3675                resp.id = req.id;
3676                resp.error = 0;
3677                resp.val = USER_NOTIF_MAGIC;
3678
3679                ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3680                exit(0);
3681        }
3682
3683        close(listener);
3684
3685        EXPECT_EQ(waitpid(pid, &status, 0), pid);
3686        EXPECT_EQ(true, WIFEXITED(status));
3687        EXPECT_EQ(0, WEXITSTATUS(status));
3688
3689        EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3690        EXPECT_EQ(true, WIFEXITED(status));
3691        EXPECT_EQ(0, WEXITSTATUS(status));
3692}
3693
3694TEST(user_notification_fault_recv)
3695{
3696        pid_t pid;
3697        int status, listener;
3698        struct seccomp_notif req = {};
3699        struct seccomp_notif_resp resp = {};
3700
3701        ASSERT_EQ(unshare(CLONE_NEWUSER), 0);
3702
3703        listener = user_notif_syscall(__NR_getppid,
3704                                      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3705        ASSERT_GE(listener, 0);
3706
3707        pid = fork();
3708        ASSERT_GE(pid, 0);
3709
3710        if (pid == 0)
3711                exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3712
3713        /* Do a bad recv() */
3714        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1);
3715        EXPECT_EQ(errno, EFAULT);
3716
3717        /* We should still be able to receive this notification, though. */
3718        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3719        EXPECT_EQ(req.pid, pid);
3720
3721        resp.id = req.id;
3722        resp.error = 0;
3723        resp.val = USER_NOTIF_MAGIC;
3724
3725        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3726
3727        EXPECT_EQ(waitpid(pid, &status, 0), pid);
3728        EXPECT_EQ(true, WIFEXITED(status));
3729        EXPECT_EQ(0, WEXITSTATUS(status));
3730}
3731
3732TEST(seccomp_get_notif_sizes)
3733{
3734        struct seccomp_notif_sizes sizes;
3735
3736        ASSERT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
3737        EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
3738        EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
3739}
3740
3741TEST(user_notification_continue)
3742{
3743        pid_t pid;
3744        long ret;
3745        int status, listener;
3746        struct seccomp_notif req = {};
3747        struct seccomp_notif_resp resp = {};
3748        struct pollfd pollfd;
3749
3750        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3751        ASSERT_EQ(0, ret) {
3752                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3753        }
3754
3755        listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3756        ASSERT_GE(listener, 0);
3757
3758        pid = fork();
3759        ASSERT_GE(pid, 0);
3760
3761        if (pid == 0) {
3762                int dup_fd, pipe_fds[2];
3763                pid_t self;
3764
3765                ASSERT_GE(pipe(pipe_fds), 0);
3766
3767                dup_fd = dup(pipe_fds[0]);
3768                ASSERT_GE(dup_fd, 0);
3769                EXPECT_NE(pipe_fds[0], dup_fd);
3770
3771                self = getpid();
3772                ASSERT_EQ(filecmp(self, self, pipe_fds[0], dup_fd), 0);
3773                exit(0);
3774        }
3775
3776        pollfd.fd = listener;
3777        pollfd.events = POLLIN | POLLOUT;
3778
3779        EXPECT_GT(poll(&pollfd, 1, -1), 0);
3780        EXPECT_EQ(pollfd.revents, POLLIN);
3781
3782        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3783
3784        pollfd.fd = listener;
3785        pollfd.events = POLLIN | POLLOUT;
3786
3787        EXPECT_GT(poll(&pollfd, 1, -1), 0);
3788        EXPECT_EQ(pollfd.revents, POLLOUT);
3789
3790        EXPECT_EQ(req.data.nr, __NR_dup);
3791
3792        resp.id = req.id;
3793        resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
3794
3795        /*
3796         * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE enforces other
3797         * args be set to 0.
3798         */
3799        resp.error = 0;
3800        resp.val = USER_NOTIF_MAGIC;
3801        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3802        EXPECT_EQ(errno, EINVAL);
3803
3804        resp.error = USER_NOTIF_MAGIC;
3805        resp.val = 0;
3806        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3807        EXPECT_EQ(errno, EINVAL);
3808
3809        resp.error = 0;
3810        resp.val = 0;
3811        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
3812                if (errno == EINVAL)
3813                        SKIP(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
3814        }
3815
3816skip:
3817        EXPECT_EQ(waitpid(pid, &status, 0), pid);
3818        EXPECT_EQ(true, WIFEXITED(status));
3819        EXPECT_EQ(0, WEXITSTATUS(status)) {
3820                if (WEXITSTATUS(status) == 2) {
3821                        SKIP(return, "Kernel does not support kcmp() syscall");
3822                        return;
3823                }
3824        }
3825}
3826
3827TEST(user_notification_filter_empty)
3828{
3829        pid_t pid;
3830        long ret;
3831        int status;
3832        struct pollfd pollfd;
3833        struct __clone_args args = {
3834                .flags = CLONE_FILES,
3835                .exit_signal = SIGCHLD,
3836        };
3837
3838        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3839        ASSERT_EQ(0, ret) {
3840                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3841        }
3842
3843        pid = sys_clone3(&args, sizeof(args));
3844        ASSERT_GE(pid, 0);
3845
3846        if (pid == 0) {
3847                int listener;
3848
3849                listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3850                if (listener < 0)
3851                        _exit(EXIT_FAILURE);
3852
3853                if (dup2(listener, 200) != 200)
3854                        _exit(EXIT_FAILURE);
3855
3856                close(listener);
3857
3858                _exit(EXIT_SUCCESS);
3859        }
3860
3861        EXPECT_EQ(waitpid(pid, &status, 0), pid);
3862        EXPECT_EQ(true, WIFEXITED(status));
3863        EXPECT_EQ(0, WEXITSTATUS(status));
3864
3865        /*
3866         * The seccomp filter has become unused so we should be notified once
3867         * the kernel gets around to cleaning up task struct.
3868         */
3869        pollfd.fd = 200;
3870        pollfd.events = POLLHUP;
3871
3872        EXPECT_GT(poll(&pollfd, 1, 2000), 0);
3873        EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
3874}
3875
/* Thread body that does no work: ignores @data and returns NULL. */
static void *do_thread(void *data)
{
        void *result = NULL;

        (void)data;
        return result;
}
3880
3881TEST(user_notification_filter_empty_threaded)
3882{
3883        pid_t pid;
3884        long ret;
3885        int status;
3886        struct pollfd pollfd;
3887        struct __clone_args args = {
3888                .flags = CLONE_FILES,
3889                .exit_signal = SIGCHLD,
3890        };
3891
3892        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3893        ASSERT_EQ(0, ret) {
3894                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3895        }
3896
3897        pid = sys_clone3(&args, sizeof(args));
3898        ASSERT_GE(pid, 0);
3899
3900        if (pid == 0) {
3901                pid_t pid1, pid2;
3902                int listener, status;
3903                pthread_t thread;
3904
3905                listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3906                if (listener < 0)
3907                        _exit(EXIT_FAILURE);
3908
3909                if (dup2(listener, 200) != 200)
3910                        _exit(EXIT_FAILURE);
3911
3912                close(listener);
3913
3914                pid1 = fork();
3915                if (pid1 < 0)
3916                        _exit(EXIT_FAILURE);
3917
3918                if (pid1 == 0)
3919                        _exit(EXIT_SUCCESS);
3920
3921                pid2 = fork();
3922                if (pid2 < 0)
3923                        _exit(EXIT_FAILURE);
3924
3925                if (pid2 == 0)
3926                        _exit(EXIT_SUCCESS);
3927
3928                if (pthread_create(&thread, NULL, do_thread, NULL) ||
3929                    pthread_join(thread, NULL))
3930                        _exit(EXIT_FAILURE);
3931
3932                if (pthread_create(&thread, NULL, do_thread, NULL) ||
3933                    pthread_join(thread, NULL))
3934                        _exit(EXIT_FAILURE);
3935
3936                if (waitpid(pid1, &status, 0) != pid1 || !WIFEXITED(status) ||
3937                    WEXITSTATUS(status))
3938                        _exit(EXIT_FAILURE);
3939
3940                if (waitpid(pid2, &status, 0) != pid2 || !WIFEXITED(status) ||
3941                    WEXITSTATUS(status))
3942                        _exit(EXIT_FAILURE);
3943
3944                exit(EXIT_SUCCESS);
3945        }
3946
3947        EXPECT_EQ(waitpid(pid, &status, 0), pid);
3948        EXPECT_EQ(true, WIFEXITED(status));
3949        EXPECT_EQ(0, WEXITSTATUS(status));
3950
3951        /*
3952         * The seccomp filter has become unused so we should be notified once
3953         * the kernel gets around to cleaning up task struct.
3954         */
3955        pollfd.fd = 200;
3956        pollfd.events = POLLHUP;
3957
3958        EXPECT_GT(poll(&pollfd, 1, 2000), 0);
3959        EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
3960}
3961
3962TEST(user_notification_addfd)
3963{
3964        pid_t pid;
3965        long ret;
3966        int status, listener, memfd, fd, nextfd;
3967        struct seccomp_notif_addfd addfd = {};
3968        struct seccomp_notif_addfd_small small = {};
3969        struct seccomp_notif_addfd_big big = {};
3970        struct seccomp_notif req = {};
3971        struct seccomp_notif_resp resp = {};
3972        /* 100 ms */
3973        struct timespec delay = { .tv_nsec = 100000000 };
3974
3975        /* There may be arbitrary already-open fds at test start. */
3976        memfd = memfd_create("test", 0);
3977        ASSERT_GE(memfd, 0);
3978        nextfd = memfd + 1;
3979
3980        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3981        ASSERT_EQ(0, ret) {
3982                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3983        }
3984
3985        /* fd: 4 */
3986        /* Check that the basic notification machinery works */
3987        listener = user_notif_syscall(__NR_getppid,
3988                                      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3989        ASSERT_EQ(listener, nextfd++);
3990
3991        pid = fork();
3992        ASSERT_GE(pid, 0);
3993
3994        if (pid == 0) {
3995                /* fds will be added and this value is expected */
3996                if (syscall(__NR_getppid) != USER_NOTIF_MAGIC)
3997                        exit(1);
3998
3999                /* Atomic addfd+send is received here. Check it is a valid fd */
4000                if (fcntl(syscall(__NR_getppid), F_GETFD) == -1)
4001                        exit(1);
4002
4003                exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
4004        }
4005
4006        ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4007
4008        addfd.srcfd = memfd;
4009        addfd.newfd = 0;
4010        addfd.id = req.id;
4011        addfd.flags = 0x0;
4012
4013        /* Verify bad newfd_flags cannot be set */
4014        addfd.newfd_flags = ~O_CLOEXEC;
4015        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4016        EXPECT_EQ(errno, EINVAL);
4017        addfd.newfd_flags = O_CLOEXEC;
4018
4019        /* Verify bad flags cannot be set */
4020        addfd.flags = 0xff;
4021        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4022        EXPECT_EQ(errno, EINVAL);
4023        addfd.flags = 0;
4024
4025        /* Verify that remote_fd cannot be set without setting flags */
4026        addfd.newfd = 1;
4027        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4028        EXPECT_EQ(errno, EINVAL);
4029        addfd.newfd = 0;
4030
4031        /* Verify small size cannot be set */
4032        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_SMALL, &small), -1);
4033        EXPECT_EQ(errno, EINVAL);
4034
4035        /* Verify we can't send bits filled in unknown buffer area */
4036        memset(&big, 0xAA, sizeof(big));
4037        big.addfd = addfd;
4038        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big), -1);
4039        EXPECT_EQ(errno, E2BIG);
4040
4041
4042        /* Verify we can set an arbitrary remote fd */
4043        fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4044        EXPECT_EQ(fd, nextfd++);
4045        EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4046
4047        /* Verify we can set an arbitrary remote fd with large size */
4048        memset(&big, 0x0, sizeof(big));
4049        big.addfd = addfd;
4050        fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big);
4051        EXPECT_EQ(fd, nextfd++);
4052
4053        /* Verify we can set a specific remote fd */
4054        addfd.newfd = 42;
4055        addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
4056        fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4057        EXPECT_EQ(fd, 42);
4058        EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4059
4060        /* Resume syscall */
4061        resp.id = req.id;
4062        resp.error = 0;
4063        resp.val = USER_NOTIF_MAGIC;
4064        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4065
4066        /*
4067         * This sets the ID of the ADD FD to the last request plus 1. The
4068         * notification ID increments 1 per notification.
4069         */
4070        addfd.id = req.id + 1;
4071
4072        /* This spins until the underlying notification is generated */
4073        while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
4074               errno != -EINPROGRESS)
4075                nanosleep(&delay, NULL);
4076
4077        memset(&req, 0, sizeof(req));
4078        ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4079        ASSERT_EQ(addfd.id, req.id);
4080
4081        /* Verify we can do an atomic addfd and send */
4082        addfd.newfd = 0;
4083        addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
4084        fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4085        /*
4086         * Child has earlier "low" fds and now 42, so we expect the next
4087         * lowest available fd to be assigned here.
4088         */
4089        EXPECT_EQ(fd, nextfd++);
4090        EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4091
4092        /*
4093         * This sets the ID of the ADD FD to the last request plus 1. The
4094         * notification ID increments 1 per notification.
4095         */
4096        addfd.id = req.id + 1;
4097
4098        /* This spins until the underlying notification is generated */
4099        while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
4100               errno != -EINPROGRESS)
4101                nanosleep(&delay, NULL);
4102
4103        memset(&req, 0, sizeof(req));
4104        ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4105        ASSERT_EQ(addfd.id, req.id);
4106
4107        resp.id = req.id;
4108        resp.error = 0;
4109        resp.val = USER_NOTIF_MAGIC;
4110        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4111
4112        /* Wait for child to finish. */
4113        EXPECT_EQ(waitpid(pid, &status, 0), pid);
4114        EXPECT_EQ(true, WIFEXITED(status));
4115        EXPECT_EQ(0, WEXITSTATUS(status));
4116
4117        close(memfd);
4118}
4119
4120TEST(user_notification_addfd_rlimit)
4121{
4122        pid_t pid;
4123        long ret;
4124        int status, listener, memfd;
4125        struct seccomp_notif_addfd addfd = {};
4126        struct seccomp_notif req = {};
4127        struct seccomp_notif_resp resp = {};
4128        const struct rlimit lim = {
4129                .rlim_cur       = 0,
4130                .rlim_max       = 0,
4131        };
4132
4133        memfd = memfd_create("test", 0);
4134        ASSERT_GE(memfd, 0);
4135
4136        ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4137        ASSERT_EQ(0, ret) {
4138                TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4139        }
4140
4141        /* Check that the basic notification machinery works */
4142        listener = user_notif_syscall(__NR_getppid,
4143                                      SECCOMP_FILTER_FLAG_NEW_LISTENER);
4144        ASSERT_GE(listener, 0);
4145
4146        pid = fork();
4147        ASSERT_GE(pid, 0);
4148
4149        if (pid == 0)
4150                exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
4151
4152
4153        ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4154
4155        ASSERT_EQ(prlimit(pid, RLIMIT_NOFILE, &lim, NULL), 0);
4156
4157        addfd.srcfd = memfd;
4158        addfd.newfd_flags = O_CLOEXEC;
4159        addfd.newfd = 0;
4160        addfd.id = req.id;
4161        addfd.flags = 0;
4162
4163        /* Should probably spot check /proc/sys/fs/file-nr */
4164        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4165        EXPECT_EQ(errno, EMFILE);
4166
4167        addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
4168        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4169        EXPECT_EQ(errno, EMFILE);
4170
4171        addfd.newfd = 100;
4172        addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
4173        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4174        EXPECT_EQ(errno, EBADF);
4175
4176        resp.id = req.id;
4177        resp.error = 0;
4178        resp.val = USER_NOTIF_MAGIC;
4179
4180        EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4181
4182        /* Wait for child to finish. */
4183        EXPECT_EQ(waitpid(pid, &status, 0), pid);
4184        EXPECT_EQ(true, WIFEXITED(status));
4185        EXPECT_EQ(0, WEXITSTATUS(status));
4186
4187        close(memfd);
4188}
4189
4190/*
4191 * TODO:
4192 * - expand NNP testing
4193 * - better arch-specific TRACE and TRAP handlers.
4194 * - endianness checking when appropriate
4195 * - 64-bit arg prodding
4196 * - arch value testing (x86 modes especially)
4197 * - verify that FILTER_FLAG_LOG filters generate log messages
4198 * - verify that RET_LOG generates log messages
4199 */
4200
4201TEST_HARNESS_MAIN
4202