linux/samples/seccomp/user-trap.c
<<
>>
Prefs
   1#include <signal.h>
   2#include <stdio.h>
   3#include <stdlib.h>
   4#include <unistd.h>
   5#include <errno.h>
   6#include <fcntl.h>
   7#include <string.h>
   8#include <stddef.h>
   9#include <sys/sysmacros.h>
  10#include <sys/types.h>
  11#include <sys/wait.h>
  12#include <sys/socket.h>
  13#include <sys/stat.h>
  14#include <sys/mman.h>
  15#include <sys/syscall.h>
  16#include <sys/user.h>
  17#include <sys/ioctl.h>
  18#include <sys/ptrace.h>
  19#include <sys/mount.h>
  20#include <linux/limits.h>
  21#include <linux/filter.h>
  22#include <linux/seccomp.h>
  23
  24#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
  25
  26static int seccomp(unsigned int op, unsigned int flags, void *args)
  27{
  28        errno = 0;
  29        return syscall(__NR_seccomp, op, flags, args);
  30}
  31
  32static int send_fd(int sock, int fd)
  33{
  34        struct msghdr msg = {};
  35        struct cmsghdr *cmsg;
  36        int *fd_ptr;
  37        char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
  38        struct iovec io = {
  39                .iov_base = &c,
  40                .iov_len = 1,
  41        };
  42
  43        msg.msg_iov = &io;
  44        msg.msg_iovlen = 1;
  45        msg.msg_control = buf;
  46        msg.msg_controllen = sizeof(buf);
  47        cmsg = CMSG_FIRSTHDR(&msg);
  48        cmsg->cmsg_level = SOL_SOCKET;
  49        cmsg->cmsg_type = SCM_RIGHTS;
  50        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
  51        fd_ptr = (int *)CMSG_DATA(cmsg);
  52        *fd_ptr = fd;
  53        msg.msg_controllen = cmsg->cmsg_len;
  54
  55        if (sendmsg(sock, &msg, 0) < 0) {
  56                perror("sendmsg");
  57                return -1;
  58        }
  59
  60        return 0;
  61}
  62
  63static int recv_fd(int sock)
  64{
  65        struct msghdr msg = {};
  66        struct cmsghdr *cmsg;
  67        int *fd_ptr;
  68        char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
  69        struct iovec io = {
  70                .iov_base = &c,
  71                .iov_len = 1,
  72        };
  73
  74        msg.msg_iov = &io;
  75        msg.msg_iovlen = 1;
  76        msg.msg_control = buf;
  77        msg.msg_controllen = sizeof(buf);
  78
  79        if (recvmsg(sock, &msg, 0) < 0) {
  80                perror("recvmsg");
  81                return -1;
  82        }
  83
  84        cmsg = CMSG_FIRSTHDR(&msg);
  85        fd_ptr = (int *)CMSG_DATA(cmsg);
  86
  87        return *fd_ptr;
  88}
  89
  90static int user_trap_syscall(int nr, unsigned int flags)
  91{
  92        struct sock_filter filter[] = {
  93                BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
  94                        offsetof(struct seccomp_data, nr)),
  95                BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
  96                BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
  97                BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
  98        };
  99
 100        struct sock_fprog prog = {
 101                .len = (unsigned short)ARRAY_SIZE(filter),
 102                .filter = filter,
 103        };
 104
 105        return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
 106}
 107
 108static int handle_req(struct seccomp_notif *req,
 109                      struct seccomp_notif_resp *resp, int listener)
 110{
 111        char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
 112        int ret = -1, mem;
 113
 114        resp->id = req->id;
 115        resp->error = -EPERM;
 116        resp->val = 0;
 117
 118        if (req->data.nr != __NR_mount) {
 119                fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
 120                return -1;
 121        }
 122
 123        /* Only allow bind mounts. */
 124        if (!(req->data.args[3] & MS_BIND))
 125                return 0;
 126
 127        /*
 128         * Ok, let's read the task's memory to see where they wanted their
 129         * mount to go.
 130         */
 131        snprintf(path, sizeof(path), "/proc/%d/mem", req->pid);
 132        mem = open(path, O_RDONLY);
 133        if (mem < 0) {
 134                perror("open mem");
 135                return -1;
 136        }
 137
 138        /*
 139         * Now we avoid a TOCTOU: we referred to a pid by its pid, but since
 140         * the pid that made the syscall may have died, we need to confirm that
 141         * the pid is still valid after we open its /proc/pid/mem file. We can
 142         * ask the listener fd this as follows.
 143         *
 144         * Note that this check should occur *after* any task-specific
 145         * resources are opened, to make sure that the task has not died and
 146         * we're not wrongly reading someone else's state in order to make
 147         * decisions.
 148         */
 149        if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
 150                fprintf(stderr, "task died before we could map its memory\n");
 151                goto out;
 152        }
 153
 154        /*
 155         * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
 156         * that to avoid another TOCTOU, we should read all of the pointer args
 157         * before we decide to allow the syscall.
 158         */
 159        if (lseek(mem, req->data.args[0], SEEK_SET) < 0) {
 160                perror("seek");
 161                goto out;
 162        }
 163
 164        ret = read(mem, source, sizeof(source));
 165        if (ret < 0) {
 166                perror("read");
 167                goto out;
 168        }
 169
 170        if (lseek(mem, req->data.args[1], SEEK_SET) < 0) {
 171                perror("seek");
 172                goto out;
 173        }
 174
 175        ret = read(mem, target, sizeof(target));
 176        if (ret < 0) {
 177                perror("read");
 178                goto out;
 179        }
 180
 181        /*
 182         * Our policy is to only allow bind mounts inside /tmp. This isn't very
 183         * interesting, because we could do unprivlieged bind mounts with user
 184         * namespaces already, but you get the idea.
 185         */
 186        if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) {
 187                if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
 188                        ret = -1;
 189                        perror("actual mount");
 190                        goto out;
 191                }
 192                resp->error = 0;
 193        }
 194
 195        /* Even if we didn't allow it because of policy, generating the
 196         * response was be a success, because we want to tell the worker EPERM.
 197         */
 198        ret = 0;
 199
 200out:
 201        close(mem);
 202        return ret;
 203}
 204
 205int main(void)
 206{
 207        int sk_pair[2], ret = 1, status, listener;
 208        pid_t worker = 0 , tracer = 0;
 209
 210        if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
 211                perror("socketpair");
 212                return 1;
 213        }
 214
 215        worker = fork();
 216        if (worker < 0) {
 217                perror("fork");
 218                goto close_pair;
 219        }
 220
 221        if (worker == 0) {
 222                listener = user_trap_syscall(__NR_mount,
 223                                             SECCOMP_FILTER_FLAG_NEW_LISTENER);
 224                if (listener < 0) {
 225                        perror("seccomp");
 226                        exit(1);
 227                }
 228
 229                /*
 230                 * Drop privileges. We definitely can't mount as uid 1000.
 231                 */
 232                if (setuid(1000) < 0) {
 233                        perror("setuid");
 234                        exit(1);
 235                }
 236
 237                /*
 238                 * Send the listener to the parent; also serves as
 239                 * synchronization.
 240                 */
 241                if (send_fd(sk_pair[1], listener) < 0)
 242                        exit(1);
 243                close(listener);
 244
 245                if (mkdir("/tmp/foo", 0755) < 0) {
 246                        perror("mkdir");
 247                        exit(1);
 248                }
 249
 250                /*
 251                 * Try a bad mount just for grins.
 252                 */
 253                if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
 254                        fprintf(stderr, "huh? mounted /dev/sda?\n");
 255                        exit(1);
 256                }
 257
 258                if (errno != EPERM) {
 259                        perror("bad error from mount");
 260                        exit(1);
 261                }
 262
 263                /*
 264                 * Ok, we expect this one to succeed.
 265                 */
 266                if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
 267                        perror("mount");
 268                        exit(1);
 269                }
 270
 271                exit(0);
 272        }
 273
 274        /*
 275         * Get the listener from the child.
 276         */
 277        listener = recv_fd(sk_pair[0]);
 278        if (listener < 0)
 279                goto out_kill;
 280
 281        /*
 282         * Fork a task to handle the requests. This isn't strictly necessary,
 283         * but it makes the particular writing of this sample easier, since we
 284         * can just wait ofr the tracee to exit and kill the tracer.
 285         */
 286        tracer = fork();
 287        if (tracer < 0) {
 288                perror("fork");
 289                goto out_kill;
 290        }
 291
 292        if (tracer == 0) {
 293                struct seccomp_notif *req;
 294                struct seccomp_notif_resp *resp;
 295                struct seccomp_notif_sizes sizes;
 296
 297                if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
 298                        perror("seccomp(GET_NOTIF_SIZES)");
 299                        goto out_close;
 300                }
 301
 302                req = malloc(sizes.seccomp_notif);
 303                if (!req)
 304                        goto out_close;
 305
 306                resp = malloc(sizes.seccomp_notif_resp);
 307                if (!resp)
 308                        goto out_req;
 309                memset(resp, 0, sizes.seccomp_notif_resp);
 310
 311                while (1) {
 312                        memset(req, 0, sizes.seccomp_notif);
 313                        if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
 314                                perror("ioctl recv");
 315                                goto out_resp;
 316                        }
 317
 318                        if (handle_req(req, resp, listener) < 0)
 319                                goto out_resp;
 320
 321                        /*
 322                         * ENOENT here means that the task may have gotten a
 323                         * signal and restarted the syscall. It's up to the
 324                         * handler to decide what to do in this case, but for
 325                         * the sample code, we just ignore it. Probably
 326                         * something better should happen, like undoing the
 327                         * mount, or keeping track of the args to make sure we
 328                         * don't do it again.
 329                         */
 330                        if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
 331                            errno != ENOENT) {
 332                                perror("ioctl send");
 333                                goto out_resp;
 334                        }
 335                }
 336out_resp:
 337                free(resp);
 338out_req:
 339                free(req);
 340out_close:
 341                close(listener);
 342                exit(1);
 343        }
 344
 345        close(listener);
 346
 347        if (waitpid(worker, &status, 0) != worker) {
 348                perror("waitpid");
 349                goto out_kill;
 350        }
 351
 352        if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
 353                perror("umount2");
 354                goto out_kill;
 355        }
 356
 357        if (remove("/tmp/foo") < 0 && errno != ENOENT) {
 358                perror("remove");
 359                exit(1);
 360        }
 361
 362        if (!WIFEXITED(status) || WEXITSTATUS(status)) {
 363                fprintf(stderr, "worker exited nonzero\n");
 364                goto out_kill;
 365        }
 366
 367        ret = 0;
 368
 369out_kill:
 370        if (tracer > 0)
 371                kill(tracer, SIGKILL);
 372        if (worker > 0)
 373                kill(worker, SIGKILL);
 374
 375close_pair:
 376        close(sk_pair[0]);
 377        close(sk_pair[1]);
 378        return ret;
 379}
 380