linux/kernel/rseq.c
// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/rseq.h>
#include <linux/types.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
                                       RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)

/*
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. Typically used for implementing
 * per-cpu operations.
 *
 * It allows user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 *   The address of the jump target abort_ip must be outside the
 *   critical region, i.e.:
 *
 *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
 *
 *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 *   userspace that can handle being interrupted between any of those
 *   instructions, and then resumed at abort_ip.
 *
 *   1.  Userspace stores the address of the struct rseq_cs assembly
 *       block descriptor into the rseq_cs field of the registered
 *       struct rseq TLS area. This update is performed through a single
 *       store within the inline assembly instruction sequence.
 *       [start_ip]
 *
 *   2.  Userspace tests whether the current cpu_id field matches the
 *       cpu number loaded before start_ip, branching to abort_ip in
 *       case of a mismatch.
 *
 *       If the sequence is preempted or interrupted by a signal
 *       at or after start_ip and before post_commit_ip, then the kernel
 *       clears TLS->rseq::rseq_cs, and sets the user-space return
 *       ip to abort_ip before returning to user-space, so the preempted
 *       execution resumes at abort_ip.
 *
 *   3.  The final instruction of the userspace critical section, just
 *       before post_commit_ip, is the commit. The critical section is
 *       self-terminating.
 *       [post_commit_ip]
 *
 *   4.  <success>
 *
 *   On failure at [2], or if interrupted by preemption or signal
 *   delivery between [1] and [3]:
 *
 *       [abort_ip]
 *   F1. <failure>
 */
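
/*
 * As an illustration of the algorithm above, here is a rough sketch of how
 * a user-space per-cpu counter increment could be built on top of rseq. It
 * is written as C-like pseudo-code; a real implementation must emit steps
 * [1]-[3] as a single architecture-specific inline assembly block so the
 * kernel can redirect the instruction pointer to abort_ip. The names
 * rseq_abi (the registered struct rseq in TLS), my_cs, counters[] and
 * RSEQ_SIG are illustrative assumptions, not definitions from this file.
 *
 *      static struct rseq_cs my_cs = {
 *              .version            = 0,
 *              .flags              = 0,
 *              .start_ip           = (__u64)&&start_ip,
 *              .post_commit_offset = (__u64)&&post_commit_ip - (__u64)&&start_ip,
 *              .abort_ip           = (__u64)&&abort_ip,
 *      };
 *
 *      int cpu = rseq_abi.cpu_id_start;          // load cpu before start_ip
 *      rseq_abi.rseq_cs.ptr64 = (__u64)&my_cs;   // [1] arm the critical section
 * start_ip:
 *      if (cpu != rseq_abi.cpu_id)               // [2] migrated or preempted?
 *              goto abort_ip;
 *      counters[cpu]++;                          // [3] single-store commit
 * post_commit_ip:
 *      return 0;
 *      // The abort handler must be preceded by the 32-bit signature passed
 *      // to sys_rseq(); rseq_get_rseq_cs() below checks it at abort_ip - 4.
 * abort_ip:
 *      return -1;                                // caller retries from the top
 */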

static int rseq_update_cpu_id(struct task_struct *t)
{
        u32 cpu_id = raw_smp_processor_id();
        struct rseq __user *rseq = t->rseq;

        if (!user_write_access_begin(rseq, sizeof(*rseq)))
                goto efault;
        unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
        unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
        user_write_access_end();
        trace_rseq_update(t);
        return 0;

efault_end:
        user_write_access_end();
efault:
        return -EFAULT;
}

static int rseq_reset_rseq_cpu_id(struct task_struct *t)
{
        u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;

        /*
         * Reset cpu_id_start to its initial state (0).
         */
        if (put_user(cpu_id_start, &t->rseq->cpu_id_start))
                return -EFAULT;
        /*
         * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming
         * in after unregistration can figure out that rseq needs to be
         * registered again.
         */
        if (put_user(cpu_id, &t->rseq->cpu_id))
                return -EFAULT;
        return 0;
}

static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
{
        struct rseq_cs __user *urseq_cs;
        u64 ptr;
        u32 __user *usig;
        u32 sig;
        int ret;

#ifdef CONFIG_64BIT
        if (get_user(ptr, &t->rseq->rseq_cs.ptr64))
                return -EFAULT;
#else
        if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
                return -EFAULT;
#endif
        if (!ptr) {
                memset(rseq_cs, 0, sizeof(*rseq_cs));
                return 0;
        }
        if (ptr >= TASK_SIZE)
                return -EINVAL;
        urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
        if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
                return -EFAULT;

        if (rseq_cs->start_ip >= TASK_SIZE ||
            rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
            rseq_cs->abort_ip >= TASK_SIZE ||
            rseq_cs->version > 0)
                return -EINVAL;
        /* Check for overflow. */
        if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
                return -EINVAL;
        /* Ensure that abort_ip is not in the critical section. */
        if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
                return -EINVAL;

        usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
        ret = get_user(sig, usig);
        if (ret)
                return ret;

        if (current->rseq_sig != sig) {
                printk_ratelimited(KERN_WARNING
                        "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
                        sig, current->rseq_sig, current->pid, usig);
                return -EINVAL;
        }
        return 0;
}

static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
{
        u32 flags, event_mask;
        int ret;

        /* Get thread flags. */
        ret = get_user(flags, &t->rseq->flags);
        if (ret)
                return ret;

        /* Take critical section flags into account. */
        flags |= cs_flags;

        /*
         * Restart on signal can only be inhibited when restart on
         * preempt and restart on migrate are inhibited too. Otherwise,
         * a preempted signal handler could fail to restart the prior
         * execution context on sigreturn.
         */
        if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) &&
                     (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) !=
                     RSEQ_CS_PREEMPT_MIGRATE_FLAGS))
                return -EINVAL;

        /*
         * Load and clear event mask atomically with respect to
         * scheduler preemption.
         */
        preempt_disable();
        event_mask = t->rseq_event_mask;
        t->rseq_event_mask = 0;
        preempt_enable();

        return !!(event_mask & ~flags);
}

static int clear_rseq_cs(struct task_struct *t)
{
        /*
         * The rseq_cs field is set to NULL on preemption or signal
         * delivery on top of the rseq assembly block, as well as on
         * top of code outside of the rseq assembly block. This
         * performs a lazy clear of the rseq_cs field.
         *
         * Set rseq_cs to NULL.
         */
#ifdef CONFIG_64BIT
        return put_user(0UL, &t->rseq->rseq_cs.ptr64);
#else
        if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
                return -EFAULT;
        return 0;
#endif
}

/*
 * Unsigned comparison will be true when ip >= start_ip, and when
 * ip < start_ip + post_commit_offset. If ip < start_ip, the unsigned
 * subtraction wraps around to a very large value, which exceeds any
 * valid post_commit_offset.
 */
static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
{
        return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
}

static int rseq_ip_fixup(struct pt_regs *regs)
{
        unsigned long ip = instruction_pointer(regs);
        struct task_struct *t = current;
        struct rseq_cs rseq_cs;
        int ret;

        ret = rseq_get_rseq_cs(t, &rseq_cs);
        if (ret)
                return ret;

        /*
         * Handle potentially not being within a critical section.
         * If not nested over a rseq critical section, restart is useless.
         * Clear the rseq_cs pointer and return.
         */
        if (!in_rseq_cs(ip, &rseq_cs))
                return clear_rseq_cs(t);
        ret = rseq_need_restart(t, rseq_cs.flags);
        if (ret <= 0)
                return ret;
        ret = clear_rseq_cs(t);
        if (ret)
                return ret;
        trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
                            rseq_cs.abort_ip);
        instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
        return 0;
}

/*
 * This resume handler must always be executed between any of:
 * - preemption,
 * - signal delivery,
 * and the return to user-space.
 *
 * This is how we can ensure that the entire rseq critical section
 * will issue the commit instruction only if executed atomically with
 * respect to other threads scheduled on the same CPU, and with respect
 * to signal handlers.
 */
void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
{
        struct task_struct *t = current;
        int ret, sig;

        if (unlikely(t->flags & PF_EXITING))
                return;

        /*
         * regs is NULL if and only if the caller is in a syscall path.  Skip
         * fixup and leave rseq_cs as is so that rseq_syscall() will detect and
         * kill a misbehaving userspace on debug kernels.
         */
        if (regs) {
                ret = rseq_ip_fixup(regs);
                if (unlikely(ret < 0))
                        goto error;
        }
        if (unlikely(rseq_update_cpu_id(t)))
                goto error;
        return;

error:
        sig = ksig ? ksig->sig : 0;
        force_sigsegv(sig);
}

#ifdef CONFIG_DEBUG_RSEQ

/*
 * Terminate the process if a syscall is issued within a restartable
 * sequence.
 */
void rseq_syscall(struct pt_regs *regs)
{
        unsigned long ip = instruction_pointer(regs);
        struct task_struct *t = current;
        struct rseq_cs rseq_cs;

        if (!t->rseq)
                return;
        if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
                force_sig(SIGSEGV);
}

#endif

/*
 * sys_rseq - setup restartable sequences for caller thread.
 */
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
                int, flags, u32, sig)
{
        int ret;

        if (flags & RSEQ_FLAG_UNREGISTER) {
                if (flags & ~RSEQ_FLAG_UNREGISTER)
                        return -EINVAL;
                /* Unregister rseq for current thread. */
                if (current->rseq != rseq || !current->rseq)
                        return -EINVAL;
                if (rseq_len != sizeof(*rseq))
                        return -EINVAL;
                if (current->rseq_sig != sig)
                        return -EPERM;
                ret = rseq_reset_rseq_cpu_id(current);
                if (ret)
                        return ret;
                current->rseq = NULL;
                current->rseq_sig = 0;
                return 0;
        }

        if (unlikely(flags))
                return -EINVAL;

        if (current->rseq) {
                /*
                 * If rseq is already registered, check whether
                 * the provided address differs from the prior
                 * one.
                 */
                if (current->rseq != rseq || rseq_len != sizeof(*rseq))
                        return -EINVAL;
                if (current->rseq_sig != sig)
                        return -EPERM;
                /* Already registered. */
                return -EBUSY;
        }

        /*
         * If there was no rseq previously registered,
         * ensure the provided rseq is properly aligned and valid.
         */
        if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
            rseq_len != sizeof(*rseq))
                return -EINVAL;
        if (!access_ok(rseq, rseq_len))
                return -EFAULT;
        current->rseq = rseq;
        current->rseq_sig = sig;
        /*
         * If rseq was previously inactive, and has just been
         * registered, ensure the cpu_id_start and cpu_id fields
         * are updated before returning to user-space.
         */
        rseq_set_notify_resume(current);

        return 0;
}
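
/*
 * For reference, a thread would typically register its rseq area with
 * sys_rseq() above via a raw syscall along the following lines. This is a
 * minimal user-space sketch, not kernel code: the __rseq_abi variable and
 * the RSEQ_SIG value are illustrative assumptions (glibc and librseq
 * provide their own), and error handling is reduced to a return code.
 *
 *      #include <linux/rseq.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *
 *      #define RSEQ_SIG 0x53053053      // must match the signature stored
 *                                       // before every abort handler
 *
 *      static __thread struct rseq __rseq_abi __attribute__((aligned(32)));
 *
 *      static int my_rseq_register(void)
 *      {
 *              // flags = 0: register; RSEQ_FLAG_UNREGISTER would tear down.
 *              return syscall(__NR_rseq, &__rseq_abi, sizeof(__rseq_abi),
 *                             0, RSEQ_SIG);
 *      }
 */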