linux/kernel/events/uprobes.c
   1// SPDX-License-Identifier: GPL-2.0+
   2/*
   3 * User-space Probes (UProbes)
   4 *
   5 * Copyright (C) IBM Corporation, 2008-2012
   6 * Authors:
   7 *      Srikar Dronamraju
   8 *      Jim Keniston
   9 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
  10 */
  11
  12#include <linux/kernel.h>
  13#include <linux/highmem.h>
  14#include <linux/pagemap.h>      /* read_mapping_page */
  15#include <linux/slab.h>
  16#include <linux/sched.h>
  17#include <linux/sched/mm.h>
  18#include <linux/sched/coredump.h>
  19#include <linux/export.h>
  20#include <linux/rmap.h>         /* anon_vma_prepare */
  21#include <linux/mmu_notifier.h> /* set_pte_at_notify */
  22#include <linux/swap.h>         /* try_to_free_swap */
  23#include <linux/ptrace.h>       /* user_enable_single_step */
  24#include <linux/kdebug.h>       /* notifier mechanism */
  25#include "../../mm/internal.h"  /* munlock_vma_page */
  26#include <linux/percpu-rwsem.h>
  27#include <linux/task_work.h>
  28#include <linux/shmem_fs.h>
  29
  30#include <linux/uprobes.h>
  31
  32#define UINSNS_PER_PAGE                 (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
  33#define MAX_UPROBE_XOL_SLOTS            UINSNS_PER_PAGE
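
/*
 * Worked example (editor's sketch, not authoritative): with a 4 KiB
 * PAGE_SIZE and an arch where UPROBE_XOL_SLOT_BYTES is 128 bytes, the
 * single XOL page holds PAGE_SIZE / UPROBE_XOL_SLOT_BYTES = 4096 / 128 =
 * 32 instruction slots, so at most 32 threads per mm can single-step out
 * of line concurrently (minus the slot reserved for the trampoline, see
 * __create_xol_area() below).
 */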
  34
  35static struct rb_root uprobes_tree = RB_ROOT;
  36/*
   37 * Allows us to skip uprobe_mmap() if there are no uprobe events active
   38 * at this time.  Probably a fine-grained, per-inode count would be better?
  39 */
  40#define no_uprobe_events()      RB_EMPTY_ROOT(&uprobes_tree)
  41
  42static DEFINE_SPINLOCK(uprobes_treelock);       /* serialize rbtree access */
  43
  44#define UPROBES_HASH_SZ 13
  45/* serialize uprobe->pending_list */
  46static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
  47#define uprobes_mmap_hash(v)    (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
  48
  49static struct percpu_rw_semaphore dup_mmap_sem;
  50
  51/* Have a copy of original instruction */
  52#define UPROBE_COPY_INSN        0
  53
  54struct uprobe {
  55        struct rb_node          rb_node;        /* node in the rb tree */
  56        refcount_t              ref;
  57        struct rw_semaphore     register_rwsem;
  58        struct rw_semaphore     consumer_rwsem;
  59        struct list_head        pending_list;
  60        struct uprobe_consumer  *consumers;
  61        struct inode            *inode;         /* Also hold a ref to inode */
  62        loff_t                  offset;
  63        loff_t                  ref_ctr_offset;
  64        unsigned long           flags;
  65
  66        /*
  67         * The generic code assumes that it has two members of unknown type
  68         * owned by the arch-specific code:
  69         *
  70         *      insn -  copy_insn() saves the original instruction here for
  71         *              arch_uprobe_analyze_insn().
  72         *
  73         *      ixol -  potentially modified instruction to execute out of
  74         *              line, copied to xol_area by xol_get_insn_slot().
  75         */
  76        struct arch_uprobe      arch;
  77};
  78
  79struct delayed_uprobe {
  80        struct list_head list;
  81        struct uprobe *uprobe;
  82        struct mm_struct *mm;
  83};
  84
  85static DEFINE_MUTEX(delayed_uprobe_lock);
  86static LIST_HEAD(delayed_uprobe_list);
  87
  88/*
  89 * Execute out of line area: anonymous executable mapping installed
  90 * by the probed task to execute the copy of the original instruction
  91 * mangled by set_swbp().
  92 *
   93 * On a breakpoint hit, a thread contests for a slot.  It frees the
  94 * slot after singlestep. Currently a fixed number of slots are
  95 * allocated.
  96 */
  97struct xol_area {
  98        wait_queue_head_t               wq;             /* if all slots are busy */
  99        atomic_t                        slot_count;     /* number of in-use slots */
 100        unsigned long                   *bitmap;        /* 0 = free slot */
 101
 102        struct vm_special_mapping       xol_mapping;
 103        struct page                     *pages[2];
 104        /*
 105         * We keep the vma's vm_start rather than a pointer to the vma
 106         * itself.  The probed process or a naughty kernel module could make
 107         * the vma go away, and we must handle that reasonably gracefully.
 108         */
 109        unsigned long                   vaddr;          /* Page(s) of instruction slots */
 110};
 111
 112/*
  113 * valid_vma: Verify whether the specified vma is an executable vma.
  114 * Relax restrictions while unregistering: vm_flags might have
  115 * changed after the breakpoint was inserted.
  116 *      - is_register: indicates whether we are in register context.
  117 *      - Return true if the specified virtual address is in an
  118 *        executable vma.
 119 */
 120static bool valid_vma(struct vm_area_struct *vma, bool is_register)
 121{
 122        vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
 123
 124        if (is_register)
 125                flags |= VM_WRITE;
 126
 127        return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
 128}
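
/*
 * Editor's example: during registration a private, currently non-writable,
 * executable file mapping (VM_MAYEXEC set; VM_WRITE, VM_MAYSHARE and
 * VM_HUGETLB clear) passes the check above, since only VM_MAYEXEC remains
 * after masking. A writable or shared mapping of the same file is
 * rejected. For unregistration VM_WRITE is tolerated, because the
 * mapping's protections may have changed after the breakpoint went in.
 */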
 129
 130static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
 131{
 132        return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 133}
 134
 135static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
 136{
 137        return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
 138}
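
/*
 * Worked example (editor's sketch): if a vma has vm_start == 0x400000 and
 * vm_pgoff == 0x10 (i.e. it maps the file starting at offset 0x10000 with
 * 4 KiB pages), then a probe at file offset 0x10123 lives at virtual
 * address 0x400000 + 0x10123 - 0x10000 == 0x400123, and vaddr_to_offset()
 * inverts that mapping.
 */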
 139
 140/**
 141 * __replace_page - replace page in vma by new page.
 142 * based on replace_page in mm/ksm.c
 143 *
  144 * @vma:      vma that holds the pte pointing to @old_page
  145 * @addr:     address the old page is mapped at
  146 * @old_page: the COWed page we are replacing by @new_page
  147 * @new_page: the modified page we replace @old_page by
  148 *
  149 * Returns 0 on success, or a negative errno (e.g. -EAGAIN) on failure.
 150 */
 151static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 152                                struct page *old_page, struct page *new_page)
 153{
 154        struct mm_struct *mm = vma->vm_mm;
 155        struct page_vma_mapped_walk pvmw = {
 156                .page = old_page,
 157                .vma = vma,
 158                .address = addr,
 159        };
 160        int err;
 161        /* For mmu_notifiers */
 162        const unsigned long mmun_start = addr;
 163        const unsigned long mmun_end   = addr + PAGE_SIZE;
 164        struct mem_cgroup *memcg;
 165
 166        VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
 167
 168        err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
 169                        false);
 170        if (err)
 171                return err;
 172
 173        /* For try_to_free_swap() and munlock_vma_page() below */
 174        lock_page(old_page);
 175
 176        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 177        err = -EAGAIN;
 178        if (!page_vma_mapped_walk(&pvmw)) {
 179                mem_cgroup_cancel_charge(new_page, memcg, false);
 180                goto unlock;
 181        }
 182        VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
 183
 184        get_page(new_page);
 185        page_add_new_anon_rmap(new_page, vma, addr, false);
 186        mem_cgroup_commit_charge(new_page, memcg, false, false);
 187        lru_cache_add_active_or_unevictable(new_page, vma);
 188
 189        if (!PageAnon(old_page)) {
 190                dec_mm_counter(mm, mm_counter_file(old_page));
 191                inc_mm_counter(mm, MM_ANONPAGES);
 192        }
 193
 194        flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
 195        ptep_clear_flush_notify(vma, addr, pvmw.pte);
 196        set_pte_at_notify(mm, addr, pvmw.pte,
 197                        mk_pte(new_page, vma->vm_page_prot));
 198
 199        page_remove_rmap(old_page, false);
 200        if (!page_mapped(old_page))
 201                try_to_free_swap(old_page);
 202        page_vma_mapped_walk_done(&pvmw);
 203
 204        if (vma->vm_flags & VM_LOCKED)
 205                munlock_vma_page(old_page);
 206        put_page(old_page);
 207
 208        err = 0;
 209 unlock:
 210        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 211        unlock_page(old_page);
 212        return err;
 213}
 214
 215/**
 216 * is_swbp_insn - check if instruction is breakpoint instruction.
 217 * @insn: instruction to be checked.
 218 * Default implementation of is_swbp_insn
 219 * Returns true if @insn is a breakpoint instruction.
 220 */
 221bool __weak is_swbp_insn(uprobe_opcode_t *insn)
 222{
 223        return *insn == UPROBE_SWBP_INSN;
 224}
 225
 226/**
 227 * is_trap_insn - check if instruction is breakpoint instruction.
 228 * @insn: instruction to be checked.
 229 * Default implementation of is_trap_insn
 230 * Returns true if @insn is a breakpoint instruction.
 231 *
 232 * This function is needed for the case where an architecture has multiple
 233 * trap instructions (like powerpc).
 234 */
 235bool __weak is_trap_insn(uprobe_opcode_t *insn)
 236{
 237        return is_swbp_insn(insn);
 238}
 239
 240static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
 241{
 242        void *kaddr = kmap_atomic(page);
 243        memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
 244        kunmap_atomic(kaddr);
 245}
 246
 247static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
 248{
 249        void *kaddr = kmap_atomic(page);
 250        memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
 251        kunmap_atomic(kaddr);
 252}
 253
 254static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
 255{
 256        uprobe_opcode_t old_opcode;
 257        bool is_swbp;
 258
 259        /*
 260         * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
 261         * We do not check if it is any other 'trap variant' which could
 262         * be conditional trap instruction such as the one powerpc supports.
 263         *
 264         * The logic is that we do not care if the underlying instruction
 265         * is a trap variant; uprobes always wins over any other (gdb)
 266         * breakpoint.
 267         */
 268        copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
 269        is_swbp = is_swbp_insn(&old_opcode);
 270
 271        if (is_swbp_insn(new_opcode)) {
 272                if (is_swbp)            /* register: already installed? */
 273                        return 0;
 274        } else {
 275                if (!is_swbp)           /* unregister: was it changed by us? */
 276                        return 0;
 277        }
 278
 279        return 1;
 280}
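
/*
 * Editor's note: verify_opcode() returns 1 when the write should proceed
 * and 0 when it would be a no-op. E.g. registering at an address that
 * already holds UPROBE_SWBP_INSN, or unregistering an address that no
 * longer holds it, yields 0 and uprobe_write_opcode() below bails out
 * early without touching the page.
 */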
 281
 282static struct delayed_uprobe *
 283delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
 284{
 285        struct delayed_uprobe *du;
 286
 287        list_for_each_entry(du, &delayed_uprobe_list, list)
 288                if (du->uprobe == uprobe && du->mm == mm)
 289                        return du;
 290        return NULL;
 291}
 292
 293static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
 294{
 295        struct delayed_uprobe *du;
 296
 297        if (delayed_uprobe_check(uprobe, mm))
 298                return 0;
 299
 300        du  = kzalloc(sizeof(*du), GFP_KERNEL);
 301        if (!du)
 302                return -ENOMEM;
 303
 304        du->uprobe = uprobe;
 305        du->mm = mm;
 306        list_add(&du->list, &delayed_uprobe_list);
 307        return 0;
 308}
 309
 310static void delayed_uprobe_delete(struct delayed_uprobe *du)
 311{
 312        if (WARN_ON(!du))
 313                return;
 314        list_del(&du->list);
 315        kfree(du);
 316}
 317
 318static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
 319{
 320        struct list_head *pos, *q;
 321        struct delayed_uprobe *du;
 322
 323        if (!uprobe && !mm)
 324                return;
 325
 326        list_for_each_safe(pos, q, &delayed_uprobe_list) {
 327                du = list_entry(pos, struct delayed_uprobe, list);
 328
 329                if (uprobe && du->uprobe != uprobe)
 330                        continue;
 331                if (mm && du->mm != mm)
 332                        continue;
 333
 334                delayed_uprobe_delete(du);
 335        }
 336}
 337
 338static bool valid_ref_ctr_vma(struct uprobe *uprobe,
 339                              struct vm_area_struct *vma)
 340{
 341        unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);
 342
 343        return uprobe->ref_ctr_offset &&
 344                vma->vm_file &&
 345                file_inode(vma->vm_file) == uprobe->inode &&
 346                (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
 347                vma->vm_start <= vaddr &&
 348                vma->vm_end > vaddr;
 349}
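
/*
 * Editor's note: ref_ctr_offset typically refers to an SDT "semaphore"
 * counter that lives in a writable, non-shared data mapping of the same
 * file. The checks above mirror that: non-zero offset, same inode,
 * VM_WRITE set with VM_SHARED clear, and the counter address inside
 * [vm_start, vm_end).
 */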
 350
 351static struct vm_area_struct *
 352find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
 353{
 354        struct vm_area_struct *tmp;
 355
 356        for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
 357                if (valid_ref_ctr_vma(uprobe, tmp))
 358                        return tmp;
 359
 360        return NULL;
 361}
 362
 363static int
 364__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
 365{
 366        void *kaddr;
 367        struct page *page;
 368        struct vm_area_struct *vma;
 369        int ret;
 370        short *ptr;
 371
 372        if (!vaddr || !d)
 373                return -EINVAL;
 374
 375        ret = get_user_pages_remote(NULL, mm, vaddr, 1,
 376                        FOLL_WRITE, &page, &vma, NULL);
 377        if (unlikely(ret <= 0)) {
 378                /*
 379                 * We are asking for 1 page. If get_user_pages_remote() fails,
  380                  * it may return 0; in that case we have to return an error.
 381                 */
 382                return ret == 0 ? -EBUSY : ret;
 383        }
 384
 385        kaddr = kmap_atomic(page);
 386        ptr = kaddr + (vaddr & ~PAGE_MASK);
 387
 388        if (unlikely(*ptr + d < 0)) {
 389                pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
 390                        "curr val: %d, delta: %d\n", vaddr, *ptr, d);
 391                ret = -EINVAL;
 392                goto out;
 393        }
 394
 395        *ptr += d;
 396        ret = 0;
 397out:
 398        kunmap_atomic(kaddr);
 399        put_page(page);
 400        return ret;
 401}
 402
 403static void update_ref_ctr_warn(struct uprobe *uprobe,
 404                                struct mm_struct *mm, short d)
 405{
 406        pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
 407                "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
 408                d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
 409                (unsigned long long) uprobe->offset,
 410                (unsigned long long) uprobe->ref_ctr_offset, mm);
 411}
 412
 413static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
 414                          short d)
 415{
 416        struct vm_area_struct *rc_vma;
 417        unsigned long rc_vaddr;
 418        int ret = 0;
 419
 420        rc_vma = find_ref_ctr_vma(uprobe, mm);
 421
 422        if (rc_vma) {
 423                rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
 424                ret = __update_ref_ctr(mm, rc_vaddr, d);
 425                if (ret)
 426                        update_ref_ctr_warn(uprobe, mm, d);
 427
 428                if (d > 0)
 429                        return ret;
 430        }
 431
 432        mutex_lock(&delayed_uprobe_lock);
 433        if (d > 0)
 434                ret = delayed_uprobe_add(uprobe, mm);
 435        else
 436                delayed_uprobe_remove(uprobe, mm);
 437        mutex_unlock(&delayed_uprobe_lock);
 438
 439        return ret;
 440}
 441
 442/*
 443 * NOTE:
  444 * Expect the breakpoint instruction to be the smallest-sized instruction for
  445 * the architecture. If an arch has variable-length instructions and the
  446 * breakpoint instruction is not the smallest instruction supported by that
  447 * architecture, then we need to modify is_trap_at_addr and
  448 * uprobe_write_opcode accordingly. This is never a problem for archs
  449 * that have fixed-length instructions.
 450 *
 451 * uprobe_write_opcode - write the opcode at a given virtual address.
 452 * @mm: the probed process address space.
 453 * @vaddr: the virtual address to store the opcode.
 454 * @opcode: opcode to be written at @vaddr.
 455 *
 456 * Called with mm->mmap_sem held for write.
 457 * Return 0 (success) or a negative errno.
 458 */
 459int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
 460                        unsigned long vaddr, uprobe_opcode_t opcode)
 461{
 462        struct uprobe *uprobe;
 463        struct page *old_page, *new_page;
 464        struct vm_area_struct *vma;
 465        int ret, is_register, ref_ctr_updated = 0;
 466
 467        is_register = is_swbp_insn(&opcode);
 468        uprobe = container_of(auprobe, struct uprobe, arch);
 469
 470retry:
 471        /* Read the page with vaddr into memory */
 472        ret = get_user_pages_remote(NULL, mm, vaddr, 1,
 473                        FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL);
 474        if (ret <= 0)
 475                return ret;
 476
 477        ret = verify_opcode(old_page, vaddr, &opcode);
 478        if (ret <= 0)
 479                goto put_old;
 480
  481        /* We are going to replace the instruction, so update ref_ctr. */
 482        if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
 483                ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
 484                if (ret)
 485                        goto put_old;
 486
 487                ref_ctr_updated = 1;
 488        }
 489
 490        ret = anon_vma_prepare(vma);
 491        if (ret)
 492                goto put_old;
 493
 494        ret = -ENOMEM;
 495        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
 496        if (!new_page)
 497                goto put_old;
 498
 499        __SetPageUptodate(new_page);
 500        copy_highpage(new_page, old_page);
 501        copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
 502
 503        ret = __replace_page(vma, vaddr, old_page, new_page);
 504        put_page(new_page);
 505put_old:
 506        put_page(old_page);
 507
 508        if (unlikely(ret == -EAGAIN))
 509                goto retry;
 510
  511        /* Revert the reference counter if the instruction update failed. */
 512        if (ret && is_register && ref_ctr_updated)
 513                update_ref_ctr(uprobe, mm, -1);
 514
 515        return ret;
 516}
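
/*
 * Editor's summary of the flow above: pin the page containing @vaddr,
 * check via verify_opcode() that the write is not a no-op, adjust the
 * reference counter if needed, then build a private copy of the page with
 * the opcode patched in and swap it in with __replace_page(). -EAGAIN
 * from the replace (e.g. the pte changed under us) simply retries the
 * whole sequence.
 */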
 517
 518/**
 519 * set_swbp - store breakpoint at a given address.
 520 * @auprobe: arch specific probepoint information.
 521 * @mm: the probed process address space.
 522 * @vaddr: the virtual address to insert the opcode.
 523 *
 524 * For mm @mm, store the breakpoint instruction at @vaddr.
 525 * Return 0 (success) or a negative errno.
 526 */
 527int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 528{
 529        return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
 530}
 531
 532/**
 533 * set_orig_insn - Restore the original instruction.
 534 * @mm: the probed process address space.
 535 * @auprobe: arch specific probepoint information.
 536 * @vaddr: the virtual address to insert the opcode.
 537 *
  538 * For mm @mm, restore the original instruction at @vaddr.
 539 * Return 0 (success) or a negative errno.
 540 */
 541int __weak
 542set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 543{
 544        return uprobe_write_opcode(auprobe, mm, vaddr,
 545                        *(uprobe_opcode_t *)&auprobe->insn);
 546}
 547
 548static struct uprobe *get_uprobe(struct uprobe *uprobe)
 549{
 550        refcount_inc(&uprobe->ref);
 551        return uprobe;
 552}
 553
 554static void put_uprobe(struct uprobe *uprobe)
 555{
 556        if (refcount_dec_and_test(&uprobe->ref)) {
 557                /*
 558                 * If application munmap(exec_vma) before uprobe_unregister()
 559                 * gets called, we don't get a chance to remove uprobe from
 560                 * delayed_uprobe_list from remove_breakpoint(). Do it here.
 561                 */
 562                mutex_lock(&delayed_uprobe_lock);
 563                delayed_uprobe_remove(uprobe, NULL);
 564                mutex_unlock(&delayed_uprobe_lock);
 565                kfree(uprobe);
 566        }
 567}
 568
 569static int match_uprobe(struct uprobe *l, struct uprobe *r)
 570{
 571        if (l->inode < r->inode)
 572                return -1;
 573
 574        if (l->inode > r->inode)
 575                return 1;
 576
 577        if (l->offset < r->offset)
 578                return -1;
 579
 580        if (l->offset > r->offset)
 581                return 1;
 582
 583        return 0;
 584}
 585
 586static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
 587{
 588        struct uprobe u = { .inode = inode, .offset = offset };
 589        struct rb_node *n = uprobes_tree.rb_node;
 590        struct uprobe *uprobe;
 591        int match;
 592
 593        while (n) {
 594                uprobe = rb_entry(n, struct uprobe, rb_node);
 595                match = match_uprobe(&u, uprobe);
 596                if (!match)
 597                        return get_uprobe(uprobe);
 598
 599                if (match < 0)
 600                        n = n->rb_left;
 601                else
 602                        n = n->rb_right;
 603        }
 604        return NULL;
 605}
 606
 607/*
 608 * Find a uprobe corresponding to a given inode:offset
 609 * Acquires uprobes_treelock
 610 */
 611static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
 612{
 613        struct uprobe *uprobe;
 614
 615        spin_lock(&uprobes_treelock);
 616        uprobe = __find_uprobe(inode, offset);
 617        spin_unlock(&uprobes_treelock);
 618
 619        return uprobe;
 620}
 621
 622static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
 623{
 624        struct rb_node **p = &uprobes_tree.rb_node;
 625        struct rb_node *parent = NULL;
 626        struct uprobe *u;
 627        int match;
 628
 629        while (*p) {
 630                parent = *p;
 631                u = rb_entry(parent, struct uprobe, rb_node);
 632                match = match_uprobe(uprobe, u);
 633                if (!match)
 634                        return get_uprobe(u);
 635
 636                if (match < 0)
 637                        p = &parent->rb_left;
 638                else
 639                        p = &parent->rb_right;
 640
 641        }
 642
 643        u = NULL;
 644        rb_link_node(&uprobe->rb_node, parent, p);
 645        rb_insert_color(&uprobe->rb_node, &uprobes_tree);
 646        /* get access + creation ref */
 647        refcount_set(&uprobe->ref, 2);
 648
 649        return u;
 650}
 651
 652/*
  653 * Acquire uprobes_treelock.
  654 * If a matching uprobe already exists in the rbtree,
  655 *      take an access refcount and return the matching uprobe.
  656 *
  657 * If no matching uprobe exists, insert the uprobe into the rbtree,
  658 *      take a double refcount (access + creation) and return NULL.
 659 */
 660static struct uprobe *insert_uprobe(struct uprobe *uprobe)
 661{
 662        struct uprobe *u;
 663
 664        spin_lock(&uprobes_treelock);
 665        u = __insert_uprobe(uprobe);
 666        spin_unlock(&uprobes_treelock);
 667
 668        return u;
 669}
 670
 671static void
 672ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
 673{
 674        pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
 675                "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
 676                uprobe->inode->i_ino, (unsigned long long) uprobe->offset,
 677                (unsigned long long) cur_uprobe->ref_ctr_offset,
 678                (unsigned long long) uprobe->ref_ctr_offset);
 679}
 680
 681static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
 682                                   loff_t ref_ctr_offset)
 683{
 684        struct uprobe *uprobe, *cur_uprobe;
 685
 686        uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
 687        if (!uprobe)
 688                return NULL;
 689
 690        uprobe->inode = inode;
 691        uprobe->offset = offset;
 692        uprobe->ref_ctr_offset = ref_ctr_offset;
 693        init_rwsem(&uprobe->register_rwsem);
 694        init_rwsem(&uprobe->consumer_rwsem);
 695
 696        /* add to uprobes_tree, sorted on inode:offset */
 697        cur_uprobe = insert_uprobe(uprobe);
 698        /* a uprobe exists for this inode:offset combination */
 699        if (cur_uprobe) {
 700                if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
 701                        ref_ctr_mismatch_warn(cur_uprobe, uprobe);
 702                        put_uprobe(cur_uprobe);
 703                        kfree(uprobe);
 704                        return ERR_PTR(-EINVAL);
 705                }
 706                kfree(uprobe);
 707                uprobe = cur_uprobe;
 708        }
 709
 710        return uprobe;
 711}
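
/*
 * Editor's note on refcounting (derived from the comments above and in
 * insert_uprobe()): a freshly inserted uprobe starts with ref == 2, one
 * "creation" reference held until the last consumer unregisters and one
 * "access" reference for the current caller, which __uprobe_register()
 * drops with put_uprobe() once registration finishes.
 */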
 712
 713static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
 714{
 715        down_write(&uprobe->consumer_rwsem);
 716        uc->next = uprobe->consumers;
 717        uprobe->consumers = uc;
 718        up_write(&uprobe->consumer_rwsem);
 719}
 720
 721/*
 722 * For uprobe @uprobe, delete the consumer @uc.
  723 * Return true if @uc was deleted successfully,
  724 * false otherwise.
 725 */
 726static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
 727{
 728        struct uprobe_consumer **con;
 729        bool ret = false;
 730
 731        down_write(&uprobe->consumer_rwsem);
 732        for (con = &uprobe->consumers; *con; con = &(*con)->next) {
 733                if (*con == uc) {
 734                        *con = uc->next;
 735                        ret = true;
 736                        break;
 737                }
 738        }
 739        up_write(&uprobe->consumer_rwsem);
 740
 741        return ret;
 742}
 743
 744static int __copy_insn(struct address_space *mapping, struct file *filp,
 745                        void *insn, int nbytes, loff_t offset)
 746{
 747        struct page *page;
 748        /*
 749         * Ensure that the page that has the original instruction is populated
 750         * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
 751         * see uprobe_register().
 752         */
 753        if (mapping->a_ops->readpage)
 754                page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
 755        else
 756                page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
 757        if (IS_ERR(page))
 758                return PTR_ERR(page);
 759
 760        copy_from_page(page, offset, insn, nbytes);
 761        put_page(page);
 762
 763        return 0;
 764}
 765
 766static int copy_insn(struct uprobe *uprobe, struct file *filp)
 767{
 768        struct address_space *mapping = uprobe->inode->i_mapping;
 769        loff_t offs = uprobe->offset;
 770        void *insn = &uprobe->arch.insn;
 771        int size = sizeof(uprobe->arch.insn);
 772        int len, err = -EIO;
 773
 774        /* Copy only available bytes, -EIO if nothing was read */
 775        do {
 776                if (offs >= i_size_read(uprobe->inode))
 777                        break;
 778
 779                len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
 780                err = __copy_insn(mapping, filp, insn, len, offs);
 781                if (err)
 782                        break;
 783
 784                insn += len;
 785                offs += len;
 786                size -= len;
 787        } while (size);
 788
 789        return err;
 790}
 791
 792static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
 793                                struct mm_struct *mm, unsigned long vaddr)
 794{
 795        int ret = 0;
 796
 797        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
 798                return ret;
 799
 800        /* TODO: move this into _register, until then we abuse this sem. */
 801        down_write(&uprobe->consumer_rwsem);
 802        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
 803                goto out;
 804
 805        ret = copy_insn(uprobe, file);
 806        if (ret)
 807                goto out;
 808
 809        ret = -ENOTSUPP;
 810        if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
 811                goto out;
 812
 813        ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
 814        if (ret)
 815                goto out;
 816
 817        /* uprobe_write_opcode() assumes we don't cross page boundary */
 818        BUG_ON((uprobe->offset & ~PAGE_MASK) +
 819                        UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
 820
 821        smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
 822        set_bit(UPROBE_COPY_INSN, &uprobe->flags);
 823
 824 out:
 825        up_write(&uprobe->consumer_rwsem);
 826
 827        return ret;
 828}
 829
 830static inline bool consumer_filter(struct uprobe_consumer *uc,
 831                                   enum uprobe_filter_ctx ctx, struct mm_struct *mm)
 832{
 833        return !uc->filter || uc->filter(uc, ctx, mm);
 834}
 835
 836static bool filter_chain(struct uprobe *uprobe,
 837                         enum uprobe_filter_ctx ctx, struct mm_struct *mm)
 838{
 839        struct uprobe_consumer *uc;
 840        bool ret = false;
 841
 842        down_read(&uprobe->consumer_rwsem);
 843        for (uc = uprobe->consumers; uc; uc = uc->next) {
 844                ret = consumer_filter(uc, ctx, mm);
 845                if (ret)
 846                        break;
 847        }
 848        up_read(&uprobe->consumer_rwsem);
 849
 850        return ret;
 851}
 852
 853static int
 854install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
 855                        struct vm_area_struct *vma, unsigned long vaddr)
 856{
 857        bool first_uprobe;
 858        int ret;
 859
 860        ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
 861        if (ret)
 862                return ret;
 863
 864        /*
  865         * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier();
  866         * the task can hit this breakpoint right after __replace_page().
 867         */
 868        first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
 869        if (first_uprobe)
 870                set_bit(MMF_HAS_UPROBES, &mm->flags);
 871
 872        ret = set_swbp(&uprobe->arch, mm, vaddr);
 873        if (!ret)
 874                clear_bit(MMF_RECALC_UPROBES, &mm->flags);
 875        else if (first_uprobe)
 876                clear_bit(MMF_HAS_UPROBES, &mm->flags);
 877
 878        return ret;
 879}
 880
 881static int
 882remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
 883{
 884        set_bit(MMF_RECALC_UPROBES, &mm->flags);
 885        return set_orig_insn(&uprobe->arch, mm, vaddr);
 886}
 887
 888static inline bool uprobe_is_active(struct uprobe *uprobe)
 889{
 890        return !RB_EMPTY_NODE(&uprobe->rb_node);
 891}
 892/*
 893 * There could be threads that have already hit the breakpoint. They
 894 * will recheck the current insn and restart if find_uprobe() fails.
 895 * See find_active_uprobe().
 896 */
 897static void delete_uprobe(struct uprobe *uprobe)
 898{
 899        if (WARN_ON(!uprobe_is_active(uprobe)))
 900                return;
 901
 902        spin_lock(&uprobes_treelock);
 903        rb_erase(&uprobe->rb_node, &uprobes_tree);
 904        spin_unlock(&uprobes_treelock);
 905        RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
 906        put_uprobe(uprobe);
 907}
 908
 909struct map_info {
 910        struct map_info *next;
 911        struct mm_struct *mm;
 912        unsigned long vaddr;
 913};
 914
 915static inline struct map_info *free_map_info(struct map_info *info)
 916{
 917        struct map_info *next = info->next;
 918        kfree(info);
 919        return next;
 920}
 921
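/*
 * Editor's note: build_map_info() walks every vma that maps the probed
 * page and records (mm, vaddr) pairs. Allocations under i_mmap_rwsem use
 * GFP_NOWAIT; if any of them fail, the counted shortfall ("more") is
 * allocated with GFP_KERNEL after dropping the lock and the whole walk is
 * retried from the "again" label.
 */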
 922static struct map_info *
 923build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 924{
 925        unsigned long pgoff = offset >> PAGE_SHIFT;
 926        struct vm_area_struct *vma;
 927        struct map_info *curr = NULL;
 928        struct map_info *prev = NULL;
 929        struct map_info *info;
 930        int more = 0;
 931
 932 again:
 933        i_mmap_lock_read(mapping);
 934        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 935                if (!valid_vma(vma, is_register))
 936                        continue;
 937
 938                if (!prev && !more) {
 939                        /*
 940                         * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
 941                         * reclaim. This is optimistic, no harm done if it fails.
 942                         */
 943                        prev = kmalloc(sizeof(struct map_info),
 944                                        GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
 945                        if (prev)
 946                                prev->next = NULL;
 947                }
 948                if (!prev) {
 949                        more++;
 950                        continue;
 951                }
 952
 953                if (!mmget_not_zero(vma->vm_mm))
 954                        continue;
 955
 956                info = prev;
 957                prev = prev->next;
 958                info->next = curr;
 959                curr = info;
 960
 961                info->mm = vma->vm_mm;
 962                info->vaddr = offset_to_vaddr(vma, offset);
 963        }
 964        i_mmap_unlock_read(mapping);
 965
 966        if (!more)
 967                goto out;
 968
 969        prev = curr;
 970        while (curr) {
 971                mmput(curr->mm);
 972                curr = curr->next;
 973        }
 974
 975        do {
 976                info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
 977                if (!info) {
 978                        curr = ERR_PTR(-ENOMEM);
 979                        goto out;
 980                }
 981                info->next = prev;
 982                prev = info;
 983        } while (--more);
 984
 985        goto again;
 986 out:
 987        while (prev)
 988                prev = free_map_info(prev);
 989        return curr;
 990}
 991
 992static int
 993register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
 994{
 995        bool is_register = !!new;
 996        struct map_info *info;
 997        int err = 0;
 998
 999        percpu_down_write(&dup_mmap_sem);
1000        info = build_map_info(uprobe->inode->i_mapping,
1001                                        uprobe->offset, is_register);
1002        if (IS_ERR(info)) {
1003                err = PTR_ERR(info);
1004                goto out;
1005        }
1006
1007        while (info) {
1008                struct mm_struct *mm = info->mm;
1009                struct vm_area_struct *vma;
1010
1011                if (err && is_register)
1012                        goto free;
1013
1014                down_write(&mm->mmap_sem);
1015                vma = find_vma(mm, info->vaddr);
1016                if (!vma || !valid_vma(vma, is_register) ||
1017                    file_inode(vma->vm_file) != uprobe->inode)
1018                        goto unlock;
1019
1020                if (vma->vm_start > info->vaddr ||
1021                    vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
1022                        goto unlock;
1023
1024                if (is_register) {
1025                        /* consult only the "caller", new consumer. */
1026                        if (consumer_filter(new,
1027                                        UPROBE_FILTER_REGISTER, mm))
1028                                err = install_breakpoint(uprobe, mm, vma, info->vaddr);
1029                } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
1030                        if (!filter_chain(uprobe,
1031                                        UPROBE_FILTER_UNREGISTER, mm))
1032                                err |= remove_breakpoint(uprobe, mm, info->vaddr);
1033                }
1034
1035 unlock:
1036                up_write(&mm->mmap_sem);
1037 free:
1038                mmput(mm);
1039                info = free_map_info(info);
1040        }
1041 out:
1042        percpu_up_write(&dup_mmap_sem);
1043        return err;
1044}
1045
1046static void
1047__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
1048{
1049        int err;
1050
1051        if (WARN_ON(!consumer_del(uprobe, uc)))
1052                return;
1053
1054        err = register_for_each_vma(uprobe, NULL);
 1055        /* TODO: can't unregister? schedule a worker thread */
1056        if (!uprobe->consumers && !err)
1057                delete_uprobe(uprobe);
1058}
1059
1060/*
1061 * uprobe_unregister - unregister an already registered probe.
1062 * @inode: the file in which the probe has to be removed.
1063 * @offset: offset from the start of the file.
1064 * @uc: identify which probe if multiple probes are colocated.
1065 */
1066void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
1067{
1068        struct uprobe *uprobe;
1069
1070        uprobe = find_uprobe(inode, offset);
1071        if (WARN_ON(!uprobe))
1072                return;
1073
1074        down_write(&uprobe->register_rwsem);
1075        __uprobe_unregister(uprobe, uc);
1076        up_write(&uprobe->register_rwsem);
1077        put_uprobe(uprobe);
1078}
1079EXPORT_SYMBOL_GPL(uprobe_unregister);
1080
1081/*
1082 * __uprobe_register - register a probe
1083 * @inode: the file in which the probe has to be placed.
1084 * @offset: offset from the start of the file.
 1085 * @uc: information on how to handle the probe.
1086 *
1087 * Apart from the access refcount, __uprobe_register() takes a creation
 1088 * refcount (through alloc_uprobe) if and only if this @uprobe is getting
 1089 * inserted into the rbtree (i.e. first consumer for a @inode:@offset
1090 * tuple).  Creation refcount stops uprobe_unregister from freeing the
1091 * @uprobe even before the register operation is complete. Creation
1092 * refcount is released when the last @uc for the @uprobe
1093 * unregisters. Caller of __uprobe_register() is required to keep @inode
1094 * (and the containing mount) referenced.
1095 *
 1096 * Return errno if it cannot successfully install probes,
 1097 * else return 0 (success).
1098 */
1099static int __uprobe_register(struct inode *inode, loff_t offset,
1100                             loff_t ref_ctr_offset, struct uprobe_consumer *uc)
1101{
1102        struct uprobe *uprobe;
1103        int ret;
1104
1105        /* Uprobe must have at least one set consumer */
1106        if (!uc->handler && !uc->ret_handler)
1107                return -EINVAL;
1108
1109        /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
1110        if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
1111                return -EIO;
1112        /* Racy, just to catch the obvious mistakes */
1113        if (offset > i_size_read(inode))
1114                return -EINVAL;
1115
1116 retry:
1117        uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
1118        if (!uprobe)
1119                return -ENOMEM;
1120        if (IS_ERR(uprobe))
1121                return PTR_ERR(uprobe);
1122
1123        /*
1124         * We can race with uprobe_unregister()->delete_uprobe().
1125         * Check uprobe_is_active() and retry if it is false.
1126         */
1127        down_write(&uprobe->register_rwsem);
1128        ret = -EAGAIN;
1129        if (likely(uprobe_is_active(uprobe))) {
1130                consumer_add(uprobe, uc);
1131                ret = register_for_each_vma(uprobe, uc);
1132                if (ret)
1133                        __uprobe_unregister(uprobe, uc);
1134        }
1135        up_write(&uprobe->register_rwsem);
1136        put_uprobe(uprobe);
1137
1138        if (unlikely(ret == -EAGAIN))
1139                goto retry;
1140        return ret;
1141}
1142
1143int uprobe_register(struct inode *inode, loff_t offset,
1144                    struct uprobe_consumer *uc)
1145{
1146        return __uprobe_register(inode, offset, 0, uc);
1147}
1148EXPORT_SYMBOL_GPL(uprobe_register);
1149
1150int uprobe_register_refctr(struct inode *inode, loff_t offset,
1151                           loff_t ref_ctr_offset, struct uprobe_consumer *uc)
1152{
1153        return __uprobe_register(inode, offset, ref_ctr_offset, uc);
1154}
1155EXPORT_SYMBOL_GPL(uprobe_register_refctr);
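
/*
 * Usage sketch (editor's illustration, not part of this file): a consumer
 * embeds a struct uprobe_consumer and registers it against an inode plus
 * a file offset. The names below ("my_handler", "my_consumer") are
 * hypothetical.
 *
 *	static int my_handler(struct uprobe_consumer *self,
 *			      struct pt_regs *regs)
 *	{
 *		return 0;	// 0 keeps the probe installed
 *	}
 *
 *	static struct uprobe_consumer my_consumer = {
 *		.handler = my_handler,
 *	};
 *
 *	// inode and offset obtained elsewhere, e.g. from a path lookup and
 *	// an ELF symbol's offset within the file
 *	err = uprobe_register(inode, offset, &my_consumer);
 *	...
 *	uprobe_unregister(inode, offset, &my_consumer);
 *
 * uprobe_register_refctr() additionally takes ref_ctr_offset, the file
 * offset of a counter (e.g. an SDT semaphore) that is incremented while
 * the probe is installed.
 */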
1156
1157/*
 1158 * uprobe_apply - add or remove the breakpoints of an already registered probe.
 1159 * @inode: the file in which the probe resides.
1160 * @offset: offset from the start of the file.
1161 * @uc: consumer which wants to add more or remove some breakpoints
1162 * @add: add or remove the breakpoints
1163 */
1164int uprobe_apply(struct inode *inode, loff_t offset,
1165                        struct uprobe_consumer *uc, bool add)
1166{
1167        struct uprobe *uprobe;
1168        struct uprobe_consumer *con;
1169        int ret = -ENOENT;
1170
1171        uprobe = find_uprobe(inode, offset);
1172        if (WARN_ON(!uprobe))
1173                return ret;
1174
1175        down_write(&uprobe->register_rwsem);
1176        for (con = uprobe->consumers; con && con != uc ; con = con->next)
1177                ;
1178        if (con)
1179                ret = register_for_each_vma(uprobe, add ? uc : NULL);
1180        up_write(&uprobe->register_rwsem);
1181        put_uprobe(uprobe);
1182
1183        return ret;
1184}
1185
1186static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
1187{
1188        struct vm_area_struct *vma;
1189        int err = 0;
1190
1191        down_read(&mm->mmap_sem);
1192        for (vma = mm->mmap; vma; vma = vma->vm_next) {
1193                unsigned long vaddr;
1194                loff_t offset;
1195
1196                if (!valid_vma(vma, false) ||
1197                    file_inode(vma->vm_file) != uprobe->inode)
1198                        continue;
1199
1200                offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1201                if (uprobe->offset <  offset ||
1202                    uprobe->offset >= offset + vma->vm_end - vma->vm_start)
1203                        continue;
1204
1205                vaddr = offset_to_vaddr(vma, uprobe->offset);
1206                err |= remove_breakpoint(uprobe, mm, vaddr);
1207        }
1208        up_read(&mm->mmap_sem);
1209
1210        return err;
1211}
1212
1213static struct rb_node *
1214find_node_in_range(struct inode *inode, loff_t min, loff_t max)
1215{
1216        struct rb_node *n = uprobes_tree.rb_node;
1217
1218        while (n) {
1219                struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
1220
1221                if (inode < u->inode) {
1222                        n = n->rb_left;
1223                } else if (inode > u->inode) {
1224                        n = n->rb_right;
1225                } else {
1226                        if (max < u->offset)
1227                                n = n->rb_left;
1228                        else if (min > u->offset)
1229                                n = n->rb_right;
1230                        else
1231                                break;
1232                }
1233        }
1234
1235        return n;
1236}
1237
1238/*
1239 * For a given range in vma, build a list of probes that need to be inserted.
1240 */
1241static void build_probe_list(struct inode *inode,
1242                                struct vm_area_struct *vma,
1243                                unsigned long start, unsigned long end,
1244                                struct list_head *head)
1245{
1246        loff_t min, max;
1247        struct rb_node *n, *t;
1248        struct uprobe *u;
1249
1250        INIT_LIST_HEAD(head);
1251        min = vaddr_to_offset(vma, start);
1252        max = min + (end - start) - 1;
1253
1254        spin_lock(&uprobes_treelock);
1255        n = find_node_in_range(inode, min, max);
1256        if (n) {
1257                for (t = n; t; t = rb_prev(t)) {
1258                        u = rb_entry(t, struct uprobe, rb_node);
1259                        if (u->inode != inode || u->offset < min)
1260                                break;
1261                        list_add(&u->pending_list, head);
1262                        get_uprobe(u);
1263                }
1264                for (t = n; (t = rb_next(t)); ) {
1265                        u = rb_entry(t, struct uprobe, rb_node);
1266                        if (u->inode != inode || u->offset > max)
1267                                break;
1268                        list_add(&u->pending_list, head);
1269                        get_uprobe(u);
1270                }
1271        }
1272        spin_unlock(&uprobes_treelock);
1273}
1274
 1275/* @vma contains the reference counter, not the probed instruction. */
1276static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
1277{
1278        struct list_head *pos, *q;
1279        struct delayed_uprobe *du;
1280        unsigned long vaddr;
1281        int ret = 0, err = 0;
1282
1283        mutex_lock(&delayed_uprobe_lock);
1284        list_for_each_safe(pos, q, &delayed_uprobe_list) {
1285                du = list_entry(pos, struct delayed_uprobe, list);
1286
1287                if (du->mm != vma->vm_mm ||
1288                    !valid_ref_ctr_vma(du->uprobe, vma))
1289                        continue;
1290
1291                vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
1292                ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
1293                if (ret) {
1294                        update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1);
1295                        if (!err)
1296                                err = ret;
1297                }
1298                delayed_uprobe_delete(du);
1299        }
1300        mutex_unlock(&delayed_uprobe_lock);
1301        return err;
1302}
1303
1304/*
1305 * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
1306 *
1307 * Currently we ignore all errors and always return 0, the callers
1308 * can't handle the failure anyway.
1309 */
1310int uprobe_mmap(struct vm_area_struct *vma)
1311{
1312        struct list_head tmp_list;
1313        struct uprobe *uprobe, *u;
1314        struct inode *inode;
1315
1316        if (no_uprobe_events())
1317                return 0;
1318
1319        if (vma->vm_file &&
1320            (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
1321            test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
1322                delayed_ref_ctr_inc(vma);
1323
1324        if (!valid_vma(vma, true))
1325                return 0;
1326
1327        inode = file_inode(vma->vm_file);
1328        if (!inode)
1329                return 0;
1330
1331        mutex_lock(uprobes_mmap_hash(inode));
1332        build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
1333        /*
 1334         * We can race with uprobe_unregister(); this uprobe can already be
 1335         * removed. But in this case filter_chain() must return false, as all
 1336         * consumers have gone away.
1337         */
1338        list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1339                if (!fatal_signal_pending(current) &&
1340                    filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
1341                        unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1342                        install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1343                }
1344                put_uprobe(uprobe);
1345        }
1346        mutex_unlock(uprobes_mmap_hash(inode));
1347
1348        return 0;
1349}
1350
1351static bool
1352vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1353{
1354        loff_t min, max;
1355        struct inode *inode;
1356        struct rb_node *n;
1357
1358        inode = file_inode(vma->vm_file);
1359
1360        min = vaddr_to_offset(vma, start);
1361        max = min + (end - start) - 1;
1362
1363        spin_lock(&uprobes_treelock);
1364        n = find_node_in_range(inode, min, max);
1365        spin_unlock(&uprobes_treelock);
1366
1367        return !!n;
1368}
1369
1370/*
1371 * Called in context of a munmap of a vma.
1372 */
1373void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1374{
1375        if (no_uprobe_events() || !valid_vma(vma, false))
1376                return;
1377
1378        if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
1379                return;
1380
1381        if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
1382             test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
1383                return;
1384
1385        if (vma_has_uprobes(vma, start, end))
1386                set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
1387}
1388
1389/* Slot allocation for XOL */
1390static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1391{
1392        struct vm_area_struct *vma;
1393        int ret;
1394
1395        if (down_write_killable(&mm->mmap_sem))
1396                return -EINTR;
1397
1398        if (mm->uprobes_state.xol_area) {
1399                ret = -EALREADY;
1400                goto fail;
1401        }
1402
1403        if (!area->vaddr) {
1404                /* Try to map as high as possible, this is only a hint. */
1405                area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
1406                                                PAGE_SIZE, 0, 0);
1407                if (area->vaddr & ~PAGE_MASK) {
1408                        ret = area->vaddr;
1409                        goto fail;
1410                }
1411        }
1412
1413        vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
1414                                VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
1415                                &area->xol_mapping);
1416        if (IS_ERR(vma)) {
1417                ret = PTR_ERR(vma);
1418                goto fail;
1419        }
1420
1421        ret = 0;
1422        /* pairs with get_xol_area() */
1423        smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
1424 fail:
1425        up_write(&mm->mmap_sem);
1426
1427        return ret;
1428}
1429
1430static struct xol_area *__create_xol_area(unsigned long vaddr)
1431{
1432        struct mm_struct *mm = current->mm;
1433        uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1434        struct xol_area *area;
1435
1436        area = kmalloc(sizeof(*area), GFP_KERNEL);
1437        if (unlikely(!area))
1438                goto out;
1439
1440        area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long),
1441                               GFP_KERNEL);
1442        if (!area->bitmap)
1443                goto free_area;
1444
1445        area->xol_mapping.name = "[uprobes]";
1446        area->xol_mapping.fault = NULL;
1447        area->xol_mapping.pages = area->pages;
1448        area->pages[0] = alloc_page(GFP_HIGHUSER);
1449        if (!area->pages[0])
1450                goto free_bitmap;
1451        area->pages[1] = NULL;
1452
1453        area->vaddr = vaddr;
1454        init_waitqueue_head(&area->wq);
1455        /* Reserve the 1st slot for get_trampoline_vaddr() */
1456        set_bit(0, area->bitmap);
1457        atomic_set(&area->slot_count, 1);
1458        arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
1459
1460        if (!xol_add_vma(mm, area))
1461                return area;
1462
1463        __free_page(area->pages[0]);
1464 free_bitmap:
1465        kfree(area->bitmap);
1466 free_area:
1467        kfree(area);
1468 out:
1469        return NULL;
1470}
1471
1472/*
1473 * get_xol_area - Allocate process's xol_area if necessary.
1474 * This area will be used for storing instructions for execution out of line.
1475 *
1476 * Returns the allocated area or NULL.
1477 */
1478static struct xol_area *get_xol_area(void)
1479{
1480        struct mm_struct *mm = current->mm;
1481        struct xol_area *area;
1482
1483        if (!mm->uprobes_state.xol_area)
1484                __create_xol_area(0);
1485
1486        /* Pairs with xol_add_vma() smp_store_release() */
1487        area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */
1488        return area;
1489}
1490
1491/*
1492 * uprobe_clear_state - Free the area allocated for slots.
1493 */
1494void uprobe_clear_state(struct mm_struct *mm)
1495{
1496        struct xol_area *area = mm->uprobes_state.xol_area;
1497
1498        mutex_lock(&delayed_uprobe_lock);
1499        delayed_uprobe_remove(NULL, mm);
1500        mutex_unlock(&delayed_uprobe_lock);
1501
1502        if (!area)
1503                return;
1504
1505        put_page(area->pages[0]);
1506        kfree(area->bitmap);
1507        kfree(area);
1508}
1509
1510void uprobe_start_dup_mmap(void)
1511{
1512        percpu_down_read(&dup_mmap_sem);
1513}
1514
1515void uprobe_end_dup_mmap(void)
1516{
1517        percpu_up_read(&dup_mmap_sem);
1518}
1519
1520void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
1521{
1522        if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
1523                set_bit(MMF_HAS_UPROBES, &newmm->flags);
1524                /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
1525                set_bit(MMF_RECALC_UPROBES, &newmm->flags);
1526        }
1527}
1528
1529/*
 1530 * xol_take_insn_slot - search for a free slot.
1531 */
1532static unsigned long xol_take_insn_slot(struct xol_area *area)
1533{
1534        unsigned long slot_addr;
1535        int slot_nr;
1536
1537        do {
1538                slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
1539                if (slot_nr < UINSNS_PER_PAGE) {
1540                        if (!test_and_set_bit(slot_nr, area->bitmap))
1541                                break;
1542
1543                        slot_nr = UINSNS_PER_PAGE;
1544                        continue;
1545                }
1546                wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
1547        } while (slot_nr >= UINSNS_PER_PAGE);
1548
1549        slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
1550        atomic_inc(&area->slot_count);
1551
1552        return slot_addr;
1553}
1554
1555/*
1556 * xol_get_insn_slot - allocate a slot for xol.
1557 * Returns the allocated slot address or 0.
1558 */
1559static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1560{
1561        struct xol_area *area;
1562        unsigned long xol_vaddr;
1563
1564        area = get_xol_area();
1565        if (!area)
1566                return 0;
1567
1568        xol_vaddr = xol_take_insn_slot(area);
1569        if (unlikely(!xol_vaddr))
1570                return 0;
1571
1572        arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
1573                              &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1574
1575        return xol_vaddr;
1576}
1577
1578/*
1579 * xol_free_insn_slot - If slot was earlier allocated by
1580 * @xol_get_insn_slot(), make the slot available for
1581 * subsequent requests.
1582 */
1583static void xol_free_insn_slot(struct task_struct *tsk)
1584{
1585        struct xol_area *area;
1586        unsigned long vma_end;
1587        unsigned long slot_addr;
1588
1589        if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
1590                return;
1591
1592        slot_addr = tsk->utask->xol_vaddr;
1593        if (unlikely(!slot_addr))
1594                return;
1595
1596        area = tsk->mm->uprobes_state.xol_area;
1597        vma_end = area->vaddr + PAGE_SIZE;
1598        if (area->vaddr <= slot_addr && slot_addr < vma_end) {
1599                unsigned long offset;
1600                int slot_nr;
1601
1602                offset = slot_addr - area->vaddr;
1603                slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
1604                if (slot_nr >= UINSNS_PER_PAGE)
1605                        return;
1606
1607                clear_bit(slot_nr, area->bitmap);
1608                atomic_dec(&area->slot_count);
1609                smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
1610                if (waitqueue_active(&area->wq))
1611                        wake_up(&area->wq);
1612
1613                tsk->utask->xol_vaddr = 0;
1614        }
1615}
1616
1617void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
1618                                  void *src, unsigned long len)
1619{
1620        /* Initialize the slot */
1621        copy_to_page(page, vaddr, src, len);
1622
1623        /*
1624         * We probably need flush_icache_user_range() but it needs a vma.
1625         * This should work on most architectures by default. If an
1626         * architecture needs to do something different it can define
1627         * its own version of the function.
1628         */
1629        flush_dcache_page(page);
1630}
1631
1632/**
1633 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
1634 * @regs: Reflects the saved state of the task after it has hit a breakpoint
1635 * instruction.
1636 * Return: the address of the breakpoint instruction.
1637 */
1638unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1639{
1640        return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
1641}
1642
1643unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
1644{
1645        struct uprobe_task *utask = current->utask;
1646
1647        if (unlikely(utask && utask->active_uprobe))
1648                return utask->vaddr;
1649
1650        return instruction_pointer(regs);
1651}
1652
1653static struct return_instance *free_ret_instance(struct return_instance *ri)
1654{
1655        struct return_instance *next = ri->next;
1656        put_uprobe(ri->uprobe);
1657        kfree(ri);
1658        return next;
1659}
1660
1661/*
1662 * Called with no locks held.
1663 * Called in context of an exiting or an exec-ing thread.
1664 */
1665void uprobe_free_utask(struct task_struct *t)
1666{
1667        struct uprobe_task *utask = t->utask;
1668        struct return_instance *ri;
1669
1670        if (!utask)
1671                return;
1672
1673        if (utask->active_uprobe)
1674                put_uprobe(utask->active_uprobe);
1675
1676        ri = utask->return_instances;
1677        while (ri)
1678                ri = free_ret_instance(ri);
1679
1680        xol_free_insn_slot(t);
1681        kfree(utask);
1682        t->utask = NULL;
1683}
1684
1685/*
1686 * Allocate a uprobe_task object for the task if necessary.
1687 * Called when the thread hits a breakpoint.
1688 *
1689 * Returns:
1690 * - pointer to new uprobe_task on success
1691 * - NULL otherwise
1692 */
1693static struct uprobe_task *get_utask(void)
1694{
1695        if (!current->utask)
1696                current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1697        return current->utask;
1698}
1699
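    /*
     * Duplicate the parent's return_instances chain for a new task; each
     * copied instance takes its own reference on the uprobe.  On allocation
     * failure the partially built chain stays on t->utask and is reclaimed
     * by uprobe_free_utask() when the task exits.
     */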
1700static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
1701{
1702        struct uprobe_task *n_utask;
1703        struct return_instance **p, *o, *n;
1704
1705        n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1706        if (!n_utask)
1707                return -ENOMEM;
1708        t->utask = n_utask;
1709
1710        p = &n_utask->return_instances;
1711        for (o = o_utask->return_instances; o; o = o->next) {
1712                n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
1713                if (!n)
1714                        return -ENOMEM;
1715
1716                *n = *o;
1717                get_uprobe(n->uprobe);
1718                n->next = NULL;
1719
1720                *p = n;
1721                p = &n->next;
1722                n_utask->depth++;
1723        }
1724
1725        return 0;
1726}
1727
1728static void uprobe_warn(struct task_struct *t, const char *msg)
1729{
1730        pr_warn("uprobe: %s:%d failed to %s\n",
1731                        current->comm, current->pid, msg);
1732}
1733
1734static void dup_xol_work(struct callback_head *work)
1735{
1736        if (current->flags & PF_EXITING)
1737                return;
1738
1739        if (!__create_xol_area(current->utask->dup_xol_addr) &&
1740                        !fatal_signal_pending(current))
1741                uprobe_warn(current, "dup xol area");
1742}
1743
1744/*
1745 * Called in context of a new clone/fork from copy_process.
1746 */
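    /*
     * A new thread (same mm, no CLONE_VFORK) starts on a fresh stack and can
     * never return through this task's uretprobe trampoline, so it needs
     * nothing.  A vfork()ed child runs on the parent's stack and therefore
     * gets its own copy of the return_instances chain.  A child with a new
     * mm additionally gets dup_xol_work() queued via task_work to re-create
     * the XOL area in its own address space.
     */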
1747void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1748{
1749        struct uprobe_task *utask = current->utask;
1750        struct mm_struct *mm = current->mm;
1751        struct xol_area *area;
1752
1753        t->utask = NULL;
1754
1755        if (!utask || !utask->return_instances)
1756                return;
1757
1758        if (mm == t->mm && !(flags & CLONE_VFORK))
1759                return;
1760
1761        if (dup_utask(t, utask))
1762                return uprobe_warn(t, "dup ret instances");
1763
1764        /* The task can fork() after dup_xol_work() fails */
1765        area = mm->uprobes_state.xol_area;
1766        if (!area)
1767                return uprobe_warn(t, "dup xol area");
1768
1769        if (mm == t->mm)
1770                return;
1771
1772        t->utask->dup_xol_addr = area->vaddr;
1773        init_task_work(&t->utask->dup_xol_work, dup_xol_work);
1774        task_work_add(t, &t->utask->dup_xol_work, true);
1775}
1776
1777/*
1778 * The current area->vaddr notion assumes the trampoline address is
1779 * always equal to area->vaddr.
1780 *
1781 * Returns -1 in case the xol_area is not allocated.
1782 */
1783static unsigned long get_trampoline_vaddr(void)
1784{
1785        struct xol_area *area;
1786        unsigned long trampoline_vaddr = -1;
1787
1788        /* Pairs with xol_add_vma() smp_store_release() */
1789        area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
1790        if (area)
1791                trampoline_vaddr = area->vaddr;
1792
1793        return trampoline_vaddr;
1794}
1795
1796static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
1797                                        struct pt_regs *regs)
1798{
1799        struct return_instance *ri = utask->return_instances;
1800        enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
1801
1802        while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
1803                ri = free_ret_instance(ri);
1804                utask->depth--;
1805        }
1806        utask->return_instances = ri;
1807}
1808
1809static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
1810{
1811        struct return_instance *ri;
1812        struct uprobe_task *utask;
1813        unsigned long orig_ret_vaddr, trampoline_vaddr;
1814        bool chained;
1815
1816        if (!get_xol_area())
1817                return;
1818
1819        utask = get_utask();
1820        if (!utask)
1821                return;
1822
1823        if (utask->depth >= MAX_URETPROBE_DEPTH) {
1824                printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
1825                                " nestedness limit pid/tgid=%d/%d\n",
1826                                current->pid, current->tgid);
1827                return;
1828        }
1829
1830        ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
1831        if (!ri)
1832                return;
1833
1834        trampoline_vaddr = get_trampoline_vaddr();
1835        orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
1836        if (orig_ret_vaddr == -1)
1837                goto fail;
1838
1839        /* drop the entries invalidated by longjmp() */
1840        chained = (orig_ret_vaddr == trampoline_vaddr);
1841        cleanup_return_instances(utask, chained, regs);
1842
1843        /*
1844         * We don't want to keep the trampoline address on the stack; rather,
1845         * keep the original return address of the first caller through all the
1846         * subsequent instances. This also makes breakpoint unwinding easier.
1847         */
1848        if (chained) {
1849                if (!utask->return_instances) {
1850                        /*
1851                         * This situation is not possible. Likely we have an
1852                         * attack from user-space.
1853                         */
1854                        uprobe_warn(current, "handle tail call");
1855                        goto fail;
1856                }
1857                orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
1858        }
1859
1860        ri->uprobe = get_uprobe(uprobe);
1861        ri->func = instruction_pointer(regs);
1862        ri->stack = user_stack_pointer(regs);
1863        ri->orig_ret_vaddr = orig_ret_vaddr;
1864        ri->chained = chained;
1865
1866        utask->depth++;
1867        ri->next = utask->return_instances;
1868        utask->return_instances = ri;
1869
1870        return;
1871 fail:
1872        kfree(ri);
1873}
1874
1875/* Prepare to single-step probed instruction out of line. */
1876static int
1877pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
1878{
1879        struct uprobe_task *utask;
1880        unsigned long xol_vaddr;
1881        int err;
1882
1883        utask = get_utask();
1884        if (!utask)
1885                return -ENOMEM;
1886
1887        xol_vaddr = xol_get_insn_slot(uprobe);
1888        if (!xol_vaddr)
1889                return -ENOMEM;
1890
1891        utask->xol_vaddr = xol_vaddr;
1892        utask->vaddr = bp_vaddr;
1893
1894        err = arch_uprobe_pre_xol(&uprobe->arch, regs);
1895        if (unlikely(err)) {
1896                xol_free_insn_slot(current);
1897                return err;
1898        }
1899
1900        utask->active_uprobe = uprobe;
1901        utask->state = UTASK_SSTEP;
1902        return 0;
1903}
1904
1905/*
1906 * If we are singlestepping, then ensure this thread is not connected to
1907 * non-fatal signals until completion of singlestep.  When xol insn itself
1908 * triggers the signal, restart the original insn even if the task is
1909 * already SIGKILL'ed (since coredump should report the correct ip).  This
1910 * is even more important if the task has a handler for SIGSEGV/etc: the
1911 * _same_ instruction should be repeated again after return from the signal
1912 * handler, and SSTEP can never finish in this case.
1913 */
1914bool uprobe_deny_signal(void)
1915{
1916        struct task_struct *t = current;
1917        struct uprobe_task *utask = t->utask;
1918
1919        if (likely(!utask || !utask->active_uprobe))
1920                return false;
1921
1922        WARN_ON_ONCE(utask->state != UTASK_SSTEP);
1923
1924        if (signal_pending(t)) {
1925                spin_lock_irq(&t->sighand->siglock);
1926                clear_tsk_thread_flag(t, TIF_SIGPENDING);
1927                spin_unlock_irq(&t->sighand->siglock);
1928
1929                if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
1930                        utask->state = UTASK_SSTEP_TRAPPED;
1931                        set_tsk_thread_flag(t, TIF_UPROBE);
1932                }
1933        }
1934
1935        return true;
1936}
1937
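    /*
     * Re-check whether any vma in this mm still has registered uprobes and,
     * if not, clear MMF_HAS_UPROBES.  Called from find_active_uprobe() once
     * MMF_RECALC_UPROBES has been set.
     */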
1938static void mmf_recalc_uprobes(struct mm_struct *mm)
1939{
1940        struct vm_area_struct *vma;
1941
1942        for (vma = mm->mmap; vma; vma = vma->vm_next) {
1943                if (!valid_vma(vma, false))
1944                        continue;
1945                /*
1946                 * This is not strictly accurate, we can race with
1947                 * uprobe_unregister() and see the already removed
1948                 * uprobe if delete_uprobe() was not yet called.
1949                 * Or this uprobe can be filtered out.
1950                 */
1951                if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
1952                        return;
1953        }
1954
1955        clear_bit(MMF_HAS_UPROBES, &mm->flags);
1956}
1957
1958static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
1959{
1960        struct page *page;
1961        uprobe_opcode_t opcode;
1962        int result;
1963
1964        pagefault_disable();
1965        result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
1966        pagefault_enable();
1967
1968        if (likely(result == 0))
1969                goto out;
1970
1971        /*
1972         * The NULL 'tsk' here ensures that any faults that occur here
1973         * will not be accounted to the task.  'mm' *is* current->mm,
1974         * but we treat this as a 'remote' access since it is
1975         * essentially a kernel access to the memory.
1976         */
1977        result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
1978                        NULL, NULL);
1979        if (result < 0)
1980                return result;
1981
1982        copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
1983        put_page(page);
1984 out:
1985        /* This needs to return true for any variant of the trap insn */
1986        return is_trap_insn(&opcode);
1987}
1988
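    /*
     * Find the uprobe that owns the breakpoint at bp_vaddr in current->mm.
     * When no uprobe is found, *is_swbp tells the caller what is really
     * there: > 0 if some (foreign) trap instruction is present, 0 if not,
     * and negative if the memory could not be examined at all.
     */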
1989static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1990{
1991        struct mm_struct *mm = current->mm;
1992        struct uprobe *uprobe = NULL;
1993        struct vm_area_struct *vma;
1994
1995        down_read(&mm->mmap_sem);
1996        vma = find_vma(mm, bp_vaddr);
1997        if (vma && vma->vm_start <= bp_vaddr) {
1998                if (valid_vma(vma, false)) {
1999                        struct inode *inode = file_inode(vma->vm_file);
2000                        loff_t offset = vaddr_to_offset(vma, bp_vaddr);
2001
2002                        uprobe = find_uprobe(inode, offset);
2003                }
2004
2005                if (!uprobe)
2006                        *is_swbp = is_trap_at_addr(mm, bp_vaddr);
2007        } else {
2008                *is_swbp = -EFAULT;
2009        }
2010
2011        if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
2012                mmf_recalc_uprobes(mm);
2013        up_read(&mm->mmap_sem);
2014
2015        return uprobe;
2016}
2017
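    /*
     * Illustrative sketch only (the my_* names are not part of this file):
     * a consumer registered elsewhere via uprobe_register() supplies one or
     * both callbacks, roughly:
     *
     *        static int my_handler(struct uprobe_consumer *uc,
     *                              struct pt_regs *regs)
     *        {
     *                return 0;                // 0 keeps the breakpoint
     *        }
     *
     *        static int my_ret_handler(struct uprobe_consumer *uc,
     *                                  unsigned long func,
     *                                  struct pt_regs *regs)
     *        {
     *                return 0;
     *        }
     *
     *        static struct uprobe_consumer uc = {
     *                .handler        = my_handler,
     *                .ret_handler    = my_ret_handler,
     *        };
     *
     * handler_chain() below runs ->handler for every consumer; if any of
     * them has a ->ret_handler, prepare_uretprobe() arms the return probe.
     * The breakpoint is unapplied from this mm only if every consumer's
     * handler returned UPROBE_HANDLER_REMOVE.
     */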
2018static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
2019{
2020        struct uprobe_consumer *uc;
2021        int remove = UPROBE_HANDLER_REMOVE;
2022        bool need_prep = false; /* prepare return uprobe, when needed */
2023
2024        down_read(&uprobe->register_rwsem);
2025        for (uc = uprobe->consumers; uc; uc = uc->next) {
2026                int rc = 0;
2027
2028                if (uc->handler) {
2029                        rc = uc->handler(uc, regs);
2030                        WARN(rc & ~UPROBE_HANDLER_MASK,
2031                                "bad rc=0x%x from %pf()\n", rc, uc->handler);
2032                }
2033
2034                if (uc->ret_handler)
2035                        need_prep = true;
2036
2037                remove &= rc;
2038        }
2039
2040        if (need_prep && !remove)
2041                prepare_uretprobe(uprobe, regs); /* put bp at return */
2042
2043        if (remove && uprobe->consumers) {
2044                WARN_ON(!uprobe_is_active(uprobe));
2045                unapply_uprobe(uprobe, current->mm);
2046        }
2047        up_read(&uprobe->register_rwsem);
2048}
2049
2050static void
2051handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
2052{
2053        struct uprobe *uprobe = ri->uprobe;
2054        struct uprobe_consumer *uc;
2055
2056        down_read(&uprobe->register_rwsem);
2057        for (uc = uprobe->consumers; uc; uc = uc->next) {
2058                if (uc->ret_handler)
2059                        uc->ret_handler(uc, ri->func, regs);
2060        }
2061        up_read(&uprobe->register_rwsem);
2062}
2063
2064static struct return_instance *find_next_ret_chain(struct return_instance *ri)
2065{
2066        bool chained;
2067
2068        do {
2069                chained = ri->chained;
2070                ri = ri->next;  /* can't be NULL if chained */
2071        } while (chained);
2072
2073        return ri;
2074}
2075
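    /*
     * The probed task has "returned" into the trampoline: pop the
     * return_instances stack, run the ret_handlers for every frame that is
     * still alive, and restore the original return address so execution
     * continues in the real caller.  Frames invalidated by longjmp() are
     * freed without running their handlers.
     */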
2076static void handle_trampoline(struct pt_regs *regs)
2077{
2078        struct uprobe_task *utask;
2079        struct return_instance *ri, *next;
2080        bool valid;
2081
2082        utask = current->utask;
2083        if (!utask)
2084                goto sigill;
2085
2086        ri = utask->return_instances;
2087        if (!ri)
2088                goto sigill;
2089
2090        do {
2091                /*
2092                 * We should throw out the frames invalidated by longjmp().
2093                 * If this chain is valid, then the next one should be alive
2094                 * or NULL; the latter case means that nobody but ri->func
2095                 * could hit this trampoline on return. TODO: sigaltstack().
2096                 */
2097                next = find_next_ret_chain(ri);
2098                valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
2099
2100                instruction_pointer_set(regs, ri->orig_ret_vaddr);
2101                do {
2102                        if (valid)
2103                                handle_uretprobe_chain(ri, regs);
2104                        ri = free_ret_instance(ri);
2105                        utask->depth--;
2106                } while (ri != next);
2107        } while (!valid);
2108
2109        utask->return_instances = ri;
2110        return;
2111
2112 sigill:
2113        uprobe_warn(current, "handle uretprobe, sending SIGILL.");
2114        force_sig(SIGILL, current);
2115
2116}
2117
2118bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
2119{
2120        return false;
2121}
2122
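    /*
     * The default assumes every recorded return frame is still live.  An
     * architecture may override this (typically by comparing ret->stack with
     * the current stack pointer) so that frames skipped over by longjmp()
     * can be detected and discarded.
     */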
2123bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
2124                                        struct pt_regs *regs)
2125{
2126        return true;
2127}
2128
2129/*
2130 * Run handler and ask thread to singlestep.
2131 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
2132 */
2133static void handle_swbp(struct pt_regs *regs)
2134{
2135        struct uprobe *uprobe;
2136        unsigned long bp_vaddr;
2137        int uninitialized_var(is_swbp);
2138
2139        bp_vaddr = uprobe_get_swbp_addr(regs);
2140        if (bp_vaddr == get_trampoline_vaddr())
2141                return handle_trampoline(regs);
2142
2143        uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
2144        if (!uprobe) {
2145                if (is_swbp > 0) {
2146                        /* No matching uprobe; signal SIGTRAP. */
2147                        send_sig(SIGTRAP, current, 0);
2148                } else {
2149                        /*
2150                         * Either we raced with uprobe_unregister() or we can't
2151                         * access this memory. The latter is only possible if
2152                         * another thread plays with our ->mm. In both cases
2153                         * we can simply restart. If this vma was unmapped we
2154                         * can pretend this insn was not executed yet and get
2155                         * the (correct) SIGSEGV after restart.
2156                         */
2157                        instruction_pointer_set(regs, bp_vaddr);
2158                }
2159                return;
2160        }
2161
2162        /* change it in advance for ->handler() and restart */
2163        instruction_pointer_set(regs, bp_vaddr);
2164
2165        /*
2166         * TODO: move copy_insn/etc into _register and remove this hack.
2167         * After we hit the bp, _unregister + _register can install the
2168         * new and not-yet-analyzed uprobe at the same address, restart.
2169         */
2170        smp_rmb(); /* pairs with wmb() in install_breakpoint() */
2171        if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
2172                goto out;
2173
2174        /* Tracing handlers use ->utask to communicate with fetch methods */
2175        if (!get_utask())
2176                goto out;
2177
2178        if (arch_uprobe_ignore(&uprobe->arch, regs))
2179                goto out;
2180
2181        handler_chain(uprobe, regs);
2182
2183        if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
2184                goto out;
2185
2186        if (!pre_ssout(uprobe, regs, bp_vaddr))
2187                return;
2188
2189        /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
2190out:
2191        put_uprobe(uprobe);
2192}
2193
2194/*
2195 * Perform required fix-ups and disable singlestep.
2196 * Allow pending signals to take effect.
2197 */
2198static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
2199{
2200        struct uprobe *uprobe;
2201        int err = 0;
2202
2203        uprobe = utask->active_uprobe;
2204        if (utask->state == UTASK_SSTEP_ACK)
2205                err = arch_uprobe_post_xol(&uprobe->arch, regs);
2206        else if (utask->state == UTASK_SSTEP_TRAPPED)
2207                arch_uprobe_abort_xol(&uprobe->arch, regs);
2208        else
2209                WARN_ON_ONCE(1);
2210
2211        put_uprobe(uprobe);
2212        utask->active_uprobe = NULL;
2213        utask->state = UTASK_RUNNING;
2214        xol_free_insn_slot(current);
2215
2216        spin_lock_irq(&current->sighand->siglock);
2217        recalc_sigpending(); /* see uprobe_deny_signal() */
2218        spin_unlock_irq(&current->sighand->siglock);
2219
2220        if (unlikely(err)) {
2221                uprobe_warn(current, "execute the probed insn, sending SIGILL.");
2222                force_sig(SIGILL, current);
2223        }
2224}
2225
2226/*
2227 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
2228 * allows the thread to return from interrupt. After that handle_swbp()
2229 * sets utask->active_uprobe.
2230 *
2231 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
2232 * and allows the thread to return from interrupt.
2233 *
2234 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
2235 * uprobe_notify_resume().
2236 */
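    /*
     * Rough per-hit sequence (a hedged sketch; the exact call sites are
     * architecture specific):
     *
     *        breakpoint trap
     *          -> arch_uprobe_exception_notify() -> uprobe_pre_sstep_notifier()
     *        return to userspace with TIF_UPROBE set
     *          -> uprobe_notify_resume() -> handle_swbp() -> pre_ssout()
     *        single-step trap on the XOL copy
     *          -> arch_uprobe_exception_notify() -> uprobe_post_sstep_notifier()
     *        return to userspace with TIF_UPROBE set
     *          -> uprobe_notify_resume() -> handle_singlestep()
     */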
2237void uprobe_notify_resume(struct pt_regs *regs)
2238{
2239        struct uprobe_task *utask;
2240
2241        clear_thread_flag(TIF_UPROBE);
2242
2243        utask = current->utask;
2244        if (utask && utask->active_uprobe)
2245                handle_singlestep(utask, regs);
2246        else
2247                handle_swbp(regs);
2248}
2249
2250/*
2251 * uprobe_pre_sstep_notifier gets called from interrupt context as part of
2252 * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
2253 */
2254int uprobe_pre_sstep_notifier(struct pt_regs *regs)
2255{
2256        if (!current->mm)
2257                return 0;
2258
2259        if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
2260            (!current->utask || !current->utask->return_instances))
2261                return 0;
2262
2263        set_thread_flag(TIF_UPROBE);
2264        return 1;
2265}
2266
2267/*
2268 * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
2269 * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
2270 */
2271int uprobe_post_sstep_notifier(struct pt_regs *regs)
2272{
2273        struct uprobe_task *utask = current->utask;
2274
2275        if (!current->mm || !utask || !utask->active_uprobe)
2276                /* task is currently not uprobed */
2277                return 0;
2278
2279        utask->state = UTASK_SSTEP_ACK;
2280        set_thread_flag(TIF_UPROBE);
2281        return 1;
2282}
2283
2284static struct notifier_block uprobe_exception_nb = {
2285        .notifier_call          = arch_uprobe_exception_notify,
2286        .priority               = INT_MAX-1,    /* notified after kprobes, kgdb */
2287};
2288
2289static int __init init_uprobes(void)
2290{
2291        int i;
2292
2293        for (i = 0; i < UPROBES_HASH_SZ; i++)
2294                mutex_init(&uprobes_mmap_mutex[i]);
2295
2296        if (percpu_init_rwsem(&dup_mmap_sem))
2297                return -ENOMEM;
2298
2299        return register_die_notifier(&uprobe_exception_nb);
2300}
2301__initcall(init_uprobes);
2302