linux/kernel/events/uprobes.c
   1/*
   2 * User-space Probes (UProbes)
   3 *
   4 * This program is free software; you can redistribute it and/or modify
   5 * it under the terms of the GNU General Public License as published by
   6 * the Free Software Foundation; either version 2 of the License, or
   7 * (at your option) any later version.
   8 *
   9 * This program is distributed in the hope that it will be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write to the Free Software
  16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17 *
  18 * Copyright (C) IBM Corporation, 2008-2012
  19 * Authors:
  20 *      Srikar Dronamraju
  21 *      Jim Keniston
  22 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  23 */
  24
  25#include <linux/kernel.h>
  26#include <linux/highmem.h>
  27#include <linux/pagemap.h>      /* read_mapping_page */
  28#include <linux/slab.h>
  29#include <linux/sched.h>
  30#include <linux/export.h>
  31#include <linux/rmap.h>         /* anon_vma_prepare */
  32#include <linux/mmu_notifier.h> /* set_pte_at_notify */
  33#include <linux/swap.h>         /* try_to_free_swap */
  34#include <linux/ptrace.h>       /* user_enable_single_step */
  35#include <linux/kdebug.h>       /* notifier mechanism */
  36#include "../../mm/internal.h"  /* munlock_vma_page */
  37#include <linux/percpu-rwsem.h>
  38
  39#include <linux/uprobes.h>
  40
  41#define UINSNS_PER_PAGE                 (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
  42#define MAX_UPROBE_XOL_SLOTS            UINSNS_PER_PAGE
  43
  44static struct rb_root uprobes_tree = RB_ROOT;
  45/*
   46 * Allows us to skip uprobe_mmap() if there are no uprobe events active
   47 * at this time.  Probably a fine-grained per-inode count would be better?
  48 */
  49#define no_uprobe_events()      RB_EMPTY_ROOT(&uprobes_tree)
  50
  51static DEFINE_SPINLOCK(uprobes_treelock);       /* serialize rbtree access */
  52
  53#define UPROBES_HASH_SZ 13
  54/* serialize uprobe->pending_list */
  55static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
  56#define uprobes_mmap_hash(v)    (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
  57
  58static struct percpu_rw_semaphore dup_mmap_sem;
  59
   60/* Have a copy of the original instruction */
  61#define UPROBE_COPY_INSN        0
  62/* Can skip singlestep */
  63#define UPROBE_SKIP_SSTEP       1
  64
  65struct uprobe {
  66        struct rb_node          rb_node;        /* node in the rb tree */
  67        atomic_t                ref;
  68        struct rw_semaphore     register_rwsem;
  69        struct rw_semaphore     consumer_rwsem;
  70        struct list_head        pending_list;
  71        struct uprobe_consumer  *consumers;
  72        struct inode            *inode;         /* Also hold a ref to inode */
  73        loff_t                  offset;
  74        unsigned long           flags;
  75        struct arch_uprobe      arch;
  76};
  77
  78struct return_instance {
  79        struct uprobe           *uprobe;
  80        unsigned long           func;
  81        unsigned long           orig_ret_vaddr; /* original return address */
   82        bool                    chained;        /* true if instance is nested */
  83
  84        struct return_instance  *next;          /* keep as stack */
  85};
  86
  87/*
  88 * valid_vma: Verify if the specified vma is an executable vma
  89 * Relax restrictions while unregistering: vm_flags might have
  90 * changed after breakpoint was inserted.
  91 *      - is_register: indicates if we are in register context.
   92 *      - Return true if the specified vma is an executable,
   93 *        file-backed vma.
  94 */
  95static bool valid_vma(struct vm_area_struct *vma, bool is_register)
  96{
  97        vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED;
  98
  99        if (is_register)
 100                flags |= VM_WRITE;
 101
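        /*
         * Require VM_MAYEXEC and reject VM_HUGETLB/VM_SHARED (plus VM_WRITE
         * when registering): the masked comparison below enforces both.
         */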
 102        return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
 103}
 104
 105static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
 106{
 107        return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 108}
 109
 110static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
 111{
 112        return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
 113}
 114
 115/**
 116 * __replace_page - replace page in vma by new page.
 117 * based on replace_page in mm/ksm.c
 118 *
 119 * @vma:      vma that holds the pte pointing to page
 120 * @addr:     address the old @page is mapped at
  121 * @page:     the COWed page we are replacing by @kpage
  122 * @kpage:    the modified page that replaces @page
  123 *
  124 * Returns 0 on success, -EAGAIN on failure.
 125 */
 126static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 127                                struct page *page, struct page *kpage)
 128{
 129        struct mm_struct *mm = vma->vm_mm;
 130        spinlock_t *ptl;
 131        pte_t *ptep;
 132        int err;
 133        /* For mmu_notifiers */
 134        const unsigned long mmun_start = addr;
 135        const unsigned long mmun_end   = addr + PAGE_SIZE;
 136
 137        /* For try_to_free_swap() and munlock_vma_page() below */
 138        lock_page(page);
 139
 140        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 141        err = -EAGAIN;
 142        ptep = page_check_address(page, mm, addr, &ptl, 0);
 143        if (!ptep)
 144                goto unlock;
 145
 146        get_page(kpage);
 147        page_add_new_anon_rmap(kpage, vma, addr);
 148
 149        if (!PageAnon(page)) {
 150                dec_mm_counter(mm, MM_FILEPAGES);
 151                inc_mm_counter(mm, MM_ANONPAGES);
 152        }
 153
 154        flush_cache_page(vma, addr, pte_pfn(*ptep));
 155        ptep_clear_flush(vma, addr, ptep);
 156        set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
 157
 158        page_remove_rmap(page);
 159        if (!page_mapped(page))
 160                try_to_free_swap(page);
 161        pte_unmap_unlock(ptep, ptl);
 162
 163        if (vma->vm_flags & VM_LOCKED)
 164                munlock_vma_page(page);
 165        put_page(page);
 166
 167        err = 0;
 168 unlock:
 169        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 170        unlock_page(page);
 171        return err;
 172}
 173
 174/**
 175 * is_swbp_insn - check if instruction is breakpoint instruction.
 176 * @insn: instruction to be checked.
 177 * Default implementation of is_swbp_insn
 178 * Returns true if @insn is a breakpoint instruction.
 179 */
 180bool __weak is_swbp_insn(uprobe_opcode_t *insn)
 181{
 182        return *insn == UPROBE_SWBP_INSN;
 183}
 184
 185/**
  186 * is_trap_insn - check if instruction is a trap instruction.
  187 * @insn: instruction to be checked.
  188 * Default implementation of is_trap_insn
  189 * Returns true if @insn is a trap instruction.
 190 *
 191 * This function is needed for the case where an architecture has multiple
 192 * trap instructions (like powerpc).
 193 */
 194bool __weak is_trap_insn(uprobe_opcode_t *insn)
 195{
 196        return is_swbp_insn(insn);
 197}
 198
 199static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
 200{
 201        void *kaddr = kmap_atomic(page);
 202        memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
 203        kunmap_atomic(kaddr);
 204}
 205
 206static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
 207{
 208        void *kaddr = kmap_atomic(page);
 209        memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
 210        kunmap_atomic(kaddr);
 211}
 212
 213static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
 214{
 215        uprobe_opcode_t old_opcode;
 216        bool is_swbp;
 217
 218        /*
 219         * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
 220         * We do not check if it is any other 'trap variant' which could
 221         * be conditional trap instruction such as the one powerpc supports.
 222         *
 223         * The logic is that we do not care if the underlying instruction
 224         * is a trap variant; uprobes always wins over any other (gdb)
 225         * breakpoint.
 226         */
 227        copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
 228        is_swbp = is_swbp_insn(&old_opcode);
 229
 230        if (is_swbp_insn(new_opcode)) {
 231                if (is_swbp)            /* register: already installed? */
 232                        return 0;
 233        } else {
 234                if (!is_swbp)           /* unregister: was it changed by us? */
 235                        return 0;
 236        }
 237
 238        return 1;
 239}
 240
 241/*
 242 * NOTE:
 243 * Expect the breakpoint instruction to be the smallest size instruction for
 244 * the architecture. If an arch has variable length instruction and the
 245 * breakpoint instruction is not of the smallest length instruction
 246 * supported by that architecture then we need to modify is_trap_at_addr and
 247 * write_opcode accordingly. This would never be a problem for archs that
 248 * have fixed length instructions.
 249 */
 250
 251/*
 252 * write_opcode - write the opcode at a given virtual address.
 253 * @mm: the probed process address space.
 254 * @vaddr: the virtual address to store the opcode.
 255 * @opcode: opcode to be written at @vaddr.
 256 *
 257 * Called with mm->mmap_sem held (for read and with a reference to
 258 * mm).
 259 *
 260 * For mm @mm, write the opcode at @vaddr.
 261 * Return 0 (success) or a negative errno.
 262 */
 263static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
 264                        uprobe_opcode_t opcode)
 265{
 266        struct page *old_page, *new_page;
 267        struct vm_area_struct *vma;
 268        int ret;
 269
 270retry:
 271        /* Read the page with vaddr into memory */
 272        ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
 273        if (ret <= 0)
 274                return ret;
 275
 276        ret = verify_opcode(old_page, vaddr, &opcode);
 277        if (ret <= 0)
 278                goto put_old;
 279
 280        ret = -ENOMEM;
 281        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
 282        if (!new_page)
 283                goto put_old;
 284
 285        __SetPageUptodate(new_page);
 286
 287        copy_highpage(new_page, old_page);
 288        copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
 289
 290        ret = anon_vma_prepare(vma);
 291        if (ret)
 292                goto put_new;
 293
 294        ret = __replace_page(vma, vaddr, old_page, new_page);
 295
 296put_new:
 297        page_cache_release(new_page);
 298put_old:
 299        put_page(old_page);
 300
 301        if (unlikely(ret == -EAGAIN))
 302                goto retry;
 303        return ret;
 304}
 305
 306/**
 307 * set_swbp - store breakpoint at a given address.
 308 * @auprobe: arch specific probepoint information.
 309 * @mm: the probed process address space.
 310 * @vaddr: the virtual address to insert the opcode.
 311 *
 312 * For mm @mm, store the breakpoint instruction at @vaddr.
 313 * Return 0 (success) or a negative errno.
 314 */
 315int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 316{
 317        return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
 318}
 319
 320/**
 321 * set_orig_insn - Restore the original instruction.
 322 * @mm: the probed process address space.
 323 * @auprobe: arch specific probepoint information.
 324 * @vaddr: the virtual address to insert the opcode.
 325 *
  326 * For mm @mm, restore the original instruction at @vaddr.
 327 * Return 0 (success) or a negative errno.
 328 */
 329int __weak
 330set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 331{
 332        return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
 333}
 334
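/* rb_tree ordering: compare two uprobes by inode first, then by offset */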
 335static int match_uprobe(struct uprobe *l, struct uprobe *r)
 336{
 337        if (l->inode < r->inode)
 338                return -1;
 339
 340        if (l->inode > r->inode)
 341                return 1;
 342
 343        if (l->offset < r->offset)
 344                return -1;
 345
 346        if (l->offset > r->offset)
 347                return 1;
 348
 349        return 0;
 350}
 351
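/*
 * Find a uprobe matching inode:offset in uprobes_tree and take a reference.
 * Caller must hold uprobes_treelock.
 */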
 352static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
 353{
 354        struct uprobe u = { .inode = inode, .offset = offset };
 355        struct rb_node *n = uprobes_tree.rb_node;
 356        struct uprobe *uprobe;
 357        int match;
 358
 359        while (n) {
 360                uprobe = rb_entry(n, struct uprobe, rb_node);
 361                match = match_uprobe(&u, uprobe);
 362                if (!match) {
 363                        atomic_inc(&uprobe->ref);
 364                        return uprobe;
 365                }
 366
 367                if (match < 0)
 368                        n = n->rb_left;
 369                else
 370                        n = n->rb_right;
 371        }
 372        return NULL;
 373}
 374
 375/*
 376 * Find a uprobe corresponding to a given inode:offset
 377 * Acquires uprobes_treelock
 378 */
 379static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
 380{
 381        struct uprobe *uprobe;
 382
 383        spin_lock(&uprobes_treelock);
 384        uprobe = __find_uprobe(inode, offset);
 385        spin_unlock(&uprobes_treelock);
 386
 387        return uprobe;
 388}
 389
 390static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
 391{
 392        struct rb_node **p = &uprobes_tree.rb_node;
 393        struct rb_node *parent = NULL;
 394        struct uprobe *u;
 395        int match;
 396
 397        while (*p) {
 398                parent = *p;
 399                u = rb_entry(parent, struct uprobe, rb_node);
 400                match = match_uprobe(uprobe, u);
 401                if (!match) {
 402                        atomic_inc(&u->ref);
 403                        return u;
 404                }
 405
 406                if (match < 0)
 407                        p = &parent->rb_left;
 408                else
 409                        p = &parent->rb_right;
 410
 411        }
 412
 413        u = NULL;
 414        rb_link_node(&uprobe->rb_node, parent, p);
 415        rb_insert_color(&uprobe->rb_node, &uprobes_tree);
 416        /* get access + creation ref */
 417        atomic_set(&uprobe->ref, 2);
 418
 419        return u;
 420}
 421
 422/*
 423 * Acquire uprobes_treelock.
 424 * Matching uprobe already exists in rbtree;
 425 *      increment (access refcount) and return the matching uprobe.
 426 *
 427 * No matching uprobe; insert the uprobe in rb_tree;
 428 *      get a double refcount (access + creation) and return NULL.
 429 */
 430static struct uprobe *insert_uprobe(struct uprobe *uprobe)
 431{
 432        struct uprobe *u;
 433
 434        spin_lock(&uprobes_treelock);
 435        u = __insert_uprobe(uprobe);
 436        spin_unlock(&uprobes_treelock);
 437
 438        return u;
 439}
 440
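/* Drop a reference; free the uprobe when the last reference goes away */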
 441static void put_uprobe(struct uprobe *uprobe)
 442{
 443        if (atomic_dec_and_test(&uprobe->ref))
 444                kfree(uprobe);
 445}
 446
 447static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
 448{
 449        struct uprobe *uprobe, *cur_uprobe;
 450
 451        uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
 452        if (!uprobe)
 453                return NULL;
 454
 455        uprobe->inode = igrab(inode);
 456        uprobe->offset = offset;
 457        init_rwsem(&uprobe->register_rwsem);
 458        init_rwsem(&uprobe->consumer_rwsem);
 459        /* For now assume that the instruction need not be single-stepped */
 460        __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
 461
 462        /* add to uprobes_tree, sorted on inode:offset */
 463        cur_uprobe = insert_uprobe(uprobe);
 464
 465        /* a uprobe exists for this inode:offset combination */
 466        if (cur_uprobe) {
 467                kfree(uprobe);
 468                uprobe = cur_uprobe;
 469                iput(inode);
 470        }
 471
 472        return uprobe;
 473}
 474
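/* For uprobe @uprobe, add the consumer @uc to the head of the consumer list */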
 475static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
 476{
 477        down_write(&uprobe->consumer_rwsem);
 478        uc->next = uprobe->consumers;
 479        uprobe->consumers = uc;
 480        up_write(&uprobe->consumer_rwsem);
 481}
 482
 483/*
 484 * For uprobe @uprobe, delete the consumer @uc.
  485 * Return true if @uc is deleted successfully,
  486 * otherwise return false.
 487 */
 488static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
 489{
 490        struct uprobe_consumer **con;
 491        bool ret = false;
 492
 493        down_write(&uprobe->consumer_rwsem);
 494        for (con = &uprobe->consumers; *con; con = &(*con)->next) {
 495                if (*con == uc) {
 496                        *con = uc->next;
 497                        ret = true;
 498                        break;
 499                }
 500        }
 501        up_write(&uprobe->consumer_rwsem);
 502
 503        return ret;
 504}
 505
 506static int
 507__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
 508                        unsigned long nbytes, loff_t offset)
 509{
 510        struct page *page;
 511
 512        if (!mapping->a_ops->readpage)
 513                return -EIO;
 514        /*
 515         * Ensure that the page that has the original instruction is
 516         * populated and in page-cache.
 517         */
 518        page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
 519        if (IS_ERR(page))
 520                return PTR_ERR(page);
 521
 522        copy_from_page(page, offset, insn, nbytes);
 523        page_cache_release(page);
 524
 525        return 0;
 526}
 527
 528static int copy_insn(struct uprobe *uprobe, struct file *filp)
 529{
 530        struct address_space *mapping;
 531        unsigned long nbytes;
 532        int bytes;
 533
 534        nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
 535        mapping = uprobe->inode->i_mapping;
 536
 537        /* Instruction at end of binary; copy only available bytes */
 538        if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
 539                bytes = uprobe->inode->i_size - uprobe->offset;
 540        else
 541                bytes = MAX_UINSN_BYTES;
 542
 543        /* Instruction at the page-boundary; copy bytes in second page */
 544        if (nbytes < bytes) {
 545                int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
 546                                bytes - nbytes, uprobe->offset + nbytes);
 547                if (err)
 548                        return err;
 549                bytes = nbytes;
 550        }
 551        return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
 552}
 553
 554static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
 555                                struct mm_struct *mm, unsigned long vaddr)
 556{
 557        int ret = 0;
 558
 559        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
 560                return ret;
 561
 562        /* TODO: move this into _register, until then we abuse this sem. */
 563        down_write(&uprobe->consumer_rwsem);
 564        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
 565                goto out;
 566
 567        ret = copy_insn(uprobe, file);
 568        if (ret)
 569                goto out;
 570
 571        ret = -ENOTSUPP;
 572        if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
 573                goto out;
 574
 575        ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
 576        if (ret)
 577                goto out;
 578
 579        /* write_opcode() assumes we don't cross page boundary */
 580        BUG_ON((uprobe->offset & ~PAGE_MASK) +
 581                        UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
 582
 583        smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
 584        set_bit(UPROBE_COPY_INSN, &uprobe->flags);
 585
 586 out:
 587        up_write(&uprobe->consumer_rwsem);
 588
 589        return ret;
 590}
 591
 592static inline bool consumer_filter(struct uprobe_consumer *uc,
 593                                   enum uprobe_filter_ctx ctx, struct mm_struct *mm)
 594{
 595        return !uc->filter || uc->filter(uc, ctx, mm);
 596}
 597
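/*
 * Return true if at least one consumer wants to trace @mm in context @ctx;
 * a consumer without a ->filter callback always matches.
 */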
 598static bool filter_chain(struct uprobe *uprobe,
 599                         enum uprobe_filter_ctx ctx, struct mm_struct *mm)
 600{
 601        struct uprobe_consumer *uc;
 602        bool ret = false;
 603
 604        down_read(&uprobe->consumer_rwsem);
 605        for (uc = uprobe->consumers; uc; uc = uc->next) {
 606                ret = consumer_filter(uc, ctx, mm);
 607                if (ret)
 608                        break;
 609        }
 610        up_read(&uprobe->consumer_rwsem);
 611
 612        return ret;
 613}
 614
 615static int
 616install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
 617                        struct vm_area_struct *vma, unsigned long vaddr)
 618{
 619        bool first_uprobe;
 620        int ret;
 621
 622        ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
 623        if (ret)
 624                return ret;
 625
 626        /*
 627         * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
 628         * the task can hit this breakpoint right after __replace_page().
 629         */
 630        first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
 631        if (first_uprobe)
 632                set_bit(MMF_HAS_UPROBES, &mm->flags);
 633
 634        ret = set_swbp(&uprobe->arch, mm, vaddr);
 635        if (!ret)
 636                clear_bit(MMF_RECALC_UPROBES, &mm->flags);
 637        else if (first_uprobe)
 638                clear_bit(MMF_HAS_UPROBES, &mm->flags);
 639
 640        return ret;
 641}
 642
 643static int
 644remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
 645{
 646        set_bit(MMF_RECALC_UPROBES, &mm->flags);
 647        return set_orig_insn(&uprobe->arch, mm, vaddr);
 648}
 649
 650static inline bool uprobe_is_active(struct uprobe *uprobe)
 651{
 652        return !RB_EMPTY_NODE(&uprobe->rb_node);
 653}
 654/*
 655 * There could be threads that have already hit the breakpoint. They
 656 * will recheck the current insn and restart if find_uprobe() fails.
 657 * See find_active_uprobe().
 658 */
 659static void delete_uprobe(struct uprobe *uprobe)
 660{
 661        if (WARN_ON(!uprobe_is_active(uprobe)))
 662                return;
 663
 664        spin_lock(&uprobes_treelock);
 665        rb_erase(&uprobe->rb_node, &uprobes_tree);
 666        spin_unlock(&uprobes_treelock);
 667        RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
 668        iput(uprobe->inode);
 669        put_uprobe(uprobe);
 670}
 671
 672struct map_info {
 673        struct map_info *next;
 674        struct mm_struct *mm;
 675        unsigned long vaddr;
 676};
 677
 678static inline struct map_info *free_map_info(struct map_info *info)
 679{
 680        struct map_info *next = info->next;
 681        kfree(info);
 682        return next;
 683}
 684
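/*
 * Collect an (mm, vaddr) pair for every mm that maps @offset of @mapping
 * through a vma acceptable to valid_vma().  Allocations under i_mmap_mutex
 * must be GFP_NOWAIT; if we run out, drop the lock and the references taken
 * so far, preallocate with GFP_KERNEL and retry the walk.
 */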
 685static struct map_info *
 686build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 687{
 688        unsigned long pgoff = offset >> PAGE_SHIFT;
 689        struct vm_area_struct *vma;
 690        struct map_info *curr = NULL;
 691        struct map_info *prev = NULL;
 692        struct map_info *info;
 693        int more = 0;
 694
 695 again:
 696        mutex_lock(&mapping->i_mmap_mutex);
 697        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 698                if (!valid_vma(vma, is_register))
 699                        continue;
 700
 701                if (!prev && !more) {
 702                        /*
 703                         * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
 704                         * reclaim. This is optimistic, no harm done if it fails.
 705                         */
 706                        prev = kmalloc(sizeof(struct map_info),
 707                                        GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
 708                        if (prev)
 709                                prev->next = NULL;
 710                }
 711                if (!prev) {
 712                        more++;
 713                        continue;
 714                }
 715
 716                if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
 717                        continue;
 718
 719                info = prev;
 720                prev = prev->next;
 721                info->next = curr;
 722                curr = info;
 723
 724                info->mm = vma->vm_mm;
 725                info->vaddr = offset_to_vaddr(vma, offset);
 726        }
 727        mutex_unlock(&mapping->i_mmap_mutex);
 728
 729        if (!more)
 730                goto out;
 731
 732        prev = curr;
 733        while (curr) {
 734                mmput(curr->mm);
 735                curr = curr->next;
 736        }
 737
 738        do {
 739                info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
 740                if (!info) {
 741                        curr = ERR_PTR(-ENOMEM);
 742                        goto out;
 743                }
 744                info->next = prev;
 745                prev = info;
 746        } while (--more);
 747
 748        goto again;
 749 out:
 750        while (prev)
 751                prev = free_map_info(prev);
 752        return curr;
 753}
 754
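/*
 * For every mm that currently maps uprobe->inode:offset, install the
 * breakpoint (@new != NULL) or remove it (@new == NULL), consulting the
 * consumer filters.  dup_mmap_sem serializes this against dup_mmap().
 */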
 755static int
 756register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
 757{
 758        bool is_register = !!new;
 759        struct map_info *info;
 760        int err = 0;
 761
 762        percpu_down_write(&dup_mmap_sem);
 763        info = build_map_info(uprobe->inode->i_mapping,
 764                                        uprobe->offset, is_register);
 765        if (IS_ERR(info)) {
 766                err = PTR_ERR(info);
 767                goto out;
 768        }
 769
 770        while (info) {
 771                struct mm_struct *mm = info->mm;
 772                struct vm_area_struct *vma;
 773
 774                if (err && is_register)
 775                        goto free;
 776
 777                down_write(&mm->mmap_sem);
 778                vma = find_vma(mm, info->vaddr);
 779                if (!vma || !valid_vma(vma, is_register) ||
 780                    file_inode(vma->vm_file) != uprobe->inode)
 781                        goto unlock;
 782
 783                if (vma->vm_start > info->vaddr ||
 784                    vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
 785                        goto unlock;
 786
 787                if (is_register) {
 788                        /* consult only the "caller", new consumer. */
 789                        if (consumer_filter(new,
 790                                        UPROBE_FILTER_REGISTER, mm))
 791                                err = install_breakpoint(uprobe, mm, vma, info->vaddr);
 792                } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
 793                        if (!filter_chain(uprobe,
 794                                        UPROBE_FILTER_UNREGISTER, mm))
 795                                err |= remove_breakpoint(uprobe, mm, info->vaddr);
 796                }
 797
 798 unlock:
 799                up_write(&mm->mmap_sem);
 800 free:
 801                mmput(mm);
 802                info = free_map_info(info);
 803        }
 804 out:
 805        percpu_up_write(&dup_mmap_sem);
 806        return err;
 807}
 808
 809static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
 810{
 811        consumer_add(uprobe, uc);
 812        return register_for_each_vma(uprobe, uc);
 813}
 814
 815static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
 816{
 817        int err;
 818
 819        if (!consumer_del(uprobe, uc))  /* WARN? */
 820                return;
 821
 822        err = register_for_each_vma(uprobe, NULL);
  823        /* TODO: can't unregister? schedule a worker thread */
 824        if (!uprobe->consumers && !err)
 825                delete_uprobe(uprobe);
 826}
 827
 828/*
 829 * uprobe_register - register a probe
 830 * @inode: the file in which the probe has to be placed.
 831 * @offset: offset from the start of the file.
  832 * @uc: information on how to handle the probe.
  833 *
  834 * Apart from the access refcount, uprobe_register() takes a creation
  835 * refcount (through alloc_uprobe) if and only if this @uprobe is getting
  836 * inserted into the rbtree (i.e. first consumer for an @inode:@offset
  837 * tuple).  The creation refcount stops uprobe_unregister from freeing the
  838 * @uprobe even before the register operation is complete. The creation
  839 * refcount is released when the last @uc for the @uprobe
  840 * unregisters.
  841 *
  842 * Return errno if it cannot successfully install probes,
  843 * else return 0 (success).
 844 */
 845int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
 846{
 847        struct uprobe *uprobe;
 848        int ret;
 849
 850        /* Uprobe must have at least one set consumer */
 851        if (!uc->handler && !uc->ret_handler)
 852                return -EINVAL;
 853
 854        /* Racy, just to catch the obvious mistakes */
 855        if (offset > i_size_read(inode))
 856                return -EINVAL;
 857
 858 retry:
 859        uprobe = alloc_uprobe(inode, offset);
 860        if (!uprobe)
 861                return -ENOMEM;
 862        /*
 863         * We can race with uprobe_unregister()->delete_uprobe().
 864         * Check uprobe_is_active() and retry if it is false.
 865         */
 866        down_write(&uprobe->register_rwsem);
 867        ret = -EAGAIN;
 868        if (likely(uprobe_is_active(uprobe))) {
 869                ret = __uprobe_register(uprobe, uc);
 870                if (ret)
 871                        __uprobe_unregister(uprobe, uc);
 872        }
 873        up_write(&uprobe->register_rwsem);
 874        put_uprobe(uprobe);
 875
 876        if (unlikely(ret == -EAGAIN))
 877                goto retry;
 878        return ret;
 879}
 880EXPORT_SYMBOL_GPL(uprobe_register);
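
/*
 * Illustrative sketch (not part of the kernel source): a minimal consumer
 * for uprobe_register().  The handler signature and return convention follow
 * handler_chain() below: returning 0 keeps the breakpoint installed, while
 * UPROBE_HANDLER_REMOVE asks for its removal.  "my_handler", "my_uc", "inode"
 * and "offset" are placeholders supplied by the caller.
 *
 *	static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
 *	{
 *		pr_info("uprobe hit, ip=%lx\n", instruction_pointer(regs));
 *		return 0;
 *	}
 *
 *	static struct uprobe_consumer my_uc = { .handler = my_handler };
 *
 *	err = uprobe_register(inode, offset, &my_uc);
 *	...
 *	uprobe_unregister(inode, offset, &my_uc);
 */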
 881
 882/*
  883 * uprobe_apply - add or remove the breakpoints of an already registered probe.
  884 * @inode: the file in which the probe resides.
 885 * @offset: offset from the start of the file.
 886 * @uc: consumer which wants to add more or remove some breakpoints
 887 * @add: add or remove the breakpoints
 888 */
 889int uprobe_apply(struct inode *inode, loff_t offset,
 890                        struct uprobe_consumer *uc, bool add)
 891{
 892        struct uprobe *uprobe;
 893        struct uprobe_consumer *con;
 894        int ret = -ENOENT;
 895
 896        uprobe = find_uprobe(inode, offset);
 897        if (!uprobe)
 898                return ret;
 899
 900        down_write(&uprobe->register_rwsem);
 901        for (con = uprobe->consumers; con && con != uc ; con = con->next)
 902                ;
 903        if (con)
 904                ret = register_for_each_vma(uprobe, add ? uc : NULL);
 905        up_write(&uprobe->register_rwsem);
 906        put_uprobe(uprobe);
 907
 908        return ret;
 909}
 910
 911/*
  912 * uprobe_unregister - unregister an already registered probe.
 913 * @inode: the file in which the probe has to be removed.
 914 * @offset: offset from the start of the file.
 915 * @uc: identify which probe if multiple probes are colocated.
 916 */
 917void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
 918{
 919        struct uprobe *uprobe;
 920
 921        uprobe = find_uprobe(inode, offset);
 922        if (!uprobe)
 923                return;
 924
 925        down_write(&uprobe->register_rwsem);
 926        __uprobe_unregister(uprobe, uc);
 927        up_write(&uprobe->register_rwsem);
 928        put_uprobe(uprobe);
 929}
 930EXPORT_SYMBOL_GPL(uprobe_unregister);
 931
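/*
 * Remove the breakpoints of @uprobe from every vma of @mm that maps
 * uprobe->inode; called from handler_chain() when all consumers asked
 * for removal.
 */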
 932static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
 933{
 934        struct vm_area_struct *vma;
 935        int err = 0;
 936
 937        down_read(&mm->mmap_sem);
 938        for (vma = mm->mmap; vma; vma = vma->vm_next) {
 939                unsigned long vaddr;
 940                loff_t offset;
 941
 942                if (!valid_vma(vma, false) ||
 943                    file_inode(vma->vm_file) != uprobe->inode)
 944                        continue;
 945
 946                offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
 947                if (uprobe->offset <  offset ||
 948                    uprobe->offset >= offset + vma->vm_end - vma->vm_start)
 949                        continue;
 950
 951                vaddr = offset_to_vaddr(vma, uprobe->offset);
 952                err |= remove_breakpoint(uprobe, mm, vaddr);
 953        }
 954        up_read(&mm->mmap_sem);
 955
 956        return err;
 957}
 958
 959static struct rb_node *
 960find_node_in_range(struct inode *inode, loff_t min, loff_t max)
 961{
 962        struct rb_node *n = uprobes_tree.rb_node;
 963
 964        while (n) {
 965                struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
 966
 967                if (inode < u->inode) {
 968                        n = n->rb_left;
 969                } else if (inode > u->inode) {
 970                        n = n->rb_right;
 971                } else {
 972                        if (max < u->offset)
 973                                n = n->rb_left;
 974                        else if (min > u->offset)
 975                                n = n->rb_right;
 976                        else
 977                                break;
 978                }
 979        }
 980
 981        return n;
 982}
 983
 984/*
 985 * For a given range in vma, build a list of probes that need to be inserted.
 986 */
 987static void build_probe_list(struct inode *inode,
 988                                struct vm_area_struct *vma,
 989                                unsigned long start, unsigned long end,
 990                                struct list_head *head)
 991{
 992        loff_t min, max;
 993        struct rb_node *n, *t;
 994        struct uprobe *u;
 995
 996        INIT_LIST_HEAD(head);
 997        min = vaddr_to_offset(vma, start);
 998        max = min + (end - start) - 1;
 999
1000        spin_lock(&uprobes_treelock);
1001        n = find_node_in_range(inode, min, max);
1002        if (n) {
1003                for (t = n; t; t = rb_prev(t)) {
1004                        u = rb_entry(t, struct uprobe, rb_node);
1005                        if (u->inode != inode || u->offset < min)
1006                                break;
1007                        list_add(&u->pending_list, head);
1008                        atomic_inc(&u->ref);
1009                }
1010                for (t = n; (t = rb_next(t)); ) {
1011                        u = rb_entry(t, struct uprobe, rb_node);
1012                        if (u->inode != inode || u->offset > max)
1013                                break;
1014                        list_add(&u->pending_list, head);
1015                        atomic_inc(&u->ref);
1016                }
1017        }
1018        spin_unlock(&uprobes_treelock);
1019}
1020
1021/*
1022 * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
1023 *
1024 * Currently we ignore all errors and always return 0, the callers
1025 * can't handle the failure anyway.
1026 */
1027int uprobe_mmap(struct vm_area_struct *vma)
1028{
1029        struct list_head tmp_list;
1030        struct uprobe *uprobe, *u;
1031        struct inode *inode;
1032
1033        if (no_uprobe_events() || !valid_vma(vma, true))
1034                return 0;
1035
1036        inode = file_inode(vma->vm_file);
1037        if (!inode)
1038                return 0;
1039
1040        mutex_lock(uprobes_mmap_hash(inode));
1041        build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
1042        /*
 1043         * We can race with uprobe_unregister(); this uprobe can already be
 1044         * removed. But in this case filter_chain() must return false, since all
 1045         * consumers have gone away.
1046         */
1047        list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1048                if (!fatal_signal_pending(current) &&
1049                    filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
1050                        unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1051                        install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1052                }
1053                put_uprobe(uprobe);
1054        }
1055        mutex_unlock(uprobes_mmap_hash(inode));
1056
1057        return 0;
1058}
1059
1060static bool
1061vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1062{
1063        loff_t min, max;
1064        struct inode *inode;
1065        struct rb_node *n;
1066
1067        inode = file_inode(vma->vm_file);
1068
1069        min = vaddr_to_offset(vma, start);
1070        max = min + (end - start) - 1;
1071
1072        spin_lock(&uprobes_treelock);
1073        n = find_node_in_range(inode, min, max);
1074        spin_unlock(&uprobes_treelock);
1075
1076        return !!n;
1077}
1078
1079/*
1080 * Called in context of a munmap of a vma.
1081 */
1082void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1083{
1084        if (no_uprobe_events() || !valid_vma(vma, false))
1085                return;
1086
1087        if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
1088                return;
1089
1090        if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
1091             test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
1092                return;
1093
1094        if (vma_has_uprobes(vma, start, end))
1095                set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
1096}
1097
1098/* Slot allocation for XOL */
1099static int xol_add_vma(struct xol_area *area)
1100{
1101        struct mm_struct *mm = current->mm;
1102        int ret = -EALREADY;
1103
1104        down_write(&mm->mmap_sem);
1105        if (mm->uprobes_state.xol_area)
1106                goto fail;
1107
1108        ret = -ENOMEM;
1109        /* Try to map as high as possible, this is only a hint. */
1110        area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
1111        if (area->vaddr & ~PAGE_MASK) {
1112                ret = area->vaddr;
1113                goto fail;
1114        }
1115
1116        ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
1117                                VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
1118        if (ret)
1119                goto fail;
1120
1121        smp_wmb();      /* pairs with get_xol_area() */
1122        mm->uprobes_state.xol_area = area;
1123        ret = 0;
1124 fail:
1125        up_write(&mm->mmap_sem);
1126
1127        return ret;
1128}
1129
1130/*
1131 * get_xol_area - Allocate process's xol_area if necessary.
1132 * This area will be used for storing instructions for execution out of line.
1133 *
1134 * Returns the allocated area or NULL.
1135 */
1136static struct xol_area *get_xol_area(void)
1137{
1138        struct mm_struct *mm = current->mm;
1139        struct xol_area *area;
1140        uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1141
1142        area = mm->uprobes_state.xol_area;
1143        if (area)
1144                goto ret;
1145
1146        area = kzalloc(sizeof(*area), GFP_KERNEL);
1147        if (unlikely(!area))
1148                goto out;
1149
1150        area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
1151        if (!area->bitmap)
1152                goto free_area;
1153
1154        area->page = alloc_page(GFP_HIGHUSER);
1155        if (!area->page)
1156                goto free_bitmap;
1157
1158        /* allocate first slot of task's xol_area for the return probes */
1159        set_bit(0, area->bitmap);
1160        copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1161        atomic_set(&area->slot_count, 1);
1162        init_waitqueue_head(&area->wq);
1163
1164        if (!xol_add_vma(area))
1165                return area;
1166
1167        __free_page(area->page);
1168 free_bitmap:
1169        kfree(area->bitmap);
1170 free_area:
1171        kfree(area);
1172 out:
1173        area = mm->uprobes_state.xol_area;
1174 ret:
1175        smp_read_barrier_depends();     /* pairs with wmb in xol_add_vma() */
1176        return area;
1177}
1178
1179/*
1180 * uprobe_clear_state - Free the area allocated for slots.
1181 */
1182void uprobe_clear_state(struct mm_struct *mm)
1183{
1184        struct xol_area *area = mm->uprobes_state.xol_area;
1185
1186        if (!area)
1187                return;
1188
1189        put_page(area->page);
1190        kfree(area->bitmap);
1191        kfree(area);
1192}
1193
1194void uprobe_start_dup_mmap(void)
1195{
1196        percpu_down_read(&dup_mmap_sem);
1197}
1198
1199void uprobe_end_dup_mmap(void)
1200{
1201        percpu_up_read(&dup_mmap_sem);
1202}
1203
1204void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
1205{
1206        newmm->uprobes_state.xol_area = NULL;
1207
1208        if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
1209                set_bit(MMF_HAS_UPROBES, &newmm->flags);
1210                /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
1211                set_bit(MMF_RECALC_UPROBES, &newmm->flags);
1212        }
1213}
1214
1215/*
 1216 * xol_take_insn_slot - search for a free slot; wait if none is available.
1217 */
1218static unsigned long xol_take_insn_slot(struct xol_area *area)
1219{
1220        unsigned long slot_addr;
1221        int slot_nr;
1222
1223        do {
1224                slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
1225                if (slot_nr < UINSNS_PER_PAGE) {
1226                        if (!test_and_set_bit(slot_nr, area->bitmap))
1227                                break;
1228
1229                        slot_nr = UINSNS_PER_PAGE;
1230                        continue;
1231                }
1232                wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
1233        } while (slot_nr >= UINSNS_PER_PAGE);
1234
1235        slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
1236        atomic_inc(&area->slot_count);
1237
1238        return slot_addr;
1239}
1240
1241/*
1242 * xol_get_insn_slot - allocate a slot for xol.
1243 * Returns the allocated slot address or 0.
1244 */
1245static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1246{
1247        struct xol_area *area;
1248        unsigned long xol_vaddr;
1249
1250        area = get_xol_area();
1251        if (!area)
1252                return 0;
1253
1254        xol_vaddr = xol_take_insn_slot(area);
1255        if (unlikely(!xol_vaddr))
1256                return 0;
1257
1258        /* Initialize the slot */
1259        copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
1260        /*
 1261         * We probably need flush_icache_user_range(), but it needs a vma.
 1262         * flush_dcache_page() should work on the supported architectures.
1263         */
1264        flush_dcache_page(area->page);
1265
1266        return xol_vaddr;
1267}
1268
1269/*
1270 * xol_free_insn_slot - If slot was earlier allocated by
1271 * @xol_get_insn_slot(), make the slot available for
1272 * subsequent requests.
1273 */
1274static void xol_free_insn_slot(struct task_struct *tsk)
1275{
1276        struct xol_area *area;
1277        unsigned long vma_end;
1278        unsigned long slot_addr;
1279
1280        if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
1281                return;
1282
1283        slot_addr = tsk->utask->xol_vaddr;
1284        if (unlikely(!slot_addr))
1285                return;
1286
1287        area = tsk->mm->uprobes_state.xol_area;
1288        vma_end = area->vaddr + PAGE_SIZE;
1289        if (area->vaddr <= slot_addr && slot_addr < vma_end) {
1290                unsigned long offset;
1291                int slot_nr;
1292
1293                offset = slot_addr - area->vaddr;
1294                slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
1295                if (slot_nr >= UINSNS_PER_PAGE)
1296                        return;
1297
1298                clear_bit(slot_nr, area->bitmap);
1299                atomic_dec(&area->slot_count);
1300                if (waitqueue_active(&area->wq))
1301                        wake_up(&area->wq);
1302
1303                tsk->utask->xol_vaddr = 0;
1304        }
1305}
1306
1307/**
1308 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
1309 * @regs: Reflects the saved state of the task after it has hit a breakpoint
1310 * instruction.
1311 * Return the address of the breakpoint instruction.
1312 */
1313unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1314{
1315        return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
1316}
1317
1318/*
1319 * Called with no locks held.
 1320 * Called in context of an exiting or an exec-ing thread.
1321 */
1322void uprobe_free_utask(struct task_struct *t)
1323{
1324        struct uprobe_task *utask = t->utask;
1325        struct return_instance *ri, *tmp;
1326
1327        if (!utask)
1328                return;
1329
1330        if (utask->active_uprobe)
1331                put_uprobe(utask->active_uprobe);
1332
1333        ri = utask->return_instances;
1334        while (ri) {
1335                tmp = ri;
1336                ri = ri->next;
1337
1338                put_uprobe(tmp->uprobe);
1339                kfree(tmp);
1340        }
1341
1342        xol_free_insn_slot(t);
1343        kfree(utask);
1344        t->utask = NULL;
1345}
1346
1347/*
1348 * Called in context of a new clone/fork from copy_process.
1349 */
1350void uprobe_copy_process(struct task_struct *t)
1351{
1352        t->utask = NULL;
1353}
1354
1355/*
 1356 * Allocate a uprobe_task object for the task if necessary.
1357 * Called when the thread hits a breakpoint.
1358 *
1359 * Returns:
1360 * - pointer to new uprobe_task on success
1361 * - NULL otherwise
1362 */
1363static struct uprobe_task *get_utask(void)
1364{
1365        if (!current->utask)
1366                current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1367        return current->utask;
1368}
1369
1370/*
 1371 * The current area->vaddr scheme assumes the trampoline address is always
 1372 * equal to area->vaddr.
1373 *
1374 * Returns -1 in case the xol_area is not allocated.
1375 */
1376static unsigned long get_trampoline_vaddr(void)
1377{
1378        struct xol_area *area;
1379        unsigned long trampoline_vaddr = -1;
1380
1381        area = current->mm->uprobes_state.xol_area;
1382        smp_read_barrier_depends();
1383        if (area)
1384                trampoline_vaddr = area->vaddr;
1385
1386        return trampoline_vaddr;
1387}
1388
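/*
 * Hijack the return address of the probed function so that it points at the
 * XOL trampoline, and push a return_instance onto the per-task stack so that
 * the ->ret_handler consumers can run when the function returns.
 */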
1389static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
1390{
1391        struct return_instance *ri;
1392        struct uprobe_task *utask;
1393        unsigned long orig_ret_vaddr, trampoline_vaddr;
1394        bool chained = false;
1395
1396        if (!get_xol_area())
1397                return;
1398
1399        utask = get_utask();
1400        if (!utask)
1401                return;
1402
1403        if (utask->depth >= MAX_URETPROBE_DEPTH) {
1404                printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
1405                                " nestedness limit pid/tgid=%d/%d\n",
1406                                current->pid, current->tgid);
1407                return;
1408        }
1409
1410        ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
1411        if (!ri)
1412                goto fail;
1413
1414        trampoline_vaddr = get_trampoline_vaddr();
1415        orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
1416        if (orig_ret_vaddr == -1)
1417                goto fail;
1418
1419        /*
 1420         * We don't want to keep the trampoline address on the stack; rather, keep
 1421         * the original return address of the first caller through all the chained
 1422         * instances. This also makes breakpoint unwinding easier.
1423         */
1424        if (orig_ret_vaddr == trampoline_vaddr) {
1425                if (!utask->return_instances) {
1426                        /*
1427                         * This situation is not possible. Likely we have an
1428                         * attack from user-space.
1429                         */
1430                        pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
1431                                                current->pid, current->tgid);
1432                        goto fail;
1433                }
1434
1435                chained = true;
1436                orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
1437        }
1438
1439        atomic_inc(&uprobe->ref);
1440        ri->uprobe = uprobe;
1441        ri->func = instruction_pointer(regs);
1442        ri->orig_ret_vaddr = orig_ret_vaddr;
1443        ri->chained = chained;
1444
1445        utask->depth++;
1446
1447        /* add instance to the stack */
1448        ri->next = utask->return_instances;
1449        utask->return_instances = ri;
1450
1451        return;
1452
1453 fail:
1454        kfree(ri);
1455}
1456
1457/* Prepare to single-step probed instruction out of line. */
1458static int
1459pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
1460{
1461        struct uprobe_task *utask;
1462        unsigned long xol_vaddr;
1463        int err;
1464
1465        utask = get_utask();
1466        if (!utask)
1467                return -ENOMEM;
1468
1469        xol_vaddr = xol_get_insn_slot(uprobe);
1470        if (!xol_vaddr)
1471                return -ENOMEM;
1472
1473        utask->xol_vaddr = xol_vaddr;
1474        utask->vaddr = bp_vaddr;
1475
1476        err = arch_uprobe_pre_xol(&uprobe->arch, regs);
1477        if (unlikely(err)) {
1478                xol_free_insn_slot(current);
1479                return err;
1480        }
1481
1482        utask->active_uprobe = uprobe;
1483        utask->state = UTASK_SSTEP;
1484        return 0;
1485}
1486
1487/*
 1488 * If we are singlestepping, then ensure this thread does not receive any
 1489 * non-fatal signals until completion of the singlestep.  When the xol insn
 1490 * itself triggers the signal, restart the original insn even if the task is
 1491 * already SIGKILL'ed (since coredump should report the correct ip).  This
 1492 * is even more important if the task has a handler for SIGSEGV/etc: the
 1493 * _same_ instruction should be repeated again after return from the signal
 1494 * handler, and SSTEP can never finish in this case.
1495 */
1496bool uprobe_deny_signal(void)
1497{
1498        struct task_struct *t = current;
1499        struct uprobe_task *utask = t->utask;
1500
1501        if (likely(!utask || !utask->active_uprobe))
1502                return false;
1503
1504        WARN_ON_ONCE(utask->state != UTASK_SSTEP);
1505
1506        if (signal_pending(t)) {
1507                spin_lock_irq(&t->sighand->siglock);
1508                clear_tsk_thread_flag(t, TIF_SIGPENDING);
1509                spin_unlock_irq(&t->sighand->siglock);
1510
1511                if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
1512                        utask->state = UTASK_SSTEP_TRAPPED;
1513                        set_tsk_thread_flag(t, TIF_UPROBE);
1514                        set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
1515                }
1516        }
1517
1518        return true;
1519}
1520
1521/*
1522 * Avoid singlestepping the original instruction if the original instruction
1523 * is a NOP or can be emulated.
1524 */
1525static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1526{
1527        if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
1528                if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1529                        return true;
1530                clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
1531        }
1532        return false;
1533}
1534
1535static void mmf_recalc_uprobes(struct mm_struct *mm)
1536{
1537        struct vm_area_struct *vma;
1538
1539        for (vma = mm->mmap; vma; vma = vma->vm_next) {
1540                if (!valid_vma(vma, false))
1541                        continue;
1542                /*
1543                 * This is not strictly accurate, we can race with
1544                 * uprobe_unregister() and see the already removed
1545                 * uprobe if delete_uprobe() was not yet called.
1546                 * Or this uprobe can be filtered out.
1547                 */
1548                if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
1549                        return;
1550        }
1551
1552        clear_bit(MMF_HAS_UPROBES, &mm->flags);
1553}
1554
1555static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
1556{
1557        struct page *page;
1558        uprobe_opcode_t opcode;
1559        int result;
1560
1561        pagefault_disable();
1562        result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
1563                                                        sizeof(opcode));
1564        pagefault_enable();
1565
1566        if (likely(result == 0))
1567                goto out;
1568
1569        result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
1570        if (result < 0)
1571                return result;
1572
1573        copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
1574        put_page(page);
1575 out:
1576        /* This needs to return true for any variant of the trap insn */
1577        return is_trap_insn(&opcode);
1578}
1579
1580static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1581{
1582        struct mm_struct *mm = current->mm;
1583        struct uprobe *uprobe = NULL;
1584        struct vm_area_struct *vma;
1585
1586        down_read(&mm->mmap_sem);
1587        vma = find_vma(mm, bp_vaddr);
1588        if (vma && vma->vm_start <= bp_vaddr) {
1589                if (valid_vma(vma, false)) {
1590                        struct inode *inode = file_inode(vma->vm_file);
1591                        loff_t offset = vaddr_to_offset(vma, bp_vaddr);
1592
1593                        uprobe = find_uprobe(inode, offset);
1594                }
1595
1596                if (!uprobe)
1597                        *is_swbp = is_trap_at_addr(mm, bp_vaddr);
1598        } else {
1599                *is_swbp = -EFAULT;
1600        }
1601
1602        if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
1603                mmf_recalc_uprobes(mm);
1604        up_read(&mm->mmap_sem);
1605
1606        return uprobe;
1607}
1608
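/*
 * Run every consumer's ->handler for this hit.  If any consumer has a
 * ->ret_handler, arm a uretprobe via prepare_uretprobe().  If every handler
 * returned UPROBE_HANDLER_REMOVE, unapply the uprobe from the current mm.
 */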
1609static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1610{
1611        struct uprobe_consumer *uc;
1612        int remove = UPROBE_HANDLER_REMOVE;
1613        bool need_prep = false; /* prepare return uprobe, when needed */
1614
1615        down_read(&uprobe->register_rwsem);
1616        for (uc = uprobe->consumers; uc; uc = uc->next) {
1617                int rc = 0;
1618
1619                if (uc->handler) {
1620                        rc = uc->handler(uc, regs);
1621                        WARN(rc & ~UPROBE_HANDLER_MASK,
1622                                "bad rc=0x%x from %pf()\n", rc, uc->handler);
1623                }
1624
1625                if (uc->ret_handler)
1626                        need_prep = true;
1627
1628                remove &= rc;
1629        }
1630
1631        if (need_prep && !remove)
1632                prepare_uretprobe(uprobe, regs); /* put bp at return */
1633
1634        if (remove && uprobe->consumers) {
1635                WARN_ON(!uprobe_is_active(uprobe));
1636                unapply_uprobe(uprobe, current->mm);
1637        }
1638        up_read(&uprobe->register_rwsem);
1639}
1640
1641static void
1642handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
1643{
1644        struct uprobe *uprobe = ri->uprobe;
1645        struct uprobe_consumer *uc;
1646
1647        down_read(&uprobe->register_rwsem);
1648        for (uc = uprobe->consumers; uc; uc = uc->next) {
1649                if (uc->ret_handler)
1650                        uc->ret_handler(uc, ri->func, regs);
1651        }
1652        up_read(&uprobe->register_rwsem);
1653}
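
/*
 * Companion sketch to the one above: the return side.  A consumer that also
 * sets ->ret_handler is called here with the probed function's address
 * (ri->func) once the task returns through the uretprobe trampoline.
 */
static int example_ret_handler(struct uprobe_consumer *self,
                               unsigned long func, struct pt_regs *regs)
{
        pr_info("return from %lx, retval=%lx\n",
                        func, regs_return_value(regs));
        return 0;
}
/* paired with the sketch above via: .ret_handler = example_ret_handler */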
1654
1655static bool handle_trampoline(struct pt_regs *regs)
1656{
1657        struct uprobe_task *utask;
1658        struct return_instance *ri, *tmp;
1659        bool chained;
1660
1661        utask = current->utask;
1662        if (!utask)
1663                return false;
1664
1665        ri = utask->return_instances;
1666        if (!ri)
1667                return false;
1668
1669        /*
1670         * TODO: we should throw out return_instances invalidated by
1671         * longjmp(); currently we assume that the probed function always
1672         * returns.
1673         */
1674        instruction_pointer_set(regs, ri->orig_ret_vaddr);
1675
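        /*
         * Unwind the LIFO list of return_instances: a chained instance shares
         * its original return address with the next (older) one, so keep
         * running ret_handlers until a non-chained instance has been consumed.
         */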
1676        for (;;) {
1677                handle_uretprobe_chain(ri, regs);
1678
1679                chained = ri->chained;
1680                put_uprobe(ri->uprobe);
1681
1682                tmp = ri;
1683                ri = ri->next;
1684                kfree(tmp);
1685
1686                if (!chained)
1687                        break;
1688
1689                utask->depth--;
1690
1691                BUG_ON(!ri);
1692        }
1693
1694        utask->return_instances = ri;
1695
1696        return true;
1697}
1698
1699/*
1700 * Run handler and ask thread to singlestep.
1701 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
1702 */
1703static void handle_swbp(struct pt_regs *regs)
1704{
1705        struct uprobe *uprobe;
1706        unsigned long bp_vaddr;
1707        int uninitialized_var(is_swbp);
1708
1709        bp_vaddr = uprobe_get_swbp_addr(regs);
1710        if (bp_vaddr == get_trampoline_vaddr()) {
1711                if (handle_trampoline(regs))
1712                        return;
1713
1714                pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
1715                                                current->pid, current->tgid);
1716        }
1717
1718        uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
1719        if (!uprobe) {
1720                if (is_swbp > 0) {
1721                        /* No matching uprobe; signal SIGTRAP. */
1722                        send_sig(SIGTRAP, current, 0);
1723                } else {
1724                        /*
1725                         * Either we raced with uprobe_unregister() or we can't
1726                         * access this memory. The latter is only possible if
1727                         * another thread plays with our ->mm. In both cases
1728                         * we can simply restart. If this vma was unmapped we
1729                         * can pretend this insn was not executed yet and get
1730                         * the (correct) SIGSEGV after restart.
1731                         */
1732                        instruction_pointer_set(regs, bp_vaddr);
1733                }
1734                return;
1735        }
1736
1737        /* change it in advance for ->handler() and restart */
1738        instruction_pointer_set(regs, bp_vaddr);
1739
1740        /*
1741         * TODO: move copy_insn/etc into _register and remove this hack.
1742         * After we hit the bp, _unregister + _register can install a new,
1743         * not-yet-analyzed uprobe at the same address; in that case, restart.
1744         */
1745        smp_rmb(); /* pairs with wmb() in install_breakpoint() */
1746        if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1747                goto out;
1748
1749        handler_chain(uprobe, regs);
1750        if (can_skip_sstep(uprobe, regs))
1751                goto out;
1752
1753        if (!pre_ssout(uprobe, regs, bp_vaddr))
1754                return;
1755
1756        /* can_skip_sstep() succeeded, or we restart because we can't singlestep */
1757out:
1758        put_uprobe(uprobe);
1759}
1760
1761/*
1762 * Perform required fix-ups and disable singlestep.
1763 * Allow pending signals to take effect.
1764 */
1765static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1766{
1767        struct uprobe *uprobe;
1768
1769        uprobe = utask->active_uprobe;
1770        if (utask->state == UTASK_SSTEP_ACK)
1771                arch_uprobe_post_xol(&uprobe->arch, regs);
1772        else if (utask->state == UTASK_SSTEP_TRAPPED)
1773                arch_uprobe_abort_xol(&uprobe->arch, regs);
1774        else
1775                WARN_ON_ONCE(1);
1776
1777        put_uprobe(uprobe);
1778        utask->active_uprobe = NULL;
1779        utask->state = UTASK_RUNNING;
1780        xol_free_insn_slot(current);
1781
1782        spin_lock_irq(&current->sighand->siglock);
1783        recalc_sigpending(); /* see uprobe_deny_signal() */
1784        spin_unlock_irq(&current->sighand->siglock);
1785}
1786
1787/*
1788 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
1789 * allows the thread to return from interrupt. After that handle_swbp()
1790 * sets utask->active_uprobe.
1791 *
1792 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
1793 * and allows the thread to return from interrupt.
1794 *
1795 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
1796 * uprobe_notify_resume().
1797 */
1798void uprobe_notify_resume(struct pt_regs *regs)
1799{
1800        struct uprobe_task *utask;
1801
1802        clear_thread_flag(TIF_UPROBE);
1803
1804        utask = current->utask;
1805        if (utask && utask->active_uprobe)
1806                handle_singlestep(utask, regs);
1807        else
1808                handle_swbp(regs);
1809}
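
/*
 * Illustrative sketch (arch code, not part of this file): the architecture's
 * return-to-user work loop, e.g. do_notify_resume(), typically contains
 * something like the following to honour the flag set by the notifiers below:
 *
 *      if (thread_flags & _TIF_UPROBE)
 *              uprobe_notify_resume(regs);
 */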
1810
1811/*
1812 * uprobe_pre_sstep_notifier gets called from interrupt context as part of
1813 * the notifier mechanism; it sets the TIF_UPROBE flag to report a breakpoint hit.
1814 */
1815int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1816{
1817        if (!current->mm)
1818                return 0;
1819
1820        if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
1821            (!current->utask || !current->utask->return_instances))
1822                return 0;
1823
1824        set_thread_flag(TIF_UPROBE);
1825        return 1;
1826}
1827
1828/*
1829 * uprobe_post_sstep_notifier gets called in interrupt context as part of the
1830 * notifier mechanism; it sets the TIF_UPROBE flag to report singlestep completion.
1831 */
1832int uprobe_post_sstep_notifier(struct pt_regs *regs)
1833{
1834        struct uprobe_task *utask = current->utask;
1835
1836        if (!current->mm || !utask || !utask->active_uprobe)
1837                /* task is currently not uprobed */
1838                return 0;
1839
1840        utask->state = UTASK_SSTEP_ACK;
1841        set_thread_flag(TIF_UPROBE);
1842        return 1;
1843}
1844
1845static struct notifier_block uprobe_exception_nb = {
1846        .notifier_call          = arch_uprobe_exception_notify,
1847        .priority               = INT_MAX-1,    /* notified after kprobes, kgdb */
1848};
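
/*
 * Illustrative sketch (not part of this file): arch_uprobe_exception_notify()
 * lives in arch code (for x86, arch/x86/kernel/uprobes.c) and typically routes
 * the breakpoint and debug exceptions to the two notifiers above.  DIE_INT3
 * and DIE_DEBUG are the x86 die_val names; other architectures differ.
 *
 *      int arch_uprobe_exception_notify(struct notifier_block *self,
 *                                       unsigned long val, void *data)
 *      {
 *              struct die_args *args = data;
 *              struct pt_regs *regs = args->regs;
 *              int ret = NOTIFY_DONE;
 *
 *              if (regs && !user_mode_vm(regs))
 *                      return NOTIFY_DONE;     (* only user-space traps *)
 *
 *              switch (val) {
 *              case DIE_INT3:
 *                      if (uprobe_pre_sstep_notifier(regs))
 *                              ret = NOTIFY_STOP;
 *                      break;
 *              case DIE_DEBUG:
 *                      if (uprobe_post_sstep_notifier(regs))
 *                              ret = NOTIFY_STOP;
 *                      break;
 *              }
 *              return ret;
 *      }
 */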
1849
1850static int __init init_uprobes(void)
1851{
1852        int i;
1853
1854        for (i = 0; i < UPROBES_HASH_SZ; i++)
1855                mutex_init(&uprobes_mmap_mutex[i]);
1856
1857        if (percpu_init_rwsem(&dup_mmap_sem))
1858                return -ENOMEM;
1859
1860        return register_die_notifier(&uprobe_exception_nb);
1861}
1862module_init(init_uprobes);
1863
1864static void __exit exit_uprobes(void)
1865{
1866}
1867module_exit(exit_uprobes);
1868