linux/kernel/events/uprobes.c
<<
>>
Prefs
   1/*
   2 * User-space Probes (UProbes)
   3 *
   4 * This program is free software; you can redistribute it and/or modify
   5 * it under the terms of the GNU General Public License as published by
   6 * the Free Software Foundation; either version 2 of the License, or
   7 * (at your option) any later version.
   8 *
   9 * This program is distributed in the hope that it will be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write to the Free Software
  16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17 *
  18 * Copyright (C) IBM Corporation, 2008-2012
  19 * Authors:
  20 *      Srikar Dronamraju
  21 *      Jim Keniston
  22 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  23 */
  24
  25#include <linux/kernel.h>
  26#include <linux/highmem.h>
  27#include <linux/pagemap.h>      /* read_mapping_page */
  28#include <linux/slab.h>
  29#include <linux/sched.h>
  30#include <linux/export.h>
  31#include <linux/rmap.h>         /* anon_vma_prepare */
  32#include <linux/mmu_notifier.h> /* set_pte_at_notify */
  33#include <linux/swap.h>         /* try_to_free_swap */
  34#include <linux/ptrace.h>       /* user_enable_single_step */
  35#include <linux/kdebug.h>       /* notifier mechanism */
  36#include "../../mm/internal.h"  /* munlock_vma_page */
  37#include <linux/percpu-rwsem.h>
  38#include <linux/task_work.h>
  39#include <linux/shmem_fs.h>
  40
  41#include <linux/uprobes.h>
  42
  43#define UINSNS_PER_PAGE                 (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
  44#define MAX_UPROBE_XOL_SLOTS            UINSNS_PER_PAGE
  45
  46static struct rb_root uprobes_tree = RB_ROOT;
  47/*
  48 * allows us to skip the uprobe_mmap if there are no uprobe events active
  49 * at this time.  Probably a fine grained per inode count is better?
  50 */
  51#define no_uprobe_events()      RB_EMPTY_ROOT(&uprobes_tree)
  52
  53static DEFINE_SPINLOCK(uprobes_treelock);       /* serialize rbtree access */
  54
  55#define UPROBES_HASH_SZ 13
  56/* serialize uprobe->pending_list */
  57static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
  58#define uprobes_mmap_hash(v)    (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
  59
  60static struct percpu_rw_semaphore dup_mmap_sem;
  61
  62/* Have a copy of original instruction */
  63#define UPROBE_COPY_INSN        0
  64
  65struct uprobe {
  66        struct rb_node          rb_node;        /* node in the rb tree */
  67        atomic_t                ref;
  68        struct rw_semaphore     register_rwsem;
  69        struct rw_semaphore     consumer_rwsem;
  70        struct list_head        pending_list;
  71        struct uprobe_consumer  *consumers;
  72        struct inode            *inode;         /* Also hold a ref to inode */
  73        loff_t                  offset;
  74        unsigned long           flags;
  75
  76        /*
  77         * The generic code assumes that it has two members of unknown type
  78         * owned by the arch-specific code:
  79         *
  80         *      insn -  copy_insn() saves the original instruction here for
  81         *              arch_uprobe_analyze_insn().
  82         *
  83         *      ixol -  potentially modified instruction to execute out of
  84         *              line, copied to xol_area by xol_get_insn_slot().
  85         */
  86        struct arch_uprobe      arch;
  87};
  88
  89struct return_instance {
  90        struct uprobe           *uprobe;
  91        unsigned long           func;
  92        unsigned long           orig_ret_vaddr; /* original return address */
  93        bool                    chained;        /* true, if instance is nested */
  94
  95        struct return_instance  *next;          /* keep as stack */
  96};
  97
  98/*
  99 * Execute out of line area: anonymous executable mapping installed
 100 * by the probed task to execute the copy of the original instruction
 101 * mangled by set_swbp().
 102 *
 103 * On a breakpoint hit, thread contests for a slot.  It frees the
 104 * slot after singlestep. Currently a fixed number of slots are
 105 * allocated.
 106 */
 107struct xol_area {
 108        wait_queue_head_t       wq;             /* if all slots are busy */
 109        atomic_t                slot_count;     /* number of in-use slots */
 110        unsigned long           *bitmap;        /* 0 = free slot */
 111        struct page             *page;
 112
 113        /*
 114         * We keep the vma's vm_start rather than a pointer to the vma
 115         * itself.  The probed process or a naughty kernel module could make
 116         * the vma go away, and we must handle that reasonably gracefully.
 117         */
 118        unsigned long           vaddr;          /* Page(s) of instruction slots */
 119};
 120
 121/*
 122 * valid_vma: Verify if the specified vma is an executable vma
 123 * Relax restrictions while unregistering: vm_flags might have
 124 * changed after breakpoint was inserted.
 125 *      - is_register: indicates if we are in register context.
 126 *      - Return 1 if the specified virtual address is in an
 127 *        executable vma.
 128 */
 129static bool valid_vma(struct vm_area_struct *vma, bool is_register)
 130{
 131        vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
 132
 133        if (is_register)
 134                flags |= VM_WRITE;
 135
 136        return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
 137}
 138
 139static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
 140{
 141        return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 142}
 143
 144static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
 145{
 146        return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
 147}
 148
 149/**
 150 * __replace_page - replace page in vma by new page.
 151 * based on replace_page in mm/ksm.c
 152 *
 153 * @vma:      vma that holds the pte pointing to page
 154 * @addr:     address the old @page is mapped at
 155 * @page:     the cowed page we are replacing by kpage
 156 * @kpage:    the modified page we replace page by
 157 *
 158 * Returns 0 on success, -EFAULT on failure.
 159 */
 160static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 161                                struct page *page, struct page *kpage)
 162{
 163        struct mm_struct *mm = vma->vm_mm;
 164        spinlock_t *ptl;
 165        pte_t *ptep;
 166        int err;
 167        /* For mmu_notifiers */
 168        const unsigned long mmun_start = addr;
 169        const unsigned long mmun_end   = addr + PAGE_SIZE;
 170        struct mem_cgroup *memcg;
 171
 172        err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
 173        if (err)
 174                return err;
 175
 176        /* For try_to_free_swap() and munlock_vma_page() below */
 177        lock_page(page);
 178
 179        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 180        err = -EAGAIN;
 181        ptep = page_check_address(page, mm, addr, &ptl, 0);
 182        if (!ptep)
 183                goto unlock;
 184
 185        get_page(kpage);
 186        page_add_new_anon_rmap(kpage, vma, addr);
 187        mem_cgroup_commit_charge(kpage, memcg, false);
 188        lru_cache_add_active_or_unevictable(kpage, vma);
 189
 190        if (!PageAnon(page)) {
 191                dec_mm_counter(mm, MM_FILEPAGES);
 192                inc_mm_counter(mm, MM_ANONPAGES);
 193        }
 194
 195        flush_cache_page(vma, addr, pte_pfn(*ptep));
 196        ptep_clear_flush_notify(vma, addr, ptep);
 197        set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
 198
 199        page_remove_rmap(page);
 200        if (!page_mapped(page))
 201                try_to_free_swap(page);
 202        pte_unmap_unlock(ptep, ptl);
 203
 204        if (vma->vm_flags & VM_LOCKED)
 205                munlock_vma_page(page);
 206        put_page(page);
 207
 208        err = 0;
 209 unlock:
 210        mem_cgroup_cancel_charge(kpage, memcg);
 211        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 212        unlock_page(page);
 213        return err;
 214}
 215
 216/**
 217 * is_swbp_insn - check if instruction is breakpoint instruction.
 218 * @insn: instruction to be checked.
 219 * Default implementation of is_swbp_insn
 220 * Returns true if @insn is a breakpoint instruction.
 221 */
 222bool __weak is_swbp_insn(uprobe_opcode_t *insn)
 223{
 224        return *insn == UPROBE_SWBP_INSN;
 225}
 226
 227/**
 228 * is_trap_insn - check if instruction is breakpoint instruction.
 229 * @insn: instruction to be checked.
 230 * Default implementation of is_trap_insn
 231 * Returns true if @insn is a breakpoint instruction.
 232 *
 233 * This function is needed for the case where an architecture has multiple
 234 * trap instructions (like powerpc).
 235 */
 236bool __weak is_trap_insn(uprobe_opcode_t *insn)
 237{
 238        return is_swbp_insn(insn);
 239}
 240
 241static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
 242{
 243        void *kaddr = kmap_atomic(page);
 244        memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
 245        kunmap_atomic(kaddr);
 246}
 247
 248static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
 249{
 250        void *kaddr = kmap_atomic(page);
 251        memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
 252        kunmap_atomic(kaddr);
 253}
 254
 255static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
 256{
 257        uprobe_opcode_t old_opcode;
 258        bool is_swbp;
 259
 260        /*
 261         * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
 262         * We do not check if it is any other 'trap variant' which could
 263         * be conditional trap instruction such as the one powerpc supports.
 264         *
 265         * The logic is that we do not care if the underlying instruction
 266         * is a trap variant; uprobes always wins over any other (gdb)
 267         * breakpoint.
 268         */
 269        copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
 270        is_swbp = is_swbp_insn(&old_opcode);
 271
 272        if (is_swbp_insn(new_opcode)) {
 273                if (is_swbp)            /* register: already installed? */
 274                        return 0;
 275        } else {
 276                if (!is_swbp)           /* unregister: was it changed by us? */
 277                        return 0;
 278        }
 279
 280        return 1;
 281}
 282
 283/*
 284 * NOTE:
 285 * Expect the breakpoint instruction to be the smallest size instruction for
 286 * the architecture. If an arch has variable length instruction and the
 287 * breakpoint instruction is not of the smallest length instruction
 288 * supported by that architecture then we need to modify is_trap_at_addr and
 289 * uprobe_write_opcode accordingly. This would never be a problem for archs
 290 * that have fixed length instructions.
 291 *
 292 * uprobe_write_opcode - write the opcode at a given virtual address.
 293 * @mm: the probed process address space.
 294 * @vaddr: the virtual address to store the opcode.
 295 * @opcode: opcode to be written at @vaddr.
 296 *
 297 * Called with mm->mmap_sem held for write.
 298 * Return 0 (success) or a negative errno.
 299 */
 300int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
 301                        uprobe_opcode_t opcode)
 302{
 303        struct page *old_page, *new_page;
 304        struct vm_area_struct *vma;
 305        int ret;
 306
 307retry:
 308        /* Read the page with vaddr into memory */
 309        ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
 310        if (ret <= 0)
 311                return ret;
 312
 313        ret = verify_opcode(old_page, vaddr, &opcode);
 314        if (ret <= 0)
 315                goto put_old;
 316
 317        ret = anon_vma_prepare(vma);
 318        if (ret)
 319                goto put_old;
 320
 321        ret = -ENOMEM;
 322        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
 323        if (!new_page)
 324                goto put_old;
 325
 326        __SetPageUptodate(new_page);
 327        copy_highpage(new_page, old_page);
 328        copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
 329
 330        ret = __replace_page(vma, vaddr, old_page, new_page);
 331        page_cache_release(new_page);
 332put_old:
 333        put_page(old_page);
 334
 335        if (unlikely(ret == -EAGAIN))
 336                goto retry;
 337        return ret;
 338}
 339
 340/**
 341 * set_swbp - store breakpoint at a given address.
 342 * @auprobe: arch specific probepoint information.
 343 * @mm: the probed process address space.
 344 * @vaddr: the virtual address to insert the opcode.
 345 *
 346 * For mm @mm, store the breakpoint instruction at @vaddr.
 347 * Return 0 (success) or a negative errno.
 348 */
 349int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 350{
 351        return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
 352}
 353
 354/**
 355 * set_orig_insn - Restore the original instruction.
 356 * @mm: the probed process address space.
 357 * @auprobe: arch specific probepoint information.
 358 * @vaddr: the virtual address to insert the opcode.
 359 *
 360 * For mm @mm, restore the original opcode (opcode) at @vaddr.
 361 * Return 0 (success) or a negative errno.
 362 */
 363int __weak
 364set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 365{
 366        return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
 367}
 368
 369static int match_uprobe(struct uprobe *l, struct uprobe *r)
 370{
 371        if (l->inode < r->inode)
 372                return -1;
 373
 374        if (l->inode > r->inode)
 375                return 1;
 376
 377        if (l->offset < r->offset)
 378                return -1;
 379
 380        if (l->offset > r->offset)
 381                return 1;
 382
 383        return 0;
 384}
 385
 386static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
 387{
 388        struct uprobe u = { .inode = inode, .offset = offset };
 389        struct rb_node *n = uprobes_tree.rb_node;
 390        struct uprobe *uprobe;
 391        int match;
 392
 393        while (n) {
 394                uprobe = rb_entry(n, struct uprobe, rb_node);
 395                match = match_uprobe(&u, uprobe);
 396                if (!match) {
 397                        atomic_inc(&uprobe->ref);
 398                        return uprobe;
 399                }
 400
 401                if (match < 0)
 402                        n = n->rb_left;
 403                else
 404                        n = n->rb_right;
 405        }
 406        return NULL;
 407}
 408
 409/*
 410 * Find a uprobe corresponding to a given inode:offset
 411 * Acquires uprobes_treelock
 412 */
 413static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
 414{
 415        struct uprobe *uprobe;
 416
 417        spin_lock(&uprobes_treelock);
 418        uprobe = __find_uprobe(inode, offset);
 419        spin_unlock(&uprobes_treelock);
 420
 421        return uprobe;
 422}
 423
 424static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
 425{
 426        struct rb_node **p = &uprobes_tree.rb_node;
 427        struct rb_node *parent = NULL;
 428        struct uprobe *u;
 429        int match;
 430
 431        while (*p) {
 432                parent = *p;
 433                u = rb_entry(parent, struct uprobe, rb_node);
 434                match = match_uprobe(uprobe, u);
 435                if (!match) {
 436                        atomic_inc(&u->ref);
 437                        return u;
 438                }
 439
 440                if (match < 0)
 441                        p = &parent->rb_left;
 442                else
 443                        p = &parent->rb_right;
 444
 445        }
 446
 447        u = NULL;
 448        rb_link_node(&uprobe->rb_node, parent, p);
 449        rb_insert_color(&uprobe->rb_node, &uprobes_tree);
 450        /* get access + creation ref */
 451        atomic_set(&uprobe->ref, 2);
 452
 453        return u;
 454}
 455
 456/*
 457 * Acquire uprobes_treelock.
 458 * Matching uprobe already exists in rbtree;
 459 *      increment (access refcount) and return the matching uprobe.
 460 *
 461 * No matching uprobe; insert the uprobe in rb_tree;
 462 *      get a double refcount (access + creation) and return NULL.
 463 */
 464static struct uprobe *insert_uprobe(struct uprobe *uprobe)
 465{
 466        struct uprobe *u;
 467
 468        spin_lock(&uprobes_treelock);
 469        u = __insert_uprobe(uprobe);
 470        spin_unlock(&uprobes_treelock);
 471
 472        return u;
 473}
 474
 475static void put_uprobe(struct uprobe *uprobe)
 476{
 477        if (atomic_dec_and_test(&uprobe->ref))
 478                kfree(uprobe);
 479}
 480
 481static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
 482{
 483        struct uprobe *uprobe, *cur_uprobe;
 484
 485        uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
 486        if (!uprobe)
 487                return NULL;
 488
 489        uprobe->inode = igrab(inode);
 490        uprobe->offset = offset;
 491        init_rwsem(&uprobe->register_rwsem);
 492        init_rwsem(&uprobe->consumer_rwsem);
 493
 494        /* add to uprobes_tree, sorted on inode:offset */
 495        cur_uprobe = insert_uprobe(uprobe);
 496        /* a uprobe exists for this inode:offset combination */
 497        if (cur_uprobe) {
 498                kfree(uprobe);
 499                uprobe = cur_uprobe;
 500                iput(inode);
 501        }
 502
 503        return uprobe;
 504}
 505
 506static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
 507{
 508        down_write(&uprobe->consumer_rwsem);
 509        uc->next = uprobe->consumers;
 510        uprobe->consumers = uc;
 511        up_write(&uprobe->consumer_rwsem);
 512}
 513
 514/*
 515 * For uprobe @uprobe, delete the consumer @uc.
 516 * Return true if the @uc is deleted successfully
 517 * or return false.
 518 */
 519static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
 520{
 521        struct uprobe_consumer **con;
 522        bool ret = false;
 523
 524        down_write(&uprobe->consumer_rwsem);
 525        for (con = &uprobe->consumers; *con; con = &(*con)->next) {
 526                if (*con == uc) {
 527                        *con = uc->next;
 528                        ret = true;
 529                        break;
 530                }
 531        }
 532        up_write(&uprobe->consumer_rwsem);
 533
 534        return ret;
 535}
 536
 537static int __copy_insn(struct address_space *mapping, struct file *filp,
 538                        void *insn, int nbytes, loff_t offset)
 539{
 540        struct page *page;
 541        /*
 542         * Ensure that the page that has the original instruction is populated
 543         * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
 544         * see uprobe_register().
 545         */
 546        if (mapping->a_ops->readpage)
 547                page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
 548        else
 549                page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
 550        if (IS_ERR(page))
 551                return PTR_ERR(page);
 552
 553        copy_from_page(page, offset, insn, nbytes);
 554        page_cache_release(page);
 555
 556        return 0;
 557}
 558
 559static int copy_insn(struct uprobe *uprobe, struct file *filp)
 560{
 561        struct address_space *mapping = uprobe->inode->i_mapping;
 562        loff_t offs = uprobe->offset;
 563        void *insn = &uprobe->arch.insn;
 564        int size = sizeof(uprobe->arch.insn);
 565        int len, err = -EIO;
 566
 567        /* Copy only available bytes, -EIO if nothing was read */
 568        do {
 569                if (offs >= i_size_read(uprobe->inode))
 570                        break;
 571
 572                len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
 573                err = __copy_insn(mapping, filp, insn, len, offs);
 574                if (err)
 575                        break;
 576
 577                insn += len;
 578                offs += len;
 579                size -= len;
 580        } while (size);
 581
 582        return err;
 583}
 584
 585static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
 586                                struct mm_struct *mm, unsigned long vaddr)
 587{
 588        int ret = 0;
 589
 590        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
 591                return ret;
 592
 593        /* TODO: move this into _register, until then we abuse this sem. */
 594        down_write(&uprobe->consumer_rwsem);
 595        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
 596                goto out;
 597
 598        ret = copy_insn(uprobe, file);
 599        if (ret)
 600                goto out;
 601
 602        ret = -ENOTSUPP;
 603        if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
 604                goto out;
 605
 606        ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
 607        if (ret)
 608                goto out;
 609
 610        /* uprobe_write_opcode() assumes we don't cross page boundary */
 611        BUG_ON((uprobe->offset & ~PAGE_MASK) +
 612                        UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
 613
 614        smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
 615        set_bit(UPROBE_COPY_INSN, &uprobe->flags);
 616
 617 out:
 618        up_write(&uprobe->consumer_rwsem);
 619
 620        return ret;
 621}
 622
 623static inline bool consumer_filter(struct uprobe_consumer *uc,
 624                                   enum uprobe_filter_ctx ctx, struct mm_struct *mm)
 625{
 626        return !uc->filter || uc->filter(uc, ctx, mm);
 627}
 628
 629static bool filter_chain(struct uprobe *uprobe,
 630                         enum uprobe_filter_ctx ctx, struct mm_struct *mm)
 631{
 632        struct uprobe_consumer *uc;
 633        bool ret = false;
 634
 635        down_read(&uprobe->consumer_rwsem);
 636        for (uc = uprobe->consumers; uc; uc = uc->next) {
 637                ret = consumer_filter(uc, ctx, mm);
 638                if (ret)
 639                        break;
 640        }
 641        up_read(&uprobe->consumer_rwsem);
 642
 643        return ret;
 644}
 645
 646static int
 647install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
 648                        struct vm_area_struct *vma, unsigned long vaddr)
 649{
 650        bool first_uprobe;
 651        int ret;
 652
 653        ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
 654        if (ret)
 655                return ret;
 656
 657        /*
 658         * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
 659         * the task can hit this breakpoint right after __replace_page().
 660         */
 661        first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
 662        if (first_uprobe)
 663                set_bit(MMF_HAS_UPROBES, &mm->flags);
 664
 665        ret = set_swbp(&uprobe->arch, mm, vaddr);
 666        if (!ret)
 667                clear_bit(MMF_RECALC_UPROBES, &mm->flags);
 668        else if (first_uprobe)
 669                clear_bit(MMF_HAS_UPROBES, &mm->flags);
 670
 671        return ret;
 672}
 673
 674static int
 675remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
 676{
 677        set_bit(MMF_RECALC_UPROBES, &mm->flags);
 678        return set_orig_insn(&uprobe->arch, mm, vaddr);
 679}
 680
 681static inline bool uprobe_is_active(struct uprobe *uprobe)
 682{
 683        return !RB_EMPTY_NODE(&uprobe->rb_node);
 684}
 685/*
 686 * There could be threads that have already hit the breakpoint. They
 687 * will recheck the current insn and restart if find_uprobe() fails.
 688 * See find_active_uprobe().
 689 */
 690static void delete_uprobe(struct uprobe *uprobe)
 691{
 692        if (WARN_ON(!uprobe_is_active(uprobe)))
 693                return;
 694
 695        spin_lock(&uprobes_treelock);
 696        rb_erase(&uprobe->rb_node, &uprobes_tree);
 697        spin_unlock(&uprobes_treelock);
 698        RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
 699        iput(uprobe->inode);
 700        put_uprobe(uprobe);
 701}
 702
 703struct map_info {
 704        struct map_info *next;
 705        struct mm_struct *mm;
 706        unsigned long vaddr;
 707};
 708
 709static inline struct map_info *free_map_info(struct map_info *info)
 710{
 711        struct map_info *next = info->next;
 712        kfree(info);
 713        return next;
 714}
 715
 716static struct map_info *
 717build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 718{
 719        unsigned long pgoff = offset >> PAGE_SHIFT;
 720        struct vm_area_struct *vma;
 721        struct map_info *curr = NULL;
 722        struct map_info *prev = NULL;
 723        struct map_info *info;
 724        int more = 0;
 725
 726 again:
 727        i_mmap_lock_read(mapping);
 728        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 729                if (!valid_vma(vma, is_register))
 730                        continue;
 731
 732                if (!prev && !more) {
 733                        /*
 734                         * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
 735                         * reclaim. This is optimistic, no harm done if it fails.
 736                         */
 737                        prev = kmalloc(sizeof(struct map_info),
 738                                        GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
 739                        if (prev)
 740                                prev->next = NULL;
 741                }
 742                if (!prev) {
 743                        more++;
 744                        continue;
 745                }
 746
 747                if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
 748                        continue;
 749
 750                info = prev;
 751                prev = prev->next;
 752                info->next = curr;
 753                curr = info;
 754
 755                info->mm = vma->vm_mm;
 756                info->vaddr = offset_to_vaddr(vma, offset);
 757        }
 758        i_mmap_unlock_read(mapping);
 759
 760        if (!more)
 761                goto out;
 762
 763        prev = curr;
 764        while (curr) {
 765                mmput(curr->mm);
 766                curr = curr->next;
 767        }
 768
 769        do {
 770                info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
 771                if (!info) {
 772                        curr = ERR_PTR(-ENOMEM);
 773                        goto out;
 774                }
 775                info->next = prev;
 776                prev = info;
 777        } while (--more);
 778
 779        goto again;
 780 out:
 781        while (prev)
 782                prev = free_map_info(prev);
 783        return curr;
 784}
 785
 786static int
 787register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
 788{
 789        bool is_register = !!new;
 790        struct map_info *info;
 791        int err = 0;
 792
 793        percpu_down_write(&dup_mmap_sem);
 794        info = build_map_info(uprobe->inode->i_mapping,
 795                                        uprobe->offset, is_register);
 796        if (IS_ERR(info)) {
 797                err = PTR_ERR(info);
 798                goto out;
 799        }
 800
 801        while (info) {
 802                struct mm_struct *mm = info->mm;
 803                struct vm_area_struct *vma;
 804
 805                if (err && is_register)
 806                        goto free;
 807
 808                down_write(&mm->mmap_sem);
 809                vma = find_vma(mm, info->vaddr);
 810                if (!vma || !valid_vma(vma, is_register) ||
 811                    file_inode(vma->vm_file) != uprobe->inode)
 812                        goto unlock;
 813
 814                if (vma->vm_start > info->vaddr ||
 815                    vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
 816                        goto unlock;
 817
 818                if (is_register) {
 819                        /* consult only the "caller", new consumer. */
 820                        if (consumer_filter(new,
 821                                        UPROBE_FILTER_REGISTER, mm))
 822                                err = install_breakpoint(uprobe, mm, vma, info->vaddr);
 823                } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
 824                        if (!filter_chain(uprobe,
 825                                        UPROBE_FILTER_UNREGISTER, mm))
 826                                err |= remove_breakpoint(uprobe, mm, info->vaddr);
 827                }
 828
 829 unlock:
 830                up_write(&mm->mmap_sem);
 831 free:
 832                mmput(mm);
 833                info = free_map_info(info);
 834        }
 835 out:
 836        percpu_up_write(&dup_mmap_sem);
 837        return err;
 838}
 839
 840static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
 841{
 842        consumer_add(uprobe, uc);
 843        return register_for_each_vma(uprobe, uc);
 844}
 845
 846static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
 847{
 848        int err;
 849
 850        if (WARN_ON(!consumer_del(uprobe, uc)))
 851                return;
 852
 853        err = register_for_each_vma(uprobe, NULL);
 854        /* TODO : cant unregister? schedule a worker thread */
 855        if (!uprobe->consumers && !err)
 856                delete_uprobe(uprobe);
 857}
 858
 859/*
 860 * uprobe_register - register a probe
 861 * @inode: the file in which the probe has to be placed.
 862 * @offset: offset from the start of the file.
 863 * @uc: information on howto handle the probe..
 864 *
 865 * Apart from the access refcount, uprobe_register() takes a creation
 866 * refcount (thro alloc_uprobe) if and only if this @uprobe is getting
 867 * inserted into the rbtree (i.e first consumer for a @inode:@offset
 868 * tuple).  Creation refcount stops uprobe_unregister from freeing the
 869 * @uprobe even before the register operation is complete. Creation
 870 * refcount is released when the last @uc for the @uprobe
 871 * unregisters.
 872 *
 873 * Return errno if it cannot successully install probes
 874 * else return 0 (success)
 875 */
 876int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
 877{
 878        struct uprobe *uprobe;
 879        int ret;
 880
 881        /* Uprobe must have at least one set consumer */
 882        if (!uc->handler && !uc->ret_handler)
 883                return -EINVAL;
 884
 885        /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
 886        if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
 887                return -EIO;
 888        /* Racy, just to catch the obvious mistakes */
 889        if (offset > i_size_read(inode))
 890                return -EINVAL;
 891
 892 retry:
 893        uprobe = alloc_uprobe(inode, offset);
 894        if (!uprobe)
 895                return -ENOMEM;
 896        /*
 897         * We can race with uprobe_unregister()->delete_uprobe().
 898         * Check uprobe_is_active() and retry if it is false.
 899         */
 900        down_write(&uprobe->register_rwsem);
 901        ret = -EAGAIN;
 902        if (likely(uprobe_is_active(uprobe))) {
 903                ret = __uprobe_register(uprobe, uc);
 904                if (ret)
 905                        __uprobe_unregister(uprobe, uc);
 906        }
 907        up_write(&uprobe->register_rwsem);
 908        put_uprobe(uprobe);
 909
 910        if (unlikely(ret == -EAGAIN))
 911                goto retry;
 912        return ret;
 913}
 914EXPORT_SYMBOL_GPL(uprobe_register);
 915
 916/*
 917 * uprobe_apply - unregister a already registered probe.
 918 * @inode: the file in which the probe has to be removed.
 919 * @offset: offset from the start of the file.
 920 * @uc: consumer which wants to add more or remove some breakpoints
 921 * @add: add or remove the breakpoints
 922 */
 923int uprobe_apply(struct inode *inode, loff_t offset,
 924                        struct uprobe_consumer *uc, bool add)
 925{
 926        struct uprobe *uprobe;
 927        struct uprobe_consumer *con;
 928        int ret = -ENOENT;
 929
 930        uprobe = find_uprobe(inode, offset);
 931        if (WARN_ON(!uprobe))
 932                return ret;
 933
 934        down_write(&uprobe->register_rwsem);
 935        for (con = uprobe->consumers; con && con != uc ; con = con->next)
 936                ;
 937        if (con)
 938                ret = register_for_each_vma(uprobe, add ? uc : NULL);
 939        up_write(&uprobe->register_rwsem);
 940        put_uprobe(uprobe);
 941
 942        return ret;
 943}
 944
 945/*
 946 * uprobe_unregister - unregister a already registered probe.
 947 * @inode: the file in which the probe has to be removed.
 948 * @offset: offset from the start of the file.
 949 * @uc: identify which probe if multiple probes are colocated.
 950 */
 951void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
 952{
 953        struct uprobe *uprobe;
 954
 955        uprobe = find_uprobe(inode, offset);
 956        if (WARN_ON(!uprobe))
 957                return;
 958
 959        down_write(&uprobe->register_rwsem);
 960        __uprobe_unregister(uprobe, uc);
 961        up_write(&uprobe->register_rwsem);
 962        put_uprobe(uprobe);
 963}
 964EXPORT_SYMBOL_GPL(uprobe_unregister);
 965
 966static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
 967{
 968        struct vm_area_struct *vma;
 969        int err = 0;
 970
 971        down_read(&mm->mmap_sem);
 972        for (vma = mm->mmap; vma; vma = vma->vm_next) {
 973                unsigned long vaddr;
 974                loff_t offset;
 975
 976                if (!valid_vma(vma, false) ||
 977                    file_inode(vma->vm_file) != uprobe->inode)
 978                        continue;
 979
 980                offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
 981                if (uprobe->offset <  offset ||
 982                    uprobe->offset >= offset + vma->vm_end - vma->vm_start)
 983                        continue;
 984
 985                vaddr = offset_to_vaddr(vma, uprobe->offset);
 986                err |= remove_breakpoint(uprobe, mm, vaddr);
 987        }
 988        up_read(&mm->mmap_sem);
 989
 990        return err;
 991}
 992
 993static struct rb_node *
 994find_node_in_range(struct inode *inode, loff_t min, loff_t max)
 995{
 996        struct rb_node *n = uprobes_tree.rb_node;
 997
 998        while (n) {
 999                struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
1000
1001                if (inode < u->inode) {
1002                        n = n->rb_left;
1003                } else if (inode > u->inode) {
1004                        n = n->rb_right;
1005                } else {
1006                        if (max < u->offset)
1007                                n = n->rb_left;
1008                        else if (min > u->offset)
1009                                n = n->rb_right;
1010                        else
1011                                break;
1012                }
1013        }
1014
1015        return n;
1016}
1017
1018/*
1019 * For a given range in vma, build a list of probes that need to be inserted.
1020 */
1021static void build_probe_list(struct inode *inode,
1022                                struct vm_area_struct *vma,
1023                                unsigned long start, unsigned long end,
1024                                struct list_head *head)
1025{
1026        loff_t min, max;
1027        struct rb_node *n, *t;
1028        struct uprobe *u;
1029
1030        INIT_LIST_HEAD(head);
1031        min = vaddr_to_offset(vma, start);
1032        max = min + (end - start) - 1;
1033
1034        spin_lock(&uprobes_treelock);
1035        n = find_node_in_range(inode, min, max);
1036        if (n) {
1037                for (t = n; t; t = rb_prev(t)) {
1038                        u = rb_entry(t, struct uprobe, rb_node);
1039                        if (u->inode != inode || u->offset < min)
1040                                break;
1041                        list_add(&u->pending_list, head);
1042                        atomic_inc(&u->ref);
1043                }
1044                for (t = n; (t = rb_next(t)); ) {
1045                        u = rb_entry(t, struct uprobe, rb_node);
1046                        if (u->inode != inode || u->offset > max)
1047                                break;
1048                        list_add(&u->pending_list, head);
1049                        atomic_inc(&u->ref);
1050                }
1051        }
1052        spin_unlock(&uprobes_treelock);
1053}
1054
1055/*
1056 * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
1057 *
1058 * Currently we ignore all errors and always return 0, the callers
1059 * can't handle the failure anyway.
1060 */
1061int uprobe_mmap(struct vm_area_struct *vma)
1062{
1063        struct list_head tmp_list;
1064        struct uprobe *uprobe, *u;
1065        struct inode *inode;
1066
1067        if (no_uprobe_events() || !valid_vma(vma, true))
1068                return 0;
1069
1070        inode = file_inode(vma->vm_file);
1071        if (!inode)
1072                return 0;
1073
1074        mutex_lock(uprobes_mmap_hash(inode));
1075        build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
1076        /*
1077         * We can race with uprobe_unregister(), this uprobe can be already
1078         * removed. But in this case filter_chain() must return false, all
1079         * consumers have gone away.
1080         */
1081        list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1082                if (!fatal_signal_pending(current) &&
1083                    filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
1084                        unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1085                        install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1086                }
1087                put_uprobe(uprobe);
1088        }
1089        mutex_unlock(uprobes_mmap_hash(inode));
1090
1091        return 0;
1092}
1093
1094static bool
1095vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1096{
1097        loff_t min, max;
1098        struct inode *inode;
1099        struct rb_node *n;
1100
1101        inode = file_inode(vma->vm_file);
1102
1103        min = vaddr_to_offset(vma, start);
1104        max = min + (end - start) - 1;
1105
1106        spin_lock(&uprobes_treelock);
1107        n = find_node_in_range(inode, min, max);
1108        spin_unlock(&uprobes_treelock);
1109
1110        return !!n;
1111}
1112
1113/*
1114 * Called in context of a munmap of a vma.
1115 */
1116void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1117{
1118        if (no_uprobe_events() || !valid_vma(vma, false))
1119                return;
1120
1121        if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
1122                return;
1123
1124        if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
1125             test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
1126                return;
1127
1128        if (vma_has_uprobes(vma, start, end))
1129                set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
1130}
1131
1132/* Slot allocation for XOL */
1133static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1134{
1135        int ret = -EALREADY;
1136
1137        down_write(&mm->mmap_sem);
1138        if (mm->uprobes_state.xol_area)
1139                goto fail;
1140
1141        if (!area->vaddr) {
1142                /* Try to map as high as possible, this is only a hint. */
1143                area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
1144                                                PAGE_SIZE, 0, 0);
1145                if (area->vaddr & ~PAGE_MASK) {
1146                        ret = area->vaddr;
1147                        goto fail;
1148                }
1149        }
1150
1151        ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
1152                                VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
1153        if (ret)
1154                goto fail;
1155
1156        smp_wmb();      /* pairs with get_xol_area() */
1157        mm->uprobes_state.xol_area = area;
1158 fail:
1159        up_write(&mm->mmap_sem);
1160
1161        return ret;
1162}
1163
1164static struct xol_area *__create_xol_area(unsigned long vaddr)
1165{
1166        struct mm_struct *mm = current->mm;
1167        uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1168        struct xol_area *area;
1169
1170        area = kmalloc(sizeof(*area), GFP_KERNEL);
1171        if (unlikely(!area))
1172                goto out;
1173
1174        area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
1175        if (!area->bitmap)
1176                goto free_area;
1177
1178        area->page = alloc_page(GFP_HIGHUSER);
1179        if (!area->page)
1180                goto free_bitmap;
1181
1182        area->vaddr = vaddr;
1183        init_waitqueue_head(&area->wq);
1184        /* Reserve the 1st slot for get_trampoline_vaddr() */
1185        set_bit(0, area->bitmap);
1186        atomic_set(&area->slot_count, 1);
1187        copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1188
1189        if (!xol_add_vma(mm, area))
1190                return area;
1191
1192        __free_page(area->page);
1193 free_bitmap:
1194        kfree(area->bitmap);
1195 free_area:
1196        kfree(area);
1197 out:
1198        return NULL;
1199}
1200
/*
 * get_xol_area - Allocate process's xol_area if necessary.
 * This area will be used for storing instructions for execution out of line.
 *
 * Operates on current->mm.
 *
 * Returns the allocated area or NULL.
 */
static struct xol_area *get_xol_area(void)
{
        struct mm_struct *mm = current->mm;
        struct xol_area *area;

        /*
         * The return value of __create_xol_area() is not used: ->xol_area
         * is re-read below, which also covers the case where xol_add_vma()
         * found an area already installed and returned -EALREADY.
         */
        if (!mm->uprobes_state.xol_area)
                __create_xol_area(0);

        area = mm->uprobes_state.xol_area;
        smp_read_barrier_depends();     /* pairs with wmb in xol_add_vma() */
        return area;
}
1219
1220/*
1221 * uprobe_clear_state - Free the area allocated for slots.
1222 */
1223void uprobe_clear_state(struct mm_struct *mm)
1224{
1225        struct xol_area *area = mm->uprobes_state.xol_area;
1226
1227        if (!area)
1228                return;
1229
1230        put_page(area->page);
1231        kfree(area->bitmap);
1232        kfree(area);
1233}
1234
/*
 * Take dup_mmap_sem for reading; released by uprobe_end_dup_mmap().
 */
void uprobe_start_dup_mmap(void)
{
        percpu_down_read(&dup_mmap_sem);
}
1239
/*
 * Release the read side of dup_mmap_sem taken by uprobe_start_dup_mmap().
 */
void uprobe_end_dup_mmap(void)
{
        percpu_up_read(&dup_mmap_sem);
}
1244
1245void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
1246{
1247        newmm->uprobes_state.xol_area = NULL;
1248
1249        if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
1250                set_bit(MMF_HAS_UPROBES, &newmm->flags);
1251                /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
1252                set_bit(MMF_RECALC_UPROBES, &newmm->flags);
1253        }
1254}
1255
1256/*
1257 *  - search for a free slot.
1258 */
1259static unsigned long xol_take_insn_slot(struct xol_area *area)
1260{
1261        unsigned long slot_addr;
1262        int slot_nr;
1263
1264        do {
1265                slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
1266                if (slot_nr < UINSNS_PER_PAGE) {
1267                        if (!test_and_set_bit(slot_nr, area->bitmap))
1268                                break;
1269
1270                        slot_nr = UINSNS_PER_PAGE;
1271                        continue;
1272                }
1273                wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
1274        } while (slot_nr >= UINSNS_PER_PAGE);
1275
1276        slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
1277        atomic_inc(&area->slot_count);
1278
1279        return slot_addr;
1280}
1281
1282/*
1283 * xol_get_insn_slot - allocate a slot for xol.
1284 * Returns the allocated slot address or 0.
1285 */
1286static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1287{
1288        struct xol_area *area;
1289        unsigned long xol_vaddr;
1290
1291        area = get_xol_area();
1292        if (!area)
1293                return 0;
1294
1295        xol_vaddr = xol_take_insn_slot(area);
1296        if (unlikely(!xol_vaddr))
1297                return 0;
1298
1299        arch_uprobe_copy_ixol(area->page, xol_vaddr,
1300                              &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1301
1302        return xol_vaddr;
1303}
1304
1305/*
1306 * xol_free_insn_slot - If slot was earlier allocated by
1307 * @xol_get_insn_slot(), make the slot available for
1308 * subsequent requests.
1309 */
1310static void xol_free_insn_slot(struct task_struct *tsk)
1311{
1312        struct xol_area *area;
1313        unsigned long vma_end;
1314        unsigned long slot_addr;
1315
1316        if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
1317                return;
1318
1319        slot_addr = tsk->utask->xol_vaddr;
1320        if (unlikely(!slot_addr))
1321                return;
1322
1323        area = tsk->mm->uprobes_state.xol_area;
1324        vma_end = area->vaddr + PAGE_SIZE;
1325        if (area->vaddr <= slot_addr && slot_addr < vma_end) {
1326                unsigned long offset;
1327                int slot_nr;
1328
1329                offset = slot_addr - area->vaddr;
1330                slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
1331                if (slot_nr >= UINSNS_PER_PAGE)
1332                        return;
1333
1334                clear_bit(slot_nr, area->bitmap);
1335                atomic_dec(&area->slot_count);
1336                if (waitqueue_active(&area->wq))
1337                        wake_up(&area->wq);
1338
1339                tsk->utask->xol_vaddr = 0;
1340        }
1341}
1342
/*
 * Default (weak) implementation: copy @len bytes from @src into @page at
 * the location given by @vaddr, then flush the page's dcache.
 * An architecture can override this function.
 */
void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
                                  void *src, unsigned long len)
{
        /* Initialize the slot */
        copy_to_page(page, vaddr, src, len);

        /*
         * We probably need flush_icache_user_range() but it needs vma.
         * This should work on most of architectures by default. If
         * architecture needs to do something different it can define
         * its own version of the function.
         */
        flush_dcache_page(page);
}
1357
/**
 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
 * @regs: Reflects the saved state of the task after it has hit a breakpoint
 * instruction.
 *
 * This weak default subtracts UPROBE_SWBP_INSN_SIZE from the instruction
 * pointer found in @regs.
 *
 * Return the address of the breakpoint instruction.
 */
unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
{
        return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
}
1368
1369unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
1370{
1371        struct uprobe_task *utask = current->utask;
1372
1373        if (unlikely(utask && utask->active_uprobe))
1374                return utask->vaddr;
1375
1376        return instruction_pointer(regs);
1377}
1378
1379/*
1380 * Called with no locks held.
1381 * Called in context of a exiting or a exec-ing thread.
1382 */
1383void uprobe_free_utask(struct task_struct *t)
1384{
1385        struct uprobe_task *utask = t->utask;
1386        struct return_instance *ri, *tmp;
1387
1388        if (!utask)
1389                return;
1390
1391        if (utask->active_uprobe)
1392                put_uprobe(utask->active_uprobe);
1393
1394        ri = utask->return_instances;
1395        while (ri) {
1396                tmp = ri;
1397                ri = ri->next;
1398
1399                put_uprobe(tmp->uprobe);
1400                kfree(tmp);
1401        }
1402
1403        xol_free_insn_slot(t);
1404        kfree(utask);
1405        t->utask = NULL;
1406}
1407
1408/*
1409 * Allocate a uprobe_task object for the task if if necessary.
1410 * Called when the thread hits a breakpoint.
1411 *
1412 * Returns:
1413 * - pointer to new uprobe_task on success
1414 * - NULL otherwise
1415 */
1416static struct uprobe_task *get_utask(void)
1417{
1418        if (!current->utask)
1419                current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1420        return current->utask;
1421}
1422
1423static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
1424{
1425        struct uprobe_task *n_utask;
1426        struct return_instance **p, *o, *n;
1427
1428        n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1429        if (!n_utask)
1430                return -ENOMEM;
1431        t->utask = n_utask;
1432
1433        p = &n_utask->return_instances;
1434        for (o = o_utask->return_instances; o; o = o->next) {
1435                n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
1436                if (!n)
1437                        return -ENOMEM;
1438
1439                *n = *o;
1440                atomic_inc(&n->uprobe->ref);
1441                n->next = NULL;
1442
1443                *p = n;
1444                p = &n->next;
1445                n_utask->depth++;
1446        }
1447
1448        return 0;
1449}
1450
/*
 * Log "uprobe: <comm>:<pid> failed to <msg>" for the current task.
 *
 * NOTE(review): @t is not used; the message always names current, even
 * when the caller passes a different task — confirm this is intended.
 */
static void uprobe_warn(struct task_struct *t, const char *msg)
{
        pr_warn("uprobe: %s:%d failed to %s\n",
                        current->comm, current->pid, msg);
}
1456
1457static void dup_xol_work(struct callback_head *work)
1458{
1459        if (current->flags & PF_EXITING)
1460                return;
1461
1462        if (!__create_xol_area(current->utask->dup_xol_addr))
1463                uprobe_warn(current, "dup xol area");
1464}
1465
1466/*
1467 * Called in context of a new clone/fork from copy_process.
1468 */
1469void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1470{
1471        struct uprobe_task *utask = current->utask;
1472        struct mm_struct *mm = current->mm;
1473        struct xol_area *area;
1474
1475        t->utask = NULL;
1476
1477        if (!utask || !utask->return_instances)
1478                return;
1479
1480        if (mm == t->mm && !(flags & CLONE_VFORK))
1481                return;
1482
1483        if (dup_utask(t, utask))
1484                return uprobe_warn(t, "dup ret instances");
1485
1486        /* The task can fork() after dup_xol_work() fails */
1487        area = mm->uprobes_state.xol_area;
1488        if (!area)
1489                return uprobe_warn(t, "dup xol area");
1490
1491        if (mm == t->mm)
1492                return;
1493
1494        t->utask->dup_xol_addr = area->vaddr;
1495        init_task_work(&t->utask->dup_xol_work, dup_xol_work);
1496        task_work_add(t, &t->utask->dup_xol_work, true);
1497}
1498
1499/*
1500 * Current area->vaddr notion assume the trampoline address is always
1501 * equal area->vaddr.
1502 *
1503 * Returns -1 in case the xol_area is not allocated.
1504 */
1505static unsigned long get_trampoline_vaddr(void)
1506{
1507        struct xol_area *area;
1508        unsigned long trampoline_vaddr = -1;
1509
1510        area = current->mm->uprobes_state.xol_area;
1511        smp_read_barrier_depends();
1512        if (area)
1513                trampoline_vaddr = area->vaddr;
1514
1515        return trampoline_vaddr;
1516}
1517
1518static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
1519{
1520        struct return_instance *ri;
1521        struct uprobe_task *utask;
1522        unsigned long orig_ret_vaddr, trampoline_vaddr;
1523        bool chained = false;
1524
1525        if (!get_xol_area())
1526                return;
1527
1528        utask = get_utask();
1529        if (!utask)
1530                return;
1531
1532        if (utask->depth >= MAX_URETPROBE_DEPTH) {
1533                printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
1534                                " nestedness limit pid/tgid=%d/%d\n",
1535                                current->pid, current->tgid);
1536                return;
1537        }
1538
1539        ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
1540        if (!ri)
1541                goto fail;
1542
1543        trampoline_vaddr = get_trampoline_vaddr();
1544        orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
1545        if (orig_ret_vaddr == -1)
1546                goto fail;
1547
1548        /*
1549         * We don't want to keep trampoline address in stack, rather keep the
1550         * original return address of first caller thru all the consequent
1551         * instances. This also makes breakpoint unwrapping easier.
1552         */
1553        if (orig_ret_vaddr == trampoline_vaddr) {
1554                if (!utask->return_instances) {
1555                        /*
1556                         * This situation is not possible. Likely we have an
1557                         * attack from user-space.
1558                         */
1559                        pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
1560                                                current->pid, current->tgid);
1561                        goto fail;
1562                }
1563
1564                chained = true;
1565                orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
1566        }
1567
1568        atomic_inc(&uprobe->ref);
1569        ri->uprobe = uprobe;
1570        ri->func = instruction_pointer(regs);
1571        ri->orig_ret_vaddr = orig_ret_vaddr;
1572        ri->chained = chained;
1573
1574        utask->depth++;
1575
1576        /* add instance to the stack */
1577        ri->next = utask->return_instances;
1578        utask->return_instances = ri;
1579
1580        return;
1581
1582 fail:
1583        kfree(ri);
1584}
1585
1586/* Prepare to single-step probed instruction out of line. */
1587static int
1588pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
1589{
1590        struct uprobe_task *utask;
1591        unsigned long xol_vaddr;
1592        int err;
1593
1594        utask = get_utask();
1595        if (!utask)
1596                return -ENOMEM;
1597
1598        xol_vaddr = xol_get_insn_slot(uprobe);
1599        if (!xol_vaddr)
1600                return -ENOMEM;
1601
1602        utask->xol_vaddr = xol_vaddr;
1603        utask->vaddr = bp_vaddr;
1604
1605        err = arch_uprobe_pre_xol(&uprobe->arch, regs);
1606        if (unlikely(err)) {
1607                xol_free_insn_slot(current);
1608                return err;
1609        }
1610
1611        utask->active_uprobe = uprobe;
1612        utask->state = UTASK_SSTEP;
1613        return 0;
1614}
1615
/*
 * If we are singlestepping, then ensure this thread is not connected to
 * non-fatal signals until completion of singlestep.  When xol insn itself
 * triggers the signal,  restart the original insn even if the task is
 * already SIGKILL'ed (since coredump should report the correct ip).  This
 * is even more important if the task has a handler for SIGSEGV/etc, The
 * _same_ instruction should be repeated again after return from the signal
 * handler, and SSTEP can never finish in this case.
 *
 * Returns false when the current task has no active uprobe, true otherwise.
 */
bool uprobe_deny_signal(void)
{
        struct task_struct *t = current;
        struct uprobe_task *utask = t->utask;

        if (likely(!utask || !utask->active_uprobe))
                return false;

        WARN_ON_ONCE(utask->state != UTASK_SSTEP);

        if (signal_pending(t)) {
                /* hide the pending signal; the flag is cleared under siglock */
                spin_lock_irq(&t->sighand->siglock);
                clear_tsk_thread_flag(t, TIF_SIGPENDING);
                spin_unlock_irq(&t->sighand->siglock);

                /*
                 * Fatal signal, or the xol insn itself trapped: mark the
                 * step as trapped and raise TIF_UPROBE.
                 */
                if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
                        utask->state = UTASK_SSTEP_TRAPPED;
                        set_tsk_thread_flag(t, TIF_UPROBE);
                }
        }

        return true;
}
1648
1649static void mmf_recalc_uprobes(struct mm_struct *mm)
1650{
1651        struct vm_area_struct *vma;
1652
1653        for (vma = mm->mmap; vma; vma = vma->vm_next) {
1654                if (!valid_vma(vma, false))
1655                        continue;
1656                /*
1657                 * This is not strictly accurate, we can race with
1658                 * uprobe_unregister() and see the already removed
1659                 * uprobe if delete_uprobe() was not yet called.
1660                 * Or this uprobe can be filtered out.
1661                 */
1662                if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
1663                        return;
1664        }
1665
1666        clear_bit(MMF_HAS_UPROBES, &mm->flags);
1667}
1668
1669static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
1670{
1671        struct page *page;
1672        uprobe_opcode_t opcode;
1673        int result;
1674
1675        pagefault_disable();
1676        result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
1677                                                        sizeof(opcode));
1678        pagefault_enable();
1679
1680        if (likely(result == 0))
1681                goto out;
1682
1683        result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
1684        if (result < 0)
1685                return result;
1686
1687        copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
1688        put_page(page);
1689 out:
1690        /* This needs to return true for any variant of the trap insn */
1691        return is_trap_insn(&opcode);
1692}
1693
1694static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1695{
1696        struct mm_struct *mm = current->mm;
1697        struct uprobe *uprobe = NULL;
1698        struct vm_area_struct *vma;
1699
1700        down_read(&mm->mmap_sem);
1701        vma = find_vma(mm, bp_vaddr);
1702        if (vma && vma->vm_start <= bp_vaddr) {
1703                if (valid_vma(vma, false)) {
1704                        struct inode *inode = file_inode(vma->vm_file);
1705                        loff_t offset = vaddr_to_offset(vma, bp_vaddr);
1706
1707                        uprobe = find_uprobe(inode, offset);
1708                }
1709
1710                if (!uprobe)
1711                        *is_swbp = is_trap_at_addr(mm, bp_vaddr);
1712        } else {
1713                *is_swbp = -EFAULT;
1714        }
1715
1716        if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
1717                mmf_recalc_uprobes(mm);
1718        up_read(&mm->mmap_sem);
1719
1720        return uprobe;
1721}
1722
1723static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1724{
1725        struct uprobe_consumer *uc;
1726        int remove = UPROBE_HANDLER_REMOVE;
1727        bool need_prep = false; /* prepare return uprobe, when needed */
1728
1729        down_read(&uprobe->register_rwsem);
1730        for (uc = uprobe->consumers; uc; uc = uc->next) {
1731                int rc = 0;
1732
1733                if (uc->handler) {
1734                        rc = uc->handler(uc, regs);
1735                        WARN(rc & ~UPROBE_HANDLER_MASK,
1736                                "bad rc=0x%x from %pf()\n", rc, uc->handler);
1737                }
1738
1739                if (uc->ret_handler)
1740                        need_prep = true;
1741
1742                remove &= rc;
1743        }
1744
1745        if (need_prep && !remove)
1746                prepare_uretprobe(uprobe, regs); /* put bp at return */
1747
1748        if (remove && uprobe->consumers) {
1749                WARN_ON(!uprobe_is_active(uprobe));
1750                unapply_uprobe(uprobe, current->mm);
1751        }
1752        up_read(&uprobe->register_rwsem);
1753}
1754
1755static void
1756handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
1757{
1758        struct uprobe *uprobe = ri->uprobe;
1759        struct uprobe_consumer *uc;
1760
1761        down_read(&uprobe->register_rwsem);
1762        for (uc = uprobe->consumers; uc; uc = uc->next) {
1763                if (uc->ret_handler)
1764                        uc->ret_handler(uc, ri->func, regs);
1765        }
1766        up_read(&uprobe->register_rwsem);
1767}
1768
1769static bool handle_trampoline(struct pt_regs *regs)
1770{
1771        struct uprobe_task *utask;
1772        struct return_instance *ri, *tmp;
1773        bool chained;
1774
1775        utask = current->utask;
1776        if (!utask)
1777                return false;
1778
1779        ri = utask->return_instances;
1780        if (!ri)
1781                return false;
1782
1783        /*
1784         * TODO: we should throw out return_instance's invalidated by
1785         * longjmp(), currently we assume that the probed function always
1786         * returns.
1787         */
1788        instruction_pointer_set(regs, ri->orig_ret_vaddr);
1789
1790        for (;;) {
1791                handle_uretprobe_chain(ri, regs);
1792
1793                chained = ri->chained;
1794                put_uprobe(ri->uprobe);
1795
1796                tmp = ri;
1797                ri = ri->next;
1798                kfree(tmp);
1799                utask->depth--;
1800
1801                if (!chained)
1802                        break;
1803                BUG_ON(!ri);
1804        }
1805
1806        utask->return_instances = ri;
1807
1808        return true;
1809}
1810
/*
 * Weak default: never ignore a probe hit.  handle_swbp() skips the
 * handlers and single-stepping when this returns true.
 */
bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
{
        return false;
}
1815
/*
 * Run handler and ask thread to singlestep.
 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
 *
 * A hit at the trampoline address is handed to handle_trampoline() first.
 * Otherwise the uprobe for the breakpoint address is looked up, its
 * handlers are run and, unless the architecture can skip the step, the
 * out-of-line single-step is prepared by pre_ssout().
 */
static void handle_swbp(struct pt_regs *regs)
{
        struct uprobe *uprobe;
        unsigned long bp_vaddr;
        int uninitialized_var(is_swbp);

        bp_vaddr = uprobe_get_swbp_addr(regs);
        if (bp_vaddr == get_trampoline_vaddr()) {
                if (handle_trampoline(regs))
                        return;

                /* not handled: warn and fall through to the normal lookup */
                pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
                                                current->pid, current->tgid);
        }

        uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
        if (!uprobe) {
                if (is_swbp > 0) {
                        /* No matching uprobe; signal SIGTRAP. */
                        send_sig(SIGTRAP, current, 0);
                } else {
                        /*
                         * Either we raced with uprobe_unregister() or we can't
                         * access this memory. The latter is only possible if
                         * another thread plays with our ->mm. In both cases
                         * we can simply restart. If this vma was unmapped we
                         * can pretend this insn was not executed yet and get
                         * the (correct) SIGSEGV after restart.
                         */
                        instruction_pointer_set(regs, bp_vaddr);
                }
                return;
        }

        /* change it in advance for ->handler() and restart */
        instruction_pointer_set(regs, bp_vaddr);

        /*
         * TODO: move copy_insn/etc into _register and remove this hack.
         * After we hit the bp, _unregister + _register can install the
         * new and not-yet-analyzed uprobe at the same address, restart.
         */
        smp_rmb(); /* pairs with wmb() in install_breakpoint() */
        if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
                goto out;

        /* Tracing handlers use ->utask to communicate with fetch methods */
        if (!get_utask())
                goto out;

        if (arch_uprobe_ignore(&uprobe->arch, regs))
                goto out;

        handler_chain(uprobe, regs);

        if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
                goto out;

        /*
         * On success there is no put_uprobe() here: pre_ssout() recorded
         * the uprobe in utask->active_uprobe.
         */
        if (!pre_ssout(uprobe, regs, bp_vaddr))
                return;

        /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
        put_uprobe(uprobe);
}
1885
1886/*
1887 * Perform required fix-ups and disable singlestep.
1888 * Allow pending signals to take effect.
1889 */
1890static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1891{
1892        struct uprobe *uprobe;
1893        int err = 0;
1894
1895        uprobe = utask->active_uprobe;
1896        if (utask->state == UTASK_SSTEP_ACK)
1897                err = arch_uprobe_post_xol(&uprobe->arch, regs);
1898        else if (utask->state == UTASK_SSTEP_TRAPPED)
1899                arch_uprobe_abort_xol(&uprobe->arch, regs);
1900        else
1901                WARN_ON_ONCE(1);
1902
1903        put_uprobe(uprobe);
1904        utask->active_uprobe = NULL;
1905        utask->state = UTASK_RUNNING;
1906        xol_free_insn_slot(current);
1907
1908        spin_lock_irq(&current->sighand->siglock);
1909        recalc_sigpending(); /* see uprobe_deny_signal() */
1910        spin_unlock_irq(&current->sighand->siglock);
1911
1912        if (unlikely(err)) {
1913                uprobe_warn(current, "execute the probed insn, sending SIGILL.");
1914                force_sig_info(SIGILL, SEND_SIG_FORCED, current);
1915        }
1916}
1917
1918/*
1919 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
1920 * allows the thread to return from interrupt. After that handle_swbp()
1921 * sets utask->active_uprobe.
1922 *
1923 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
1924 * and allows the thread to return from interrupt.
1925 *
1926 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
1927 * uprobe_notify_resume().
1928 */
1929void uprobe_notify_resume(struct pt_regs *regs)
1930{
1931        struct uprobe_task *utask;
1932
1933        clear_thread_flag(TIF_UPROBE);
1934
1935        utask = current->utask;
1936        if (utask && utask->active_uprobe)
1937                handle_singlestep(utask, regs);
1938        else
1939                handle_swbp(regs);
1940}
1941
1942/*
1943 * uprobe_pre_sstep_notifier gets called from interrupt context as part of
1944 * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
1945 */
1946int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1947{
1948        if (!current->mm)
1949                return 0;
1950
1951        if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
1952            (!current->utask || !current->utask->return_instances))
1953                return 0;
1954
1955        set_thread_flag(TIF_UPROBE);
1956        return 1;
1957}
1958
1959/*
1960 * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
1961 * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
1962 */
1963int uprobe_post_sstep_notifier(struct pt_regs *regs)
1964{
1965        struct uprobe_task *utask = current->utask;
1966
1967        if (!current->mm || !utask || !utask->active_uprobe)
1968                /* task is currently not uprobed */
1969                return 0;
1970
1971        utask->state = UTASK_SSTEP_ACK;
1972        set_thread_flag(TIF_UPROBE);
1973        return 1;
1974}
1975
1976static struct notifier_block uprobe_exception_nb = {
1977        .notifier_call          = arch_uprobe_exception_notify,
1978        .priority               = INT_MAX-1,    /* notified after kprobes, kgdb */
1979};
1980
1981static int __init init_uprobes(void)
1982{
1983        int i;
1984
1985        for (i = 0; i < UPROBES_HASH_SZ; i++)
1986                mutex_init(&uprobes_mmap_mutex[i]);
1987
1988        if (percpu_init_rwsem(&dup_mmap_sem))
1989                return -ENOMEM;
1990
1991        return register_die_notifier(&uprobe_exception_nb);
1992}
1993__initcall(init_uprobes);
1994