linux/kernel/futex.c
   1/*
   2 *  Fast Userspace Mutexes (which I call "Futexes!").
   3 *  (C) Rusty Russell, IBM 2002
   4 *
   5 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
   6 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
   7 *
   8 *  Removed page pinning, fix privately mapped COW pages and other cleanups
   9 *  (C) Copyright 2003, 2004 Jamie Lokier
  10 *
  11 *  Robust futex support started by Ingo Molnar
  12 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  13 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  14 *
  15 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
  16 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  17 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  18 *
  19 *  PRIVATE futexes by Eric Dumazet
  20 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
  21 *
  22 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
  23 *  Copyright (C) IBM Corporation, 2009
  24 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
  25 *
  26 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  27 *  enough at me, Linus for the original (flawed) idea, Matthew
  28 *  Kirkwood for proof-of-concept implementation.
  29 *
  30 *  "The futexes are also cursed."
  31 *  "But they come in a choice of three flavours!"
  32 *
  33 *  This program is free software; you can redistribute it and/or modify
  34 *  it under the terms of the GNU General Public License as published by
  35 *  the Free Software Foundation; either version 2 of the License, or
  36 *  (at your option) any later version.
  37 *
  38 *  This program is distributed in the hope that it will be useful,
  39 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  40 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  41 *  GNU General Public License for more details.
  42 *
  43 *  You should have received a copy of the GNU General Public License
  44 *  along with this program; if not, write to the Free Software
  45 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  46 */
  47#include <linux/slab.h>
  48#include <linux/poll.h>
  49#include <linux/fs.h>
  50#include <linux/file.h>
  51#include <linux/jhash.h>
  52#include <linux/init.h>
  53#include <linux/futex.h>
  54#include <linux/mount.h>
  55#include <linux/pagemap.h>
  56#include <linux/syscalls.h>
  57#include <linux/signal.h>
  58#include <linux/export.h>
  59#include <linux/magic.h>
  60#include <linux/pid.h>
  61#include <linux/nsproxy.h>
  62#include <linux/ptrace.h>
  63#include <linux/sched/rt.h>
  64#include <linux/freezer.h>
  65#include <linux/bootmem.h>
  66#include <linux/hugetlb.h>
  67
  68#include <asm/futex.h>
  69
  70#include "rtmutex_common.h"
  71
  72/*
  73 * Basic futex operation and ordering guarantees:
  74 *
  75 * The waiter reads the futex value in user space and calls
  76 * futex_wait(). This function computes the hash bucket and acquires
  77 * the hash bucket lock. After that it reads the futex user space value
  78 * again and verifies that the data has not changed. If it has not changed
  79 * it enqueues itself into the hash bucket, releases the hash bucket lock
  80 * and schedules.
  81 *
  82 * The waker side modifies the user space value of the futex and calls
  83 * futex_wake(). This function computes the hash bucket and acquires the
  84 * hash bucket lock. Then it looks for waiters on that futex in the hash
  85 * bucket and wakes them.
  86 *
  87 * In futex wake up scenarios where no tasks are blocked on a futex, taking
  88 * the hb spinlock can be avoided and the syscall can return right away. In
  89 * order for this optimization to work, ordering guarantees must exist so
  90 * that the waiter being added to the list is observed when the list is
  91 * concurrently checked by the waker, avoiding scenarios like the following:
  92 *
  93 * CPU 0                               CPU 1
  94 * val = *futex;
  95 * sys_futex(WAIT, futex, val);
  96 *   futex_wait(futex, val);
  97 *   uval = *futex;
  98 *                                     *futex = newval;
  99 *                                     sys_futex(WAKE, futex);
 100 *                                       futex_wake(futex);
 101 *                                       if (queue_empty())
 102 *                                         return;
 103 *   if (uval == val)
 104 *     lock(hash_bucket(futex));
 105 *     queue();
 106 *     unlock(hash_bucket(futex));
 107 *     schedule();
 108 *
 109 * This would cause the waiter on CPU 0 to wait forever because it
 110 * missed the transition of the user space value from val to newval
 111 * and the waker did not find the waiter in the hash bucket queue.
 112 *
 113 * The correct serialization ensures that a waiter either observes
 114 * the changed user space value before blocking or is woken by a
 115 * concurrent waker:
 116 *
 117 * CPU 0                                 CPU 1
 118 * val = *futex;
 119 * sys_futex(WAIT, futex, val);
 120 *   futex_wait(futex, val);
 121 *
 122 *   waiters++;
 123 *   mb(); (A) <-- paired with -.
 124 *                              |
 125 *   lock(hash_bucket(futex));  |
 126 *                              |
 127 *   uval = *futex;             |
 128 *                              |        *futex = newval;
 129 *                              |        sys_futex(WAKE, futex);
 130 *                              |          futex_wake(futex);
 131 *                              |
 132 *                              `------->  mb(); (B)
 133 *   if (uval == val)
 134 *     queue();
 135 *     unlock(hash_bucket(futex));
 136 *     schedule();                         if (waiters)
 137 *                                           lock(hash_bucket(futex));
 138 *                                           wake_waiters(futex);
 139 *                                           unlock(hash_bucket(futex));
 140 *
 141 * Where (A) orders the waiters increment and the futex value read -- this
 142 * is ensured by the full barrier in hb_waiters_inc(); and where (B)
 143 * orders the write to futex and the waiters read -- this is done by the
 144 * barriers for both shared and private futexes in get_futex_key_refs().
 145 *
 146 * This yields the following case (where X:=waiters, Y:=futex):
 147 *
 148 *      X = Y = 0
 149 *
 150 *      w[X]=1          w[Y]=1
 151 *      MB              MB
 152 *      r[Y]=y          r[X]=x
 153 *
 154 * Which guarantees that x==0 && y==0 is impossible; which translates back into
 155 * the guarantee that we cannot both miss the futex variable change and the
 156 * enqueue.
 157 */
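
/*
 * Editor's note (illustrative sketch, not part of the kernel source): the
 * wait/wake protocol described above looks roughly like this from user
 * space. The helpers sys_futex()/futex_lock()/futex_unlock() and the 0/1
 * encoding are hypothetical; only the FUTEX_WAIT/FUTEX_WAKE ops and the
 * in-kernel value re-check are the point.
 *
 *      #include <linux/futex.h>
 *      #include <stdatomic.h>
 *      #include <stdint.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *
 *      static long sys_futex(uint32_t *uaddr, int op, uint32_t val)
 *      {
 *              return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
 *      }
 *
 *      static void futex_lock(_Atomic uint32_t *f)     // 0 = free, 1 = held
 *      {
 *              uint32_t zero = 0;
 *
 *              while (!atomic_compare_exchange_strong(f, &zero, 1)) {
 *                      // futex_wait() re-reads *f under the hb lock; if the
 *                      // holder already stored 0, the call fails with EAGAIN
 *                      // instead of sleeping, so the wakeup cannot be missed.
 *                      sys_futex((uint32_t *)f, FUTEX_WAIT, 1);
 *                      zero = 0;
 *              }
 *      }
 *
 *      static void futex_unlock(_Atomic uint32_t *f)
 *      {
 *              atomic_store(f, 0);                       // *futex = newval
 *              sys_futex((uint32_t *)f, FUTEX_WAKE, 1);  // wake one waiter
 *      }
 */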
 158
 159int __read_mostly futex_cmpxchg_enabled;
 160
 161/*
 162 * Futex flags used to encode options to functions and preserve them across
 163 * restarts.
 164 */
 165#define FLAGS_SHARED            0x01
 166#define FLAGS_CLOCKRT           0x02
 167#define FLAGS_HAS_TIMEOUT       0x04
 168
 169/*
 170 * Priority Inheritance state:
 171 */
 172struct futex_pi_state {
 173        /*
 174         * list of 'owned' pi_state instances - these have to be
 175         * cleaned up in do_exit() if the task exits prematurely:
 176         */
 177        struct list_head list;
 178
 179        /*
 180         * The PI object:
 181         */
 182        struct rt_mutex pi_mutex;
 183
 184        struct task_struct *owner;
 185        atomic_t refcount;
 186
 187        union futex_key key;
 188};
 189
 190/**
 191 * struct futex_q - The hashed futex queue entry, one per waiting task
 192 * @list:               priority-sorted list of tasks waiting on this futex
 193 * @task:               the task waiting on the futex
 194 * @lock_ptr:           the hash bucket lock
 195 * @key:                the key the futex is hashed on
 196 * @pi_state:           optional priority inheritance state
 197 * @rt_waiter:          rt_waiter storage for use with requeue_pi
 198 * @requeue_pi_key:     the requeue_pi target futex key
 199 * @bitset:             bitset for the optional bitmasked wakeup
 200 *
 201 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
 202 * we can wake only the relevant ones (hashed queues may be shared).
 203 *
 204 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 205 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 206 * The order of wakeup is always to make the first condition true, then
 207 * the second.
 208 *
 209 * PI futexes are typically woken before they are removed from the hash list via
 210 * the rt_mutex code. See unqueue_me_pi().
 211 */
 212struct futex_q {
 213        struct plist_node list;
 214
 215        struct task_struct *task;
 216        spinlock_t *lock_ptr;
 217        union futex_key key;
 218        struct futex_pi_state *pi_state;
 219        struct rt_mutex_waiter *rt_waiter;
 220        union futex_key *requeue_pi_key;
 221        u32 bitset;
 222};
 223
 224static const struct futex_q futex_q_init = {
 225        /* list gets initialized in queue_me() */
 226        .key = FUTEX_KEY_INIT,
 227        .bitset = FUTEX_BITSET_MATCH_ANY
 228};
 229
 230/*
 231 * Hash buckets are shared by all the futex_keys that hash to the same
 232 * location.  Each key may have multiple futex_q structures, one for each task
 233 * waiting on a futex.
 234 */
 235struct futex_hash_bucket {
 236        atomic_t waiters;
 237        spinlock_t lock;
 238        struct plist_head chain;
 239} ____cacheline_aligned_in_smp;
 240
 241/*
 242 * The base of the bucket array and its size are always used together
 243 * (after initialization only in hash_futex()), so ensure that they
 244 * reside in the same cacheline.
 245 */
 246static struct {
 247        struct futex_hash_bucket *queues;
 248        unsigned long            hashsize;
 249} __futex_data __read_mostly __aligned(2*sizeof(long));
 250#define futex_queues   (__futex_data.queues)
 251#define futex_hashsize (__futex_data.hashsize)
 252
 253
 254static inline void futex_get_mm(union futex_key *key)
 255{
 256        atomic_inc(&key->private.mm->mm_count);
 257        /*
 258         * Ensure futex_get_mm() implies a full barrier such that
 259         * get_futex_key() implies a full barrier. This is relied upon
 260         * as full barrier (B), see the ordering comment above.
 261         */
 262        smp_mb__after_atomic_inc();
 263}
 264
 265/*
 266 * Reflects a new waiter being added to the waitqueue.
 267 */
 268static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
 269{
 270#ifdef CONFIG_SMP
 271        atomic_inc(&hb->waiters);
 272        /*
 273         * Full barrier (A), see the ordering comment above.
 274         */
 275        smp_mb__after_atomic_inc();
 276#endif
 277}
 278
 279/*
 280 * Reflects a waiter being removed from the waitqueue by wakeup
 281 * paths.
 282 */
 283static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
 284{
 285#ifdef CONFIG_SMP
 286        atomic_dec(&hb->waiters);
 287#endif
 288}
 289
 290static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
 291{
 292#ifdef CONFIG_SMP
 293        return atomic_read(&hb->waiters);
 294#else
 295        return 1;
 296#endif
 297}
 298
 299/*
 300 * We hash on the keys returned from get_futex_key (see below).
 301 */
 302static struct futex_hash_bucket *hash_futex(union futex_key *key)
 303{
 304        u32 hash = jhash2((u32*)&key->both.word,
 305                          (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
 306                          key->both.offset);
 307        return &futex_queues[hash & (futex_hashsize - 1)];
 308}
 309
 310/*
 311 * Return 1 if two futex_keys are equal, 0 otherwise.
 312 */
 313static inline int match_futex(union futex_key *key1, union futex_key *key2)
 314{
 315        return (key1 && key2
 316                && key1->both.word == key2->both.word
 317                && key1->both.ptr == key2->both.ptr
 318                && key1->both.offset == key2->both.offset);
 319}
 320
 321/*
 322 * Take a reference to the resource addressed by a key.
 323 * Can be called while holding spinlocks.
 324 *
 325 */
 326static void get_futex_key_refs(union futex_key *key)
 327{
 328        if (!key->both.ptr)
 329                return;
 330
 331        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 332        case FUT_OFF_INODE:
 333                ihold(key->shared.inode); /* implies MB (B) */
 334                break;
 335        case FUT_OFF_MMSHARED:
 336                futex_get_mm(key); /* implies MB (B) */
 337                break;
 338        default:
 339                /*
 340                 * Private futexes do not hold a reference on an inode or
 341                 * mm, therefore the only purpose of calling get_futex_key_refs()
 342                 * here is to provide the barrier for the lockless waiter check.
 343                 */
 344                smp_mb(); /* explicit MB (B) */
 345        }
 346}
 347
 348/*
 349 * Drop a reference to the resource addressed by a key.
 350 * The hash bucket spinlock must not be held. This is
 351 * a no-op for private futexes, see comment in the get
 352 * counterpart.
 353 */
 354static void drop_futex_key_refs(union futex_key *key)
 355{
 356        if (!key->both.ptr) {
 357                /* If we're here then we tried to put a key we failed to get */
 358                WARN_ON_ONCE(1);
 359                return;
 360        }
 361
 362        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 363        case FUT_OFF_INODE:
 364                iput(key->shared.inode);
 365                break;
 366        case FUT_OFF_MMSHARED:
 367                mmdrop(key->private.mm);
 368                break;
 369        }
 370}
 371
 372/**
 373 * get_futex_key() - Get parameters which are the keys for a futex
 374 * @uaddr:      virtual address of the futex
 375 * @fshared:    0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
 376 * @key:        address where result is stored.
 377 * @rw:         mapping needs to be read/write (values: VERIFY_READ,
 378 *              VERIFY_WRITE)
 379 *
 380 * Return: a negative error code or 0
 381 *
 382 * The key words are stored in *key on success.
 383 *
 384 * For shared mappings, it's (page->index, file_inode(vma->vm_file),
 385 * offset_within_page).  For private mappings, it's (uaddr, current->mm).
 386 * We can usually work out the index without swapping in the page.
 387 *
 388 * lock_page() might sleep, the caller should not hold a spinlock.
 389 */
 390static int
 391get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 392{
 393        unsigned long address = (unsigned long)uaddr;
 394        struct mm_struct *mm = current->mm;
 395        struct page *page, *page_head;
 396        int err, ro = 0;
 397
 398        /*
 399         * The futex address must be "naturally" aligned.
 400         */
 401        key->both.offset = address % PAGE_SIZE;
 402        if (unlikely((address % sizeof(u32)) != 0))
 403                return -EINVAL;
 404        address -= key->both.offset;
 405
 406        if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
 407                return -EFAULT;
 408
 409        /*
 410         * PROCESS_PRIVATE futexes are fast.
 411         * As the mm cannot disappear under us and the 'key' only needs the
 412         * virtual address, we don't even have to find the underlying vma.
 413         * Note: We do have to check that 'uaddr' is a valid user address,
 414         *       but access_ok() should be faster than find_vma().
 415         */
 416        if (!fshared) {
 417                key->private.mm = mm;
 418                key->private.address = address;
 419                get_futex_key_refs(key);  /* implies MB (B) */
 420                return 0;
 421        }
 422
 423again:
 424        err = get_user_pages_fast(address, 1, 1, &page);
 425        /*
 426         * If write access is not required (e.g. FUTEX_WAIT), try
 427         * to get read-only access.
 428         */
 429        if (err == -EFAULT && rw == VERIFY_READ) {
 430                err = get_user_pages_fast(address, 1, 0, &page);
 431                ro = 1;
 432        }
 433        if (err < 0)
 434                return err;
 435        else
 436                err = 0;
 437
 438#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 439        page_head = page;
 440        if (unlikely(PageTail(page))) {
 441                put_page(page);
 442                /* serialize against __split_huge_page_splitting() */
 443                local_irq_disable();
 444                if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
 445                        page_head = compound_head(page);
 446                        /*
 447                         * page_head is valid pointer but we must pin
 448                         * it before taking the PG_lock and/or
 449                         * PG_compound_lock. The moment we re-enable
 450                         * irqs __split_huge_page_splitting() can
 451                         * return and the head page can be freed from
 452                         * under us. We can't take the PG_lock and/or
 453                         * PG_compound_lock on a page that could be
 454                         * freed from under us.
 455                         */
 456                        if (page != page_head) {
 457                                get_page(page_head);
 458                                put_page(page);
 459                        }
 460                        local_irq_enable();
 461                } else {
 462                        local_irq_enable();
 463                        goto again;
 464                }
 465        }
 466#else
 467        page_head = compound_head(page);
 468        if (page != page_head) {
 469                get_page(page_head);
 470                put_page(page);
 471        }
 472#endif
 473
 474        lock_page(page_head);
 475
 476        /*
 477         * If page_head->mapping is NULL, then it cannot be a PageAnon
 478         * page; but it might be the ZERO_PAGE or in the gate area or
 479         * in a special mapping (all cases which we are happy to fail);
 480         * or it may have been a good file page when get_user_pages_fast
 481         * found it, but truncated or holepunched or subjected to
 482         * invalidate_complete_page2 before we got the page lock (also
 483         * cases which we are happy to fail).  And we hold a reference,
 484         * so refcount care in invalidate_complete_page's remove_mapping
 485         * prevents drop_caches from setting mapping to NULL beneath us.
 486         *
 487         * The case we do have to guard against is when memory pressure made
 488         * shmem_writepage move it from filecache to swapcache beneath us:
 489         * an unlikely race, but we do need to retry for page_head->mapping.
 490         */
 491        if (!page_head->mapping) {
 492                int shmem_swizzled = PageSwapCache(page_head);
 493                unlock_page(page_head);
 494                put_page(page_head);
 495                if (shmem_swizzled)
 496                        goto again;
 497                return -EFAULT;
 498        }
 499
 500        /*
 501         * Private mappings are handled in a simple way.
 502         *
 503         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 504         * it's a read-only handle, it's expected that futexes attach to
 505         * the object not the particular process.
 506         */
 507        if (PageAnon(page_head)) {
 508                /*
 509                 * A RO anonymous page will never change and thus doesn't make
 510                 * sense for futex operations.
 511                 */
 512                if (ro) {
 513                        err = -EFAULT;
 514                        goto out;
 515                }
 516
 517                key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 518                key->private.mm = mm;
 519                key->private.address = address;
 520        } else {
 521                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 522                key->shared.inode = page_head->mapping->host;
 523                key->shared.pgoff = basepage_index(page);
 524        }
 525
 526        get_futex_key_refs(key); /* implies MB (B) */
 527
 528out:
 529        unlock_page(page_head);
 530        put_page(page_head);
 531        return err;
 532}
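
/*
 * Editor's note (illustrative sketch, not part of the kernel source): the two
 * key flavours above correspond to the FUTEX_PRIVATE_FLAG in the futex op.
 * A futex word in a MAP_SHARED mapping must use the shared (inode-keyed) form
 * so that other processes mapping the same file hash to the same bucket.
 * Here fd and val are assumed to be set up elsewhere:
 *
 *      uint32_t *f = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *                         MAP_SHARED, fd, 0);
 *      // cross-process: key is (inode, pgoff, offset)
 *      syscall(SYS_futex, f, FUTEX_WAIT, val, NULL, NULL, 0);
 *
 *      static uint32_t g;
 *      // same process only: key is (current->mm, address, offset)
 *      syscall(SYS_futex, &g, FUTEX_WAIT_PRIVATE, val, NULL, NULL, 0);
 */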
 533
 534static inline void put_futex_key(union futex_key *key)
 535{
 536        drop_futex_key_refs(key);
 537}
 538
 539/**
 540 * fault_in_user_writeable() - Fault in user address and verify RW access
 541 * @uaddr:      pointer to faulting user space address
 542 *
 543 * Slow path to fixup the fault we just took in the atomic write
 544 * access to @uaddr.
 545 *
 546 * We have no generic implementation of a non-destructive write to the
 547 * user address. We know that we faulted in the atomic,
 548 * pagefault-disabled section, so we might as well avoid the #PF
 549 * overhead by calling fixup_user_fault() right away.
 550 */
 551static int fault_in_user_writeable(u32 __user *uaddr)
 552{
 553        struct mm_struct *mm = current->mm;
 554        int ret;
 555
 556        down_read(&mm->mmap_sem);
 557        ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
 558                               FAULT_FLAG_WRITE);
 559        up_read(&mm->mmap_sem);
 560
 561        return ret < 0 ? ret : 0;
 562}
 563
 564/**
 565 * futex_top_waiter() - Return the highest priority waiter on a futex
 566 * @hb:         the hash bucket the futex_q's reside in
 567 * @key:        the futex key (to distinguish it from futex_q's of other futexes)
 568 *
 569 * Must be called with the hb lock held.
 570 */
 571static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
 572                                        union futex_key *key)
 573{
 574        struct futex_q *this;
 575
 576        plist_for_each_entry(this, &hb->chain, list) {
 577                if (match_futex(&this->key, key))
 578                        return this;
 579        }
 580        return NULL;
 581}
 582
 583static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
 584                                      u32 uval, u32 newval)
 585{
 586        int ret;
 587
 588        pagefault_disable();
 589        ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
 590        pagefault_enable();
 591
 592        return ret;
 593}
 594
 595static int get_futex_value_locked(u32 *dest, u32 __user *from)
 596{
 597        int ret;
 598
 599        pagefault_disable();
 600        ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
 601        pagefault_enable();
 602
 603        return ret ? -EFAULT : 0;
 604}
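
/*
 * Editor's note (illustrative sketch, not part of the kernel source): the two
 * helpers above are used while holding the hb lock, where faulting in the
 * page (which may sleep) is not allowed. The usual pattern, roughly as in
 * the futex_wake_op()/futex_lock_pi() style paths, is:
 *
 *      spin_lock(&hb->lock);
 *      ret = get_futex_value_locked(&curval, uaddr);   // -EFAULT, never sleeps
 *      if (ret) {
 *              spin_unlock(&hb->lock);
 *              ret = fault_in_user_writeable(uaddr);   // may sleep
 *              if (!ret)
 *                      goto retry;                     // re-take the lock
 *      }
 */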
 605
 606
 607/*
 608 * PI code:
 609 */
 610static int refill_pi_state_cache(void)
 611{
 612        struct futex_pi_state *pi_state;
 613
 614        if (likely(current->pi_state_cache))
 615                return 0;
 616
 617        pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
 618
 619        if (!pi_state)
 620                return -ENOMEM;
 621
 622        INIT_LIST_HEAD(&pi_state->list);
 623        /* pi_mutex gets initialized later */
 624        pi_state->owner = NULL;
 625        atomic_set(&pi_state->refcount, 1);
 626        pi_state->key = FUTEX_KEY_INIT;
 627
 628        current->pi_state_cache = pi_state;
 629
 630        return 0;
 631}
 632
 633static struct futex_pi_state * alloc_pi_state(void)
 634{
 635        struct futex_pi_state *pi_state = current->pi_state_cache;
 636
 637        WARN_ON(!pi_state);
 638        current->pi_state_cache = NULL;
 639
 640        return pi_state;
 641}
 642
 643static void free_pi_state(struct futex_pi_state *pi_state)
 644{
 645        if (!atomic_dec_and_test(&pi_state->refcount))
 646                return;
 647
 648        /*
 649         * If pi_state->owner is NULL, the owner is most probably dying
 650         * and has cleaned up the pi_state already
 651         */
 652        if (pi_state->owner) {
 653                raw_spin_lock_irq(&pi_state->owner->pi_lock);
 654                list_del_init(&pi_state->list);
 655                raw_spin_unlock_irq(&pi_state->owner->pi_lock);
 656
 657                rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
 658        }
 659
 660        if (current->pi_state_cache)
 661                kfree(pi_state);
 662        else {
 663                /*
 664                 * pi_state->list is already empty.
 665                 * clear pi_state->owner.
 666                 * refcount is at 0 - put it back to 1.
 667                 */
 668                pi_state->owner = NULL;
 669                atomic_set(&pi_state->refcount, 1);
 670                current->pi_state_cache = pi_state;
 671        }
 672}
 673
 674/*
 675 * Look up the task based on what TID userspace gave us.
 676 * We don't trust it.
 677 */
 678static struct task_struct * futex_find_get_task(pid_t pid)
 679{
 680        struct task_struct *p;
 681
 682        rcu_read_lock();
 683        p = find_task_by_vpid(pid);
 684        if (p)
 685                get_task_struct(p);
 686
 687        rcu_read_unlock();
 688
 689        return p;
 690}
 691
 692/*
 693 * This task is holding PI mutexes at exit time => bad.
 694 * Kernel cleans up PI-state, but userspace is likely hosed.
 695 * (Robust-futex cleanup is separate and might save the day for userspace.)
 696 */
 697void exit_pi_state_list(struct task_struct *curr)
 698{
 699        struct list_head *next, *head = &curr->pi_state_list;
 700        struct futex_pi_state *pi_state;
 701        struct futex_hash_bucket *hb;
 702        union futex_key key = FUTEX_KEY_INIT;
 703
 704        if (!futex_cmpxchg_enabled)
 705                return;
 706        /*
 707         * We are a ZOMBIE and nobody can enqueue itself on
 708         * pi_state_list anymore, but we have to be careful
 709         * versus waiters unqueueing themselves:
 710         */
 711        raw_spin_lock_irq(&curr->pi_lock);
 712        while (!list_empty(head)) {
 713
 714                next = head->next;
 715                pi_state = list_entry(next, struct futex_pi_state, list);
 716                key = pi_state->key;
 717                hb = hash_futex(&key);
 718                raw_spin_unlock_irq(&curr->pi_lock);
 719
 720                spin_lock(&hb->lock);
 721
 722                raw_spin_lock_irq(&curr->pi_lock);
 723                /*
 724                 * We dropped the pi-lock, so re-check whether this
 725                 * task still owns the PI-state:
 726                 */
 727                if (head->next != next) {
 728                        spin_unlock(&hb->lock);
 729                        continue;
 730                }
 731
 732                WARN_ON(pi_state->owner != curr);
 733                WARN_ON(list_empty(&pi_state->list));
 734                list_del_init(&pi_state->list);
 735                pi_state->owner = NULL;
 736                raw_spin_unlock_irq(&curr->pi_lock);
 737
 738                rt_mutex_unlock(&pi_state->pi_mutex);
 739
 740                spin_unlock(&hb->lock);
 741
 742                raw_spin_lock_irq(&curr->pi_lock);
 743        }
 744        raw_spin_unlock_irq(&curr->pi_lock);
 745}
 746
 747/*
 748 * We need to check the following states:
 749 *
 750 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 751 *
 752 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 753 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 754 *
 755 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 756 *
 757 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 758 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 759 *
 760 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 761 *
 762 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 763 *
 764 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 765 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 766 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 767 *
 768 * [1]  Indicates that the kernel can acquire the futex atomically. We
 769 *      came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 770 *
 771 * [2]  Valid, if TID does not belong to a kernel thread. If no matching
 772 *      thread is found then it indicates that the owner TID has died.
 773 *
 774 * [3]  Invalid. The waiter is queued on a non PI futex
 775 *
 776 * [4]  Valid state after exit_robust_list(), which sets the user space
 777 *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 778 *
 779 * [5]  The user space value got manipulated between exit_robust_list()
 780 *      and exit_pi_state_list()
 781 *
 782 * [6]  Valid state after exit_pi_state_list() which sets the new owner in
 783 *      the pi_state but cannot access the user space value.
 784 *
 785 * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 786 *
 787 * [8]  Owner and user space value match
 788 *
 789 * [9]  There is no transient state which sets the user space TID to 0
 790 *      except exit_robust_list(), but this is indicated by the
 791 *      FUTEX_OWNER_DIED bit. See [4]
 792 *
 793 * [10] There is no transient state which leaves owner and user space
 794 *      TID out of sync.
 795 */
 796static int
 797lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 798                union futex_key *key, struct futex_pi_state **ps,
 799                struct task_struct *task)
 800{
 801        struct futex_pi_state *pi_state = NULL;
 802        struct futex_q *this, *next;
 803        struct task_struct *p;
 804        pid_t pid = uval & FUTEX_TID_MASK;
 805
 806        plist_for_each_entry_safe(this, next, &hb->chain, list) {
 807                if (match_futex(&this->key, key)) {
 808                        /*
 809                         * Sanity check the waiter before increasing
 810                         * the refcount and attaching to it.
 811                         */
 812                        pi_state = this->pi_state;
 813                        /*
 814                         * Userspace might have messed up non-PI and
 815                         * PI futexes [3]
 816                         */
 817                        if (unlikely(!pi_state))
 818                                return -EINVAL;
 819
 820                        WARN_ON(!atomic_read(&pi_state->refcount));
 821
 822                        /*
 823                         * Handle the owner died case:
 824                         */
 825                        if (uval & FUTEX_OWNER_DIED) {
 826                                /*
 827                                 * exit_pi_state_list sets owner to NULL and
 828                                 * wakes the topmost waiter. The task which
 829                                 * acquires the pi_state->rt_mutex will fixup
 830                                 * owner.
 831                                 */
 832                                if (!pi_state->owner) {
 833                                        /*
 834                                         * No pi state owner, but the user
 835                                         * space TID is not 0. Inconsistent
 836                                         * state. [5]
 837                                         */
 838                                        if (pid)
 839                                                return -EINVAL;
 840                                        /*
 841                                         * Take a ref on the state and
 842                                         * return. [4]
 843                                         */
 844                                        goto out_state;
 845                                }
 846
 847                                /*
 848                                 * If TID is 0, then either the dying owner
 849                                 * has not yet executed exit_pi_state_list()
 850                                 * or some waiter acquired the rtmutex in the
 851                                 * pi state, but did not yet fixup the TID in
 852                                 * user space.
 853                                 *
 854                                 * Take a ref on the state and return. [6]
 855                                 */
 856                                if (!pid)
 857                                        goto out_state;
 858                        } else {
 859                                /*
 860                                 * If the owner died bit is not set,
 861                                 * then the pi_state must have an
 862                                 * owner. [7]
 863                                 */
 864                                if (!pi_state->owner)
 865                                        return -EINVAL;
 866                        }
 867
 868                        /*
 869                         * Bail out if user space manipulated the
 870                         * futex value. If pi state exists then the
 871                         * owner TID must be the same as the user
 872                         * space TID. [9/10]
 873                         */
 874                        if (pid != task_pid_vnr(pi_state->owner))
 875                                return -EINVAL;
 876
 877                out_state:
 878                        atomic_inc(&pi_state->refcount);
 879                        *ps = pi_state;
 880                        return 0;
 881                }
 882        }
 883
 884        /*
 885         * We are the first waiter - try to look up the real owner and attach
 886         * the new pi_state to it, but bail out when TID = 0 [1]
 887         */
 888        if (!pid)
 889                return -ESRCH;
 890        p = futex_find_get_task(pid);
 891        if (!p)
 892                return -ESRCH;
 893
 894        if (!p->mm) {
 895                put_task_struct(p);
 896                return -EPERM;
 897        }
 898
 899        /*
 900         * We need to look at the task state flags to figure out
 901         * whether the task is exiting. To protect against a concurrent
 902         * change of the task flags by do_exit(), we do this check
 903         * under p->pi_lock:
 904         */
 905        raw_spin_lock_irq(&p->pi_lock);
 906        if (unlikely(p->flags & PF_EXITING)) {
 907                /*
 908                 * The task is on the way out. When PF_EXITPIDONE is
 909                 * set, we know that the task has finished the
 910                 * cleanup:
 911                 */
 912                int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
 913
 914                raw_spin_unlock_irq(&p->pi_lock);
 915                put_task_struct(p);
 916                return ret;
 917        }
 918
 919        /*
 920         * No existing pi state. First waiter. [2]
 921         */
 922        pi_state = alloc_pi_state();
 923
 924        /*
 925         * Initialize the pi_mutex in locked state and make 'p'
 926         * the owner of it:
 927         */
 928        rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
 929
 930        /* Store the key for possible exit cleanups: */
 931        pi_state->key = *key;
 932
 933        WARN_ON(!list_empty(&pi_state->list));
 934        list_add(&pi_state->list, &p->pi_state_list);
 935        pi_state->owner = p;
 936        raw_spin_unlock_irq(&p->pi_lock);
 937
 938        put_task_struct(p);
 939
 940        *ps = pi_state;
 941
 942        return 0;
 943}
 944
 945/**
 946 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 947 * @uaddr:              the pi futex user address
 948 * @hb:                 the pi futex hash bucket
 949 * @key:                the futex key associated with uaddr and hb
 950 * @ps:                 the pi_state pointer where we store the result of the
 951 *                      lookup
 952 * @task:               the task to perform the atomic lock work for.  This will
 953 *                      be "current" except in the case of requeue pi.
 954 * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
 955 *
 956 * Return:
 957 *  0 - ready to wait;
 958 *  1 - acquired the lock;
 959 * <0 - error
 960 *
 961 * The hb->lock and futex_key refs shall be held by the caller.
 962 */
 963static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 964                                union futex_key *key,
 965                                struct futex_pi_state **ps,
 966                                struct task_struct *task, int set_waiters)
 967{
 968        int lock_taken, ret, force_take = 0;
 969        u32 uval, newval, curval, vpid = task_pid_vnr(task);
 970
 971retry:
 972        ret = lock_taken = 0;
 973
 974        /*
 975         * To avoid races, we attempt to take the lock here again
 976         * (by doing a 0 -> TID atomic cmpxchg), while holding all
 977         * the locks. It will most likely not succeed.
 978         */
 979        newval = vpid;
 980        if (set_waiters)
 981                newval |= FUTEX_WAITERS;
 982
 983        if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
 984                return -EFAULT;
 985
 986        /*
 987         * Detect deadlocks.
 988         */
 989        if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
 990                return -EDEADLK;
 991
 992        /*
 993         * Surprise - we got the lock, but we do not trust user space at all.
 994         */
 995        if (unlikely(!curval)) {
 996                /*
 997                 * We verify whether there is kernel state for this
 998                 * futex. If not, we can safely assume that the 0 ->
 999                 * TID transition is correct. If state exists, we do
1000                 * not bother to fixup the user space state as it was
1001                 * corrupted already.
1002                 */
1003                return futex_top_waiter(hb, key) ? -EINVAL : 1;
1004        }
1005
1006        uval = curval;
1007
1008        /*
1009         * Set the FUTEX_WAITERS flag, so the owner will know it has someone
1010         * to wake at the next unlock.
1011         */
1012        newval = curval | FUTEX_WAITERS;
1013
1014        /*
1015         * Should we force take the futex? See below.
1016         */
1017        if (unlikely(force_take)) {
1018                /*
1019                 * Keep the OWNER_DIED and the WAITERS bit and set the
1020                 * new TID value.
1021                 */
1022                newval = (curval & ~FUTEX_TID_MASK) | vpid;
1023                force_take = 0;
1024                lock_taken = 1;
1025        }
1026
1027        if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
1028                return -EFAULT;
1029        if (unlikely(curval != uval))
1030                goto retry;
1031
1032        /*
1033         * We took the lock due to a forced takeover.
1034         */
1035        if (unlikely(lock_taken))
1036                return 1;
1037
1038        /*
1039         * We don't have the lock. Look up the PI state (or create it if
1040         * we are the first waiter):
1041         */
1042        ret = lookup_pi_state(uval, hb, key, ps, task);
1043
1044        if (unlikely(ret)) {
1045                switch (ret) {
1046                case -ESRCH:
1047                        /*
1048                         * We failed to find an owner for this
1049                         * futex. So we have no pi_state to block
1050                         * on. This can happen in two cases:
1051                         *
1052                         * 1) The owner died
1053                         * 2) A stale FUTEX_WAITERS bit
1054                         *
1055                         * Re-read the futex value.
1056                         */
1057                        if (get_futex_value_locked(&curval, uaddr))
1058                                return -EFAULT;
1059
1060                        /*
1061                         * If the owner died or we have a stale
1062                         * WAITERS bit the owner TID in the user space
1063                         * futex is 0.
1064                         */
1065                        if (!(curval & FUTEX_TID_MASK)) {
1066                                force_take = 1;
1067                                goto retry;
1068                        }
1069                default:
1070                        break;
1071                }
1072        }
1073
1074        return ret;
1075}
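
/*
 * Editor's note (illustrative sketch, not part of the kernel source): the
 * 0 -> TID cmpxchg that futex_lock_pi_atomic() retries under the hb lock is
 * the same transition a PI mutex attempts in user space before entering the
 * kernel. The variable 'lock' is a hypothetical _Atomic uint32_t futex word:
 *
 *      uint32_t tid = syscall(SYS_gettid), expected = 0;
 *
 *      // fast path: uncontended 0 -> TID, no syscall needed
 *      if (!atomic_compare_exchange_strong(lock, &expected, tid))
 *              // slow path: the kernel sets FUTEX_WAITERS, queues us on the
 *              // pi_state->pi_mutex and boosts the current owner if needed
 *              syscall(SYS_futex, lock, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *
 *      // unlock: TID -> 0 only succeeds while FUTEX_WAITERS is clear
 *      expected = tid;
 *      if (!atomic_compare_exchange_strong(lock, &expected, 0))
 *              syscall(SYS_futex, lock, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 */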
1076
1077/**
1078 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
1079 * @q:  The futex_q to unqueue
1080 *
1081 * The q->lock_ptr must not be NULL and must be held by the caller.
1082 */
1083static void __unqueue_futex(struct futex_q *q)
1084{
1085        struct futex_hash_bucket *hb;
1086
1087        if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
1088            || WARN_ON(plist_node_empty(&q->list)))
1089                return;
1090
1091        hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
1092        plist_del(&q->list, &hb->chain);
1093        hb_waiters_dec(hb);
1094}
1095
1096/*
1097 * The hash bucket lock must be held when this is called.
1098 * Afterwards, the futex_q must not be accessed. Callers
1099 * must later call wake_up_q() for the actual
1100 * wakeups to occur.
1101 */
1102static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
1103{
1104        struct task_struct *p = q->task;
1105
1106        if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
1107                return;
1108
1109        /*
1110         * Queue the task for later wakeup, after we've released
1111         * the hb->lock. wake_q_add() grabs a reference to p.
1112         */
1113        wake_q_add(wake_q, p);
1114        __unqueue_futex(q);
1115        /*
1116         * The waiting task can free the futex_q as soon as
1117         * q->lock_ptr = NULL is written, without taking any locks. A
1118         * memory barrier is required here to prevent the following
1119         * store to lock_ptr from getting ahead of the plist_del.
1120         */
1121        smp_wmb();
1122        q->lock_ptr = NULL;
1123}
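
/*
 * Editor's note (illustrative sketch, not part of the kernel source): the
 * wake_q mechanism lets callers collect tasks to wake while holding hb->lock
 * and issue the actual wakeups only after the lock is dropped, exactly as
 * futex_wake() does below:
 *
 *      WAKE_Q(wake_q);
 *
 *      spin_lock(&hb->lock);
 *      mark_wake_futex(&wake_q, q);    // plist_del + wake_q_add
 *      spin_unlock(&hb->lock);
 *      wake_up_q(&wake_q);             // actual wakeups, outside the lock
 */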
1124
1125static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
1126{
1127        struct task_struct *new_owner;
1128        struct futex_pi_state *pi_state = this->pi_state;
1129        u32 uninitialized_var(curval), newval;
1130        int ret = 0;
1131
1132        if (!pi_state)
1133                return -EINVAL;
1134
1135        /*
1136         * If current does not own the pi_state then the futex is
1137         * inconsistent and user space fiddled with the futex value.
1138         */
1139        if (pi_state->owner != current)
1140                return -EINVAL;
1141
1142        raw_spin_lock(&pi_state->pi_mutex.wait_lock);
1143        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
1144
1145        /*
1146         * It is possible that the next waiter (the one that brought
1147         * this owner to the kernel) timed out and is no longer
1148         * waiting on the lock.
1149         */
1150        if (!new_owner)
1151                new_owner = this->task;
1152
1153        /*
1154         * We pass it to the next owner. The WAITERS bit is always
1155         * kept enabled while there is PI state around. We clean up the
1156         * owner died bit, because we are the owner.
1157         */
1158        newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1159
1160        if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1161                ret = -EFAULT;
1162        else if (curval != uval)
1163                ret = -EINVAL;
1164        if (ret) {
1165                raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1166                return ret;
1167        }
1168
1169        raw_spin_lock_irq(&pi_state->owner->pi_lock);
1170        WARN_ON(list_empty(&pi_state->list));
1171        list_del_init(&pi_state->list);
1172        raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1173
1174        raw_spin_lock_irq(&new_owner->pi_lock);
1175        WARN_ON(!list_empty(&pi_state->list));
1176        list_add(&pi_state->list, &new_owner->pi_state_list);
1177        pi_state->owner = new_owner;
1178        raw_spin_unlock_irq(&new_owner->pi_lock);
1179
1180        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1181        rt_mutex_unlock(&pi_state->pi_mutex);
1182
1183        return 0;
1184}
1185
1186static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
1187{
1188        u32 uninitialized_var(oldval);
1189
1190        /*
1191         * There is no waiter, so we unlock the futex. The owner died
1192         * bit does not have to be preserved here. We are the owner:
1193         */
1194        if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
1195                return -EFAULT;
1196        if (oldval != uval)
1197                return -EAGAIN;
1198
1199        return 0;
1200}
1201
1202/*
1203 * Express the locking dependencies for lockdep:
1204 */
1205static inline void
1206double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1207{
1208        if (hb1 <= hb2) {
1209                spin_lock(&hb1->lock);
1210                if (hb1 < hb2)
1211                        spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
1212        } else { /* hb1 > hb2 */
1213                spin_lock(&hb2->lock);
1214                spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
1215        }
1216}
1217
1218static inline void
1219double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1220{
1221        spin_unlock(&hb1->lock);
1222        if (hb1 != hb2)
1223                spin_unlock(&hb2->lock);
1224}
1225
1226/*
1227 * Wake up waiters matching bitset queued on this futex (uaddr).
1228 */
1229static int
1230futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1231{
1232        struct futex_hash_bucket *hb;
1233        struct futex_q *this, *next;
1234        union futex_key key = FUTEX_KEY_INIT;
1235        int ret;
1236        WAKE_Q(wake_q);
1237
1238        if (!bitset)
1239                return -EINVAL;
1240
1241        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
1242        if (unlikely(ret != 0))
1243                goto out;
1244
1245        hb = hash_futex(&key);
1246
1247        /* Make sure we really have tasks to wakeup */
1248        if (!hb_waiters_pending(hb))
1249                goto out_put_key;
1250
1251        spin_lock(&hb->lock);
1252
1253        plist_for_each_entry_safe(this, next, &hb->chain, list) {
1254                if (match_futex (&this->key, &key)) {
1255                        if (this->pi_state || this->rt_waiter) {
1256                                ret = -EINVAL;
1257                                break;
1258                        }
1259
1260                        /* Check if one of the bits is set in both bitsets */
1261                        if (!(this->bitset & bitset))
1262                                continue;
1263
1264                        mark_wake_futex(&wake_q, this);
1265                        if (++ret >= nr_wake)
1266                                break;
1267                }
1268        }
1269
1270        spin_unlock(&hb->lock);
1271        wake_up_q(&wake_q);
1272out_put_key:
1273        put_futex_key(&key);
1274out:
1275        return ret;
1276}
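
/*
 * Editor's note (illustrative sketch, not part of the kernel source): the
 * bitset check above ("one of the bits is set in both bitsets") is what makes
 * FUTEX_WAIT_BITSET/FUTEX_WAKE_BITSET selective. Plain FUTEX_WAIT/FUTEX_WAKE
 * use FUTEX_BITSET_MATCH_ANY (all bits set), so every waiter matches:
 *
 *      // waiter: only interested in wakeups tagged with bit 0
 *      syscall(SYS_futex, f, FUTEX_WAIT_BITSET, val, NULL, NULL, 0x1);
 *
 *      // waker: wake up to INT_MAX waiters whose bitset intersects bit 0
 *      syscall(SYS_futex, f, FUTEX_WAKE_BITSET, INT_MAX, NULL, NULL, 0x1);
 */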
1277
1278/*
1279 * Operate on the futex word at uaddr2, wake waiters on uaddr1 and,
1280 * depending on the result of the operation, also wake waiters on uaddr2:
1281 */
1282static int
1283futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1284              int nr_wake, int nr_wake2, int op)
1285{
1286        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1287        struct futex_hash_bucket *hb1, *hb2;
1288        struct futex_q *this, *next;
1289        int ret, op_ret;
1290        WAKE_Q(wake_q);
1291
1292retry:
1293        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1294        if (unlikely(ret != 0))
1295                goto out;
1296        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
1297        if (unlikely(ret != 0))
1298                goto out_put_key1;
1299
1300        hb1 = hash_futex(&key1);
1301        hb2 = hash_futex(&key2);
1302
1303retry_private:
1304        double_lock_hb(hb1, hb2);
1305        op_ret = futex_atomic_op_inuser(op, uaddr2);
1306        if (unlikely(op_ret < 0)) {
1307
1308                double_unlock_hb(hb1, hb2);
1309
1310#ifndef CONFIG_MMU
1311                /*
1312                 * we don't get EFAULT from MMU faults if we don't have an MMU,
1313                 * but we might get them from range checking
1314                 */
1315                ret = op_ret;
1316                goto out_put_keys;
1317#endif
1318
1319                if (unlikely(op_ret != -EFAULT)) {
1320                        ret = op_ret;
1321                        goto out_put_keys;
1322                }
1323
1324                ret = fault_in_user_writeable(uaddr2);
1325                if (ret)
1326                        goto out_put_keys;
1327
1328                if (!(flags & FLAGS_SHARED))
1329                        goto retry_private;
1330
1331                put_futex_key(&key2);
1332                put_futex_key(&key1);
1333                goto retry;
1334        }
1335
1336        plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1337                if (match_futex (&this->key, &key1)) {
1338                        if (this->pi_state || this->rt_waiter) {
1339                                ret = -EINVAL;
1340                                goto out_unlock;
1341                        }
1342                        mark_wake_futex(&wake_q, this);
1343                        if (++ret >= nr_wake)
1344                                break;
1345                }
1346        }
1347
1348        if (op_ret > 0) {
1349                op_ret = 0;
1350                plist_for_each_entry_safe(this, next, &hb2->chain, list) {
1351                        if (match_futex (&this->key, &key2)) {
1352                                if (this->pi_state || this->rt_waiter) {
1353                                        ret = -EINVAL;
1354                                        goto out_unlock;
1355                                }
1356                                mark_wake_futex(&wake_q, this);
1357                                if (++op_ret >= nr_wake2)
1358                                        break;
1359                        }
1360                }
1361                ret += op_ret;
1362        }
1363
1364out_unlock:
1365        double_unlock_hb(hb1, hb2);
1366        wake_up_q(&wake_q);
1367out_put_keys:
1368        put_futex_key(&key2);
1369out_put_key1:
1370        put_futex_key(&key1);
1371out:
1372        return ret;
1373}
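
/*
 * Editor's note (illustrative sketch, not part of the kernel source): a
 * typical FUTEX_WAKE_OP call as issued from user space. The op word encodes
 * "*uaddr2 = 1" plus the condition "old *uaddr2 == 0" that is checked above
 * via op_ret; nr_wake2 travels in the timeout slot. f1 and f2 are
 * hypothetical futex words:
 *
 *      // wake up to 1 waiter on f1; set *f2 = 1 and, if its old value
 *      // was 0, also wake up to 1 waiter on f2
 *      syscall(SYS_futex, f1, FUTEX_WAKE_OP, 1, (void *)1, f2,
 *              FUTEX_OP(FUTEX_OP_SET, 1, FUTEX_OP_CMP_EQ, 0));
 */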
1374
1375/**
1376 * requeue_futex() - Requeue a futex_q from one hb to another
1377 * @q:          the futex_q to requeue
1378 * @hb1:        the source hash_bucket
1379 * @hb2:        the target hash_bucket
1380 * @key2:       the new key for the requeued futex_q
1381 */
1382static inline
1383void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1384                   struct futex_hash_bucket *hb2, union futex_key *key2)
1385{
1386
1387        /*
1388         * If key1 and key2 hash to the same bucket, no need to
1389         * requeue.
1390         */
1391        if (likely(&hb1->chain != &hb2->chain)) {
1392                plist_del(&q->list, &hb1->chain);
1393                hb_waiters_dec(hb1);
1394                plist_add(&q->list, &hb2->chain);
1395                hb_waiters_inc(hb2);
1396                q->lock_ptr = &hb2->lock;
1397        }
1398        get_futex_key_refs(key2);
1399        q->key = *key2;
1400}
1401
1402/**
1403 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1404 * @q:          the futex_q
1405 * @key:        the key of the requeue target futex
1406 * @hb:         the hash_bucket of the requeue target futex
1407 *
1408 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1409 * target futex if it is uncontended or via a lock steal.  Set the futex_q key
1410 * to the requeue target futex so the waiter can detect the wakeup on the right
1411 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1412 * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
1413 * to protect access to the pi_state to fixup the owner later.  Must be called
1414 * with both q->lock_ptr and hb->lock held.
1415 */
1416static inline
1417void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1418                           struct futex_hash_bucket *hb)
1419{
1420        get_futex_key_refs(key);
1421        q->key = *key;
1422
1423        __unqueue_futex(q);
1424
1425        WARN_ON(!q->rt_waiter);
1426        q->rt_waiter = NULL;
1427
1428        q->lock_ptr = &hb->lock;
1429
1430        wake_up_state(q->task, TASK_NORMAL);
1431}
1432
1433/**
1434 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1435 * @pifutex:            the user address of the to futex
1436 * @hb1:                the from futex hash bucket, must be locked by the caller
1437 * @hb2:                the to futex hash bucket, must be locked by the caller
1438 * @key1:               the from futex key
1439 * @key2:               the to futex key
1440 * @ps:                 address to store the pi_state pointer
1441 * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
1442 *
1443 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1444 * Wake the top waiter if we succeed.  If the caller specified set_waiters,
1445 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1446 * hb1 and hb2 must be held by the caller.
1447 *
1448 * Return:
1449 *  0 - failed to acquire the lock atomically;
1450 * >0 - acquired the lock, return value is vpid of the top_waiter
1451 * <0 - error
1452 */
1453static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1454                                 struct futex_hash_bucket *hb1,
1455                                 struct futex_hash_bucket *hb2,
1456                                 union futex_key *key1, union futex_key *key2,
1457                                 struct futex_pi_state **ps, int set_waiters)
1458{
1459        struct futex_q *top_waiter = NULL;
1460        u32 curval;
1461        int ret, vpid;
1462
1463        if (get_futex_value_locked(&curval, pifutex))
1464                return -EFAULT;
1465
1466        /*
1467         * Find the top_waiter and determine if there are additional waiters.
1468         * If the caller intends to requeue more than 1 waiter to pifutex,
1469         * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1470         * as we have means to handle the possible fault.  If not, don't set
1471         * the bit unnecessarily as it will force the subsequent unlock to enter
1472         * the kernel.
1473         */
1474        top_waiter = futex_top_waiter(hb1, key1);
1475
1476        /* There are no waiters, nothing for us to do. */
1477        if (!top_waiter)
1478                return 0;
1479
1480        /* Ensure we requeue to the expected futex. */
1481        if (!match_futex(top_waiter->requeue_pi_key, key2))
1482                return -EINVAL;
1483
1484        /*
1485         * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
1486         * the contended case or if set_waiters is 1.  The pi_state is returned
1487         * in ps in contended cases.
1488         */
1489        vpid = task_pid_vnr(top_waiter->task);
1490        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1491                                   set_waiters);
1492        if (ret == 1) {
1493                requeue_pi_wake_futex(top_waiter, key2, hb2);
1494                return vpid;
1495        }
1496        return ret;
1497}
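
/*
 * Editor's note (illustrative sketch, not part of the kernel source):
 * futex_requeue() below backs FUTEX_CMP_REQUEUE (and FUTEX_CMP_REQUEUE_PI).
 * A condition-variable broadcast uses it to wake one waiter and move the
 * rest onto the mutex word instead of waking them all at once; 'cond_seq',
 * 'mutex_word' and 'val' are hypothetical names:
 *
 *      // wake 1 waiter on cond_seq, requeue up to INT_MAX others onto
 *      // mutex_word, but only if *cond_seq still equals val (else -EAGAIN)
 *      syscall(SYS_futex, cond_seq, FUTEX_CMP_REQUEUE, 1, (void *)INT_MAX,
 *              mutex_word, val);
 */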
1498
1499/**
1500 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1501 * @uaddr1:     source futex user address
1502 * @flags:      futex flags (FLAGS_SHARED, etc.)
1503 * @uaddr2:     target futex user address
1504 * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
1505 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1506 * @cmpval:     @uaddr1 expected value (or %NULL)
1507 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1508 *              pi futex (pi to pi requeue is not supported)
1509 *
1510 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1511 * uaddr2 atomically on behalf of the top waiter.
1512 *
1513 * Return:
1514 * >=0 - on success, the number of tasks requeued or woken;
1515 *  <0 - on error
1516 */
1517static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1518                         u32 __user *uaddr2, int nr_wake, int nr_requeue,
1519                         u32 *cmpval, int requeue_pi)
1520{
1521        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1522        int drop_count = 0, task_count = 0, ret;
1523        struct futex_pi_state *pi_state = NULL;
1524        struct futex_hash_bucket *hb1, *hb2;
1525        struct futex_q *this, *next;
1526        WAKE_Q(wake_q);
1527
1528        if (nr_wake < 0 || nr_requeue < 0)
1529                return -EINVAL;
1530
1531        if (requeue_pi) {
1532                /*
1533                 * Requeue PI only works on two distinct uaddrs. This
1534                 * check is only valid for private futexes. See below.
1535                 */
1536                if (uaddr1 == uaddr2)
1537                        return -EINVAL;
1538
1539                /*
1540                 * requeue_pi requires a pi_state, try to allocate it now
1541                 * without any locks in case it fails.
1542                 */
1543                if (refill_pi_state_cache())
1544                        return -ENOMEM;
1545                /*
1546                 * requeue_pi must wake as many tasks as it can, up to nr_wake
1547                 * + nr_requeue, since it acquires the rt_mutex prior to
1548                 * returning to userspace, so as to not leave the rt_mutex with
1549                 * waiters and no owner.  However, second and third wake-ups
1550                 * cannot be predicted as they involve race conditions with the
1551                 * first wake and a fault while looking up the pi_state.  Both
1552                 * pthread_cond_signal() and pthread_cond_broadcast() should
1553                 * use nr_wake=1.
1554                 */
1555                if (nr_wake != 1)
1556                        return -EINVAL;
1557        }
1558
1559retry:
1560        if (pi_state != NULL) {
1561                /*
1562                 * We will have to lookup the pi_state again, so free this one
1563                 * to keep the accounting correct.
1564                 */
1565                free_pi_state(pi_state);
1566                pi_state = NULL;
1567        }
1568
1569        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1570        if (unlikely(ret != 0))
1571                goto out;
1572        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1573                            requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1574        if (unlikely(ret != 0))
1575                goto out_put_key1;
1576
1577        /*
1578         * The check above which compares uaddrs is not sufficient for
1579         * shared futexes. We need to compare the keys:
1580         */
1581        if (requeue_pi && match_futex(&key1, &key2)) {
1582                ret = -EINVAL;
1583                goto out_put_keys;
1584        }
1585
1586        hb1 = hash_futex(&key1);
1587        hb2 = hash_futex(&key2);
1588
1589retry_private:
1590        hb_waiters_inc(hb2);
1591        double_lock_hb(hb1, hb2);
1592
1593        if (likely(cmpval != NULL)) {
1594                u32 curval;
1595
1596                ret = get_futex_value_locked(&curval, uaddr1);
1597
1598                if (unlikely(ret)) {
1599                        double_unlock_hb(hb1, hb2);
1600                        hb_waiters_dec(hb2);
1601
1602                        ret = get_user(curval, uaddr1);
1603                        if (ret)
1604                                goto out_put_keys;
1605
1606                        if (!(flags & FLAGS_SHARED))
1607                                goto retry_private;
1608
1609                        put_futex_key(&key2);
1610                        put_futex_key(&key1);
1611                        goto retry;
1612                }
1613                if (curval != *cmpval) {
1614                        ret = -EAGAIN;
1615                        goto out_unlock;
1616                }
1617        }
1618
1619        if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1620                /*
1621                 * Attempt to acquire uaddr2 and wake the top waiter. If we
1622                 * intend to requeue waiters, force setting the FUTEX_WAITERS
1623                 * bit.  We force this here where we are able to easily handle
1624         * faults rather than in the requeue loop below.
1625                 */
1626                ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1627                                                 &key2, &pi_state, nr_requeue);
1628
1629                /*
1630                 * At this point the top_waiter has either taken uaddr2 or is
1631                 * waiting on it.  If the former, then the pi_state will not
1632                 * exist yet, look it up one more time to ensure we have a
1633                 * reference to it. If the lock was taken, ret contains the
1634                 * vpid of the top waiter task.
1635                 */
1636                if (ret > 0) {
1637                        WARN_ON(pi_state);
1638                        drop_count++;
1639                        task_count++;
1640                        /*
1641                         * If we acquired the lock, then the user
1642                         * space value of uaddr2 should be vpid. It
1643                         * cannot be changed by the top waiter as it
1644                         * is blocked on hb2 lock if it tries to do
1645                         * so. If something fiddled with it behind our
1646                         * back the pi state lookup might unearth
1647                         * it. So we use the known value rather than
1648                         * rereading and handing potential crap to
1649                         * lookup_pi_state.
1650                         */
1651                        ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL);
1652                }
1653
1654                switch (ret) {
1655                case 0:
1656                        break;
1657                case -EFAULT:
1658                        double_unlock_hb(hb1, hb2);
1659                        hb_waiters_dec(hb2);
1660                        put_futex_key(&key2);
1661                        put_futex_key(&key1);
1662                        ret = fault_in_user_writeable(uaddr2);
1663                        if (!ret)
1664                                goto retry;
1665                        goto out;
1666                case -EAGAIN:
1667                        /* The owner was exiting, try again. */
1668                        double_unlock_hb(hb1, hb2);
1669                        hb_waiters_dec(hb2);
1670                        put_futex_key(&key2);
1671                        put_futex_key(&key1);
1672                        cond_resched();
1673                        goto retry;
1674                default:
1675                        goto out_unlock;
1676                }
1677        }
1678
1679        plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1680                if (task_count - nr_wake >= nr_requeue)
1681                        break;
1682
1683                if (!match_futex(&this->key, &key1))
1684                        continue;
1685
1686                /*
1687                 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1688                 * be paired with each other and no other futex ops.
1689                 *
1690                 * We should never be requeueing a futex_q with a pi_state,
1691                 * which is awaiting a futex_unlock_pi().
1692                 */
1693                if ((requeue_pi && !this->rt_waiter) ||
1694                    (!requeue_pi && this->rt_waiter) ||
1695                    this->pi_state) {
1696                        ret = -EINVAL;
1697                        break;
1698                }
1699
1700                /*
1701                 * Wake nr_wake waiters.  For requeue_pi, if we acquired the
1702                 * lock, we already woke the top_waiter.  If not, it will be
1703                 * woken by futex_unlock_pi().
1704                 */
1705                if (++task_count <= nr_wake && !requeue_pi) {
1706                        mark_wake_futex(&wake_q, this);
1707                        continue;
1708                }
1709
1710                /* Ensure we requeue to the expected futex for requeue_pi. */
1711                if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1712                        ret = -EINVAL;
1713                        break;
1714                }
1715
1716                /*
1717                 * Requeue nr_requeue waiters and possibly one more in the case
1718                 * of requeue_pi if we couldn't acquire the lock atomically.
1719                 */
1720                if (requeue_pi) {
1721                        /* Prepare the waiter to take the rt_mutex. */
1722                        atomic_inc(&pi_state->refcount);
1723                        this->pi_state = pi_state;
1724                        ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1725                                                        this->rt_waiter,
1726                                                        this->task, 1);
1727                        if (ret == 1) {
1728                                /* We got the lock. */
1729                                requeue_pi_wake_futex(this, &key2, hb2);
1730                                drop_count++;
1731                                continue;
1732                        } else if (ret) {
1733                                /* -EDEADLK */
1734                                this->pi_state = NULL;
1735                                free_pi_state(pi_state);
1736                                goto out_unlock;
1737                        }
1738                }
1739                requeue_futex(this, hb1, hb2, &key2);
1740                drop_count++;
1741        }
1742
1743out_unlock:
1744        double_unlock_hb(hb1, hb2);
1745        wake_up_q(&wake_q);
1746        hb_waiters_dec(hb2);
1747
1748        /*
1749         * drop_futex_key_refs() must be called outside the spinlocks. During
1750         * the requeue we moved futex_q's from the hash bucket at key1 to the
1751         * one at key2 and updated their key pointer.  We no longer need to
1752         * hold the references to key1.
1753         */
1754        while (--drop_count >= 0)
1755                drop_futex_key_refs(&key1);
1756
1757out_put_keys:
1758        put_futex_key(&key2);
1759out_put_key1:
1760        put_futex_key(&key1);
1761out:
1762        if (pi_state != NULL)
1763                free_pi_state(pi_state);
1764        return ret ? ret : task_count;
1765}
1766
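/*
 * Illustration only, not kernel code: a rough user-space sketch of how
 * the requeue path above is reached via FUTEX_CMP_REQUEUE. A
 * broadcast-style waker wakes one waiter blocked on cond_seq and moves
 * the remaining ones onto a second futex word, mutex_word (typically
 * the mutex protecting the condition), instead of waking them all,
 * avoiding a thundering herd. The names are hypothetical; error
 * handling and the retry on -EAGAIN are omitted.
 *
 *	#include <limits.h>
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/futex.h>
 *
 *	static unsigned int cond_seq;
 *	static unsigned int mutex_word;
 *
 *	static long cond_broadcast_sketch(void)
 *	{
 *		unsigned int expected = cond_seq;
 *
 *		return syscall(SYS_futex, &cond_seq,
 *			       FUTEX_CMP_REQUEUE_PRIVATE, 1,
 *			       (void *)(long)INT_MAX, &mutex_word, expected);
 *	}
 */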
1767/* The key must be already stored in q->key. */
1768static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1769        __acquires(&hb->lock)
1770{
1771        struct futex_hash_bucket *hb;
1772
1773        hb = hash_futex(&q->key);
1774
1775        /*
1776         * Increment the counter before taking the lock so that
1777         * a potential waker won't miss a task that is about to sleep
1778         * and is still waiting for the spinlock. This is safe as all queue_lock()
1779         * users end up calling queue_me(). Similarly, for housekeeping,
1780         * decrement the counter at queue_unlock() when some error has
1781         * occurred and we don't end up adding the task to the list.
1782         */
1783        hb_waiters_inc(hb);
1784
1785        q->lock_ptr = &hb->lock;
1786
1787        spin_lock(&hb->lock); /* implies MB (A) */
1788        return hb;
1789}
1790
1791static inline void
1792queue_unlock(struct futex_hash_bucket *hb)
1793        __releases(&hb->lock)
1794{
1795        spin_unlock(&hb->lock);
1796        hb_waiters_dec(hb);
1797}
1798
1799/**
1800 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1801 * @q:  The futex_q to enqueue
1802 * @hb: The destination hash bucket
1803 *
1804 * The hb->lock must be held by the caller, and is released here. A call to
1805 * queue_me() is typically paired with exactly one call to unqueue_me().  The
1806 * exceptions involve the PI related operations, which may use unqueue_me_pi()
1807 * or nothing if the unqueue is done as part of the wake process and the unqueue
1808 * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for
1809 * an example).
1810 */
1811static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1812        __releases(&hb->lock)
1813{
1814        int prio;
1815
1816        /*
1817         * The priority used to register this element is
1818         * - either the real thread-priority for the real-time threads
1819         * (i.e. threads with a priority lower than MAX_RT_PRIO)
1820         * - or MAX_RT_PRIO for non-RT threads.
1821         * Thus, all RT-threads are woken first in priority order, and
1822         * the others are woken last, in FIFO order.
1823         */
1824        prio = min(current->normal_prio, MAX_RT_PRIO);
1825
1826        plist_node_init(&q->list, prio);
1827        plist_add(&q->list, &hb->chain);
1828        q->task = current;
1829        spin_unlock(&hb->lock);
1830}
1831
1832/**
1833 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1834 * @q:  The futex_q to unqueue
1835 *
1836 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1837 * be paired with exactly one earlier call to queue_me().
1838 *
1839 * Return:
1840 *   1 - if the futex_q was still queued (and we unqueued it);
1841 *   0 - if the futex_q was already removed by the waking thread
1842 */
1843static int unqueue_me(struct futex_q *q)
1844{
1845        spinlock_t *lock_ptr;
1846        int ret = 0;
1847
1848        /* In the common case we don't take the spinlock, which is nice. */
1849retry:
1850        lock_ptr = q->lock_ptr;
1851        barrier();
1852        if (lock_ptr != NULL) {
1853                spin_lock(lock_ptr);
1854                /*
1855                 * q->lock_ptr can change between reading it and
1856                 * spin_lock(), causing us to take the wrong lock.  This
1857                 * corrects the race condition.
1858                 *
1859                 * Reasoning goes like this: if we have the wrong lock,
1860                 * q->lock_ptr must have changed (maybe several times)
1861                 * between reading it and the spin_lock().  It can
1862                 * change again after the spin_lock() but only if it was
1863                 * already changed before the spin_lock().  It cannot,
1864                 * however, change back to the original value.  Therefore
1865                 * we can detect whether we acquired the correct lock.
1866                 */
1867                if (unlikely(lock_ptr != q->lock_ptr)) {
1868                        spin_unlock(lock_ptr);
1869                        goto retry;
1870                }
1871                __unqueue_futex(q);
1872
1873                BUG_ON(q->pi_state);
1874
1875                spin_unlock(lock_ptr);
1876                ret = 1;
1877        }
1878
1879        drop_futex_key_refs(&q->key);
1880        return ret;
1881}
1882
1883/*
1884 * PI futexes cannot be requeued and must remove themselves from the
1885 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1886 * and dropped here.
1887 */
1888static void unqueue_me_pi(struct futex_q *q)
1889        __releases(q->lock_ptr)
1890{
1891        __unqueue_futex(q);
1892
1893        BUG_ON(!q->pi_state);
1894        free_pi_state(q->pi_state);
1895        q->pi_state = NULL;
1896
1897        spin_unlock(q->lock_ptr);
1898}
1899
1900/*
1901 * Fixup the pi_state owner with the new owner.
1902 *
1903 * Must be called with hash bucket lock held and mm->sem held for non
1904 * private futexes.
1905 */
1906static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1907                                struct task_struct *newowner)
1908{
1909        u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1910        struct futex_pi_state *pi_state = q->pi_state;
1911        struct task_struct *oldowner = pi_state->owner;
1912        u32 uval, uninitialized_var(curval), newval;
1913        int ret;
1914
1915        /* Owner died? */
1916        if (!pi_state->owner)
1917                newtid |= FUTEX_OWNER_DIED;
1918
1919        /*
1920         * We are here either because we stole the rtmutex from the
1921         * previous highest priority waiter or we are the highest priority
1922         * waiter but failed to get the rtmutex the first time.
1923         * We have to replace the newowner TID in the user space variable.
1924         * This must be atomic as we have to preserve the owner died bit here.
1925         *
1926         * Note: We write the user space value _before_ changing the pi_state
1927         * because we can fault here. Imagine swapped out pages or a fork
1928         * that marked all the anonymous memory readonly for cow.
1929         *
1930         * Modifying pi_state _before_ the user space value would
1931         * leave the pi_state in an inconsistent state when we fault
1932         * here, because we need to drop the hash bucket lock to
1933         * handle the fault. This might be observed in the PID check
1934         * in lookup_pi_state.
1935         */
1936retry:
1937        if (get_futex_value_locked(&uval, uaddr))
1938                goto handle_fault;
1939
1940        while (1) {
1941                newval = (uval & FUTEX_OWNER_DIED) | newtid;
1942
1943                if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1944                        goto handle_fault;
1945                if (curval == uval)
1946                        break;
1947                uval = curval;
1948        }
1949
1950        /*
1951         * We fixed up user space. Now we need to fix the pi_state
1952         * itself.
1953         */
1954        if (pi_state->owner != NULL) {
1955                raw_spin_lock_irq(&pi_state->owner->pi_lock);
1956                WARN_ON(list_empty(&pi_state->list));
1957                list_del_init(&pi_state->list);
1958                raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1959        }
1960
1961        pi_state->owner = newowner;
1962
1963        raw_spin_lock_irq(&newowner->pi_lock);
1964        WARN_ON(!list_empty(&pi_state->list));
1965        list_add(&pi_state->list, &newowner->pi_state_list);
1966        raw_spin_unlock_irq(&newowner->pi_lock);
1967        return 0;
1968
1969        /*
1970         * To handle the page fault we need to drop the hash bucket
1971         * lock here. That gives the other task (either the highest priority
1972         * waiter itself or the task which stole the rtmutex) the
1973         * chance to try the fixup of the pi_state. So once we are
1974         * back from handling the fault we need to check the pi_state
1975         * after reacquiring the hash bucket lock and before trying to
1976         * do another fixup. When the fixup has been done already we
1977         * simply return.
1978         */
1979handle_fault:
1980        spin_unlock(q->lock_ptr);
1981
1982        ret = fault_in_user_writeable(uaddr);
1983
1984        spin_lock(q->lock_ptr);
1985
1986        /*
1987         * Check if someone else fixed it for us:
1988         */
1989        if (pi_state->owner != oldowner)
1990                return 0;
1991
1992        if (ret)
1993                return ret;
1994
1995        goto retry;
1996}
1997
1998static long futex_wait_restart(struct restart_block *restart);
1999
2000/**
2001 * fixup_owner() - Post lock pi_state and corner case management
2002 * @uaddr:      user address of the futex
2003 * @q:          futex_q (contains pi_state and access to the rt_mutex)
2004 * @locked:     if the attempt to take the rt_mutex succeeded (1) or not (0)
2005 *
2006 * After attempting to lock an rt_mutex, this function is called to cleanup
2007 * the pi_state owner as well as handle race conditions that may allow us to
2008 * acquire the lock. Must be called with the hb lock held.
2009 *
2010 * Return:
2011 *  1 - success, lock taken;
2012 *  0 - success, lock not taken;
2013 * <0 - on error (-EFAULT)
2014 */
2015static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2016{
2017        struct task_struct *owner;
2018        int ret = 0;
2019
2020        if (locked) {
2021                /*
2022                 * Got the lock. We might not be the anticipated owner if we
2023                 * did a lock-steal - fix up the PI-state in that case:
2024                 */
2025                if (q->pi_state->owner != current)
2026                        ret = fixup_pi_state_owner(uaddr, q, current);
2027                goto out;
2028        }
2029
2030        /*
2031         * Catch the rare case, where the lock was released when we were on the
2032         * way back before we locked the hash bucket.
2033         */
2034        if (q->pi_state->owner == current) {
2035                /*
2036                 * Try to get the rt_mutex now. This might fail as some other
2037                 * task acquired the rt_mutex after we removed ourselves from the
2038                 * rt_mutex waiters list.
2039                 */
2040                if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
2041                        locked = 1;
2042                        goto out;
2043                }
2044
2045                /*
2046                 * pi_state is incorrect, some other task did a lock steal and
2047                 * we returned due to timeout or signal without taking the
2048                 * rt_mutex. Too late.
2049                 */
2050                raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
2051                owner = rt_mutex_owner(&q->pi_state->pi_mutex);
2052                if (!owner)
2053                        owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
2054                raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
2055                ret = fixup_pi_state_owner(uaddr, q, owner);
2056                goto out;
2057        }
2058
2059        /*
2060         * Paranoia check. If we did not take the lock, then we should not be
2061         * the owner of the rt_mutex.
2062         */
2063        if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
2064                printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
2065                                "pi-state %p\n", ret,
2066                                q->pi_state->pi_mutex.owner,
2067                                q->pi_state->owner);
2068
2069out:
2070        return ret ? ret : locked;
2071}
2072
2073/**
2074 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
2075 * @hb:         the futex hash bucket, must be locked by the caller
2076 * @q:          the futex_q to queue up on
2077 * @timeout:    the prepared hrtimer_sleeper, or null for no timeout
2078 */
2079static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
2080                                struct hrtimer_sleeper *timeout)
2081{
2082        /*
2083         * The task state is guaranteed to be set before another task can
2084         * wake it. set_current_state() is implemented using set_mb() and
2085         * queue_me() calls spin_unlock() upon completion, both serializing
2086         * access to the hash list and forcing another memory barrier.
2087         */
2088        set_current_state(TASK_INTERRUPTIBLE);
2089        queue_me(q, hb);
2090
2091        /* Arm the timer */
2092        if (timeout)
2093                hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
2094
2095        /*
2096         * If we have been removed from the hash list, then another task
2097         * has tried to wake us, and we can skip the call to schedule().
2098         */
2099        if (likely(!plist_node_empty(&q->list))) {
2100                /*
2101                 * If the timer has already expired, current will already be
2102                 * flagged for rescheduling. Only call schedule if there
2103                 * is no timeout, or if it has yet to expire.
2104                 */
2105                if (!timeout || timeout->task)
2106                        freezable_schedule();
2107        }
2108        __set_current_state(TASK_RUNNING);
2109}
2110
2111/**
2112 * futex_wait_setup() - Prepare to wait on a futex
2113 * @uaddr:      the futex userspace address
2114 * @val:        the expected value
2115 * @flags:      futex flags (FLAGS_SHARED, etc.)
2116 * @q:          the associated futex_q
2117 * @hb:         storage for hash_bucket pointer to be returned to caller
2118 *
2119 * Setup the futex_q and locate the hash_bucket.  Get the futex value and
2120 * compare it with the expected value.  Handle atomic faults internally.
2121 * Return with the hb lock held and a q.key reference on success, and unlocked
2122 * with no q.key reference on failure.
2123 *
2124 * Return:
2125 *  0 - uaddr contains val and hb has been locked;
2126 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
2127 */
2128static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
2129                           struct futex_q *q, struct futex_hash_bucket **hb)
2130{
2131        u32 uval;
2132        int ret;
2133
2134        /*
2135         * Access the page AFTER the hash-bucket is locked.
2136         * Order is important:
2137         *
2138         *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
2139         *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
2140         *
2141         * The basic logical guarantee of a futex is that it blocks ONLY
2142         * if cond(var) is known to be true at the time of blocking, for
2143         * any cond.  If we locked the hash-bucket after testing *uaddr, that
2144         * would open a race condition where we could block indefinitely with
2145         * cond(var) false, which would violate the guarantee.
2146         *
2147         * On the other hand, we insert q and release the hash-bucket only
2148         * after testing *uaddr.  This guarantees that futex_wait() will NOT
2149         * absorb a wakeup if *uaddr does not match the desired value
2150         * while the syscall executes.
2151         */
2152retry:
2153        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
2154        if (unlikely(ret != 0))
2155                return ret;
2156
2157retry_private:
2158        *hb = queue_lock(q);
2159
2160        ret = get_futex_value_locked(&uval, uaddr);
2161
2162        if (ret) {
2163                queue_unlock(*hb);
2164
2165                ret = get_user(uval, uaddr);
2166                if (ret)
2167                        goto out;
2168
2169                if (!(flags & FLAGS_SHARED))
2170                        goto retry_private;
2171
2172                put_futex_key(&q->key);
2173                goto retry;
2174        }
2175
2176        if (uval != val) {
2177                queue_unlock(*hb);
2178                ret = -EWOULDBLOCK;
2179        }
2180
2181out:
2182        if (ret)
2183                put_futex_key(&q->key);
2184        return ret;
2185}
2186
2187static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2188                      ktime_t *abs_time, u32 bitset)
2189{
2190        struct hrtimer_sleeper timeout, *to = NULL;
2191        struct restart_block *restart;
2192        struct futex_hash_bucket *hb;
2193        struct futex_q q = futex_q_init;
2194        int ret;
2195
2196        if (!bitset)
2197                return -EINVAL;
2198        q.bitset = bitset;
2199
2200        if (abs_time) {
2201                to = &timeout;
2202
2203                hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2204                                      CLOCK_REALTIME : CLOCK_MONOTONIC,
2205                                      HRTIMER_MODE_ABS);
2206                hrtimer_init_sleeper(to, current);
2207                hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2208                                             current->timer_slack_ns);
2209        }
2210
2211retry:
2212        /*
2213         * Prepare to wait on uaddr. On success, holds hb lock and increments
2214         * q.key refs.
2215         */
2216        ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2217        if (ret)
2218                goto out;
2219
2220        /* queue_me and wait for wakeup, timeout, or a signal. */
2221        futex_wait_queue_me(hb, &q, to);
2222
2223        /* If we were woken (and unqueued), we succeeded, whatever. */
2224        ret = 0;
2225        /* unqueue_me() drops q.key ref */
2226        if (!unqueue_me(&q))
2227                goto out;
2228        ret = -ETIMEDOUT;
2229        if (to && !to->task)
2230                goto out;
2231
2232        /*
2233         * We expect signal_pending(current), but we might be the
2234         * victim of a spurious wakeup as well.
2235         */
2236        if (!signal_pending(current))
2237                goto retry;
2238
2239        ret = -ERESTARTSYS;
2240        if (!abs_time)
2241                goto out;
2242
2243        restart = &current_thread_info()->restart_block;
2244        restart->fn = futex_wait_restart;
2245        restart->futex.uaddr = uaddr;
2246        restart->futex.val = val;
2247        restart->futex.time = abs_time->tv64;
2248        restart->futex.bitset = bitset;
2249        restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
2250
2251        ret = -ERESTART_RESTARTBLOCK;
2252
2253out:
2254        if (to) {
2255                hrtimer_cancel(&to->timer);
2256                destroy_hrtimer_on_stack(&to->timer);
2257        }
2258        return ret;
2259}
2260
2261
2262static long futex_wait_restart(struct restart_block *restart)
2263{
2264        u32 __user *uaddr = restart->futex.uaddr;
2265        ktime_t t, *tp = NULL;
2266
2267        if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
2268                t.tv64 = restart->futex.time;
2269                tp = &t;
2270        }
2271        restart->fn = do_no_restart_syscall;
2272
2273        return (long)futex_wait(uaddr, restart->futex.flags,
2274                                restart->futex.val, tp, restart->futex.bitset);
2275}
2276
2277
2278/*
2279 * Userspace tried a 0 -> TID atomic transition of the futex value
2280 * and failed. The kernel side here does the whole locking operation:
2281 * if there are waiters then it will block, it does PI, etc. (Due to
2282 * races the kernel might see a 0 value of the futex too.)
2283 */
2284static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
2285                         ktime_t *time, int trylock)
2286{
2287        struct hrtimer_sleeper timeout, *to = NULL;
2288        struct futex_hash_bucket *hb;
2289        struct futex_q q = futex_q_init;
2290        int res, ret;
2291
2292        if (refill_pi_state_cache())
2293                return -ENOMEM;
2294
2295        if (time) {
2296                to = &timeout;
2297                hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
2298                                      HRTIMER_MODE_ABS);
2299                hrtimer_init_sleeper(to, current);
2300                hrtimer_set_expires(&to->timer, *time);
2301        }
2302
2303retry:
2304        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
2305        if (unlikely(ret != 0))
2306                goto out;
2307
2308retry_private:
2309        hb = queue_lock(&q);
2310
2311        ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
2312        if (unlikely(ret)) {
2313                switch (ret) {
2314                case 1:
2315                        /* We got the lock. */
2316                        ret = 0;
2317                        goto out_unlock_put_key;
2318                case -EFAULT:
2319                        goto uaddr_faulted;
2320                case -EAGAIN:
2321                        /*
2322                         * Task is exiting and we just wait for the
2323                         * exit to complete.
2324                         */
2325                        queue_unlock(hb);
2326                        put_futex_key(&q.key);
2327                        cond_resched();
2328                        goto retry;
2329                default:
2330                        goto out_unlock_put_key;
2331                }
2332        }
2333
2334        /*
2335         * Only actually queue now that the atomic ops are done:
2336         */
2337        queue_me(&q, hb);
2338
2339        WARN_ON(!q.pi_state);
2340        /*
2341         * Block on the PI mutex:
2342         */
2343        if (!trylock)
2344                ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
2345        else {
2346                ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2347                /* Fixup the trylock return value: */
2348                ret = ret ? 0 : -EWOULDBLOCK;
2349        }
2350
2351        spin_lock(q.lock_ptr);
2352        /*
2353         * Fixup the pi_state owner and possibly acquire the lock if we
2354         * haven't already.
2355         */
2356        res = fixup_owner(uaddr, &q, !ret);
2357        /*
2358         * If fixup_owner() returned an error, propagate that.  If it acquired
2359         * the lock, clear our -ETIMEDOUT or -EINTR.
2360         */
2361        if (res)
2362                ret = (res < 0) ? res : 0;
2363
2364        /*
2365         * If fixup_owner() faulted and was unable to handle the fault, unlock
2366         * it and return the fault to userspace.
2367         */
2368        if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
2369                rt_mutex_unlock(&q.pi_state->pi_mutex);
2370
2371        /* Unqueue and drop the lock */
2372        unqueue_me_pi(&q);
2373
2374        goto out_put_key;
2375
2376out_unlock_put_key:
2377        queue_unlock(hb);
2378
2379out_put_key:
2380        put_futex_key(&q.key);
2381out:
2382        if (to)
2383                destroy_hrtimer_on_stack(&to->timer);
2384        return ret != -EINTR ? ret : -ERESTARTNOINTR;
2385
2386uaddr_faulted:
2387        queue_unlock(hb);
2388
2389        ret = fault_in_user_writeable(uaddr);
2390        if (ret)
2391                goto out_put_key;
2392
2393        if (!(flags & FLAGS_SHARED))
2394                goto retry_private;
2395
2396        put_futex_key(&q.key);
2397        goto retry;
2398}
2399
2400/*
2401 * Userspace attempted a TID -> 0 atomic transition, and failed.
2402 * This is the in-kernel slowpath: we look up the PI state (if any),
2403 * and do the rt-mutex unlock.
2404 */
2405static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2406{
2407        struct futex_hash_bucket *hb;
2408        struct futex_q *this, *next;
2409        union futex_key key = FUTEX_KEY_INIT;
2410        u32 uval, vpid = task_pid_vnr(current);
2411        int ret;
2412
2413retry:
2414        if (get_user(uval, uaddr))
2415                return -EFAULT;
2416        /*
2417         * We release only a lock we actually own:
2418         */
2419        if ((uval & FUTEX_TID_MASK) != vpid)
2420                return -EPERM;
2421
2422        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2423        if (unlikely(ret != 0))
2424                goto out;
2425
2426        hb = hash_futex(&key);
2427        spin_lock(&hb->lock);
2428
2429        /*
2430         * To avoid races, try to do the TID -> 0 atomic transition
2431         * again. If it succeeds then we can return without waking
2432         * anyone else up. We only try this if neither the waiters nor
2433         * the owner died bit are set.
2434         */
2435        if (!(uval & ~FUTEX_TID_MASK) &&
2436            cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2437                goto pi_faulted;
2438        /*
2439         * Rare case: we managed to release the lock atomically,
2440         * no need to wake anyone else up:
2441         */
2442        if (unlikely(uval == vpid))
2443                goto out_unlock;
2444
2445        /*
2446         * Ok, other tasks may need to be woken up - check waiters
2447         * and do the wakeup if necessary:
2448         */
2449        plist_for_each_entry_safe(this, next, &hb->chain, list) {
2450                if (!match_futex(&this->key, &key))
2451                        continue;
2452                ret = wake_futex_pi(uaddr, uval, this);
2453                /*
2454                 * The atomic access to the futex value
2455                 * generated a pagefault, so retry the
2456                 * user-access and the wakeup:
2457                 */
2458                if (ret == -EFAULT)
2459                        goto pi_faulted;
2460                goto out_unlock;
2461        }
2462        /*
2463         * No waiters - kernel unlocks the futex:
2464         */
2465        ret = unlock_futex_pi(uaddr, uval);
2466        if (ret == -EFAULT)
2467                goto pi_faulted;
2468
2469out_unlock:
2470        spin_unlock(&hb->lock);
2471        put_futex_key(&key);
2472
2473out:
2474        return ret;
2475
2476pi_faulted:
2477        spin_unlock(&hb->lock);
2478        put_futex_key(&key);
2479
2480        ret = fault_in_user_writeable(uaddr);
2481        if (!ret)
2482                goto retry;
2483
2484        return ret;
2485}
2486
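/*
 * Illustration only, not kernel code: a rough user-space sketch of the
 * PI protocol that funnels into futex_lock_pi() and futex_unlock_pi()
 * above. The fast path is a plain cmpxchg on the futex word (0 -> TID
 * to lock, TID -> 0 to unlock); the kernel is entered only when that
 * cmpxchg fails, i.e. on contention or when FUTEX_WAITERS is set. The
 * helper names are hypothetical and tid is assumed to be the caller's
 * thread id; <linux/futex.h>, <sys/syscall.h> and <unistd.h> are
 * assumed to be included.
 *
 *	static int pi_lock_sketch(unsigned int *futex_word, unsigned int tid)
 *	{
 *		unsigned int zero = 0;
 *
 *		if (__atomic_compare_exchange_n(futex_word, &zero, tid, 0,
 *				__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
 *			return 0;
 *		return syscall(SYS_futex, futex_word, FUTEX_LOCK_PI_PRIVATE,
 *			       0, NULL, NULL, 0);
 *	}
 *
 *	static int pi_unlock_sketch(unsigned int *futex_word, unsigned int tid)
 *	{
 *		unsigned int expected = tid;
 *
 *		if (__atomic_compare_exchange_n(futex_word, &expected, 0, 0,
 *				__ATOMIC_RELEASE, __ATOMIC_RELAXED))
 *			return 0;
 *		return syscall(SYS_futex, futex_word, FUTEX_UNLOCK_PI_PRIVATE,
 *			       0, NULL, NULL, 0);
 *	}
 */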
2487/**
2488 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2489 * @hb:         the hash_bucket futex_q was originally enqueued on
2490 * @q:          the futex_q woken while waiting to be requeued
2491 * @key2:       the futex_key of the requeue target futex
2492 * @timeout:    the timeout associated with the wait (NULL if none)
2493 *
2494 * Detect if the task was woken on the initial futex as opposed to the requeue
2495 * target futex.  If so, determine if it was a timeout or a signal that caused
2496 * the wakeup and return the appropriate error code to the caller.  Must be
2497 * called with the hb lock held.
2498 *
2499 * Return:
2500 *  0 - no early wakeup detected;
2501 * <0 - -ETIMEDOUT or -ERESTARTNOINTR
2502 */
2503static inline
2504int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2505                                   struct futex_q *q, union futex_key *key2,
2506                                   struct hrtimer_sleeper *timeout)
2507{
2508        int ret = 0;
2509
2510        /*
2511         * With the hb lock held, we avoid races while we process the wakeup.
2512         * We only need to hold hb (and not hb2) to ensure atomicity as the
2513         * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2514         * It can't be requeued from uaddr2 to something else since we don't
2515         * support a PI aware source futex for requeue.
2516         */
2517        if (!match_futex(&q->key, key2)) {
2518                WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2519                /*
2520                 * We were woken prior to requeue by a timeout or a signal.
2521                 * Unqueue the futex_q and determine which it was.
2522                 */
2523                plist_del(&q->list, &hb->chain);
2524                hb_waiters_dec(hb);
2525
2526                /* Handle spurious wakeups gracefully */
2527                ret = -EWOULDBLOCK;
2528                if (timeout && !timeout->task)
2529                        ret = -ETIMEDOUT;
2530                else if (signal_pending(current))
2531                        ret = -ERESTARTNOINTR;
2532        }
2533        return ret;
2534}
2535
2536/**
2537 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2538 * @uaddr:      the futex we initially wait on (non-pi)
2539 * @flags:      futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2540 *              the same type, no requeueing from private to shared, etc.
2541 * @val:        the expected value of uaddr
2542 * @abs_time:   absolute timeout
2543 * @bitset:     32 bit wakeup bitset set by userspace, defaults to all
2544 * @uaddr2:     the pi futex we will take prior to returning to user-space
2545 *
2546 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2547 * uaddr2 which must be PI aware and distinct from uaddr.  Normal wakeup will wake
2548 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
2549 * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
2550 * without one, the pi logic would not know which task to boost/deboost, if
2551 * there was a need to.
2552 *
2553 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2554 * via the following:
2555 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2556 * 2) wakeup on uaddr2 after a requeue
2557 * 3) signal
2558 * 4) timeout
2559 *
2560 * If 3, cleanup and return -ERESTARTNOINTR.
2561 *
2562 * If 2, we may then block on trying to take the rt_mutex and return via:
2563 * 5) successful lock
2564 * 6) signal
2565 * 7) timeout
2566 * 8) other lock acquisition failure
2567 *
2568 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2569 *
2570 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2571 *
2572 * Return:
2573 *  0 - On success;
2574 * <0 - On error
2575 */
2576static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2577                                 u32 val, ktime_t *abs_time, u32 bitset,
2578                                 u32 __user *uaddr2)
2579{
2580        struct hrtimer_sleeper timeout, *to = NULL;
2581        struct rt_mutex_waiter rt_waiter;
2582        struct rt_mutex *pi_mutex = NULL;
2583        struct futex_hash_bucket *hb;
2584        union futex_key key2 = FUTEX_KEY_INIT;
2585        struct futex_q q = futex_q_init;
2586        int res, ret;
2587
2588        if (uaddr == uaddr2)
2589                return -EINVAL;
2590
2591        if (!bitset)
2592                return -EINVAL;
2593
2594        if (abs_time) {
2595                to = &timeout;
2596                hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2597                                      CLOCK_REALTIME : CLOCK_MONOTONIC,
2598                                      HRTIMER_MODE_ABS);
2599                hrtimer_init_sleeper(to, current);
2600                hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2601                                             current->timer_slack_ns);
2602        }
2603
2604        /*
2605         * The waiter is allocated on our stack, manipulated by the requeue
2606         * code while we sleep on uaddr.
2607         */
2608        debug_rt_mutex_init_waiter(&rt_waiter);
2609        RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
2610        RB_CLEAR_NODE(&rt_waiter.tree_entry);
2611        rt_waiter.task = NULL;
2612
2613        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2614        if (unlikely(ret != 0))
2615                goto out;
2616
2617        q.bitset = bitset;
2618        q.rt_waiter = &rt_waiter;
2619        q.requeue_pi_key = &key2;
2620
2621        /*
2622         * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2623         * count.
2624         */
2625        ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2626        if (ret)
2627                goto out_key2;
2628
2629        /*
2630         * The check above which compares uaddrs is not sufficient for
2631         * shared futexes. We need to compare the keys:
2632         */
2633        if (match_futex(&q.key, &key2)) {
2634                ret = -EINVAL;
2635                goto out_put_keys;
2636        }
2637
2638        /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2639        futex_wait_queue_me(hb, &q, to);
2640
2641        spin_lock(&hb->lock);
2642        ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2643        spin_unlock(&hb->lock);
2644        if (ret)
2645                goto out_put_keys;
2646
2647        /*
2648         * In order for us to be here, we know our q.key == key2, and since
2649         * we took the hb->lock above, we also know that futex_requeue() has
2650         * completed and we no longer have to concern ourselves with a wakeup
2651         * race with the atomic proxy lock acquisition by the requeue code. The
2652         * futex_requeue dropped our key1 reference and incremented our key2
2653         * reference count.
2654         */
2655
2656        /* Check if the requeue code acquired the second futex for us. */
2657        if (!q.rt_waiter) {
2658                /*
2659                 * Got the lock. We might not be the anticipated owner if we
2660                 * did a lock-steal - fix up the PI-state in that case.
2661                 */
2662                if (q.pi_state && (q.pi_state->owner != current)) {
2663                        spin_lock(q.lock_ptr);
2664                        ret = fixup_pi_state_owner(uaddr2, &q, current);
2665                        spin_unlock(q.lock_ptr);
2666                }
2667        } else {
2668                /*
2669                 * We have been woken up by futex_unlock_pi(), a timeout, or a
2670                 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
2671                 * the pi_state.
2672                 */
2673                WARN_ON(!q.pi_state);
2674                pi_mutex = &q.pi_state->pi_mutex;
2675                ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2676                debug_rt_mutex_free_waiter(&rt_waiter);
2677
2678                spin_lock(q.lock_ptr);
2679                /*
2680                 * Fixup the pi_state owner and possibly acquire the lock if we
2681                 * haven't already.
2682                 */
2683                res = fixup_owner(uaddr2, &q, !ret);
2684                /*
2685                 * If fixup_owner() returned an error, propagate that.  If it
2686                 * acquired the lock, clear -ETIMEDOUT or -EINTR.
2687                 */
2688                if (res)
2689                        ret = (res < 0) ? res : 0;
2690
2691                /* Unqueue and drop the lock. */
2692                unqueue_me_pi(&q);
2693        }
2694
2695        /*
2696         * If fixup_pi_state_owner() faulted and was unable to handle the
2697         * fault, unlock the rt_mutex and return the fault to userspace.
2698         */
2699        if (ret == -EFAULT) {
2700                if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
2701                        rt_mutex_unlock(pi_mutex);
2702        } else if (ret == -EINTR) {
2703                /*
2704                 * We've already been requeued, but cannot restart by calling
2705                 * futex_lock_pi() directly. We could restart this syscall, but
2706                 * it would detect that the user space "val" changed and return
2707                 * -EWOULDBLOCK.  Save the overhead of the restart and return
2708                 * -EWOULDBLOCK directly.
2709                 */
2710                ret = -EWOULDBLOCK;
2711        }
2712
2713out_put_keys:
2714        put_futex_key(&q.key);
2715out_key2:
2716        put_futex_key(&key2);
2717
2718out:
2719        if (to) {
2720                hrtimer_cancel(&to->timer);
2721                destroy_hrtimer_on_stack(&to->timer);
2722        }
2723        return ret;
2724}
2725
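/*
 * Illustration only, not kernel code: a rough user-space sketch of the
 * pairing described by the kerneldoc above, roughly what a PI-aware
 * condition variable does. The waiter blocks on the non-PI word cond
 * and names the PI futex mutex as the requeue target; the signaler
 * uses FUTEX_CMP_REQUEUE_PI with nr_wake == 1 as required above. All
 * names are hypothetical, error handling is omitted, and the usual
 * futex headers are assumed to be included.
 *
 *	Waiter (the PI mutex has already been released beforehand):
 *		syscall(SYS_futex, &cond, FUTEX_WAIT_REQUEUE_PI_PRIVATE,
 *			seq, NULL, &mutex, 0);
 *
 *	Signaler (wake one waiter, requeue the rest onto the PI mutex):
 *		syscall(SYS_futex, &cond, FUTEX_CMP_REQUEUE_PI_PRIVATE,
 *			1, (void *)(long)INT_MAX, &mutex, seq);
 */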
2726/*
2727 * Support for robust futexes: the kernel cleans up held futexes at
2728 * thread exit time.
2729 *
2730 * Implementation: user-space maintains a per-thread list of locks it
2731 * is holding. Upon do_exit(), the kernel carefully walks this list,
2732 * and marks all locks that are owned by this thread with the
2733 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
2734 * always manipulated with the lock held, so the list is private and
2735 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
2736 * field, to allow the kernel to clean up if the thread dies after
2737 * acquiring the lock, but just before it could have added itself to
2738 * the list. There can only be one such pending lock.
2739 */
2740
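/*
 * Illustration only, not kernel code: a rough user-space sketch of the
 * data that exit_robust_list() below walks. A thread registers a single
 * robust_list_head (normally done by the C library); every lock it
 * holds is linked in through an embedded robust_list node, and
 * futex_offset tells the kernel where the 32-bit futex word lives
 * relative to that node. struct robust_mutex_sketch is a hypothetical
 * layout, not a glibc type; <linux/futex.h>, <stddef.h>,
 * <sys/syscall.h> and <unistd.h> are assumed to be included.
 *
 *	struct robust_mutex_sketch {
 *		struct robust_list list;
 *		unsigned int futex;
 *	};
 *
 *	static __thread struct robust_list_head robust_head;
 *
 *	static void register_robust_list_sketch(void)
 *	{
 *		robust_head.list.next = &robust_head.list;
 *		robust_head.futex_offset =
 *			offsetof(struct robust_mutex_sketch, futex) -
 *			offsetof(struct robust_mutex_sketch, list);
 *		robust_head.list_op_pending = NULL;
 *		syscall(SYS_set_robust_list, &robust_head,
 *			sizeof(robust_head));
 *	}
 */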
2741/**
2742 * sys_set_robust_list() - Set the robust-futex list head of a task
2743 * @head:       pointer to the list-head
2744 * @len:        length of the list-head, as userspace expects
2745 */
2746SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2747                size_t, len)
2748{
2749        if (!futex_cmpxchg_enabled)
2750                return -ENOSYS;
2751        /*
2752         * The kernel knows only one size for now:
2753         */
2754        if (unlikely(len != sizeof(*head)))
2755                return -EINVAL;
2756
2757        current->robust_list = head;
2758
2759        return 0;
2760}
2761
2762/**
2763 * sys_get_robust_list() - Get the robust-futex list head of a task
2764 * @pid:        pid of the process [zero for current task]
2765 * @head_ptr:   pointer to a list-head pointer, the kernel fills it in
2766 * @len_ptr:    pointer to a length field, the kernel fills in the header size
2767 */
2768SYSCALL_DEFINE3(get_robust_list, int, pid,
2769                struct robust_list_head __user * __user *, head_ptr,
2770                size_t __user *, len_ptr)
2771{
2772        struct robust_list_head __user *head;
2773        unsigned long ret;
2774        struct task_struct *p;
2775
2776        if (!futex_cmpxchg_enabled)
2777                return -ENOSYS;
2778
2779        rcu_read_lock();
2780
2781        ret = -ESRCH;
2782        if (!pid)
2783                p = current;
2784        else {
2785                p = find_task_by_vpid(pid);
2786                if (!p)
2787                        goto err_unlock;
2788        }
2789
2790        ret = -EPERM;
2791        if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
2792                goto err_unlock;
2793
2794        head = p->robust_list;
2795        rcu_read_unlock();
2796
2797        if (put_user(sizeof(*head), len_ptr))
2798                return -EFAULT;
2799        return put_user(head, head_ptr);
2800
2801err_unlock:
2802        rcu_read_unlock();
2803
2804        return ret;
2805}
2806
2807/*
2808 * Process a futex-list entry, check whether it's owned by the
2809 * dying task, and do notification if so:
2810 */
2811int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2812{
2813        u32 uval, uninitialized_var(nval), mval;
2814
2815retry:
2816        if (get_user(uval, uaddr))
2817                return -1;
2818
2819        if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
2820                /*
2821                 * Ok, this dying thread is truly holding a futex
2822                 * of interest. Set the OWNER_DIED bit atomically
2823                 * via cmpxchg, and if the value had FUTEX_WAITERS
2824                 * set, wake up a waiter (if any). (We have to do a
2825                 * futex_wake() even if OWNER_DIED is already set -
2826                 * to handle the rare but possible case of recursive
2827                 * thread-death.) The rest of the cleanup is done in
2828                 * userspace.
2829                 */
2830                mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2831                /*
2832                 * We are not holding a lock here, but we want to have
2833                 * the pagefault_disable/enable() protection because
2834                 * we want to handle the fault gracefully. If the
2835                 * access fails we try to fault in the futex with R/W
2836                 * verification via get_user_pages. get_user() above
2837                 * does not guarantee R/W access. If that fails we
2838                 * give up and leave the futex locked.
2839                 */
2840                if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2841                        if (fault_in_user_writeable(uaddr))
2842                                return -1;
2843                        goto retry;
2844                }
2845                if (nval != uval)
2846                        goto retry;
2847
2848                /*
2849                 * Wake robust non-PI futexes here. The wakeup of
2850                 * PI futexes happens in exit_pi_state():
2851                 */
2852                if (!pi && (uval & FUTEX_WAITERS))
2853                        futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
2854        }
2855        return 0;
2856}
2857
2858/*
2859 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
2860 */
2861static inline int fetch_robust_entry(struct robust_list __user **entry,
2862                                     struct robust_list __user * __user *head,
2863                                     unsigned int *pi)
2864{
2865        unsigned long uentry;
2866
2867        if (get_user(uentry, (unsigned long __user *)head))
2868                return -EFAULT;
2869
2870        *entry = (void __user *)(uentry & ~1UL);
2871        *pi = uentry & 1;
2872
2873        return 0;
2874}
2875
2876/*
2877 * Walk curr->robust_list (very carefully, it's a userspace list!)
2878 * and mark any locks found there dead, and notify any waiters.
2879 *
2880 * We silently return on any sign of list-walking problem.
2881 */
2882void exit_robust_list(struct task_struct *curr)
2883{
2884        struct robust_list_head __user *head = curr->robust_list;
2885        struct robust_list __user *entry, *next_entry, *pending;
2886        unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2887        unsigned int uninitialized_var(next_pi);
2888        unsigned long futex_offset;
2889        int rc;
2890
2891        if (!futex_cmpxchg_enabled)
2892                return;
2893
2894        /*
2895         * Fetch the list head (which was registered earlier, via
2896         * sys_set_robust_list()):
2897         */
2898        if (fetch_robust_entry(&entry, &head->list.next, &pi))
2899                return;
2900        /*
2901         * Fetch the relative futex offset:
2902         */
2903        if (get_user(futex_offset, &head->futex_offset))
2904                return;
2905        /*
2906         * Fetch any possibly pending lock-add first, and handle it
2907         * if it exists:
2908         */
2909        if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
2910                return;
2911
2912        next_entry = NULL;      /* avoid warning with gcc */
2913        while (entry != &head->list) {
2914                /*
2915                 * Fetch the next entry in the list before calling
2916                 * handle_futex_death:
2917                 */
2918                rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
2919                /*
2920                 * A pending lock might already be on the list, so
2921                 * don't process it twice:
2922                 */
2923                if (entry != pending)
2924                        if (handle_futex_death((void __user *)entry + futex_offset,
2925                                                curr, pi))
2926                                return;
2927                if (rc)
2928                        return;
2929                entry = next_entry;
2930                pi = next_pi;
2931                /*
2932                 * Avoid excessively long or circular lists:
2933                 */
2934                if (!--limit)
2935                        break;
2936
2937                cond_resched();
2938        }
2939
2940        if (pending)
2941                handle_futex_death((void __user *)pending + futex_offset,
2942                                   curr, pip);
2943}
2944
2945long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2946                u32 __user *uaddr2, u32 val2, u32 val3)
2947{
2948        int cmd = op & FUTEX_CMD_MASK;
2949        unsigned int flags = 0;
2950
2951        if (!(op & FUTEX_PRIVATE_FLAG))
2952                flags |= FLAGS_SHARED;
2953
2954        if (op & FUTEX_CLOCK_REALTIME) {
2955                flags |= FLAGS_CLOCKRT;
2956                if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2957                        return -ENOSYS;
2958        }
2959
2960        switch (cmd) {
2961        case FUTEX_LOCK_PI:
2962        case FUTEX_UNLOCK_PI:
2963        case FUTEX_TRYLOCK_PI:
2964        case FUTEX_WAIT_REQUEUE_PI:
2965        case FUTEX_CMP_REQUEUE_PI:
2966                if (!futex_cmpxchg_enabled)
2967                        return -ENOSYS;
2968        }
2969
2970        switch (cmd) {
2971        case FUTEX_WAIT:
2972                val3 = FUTEX_BITSET_MATCH_ANY; /* fall through */
2973        case FUTEX_WAIT_BITSET:
2974                return futex_wait(uaddr, flags, val, timeout, val3);
2975        case FUTEX_WAKE:
2976                val3 = FUTEX_BITSET_MATCH_ANY; /* fall through */
2977        case FUTEX_WAKE_BITSET:
2978                return futex_wake(uaddr, flags, val, val3);
2979        case FUTEX_REQUEUE:
2980                return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2981        case FUTEX_CMP_REQUEUE:
2982                return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2983        case FUTEX_WAKE_OP:
2984                return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2985        case FUTEX_LOCK_PI:
2986                return futex_lock_pi(uaddr, flags, val, timeout, 0);
2987        case FUTEX_UNLOCK_PI:
2988                return futex_unlock_pi(uaddr, flags);
2989        case FUTEX_TRYLOCK_PI:
2990                return futex_lock_pi(uaddr, flags, 0, timeout, 1);
2991        case FUTEX_WAIT_REQUEUE_PI:
2992                val3 = FUTEX_BITSET_MATCH_ANY;
2993                return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2994                                             uaddr2);
2995        case FUTEX_CMP_REQUEUE_PI:
2996                return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2997        }
2998        return -ENOSYS;
2999}
3000
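/*
 * Example (a minimal user-space sketch, not part of the kernel build):
 * the raw calls that land in the dispatch above. glibc provides no
 * futex() wrapper, so syscall() is used directly; the two helper names
 * are hypothetical.
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdint.h>
 *
 *	static long my_futex_wait(uint32_t *uaddr, uint32_t expected)
 *	{
 *		// Sleeps only if *uaddr still equals 'expected' once the
 *		// hash bucket lock is held.  FUTEX_PRIVATE_FLAG clears
 *		// FLAGS_SHARED above and keys the futex on the mm alone.
 *		return syscall(SYS_futex, uaddr,
 *			       FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
 *			       expected, NULL, NULL, 0);
 *	}
 *
 *	static long my_futex_wake(uint32_t *uaddr, int nr_wake)
 *	{
 *		// Wakes at most nr_wake waiters queued on *uaddr.
 *		return syscall(SYS_futex, uaddr,
 *			       FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
 *			       nr_wake, NULL, NULL, 0);
 *	}
 */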
3001
3002SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
3003                struct timespec __user *, utime, u32 __user *, uaddr2,
3004                u32, val3)
3005{
3006        struct timespec ts;
3007        ktime_t t, *tp = NULL;
3008        u32 val2 = 0;
3009        int cmd = op & FUTEX_CMD_MASK;
3010
3011        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
3012                      cmd == FUTEX_WAIT_BITSET ||
3013                      cmd == FUTEX_WAIT_REQUEUE_PI)) {
3014                if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
3015                        return -EFAULT;
3016                if (!timespec_valid(&ts))
3017                        return -EINVAL;
3018
3019                t = timespec_to_ktime(ts);
3020                if (cmd == FUTEX_WAIT)
3021                        t = ktime_add_safe(ktime_get(), t);
3022                tp = &t;
3023        }
3024        /*
3025         * The requeue parameter is passed in 'utime' if cmd == FUTEX_*_REQUEUE_*;
3026         * the number of waiters to wake on uaddr2 is passed in 'utime' for FUTEX_WAKE_OP.
3027         */
3028        if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
3029            cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
3030                val2 = (u32) (unsigned long) utime;
3031
3032        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
3033}
3034
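/*
 * Example (an illustrative sketch, not part of the kernel build): how
 * the 'utime' argument is interpreted for different commands, matching
 * the conversion above.  'word', 'word2' and 'expected' are hypothetical
 * user-space variables; the usual includes are assumed.
 *
 *	// FUTEX_WAIT: 'utime' is a relative timeout; the kernel turns it
 *	// into an absolute CLOCK_MONOTONIC expiry via ktime_add_safe().
 *	struct timespec rel = { .tv_sec = 1 };
 *	syscall(SYS_futex, &word, FUTEX_WAIT, expected, &rel, NULL, 0);
 *
 *	// FUTEX_WAIT_BITSET: 'utime' is an absolute timeout, against
 *	// CLOCK_MONOTONIC by default, or CLOCK_REALTIME when the op also
 *	// carries FUTEX_CLOCK_REALTIME.
 *	struct timespec abs;
 *	clock_gettime(CLOCK_MONOTONIC, &abs);
 *	abs.tv_sec += 1;
 *	syscall(SYS_futex, &word, FUTEX_WAIT_BITSET, expected, &abs,
 *		NULL, FUTEX_BITSET_MATCH_ANY);
 *
 *	// FUTEX_CMP_REQUEUE: 'utime' is not a pointer at all; it carries
 *	// val2, the maximum number of waiters to requeue onto word2
 *	// (after waking up to 'val' of them), provided word still equals
 *	// 'expected'.
 *	syscall(SYS_futex, &word, FUTEX_CMP_REQUEUE, 1,
 *		(void *)(unsigned long)INT_MAX, &word2, expected);
 */
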
3035static int __init futex_init(void)
3036{
3037        u32 curval;
3038        unsigned int futex_shift;
3039        unsigned long i;
3040
3041#if CONFIG_BASE_SMALL
3042        futex_hashsize = 16;
3043#else
3044        futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
3045#endif
3046
3047        futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
3048                                               futex_hashsize, 0,
3049                                               futex_hashsize < 256 ? HASH_SMALL : 0,
3050                                               &futex_shift, NULL,
3051                                               futex_hashsize, futex_hashsize);
3052        futex_hashsize = 1UL << futex_shift;
3053        /*
3054         * This cmpxchg is expected to fail, and that is what we want.
3055         * Some arch implementations do runtime detection of the
3056         * futex_atomic_cmpxchg_inatomic() functionality. We want to
3057         * know that before we call into any of the complex code paths,
3058         * and we also want to prevent registration of robust lists in
3059         * that case. NULL is guaranteed to fault, so we get -EFAULT on
3060         * a functional implementation, while the non-functional ones
3061         * return -ENOSYS.
3062         */
3063        if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
3064                futex_cmpxchg_enabled = 1;
3065
3066        for (i = 0; i < futex_hashsize; i++) {
3067                atomic_set(&futex_queues[i].waiters, 0);
3068                plist_head_init(&futex_queues[i].chain);
3069                spin_lock_init(&futex_queues[i].lock);
3070        }
3071
3072        return 0;
3073}
3074__initcall(futex_init);
3075
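/*
 * Example (illustrative arithmetic, not part of the kernel build): with
 * the default (!CONFIG_BASE_SMALL) sizing above, a machine with 16
 * possible CPUs asks for roundup_pow_of_two(256 * 16) = 4096 hash
 * buckets; alloc_large_system_hash() then reports the actual power of
 * two it allocated through futex_shift.  A stand-alone sketch of the
 * same rounding (hypothetical helper name):
 *
 *	static unsigned long futex_hash_buckets(unsigned int ncpus)
 *	{
 *		unsigned long want = 256UL * ncpus, size = 1;
 *
 *		while (size < want)	// round up to a power of two
 *			size <<= 1;
 *		return size;		// e.g. 4096 for ncpus == 16
 *	}
 */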