linux/kernel/futex.c
   1/*
   2 *  Fast Userspace Mutexes (which I call "Futexes!").
   3 *  (C) Rusty Russell, IBM 2002
   4 *
   5 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
   6 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
   7 *
   8 *  Removed page pinning, fix privately mapped COW pages and other cleanups
   9 *  (C) Copyright 2003, 2004 Jamie Lokier
  10 *
  11 *  Robust futex support started by Ingo Molnar
  12 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  13 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  14 *
  15 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
  16 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  17 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  18 *
  19 *  PRIVATE futexes by Eric Dumazet
  20 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
  21 *
  22 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
  23 *  Copyright (C) IBM Corporation, 2009
  24 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
  25 *
  26 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  27 *  enough at me, Linus for the original (flawed) idea, Matthew
  28 *  Kirkwood for proof-of-concept implementation.
  29 *
  30 *  "The futexes are also cursed."
  31 *  "But they come in a choice of three flavours!"
  32 *
  33 *  This program is free software; you can redistribute it and/or modify
  34 *  it under the terms of the GNU General Public License as published by
  35 *  the Free Software Foundation; either version 2 of the License, or
  36 *  (at your option) any later version.
  37 *
  38 *  This program is distributed in the hope that it will be useful,
  39 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  40 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  41 *  GNU General Public License for more details.
  42 *
  43 *  You should have received a copy of the GNU General Public License
  44 *  along with this program; if not, write to the Free Software
  45 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  46 */
  47#include <linux/slab.h>
  48#include <linux/poll.h>
  49#include <linux/fs.h>
  50#include <linux/file.h>
  51#include <linux/jhash.h>
  52#include <linux/init.h>
  53#include <linux/futex.h>
  54#include <linux/mount.h>
  55#include <linux/pagemap.h>
  56#include <linux/syscalls.h>
  57#include <linux/signal.h>
  58#include <linux/export.h>
  59#include <linux/magic.h>
  60#include <linux/pid.h>
  61#include <linux/nsproxy.h>
  62#include <linux/ptrace.h>
  63#include <linux/sched/rt.h>
  64#include <linux/hugetlb.h>
  65#include <linux/freezer.h>
  66#include <linux/bootmem.h>
  67
  68#include <asm/futex.h>
  69
  70#include "locking/rtmutex_common.h"
  71
  72/*
  73 * Basic futex operation and ordering guarantees:
  74 *
  75 * The waiter reads the futex value in user space and calls
  76 * futex_wait(). This function computes the hash bucket and acquires
  77 * the hash bucket lock. After that it reads the futex user space value
  78 * again and verifies that the data has not changed. If it has not changed
  79 * it enqueues itself into the hash bucket, releases the hash bucket lock
  80 * and schedules.
  81 *
  82 * The waker side modifies the user space value of the futex and calls
  83 * futex_wake(). This function computes the hash bucket and acquires the
  84 * hash bucket lock. Then it looks for waiters on that futex in the hash
  85 * bucket and wakes them.
  86 *
  87 * In futex wake up scenarios where no tasks are blocked on a futex, taking
   88 * the hb spinlock can be avoided and the waker can simply return. In order for this
  89 * optimization to work, ordering guarantees must exist so that the waiter
  90 * being added to the list is acknowledged when the list is concurrently being
  91 * checked by the waker, avoiding scenarios like the following:
  92 *
  93 * CPU 0                               CPU 1
  94 * val = *futex;
  95 * sys_futex(WAIT, futex, val);
  96 *   futex_wait(futex, val);
  97 *   uval = *futex;
  98 *                                     *futex = newval;
  99 *                                     sys_futex(WAKE, futex);
 100 *                                       futex_wake(futex);
 101 *                                       if (queue_empty())
 102 *                                         return;
 103 *   if (uval == val)
  104 *     lock(hash_bucket(futex));
  105 *     queue();
 106 *     unlock(hash_bucket(futex));
 107 *     schedule();
 108 *
 109 * This would cause the waiter on CPU 0 to wait forever because it
 110 * missed the transition of the user space value from val to newval
 111 * and the waker did not find the waiter in the hash bucket queue.
 112 *
 113 * The correct serialization ensures that a waiter either observes
 114 * the changed user space value before blocking or is woken by a
 115 * concurrent waker:
 116 *
 117 * CPU 0                                 CPU 1
 118 * val = *futex;
 119 * sys_futex(WAIT, futex, val);
 120 *   futex_wait(futex, val);
 121 *
 122 *   waiters++;
 123 *   mb(); (A) <-- paired with -.
 124 *                              |
 125 *   lock(hash_bucket(futex));  |
 126 *                              |
 127 *   uval = *futex;             |
 128 *                              |        *futex = newval;
 129 *                              |        sys_futex(WAKE, futex);
 130 *                              |          futex_wake(futex);
 131 *                              |
 132 *                              `------->  mb(); (B)
 133 *   if (uval == val)
 134 *     queue();
 135 *     unlock(hash_bucket(futex));
 136 *     schedule();                         if (waiters)
 137 *                                           lock(hash_bucket(futex));
 138 *                                           wake_waiters(futex);
 139 *                                           unlock(hash_bucket(futex));
 140 *
 141 * Where (A) orders the waiters increment and the futex value read -- this
  142 * is guaranteed by the full barrier in hb_waiters_inc(); and where (B)
 143 * orders the write to futex and the waiters read -- this is done by the
 144 * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
 145 * depending on the futex type.
 146 *
 147 * This yields the following case (where X:=waiters, Y:=futex):
 148 *
 149 *      X = Y = 0
 150 *
 151 *      w[X]=1          w[Y]=1
 152 *      MB              MB
 153 *      r[Y]=y          r[X]=x
 154 *
 155 * Which guarantees that x==0 && y==0 is impossible; which translates back into
 156 * the guarantee that we cannot both miss the futex variable change and the
 157 * enqueue.
 158 */
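/*
 * For reference, the user space side of the protocol that the above
 * guarantees protect looks roughly like this. This is a minimal,
 * illustrative sketch (not part of this file); fwait()/fwake() are just
 * wrappers around the raw syscall and error handling is omitted:
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static long fwait(int *uaddr, int val)
 *	{
 *		return syscall(SYS_futex, uaddr, FUTEX_WAIT, val, NULL, NULL, 0);
 *	}
 *
 *	static long fwake(int *uaddr, int nr)
 *	{
 *		return syscall(SYS_futex, uaddr, FUTEX_WAKE, nr, NULL, NULL, 0);
 *	}
 *
 *	Waiter:				Waker:
 *	while (!*uaddr)			*uaddr = 1;
 *		fwait(uaddr, 0);	fwake(uaddr, 1);
 *
 * The waiter loops because FUTEX_WAIT may return spuriously; the kernel
 * only promises that a waiter never blocks after missing the value change,
 * which is exactly what the ordering described above provides.
 */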
 159
 160int __read_mostly futex_cmpxchg_enabled;
 161
 162/*
 163 * Futex flags used to encode options to functions and preserve them across
 164 * restarts.
 165 */
 166#define FLAGS_SHARED            0x01
 167#define FLAGS_CLOCKRT           0x02
 168#define FLAGS_HAS_TIMEOUT       0x04
 169
 170/*
 171 * Priority Inheritance state:
 172 */
 173struct futex_pi_state {
 174        /*
 175         * list of 'owned' pi_state instances - these have to be
 176         * cleaned up in do_exit() if the task exits prematurely:
 177         */
 178        struct list_head list;
 179
 180        /*
 181         * The PI object:
 182         */
 183        struct rt_mutex pi_mutex;
 184
 185        struct task_struct *owner;
 186        atomic_t refcount;
 187
 188        union futex_key key;
 189};
 190
 191/**
 192 * struct futex_q - The hashed futex queue entry, one per waiting task
 193 * @list:               priority-sorted list of tasks waiting on this futex
 194 * @task:               the task waiting on the futex
 195 * @lock_ptr:           the hash bucket lock
 196 * @key:                the key the futex is hashed on
 197 * @pi_state:           optional priority inheritance state
 198 * @rt_waiter:          rt_waiter storage for use with requeue_pi
 199 * @requeue_pi_key:     the requeue_pi target futex key
 200 * @bitset:             bitset for the optional bitmasked wakeup
 201 *
 202 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
 203 * we can wake only the relevant ones (hashed queues may be shared).
 204 *
 205 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 206 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 207 * The order of wakeup is always to make the first condition true, then
 208 * the second.
 209 *
 210 * PI futexes are typically woken before they are removed from the hash list via
 211 * the rt_mutex code. See unqueue_me_pi().
 212 */
 213struct futex_q {
 214        struct plist_node list;
 215
 216        struct task_struct *task;
 217        spinlock_t *lock_ptr;
 218        union futex_key key;
 219        struct futex_pi_state *pi_state;
 220        struct rt_mutex_waiter *rt_waiter;
 221        union futex_key *requeue_pi_key;
 222        u32 bitset;
 223};
 224
 225static const struct futex_q futex_q_init = {
  226        /* list gets initialized in queue_me() */
 227        .key = FUTEX_KEY_INIT,
 228        .bitset = FUTEX_BITSET_MATCH_ANY
 229};
 230
 231/*
 232 * Hash buckets are shared by all the futex_keys that hash to the same
 233 * location.  Each key may have multiple futex_q structures, one for each task
 234 * waiting on a futex.
 235 */
 236struct futex_hash_bucket {
 237        atomic_t waiters;
 238        spinlock_t lock;
 239        struct plist_head chain;
 240} ____cacheline_aligned_in_smp;
 241
 242static unsigned long __read_mostly futex_hashsize;
 243
 244static struct futex_hash_bucket *futex_queues;
 245
 246static inline void futex_get_mm(union futex_key *key)
 247{
 248        atomic_inc(&key->private.mm->mm_count);
 249        /*
 250         * Ensure futex_get_mm() implies a full barrier such that
 251         * get_futex_key() implies a full barrier. This is relied upon
 252         * as full barrier (B), see the ordering comment above.
 253         */
 254        smp_mb__after_atomic_inc();
 255}
 256
 257/*
 258 * Reflects a new waiter being added to the waitqueue.
 259 */
 260static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
 261{
 262#ifdef CONFIG_SMP
 263        atomic_inc(&hb->waiters);
 264        /*
 265         * Full barrier (A), see the ordering comment above.
 266         */
 267        smp_mb__after_atomic_inc();
 268#endif
 269}
 270
 271/*
 272 * Reflects a waiter being removed from the waitqueue by wakeup
 273 * paths.
 274 */
 275static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
 276{
 277#ifdef CONFIG_SMP
 278        atomic_dec(&hb->waiters);
 279#endif
 280}
 281
 282static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
 283{
 284#ifdef CONFIG_SMP
 285        return atomic_read(&hb->waiters);
 286#else
 287        return 1;
 288#endif
 289}
 290
 291/*
 292 * We hash on the keys returned from get_futex_key (see below).
 293 */
 294static struct futex_hash_bucket *hash_futex(union futex_key *key)
 295{
 296        u32 hash = jhash2((u32*)&key->both.word,
 297                          (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
 298                          key->both.offset);
 299        return &futex_queues[hash & (futex_hashsize - 1)];
 300}
 301
 302/*
 303 * Return 1 if two futex_keys are equal, 0 otherwise.
 304 */
 305static inline int match_futex(union futex_key *key1, union futex_key *key2)
 306{
 307        return (key1 && key2
 308                && key1->both.word == key2->both.word
 309                && key1->both.ptr == key2->both.ptr
 310                && key1->both.offset == key2->both.offset);
 311}
 312
 313/*
 314 * Take a reference to the resource addressed by a key.
 315 * Can be called while holding spinlocks.
 316 *
 317 */
 318static void get_futex_key_refs(union futex_key *key)
 319{
 320        if (!key->both.ptr)
 321                return;
 322
 323        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 324        case FUT_OFF_INODE:
 325                ihold(key->shared.inode); /* implies MB (B) */
 326                break;
 327        case FUT_OFF_MMSHARED:
 328                futex_get_mm(key); /* implies MB (B) */
 329                break;
 330        }
 331}
 332
 333/*
 334 * Drop a reference to the resource addressed by a key.
 335 * The hash bucket spinlock must not be held.
 336 */
 337static void drop_futex_key_refs(union futex_key *key)
 338{
 339        if (!key->both.ptr) {
 340                /* If we're here then we tried to put a key we failed to get */
 341                WARN_ON_ONCE(1);
 342                return;
 343        }
 344
 345        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 346        case FUT_OFF_INODE:
 347                iput(key->shared.inode);
 348                break;
 349        case FUT_OFF_MMSHARED:
 350                mmdrop(key->private.mm);
 351                break;
 352        }
 353}
 354
 355/**
 356 * get_futex_key() - Get parameters which are the keys for a futex
 357 * @uaddr:      virtual address of the futex
 358 * @fshared:    0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
 359 * @key:        address where result is stored.
 360 * @rw:         mapping needs to be read/write (values: VERIFY_READ,
 361 *              VERIFY_WRITE)
 362 *
 363 * Return: a negative error code or 0
 364 *
 365 * The key words are stored in *key on success.
 366 *
 367 * For shared mappings, it's (page->index, file_inode(vma->vm_file),
 368 * offset_within_page).  For private mappings, it's (uaddr, current->mm).
 369 * We can usually work out the index without swapping in the page.
 370 *
  371 * lock_page() might sleep, so the caller must not hold a spinlock.
 372 */
 373static int
 374get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 375{
 376        unsigned long address = (unsigned long)uaddr;
 377        struct mm_struct *mm = current->mm;
 378        struct page *page, *page_head;
 379        int err, ro = 0;
 380
 381        /*
 382         * The futex address must be "naturally" aligned.
 383         */
 384        key->both.offset = address % PAGE_SIZE;
 385        if (unlikely((address % sizeof(u32)) != 0))
 386                return -EINVAL;
 387        address -= key->both.offset;
 388
 389        if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
 390                return -EFAULT;
 391
 392        /*
 393         * PROCESS_PRIVATE futexes are fast.
  394         * As the mm cannot disappear under us and the 'key' only needs the
  395         * virtual address, we don't even have to find the underlying vma.
  396         * Note: We do have to check that 'uaddr' is a valid user address,
  397         *       but access_ok() should be faster than find_vma().
 398         */
 399        if (!fshared) {
 400                key->private.mm = mm;
 401                key->private.address = address;
 402                get_futex_key_refs(key);  /* implies MB (B) */
 403                return 0;
 404        }
 405
 406again:
 407        err = get_user_pages_fast(address, 1, 1, &page);
 408        /*
  409         * If write access is not required (e.g. FUTEX_WAIT), try
  410         * to get read-only access.
 411         */
 412        if (err == -EFAULT && rw == VERIFY_READ) {
 413                err = get_user_pages_fast(address, 1, 0, &page);
 414                ro = 1;
 415        }
 416        if (err < 0)
 417                return err;
 418        else
 419                err = 0;
 420
 421#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 422        page_head = page;
 423        if (unlikely(PageTail(page))) {
 424                put_page(page);
 425                /* serialize against __split_huge_page_splitting() */
 426                local_irq_disable();
 427                if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
 428                        page_head = compound_head(page);
 429                        /*
  430                         * page_head is a valid pointer but we must pin
 431                         * it before taking the PG_lock and/or
 432                         * PG_compound_lock. The moment we re-enable
 433                         * irqs __split_huge_page_splitting() can
 434                         * return and the head page can be freed from
 435                         * under us. We can't take the PG_lock and/or
 436                         * PG_compound_lock on a page that could be
 437                         * freed from under us.
 438                         */
 439                        if (page != page_head) {
 440                                get_page(page_head);
 441                                put_page(page);
 442                        }
 443                        local_irq_enable();
 444                } else {
 445                        local_irq_enable();
 446                        goto again;
 447                }
 448        }
 449#else
 450        page_head = compound_head(page);
 451        if (page != page_head) {
 452                get_page(page_head);
 453                put_page(page);
 454        }
 455#endif
 456
 457        lock_page(page_head);
 458
 459        /*
 460         * If page_head->mapping is NULL, then it cannot be a PageAnon
 461         * page; but it might be the ZERO_PAGE or in the gate area or
 462         * in a special mapping (all cases which we are happy to fail);
 463         * or it may have been a good file page when get_user_pages_fast
 464         * found it, but truncated or holepunched or subjected to
 465         * invalidate_complete_page2 before we got the page lock (also
 466         * cases which we are happy to fail).  And we hold a reference,
 467         * so refcount care in invalidate_complete_page's remove_mapping
 468         * prevents drop_caches from setting mapping to NULL beneath us.
 469         *
 470         * The case we do have to guard against is when memory pressure made
 471         * shmem_writepage move it from filecache to swapcache beneath us:
 472         * an unlikely race, but we do need to retry for page_head->mapping.
 473         */
 474        if (!page_head->mapping) {
 475                int shmem_swizzled = PageSwapCache(page_head);
 476                unlock_page(page_head);
 477                put_page(page_head);
 478                if (shmem_swizzled)
 479                        goto again;
 480                return -EFAULT;
 481        }
 482
 483        /*
 484         * Private mappings are handled in a simple way.
 485         *
 486         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 487         * it's a read-only handle, it's expected that futexes attach to
 488         * the object not the particular process.
 489         */
 490        if (PageAnon(page_head)) {
 491                /*
 492                 * A RO anonymous page will never change and thus doesn't make
 493                 * sense for futex operations.
 494                 */
 495                if (ro) {
 496                        err = -EFAULT;
 497                        goto out;
 498                }
 499
 500                key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 501                key->private.mm = mm;
 502                key->private.address = address;
 503        } else {
 504                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 505                key->shared.inode = page_head->mapping->host;
 506                key->shared.pgoff = basepage_index(page);
 507        }
 508
 509        get_futex_key_refs(key); /* implies MB (B) */
 510
 511out:
 512        unlock_page(page_head);
 513        put_page(page_head);
 514        return err;
 515}
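/*
 * Whether the PROCESS_PRIVATE fast path in get_futex_key() is taken is
 * decided by user space: futex operations carry FUTEX_PRIVATE_FLAG when
 * the futex word is not shared between processes. A minimal illustrative
 * call (not part of this file):
 *
 *	syscall(SYS_futex, uaddr, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
 *		val, NULL, NULL, 0);
 *
 * which is the same as using FUTEX_WAIT_PRIVATE. Without the flag the
 * shared path above has to resolve the backing object and build an
 * inode or mm based key.
 */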
 516
 517static inline void put_futex_key(union futex_key *key)
 518{
 519        drop_futex_key_refs(key);
 520}
 521
 522/**
 523 * fault_in_user_writeable() - Fault in user address and verify RW access
 524 * @uaddr:      pointer to faulting user space address
 525 *
 526 * Slow path to fixup the fault we just took in the atomic write
 527 * access to @uaddr.
 528 *
 529 * We have no generic implementation of a non-destructive write to the
 530 * user address. We know that we faulted in the atomic pagefault
  531 * disabled section so we might as well avoid the #PF overhead by
  532 * calling fixup_user_fault() right away.
 533 */
 534static int fault_in_user_writeable(u32 __user *uaddr)
 535{
 536        struct mm_struct *mm = current->mm;
 537        int ret;
 538
 539        down_read(&mm->mmap_sem);
 540        ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
 541                               FAULT_FLAG_WRITE);
 542        up_read(&mm->mmap_sem);
 543
 544        return ret < 0 ? ret : 0;
 545}
 546
 547/**
 548 * futex_top_waiter() - Return the highest priority waiter on a futex
 549 * @hb:         the hash bucket the futex_q's reside in
  550 * @key:        the futex key (to distinguish it from other futex_q's)
 551 *
 552 * Must be called with the hb lock held.
 553 */
 554static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
 555                                        union futex_key *key)
 556{
 557        struct futex_q *this;
 558
 559        plist_for_each_entry(this, &hb->chain, list) {
 560                if (match_futex(&this->key, key))
 561                        return this;
 562        }
 563        return NULL;
 564}
 565
 566static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
 567                                      u32 uval, u32 newval)
 568{
 569        int ret;
 570
 571        pagefault_disable();
 572        ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
 573        pagefault_enable();
 574
 575        return ret;
 576}
 577
 578static int get_futex_value_locked(u32 *dest, u32 __user *from)
 579{
 580        int ret;
 581
 582        pagefault_disable();
 583        ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
 584        pagefault_enable();
 585
 586        return ret ? -EFAULT : 0;
 587}
 588
 589
 590/*
 591 * PI code:
 592 */
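/*
 * The user space side of a PI futex keeps the owner's TID in the futex
 * word, so the kernel is entered only on contention. Roughly (a minimal
 * illustrative sketch, not part of this file; cmpxchg() stands for any
 * atomic compare-and-swap returning the old value, error handling
 * omitted):
 *
 *	lock:
 *		if (cmpxchg(uaddr, 0, gettid()) != 0)
 *			syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *	unlock:
 *		if (cmpxchg(uaddr, gettid(), 0) != gettid())
 *			syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 *
 * The FUTEX_WAITERS bit set by the kernel makes the unlock cmpxchg fail,
 * forcing the owner into the kernel so the lock (and the rt_mutex managed
 * below) is handed over to the highest priority waiter.
 */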
 593static int refill_pi_state_cache(void)
 594{
 595        struct futex_pi_state *pi_state;
 596
 597        if (likely(current->pi_state_cache))
 598                return 0;
 599
 600        pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
 601
 602        if (!pi_state)
 603                return -ENOMEM;
 604
 605        INIT_LIST_HEAD(&pi_state->list);
 606        /* pi_mutex gets initialized later */
 607        pi_state->owner = NULL;
 608        atomic_set(&pi_state->refcount, 1);
 609        pi_state->key = FUTEX_KEY_INIT;
 610
 611        current->pi_state_cache = pi_state;
 612
 613        return 0;
 614}
 615
 616static struct futex_pi_state * alloc_pi_state(void)
 617{
 618        struct futex_pi_state *pi_state = current->pi_state_cache;
 619
 620        WARN_ON(!pi_state);
 621        current->pi_state_cache = NULL;
 622
 623        return pi_state;
 624}
 625
 626static void free_pi_state(struct futex_pi_state *pi_state)
 627{
 628        if (!atomic_dec_and_test(&pi_state->refcount))
 629                return;
 630
 631        /*
 632         * If pi_state->owner is NULL, the owner is most probably dying
 633         * and has cleaned up the pi_state already
 634         */
 635        if (pi_state->owner) {
 636                raw_spin_lock_irq(&pi_state->owner->pi_lock);
 637                list_del_init(&pi_state->list);
 638                raw_spin_unlock_irq(&pi_state->owner->pi_lock);
 639
 640                rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
 641        }
 642
 643        if (current->pi_state_cache)
 644                kfree(pi_state);
 645        else {
 646                /*
 647                 * pi_state->list is already empty.
 648                 * clear pi_state->owner.
 649                 * refcount is at 0 - put it back to 1.
 650                 */
 651                pi_state->owner = NULL;
 652                atomic_set(&pi_state->refcount, 1);
 653                current->pi_state_cache = pi_state;
 654        }
 655}
 656
 657/*
 658 * Look up the task based on what TID userspace gave us.
  659 * We don't trust it.
 660 */
 661static struct task_struct * futex_find_get_task(pid_t pid)
 662{
 663        struct task_struct *p;
 664
 665        rcu_read_lock();
 666        p = find_task_by_vpid(pid);
 667        if (p)
 668                get_task_struct(p);
 669
 670        rcu_read_unlock();
 671
 672        return p;
 673}
 674
 675/*
 676 * This task is holding PI mutexes at exit time => bad.
 677 * Kernel cleans up PI-state, but userspace is likely hosed.
 678 * (Robust-futex cleanup is separate and might save the day for userspace.)
 679 */
 680void exit_pi_state_list(struct task_struct *curr)
 681{
 682        struct list_head *next, *head = &curr->pi_state_list;
 683        struct futex_pi_state *pi_state;
 684        struct futex_hash_bucket *hb;
 685        union futex_key key = FUTEX_KEY_INIT;
 686
 687        if (!futex_cmpxchg_enabled)
 688                return;
 689        /*
 690         * We are a ZOMBIE and nobody can enqueue itself on
 691         * pi_state_list anymore, but we have to be careful
 692         * versus waiters unqueueing themselves:
 693         */
 694        raw_spin_lock_irq(&curr->pi_lock);
 695        while (!list_empty(head)) {
 696
 697                next = head->next;
 698                pi_state = list_entry(next, struct futex_pi_state, list);
 699                key = pi_state->key;
 700                hb = hash_futex(&key);
 701                raw_spin_unlock_irq(&curr->pi_lock);
 702
 703                spin_lock(&hb->lock);
 704
 705                raw_spin_lock_irq(&curr->pi_lock);
 706                /*
 707                 * We dropped the pi-lock, so re-check whether this
 708                 * task still owns the PI-state:
 709                 */
 710                if (head->next != next) {
 711                        spin_unlock(&hb->lock);
 712                        continue;
 713                }
 714
 715                WARN_ON(pi_state->owner != curr);
 716                WARN_ON(list_empty(&pi_state->list));
 717                list_del_init(&pi_state->list);
 718                pi_state->owner = NULL;
 719                raw_spin_unlock_irq(&curr->pi_lock);
 720
 721                rt_mutex_unlock(&pi_state->pi_mutex);
 722
 723                spin_unlock(&hb->lock);
 724
 725                raw_spin_lock_irq(&curr->pi_lock);
 726        }
 727        raw_spin_unlock_irq(&curr->pi_lock);
 728}
 729
 730static int
 731lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 732                union futex_key *key, struct futex_pi_state **ps)
 733{
 734        struct futex_pi_state *pi_state = NULL;
 735        struct futex_q *this, *next;
 736        struct task_struct *p;
 737        pid_t pid = uval & FUTEX_TID_MASK;
 738
 739        plist_for_each_entry_safe(this, next, &hb->chain, list) {
 740                if (match_futex(&this->key, key)) {
 741                        /*
 742                         * Another waiter already exists - bump up
 743                         * the refcount and return its pi_state:
 744                         */
 745                        pi_state = this->pi_state;
 746                        /*
 747                         * Userspace might have messed up non-PI and PI futexes
 748                         */
 749                        if (unlikely(!pi_state))
 750                                return -EINVAL;
 751
 752                        WARN_ON(!atomic_read(&pi_state->refcount));
 753
 754                        /*
 755                         * When pi_state->owner is NULL then the owner died
 756                         * and another waiter is on the fly. pi_state->owner
 757                         * is fixed up by the task which acquires
 758                         * pi_state->rt_mutex.
 759                         *
 760                         * We do not check for pid == 0 which can happen when
 761                         * the owner died and robust_list_exit() cleared the
 762                         * TID.
 763                         */
 764                        if (pid && pi_state->owner) {
 765                                /*
 766                                 * Bail out if user space manipulated the
 767                                 * futex value.
 768                                 */
 769                                if (pid != task_pid_vnr(pi_state->owner))
 770                                        return -EINVAL;
 771                        }
 772
 773                        atomic_inc(&pi_state->refcount);
 774                        *ps = pi_state;
 775
 776                        return 0;
 777                }
 778        }
 779
 780        /*
 781         * We are the first waiter - try to look up the real owner and attach
 782         * the new pi_state to it, but bail out when TID = 0
 783         */
 784        if (!pid)
 785                return -ESRCH;
 786        p = futex_find_get_task(pid);
 787        if (!p)
 788                return -ESRCH;
 789
 790        /*
  791         * We need to look at the task state flags to figure out
 792         * whether the task is exiting. To protect against the do_exit
 793         * change of the task flags, we do this protected by
 794         * p->pi_lock:
 795         */
 796        raw_spin_lock_irq(&p->pi_lock);
 797        if (unlikely(p->flags & PF_EXITING)) {
 798                /*
 799                 * The task is on the way out. When PF_EXITPIDONE is
 800                 * set, we know that the task has finished the
 801                 * cleanup:
 802                 */
 803                int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
 804
 805                raw_spin_unlock_irq(&p->pi_lock);
 806                put_task_struct(p);
 807                return ret;
 808        }
 809
 810        pi_state = alloc_pi_state();
 811
 812        /*
 813         * Initialize the pi_mutex in locked state and make 'p'
 814         * the owner of it:
 815         */
 816        rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
 817
 818        /* Store the key for possible exit cleanups: */
 819        pi_state->key = *key;
 820
 821        WARN_ON(!list_empty(&pi_state->list));
 822        list_add(&pi_state->list, &p->pi_state_list);
 823        pi_state->owner = p;
 824        raw_spin_unlock_irq(&p->pi_lock);
 825
 826        put_task_struct(p);
 827
 828        *ps = pi_state;
 829
 830        return 0;
 831}
 832
 833/**
 834 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 835 * @uaddr:              the pi futex user address
 836 * @hb:                 the pi futex hash bucket
 837 * @key:                the futex key associated with uaddr and hb
 838 * @ps:                 the pi_state pointer where we store the result of the
 839 *                      lookup
 840 * @task:               the task to perform the atomic lock work for.  This will
 841 *                      be "current" except in the case of requeue pi.
 842 * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
 843 *
 844 * Return:
 845 *  0 - ready to wait;
 846 *  1 - acquired the lock;
 847 * <0 - error
 848 *
 849 * The hb->lock and futex_key refs shall be held by the caller.
 850 */
 851static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 852                                union futex_key *key,
 853                                struct futex_pi_state **ps,
 854                                struct task_struct *task, int set_waiters)
 855{
 856        int lock_taken, ret, force_take = 0;
 857        u32 uval, newval, curval, vpid = task_pid_vnr(task);
 858
 859retry:
 860        ret = lock_taken = 0;
 861
 862        /*
 863         * To avoid races, we attempt to take the lock here again
 864         * (by doing a 0 -> TID atomic cmpxchg), while holding all
 865         * the locks. It will most likely not succeed.
 866         */
 867        newval = vpid;
 868        if (set_waiters)
 869                newval |= FUTEX_WAITERS;
 870
 871        if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
 872                return -EFAULT;
 873
 874        /*
 875         * Detect deadlocks.
 876         */
 877        if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
 878                return -EDEADLK;
 879
 880        /*
 881         * Surprise - we got the lock. Just return to userspace:
 882         */
 883        if (unlikely(!curval))
 884                return 1;
 885
 886        uval = curval;
 887
 888        /*
 889         * Set the FUTEX_WAITERS flag, so the owner will know it has someone
 890         * to wake at the next unlock.
 891         */
 892        newval = curval | FUTEX_WAITERS;
 893
 894        /*
 895         * Should we force take the futex? See below.
 896         */
 897        if (unlikely(force_take)) {
 898                /*
 899                 * Keep the OWNER_DIED and the WAITERS bit and set the
 900                 * new TID value.
 901                 */
 902                newval = (curval & ~FUTEX_TID_MASK) | vpid;
 903                force_take = 0;
 904                lock_taken = 1;
 905        }
 906
 907        if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
 908                return -EFAULT;
 909        if (unlikely(curval != uval))
 910                goto retry;
 911
 912        /*
 913         * We took the lock due to forced take over.
 914         */
 915        if (unlikely(lock_taken))
 916                return 1;
 917
 918        /*
  919         * We don't have the lock. Look up the PI state (or create it if
 920         * we are the first waiter):
 921         */
 922        ret = lookup_pi_state(uval, hb, key, ps);
 923
 924        if (unlikely(ret)) {
 925                switch (ret) {
 926                case -ESRCH:
 927                        /*
 928                         * We failed to find an owner for this
 929                         * futex. So we have no pi_state to block
 930                         * on. This can happen in two cases:
 931                         *
 932                         * 1) The owner died
 933                         * 2) A stale FUTEX_WAITERS bit
 934                         *
 935                         * Re-read the futex value.
 936                         */
 937                        if (get_futex_value_locked(&curval, uaddr))
 938                                return -EFAULT;
 939
 940                        /*
 941                         * If the owner died or we have a stale
 942                         * WAITERS bit the owner TID in the user space
 943                         * futex is 0.
 944                         */
 945                        if (!(curval & FUTEX_TID_MASK)) {
 946                                force_take = 1;
 947                                goto retry;
 948                        }
 949                default:
 950                        break;
 951                }
 952        }
 953
 954        return ret;
 955}
 956
 957/**
 958 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
 959 * @q:  The futex_q to unqueue
 960 *
 961 * The q->lock_ptr must not be NULL and must be held by the caller.
 962 */
 963static void __unqueue_futex(struct futex_q *q)
 964{
 965        struct futex_hash_bucket *hb;
 966
 967        if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
 968            || WARN_ON(plist_node_empty(&q->list)))
 969                return;
 970
 971        hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
 972        plist_del(&q->list, &hb->chain);
 973        hb_waiters_dec(hb);
 974}
 975
 976/*
 977 * The hash bucket lock must be held when this is called.
 978 * Afterwards, the futex_q must not be accessed.
 979 */
 980static void wake_futex(struct futex_q *q)
 981{
 982        struct task_struct *p = q->task;
 983
 984        if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
 985                return;
 986
 987        /*
 988         * We set q->lock_ptr = NULL _before_ we wake up the task. If
 989         * a non-futex wake up happens on another CPU then the task
 990         * might exit and p would dereference a non-existing task
 991         * struct. Prevent this by holding a reference on p across the
 992         * wake up.
 993         */
 994        get_task_struct(p);
 995
 996        __unqueue_futex(q);
 997        /*
 998         * The waiting task can free the futex_q as soon as
 999         * q->lock_ptr = NULL is written, without taking any locks. A
1000         * memory barrier is required here to prevent the following
1001         * store to lock_ptr from getting ahead of the plist_del.
1002         */
1003        smp_wmb();
1004        q->lock_ptr = NULL;
1005
1006        wake_up_state(p, TASK_NORMAL);
1007        put_task_struct(p);
1008}
1009
1010static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
1011{
1012        struct task_struct *new_owner;
1013        struct futex_pi_state *pi_state = this->pi_state;
1014        u32 uninitialized_var(curval), newval;
1015
1016        if (!pi_state)
1017                return -EINVAL;
1018
1019        /*
1020         * If current does not own the pi_state then the futex is
1021         * inconsistent and user space fiddled with the futex value.
1022         */
1023        if (pi_state->owner != current)
1024                return -EINVAL;
1025
1026        raw_spin_lock(&pi_state->pi_mutex.wait_lock);
1027        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
1028
1029        /*
1030         * It is possible that the next waiter (the one that brought
1031         * this owner to the kernel) timed out and is no longer
1032         * waiting on the lock.
1033         */
1034        if (!new_owner)
1035                new_owner = this->task;
1036
1037        /*
1038         * We pass it to the next owner. (The WAITERS bit is always
1039         * kept enabled while there is PI state around. We must also
1040         * preserve the owner died bit.)
1041         */
1042        if (!(uval & FUTEX_OWNER_DIED)) {
1043                int ret = 0;
1044
1045                newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1046
1047                if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1048                        ret = -EFAULT;
1049                else if (curval != uval)
1050                        ret = -EINVAL;
1051                if (ret) {
1052                        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1053                        return ret;
1054                }
1055        }
1056
1057        raw_spin_lock_irq(&pi_state->owner->pi_lock);
1058        WARN_ON(list_empty(&pi_state->list));
1059        list_del_init(&pi_state->list);
1060        raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1061
1062        raw_spin_lock_irq(&new_owner->pi_lock);
1063        WARN_ON(!list_empty(&pi_state->list));
1064        list_add(&pi_state->list, &new_owner->pi_state_list);
1065        pi_state->owner = new_owner;
1066        raw_spin_unlock_irq(&new_owner->pi_lock);
1067
1068        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1069        rt_mutex_unlock(&pi_state->pi_mutex);
1070
1071        return 0;
1072}
1073
1074static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
1075{
1076        u32 uninitialized_var(oldval);
1077
1078        /*
1079         * There is no waiter, so we unlock the futex. The owner died
 1080         * bit does not need to be preserved here. We are the owner:
1081         */
1082        if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
1083                return -EFAULT;
1084        if (oldval != uval)
1085                return -EAGAIN;
1086
1087        return 0;
1088}
1089
1090/*
1091 * Express the locking dependencies for lockdep:
1092 */
1093static inline void
1094double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1095{
1096        if (hb1 <= hb2) {
1097                spin_lock(&hb1->lock);
1098                if (hb1 < hb2)
1099                        spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
1100        } else { /* hb1 > hb2 */
1101                spin_lock(&hb2->lock);
1102                spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
1103        }
1104}
1105
1106static inline void
1107double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1108{
1109        spin_unlock(&hb1->lock);
1110        if (hb1 != hb2)
1111                spin_unlock(&hb2->lock);
1112}
1113
1114/*
1115 * Wake up waiters matching bitset queued on this futex (uaddr).
1116 */
1117static int
1118futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1119{
1120        struct futex_hash_bucket *hb;
1121        struct futex_q *this, *next;
1122        union futex_key key = FUTEX_KEY_INIT;
1123        int ret;
1124
1125        if (!bitset)
1126                return -EINVAL;
1127
1128        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
1129        if (unlikely(ret != 0))
1130                goto out;
1131
1132        hb = hash_futex(&key);
1133
 1134        /* Make sure we really have tasks to wake up */
1135        if (!hb_waiters_pending(hb))
1136                goto out_put_key;
1137
1138        spin_lock(&hb->lock);
1139
1140        plist_for_each_entry_safe(this, next, &hb->chain, list) {
1141                if (match_futex (&this->key, &key)) {
1142                        if (this->pi_state || this->rt_waiter) {
1143                                ret = -EINVAL;
1144                                break;
1145                        }
1146
1147                        /* Check if one of the bits is set in both bitsets */
1148                        if (!(this->bitset & bitset))
1149                                continue;
1150
1151                        wake_futex(this);
1152                        if (++ret >= nr_wake)
1153                                break;
1154                }
1155        }
1156
1157        spin_unlock(&hb->lock);
1158out_put_key:
1159        put_futex_key(&key);
1160out:
1161        return ret;
1162}
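/*
 * The bitset matching above is driven from user space via
 * FUTEX_WAIT_BITSET/FUTEX_WAKE_BITSET, which pass the bitset in val3.
 * A minimal illustrative pair of calls (not part of this file):
 *
 *	wait, tagged with bit 0 only:
 *		syscall(SYS_futex, uaddr, FUTEX_WAIT_BITSET, val,
 *			NULL, NULL, 0x1);
 *
 *	wake every waiter whose bitset intersects bit 0:
 *		syscall(SYS_futex, uaddr, FUTEX_WAKE_BITSET, INT_MAX,
 *			NULL, NULL, 0x1);
 *
 * Plain FUTEX_WAIT/FUTEX_WAKE behave as if the bitset were
 * FUTEX_BITSET_MATCH_ANY; a zero bitset can never match anything, which
 * is why it is rejected with -EINVAL above.
 */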
1163
1164/*
 1165 * Apply the operation encoded in op to the futex at uaddr2, wake waiters
 1166 * on uaddr1 and, if the op comparison succeeds, waiters on uaddr2 too:
1167 */
1168static int
1169futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1170              int nr_wake, int nr_wake2, int op)
1171{
1172        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1173        struct futex_hash_bucket *hb1, *hb2;
1174        struct futex_q *this, *next;
1175        int ret, op_ret;
1176
1177retry:
1178        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1179        if (unlikely(ret != 0))
1180                goto out;
1181        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
1182        if (unlikely(ret != 0))
1183                goto out_put_key1;
1184
1185        hb1 = hash_futex(&key1);
1186        hb2 = hash_futex(&key2);
1187
1188retry_private:
1189        double_lock_hb(hb1, hb2);
1190        op_ret = futex_atomic_op_inuser(op, uaddr2);
1191        if (unlikely(op_ret < 0)) {
1192
1193                double_unlock_hb(hb1, hb2);
1194
1195#ifndef CONFIG_MMU
1196                /*
1197                 * we don't get EFAULT from MMU faults if we don't have an MMU,
1198                 * but we might get them from range checking
1199                 */
1200                ret = op_ret;
1201                goto out_put_keys;
1202#endif
1203
1204                if (unlikely(op_ret != -EFAULT)) {
1205                        ret = op_ret;
1206                        goto out_put_keys;
1207                }
1208
1209                ret = fault_in_user_writeable(uaddr2);
1210                if (ret)
1211                        goto out_put_keys;
1212
1213                if (!(flags & FLAGS_SHARED))
1214                        goto retry_private;
1215
1216                put_futex_key(&key2);
1217                put_futex_key(&key1);
1218                goto retry;
1219        }
1220
1221        plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1222                if (match_futex (&this->key, &key1)) {
1223                        if (this->pi_state || this->rt_waiter) {
1224                                ret = -EINVAL;
1225                                goto out_unlock;
1226                        }
1227                        wake_futex(this);
1228                        if (++ret >= nr_wake)
1229                                break;
1230                }
1231        }
1232
1233        if (op_ret > 0) {
1234                op_ret = 0;
1235                plist_for_each_entry_safe(this, next, &hb2->chain, list) {
1236                        if (match_futex (&this->key, &key2)) {
1237                                if (this->pi_state || this->rt_waiter) {
1238                                        ret = -EINVAL;
1239                                        goto out_unlock;
1240                                }
1241                                wake_futex(this);
1242                                if (++op_ret >= nr_wake2)
1243                                        break;
1244                        }
1245                }
1246                ret += op_ret;
1247        }
1248
1249out_unlock:
1250        double_unlock_hb(hb1, hb2);
1251out_put_keys:
1252        put_futex_key(&key2);
1253out_put_key1:
1254        put_futex_key(&key1);
1255out:
1256        return ret;
1257}
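/*
 * The op argument above encodes both an atomic modification of *uaddr2 and
 * a comparison against its old value that decides whether waiters on uaddr2
 * are woken in addition to those on uaddr1. A minimal illustrative call
 * (not part of this file): atomically set *uaddr2 to 0, wake one waiter on
 * uaddr1 and, if the old value of *uaddr2 was non-zero, one waiter on
 * uaddr2:
 *
 *	syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP, 1, (void *)1UL, uaddr2,
 *		FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_NE, 0));
 *
 * Note that the timeout slot is reinterpreted as nr_wake2 for this
 * operation, hence the bare integer cast.
 */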
1258
1259/**
1260 * requeue_futex() - Requeue a futex_q from one hb to another
1261 * @q:          the futex_q to requeue
1262 * @hb1:        the source hash_bucket
1263 * @hb2:        the target hash_bucket
1264 * @key2:       the new key for the requeued futex_q
1265 */
1266static inline
1267void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1268                   struct futex_hash_bucket *hb2, union futex_key *key2)
1269{
1270
1271        /*
1272         * If key1 and key2 hash to the same bucket, no need to
1273         * requeue.
1274         */
1275        if (likely(&hb1->chain != &hb2->chain)) {
1276                plist_del(&q->list, &hb1->chain);
1277                hb_waiters_dec(hb1);
1278                plist_add(&q->list, &hb2->chain);
1279                hb_waiters_inc(hb2);
1280                q->lock_ptr = &hb2->lock;
1281        }
1282        get_futex_key_refs(key2);
1283        q->key = *key2;
1284}
1285
1286/**
1287 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1288 * @q:          the futex_q
1289 * @key:        the key of the requeue target futex
1290 * @hb:         the hash_bucket of the requeue target futex
1291 *
1292 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1293 * target futex if it is uncontended or via a lock steal.  Set the futex_q key
1294 * to the requeue target futex so the waiter can detect the wakeup on the right
1295 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1296 * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
1297 * to protect access to the pi_state to fixup the owner later.  Must be called
1298 * with both q->lock_ptr and hb->lock held.
1299 */
1300static inline
1301void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1302                           struct futex_hash_bucket *hb)
1303{
1304        get_futex_key_refs(key);
1305        q->key = *key;
1306
1307        __unqueue_futex(q);
1308
1309        WARN_ON(!q->rt_waiter);
1310        q->rt_waiter = NULL;
1311
1312        q->lock_ptr = &hb->lock;
1313
1314        wake_up_state(q->task, TASK_NORMAL);
1315}
1316
1317/**
1318 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1319 * @pifutex:            the user address of the to futex
1320 * @hb1:                the from futex hash bucket, must be locked by the caller
1321 * @hb2:                the to futex hash bucket, must be locked by the caller
1322 * @key1:               the from futex key
1323 * @key2:               the to futex key
1324 * @ps:                 address to store the pi_state pointer
1325 * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
1326 *
 1327 * Try to get the lock on behalf of the top waiter if we can do it atomically.
1328 * Wake the top waiter if we succeed.  If the caller specified set_waiters,
1329 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1330 * hb1 and hb2 must be held by the caller.
1331 *
1332 * Return:
1333 *  0 - failed to acquire the lock atomically;
1334 *  1 - acquired the lock;
1335 * <0 - error
1336 */
1337static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1338                                 struct futex_hash_bucket *hb1,
1339                                 struct futex_hash_bucket *hb2,
1340                                 union futex_key *key1, union futex_key *key2,
1341                                 struct futex_pi_state **ps, int set_waiters)
1342{
1343        struct futex_q *top_waiter = NULL;
1344        u32 curval;
1345        int ret;
1346
1347        if (get_futex_value_locked(&curval, pifutex))
1348                return -EFAULT;
1349
1350        /*
1351         * Find the top_waiter and determine if there are additional waiters.
1352         * If the caller intends to requeue more than 1 waiter to pifutex,
1353         * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1354         * as we have means to handle the possible fault.  If not, don't set
 1355         * the bit unnecessarily as it will force the subsequent unlock to enter
1356         * the kernel.
1357         */
1358        top_waiter = futex_top_waiter(hb1, key1);
1359
1360        /* There are no waiters, nothing for us to do. */
1361        if (!top_waiter)
1362                return 0;
1363
1364        /* Ensure we requeue to the expected futex. */
1365        if (!match_futex(top_waiter->requeue_pi_key, key2))
1366                return -EINVAL;
1367
1368        /*
1369         * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
1370         * the contended case or if set_waiters is 1.  The pi_state is returned
1371         * in ps in contended cases.
1372         */
1373        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1374                                   set_waiters);
1375        if (ret == 1)
1376                requeue_pi_wake_futex(top_waiter, key2, hb2);
1377
1378        return ret;
1379}
1380
1381/**
1382 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1383 * @uaddr1:     source futex user address
1384 * @flags:      futex flags (FLAGS_SHARED, etc.)
1385 * @uaddr2:     target futex user address
1386 * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
1387 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1388 * @cmpval:     @uaddr1 expected value (or %NULL)
1389 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1390 *              pi futex (pi to pi requeue is not supported)
1391 *
1392 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1393 * uaddr2 atomically on behalf of the top waiter.
1394 *
1395 * Return:
1396 * >=0 - on success, the number of tasks requeued or woken;
1397 *  <0 - on error
1398 */
1399static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1400                         u32 __user *uaddr2, int nr_wake, int nr_requeue,
1401                         u32 *cmpval, int requeue_pi)
1402{
1403        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1404        int drop_count = 0, task_count = 0, ret;
1405        struct futex_pi_state *pi_state = NULL;
1406        struct futex_hash_bucket *hb1, *hb2;
1407        struct futex_q *this, *next;
1408        u32 curval2;
1409
1410        if (requeue_pi) {
1411                /*
1412                 * requeue_pi requires a pi_state, try to allocate it now
1413                 * without any locks in case it fails.
1414                 */
1415                if (refill_pi_state_cache())
1416                        return -ENOMEM;
1417                /*
1418                 * requeue_pi must wake as many tasks as it can, up to nr_wake
1419                 * + nr_requeue, since it acquires the rt_mutex prior to
1420                 * returning to userspace, so as to not leave the rt_mutex with
1421                 * waiters and no owner.  However, second and third wake-ups
1422                 * cannot be predicted as they involve race conditions with the
1423                 * first wake and a fault while looking up the pi_state.  Both
1424                 * pthread_cond_signal() and pthread_cond_broadcast() should
1425                 * use nr_wake=1.
1426                 */
1427                if (nr_wake != 1)
1428                        return -EINVAL;
1429        }
1430
1431retry:
1432        if (pi_state != NULL) {
1433                /*
 1434                 * We will have to look up the pi_state again, so free this one
1435                 * to keep the accounting correct.
1436                 */
1437                free_pi_state(pi_state);
1438                pi_state = NULL;
1439        }
1440
1441        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1442        if (unlikely(ret != 0))
1443                goto out;
1444        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1445                            requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1446        if (unlikely(ret != 0))
1447                goto out_put_key1;
1448
1449        hb1 = hash_futex(&key1);
1450        hb2 = hash_futex(&key2);
1451
1452retry_private:
1453        double_lock_hb(hb1, hb2);
1454
1455        if (likely(cmpval != NULL)) {
1456                u32 curval;
1457
1458                ret = get_futex_value_locked(&curval, uaddr1);
1459
1460                if (unlikely(ret)) {
1461                        double_unlock_hb(hb1, hb2);
1462
1463                        ret = get_user(curval, uaddr1);
1464                        if (ret)
1465                                goto out_put_keys;
1466
1467                        if (!(flags & FLAGS_SHARED))
1468                                goto retry_private;
1469
1470                        put_futex_key(&key2);
1471                        put_futex_key(&key1);
1472                        goto retry;
1473                }
1474                if (curval != *cmpval) {
1475                        ret = -EAGAIN;
1476                        goto out_unlock;
1477                }
1478        }
1479
1480        if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1481                /*
1482                 * Attempt to acquire uaddr2 and wake the top waiter. If we
1483                 * intend to requeue waiters, force setting the FUTEX_WAITERS
1484                 * bit.  We force this here where we are able to easily handle
 1485                 * faults rather than in the requeue loop below.
1486                 */
1487                ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1488                                                 &key2, &pi_state, nr_requeue);
1489
1490                /*
1491                 * At this point the top_waiter has either taken uaddr2 or is
1492                 * waiting on it.  If the former, then the pi_state will not
1493                 * exist yet, look it up one more time to ensure we have a
1494                 * reference to it.
1495                 */
1496                if (ret == 1) {
1497                        WARN_ON(pi_state);
1498                        drop_count++;
1499                        task_count++;
1500                        ret = get_futex_value_locked(&curval2, uaddr2);
1501                        if (!ret)
1502                                ret = lookup_pi_state(curval2, hb2, &key2,
1503                                                      &pi_state);
1504                }
1505
1506                switch (ret) {
1507                case 0:
1508                        break;
1509                case -EFAULT:
1510                        double_unlock_hb(hb1, hb2);
1511                        put_futex_key(&key2);
1512                        put_futex_key(&key1);
1513                        ret = fault_in_user_writeable(uaddr2);
1514                        if (!ret)
1515                                goto retry;
1516                        goto out;
1517                case -EAGAIN:
1518                        /* The owner was exiting, try again. */
1519                        double_unlock_hb(hb1, hb2);
1520                        put_futex_key(&key2);
1521                        put_futex_key(&key1);
1522                        cond_resched();
1523                        goto retry;
1524                default:
1525                        goto out_unlock;
1526                }
1527        }
1528
1529        plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1530                if (task_count - nr_wake >= nr_requeue)
1531                        break;
1532
1533                if (!match_futex(&this->key, &key1))
1534                        continue;
1535
1536                /*
1537                 * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1538                 * be paired with each other and no other futex ops.
1539                 *
1540                 * We should never be requeueing a futex_q with a pi_state,
1541                 * which is awaiting a futex_unlock_pi().
1542                 */
1543                if ((requeue_pi && !this->rt_waiter) ||
1544                    (!requeue_pi && this->rt_waiter) ||
1545                    this->pi_state) {
1546                        ret = -EINVAL;
1547                        break;
1548                }
1549
1550                /*
1551                 * Wake nr_wake waiters.  For requeue_pi, if we acquired the
1552                 * lock, we already woke the top_waiter.  If not, it will be
1553                 * woken by futex_unlock_pi().
1554                 */
1555                if (++task_count <= nr_wake && !requeue_pi) {
1556                        wake_futex(this);
1557                        continue;
1558                }
1559
1560                /* Ensure we requeue to the expected futex for requeue_pi. */
1561                if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1562                        ret = -EINVAL;
1563                        break;
1564                }
1565
1566                /*
1567                 * Requeue nr_requeue waiters and possibly one more in the case
1568                 * of requeue_pi if we couldn't acquire the lock atomically.
1569                 */
1570                if (requeue_pi) {
1571                        /* Prepare the waiter to take the rt_mutex. */
1572                        atomic_inc(&pi_state->refcount);
1573                        this->pi_state = pi_state;
1574                        ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1575                                                        this->rt_waiter,
1576                                                        this->task, 1);
1577                        if (ret == 1) {
1578                                /* We got the lock. */
1579                                requeue_pi_wake_futex(this, &key2, hb2);
1580                                drop_count++;
1581                                continue;
1582                        } else if (ret) {
1583                                /* -EDEADLK */
1584                                this->pi_state = NULL;
1585                                free_pi_state(pi_state);
1586                                goto out_unlock;
1587                        }
1588                }
1589                requeue_futex(this, hb1, hb2, &key2);
1590                drop_count++;
1591        }
1592
1593out_unlock:
1594        double_unlock_hb(hb1, hb2);
1595
1596        /*
1597         * drop_futex_key_refs() must be called outside the spinlocks. During
1598         * the requeue we moved futex_q's from the hash bucket at key1 to the
1599         * one at key2 and updated their key pointer.  We no longer need to
1600         * hold the references to key1.
1601         */
1602        while (--drop_count >= 0)
1603                drop_futex_key_refs(&key1);
1604
1605out_put_keys:
1606        put_futex_key(&key2);
1607out_put_key1:
1608        put_futex_key(&key1);
1609out:
1610        if (pi_state != NULL)
1611                free_pi_state(pi_state);
1612        return ret ? ret : task_count;
1613}
1614
1615/* The key must already be stored in q->key. */
1616static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1617        __acquires(&hb->lock)
1618{
1619        struct futex_hash_bucket *hb;
1620
1621        hb = hash_futex(&q->key);
1622
1623        /*
1624         * Increment the counter before taking the lock so that
1625         * a potential waker won't miss a task that is about to sleep but
1626         * is still waiting for the spinlock. This is safe as all queue_lock()
1627         * users end up calling queue_me(). Similarly, for housekeeping,
1628         * decrement the counter at queue_unlock() when some error has
1629         * occurred and we don't end up adding the task to the list.
1630         */
1631        hb_waiters_inc(hb);
1632
1633        q->lock_ptr = &hb->lock;
1634
1635        spin_lock(&hb->lock); /* implies MB (A) */
1636        return hb;
1637}
1638
1639static inline void
1640queue_unlock(struct futex_hash_bucket *hb)
1641        __releases(&hb->lock)
1642{
1643        spin_unlock(&hb->lock);
1644        hb_waiters_dec(hb);
1645}
1646
1647/**
1648 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1649 * @q:  The futex_q to enqueue
1650 * @hb: The destination hash bucket
1651 *
1652 * The hb->lock must be held by the caller, and is released here. A call to
1653 * queue_me() is typically paired with exactly one call to unqueue_me().  The
1654 * exceptions involve the PI related operations, which may use unqueue_me_pi()
1655 * or nothing if the unqueue is done as part of the wake process and the unqueue
1656 * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for
1657 * an example).
1658 */
1659static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1660        __releases(&hb->lock)
1661{
1662        int prio;
1663
1664        /*
1665         * The priority used to register this element is
1666         * - either the real thread-priority for the real-time threads
1667         * (i.e. threads with a priority lower than MAX_RT_PRIO)
1668         * - or MAX_RT_PRIO for non-RT threads.
1669         * Thus, all RT-threads are woken first in priority order, and
1670         * the others are woken last, in FIFO order.
1671         */
1672        prio = min(current->normal_prio, MAX_RT_PRIO);
1673
1674        plist_node_init(&q->list, prio);
1675        plist_add(&q->list, &hb->chain);
1676        q->task = current;
1677        spin_unlock(&hb->lock);
1678}
1679
1680/**
1681 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1682 * @q:  The futex_q to unqueue
1683 *
1684 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1685 * be paired with exactly one earlier call to queue_me().
1686 *
1687 * Return:
1688 *   1 - if the futex_q was still queued (and we removed it);
1689 *   0 - if the futex_q was already removed by the waking thread
1690 */
1691static int unqueue_me(struct futex_q *q)
1692{
1693        spinlock_t *lock_ptr;
1694        int ret = 0;
1695
1696        /* In the common case we don't take the spinlock, which is nice. */
1697retry:
1698        lock_ptr = q->lock_ptr;
1699        barrier();
1700        if (lock_ptr != NULL) {
1701                spin_lock(lock_ptr);
1702                /*
1703                 * q->lock_ptr can change between reading it and
1704                 * spin_lock(), causing us to take the wrong lock.  This
1705                 * corrects the race condition.
1706                 *
1707                 * Reasoning goes like this: if we have the wrong lock,
1708                 * q->lock_ptr must have changed (maybe several times)
1709                 * between reading it and the spin_lock().  It can
1710                 * change again after the spin_lock() but only if it was
1711                 * already changed before the spin_lock().  It cannot,
1712                 * however, change back to the original value.  Therefore
1713                 * we can detect whether we acquired the correct lock.
1714                 */
1715                if (unlikely(lock_ptr != q->lock_ptr)) {
1716                        spin_unlock(lock_ptr);
1717                        goto retry;
1718                }
1719                __unqueue_futex(q);
1720
1721                BUG_ON(q->pi_state);
1722
1723                spin_unlock(lock_ptr);
1724                ret = 1;
1725        }
1726
1727        drop_futex_key_refs(&q->key);
1728        return ret;
1729}
1730
1731/*
1732 * PI futexes cannot be requeued and must remove themselves from the
1733 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1734 * and dropped here.
1735 */
1736static void unqueue_me_pi(struct futex_q *q)
1737        __releases(q->lock_ptr)
1738{
1739        __unqueue_futex(q);
1740
1741        BUG_ON(!q->pi_state);
1742        free_pi_state(q->pi_state);
1743        q->pi_state = NULL;
1744
1745        spin_unlock(q->lock_ptr);
1746}
1747
1748/*
1749 * Fixup the pi_state owner with the new owner.
1750 *
1751 * Must be called with the hash bucket lock held and mm->sem held for
1752 * non-private futexes.
1753 */
1754static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1755                                struct task_struct *newowner)
1756{
1757        u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1758        struct futex_pi_state *pi_state = q->pi_state;
1759        struct task_struct *oldowner = pi_state->owner;
1760        u32 uval, uninitialized_var(curval), newval;
1761        int ret;
1762
1763        /* Owner died? */
1764        if (!pi_state->owner)
1765                newtid |= FUTEX_OWNER_DIED;
1766
1767        /*
1768         * We are here either because we stole the rtmutex from the
1769         * previous highest priority waiter or we are the highest priority
1770         * waiter but failed to get the rtmutex the first time.
1771         * We have to replace the newowner TID in the user space variable.
1772         * This must be atomic as we have to preserve the owner died bit here.
1773         *
1774         * Note: We write the user space value _before_ changing the pi_state
1775         * because we can fault here. Imagine swapped out pages or a fork
1776         * that marked all the anonymous memory readonly for cow.
1777         *
1778         * Modifying pi_state _before_ the user space value would
1779         * leave the pi_state in an inconsistent state when we fault
1780         * here, because we need to drop the hash bucket lock to
1781         * handle the fault. This might be observed in the PID check
1782         * in lookup_pi_state.
1783         */
1784retry:
1785        if (get_futex_value_locked(&uval, uaddr))
1786                goto handle_fault;
1787
1788        while (1) {
1789                newval = (uval & FUTEX_OWNER_DIED) | newtid;
1790
1791                if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1792                        goto handle_fault;
1793                if (curval == uval)
1794                        break;
1795                uval = curval;
1796        }
1797
1798        /*
1799         * We fixed up user space. Now we need to fix the pi_state
1800         * itself.
1801         */
1802        if (pi_state->owner != NULL) {
1803                raw_spin_lock_irq(&pi_state->owner->pi_lock);
1804                WARN_ON(list_empty(&pi_state->list));
1805                list_del_init(&pi_state->list);
1806                raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1807        }
1808
1809        pi_state->owner = newowner;
1810
1811        raw_spin_lock_irq(&newowner->pi_lock);
1812        WARN_ON(!list_empty(&pi_state->list));
1813        list_add(&pi_state->list, &newowner->pi_state_list);
1814        raw_spin_unlock_irq(&newowner->pi_lock);
1815        return 0;
1816
1817        /*
1818         * To handle the page fault we need to drop the hash bucket
1819         * lock here. That gives the other task (either the highest priority
1820         * waiter itself or the task which stole the rtmutex) the
1821         * chance to try the fixup of the pi_state. So once we are
1822         * back from handling the fault we need to check the pi_state
1823         * after reacquiring the hash bucket lock and before trying to
1824         * do another fixup. When the fixup has been done already we
1825         * simply return.
1826         */
1827handle_fault:
1828        spin_unlock(q->lock_ptr);
1829
1830        ret = fault_in_user_writeable(uaddr);
1831
1832        spin_lock(q->lock_ptr);
1833
1834        /*
1835         * Check if someone else fixed it for us:
1836         */
1837        if (pi_state->owner != oldowner)
1838                return 0;
1839
1840        if (ret)
1841                return ret;
1842
1843        goto retry;
1844}
1845
1846static long futex_wait_restart(struct restart_block *restart);
1847
1848/**
1849 * fixup_owner() - Post lock pi_state and corner case management
1850 * @uaddr:      user address of the futex
1851 * @q:          futex_q (contains pi_state and access to the rt_mutex)
1852 * @locked:     if the attempt to take the rt_mutex succeeded (1) or not (0)
1853 *
1854 * After attempting to lock an rt_mutex, this function is called to cleanup
1855 * the pi_state owner as well as handle race conditions that may allow us to
1856 * acquire the lock. Must be called with the hb lock held.
1857 *
1858 * Return:
1859 *  1 - success, lock taken;
1860 *  0 - success, lock not taken;
1861 * <0 - on error (-EFAULT)
1862 */
1863static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1864{
1865        struct task_struct *owner;
1866        int ret = 0;
1867
1868        if (locked) {
1869                /*
1870                 * Got the lock. We might not be the anticipated owner if we
1871                 * did a lock-steal - fix up the PI-state in that case:
1872                 */
1873                if (q->pi_state->owner != current)
1874                        ret = fixup_pi_state_owner(uaddr, q, current);
1875                goto out;
1876        }
1877
1878        /*
1879         * Catch the rare case where the lock was released when we were on the
1880         * way back before we locked the hash bucket.
1881         */
1882        if (q->pi_state->owner == current) {
1883                /*
1884                 * Try to get the rt_mutex now. This might fail as some other
1885                 * task acquired the rt_mutex after we removed ourselves from the
1886                 * rt_mutex waiters list.
1887                 */
1888                if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1889                        locked = 1;
1890                        goto out;
1891                }
1892
1893                /*
1894                 * pi_state is incorrect, some other task did a lock steal and
1895                 * we returned due to timeout or signal without taking the
1896                 * rt_mutex. Too late.
1897                 */
1898                raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1899                owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1900                if (!owner)
1901                        owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1902                raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1903                ret = fixup_pi_state_owner(uaddr, q, owner);
1904                goto out;
1905        }
1906
1907        /*
1908         * Paranoia check. If we did not take the lock, then we should not be
1909         * the owner of the rt_mutex.
1910         */
1911        if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1912                printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1913                                "pi-state %p\n", ret,
1914                                q->pi_state->pi_mutex.owner,
1915                                q->pi_state->owner);
1916
1917out:
1918        return ret ? ret : locked;
1919}
1920
1921/**
1922 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1923 * @hb:         the futex hash bucket, must be locked by the caller
1924 * @q:          the futex_q to queue up on
1925 * @timeout:    the prepared hrtimer_sleeper, or null for no timeout
1926 */
1927static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1928                                struct hrtimer_sleeper *timeout)
1929{
1930        /*
1931         * The task state is guaranteed to be set before another task can
1932         * wake it. set_current_state() is implemented using set_mb() and
1933         * queue_me() calls spin_unlock() upon completion, both serializing
1934         * access to the hash list and forcing another memory barrier.
1935         */
1936        set_current_state(TASK_INTERRUPTIBLE);
1937        queue_me(q, hb);
1938
1939        /* Arm the timer */
1940        if (timeout) {
1941                hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1942                if (!hrtimer_active(&timeout->timer))
1943                        timeout->task = NULL;
1944        }
1945
1946        /*
1947         * If we have been removed from the hash list, then another task
1948         * has tried to wake us, and we can skip the call to schedule().
1949         */
1950        if (likely(!plist_node_empty(&q->list))) {
1951                /*
1952                 * If the timer has already expired, current will already be
1953                 * flagged for rescheduling. Only call schedule if there
1954                 * is no timeout, or if it has yet to expire.
1955                 */
1956                if (!timeout || timeout->task)
1957                        freezable_schedule();
1958        }
1959        __set_current_state(TASK_RUNNING);
1960}
1961
1962/**
1963 * futex_wait_setup() - Prepare to wait on a futex
1964 * @uaddr:      the futex userspace address
1965 * @val:        the expected value
1966 * @flags:      futex flags (FLAGS_SHARED, etc.)
1967 * @q:          the associated futex_q
1968 * @hb:         storage for hash_bucket pointer to be returned to caller
1969 *
1970 * Setup the futex_q and locate the hash_bucket.  Get the futex value and
1971 * compare it with the expected value.  Handle atomic faults internally.
1972 * Return with the hb lock held and a q.key reference on success, and unlocked
1973 * with no q.key reference on failure.
1974 *
1975 * Return:
1976 *  0 - uaddr contains val and hb has been locked;
1977 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1978 */
1979static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1980                           struct futex_q *q, struct futex_hash_bucket **hb)
1981{
1982        u32 uval;
1983        int ret;
1984
1985        /*
1986         * Access the page AFTER the hash-bucket is locked.
1987         * Order is important:
1988         *
1989         *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
1990         *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
1991         *
1992         * The basic logical guarantee of a futex is that it blocks ONLY
1993         * if cond(var) is known to be true at the time of blocking, for
1994         * any cond.  If we locked the hash-bucket after testing *uaddr, that
1995         * would open a race condition where we could block indefinitely with
1996         * cond(var) false, which would violate the guarantee.
1997         *
1998         * On the other hand, we insert q and release the hash-bucket only
1999         * after testing *uaddr.  This guarantees that futex_wait() will NOT
2000         * absorb a wakeup if *uaddr does not match the desired values
2001         * while the syscall executes.
2002         */
2003retry:
2004        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
2005        if (unlikely(ret != 0))
2006                return ret;
2007
2008retry_private:
2009        *hb = queue_lock(q);
2010
2011        ret = get_futex_value_locked(&uval, uaddr);
2012
2013        if (ret) {
2014                queue_unlock(*hb);
2015
2016                ret = get_user(uval, uaddr);
2017                if (ret)
2018                        goto out;
2019
2020                if (!(flags & FLAGS_SHARED))
2021                        goto retry_private;
2022
2023                put_futex_key(&q->key);
2024                goto retry;
2025        }
2026
2027        if (uval != val) {
2028                queue_unlock(*hb);
2029                ret = -EWOULDBLOCK;
2030        }
2031
2032out:
2033        if (ret)
2034                put_futex_key(&q->key);
2035        return ret;
2036}
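
/*
 * Illustrative user-space counterpart of the ordering rules documented in
 * futex_wait_setup() above; a minimal sketch, not part of this file,
 * assuming a libc without a futex() wrapper (hence the raw syscall()).
 * The names futex_word, futex_wait_on() and futex_wake_one() are made up
 * for the example.
 *
 *      #include <linux/futex.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *      #include <stdatomic.h>
 *      #include <stdint.h>
 *
 *      static atomic_uint futex_word;          // the "var" of the comment above
 *
 *      // Waiter: the kernel re-checks the value under the hash bucket lock,
 *      // so it only sleeps if futex_word still holds 'val' at that point,
 *      // and a concurrent wake cannot be lost.
 *      static void futex_wait_on(uint32_t val)
 *      {
 *              syscall(SYS_futex, &futex_word, FUTEX_WAIT_PRIVATE, val,
 *                      NULL, NULL, 0);
 *      }
 *
 *      // Waker: change the value first, then wake, mirroring the
 *      // "Userspace waker" line in the comment above.
 *      static void futex_wake_one(uint32_t newval)
 *      {
 *              atomic_store(&futex_word, newval);
 *              syscall(SYS_futex, &futex_word, FUTEX_WAKE_PRIVATE, 1,
 *                      NULL, NULL, 0);
 *      }
 */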
2037
2038static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2039                      ktime_t *abs_time, u32 bitset)
2040{
2041        struct hrtimer_sleeper timeout, *to = NULL;
2042        struct restart_block *restart;
2043        struct futex_hash_bucket *hb;
2044        struct futex_q q = futex_q_init;
2045        int ret;
2046
2047        if (!bitset)
2048                return -EINVAL;
2049        q.bitset = bitset;
2050
2051        if (abs_time) {
2052                to = &timeout;
2053
2054                hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2055                                      CLOCK_REALTIME : CLOCK_MONOTONIC,
2056                                      HRTIMER_MODE_ABS);
2057                hrtimer_init_sleeper(to, current);
2058                hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2059                                             current->timer_slack_ns);
2060        }
2061
2062retry:
2063        /*
2064         * Prepare to wait on uaddr. On success, holds hb lock and increments
2065         * q.key refs.
2066         */
2067        ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2068        if (ret)
2069                goto out;
2070
2071        /* queue_me and wait for wakeup, timeout, or a signal. */
2072        futex_wait_queue_me(hb, &q, to);
2073
2074        /* If we were woken (and unqueued), we succeeded, whatever. */
2075        ret = 0;
2076        /* unqueue_me() drops q.key ref */
2077        if (!unqueue_me(&q))
2078                goto out;
2079        ret = -ETIMEDOUT;
2080        if (to && !to->task)
2081                goto out;
2082
2083        /*
2084         * We expect signal_pending(current), but we might be the
2085         * victim of a spurious wakeup as well.
2086         */
2087        if (!signal_pending(current))
2088                goto retry;
2089
2090        ret = -ERESTARTSYS;
2091        if (!abs_time)
2092                goto out;
2093
2094        restart = &current_thread_info()->restart_block;
2095        restart->fn = futex_wait_restart;
2096        restart->futex.uaddr = uaddr;
2097        restart->futex.val = val;
2098        restart->futex.time = abs_time->tv64;
2099        restart->futex.bitset = bitset;
2100        restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
2101
2102        ret = -ERESTART_RESTARTBLOCK;
2103
2104out:
2105        if (to) {
2106                hrtimer_cancel(&to->timer);
2107                destroy_hrtimer_on_stack(&to->timer);
2108        }
2109        return ret;
2110}
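
/*
 * A hedged sketch of how a user-space caller of the wait path above
 * typically copes with its outcomes: a changed value (EAGAIN/EWOULDBLOCK),
 * a timeout (ETIMEDOUT), a signal (EINTR) and spurious wakeups are all
 * handled by re-checking the condition in a loop.  cond_is_true() and
 * futex_word are assumptions made for the example.
 *
 *      #include <errno.h>
 *      #include <linux/futex.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *      #include <stdatomic.h>
 *      #include <stdint.h>
 *      #include <time.h>
 *
 *      extern atomic_uint futex_word;
 *      extern int cond_is_true(void);
 *
 *      static int wait_for_cond(const struct timespec *timeout)
 *      {
 *              while (!cond_is_true()) {
 *                      uint32_t val = atomic_load(&futex_word);
 *
 *                      if (cond_is_true())
 *                              break;
 *                      if (syscall(SYS_futex, &futex_word, FUTEX_WAIT_PRIVATE,
 *                                  val, timeout, NULL, 0) == -1 &&
 *                          errno == ETIMEDOUT)
 *                              return -1;      // give up only on a timeout
 *                      // EAGAIN, EINTR or a spurious wakeup: just loop.
 *              }
 *              return 0;
 *      }
 */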
2111
2112
2113static long futex_wait_restart(struct restart_block *restart)
2114{
2115        u32 __user *uaddr = restart->futex.uaddr;
2116        ktime_t t, *tp = NULL;
2117
2118        if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
2119                t.tv64 = restart->futex.time;
2120                tp = &t;
2121        }
2122        restart->fn = do_no_restart_syscall;
2123
2124        return (long)futex_wait(uaddr, restart->futex.flags,
2125                                restart->futex.val, tp, restart->futex.bitset);
2126}
2127
2128
2129/*
2130 * Userspace tried a 0 -> TID atomic transition of the futex value
2131 * and failed. The kernel side here does the whole locking operation:
2132 * if there are waiters then it will block, it does PI, etc. (Due to
2133 * races the kernel might see a 0 value of the futex too.)
2134 */
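
/*
 * Rough user-space shape of the fast path the comment above refers to, as
 * an illustrative sketch only.  The lock word layout (TID in the low bits,
 * FUTEX_WAITERS/FUTEX_OWNER_DIED above them) is the documented PI-futex
 * ABI; pi_lock(), pi_futex and the use of syscall(SYS_gettid) are
 * assumptions made for the example.
 *
 *      #include <linux/futex.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *      #include <stdatomic.h>
 *      #include <stdint.h>
 *
 *      static atomic_uint pi_futex;
 *
 *      static void pi_lock(void)
 *      {
 *              uint32_t tid = syscall(SYS_gettid);
 *              uint32_t expected = 0;
 *
 *              // Fast path: uncontended 0 -> TID transition, no syscall.
 *              if (atomic_compare_exchange_strong(&pi_futex, &expected, tid))
 *                      return;
 *
 *              // Slow path: the kernel sets FUTEX_WAITERS, queues us on the
 *              // rt_mutex and boosts the current owner if necessary.
 *              syscall(SYS_futex, &pi_futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *      }
 */
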
2135static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
2136                         ktime_t *time, int trylock)
2137{
2138        struct hrtimer_sleeper timeout, *to = NULL;
2139        struct futex_hash_bucket *hb;
2140        struct futex_q q = futex_q_init;
2141        int res, ret;
2142
2143        if (refill_pi_state_cache())
2144                return -ENOMEM;
2145
2146        if (time) {
2147                to = &timeout;
2148                hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
2149                                      HRTIMER_MODE_ABS);
2150                hrtimer_init_sleeper(to, current);
2151                hrtimer_set_expires(&to->timer, *time);
2152        }
2153
2154retry:
2155        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
2156        if (unlikely(ret != 0))
2157                goto out;
2158
2159retry_private:
2160        hb = queue_lock(&q);
2161
2162        ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
2163        if (unlikely(ret)) {
2164                switch (ret) {
2165                case 1:
2166                        /* We got the lock. */
2167                        ret = 0;
2168                        goto out_unlock_put_key;
2169                case -EFAULT:
2170                        goto uaddr_faulted;
2171                case -EAGAIN:
2172                        /*
2173                         * Task is exiting and we just wait for the
2174                         * exit to complete.
2175                         */
2176                        queue_unlock(hb);
2177                        put_futex_key(&q.key);
2178                        cond_resched();
2179                        goto retry;
2180                default:
2181                        goto out_unlock_put_key;
2182                }
2183        }
2184
2185        /*
2186         * Only actually queue now that the atomic ops are done:
2187         */
2188        queue_me(&q, hb);
2189
2190        WARN_ON(!q.pi_state);
2191        /*
2192         * Block on the PI mutex:
2193         */
2194        if (!trylock)
2195                ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
2196        else {
2197                ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2198                /* Fixup the trylock return value: */
2199                ret = ret ? 0 : -EWOULDBLOCK;
2200        }
2201
2202        spin_lock(q.lock_ptr);
2203        /*
2204         * Fixup the pi_state owner and possibly acquire the lock if we
2205         * haven't already.
2206         */
2207        res = fixup_owner(uaddr, &q, !ret);
2208        /*
2209         * If fixup_owner() returned an error, propagate that.  If it acquired
2210         * the lock, clear our -ETIMEDOUT or -EINTR.
2211         */
2212        if (res)
2213                ret = (res < 0) ? res : 0;
2214
2215        /*
2216         * If fixup_owner() faulted and was unable to handle the fault, unlock
2217         * it and return the fault to userspace.
2218         */
2219        if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
2220                rt_mutex_unlock(&q.pi_state->pi_mutex);
2221
2222        /* Unqueue and drop the lock */
2223        unqueue_me_pi(&q);
2224
2225        goto out_put_key;
2226
2227out_unlock_put_key:
2228        queue_unlock(hb);
2229
2230out_put_key:
2231        put_futex_key(&q.key);
2232out:
2233        if (to)
2234                destroy_hrtimer_on_stack(&to->timer);
2235        return ret != -EINTR ? ret : -ERESTARTNOINTR;
2236
2237uaddr_faulted:
2238        queue_unlock(hb);
2239
2240        ret = fault_in_user_writeable(uaddr);
2241        if (ret)
2242                goto out_put_key;
2243
2244        if (!(flags & FLAGS_SHARED))
2245                goto retry_private;
2246
2247        put_futex_key(&q.key);
2248        goto retry;
2249}
2250
2251/*
2252 * Userspace attempted a TID -> 0 atomic transition, and failed.
2253 * This is the in-kernel slowpath: we look up the PI state (if any),
2254 * and do the rt-mutex unlock.
2255 */
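
/*
 * The matching user-space unlock for the comment above, again a hedged
 * sketch: only when the TID -> 0 transition fails (because the kernel set
 * FUTEX_WAITERS or FUTEX_OWNER_DIED) is FUTEX_UNLOCK_PI needed.  pi_futex
 * and the naming follow the pi_lock() sketch above.
 *
 *      static void pi_unlock(void)
 *      {
 *              uint32_t tid = syscall(SYS_gettid);
 *              uint32_t expected = tid;
 *
 *              // Fast path: uncontended owner, drop straight back to 0.
 *              if (atomic_compare_exchange_strong(&pi_futex, &expected, 0))
 *                      return;
 *
 *              // Slow path: let the kernel hand the lock to the top waiter.
 *              syscall(SYS_futex, &pi_futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 *      }
 */
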
2256static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2257{
2258        struct futex_hash_bucket *hb;
2259        struct futex_q *this, *next;
2260        union futex_key key = FUTEX_KEY_INIT;
2261        u32 uval, vpid = task_pid_vnr(current);
2262        int ret;
2263
2264retry:
2265        if (get_user(uval, uaddr))
2266                return -EFAULT;
2267        /*
2268         * We release only a lock we actually own:
2269         */
2270        if ((uval & FUTEX_TID_MASK) != vpid)
2271                return -EPERM;
2272
2273        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2274        if (unlikely(ret != 0))
2275                goto out;
2276
2277        hb = hash_futex(&key);
2278        spin_lock(&hb->lock);
2279
2280        /*
2281         * To avoid races, try to do the TID -> 0 atomic transition
2282         * again. If it succeeds then we can return without waking
2283         * anyone else up:
2284         */
2285        if (!(uval & FUTEX_OWNER_DIED) &&
2286            cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2287                goto pi_faulted;
2288        /*
2289         * Rare case: we managed to release the lock atomically,
2290         * no need to wake anyone else up:
2291         */
2292        if (unlikely(uval == vpid))
2293                goto out_unlock;
2294
2295        /*
2296         * Ok, other tasks may need to be woken up - check waiters
2297         * and do the wakeup if necessary:
2298         */
2299        plist_for_each_entry_safe(this, next, &hb->chain, list) {
2300                if (!match_futex(&this->key, &key))
2301                        continue;
2302                ret = wake_futex_pi(uaddr, uval, this);
2303                /*
2304                 * The atomic access to the futex value
2305                 * generated a pagefault, so retry the
2306                 * user-access and the wakeup:
2307                 */
2308                if (ret == -EFAULT)
2309                        goto pi_faulted;
2310                goto out_unlock;
2311        }
2312        /*
2313         * No waiters - kernel unlocks the futex:
2314         */
2315        if (!(uval & FUTEX_OWNER_DIED)) {
2316                ret = unlock_futex_pi(uaddr, uval);
2317                if (ret == -EFAULT)
2318                        goto pi_faulted;
2319        }
2320
2321out_unlock:
2322        spin_unlock(&hb->lock);
2323        put_futex_key(&key);
2324
2325out:
2326        return ret;
2327
2328pi_faulted:
2329        spin_unlock(&hb->lock);
2330        put_futex_key(&key);
2331
2332        ret = fault_in_user_writeable(uaddr);
2333        if (!ret)
2334                goto retry;
2335
2336        return ret;
2337}
2338
2339/**
2340 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2341 * @hb:         the hash_bucket futex_q was originally enqueued on
2342 * @q:          the futex_q woken while waiting to be requeued
2343 * @key2:       the futex_key of the requeue target futex
2344 * @timeout:    the timeout associated with the wait (NULL if none)
2345 *
2346 * Detect if the task was woken on the initial futex as opposed to the requeue
2347 * target futex.  If so, determine if it was a timeout or a signal that caused
2348 * the wakeup and return the appropriate error code to the caller.  Must be
2349 * called with the hb lock held.
2350 *
2351 * Return:
2352 *  0 = no early wakeup detected;
2353 * <0 = -ETIMEDOUT or -ERESTARTNOINTR
2354 */
2355static inline
2356int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2357                                   struct futex_q *q, union futex_key *key2,
2358                                   struct hrtimer_sleeper *timeout)
2359{
2360        int ret = 0;
2361
2362        /*
2363         * With the hb lock held, we avoid races while we process the wakeup.
2364         * We only need to hold hb (and not hb2) to ensure atomicity as the
2365         * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2366         * It can't be requeued from uaddr2 to something else since we don't
2367         * support a PI aware source futex for requeue.
2368         */
2369        if (!match_futex(&q->key, key2)) {
2370                WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2371                /*
2372                 * We were woken prior to requeue by a timeout or a signal.
2373                 * Unqueue the futex_q and determine which it was.
2374                 */
2375                plist_del(&q->list, &hb->chain);
2376                hb_waiters_dec(hb);
2377
2378                /* Handle spurious wakeups gracefully */
2379                ret = -EWOULDBLOCK;
2380                if (timeout && !timeout->task)
2381                        ret = -ETIMEDOUT;
2382                else if (signal_pending(current))
2383                        ret = -ERESTARTNOINTR;
2384        }
2385        return ret;
2386}
2387
2388/**
2389 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2390 * @uaddr:      the futex we initially wait on (non-pi)
2391 * @flags:      futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2392 *              the same type, no requeueing from private to shared, etc.
2393 * @val:        the expected value of uaddr
2394 * @abs_time:   absolute timeout
2395 * @bitset:     32 bit wakeup bitset set by userspace, defaults to all
2396 * @uaddr2:     the pi futex we will take prior to returning to user-space
2397 *
2398 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2399 * uaddr2 which must be PI aware and distinct from uaddr.  Normal wakeup will wake
2400 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
2401 * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
2402 * without one, the pi logic would not know which task to boost/deboost, if
2403 * there was a need to.
2404 *
2405 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2406 * via the following--
2407 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2408 * 2) wakeup on uaddr2 after a requeue
2409 * 3) signal
2410 * 4) timeout
2411 *
2412 * If 3, cleanup and return -ERESTARTNOINTR.
2413 *
2414 * If 2, we may then block on trying to take the rt_mutex and return via:
2415 * 5) successful lock
2416 * 6) signal
2417 * 7) timeout
2418 * 8) other lock acquisition failure
2419 *
2420 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2421 *
2422 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2423 *
2424 * Return:
2425 *  0 - On success;
2426 * <0 - On error
2427 */
2428static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2429                                 u32 val, ktime_t *abs_time, u32 bitset,
2430                                 u32 __user *uaddr2)
2431{
2432        struct hrtimer_sleeper timeout, *to = NULL;
2433        struct rt_mutex_waiter rt_waiter;
2434        struct rt_mutex *pi_mutex = NULL;
2435        struct futex_hash_bucket *hb;
2436        union futex_key key2 = FUTEX_KEY_INIT;
2437        struct futex_q q = futex_q_init;
2438        int res, ret;
2439
2440        if (uaddr == uaddr2)
2441                return -EINVAL;
2442
2443        if (!bitset)
2444                return -EINVAL;
2445
2446        if (abs_time) {
2447                to = &timeout;
2448                hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2449                                      CLOCK_REALTIME : CLOCK_MONOTONIC,
2450                                      HRTIMER_MODE_ABS);
2451                hrtimer_init_sleeper(to, current);
2452                hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2453                                             current->timer_slack_ns);
2454        }
2455
2456        /*
2457         * The waiter is allocated on our stack, manipulated by the requeue
2458         * code while we sleep on uaddr.
2459         */
2460        debug_rt_mutex_init_waiter(&rt_waiter);
2461        RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
2462        RB_CLEAR_NODE(&rt_waiter.tree_entry);
2463        rt_waiter.task = NULL;
2464
2465        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2466        if (unlikely(ret != 0))
2467                goto out;
2468
2469        q.bitset = bitset;
2470        q.rt_waiter = &rt_waiter;
2471        q.requeue_pi_key = &key2;
2472
2473        /*
2474         * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2475         * count.
2476         */
2477        ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2478        if (ret)
2479                goto out_key2;
2480
2481        /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2482        futex_wait_queue_me(hb, &q, to);
2483
2484        spin_lock(&hb->lock);
2485        ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2486        spin_unlock(&hb->lock);
2487        if (ret)
2488                goto out_put_keys;
2489
2490        /*
2491         * In order for us to be here, we know our q.key == key2, and since
2492         * we took the hb->lock above, we also know that futex_requeue() has
2493         * completed and we no longer have to concern ourselves with a wakeup
2494         * race with the atomic proxy lock acquisition by the requeue code. The
2495         * futex_requeue dropped our key1 reference and incremented our key2
2496         * reference count.
2497         */
2498
2499        /* Check if the requeue code acquired the second futex for us. */
2500        if (!q.rt_waiter) {
2501                /*
2502                 * Got the lock. We might not be the anticipated owner if we
2503                 * did a lock-steal - fix up the PI-state in that case.
2504                 */
2505                if (q.pi_state && (q.pi_state->owner != current)) {
2506                        spin_lock(q.lock_ptr);
2507                        ret = fixup_pi_state_owner(uaddr2, &q, current);
2508                        spin_unlock(q.lock_ptr);
2509                }
2510        } else {
2511                /*
2512                 * We have been woken up by futex_unlock_pi(), a timeout, or a
2513                 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
2514                 * the pi_state.
2515                 */
2516                WARN_ON(!q.pi_state);
2517                pi_mutex = &q.pi_state->pi_mutex;
2518                ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2519                debug_rt_mutex_free_waiter(&rt_waiter);
2520
2521                spin_lock(q.lock_ptr);
2522                /*
2523                 * Fixup the pi_state owner and possibly acquire the lock if we
2524                 * haven't already.
2525                 */
2526                res = fixup_owner(uaddr2, &q, !ret);
2527                /*
2528                 * If fixup_owner() returned an error, propagate that.  If it
2529                 * acquired the lock, clear -ETIMEDOUT or -EINTR.
2530                 */
2531                if (res)
2532                        ret = (res < 0) ? res : 0;
2533
2534                /* Unqueue and drop the lock. */
2535                unqueue_me_pi(&q);
2536        }
2537
2538        /*
2539         * If fixup_pi_state_owner() faulted and was unable to handle the
2540         * fault, unlock the rt_mutex and return the fault to userspace.
2541         */
2542        if (ret == -EFAULT) {
2543                if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
2544                        rt_mutex_unlock(pi_mutex);
2545        } else if (ret == -EINTR) {
2546                /*
2547                 * We've already been requeued, but cannot restart by calling
2548                 * futex_lock_pi() directly. We could restart this syscall, but
2549                 * it would detect that the user space "val" changed and return
2550                 * -EWOULDBLOCK.  Save the overhead of the restart and return
2551                 * -EWOULDBLOCK directly.
2552                 */
2553                ret = -EWOULDBLOCK;
2554        }
2555
2556out_put_keys:
2557        put_futex_key(&q.key);
2558out_key2:
2559        put_futex_key(&key2);
2560
2561out:
2562        if (to) {
2563                hrtimer_cancel(&to->timer);
2564                destroy_hrtimer_on_stack(&to->timer);
2565        }
2566        return ret;
2567}
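
/*
 * How the two halves of the requeue-PI protocol pair up from user space,
 * sketched at the level of the syscall shapes only (a real condition
 * variable, e.g. glibc's, carries much more state).  cond_wait_pi(),
 * cond_broadcast_pi() and their parameters are assumptions made for the
 * example; includes are as in the sketches above, plus <limits.h>.
 *
 *      // Waiter side (pthread_cond_wait-like): sleep on the condition word;
 *      // the kernel later requeues us onto, and eventually hands us, the
 *      // PI mutex before this call returns.
 *      static long cond_wait_pi(uint32_t *cond, uint32_t cond_val,
 *                               uint32_t *mutex)
 *      {
 *              return syscall(SYS_futex, cond, FUTEX_WAIT_REQUEUE_PI,
 *                             cond_val, NULL, mutex, 0);
 *      }
 *
 *      // Waker side (pthread_cond_broadcast-like): wake one waiter so it
 *      // can take the PI mutex now and requeue the rest onto the mutex
 *      // futex.  nr_requeue travels in the timeout argument slot, as the
 *      // comment in sys_futex() further down documents.
 *      static long cond_broadcast_pi(uint32_t *cond, uint32_t cond_val,
 *                                    uint32_t *mutex)
 *      {
 *              return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PI,
 *                             1, (void *)(unsigned long)INT_MAX,
 *                             mutex, cond_val);
 *      }
 */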
2568
2569/*
2570 * Support for robust futexes: the kernel cleans up held futexes at
2571 * thread exit time.
2572 *
2573 * Implementation: user-space maintains a per-thread list of locks it
2574 * is holding. Upon do_exit(), the kernel carefully walks this list,
2575 * and marks all locks that are owned by this thread with the
2576 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
2577 * always manipulated with the lock held, so the list is private and
2578 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
2579 * field, to allow the kernel to clean up if the thread dies after
2580 * acquiring the lock, but just before it could have added itself to
2581 * the list. There can only be one such pending lock.
2582 */
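
/*
 * A minimal sketch (not from this file) of the user-space side described
 * above, using the uapi structures from <linux/futex.h>.  Each thread
 * registers one robust_list_head; struct my_robust_mutex and its layout
 * are assumptions made for the example.  futex_offset is simply the
 * distance from the list node embedded in a lock to its futex word.
 *
 *      #include <linux/futex.h>
 *      #include <stddef.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *      #include <stdint.h>
 *
 *      struct my_robust_mutex {
 *              struct robust_list list;        // links into the per-thread list
 *              uint32_t futex;                 // TID of the holder, or 0
 *      };
 *
 *      static __thread struct robust_list_head robust_head;
 *
 *      static void robust_init(void)
 *      {
 *              robust_head.list.next = &robust_head.list;      // empty, circular
 *              robust_head.futex_offset =
 *                      offsetof(struct my_robust_mutex, futex) -
 *                      offsetof(struct my_robust_mutex, list);
 *              robust_head.list_op_pending = NULL;
 *              // Tell the kernel where this thread's list lives; on exit it
 *              // walks the list and runs handle_futex_death() on held locks.
 *              syscall(SYS_set_robust_list, &robust_head, sizeof(robust_head));
 *      }
 */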
2583
2584/**
2585 * sys_set_robust_list() - Set the robust-futex list head of a task
2586 * @head:       pointer to the list-head
2587 * @len:        length of the list-head, as userspace expects
2588 */
2589SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2590                size_t, len)
2591{
2592        if (!futex_cmpxchg_enabled)
2593                return -ENOSYS;
2594        /*
2595         * The kernel knows only one size for now:
2596         */
2597        if (unlikely(len != sizeof(*head)))
2598                return -EINVAL;
2599
2600        current->robust_list = head;
2601
2602        return 0;
2603}
2604
2605/**
2606 * sys_get_robust_list() - Get the robust-futex list head of a task
2607 * @pid:        pid of the process [zero for current task]
2608 * @head_ptr:   pointer to a list-head pointer, the kernel fills it in
2609 * @len_ptr:    pointer to a length field, the kernel fills in the header size
2610 */
2611SYSCALL_DEFINE3(get_robust_list, int, pid,
2612                struct robust_list_head __user * __user *, head_ptr,
2613                size_t __user *, len_ptr)
2614{
2615        struct robust_list_head __user *head;
2616        unsigned long ret;
2617        struct task_struct *p;
2618
2619        if (!futex_cmpxchg_enabled)
2620                return -ENOSYS;
2621
2622        rcu_read_lock();
2623
2624        ret = -ESRCH;
2625        if (!pid)
2626                p = current;
2627        else {
2628                p = find_task_by_vpid(pid);
2629                if (!p)
2630                        goto err_unlock;
2631        }
2632
2633        ret = -EPERM;
2634        if (!ptrace_may_access(p, PTRACE_MODE_READ))
2635                goto err_unlock;
2636
2637        head = p->robust_list;
2638        rcu_read_unlock();
2639
2640        if (put_user(sizeof(*head), len_ptr))
2641                return -EFAULT;
2642        return put_user(head, head_ptr);
2643
2644err_unlock:
2645        rcu_read_unlock();
2646
2647        return ret;
2648}
2649
2650/*
2651 * Process a futex-list entry, check whether it's owned by the
2652 * dying task, and do notification if so:
2653 */
2654int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2655{
2656        u32 uval, uninitialized_var(nval), mval;
2657
2658retry:
2659        if (get_user(uval, uaddr))
2660                return -1;
2661
2662        if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
2663                /*
2664                 * Ok, this dying thread is truly holding a futex
2665                 * of interest. Set the OWNER_DIED bit atomically
2666                 * via cmpxchg, and if the value had FUTEX_WAITERS
2667                 * set, wake up a waiter (if any). (We have to do a
2668                 * futex_wake() even if OWNER_DIED is already set -
2669                 * to handle the rare but possible case of recursive
2670                 * thread-death.) The rest of the cleanup is done in
2671                 * userspace.
2672                 */
2673                mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2674                /*
2675                 * We are not holding a lock here, but we want to have
2676                 * the pagefault_disable/enable() protection because
2677                 * we want to handle the fault gracefully. If the
2678                 * access fails we try to fault in the futex with R/W
2679                 * verification via get_user_pages. get_user() above
2680                 * does not guarantee R/W access. If that fails we
2681                 * give up and leave the futex locked.
2682                 */
2683                if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2684                        if (fault_in_user_writeable(uaddr))
2685                                return -1;
2686                        goto retry;
2687                }
2688                if (nval != uval)
2689                        goto retry;
2690
2691                /*
2692                 * Wake robust non-PI futexes here. The wakeup of
2693                 * PI futexes happens in exit_pi_state():
2694                 */
2695                if (!pi && (uval & FUTEX_WAITERS))
2696                        futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
2697        }
2698        return 0;
2699}
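
/*
 * Loose sketch of the user-space consequence of handle_futex_death(): the
 * next task that takes the lock sees FUTEX_OWNER_DIED and learns that the
 * previous owner died while holding it (glibc reports this as EOWNERDEAD;
 * this simplifies that handling, and try_lock_robust() is an illustrative
 * name).  Includes are as in the sketches above.
 *
 *      // Returns 1 if locked, -1 if locked but the previous owner died,
 *      // 0 if the lock is currently held by a live owner.
 *      static int try_lock_robust(atomic_uint *futex)
 *      {
 *              uint32_t tid = syscall(SYS_gettid);
 *              uint32_t val = 0;
 *
 *              if (atomic_compare_exchange_strong(futex, &val, tid))
 *                      return 1;               // clean acquisition
 *              if ((val & FUTEX_OWNER_DIED) &&
 *                  atomic_compare_exchange_strong(futex, &val,
 *                                                 tid | FUTEX_OWNER_DIED))
 *                      return -1;              // ours, but state is suspect
 *              return 0;
 *      }
 */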
2700
2701/*
2702 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
2703 */
2704static inline int fetch_robust_entry(struct robust_list __user **entry,
2705                                     struct robust_list __user * __user *head,
2706                                     unsigned int *pi)
2707{
2708        unsigned long uentry;
2709
2710        if (get_user(uentry, (unsigned long __user *)head))
2711                return -EFAULT;
2712
2713        *entry = (void __user *)(uentry & ~1UL);
2714        *pi = uentry & 1;
2715
2716        return 0;
2717}
2718
2719/*
2720 * Walk curr->robust_list (very carefully, it's a userspace list!)
2721 * and mark any locks found there dead, and notify any waiters.
2722 *
2723 * We silently return on any sign of a list-walking problem.
2724 */
2725void exit_robust_list(struct task_struct *curr)
2726{
2727        struct robust_list_head __user *head = curr->robust_list;
2728        struct robust_list __user *entry, *next_entry, *pending;
2729        unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2730        unsigned int uninitialized_var(next_pi);
2731        unsigned long futex_offset;
2732        int rc;
2733
2734        if (!futex_cmpxchg_enabled)
2735                return;
2736
2737        /*
2738         * Fetch the list head (which was registered earlier, via
2739         * sys_set_robust_list()):
2740         */
2741        if (fetch_robust_entry(&entry, &head->list.next, &pi))
2742                return;
2743        /*
2744         * Fetch the relative futex offset:
2745         */
2746        if (get_user(futex_offset, &head->futex_offset))
2747                return;
2748        /*
2749         * Fetch any possibly pending lock-add first, and handle it
2750         * if it exists:
2751         */
2752        if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
2753                return;
2754
2755        next_entry = NULL;      /* avoid warning with gcc */
2756        while (entry != &head->list) {
2757                /*
2758                 * Fetch the next entry in the list before calling
2759                 * handle_futex_death:
2760                 */
2761                rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
2762                /*
2763                 * A pending lock might already be on the list, so
2764                 * don't process it twice:
2765                 */
2766                if (entry != pending)
2767                        if (handle_futex_death((void __user *)entry + futex_offset,
2768                                                curr, pi))
2769                                return;
2770                if (rc)
2771                        return;
2772                entry = next_entry;
2773                pi = next_pi;
2774                /*
2775                 * Avoid excessively long or circular lists:
2776                 */
2777                if (!--limit)
2778                        break;
2779
2780                cond_resched();
2781        }
2782
2783        if (pending)
2784                handle_futex_death((void __user *)pending + futex_offset,
2785                                   curr, pip);
2786}
2787
2788long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2789                u32 __user *uaddr2, u32 val2, u32 val3)
2790{
2791        int cmd = op & FUTEX_CMD_MASK;
2792        unsigned int flags = 0;
2793
2794        if (!(op & FUTEX_PRIVATE_FLAG))
2795                flags |= FLAGS_SHARED;
2796
2797        if (op & FUTEX_CLOCK_REALTIME) {
2798                flags |= FLAGS_CLOCKRT;
2799                if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2800                        return -ENOSYS;
2801        }
2802
2803        switch (cmd) {
2804        case FUTEX_LOCK_PI:
2805        case FUTEX_UNLOCK_PI:
2806        case FUTEX_TRYLOCK_PI:
2807        case FUTEX_WAIT_REQUEUE_PI:
2808        case FUTEX_CMP_REQUEUE_PI:
2809                if (!futex_cmpxchg_enabled)
2810                        return -ENOSYS;
2811        }
2812
2813        switch (cmd) {
2814        case FUTEX_WAIT:
2815                val3 = FUTEX_BITSET_MATCH_ANY;
2816        case FUTEX_WAIT_BITSET:
2817                return futex_wait(uaddr, flags, val, timeout, val3);
2818        case FUTEX_WAKE:
2819                val3 = FUTEX_BITSET_MATCH_ANY;
2820        case FUTEX_WAKE_BITSET:
2821                return futex_wake(uaddr, flags, val, val3);
2822        case FUTEX_REQUEUE:
2823                return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2824        case FUTEX_CMP_REQUEUE:
2825                return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2826        case FUTEX_WAKE_OP:
2827                return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2828        case FUTEX_LOCK_PI:
2829                return futex_lock_pi(uaddr, flags, val, timeout, 0);
2830        case FUTEX_UNLOCK_PI:
2831                return futex_unlock_pi(uaddr, flags);
2832        case FUTEX_TRYLOCK_PI:
2833                return futex_lock_pi(uaddr, flags, 0, timeout, 1);
2834        case FUTEX_WAIT_REQUEUE_PI:
2835                val3 = FUTEX_BITSET_MATCH_ANY;
2836                return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2837                                             uaddr2);
2838        case FUTEX_CMP_REQUEUE_PI:
2839                return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2840        }
2841        return -ENOSYS;
2842}
2843
2844
2845SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2846                struct timespec __user *, utime, u32 __user *, uaddr2,
2847                u32, val3)
2848{
2849        struct timespec ts;
2850        ktime_t t, *tp = NULL;
2851        u32 val2 = 0;
2852        int cmd = op & FUTEX_CMD_MASK;
2853
2854        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
2855                      cmd == FUTEX_WAIT_BITSET ||
2856                      cmd == FUTEX_WAIT_REQUEUE_PI)) {
2857                if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2858                        return -EFAULT;
2859                if (!timespec_valid(&ts))
2860                        return -EINVAL;
2861
2862                t = timespec_to_ktime(ts);
2863                if (cmd == FUTEX_WAIT)
2864                        t = ktime_add_safe(ktime_get(), t);
2865                tp = &t;
2866        }
2867        /*
2868         * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
2869         * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
2870         */
2871        if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
2872            cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
2873                val2 = (u32) (unsigned long) utime;
2874
2875        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
2876}
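
/*
 * Illustrative sketch of the argument multiplexing handled just above: for
 * the requeue commands and FUTEX_WAKE_OP the timeout pointer argument is
 * reinterpreted as the integer val2.  requeue_all_but_one() and its
 * parameters are assumptions made for the example.
 *
 *      #include <limits.h>
 *      #include <linux/futex.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *      #include <stdint.h>
 *
 *      // Wake one waiter on futex1 and move up to INT_MAX of the rest onto
 *      // futex2, provided *futex1 still equals 'expected' (val3).
 *      static long requeue_all_but_one(uint32_t *futex1, uint32_t *futex2,
 *                                      uint32_t expected)
 *      {
 *              return syscall(SYS_futex, futex1, FUTEX_CMP_REQUEUE,
 *                             1,                               // nr_wake
 *                             (void *)(unsigned long)INT_MAX,  // val2: nr_requeue
 *                             futex2, expected);               // val3: cmpval
 *      }
 */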
2877
2878static int __init futex_init(void)
2879{
2880        u32 curval;
2881        unsigned int futex_shift;
2882        unsigned long i;
2883
2884#if CONFIG_BASE_SMALL
2885        futex_hashsize = 16;
2886#else
2887        futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
2888#endif
2889
2890        futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
2891                                               futex_hashsize, 0,
2892                                               futex_hashsize < 256 ? HASH_SMALL : 0,
2893                                               &futex_shift, NULL,
2894                                               futex_hashsize, futex_hashsize);
2895        futex_hashsize = 1UL << futex_shift;
2896        /*
2897         * This will fail and we want it. Some arch implementations do
2898         * runtime detection of the futex_atomic_cmpxchg_inatomic()
2899         * functionality. We want to know that before we call in any
2900         * of the complex code paths. Also we want to prevent
2901         * registration of robust lists in that case. NULL is
2902         * guaranteed to fault and we get -EFAULT on functional
2903         * implementation, the non-functional ones will return
2904         * -ENOSYS.
2905         */
2906        if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2907                futex_cmpxchg_enabled = 1;
2908
2909        for (i = 0; i < futex_hashsize; i++) {
2910                atomic_set(&futex_queues[i].waiters, 0);
2911                plist_head_init(&futex_queues[i].chain);
2912                spin_lock_init(&futex_queues[i].lock);
2913        }
2914
2915        return 0;
2916}
2917__initcall(futex_init);
2918