linux/kernel/futex.c
<<
>>
Prefs
   1/*
   2 *  Fast Userspace Mutexes (which I call "Futexes!").
   3 *  (C) Rusty Russell, IBM 2002
   4 *
   5 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
   6 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
   7 *
   8 *  Removed page pinning, fix privately mapped COW pages and other cleanups
   9 *  (C) Copyright 2003, 2004 Jamie Lokier
  10 *
  11 *  Robust futex support started by Ingo Molnar
  12 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  13 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  14 *
  15 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
  16 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  17 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  18 *
  19 *  PRIVATE futexes by Eric Dumazet
  20 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
  21 *
  22 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
  23 *  Copyright (C) IBM Corporation, 2009
  24 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
  25 *
  26 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  27 *  enough at me, Linus for the original (flawed) idea, Matthew
  28 *  Kirkwood for proof-of-concept implementation.
  29 *
  30 *  "The futexes are also cursed."
  31 *  "But they come in a choice of three flavours!"
  32 *
  33 *  This program is free software; you can redistribute it and/or modify
  34 *  it under the terms of the GNU General Public License as published by
  35 *  the Free Software Foundation; either version 2 of the License, or
  36 *  (at your option) any later version.
  37 *
  38 *  This program is distributed in the hope that it will be useful,
  39 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  40 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  41 *  GNU General Public License for more details.
  42 *
  43 *  You should have received a copy of the GNU General Public License
  44 *  along with this program; if not, write to the Free Software
  45 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  46 */
  47#include <linux/slab.h>
  48#include <linux/poll.h>
  49#include <linux/fs.h>
  50#include <linux/file.h>
  51#include <linux/jhash.h>
  52#include <linux/init.h>
  53#include <linux/futex.h>
  54#include <linux/mount.h>
  55#include <linux/pagemap.h>
  56#include <linux/syscalls.h>
  57#include <linux/signal.h>
  58#include <linux/module.h>
  59#include <linux/magic.h>
  60#include <linux/pid.h>
  61#include <linux/nsproxy.h>
  62
  63#include <asm/futex.h>
  64
  65#include "rtmutex_common.h"
  66
  67int __read_mostly futex_cmpxchg_enabled; /* when zero, exit_pi_state_list() returns without touching any PI state */
  68
  69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* log2 of the number of buckets in futex_queues[] */
  70
  71/*
  72 * Futex flags used to encode options to functions and preserve them across
  73 * restarts.  NOTE(review): no user of these flags is visible in this part of the file; confirm per-flag meaning at the call sites.
  74 */
  75#define FLAGS_SHARED            0x01
  76#define FLAGS_CLOCKRT           0x02
  77#define FLAGS_HAS_TIMEOUT       0x04
  78
  79/*
  80 * Priority Inheritance state, shared through @refcount by the waiters queued on the same PI futex:
  81 */
  82struct futex_pi_state {
  83        /*
  84         * list of 'owned' pi_state instances - these have to be
  85         * cleaned up in do_exit() if the task exits prematurely
  86         * (linked on the owner's task_struct::pi_state_list):
  87         */
  87        struct list_head list;
  88
  89        /*
  90         * The PI object (initialized locked, with the owner as holder, by lookup_pi_state()):
  91         */
  92        struct rt_mutex pi_mutex;
  93
  94        struct task_struct *owner;      /* task whose pi_state_list holds this entry; NULL while unowned */
  95        atomic_t refcount;              /* starts at 1; lookup_pi_state() takes references, free_pi_state() drops them */
  96
  97        union futex_key key;            /* key of the futex; exit_pi_state_list() hashes it to find the bucket */
  98};
  99
 100/**
 101 * struct futex_q - The hashed futex queue entry, one per waiting task
 102 * @list:               priority-sorted list of tasks waiting on this futex
 103 * @task:               the task waiting on the futex
 104 * @lock_ptr:           the hash bucket lock; wake_futex() sets it to NULL once the entry is woken
 105 * @key:                the key the futex is hashed on
 106 * @pi_state:           optional priority inheritance state (NULL for a non-PI waiter)
 107 * @rt_waiter:          rt_waiter storage for use with requeue_pi
 108 * @requeue_pi_key:     the requeue_pi target futex key
 109 * @bitset:             bitset for the optional bitmasked wakeup
 110 *
 111 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
 112 * we can wake only the relevant waiters (a hash bucket may be shared by several futexes).
 113 *
 114 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 115 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == NULL.
 116 * The order of wakeup is always to make the first condition true, then
 117 * the second.
 118 *
 119 * PI futexes are typically woken before they are removed from the hash list via
 120 * the rt_mutex code. See unqueue_me_pi().
 121 */
 122struct futex_q {
 123        struct plist_node list;
 124
 125        struct task_struct *task;
 126        spinlock_t *lock_ptr;
 127        union futex_key key;
 128        struct futex_pi_state *pi_state;
 129        struct rt_mutex_waiter *rt_waiter;
 130        union futex_key *requeue_pi_key;
 131        u32 bitset;
 132};
 133
 134static const struct futex_q futex_q_init = { /* template for a fresh futex_q; members not named here are zero-initialized */
 135        /* list gets initialized in queue_me() */
 136        .key = FUTEX_KEY_INIT,
 137        .bitset = FUTEX_BITSET_MATCH_ANY
 138};
 139
 140/*
 141 * Hash buckets are shared by all the futex_keys that hash to the same
 142 * location.  Each key may have multiple futex_q structures, one for each task
 143 * waiting on a futex.
 144 */
 145struct futex_hash_bucket {
 146        spinlock_t lock;                /* must be held while walking or modifying @chain */
 147        struct plist_head chain;        /* futex_q entries, linked through futex_q::list */
 148};
 149
 150static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; /* the global bucket table, indexed by hash_futex() */
 151
 152/*
 153 * We hash on the keys returned from get_futex_key (see below).
 154 */
 155static struct futex_hash_bucket *hash_futex(union futex_key *key)
 156{
 157        u32 hash = jhash2((u32*)&key->both.word,
 158                          (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
 159                          key->both.offset);
 160        return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
 161}
 162
 163/*
 164 * Return 1 if two futex_keys are equal, 0 otherwise.
 165 */
 166static inline int match_futex(union futex_key *key1, union futex_key *key2)
 167{
 168        return (key1 && key2
 169                && key1->both.word == key2->both.word
 170                && key1->both.ptr == key2->both.ptr
 171                && key1->both.offset == key2->both.offset);
 172}
 173
 174/*
 175 * Take a reference to the resource addressed by a key.
 176 * Can be called while holding spinlocks.
 177 *
 178 */
 179static void get_futex_key_refs(union futex_key *key)
 180{
 181        if (!key->both.ptr)
 182                return;
 183
 184        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 185        case FUT_OFF_INODE:
 186                ihold(key->shared.inode);
 187                break;
 188        case FUT_OFF_MMSHARED:
 189                atomic_inc(&key->private.mm->mm_count);
 190                break;
 191        }
 192}
 193
 194/*
 195 * Drop a reference to the resource addressed by a key.
 196 * The hash bucket spinlock must not be held.
 197 */
 198static void drop_futex_key_refs(union futex_key *key)
 199{
 200        if (!key->both.ptr) {
 201                /* If we're here then we tried to put a key we failed to get */
 202                WARN_ON_ONCE(1);
 203                return;
 204        }
 205
 206        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 207        case FUT_OFF_INODE:
 208                iput(key->shared.inode);
 209                break;
 210        case FUT_OFF_MMSHARED:
 211                mmdrop(key->private.mm);
 212                break;
 213        }
 214}
 215
 216/**
 217 * get_futex_key() - Get parameters which are the keys for a futex
 218 * @uaddr:      virtual address of the futex
 219 * @fshared:    0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
 220 * @key:        address where result is stored.
 221 * @rw:         mapping needs to be read/write (values: VERIFY_READ,
 222 *              VERIFY_WRITE)
 223 *
 224 * Returns a negative error code or 0
 225 * The key words are stored in *key on success.
 226 *
 227 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
 228 * offset_within_page).  For private mappings, it's (uaddr, current->mm).
 229 * We can usually work out the index without swapping in the page.
 230 *
 231 * lock_page() might sleep, the caller should not hold a spinlock.
 232 */
 233static int
 234get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 235{
 236        unsigned long address = (unsigned long)uaddr;
 237        struct mm_struct *mm = current->mm;
 238        struct page *page, *page_head;
 239        int err, ro = 0;
 240
 241        /*
 242         * The futex address must be "naturally" aligned.
 243         */
 244        key->both.offset = address % PAGE_SIZE;
 245        if (unlikely((address % sizeof(u32)) != 0))
 246                return -EINVAL;
 247        address -= key->both.offset;
 248
 249        /*
 250         * PROCESS_PRIVATE futexes are fast.
 251         * As the mm cannot disappear under us and the 'key' only needs
 252         * virtual address, we dont even have to find the underlying vma.
 253         * Note : We do have to check 'uaddr' is a valid user address,
 254         *        but access_ok() should be faster than find_vma()
 255         */
 256        if (!fshared) {
 257                if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
 258                        return -EFAULT;
 259                key->private.mm = mm;
 260                key->private.address = address;
 261                get_futex_key_refs(key);
 262                return 0;
 263        }
 264
 265again:
 266        err = get_user_pages_fast(address, 1, 1, &page);
 267        /*
 268         * If write access is not required (eg. FUTEX_WAIT), try
 269         * and get read-only access.
 270         */
 271        if (err == -EFAULT && rw == VERIFY_READ) {
 272                err = get_user_pages_fast(address, 1, 0, &page);
 273                ro = 1;
 274        }
 275        if (err < 0)
 276                return err;
 277        else
 278                err = 0;
 279
 280#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 281        page_head = page;
 282        if (unlikely(PageTail(page))) {
 283                put_page(page);
 284                /* serialize against __split_huge_page_splitting() */
 285                local_irq_disable();
 286                if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
 287                        page_head = compound_head(page);
 288                        /*
 289                         * page_head is valid pointer but we must pin
 290                         * it before taking the PG_lock and/or
 291                         * PG_compound_lock. The moment we re-enable
 292                         * irqs __split_huge_page_splitting() can
 293                         * return and the head page can be freed from
 294                         * under us. We can't take the PG_lock and/or
 295                         * PG_compound_lock on a page that could be
 296                         * freed from under us.
 297                         */
 298                        if (page != page_head) {
 299                                get_page(page_head);
 300                                put_page(page);
 301                        }
 302                        local_irq_enable();
 303                } else {
 304                        local_irq_enable();
 305                        goto again;
 306                }
 307        }
 308#else
 309        page_head = compound_head(page);
 310        if (page != page_head) {
 311                get_page(page_head);
 312                put_page(page);
 313        }
 314#endif
 315
 316        lock_page(page_head);
 317        if (!page_head->mapping) {
 318                unlock_page(page_head);
 319                put_page(page_head);
 320                /*
 321                * ZERO_PAGE pages don't have a mapping. Avoid a busy loop
 322                * trying to find one. RW mapping would have COW'd (and thus
 323                * have a mapping) so this page is RO and won't ever change.
 324                */
 325                if ((page_head == ZERO_PAGE(address)))
 326                        return -EFAULT;
 327                goto again;
 328        }
 329
 330        /*
 331         * Private mappings are handled in a simple way.
 332         *
 333         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 334         * it's a read-only handle, it's expected that futexes attach to
 335         * the object not the particular process.
 336         */
 337        if (PageAnon(page_head)) {
 338                /*
 339                 * A RO anonymous page will never change and thus doesn't make
 340                 * sense for futex operations.
 341                 */
 342                if (ro) {
 343                        err = -EFAULT;
 344                        goto out;
 345                }
 346
 347                key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 348                key->private.mm = mm;
 349                key->private.address = address;
 350        } else {
 351                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 352                key->shared.inode = page_head->mapping->host;
 353                key->shared.pgoff = page_head->index;
 354        }
 355
 356        get_futex_key_refs(key);
 357
 358out:
 359        unlock_page(page_head);
 360        put_page(page_head);
 361        return err;
 362}
 363
 364static inline void put_futex_key(union futex_key *key)
 365{
 366        drop_futex_key_refs(key);
 367}
 368
 369/**
 370 * fault_in_user_writeable() - Fault in user address and verify RW access
 371 * @uaddr:      pointer to faulting user space address
 372 *
 373 * Slow path to fixup the fault we just took in the atomic write
 374 * access to @uaddr.
 375 *
 376 * We have no generic implementation of a non-destructive write to the
 377 * user address. We know that we faulted in the atomic pagefault
 378 * disabled section so we can as well avoid the #PF overhead by
 379 * calling get_user_pages() right away.
 380 */
 381static int fault_in_user_writeable(u32 __user *uaddr)
 382{
 383        struct mm_struct *mm = current->mm;
 384        int ret;
 385
 386        down_read(&mm->mmap_sem);
 387        ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
 388                               FAULT_FLAG_WRITE);
 389        up_read(&mm->mmap_sem);
 390
 391        return ret < 0 ? ret : 0;
 392}
 393
 394/**
 395 * futex_top_waiter() - Return the highest priority waiter on a futex
 396 * @hb:         the hash bucket the futex_q's reside in
 397 * @key:        the futex key (to distinguish it from other futex futex_q's)
 398 *
 399 * Must be called with the hb lock held.
 400 */
 401static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
 402                                        union futex_key *key)
 403{
 404        struct futex_q *this;
 405
 406        plist_for_each_entry(this, &hb->chain, list) {
 407                if (match_futex(&this->key, key))
 408                        return this;
 409        }
 410        return NULL;
 411}
 412
 413static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
 414                                      u32 uval, u32 newval)
 415{
 416        int ret;
 417
 418        pagefault_disable();
 419        ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
 420        pagefault_enable();
 421
 422        return ret;
 423}
 424
 425static int get_futex_value_locked(u32 *dest, u32 __user *from)
 426{
 427        int ret;
 428
 429        pagefault_disable();
 430        ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
 431        pagefault_enable();
 432
 433        return ret ? -EFAULT : 0;
 434}
 435
 436
 437/*
 438 * PI code:
 439 */
 440static int refill_pi_state_cache(void)
 441{
 442        struct futex_pi_state *pi_state;
 443
 444        if (likely(current->pi_state_cache))
 445                return 0;
 446
 447        pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
 448
 449        if (!pi_state)
 450                return -ENOMEM;
 451
 452        INIT_LIST_HEAD(&pi_state->list);
 453        /* pi_mutex gets initialized later */
 454        pi_state->owner = NULL;
 455        atomic_set(&pi_state->refcount, 1);
 456        pi_state->key = FUTEX_KEY_INIT;
 457
 458        current->pi_state_cache = pi_state;
 459
 460        return 0;
 461}
 462
 463static struct futex_pi_state * alloc_pi_state(void)
 464{
 465        struct futex_pi_state *pi_state = current->pi_state_cache;
 466
 467        WARN_ON(!pi_state);
 468        current->pi_state_cache = NULL;
 469
 470        return pi_state;
 471}
 472
 473static void free_pi_state(struct futex_pi_state *pi_state)
 474{
 475        if (!atomic_dec_and_test(&pi_state->refcount))
 476                return;
 477
 478        /*
 479         * If pi_state->owner is NULL, the owner is most probably dying
 480         * and has cleaned up the pi_state already
 481         */
 482        if (pi_state->owner) {
 483                raw_spin_lock_irq(&pi_state->owner->pi_lock);
 484                list_del_init(&pi_state->list);
 485                raw_spin_unlock_irq(&pi_state->owner->pi_lock);
 486
 487                rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
 488        }
 489
 490        if (current->pi_state_cache)
 491                kfree(pi_state);
 492        else {
 493                /*
 494                 * pi_state->list is already empty.
 495                 * clear pi_state->owner.
 496                 * refcount is at 0 - put it back to 1.
 497                 */
 498                pi_state->owner = NULL;
 499                atomic_set(&pi_state->refcount, 1);
 500                current->pi_state_cache = pi_state;
 501        }
 502}
 503
 504/*
 505 * Look up the task based on what TID userspace gave us.
 506 * We dont trust it.
 507 */
 508static struct task_struct * futex_find_get_task(pid_t pid)
 509{
 510        struct task_struct *p;
 511
 512        rcu_read_lock();
 513        p = find_task_by_vpid(pid);
 514        if (p)
 515                get_task_struct(p);
 516
 517        rcu_read_unlock();
 518
 519        return p;
 520}
 521
 522/*
 523 * This task is holding PI mutexes at exit time => bad.
 524 * Kernel cleans up PI-state, but userspace is likely hosed.
 525 * (Robust-futex cleanup is separate and might save the day for userspace.)
 526 */
 527void exit_pi_state_list(struct task_struct *curr)
 528{
 529        struct list_head *next, *head = &curr->pi_state_list;
 530        struct futex_pi_state *pi_state;
 531        struct futex_hash_bucket *hb;
 532        union futex_key key = FUTEX_KEY_INIT;
 533
 534        if (!futex_cmpxchg_enabled)
 535                return;
 536        /*
 537         * We are a ZOMBIE and nobody can enqueue itself on
 538         * pi_state_list anymore, but we have to be careful
 539         * versus waiters unqueueing themselves:
 540         */
 541        raw_spin_lock_irq(&curr->pi_lock);
 542        while (!list_empty(head)) {
 543
 544                next = head->next;
 545                pi_state = list_entry(next, struct futex_pi_state, list);
 546                key = pi_state->key;
 547                hb = hash_futex(&key);
 548                raw_spin_unlock_irq(&curr->pi_lock);
 549
 550                spin_lock(&hb->lock);
 551
 552                raw_spin_lock_irq(&curr->pi_lock);
 553                /*
 554                 * We dropped the pi-lock, so re-check whether this
 555                 * task still owns the PI-state:
 556                 */
 557                if (head->next != next) {
 558                        spin_unlock(&hb->lock);
 559                        continue;
 560                }
 561
 562                WARN_ON(pi_state->owner != curr);
 563                WARN_ON(list_empty(&pi_state->list));
 564                list_del_init(&pi_state->list);
 565                pi_state->owner = NULL;
 566                raw_spin_unlock_irq(&curr->pi_lock);
 567
 568                rt_mutex_unlock(&pi_state->pi_mutex);
 569
 570                spin_unlock(&hb->lock);
 571
 572                raw_spin_lock_irq(&curr->pi_lock);
 573        }
 574        raw_spin_unlock_irq(&curr->pi_lock);
 575}
 576
 577static int
 578lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 579                union futex_key *key, struct futex_pi_state **ps)
 580{
 581        struct futex_pi_state *pi_state = NULL;
 582        struct futex_q *this, *next;
 583        struct plist_head *head;
 584        struct task_struct *p;
 585        pid_t pid = uval & FUTEX_TID_MASK;
 586
 587        head = &hb->chain;
 588
 589        plist_for_each_entry_safe(this, next, head, list) {
 590                if (match_futex(&this->key, key)) {
 591                        /*
 592                         * Another waiter already exists - bump up
 593                         * the refcount and return its pi_state:
 594                         */
 595                        pi_state = this->pi_state;
 596                        /*
 597                         * Userspace might have messed up non-PI and PI futexes
 598                         */
 599                        if (unlikely(!pi_state))
 600                                return -EINVAL;
 601
 602                        WARN_ON(!atomic_read(&pi_state->refcount));
 603
 604                        /*
 605                         * When pi_state->owner is NULL then the owner died
 606                         * and another waiter is on the fly. pi_state->owner
 607                         * is fixed up by the task which acquires
 608                         * pi_state->rt_mutex.
 609                         *
 610                         * We do not check for pid == 0 which can happen when
 611                         * the owner died and robust_list_exit() cleared the
 612                         * TID.
 613                         */
 614                        if (pid && pi_state->owner) {
 615                                /*
 616                                 * Bail out if user space manipulated the
 617                                 * futex value.
 618                                 */
 619                                if (pid != task_pid_vnr(pi_state->owner))
 620                                        return -EINVAL;
 621                        }
 622
 623                        atomic_inc(&pi_state->refcount);
 624                        *ps = pi_state;
 625
 626                        return 0;
 627                }
 628        }
 629
 630        /*
 631         * We are the first waiter - try to look up the real owner and attach
 632         * the new pi_state to it, but bail out when TID = 0
 633         */
 634        if (!pid)
 635                return -ESRCH;
 636        p = futex_find_get_task(pid);
 637        if (!p)
 638                return -ESRCH;
 639
 640        /*
 641         * We need to look at the task state flags to figure out,
 642         * whether the task is exiting. To protect against the do_exit
 643         * change of the task flags, we do this protected by
 644         * p->pi_lock:
 645         */
 646        raw_spin_lock_irq(&p->pi_lock);
 647        if (unlikely(p->flags & PF_EXITING)) {
 648                /*
 649                 * The task is on the way out. When PF_EXITPIDONE is
 650                 * set, we know that the task has finished the
 651                 * cleanup:
 652                 */
 653                int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
 654
 655                raw_spin_unlock_irq(&p->pi_lock);
 656                put_task_struct(p);
 657                return ret;
 658        }
 659
 660        pi_state = alloc_pi_state();
 661
 662        /*
 663         * Initialize the pi_mutex in locked state and make 'p'
 664         * the owner of it:
 665         */
 666        rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
 667
 668        /* Store the key for possible exit cleanups: */
 669        pi_state->key = *key;
 670
 671        WARN_ON(!list_empty(&pi_state->list));
 672        list_add(&pi_state->list, &p->pi_state_list);
 673        pi_state->owner = p;
 674        raw_spin_unlock_irq(&p->pi_lock);
 675
 676        put_task_struct(p);
 677
 678        *ps = pi_state;
 679
 680        return 0;
 681}
 682
 683/**
 684 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 685 * @uaddr:              the pi futex user address
 686 * @hb:                 the pi futex hash bucket
 687 * @key:                the futex key associated with uaddr and hb
 688 * @ps:                 the pi_state pointer where we store the result of the
 689 *                      lookup
 690 * @task:               the task to perform the atomic lock work for.  This will
 691 *                      be "current" except in the case of requeue pi.
 692 * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
 693 *
 694 * Returns:
 695 *  0 - ready to wait
 696 *  1 - acquired the lock
 697 * <0 - error
 698 *
 699 * The hb->lock and futex_key refs shall be held by the caller.
 700 */
 701static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 702                                union futex_key *key,
 703                                struct futex_pi_state **ps,
 704                                struct task_struct *task, int set_waiters)
 705{
 706        int lock_taken, ret, ownerdied = 0;
 707        u32 uval, newval, curval, vpid = task_pid_vnr(task);
 708
 709retry:
 710        ret = lock_taken = 0;
 711
 712        /*
 713         * To avoid races, we attempt to take the lock here again
 714         * (by doing a 0 -> TID atomic cmpxchg), while holding all
 715         * the locks. It will most likely not succeed.
 716         */
 717        newval = vpid;
 718        if (set_waiters)
 719                newval |= FUTEX_WAITERS;
 720
 721        if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
 722                return -EFAULT;
 723
 724        /*
 725         * Detect deadlocks.
 726         */
 727        if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
 728                return -EDEADLK;
 729
 730        /*
 731         * Surprise - we got the lock. Just return to userspace:
 732         */
 733        if (unlikely(!curval))
 734                return 1;
 735
 736        uval = curval;
 737
 738        /*
 739         * Set the FUTEX_WAITERS flag, so the owner will know it has someone
 740         * to wake at the next unlock.
 741         */
 742        newval = curval | FUTEX_WAITERS;
 743
 744        /*
 745         * There are two cases, where a futex might have no owner (the
 746         * owner TID is 0): OWNER_DIED. We take over the futex in this
 747         * case. We also do an unconditional take over, when the owner
 748         * of the futex died.
 749         *
 750         * This is safe as we are protected by the hash bucket lock !
 751         */
 752        if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
 753                /* Keep the OWNER_DIED bit */
 754                newval = (curval & ~FUTEX_TID_MASK) | vpid;
 755                ownerdied = 0;
 756                lock_taken = 1;
 757        }
 758
 759        if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
 760                return -EFAULT;
 761        if (unlikely(curval != uval))
 762                goto retry;
 763
 764        /*
 765         * We took the lock due to owner died take over.
 766         */
 767        if (unlikely(lock_taken))
 768                return 1;
 769
 770        /*
 771         * We dont have the lock. Look up the PI state (or create it if
 772         * we are the first waiter):
 773         */
 774        ret = lookup_pi_state(uval, hb, key, ps);
 775
 776        if (unlikely(ret)) {
 777                switch (ret) {
 778                case -ESRCH:
 779                        /*
 780                         * No owner found for this futex. Check if the
 781                         * OWNER_DIED bit is set to figure out whether
 782                         * this is a robust futex or not.
 783                         */
 784                        if (get_futex_value_locked(&curval, uaddr))
 785                                return -EFAULT;
 786
 787                        /*
 788                         * We simply start over in case of a robust
 789                         * futex. The code above will take the futex
 790                         * and return happy.
 791                         */
 792                        if (curval & FUTEX_OWNER_DIED) {
 793                                ownerdied = 1;
 794                                goto retry;
 795                        }
 796                default:
 797                        break;
 798                }
 799        }
 800
 801        return ret;
 802}
 803
 804/**
 805 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
 806 * @q:  The futex_q to unqueue
 807 *
 808 * The q->lock_ptr must not be NULL and must be held by the caller.
 809 */
 810static void __unqueue_futex(struct futex_q *q)
 811{
 812        struct futex_hash_bucket *hb;
 813
 814        if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
 815            || WARN_ON(plist_node_empty(&q->list)))
 816                return;
 817
 818        hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
 819        plist_del(&q->list, &hb->chain);
 820}
 821
 822/*
 823 * The hash bucket lock must be held when this is called.
 824 * Afterwards, the futex_q must not be accessed.
 825 */
 826static void wake_futex(struct futex_q *q)
 827{
 828        struct task_struct *p = q->task;
 829
 830        /*
 831         * We set q->lock_ptr = NULL _before_ we wake up the task. If
 832         * a non-futex wake up happens on another CPU then the task
 833         * might exit and p would dereference a non-existing task
 834         * struct. Prevent this by holding a reference on p across the
 835         * wake up.
 836         */
 837        get_task_struct(p);
 838
 839        __unqueue_futex(q);
 840        /*
 841         * The waiting task can free the futex_q as soon as
 842         * q->lock_ptr = NULL is written, without taking any locks. A
 843         * memory barrier is required here to prevent the following
 844         * store to lock_ptr from getting ahead of the plist_del.
 845         */
 846        smp_wmb();
 847        q->lock_ptr = NULL;
 848
 849        wake_up_state(p, TASK_NORMAL);
 850        put_task_struct(p);
 851}
 852
 853static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 854{
 855        struct task_struct *new_owner;
 856        struct futex_pi_state *pi_state = this->pi_state;
 857        u32 curval, newval;
 858
 859        if (!pi_state)
 860                return -EINVAL;
 861
 862        /*
 863         * If current does not own the pi_state then the futex is
 864         * inconsistent and user space fiddled with the futex value.
 865         */
 866        if (pi_state->owner != current)
 867                return -EINVAL;
 868
 869        raw_spin_lock(&pi_state->pi_mutex.wait_lock);
 870        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 871
 872        /*
 873         * It is possible that the next waiter (the one that brought
 874         * this owner to the kernel) timed out and is no longer
 875         * waiting on the lock.
 876         */
 877        if (!new_owner)
 878                new_owner = this->task;
 879
 880        /*
 881         * We pass it to the next owner. (The WAITERS bit is always
 882         * kept enabled while there is PI state around. We must also
 883         * preserve the owner died bit.)
 884         */
 885        if (!(uval & FUTEX_OWNER_DIED)) {
 886                int ret = 0;
 887
 888                newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 889
 890                if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
 891                        ret = -EFAULT;
 892                else if (curval != uval)
 893                        ret = -EINVAL;
 894                if (ret) {
 895                        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
 896                        return ret;
 897                }
 898        }
 899
 900        raw_spin_lock_irq(&pi_state->owner->pi_lock);
 901        WARN_ON(list_empty(&pi_state->list));
 902        list_del_init(&pi_state->list);
 903        raw_spin_unlock_irq(&pi_state->owner->pi_lock);
 904
 905        raw_spin_lock_irq(&new_owner->pi_lock);
 906        WARN_ON(!list_empty(&pi_state->list));
 907        list_add(&pi_state->list, &new_owner->pi_state_list);
 908        pi_state->owner = new_owner;
 909        raw_spin_unlock_irq(&new_owner->pi_lock);
 910
 911        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
 912        rt_mutex_unlock(&pi_state->pi_mutex);
 913
 914        return 0;
 915}
 916
 917static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
 918{
 919        u32 oldval;
 920
 921        /*
 922         * There is no waiter, so we unlock the futex. The owner died
 923         * bit has not to be preserved here. We are the owner:
 924         */
 925        if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
 926                return -EFAULT;
 927        if (oldval != uval)
 928                return -EAGAIN;
 929
 930        return 0;
 931}
 932
 933/*
 934 * Express the locking dependencies for lockdep:
 935 */
 936static inline void
 937double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 938{
 939        if (hb1 <= hb2) {
 940                spin_lock(&hb1->lock);
 941                if (hb1 < hb2)
 942                        spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
 943        } else { /* hb1 > hb2 */
 944                spin_lock(&hb2->lock);
 945                spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
 946        }
 947}
 948
 949static inline void
 950double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 951{
 952        spin_unlock(&hb1->lock);
 953        if (hb1 != hb2)
 954                spin_unlock(&hb2->lock);
 955}
 956
 957/*
 958 * Wake up waiters matching bitset queued on this futex (uaddr).
 959 */
 960static int
 961futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 962{
 963        struct futex_hash_bucket *hb;
 964        struct futex_q *this, *next;
 965        struct plist_head *head;
 966        union futex_key key = FUTEX_KEY_INIT;
 967        int ret;
 968
 969        if (!bitset)
 970                return -EINVAL;
 971
 972        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
 973        if (unlikely(ret != 0))
 974                goto out;
 975
 976        hb = hash_futex(&key);
 977        spin_lock(&hb->lock);
 978        head = &hb->chain;
 979
 980        plist_for_each_entry_safe(this, next, head, list) {
 981                if (match_futex (&this->key, &key)) {
 982                        if (this->pi_state || this->rt_waiter) {
 983                                ret = -EINVAL;
 984                                break;
 985                        }
 986
 987                        /* Check if one of the bits is set in both bitsets */
 988                        if (!(this->bitset & bitset))
 989                                continue;
 990
 991                        wake_futex(this);
 992                        if (++ret >= nr_wake)
 993                                break;
 994                }
 995        }
 996
 997        spin_unlock(&hb->lock);
 998        put_futex_key(&key);
 999out:
1000        return ret;
1001}
1002
1003/*
1004 * Wake up all waiters hashed on the physical page that is mapped
1005 * to this virtual address:
1006 */
1007static int
1008futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1009              int nr_wake, int nr_wake2, int op)
1010{
1011        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1012        struct futex_hash_bucket *hb1, *hb2;
1013        struct plist_head *head;
1014        struct futex_q *this, *next;
1015        int ret, op_ret;
1016
1017retry:
1018        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1019        if (unlikely(ret != 0))
1020                goto out;
1021        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
1022        if (unlikely(ret != 0))
1023                goto out_put_key1;
1024
1025        hb1 = hash_futex(&key1);
1026        hb2 = hash_futex(&key2);
1027
1028retry_private:
1029        double_lock_hb(hb1, hb2);
1030        op_ret = futex_atomic_op_inuser(op, uaddr2);
1031        if (unlikely(op_ret < 0)) {
1032
1033                double_unlock_hb(hb1, hb2);
1034
1035#ifndef CONFIG_MMU
1036                /*
1037                 * we don't get EFAULT from MMU faults if we don't have an MMU,
1038                 * but we might get them from range checking
1039                 */
1040                ret = op_ret;
1041                goto out_put_keys;
1042#endif
1043
1044                if (unlikely(op_ret != -EFAULT)) {
1045                        ret = op_ret;
1046                        goto out_put_keys;
1047                }
1048
1049                ret = fault_in_user_writeable(uaddr2);
1050                if (ret)
1051                        goto out_put_keys;
1052
1053                if (!(flags & FLAGS_SHARED))
1054                        goto retry_private;
1055
1056                put_futex_key(&key2);
1057                put_futex_key(&key1);
1058                goto retry;
1059        }
1060
1061        head = &hb1->chain;
1062
1063        plist_for_each_entry_safe(this, next, head, list) {
1064                if (match_futex (&this->key, &key1)) {
1065                        wake_futex(this);
1066                        if (++ret >= nr_wake)
1067                                break;
1068                }
1069        }
1070
1071        if (op_ret > 0) {
1072                head = &hb2->chain;
1073
1074                op_ret = 0;
1075                plist_for_each_entry_safe(this, next, head, list) {
1076                        if (match_futex (&this->key, &key2)) {
1077                                wake_futex(this);
1078                                if (++op_ret >= nr_wake2)
1079                                        break;
1080                        }
1081                }
1082                ret += op_ret;
1083        }
1084
1085        double_unlock_hb(hb1, hb2);
1086out_put_keys:
1087        put_futex_key(&key2);
1088out_put_key1:
1089        put_futex_key(&key1);
1090out:
1091        return ret;
1092}
1093
1094/**
1095 * requeue_futex() - Requeue a futex_q from one hb to another
1096 * @q:          the futex_q to requeue
1097 * @hb1:        the source hash_bucket
1098 * @hb2:        the target hash_bucket
1099 * @key2:       the new key for the requeued futex_q
1100 */
1101static inline
1102void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1103                   struct futex_hash_bucket *hb2, union futex_key *key2)
1104{
1105
1106        /*
1107         * If key1 and key2 hash to the same bucket, no need to
1108         * requeue.
1109         */
1110        if (likely(&hb1->chain != &hb2->chain)) {
1111                plist_del(&q->list, &hb1->chain);
1112                plist_add(&q->list, &hb2->chain);
1113                q->lock_ptr = &hb2->lock;
1114        }
1115        get_futex_key_refs(key2);
1116        q->key = *key2;
1117}
1118
1119/**
1120 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1121 * @q:          the futex_q
1122 * @key:        the key of the requeue target futex
1123 * @hb:         the hash_bucket of the requeue target futex
1124 *
1125 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1126 * target futex if it is uncontended or via a lock steal.  Set the futex_q key
1127 * to the requeue target futex so the waiter can detect the wakeup on the right
1128 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1129 * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
1130 * to protect access to the pi_state to fixup the owner later.  Must be called
1131 * with both q->lock_ptr and hb->lock held.
1132 */
1133static inline
1134void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1135                           struct futex_hash_bucket *hb)
1136{
1137        get_futex_key_refs(key);
1138        q->key = *key;
1139
1140        __unqueue_futex(q);
1141
1142        WARN_ON(!q->rt_waiter);
1143        q->rt_waiter = NULL;
1144
1145        q->lock_ptr = &hb->lock;
1146
1147        wake_up_state(q->task, TASK_NORMAL);
1148}
1149
1150/**
1151 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1152 * @pifutex:            the user address of the to futex
1153 * @hb1:                the from futex hash bucket, must be locked by the caller
1154 * @hb2:                the to futex hash bucket, must be locked by the caller
1155 * @key1:               the from futex key
1156 * @key2:               the to futex key
1157 * @ps:                 address to store the pi_state pointer
1158 * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
1159 *
1160 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1161 * Wake the top waiter if we succeed.  If the caller specified set_waiters,
1162 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1163 * hb1 and hb2 must be held by the caller.
1164 *
1165 * Returns:
1166 *  0 - failed to acquire the lock atomicly
1167 *  1 - acquired the lock
1168 * <0 - error
1169 */
1170static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1171                                 struct futex_hash_bucket *hb1,
1172                                 struct futex_hash_bucket *hb2,
1173                                 union futex_key *key1, union futex_key *key2,
1174                                 struct futex_pi_state **ps, int set_waiters)
1175{
1176        struct futex_q *top_waiter = NULL;
1177        u32 curval;
1178        int ret;
1179
1180        if (get_futex_value_locked(&curval, pifutex))
1181                return -EFAULT;
1182
1183        /*
1184         * Find the top_waiter and determine if there are additional waiters.
1185         * If the caller intends to requeue more than 1 waiter to pifutex,
1186         * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1187         * as we have means to handle the possible fault.  If not, don't set
1188         * the bit unecessarily as it will force the subsequent unlock to enter
1189         * the kernel.
1190         */
1191        top_waiter = futex_top_waiter(hb1, key1);
1192
1193        /* There are no waiters, nothing for us to do. */
1194        if (!top_waiter)
1195                return 0;
1196
1197        /* Ensure we requeue to the expected futex. */
1198        if (!match_futex(top_waiter->requeue_pi_key, key2))
1199                return -EINVAL;
1200
1201        /*
1202         * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
1203         * the contended case or if set_waiters is 1.  The pi_state is returned
1204         * in ps in contended cases.
1205         */
1206        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1207                                   set_waiters);
1208        if (ret == 1)
1209                requeue_pi_wake_futex(top_waiter, key2, hb2);
1210
1211        return ret;
1212}
1213
1214/**
1215 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1216 * @uaddr1:     source futex user address
1217 * @flags:      futex flags (FLAGS_SHARED, etc.)
1218 * @uaddr2:     target futex user address
1219 * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
1220 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1221 * @cmpval:     @uaddr1 expected value (or %NULL)
1222 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1223 *              pi futex (pi to pi requeue is not supported)
1224 *
1225 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1226 * uaddr2 atomically on behalf of the top waiter.
1227 *
1228 * Returns:
1229 * >=0 - on success, the number of tasks requeued or woken
1230 *  <0 - on error
1231 */
1232static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1233                         u32 __user *uaddr2, int nr_wake, int nr_requeue,
1234                         u32 *cmpval, int requeue_pi)
1235{
1236        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1237        int drop_count = 0, task_count = 0, ret;
1238        struct futex_pi_state *pi_state = NULL;
1239        struct futex_hash_bucket *hb1, *hb2;
1240        struct plist_head *head1;
1241        struct futex_q *this, *next;
1242        u32 curval2;
1243
1244        if (requeue_pi) {
1245                /*
1246                 * requeue_pi requires a pi_state, try to allocate it now
1247                 * without any locks in case it fails.
1248                 */
1249                if (refill_pi_state_cache())
1250                        return -ENOMEM;
1251                /*
1252                 * requeue_pi must wake as many tasks as it can, up to nr_wake
1253                 * + nr_requeue, since it acquires the rt_mutex prior to
1254                 * returning to userspace, so as to not leave the rt_mutex with
1255                 * waiters and no owner.  However, second and third wake-ups
1256                 * cannot be predicted as they involve race conditions with the
1257                 * first wake and a fault while looking up the pi_state.  Both
1258                 * pthread_cond_signal() and pthread_cond_broadcast() should
1259                 * use nr_wake=1.
1260                 */
1261                if (nr_wake != 1)
1262                        return -EINVAL;
1263        }
1264
1265retry:
1266        if (pi_state != NULL) {
1267                /*
1268                 * We will have to lookup the pi_state again, so free this one
1269                 * to keep the accounting correct.
1270                 */
1271                free_pi_state(pi_state);
1272                pi_state = NULL;
1273        }
1274
1275        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1276        if (unlikely(ret != 0))
1277                goto out;
1278        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1279                            requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1280        if (unlikely(ret != 0))
1281                goto out_put_key1;
1282
1283        hb1 = hash_futex(&key1);
1284        hb2 = hash_futex(&key2);
1285
1286retry_private:
1287        double_lock_hb(hb1, hb2);
1288
1289        if (likely(cmpval != NULL)) {
1290                u32 curval;
1291
1292                ret = get_futex_value_locked(&curval, uaddr1);
1293
1294                if (unlikely(ret)) {
1295                        double_unlock_hb(hb1, hb2);
1296
1297                        ret = get_user(curval, uaddr1);
1298                        if (ret)
1299                                goto out_put_keys;
1300
1301                        if (!(flags & FLAGS_SHARED))
1302                                goto retry_private;
1303
1304                        put_futex_key(&key2);
1305                        put_futex_key(&key1);
1306                        goto retry;
1307                }
1308                if (curval != *cmpval) {
1309                        ret = -EAGAIN;
1310                        goto out_unlock;
1311                }
1312        }
1313
1314        if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1315                /*
1316                 * Attempt to acquire uaddr2 and wake the top waiter. If we
1317                 * intend to requeue waiters, force setting the FUTEX_WAITERS
1318                 * bit.  We force this here where we are able to easily handle
1319                 * faults rather in the requeue loop below.
1320                 */
1321                ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1322                                                 &key2, &pi_state, nr_requeue);
1323
1324                /*
1325                 * At this point the top_waiter has either taken uaddr2 or is
1326                 * waiting on it.  If the former, then the pi_state will not
1327                 * exist yet, look it up one more time to ensure we have a
1328                 * reference to it.
1329                 */
1330                if (ret == 1) {
1331                        WARN_ON(pi_state);
1332                        drop_count++;
1333                        task_count++;
1334                        ret = get_futex_value_locked(&curval2, uaddr2);
1335                        if (!ret)
1336                                ret = lookup_pi_state(curval2, hb2, &key2,
1337                                                      &pi_state);
1338                }
1339
1340                switch (ret) {
1341                case 0:
1342                        break;
1343                case -EFAULT:
1344                        double_unlock_hb(hb1, hb2);
1345                        put_futex_key(&key2);
1346                        put_futex_key(&key1);
1347                        ret = fault_in_user_writeable(uaddr2);
1348                        if (!ret)
1349                                goto retry;
1350                        goto out;
1351                case -EAGAIN:
1352                        /* The owner was exiting, try again. */
1353                        double_unlock_hb(hb1, hb2);
1354                        put_futex_key(&key2);
1355                        put_futex_key(&key1);
1356                        cond_resched();
1357                        goto retry;
1358                default:
1359                        goto out_unlock;
1360                }
1361        }
1362
1363        head1 = &hb1->chain;
1364        plist_for_each_entry_safe(this, next, head1, list) {
1365                if (task_count - nr_wake >= nr_requeue)
1366                        break;
1367
1368                if (!match_futex(&this->key, &key1))
1369                        continue;
1370
1371                /*
1372                 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1373                 * be paired with each other and no other futex ops.
1374                 */
1375                if ((requeue_pi && !this->rt_waiter) ||
1376                    (!requeue_pi && this->rt_waiter)) {
1377                        ret = -EINVAL;
1378                        break;
1379                }
1380
1381                /*
1382                 * Wake nr_wake waiters.  For requeue_pi, if we acquired the
1383                 * lock, we already woke the top_waiter.  If not, it will be
1384                 * woken by futex_unlock_pi().
1385                 */
1386                if (++task_count <= nr_wake && !requeue_pi) {
1387                        wake_futex(this);
1388                        continue;
1389                }
1390
1391                /* Ensure we requeue to the expected futex for requeue_pi. */
1392                if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1393                        ret = -EINVAL;
1394                        break;
1395                }
1396
1397                /*
1398                 * Requeue nr_requeue waiters and possibly one more in the case
1399                 * of requeue_pi if we couldn't acquire the lock atomically.
1400                 */
1401                if (requeue_pi) {
1402                        /* Prepare the waiter to take the rt_mutex. */
1403                        atomic_inc(&pi_state->refcount);
1404                        this->pi_state = pi_state;
1405                        ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1406                                                        this->rt_waiter,
1407                                                        this->task, 1);
1408                        if (ret == 1) {
1409                                /* We got the lock. */
1410                                requeue_pi_wake_futex(this, &key2, hb2);
1411                                drop_count++;
1412                                continue;
1413                        } else if (ret) {
1414                                /* -EDEADLK */
1415                                this->pi_state = NULL;
1416                                free_pi_state(pi_state);
1417                                goto out_unlock;
1418                        }
1419                }
1420                requeue_futex(this, hb1, hb2, &key2);
1421                drop_count++;
1422        }
1423
1424out_unlock:
1425        double_unlock_hb(hb1, hb2);
1426
1427        /*
1428         * drop_futex_key_refs() must be called outside the spinlocks. During
1429         * the requeue we moved futex_q's from the hash bucket at key1 to the
1430         * one at key2 and updated their key pointer.  We no longer need to
1431         * hold the references to key1.
1432         */
1433        while (--drop_count >= 0)
1434                drop_futex_key_refs(&key1);
1435
1436out_put_keys:
1437        put_futex_key(&key2);
1438out_put_key1:
1439        put_futex_key(&key1);
1440out:
1441        if (pi_state != NULL)
1442                free_pi_state(pi_state);
1443        return ret ? ret : task_count;
1444}
1445
1446/* The key must be already stored in q->key. */
1447static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1448        __acquires(&hb->lock)
1449{
1450        struct futex_hash_bucket *hb;
1451
1452        hb = hash_futex(&q->key);
1453        q->lock_ptr = &hb->lock;
1454
1455        spin_lock(&hb->lock);
1456        return hb;
1457}
1458
1459static inline void
1460queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1461        __releases(&hb->lock)
1462{
1463        spin_unlock(&hb->lock);
1464}
1465
1466/**
1467 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1468 * @q:  The futex_q to enqueue
1469 * @hb: The destination hash bucket
1470 *
1471 * The hb->lock must be held by the caller, and is released here. A call to
1472 * queue_me() is typically paired with exactly one call to unqueue_me().  The
1473 * exceptions involve the PI related operations, which may use unqueue_me_pi()
1474 * or nothing if the unqueue is done as part of the wake process and the unqueue
1475 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
1476 * an example).
1477 */
1478static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1479        __releases(&hb->lock)
1480{
1481        int prio;
1482
1483        /*
1484         * The priority used to register this element is
1485         * - either the real thread-priority for the real-time threads
1486         * (i.e. threads with a priority lower than MAX_RT_PRIO)
1487         * - or MAX_RT_PRIO for non-RT threads.
1488         * Thus, all RT-threads are woken first in priority order, and
1489         * the others are woken last, in FIFO order.
1490         */
1491        prio = min(current->normal_prio, MAX_RT_PRIO);
1492
1493        plist_node_init(&q->list, prio);
1494        plist_add(&q->list, &hb->chain);
1495        q->task = current;
1496        spin_unlock(&hb->lock);
1497}
1498
1499/**
1500 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1501 * @q:  The futex_q to unqueue
1502 *
1503 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1504 * be paired with exactly one earlier call to queue_me().
1505 *
1506 * Returns:
1507 *   1 - if the futex_q was still queued (and we removed unqueued it)
1508 *   0 - if the futex_q was already removed by the waking thread
1509 */
1510static int unqueue_me(struct futex_q *q)
1511{
1512        spinlock_t *lock_ptr;
1513        int ret = 0;
1514
1515        /* In the common case we don't take the spinlock, which is nice. */
1516retry:
1517        lock_ptr = q->lock_ptr;
1518        barrier();
1519        if (lock_ptr != NULL) {
1520                spin_lock(lock_ptr);
1521                /*
1522                 * q->lock_ptr can change between reading it and
1523                 * spin_lock(), causing us to take the wrong lock.  This
1524                 * corrects the race condition.
1525                 *
1526                 * Reasoning goes like this: if we have the wrong lock,
1527                 * q->lock_ptr must have changed (maybe several times)
1528                 * between reading it and the spin_lock().  It can
1529                 * change again after the spin_lock() but only if it was
1530                 * already changed before the spin_lock().  It cannot,
1531                 * however, change back to the original value.  Therefore
1532                 * we can detect whether we acquired the correct lock.
1533                 */
1534                if (unlikely(lock_ptr != q->lock_ptr)) {
1535                        spin_unlock(lock_ptr);
1536                        goto retry;
1537                }
1538                __unqueue_futex(q);
1539
1540                BUG_ON(q->pi_state);
1541
1542                spin_unlock(lock_ptr);
1543                ret = 1;
1544        }
1545
1546        drop_futex_key_refs(&q->key);
1547        return ret;
1548}
1549
1550/*
1551 * PI futexes can not be requeued and must remove themself from the
1552 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1553 * and dropped here.
1554 */
1555static void unqueue_me_pi(struct futex_q *q)
1556        __releases(q->lock_ptr)
1557{
1558        __unqueue_futex(q);
1559
1560        BUG_ON(!q->pi_state);
1561        free_pi_state(q->pi_state);
1562        q->pi_state = NULL;
1563
1564        spin_unlock(q->lock_ptr);
1565}
1566
1567/*
1568 * Fixup the pi_state owner with the new owner.
1569 *
1570 * Must be called with hash bucket lock held and mm->sem held for non
1571 * private futexes.
1572 */
1573static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1574                                struct task_struct *newowner)
1575{
1576        u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1577        struct futex_pi_state *pi_state = q->pi_state;
1578        struct task_struct *oldowner = pi_state->owner;
1579        u32 uval, curval, newval;
1580        int ret;
1581
1582        /* Owner died? */
1583        if (!pi_state->owner)
1584                newtid |= FUTEX_OWNER_DIED;
1585
1586        /*
1587         * We are here either because we stole the rtmutex from the
1588         * previous highest priority waiter or we are the highest priority
1589         * waiter but failed to get the rtmutex the first time.
1590         * We have to replace the newowner TID in the user space variable.
1591         * This must be atomic as we have to preserve the owner died bit here.
1592         *
1593         * Note: We write the user space value _before_ changing the pi_state
1594         * because we can fault here. Imagine swapped out pages or a fork
1595         * that marked all the anonymous memory readonly for cow.
1596         *
1597         * Modifying pi_state _before_ the user space value would
1598         * leave the pi_state in an inconsistent state when we fault
1599         * here, because we need to drop the hash bucket lock to
1600         * handle the fault. This might be observed in the PID check
1601         * in lookup_pi_state.
1602         */
1603retry:
1604        if (get_futex_value_locked(&uval, uaddr))
1605                goto handle_fault;
1606
1607        while (1) {
1608                newval = (uval & FUTEX_OWNER_DIED) | newtid;
1609
1610                if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1611                        goto handle_fault;
1612                if (curval == uval)
1613                        break;
1614                uval = curval;
1615        }
1616
1617        /*
1618         * We fixed up user space. Now we need to fix the pi_state
1619         * itself.
1620         */
1621        if (pi_state->owner != NULL) {
1622                raw_spin_lock_irq(&pi_state->owner->pi_lock);
1623                WARN_ON(list_empty(&pi_state->list));
1624                list_del_init(&pi_state->list);
1625                raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1626        }
1627
1628        pi_state->owner = newowner;
1629
1630        raw_spin_lock_irq(&newowner->pi_lock);
1631        WARN_ON(!list_empty(&pi_state->list));
1632        list_add(&pi_state->list, &newowner->pi_state_list);
1633        raw_spin_unlock_irq(&newowner->pi_lock);
1634        return 0;
1635
1636        /*
1637         * To handle the page fault we need to drop the hash bucket
1638         * lock here. That gives the other task (either the highest priority
1639         * waiter itself or the task which stole the rtmutex) the
1640         * chance to try the fixup of the pi_state. So once we are
1641         * back from handling the fault we need to check the pi_state
1642         * after reacquiring the hash bucket lock and before trying to
1643         * do another fixup. When the fixup has been done already we
1644         * simply return.
1645         */
1646handle_fault:
1647        spin_unlock(q->lock_ptr);
1648
1649        ret = fault_in_user_writeable(uaddr);
1650
1651        spin_lock(q->lock_ptr);
1652
1653        /*
1654         * Check if someone else fixed it for us:
1655         */
1656        if (pi_state->owner != oldowner)
1657                return 0;
1658
1659        if (ret)
1660                return ret;
1661
1662        goto retry;
1663}
1664
1665static long futex_wait_restart(struct restart_block *restart);
1666
1667/**
1668 * fixup_owner() - Post lock pi_state and corner case management
1669 * @uaddr:      user address of the futex
1670 * @q:          futex_q (contains pi_state and access to the rt_mutex)
1671 * @locked:     if the attempt to take the rt_mutex succeeded (1) or not (0)
1672 *
1673 * After attempting to lock an rt_mutex, this function is called to cleanup
1674 * the pi_state owner as well as handle race conditions that may allow us to
1675 * acquire the lock. Must be called with the hb lock held.
1676 *
1677 * Returns:
1678 *  1 - success, lock taken
1679 *  0 - success, lock not taken
1680 * <0 - on error (-EFAULT)
1681 */
1682static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1683{
1684        struct task_struct *owner;
1685        int ret = 0;
1686
1687        if (locked) {
1688                /*
1689                 * Got the lock. We might not be the anticipated owner if we
1690                 * did a lock-steal - fix up the PI-state in that case:
1691                 */
1692                if (q->pi_state->owner != current)
1693                        ret = fixup_pi_state_owner(uaddr, q, current);
1694                goto out;
1695        }
1696
1697        /*
1698         * Catch the rare case, where the lock was released when we were on the
1699         * way back before we locked the hash bucket.
1700         */
1701        if (q->pi_state->owner == current) {
1702                /*
1703                 * Try to get the rt_mutex now. This might fail as some other
1704                 * task acquired the rt_mutex after we removed ourself from the
1705                 * rt_mutex waiters list.
1706                 */
1707                if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1708                        locked = 1;
1709                        goto out;
1710                }
1711
1712                /*
1713                 * pi_state is incorrect, some other task did a lock steal and
1714                 * we returned due to timeout or signal without taking the
1715                 * rt_mutex. Too late.
1716                 */
1717                raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1718                owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1719                if (!owner)
1720                        owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1721                raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1722                ret = fixup_pi_state_owner(uaddr, q, owner);
1723                goto out;
1724        }
1725
1726        /*
1727         * Paranoia check. If we did not take the lock, then we should not be
1728         * the owner of the rt_mutex.
1729         */
1730        if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1731                printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1732                                "pi-state %p\n", ret,
1733                                q->pi_state->pi_mutex.owner,
1734                                q->pi_state->owner);
1735
1736out:
1737        return ret ? ret : locked;
1738}
1739
1740/**
1741 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1742 * @hb:         the futex hash bucket, must be locked by the caller
1743 * @q:          the futex_q to queue up on
1744 * @timeout:    the prepared hrtimer_sleeper, or null for no timeout
1745 */
1746static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1747                                struct hrtimer_sleeper *timeout)
1748{
1749        /*
1750         * The task state is guaranteed to be set before another task can
1751         * wake it. set_current_state() is implemented using set_mb() and
1752         * queue_me() calls spin_unlock() upon completion, both serializing
1753         * access to the hash list and forcing another memory barrier.
1754         */
1755        set_current_state(TASK_INTERRUPTIBLE);
1756        queue_me(q, hb);
1757
1758        /* Arm the timer */
1759        if (timeout) {
1760                hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1761                if (!hrtimer_active(&timeout->timer))
1762                        timeout->task = NULL;
1763        }
1764
1765        /*
1766         * If we have been removed from the hash list, then another task
1767         * has tried to wake us, and we can skip the call to schedule().
1768         */
1769        if (likely(!plist_node_empty(&q->list))) {
1770                /*
1771                 * If the timer has already expired, current will already be
1772                 * flagged for rescheduling. Only call schedule if there
1773                 * is no timeout, or if it has yet to expire.
1774                 */
1775                if (!timeout || timeout->task)
1776                        schedule();
1777        }
1778        __set_current_state(TASK_RUNNING);
1779}
1780
1781/**
1782 * futex_wait_setup() - Prepare to wait on a futex
1783 * @uaddr:      the futex userspace address
1784 * @val:        the expected value
1785 * @flags:      futex flags (FLAGS_SHARED, etc.)
1786 * @q:          the associated futex_q
1787 * @hb:         storage for hash_bucket pointer to be returned to caller
1788 *
1789 * Setup the futex_q and locate the hash_bucket.  Get the futex value and
1790 * compare it with the expected value.  Handle atomic faults internally.
1791 * Return with the hb lock held and a q.key reference on success, and unlocked
1792 * with no q.key reference on failure.
1793 *
1794 * Returns:
1795 *  0 - uaddr contains val and hb has been locked
1796 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
1797 */
1798static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1799                           struct futex_q *q, struct futex_hash_bucket **hb)
1800{
1801        u32 uval;
1802        int ret;
1803
1804        /*
1805         * Access the page AFTER the hash-bucket is locked.
1806         * Order is important:
1807         *
1808         *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
1809         *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
1810         *
1811         * The basic logical guarantee of a futex is that it blocks ONLY
1812         * if cond(var) is known to be true at the time of blocking, for
1813         * any cond.  If we locked the hash-bucket after testing *uaddr, that
1814         * would open a race condition where we could block indefinitely with
1815         * cond(var) false, which would violate the guarantee.
1816         *
1817         * On the other hand, we insert q and release the hash-bucket only
1818         * after testing *uaddr.  This guarantees that futex_wait() will NOT
1819         * absorb a wakeup if *uaddr does not match the desired values
1820         * while the syscall executes.
1821         */
1822retry:
1823        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
1824        if (unlikely(ret != 0))
1825                return ret;
1826
1827retry_private:
1828        *hb = queue_lock(q);
1829
1830        ret = get_futex_value_locked(&uval, uaddr);
1831
1832        if (ret) {
1833                queue_unlock(q, *hb);
1834
1835                ret = get_user(uval, uaddr);
1836                if (ret)
1837                        goto out;
1838
1839                if (!(flags & FLAGS_SHARED))
1840                        goto retry_private;
1841
1842                put_futex_key(&q->key);
1843                goto retry;
1844        }
1845
1846        if (uval != val) {
1847                queue_unlock(q, *hb);
1848                ret = -EWOULDBLOCK;
1849        }
1850
1851out:
1852        if (ret)
1853                put_futex_key(&q->key);
1854        return ret;
1855}
1856
1857static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
1858                      ktime_t *abs_time, u32 bitset)
1859{
1860        struct hrtimer_sleeper timeout, *to = NULL;
1861        struct restart_block *restart;
1862        struct futex_hash_bucket *hb;
1863        struct futex_q q = futex_q_init;
1864        int ret;
1865
1866        if (!bitset)
1867                return -EINVAL;
1868        q.bitset = bitset;
1869
1870        if (abs_time) {
1871                to = &timeout;
1872
1873                hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
1874                                      CLOCK_REALTIME : CLOCK_MONOTONIC,
1875                                      HRTIMER_MODE_ABS);
1876                hrtimer_init_sleeper(to, current);
1877                hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1878                                             current->timer_slack_ns);
1879        }
1880
1881retry:
1882        /*
1883         * Prepare to wait on uaddr. On success, holds hb lock and increments
1884         * q.key refs.
1885         */
1886        ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
1887        if (ret)
1888                goto out;
1889
1890        /* queue_me and wait for wakeup, timeout, or a signal. */
1891        futex_wait_queue_me(hb, &q, to);
1892
1893        /* If we were woken (and unqueued), we succeeded, whatever. */
1894        ret = 0;
1895        /* unqueue_me() drops q.key ref */
1896        if (!unqueue_me(&q))
1897                goto out;
1898        ret = -ETIMEDOUT;
1899        if (to && !to->task)
1900                goto out;
1901
1902        /*
1903         * We expect signal_pending(current), but we might be the
1904         * victim of a spurious wakeup as well.
1905         */
1906        if (!signal_pending(current))
1907                goto retry;
1908
1909        ret = -ERESTARTSYS;
1910        if (!abs_time)
1911                goto out;
1912
1913        restart = &current_thread_info()->restart_block;
1914        restart->fn = futex_wait_restart;
1915        restart->futex.uaddr = uaddr;
1916        restart->futex.val = val;
1917        restart->futex.time = abs_time->tv64;
1918        restart->futex.bitset = bitset;
1919        restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
1920
1921        ret = -ERESTART_RESTARTBLOCK;
1922
1923out:
1924        if (to) {
1925                hrtimer_cancel(&to->timer);
1926                destroy_hrtimer_on_stack(&to->timer);
1927        }
1928        return ret;
1929}
1930
1931
1932static long futex_wait_restart(struct restart_block *restart)
1933{
1934        u32 __user *uaddr = restart->futex.uaddr;
1935        ktime_t t, *tp = NULL;
1936
1937        if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
1938                t.tv64 = restart->futex.time;
1939                tp = &t;
1940        }
1941        restart->fn = do_no_restart_syscall;
1942
1943        return (long)futex_wait(uaddr, restart->futex.flags,
1944                                restart->futex.val, tp, restart->futex.bitset);
1945}
1946
1947
1948/*
1949 * Userspace tried a 0 -> TID atomic transition of the futex value
1950 * and failed. The kernel side here does the whole locking operation:
1951 * if there are waiters then it will block, it does PI, etc. (Due to
1952 * races the kernel might see a 0 value of the futex too.)
1953 */
1954static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1955                         ktime_t *time, int trylock)
1956{
1957        struct hrtimer_sleeper timeout, *to = NULL;
1958        struct futex_hash_bucket *hb;
1959        struct futex_q q = futex_q_init;
1960        int res, ret;
1961
1962        if (refill_pi_state_cache())
1963                return -ENOMEM;
1964
1965        if (time) {
1966                to = &timeout;
1967                hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
1968                                      HRTIMER_MODE_ABS);
1969                hrtimer_init_sleeper(to, current);
1970                hrtimer_set_expires(&to->timer, *time);
1971        }
1972
1973retry:
1974        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
1975        if (unlikely(ret != 0))
1976                goto out;
1977
1978retry_private:
1979        hb = queue_lock(&q);
1980
1981        ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
1982        if (unlikely(ret)) {
1983                switch (ret) {
1984                case 1:
1985                        /* We got the lock. */
1986                        ret = 0;
1987                        goto out_unlock_put_key;
1988                case -EFAULT:
1989                        goto uaddr_faulted;
1990                case -EAGAIN:
1991                        /*
1992                         * Task is exiting and we just wait for the
1993                         * exit to complete.
1994                         */
1995                        queue_unlock(&q, hb);
1996                        put_futex_key(&q.key);
1997                        cond_resched();
1998                        goto retry;
1999                default:
2000                        goto out_unlock_put_key;
2001                }
2002        }
2003
2004        /*
2005         * Only actually queue now that the atomic ops are done:
2006         */
2007        queue_me(&q, hb);
2008
2009        WARN_ON(!q.pi_state);
2010        /*
2011         * Block on the PI mutex:
2012         */
2013        if (!trylock)
2014                ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
2015        else {
2016                ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2017                /* Fixup the trylock return value: */
2018                ret = ret ? 0 : -EWOULDBLOCK;
2019        }
2020
2021        spin_lock(q.lock_ptr);
2022        /*
2023         * Fixup the pi_state owner and possibly acquire the lock if we
2024         * haven't already.
2025         */
2026        res = fixup_owner(uaddr, &q, !ret);
2027        /*
2028         * If fixup_owner() returned an error, proprogate that.  If it acquired
2029         * the lock, clear our -ETIMEDOUT or -EINTR.
2030         */
2031        if (res)
2032                ret = (res < 0) ? res : 0;
2033
2034        /*
2035         * If fixup_owner() faulted and was unable to handle the fault, unlock
2036         * it and return the fault to userspace.
2037         */
2038        if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
2039                rt_mutex_unlock(&q.pi_state->pi_mutex);
2040
2041        /* Unqueue and drop the lock */
2042        unqueue_me_pi(&q);
2043
2044        goto out_put_key;
2045
2046out_unlock_put_key:
2047        queue_unlock(&q, hb);
2048
2049out_put_key:
2050        put_futex_key(&q.key);
2051out:
2052        if (to)
2053                destroy_hrtimer_on_stack(&to->timer);
2054        return ret != -EINTR ? ret : -ERESTARTNOINTR;
2055
2056uaddr_faulted:
2057        queue_unlock(&q, hb);
2058
2059        ret = fault_in_user_writeable(uaddr);
2060        if (ret)
2061                goto out_put_key;
2062
2063        if (!(flags & FLAGS_SHARED))
2064                goto retry_private;
2065
2066        put_futex_key(&q.key);
2067        goto retry;
2068}
2069
2070/*
2071 * Userspace attempted a TID -> 0 atomic transition, and failed.
2072 * This is the in-kernel slowpath: we look up the PI state (if any),
2073 * and do the rt-mutex unlock.
2074 */
2075static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2076{
2077        struct futex_hash_bucket *hb;
2078        struct futex_q *this, *next;
2079        struct plist_head *head;
2080        union futex_key key = FUTEX_KEY_INIT;
2081        u32 uval, vpid = task_pid_vnr(current);
2082        int ret;
2083
2084retry:
2085        if (get_user(uval, uaddr))
2086                return -EFAULT;
2087        /*
2088         * We release only a lock we actually own:
2089         */
2090        if ((uval & FUTEX_TID_MASK) != vpid)
2091                return -EPERM;
2092
2093        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2094        if (unlikely(ret != 0))
2095                goto out;
2096
2097        hb = hash_futex(&key);
2098        spin_lock(&hb->lock);
2099
2100        /*
2101         * To avoid races, try to do the TID -> 0 atomic transition
2102         * again. If it succeeds then we can return without waking
2103         * anyone else up:
2104         */
2105        if (!(uval & FUTEX_OWNER_DIED) &&
2106            cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2107                goto pi_faulted;
2108        /*
2109         * Rare case: we managed to release the lock atomically,
2110         * no need to wake anyone else up:
2111         */
2112        if (unlikely(uval == vpid))
2113                goto out_unlock;
2114
2115        /*
2116         * Ok, other tasks may need to be woken up - check waiters
2117         * and do the wakeup if necessary:
2118         */
2119        head = &hb->chain;
2120
2121        plist_for_each_entry_safe(this, next, head, list) {
2122                if (!match_futex (&this->key, &key))
2123                        continue;
2124                ret = wake_futex_pi(uaddr, uval, this);
2125                /*
2126                 * The atomic access to the futex value
2127                 * generated a pagefault, so retry the
2128                 * user-access and the wakeup:
2129                 */
2130                if (ret == -EFAULT)
2131                        goto pi_faulted;
2132                goto out_unlock;
2133        }
2134        /*
2135         * No waiters - kernel unlocks the futex:
2136         */
2137        if (!(uval & FUTEX_OWNER_DIED)) {
2138                ret = unlock_futex_pi(uaddr, uval);
2139                if (ret == -EFAULT)
2140                        goto pi_faulted;
2141        }
2142
2143out_unlock:
2144        spin_unlock(&hb->lock);
2145        put_futex_key(&key);
2146
2147out:
2148        return ret;
2149
2150pi_faulted:
2151        spin_unlock(&hb->lock);
2152        put_futex_key(&key);
2153
2154        ret = fault_in_user_writeable(uaddr);
2155        if (!ret)
2156                goto retry;
2157
2158        return ret;
2159}
2160
2161/**
2162 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2163 * @hb:         the hash_bucket futex_q was original enqueued on
2164 * @q:          the futex_q woken while waiting to be requeued
2165 * @key2:       the futex_key of the requeue target futex
2166 * @timeout:    the timeout associated with the wait (NULL if none)
2167 *
2168 * Detect if the task was woken on the initial futex as opposed to the requeue
2169 * target futex.  If so, determine if it was a timeout or a signal that caused
2170 * the wakeup and return the appropriate error code to the caller.  Must be
2171 * called with the hb lock held.
2172 *
2173 * Returns
2174 *  0 - no early wakeup detected
2175 * <0 - -ETIMEDOUT or -ERESTARTNOINTR
2176 */
2177static inline
2178int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2179                                   struct futex_q *q, union futex_key *key2,
2180                                   struct hrtimer_sleeper *timeout)
2181{
2182        int ret = 0;
2183
2184        /*
2185         * With the hb lock held, we avoid races while we process the wakeup.
2186         * We only need to hold hb (and not hb2) to ensure atomicity as the
2187         * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2188         * It can't be requeued from uaddr2 to something else since we don't
2189         * support a PI aware source futex for requeue.
2190         */
2191        if (!match_futex(&q->key, key2)) {
2192                WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2193                /*
2194                 * We were woken prior to requeue by a timeout or a signal.
2195                 * Unqueue the futex_q and determine which it was.
2196                 */
2197                plist_del(&q->list, &hb->chain);
2198
2199                /* Handle spurious wakeups gracefully */
2200                ret = -EWOULDBLOCK;
2201                if (timeout && !timeout->task)
2202                        ret = -ETIMEDOUT;
2203                else if (signal_pending(current))
2204                        ret = -ERESTARTNOINTR;
2205        }
2206        return ret;
2207}
2208
2209/**
2210 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2211 * @uaddr:      the futex we initially wait on (non-pi)
2212 * @flags:      futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2213 *              the same type, no requeueing from private to shared, etc.
2214 * @val:        the expected value of uaddr
2215 * @abs_time:   absolute timeout
2216 * @bitset:     32 bit wakeup bitset set by userspace, defaults to all
2217 * @clockrt:    whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2218 * @uaddr2:     the pi futex we will take prior to returning to user-space
2219 *
2220 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2221 * uaddr2 which must be PI aware.  Normal wakeup will wake on uaddr2 and
2222 * complete the acquisition of the rt_mutex prior to returning to userspace.
2223 * This ensures the rt_mutex maintains an owner when it has waiters; without
2224 * one, the pi logic wouldn't know which task to boost/deboost, if there was a
2225 * need to.
2226 *
2227 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2228 * via the following:
2229 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2230 * 2) wakeup on uaddr2 after a requeue
2231 * 3) signal
2232 * 4) timeout
2233 *
2234 * If 3, cleanup and return -ERESTARTNOINTR.
2235 *
2236 * If 2, we may then block on trying to take the rt_mutex and return via:
2237 * 5) successful lock
2238 * 6) signal
2239 * 7) timeout
2240 * 8) other lock acquisition failure
2241 *
2242 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2243 *
2244 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2245 *
2246 * Returns:
2247 *  0 - On success
2248 * <0 - On error
2249 */
2250static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2251                                 u32 val, ktime_t *abs_time, u32 bitset,
2252                                 u32 __user *uaddr2)
2253{
2254        struct hrtimer_sleeper timeout, *to = NULL;
2255        struct rt_mutex_waiter rt_waiter;
2256        struct rt_mutex *pi_mutex = NULL;
2257        struct futex_hash_bucket *hb;
2258        union futex_key key2 = FUTEX_KEY_INIT;
2259        struct futex_q q = futex_q_init;
2260        int res, ret;
2261
2262        if (!bitset)
2263                return -EINVAL;
2264
2265        if (abs_time) {
2266                to = &timeout;
2267                hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2268                                      CLOCK_REALTIME : CLOCK_MONOTONIC,
2269                                      HRTIMER_MODE_ABS);
2270                hrtimer_init_sleeper(to, current);
2271                hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2272                                             current->timer_slack_ns);
2273        }
2274
2275        /*
2276         * The waiter is allocated on our stack, manipulated by the requeue
2277         * code while we sleep on uaddr.
2278         */
2279        debug_rt_mutex_init_waiter(&rt_waiter);
2280        rt_waiter.task = NULL;
2281
2282        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2283        if (unlikely(ret != 0))
2284                goto out;
2285
2286        q.bitset = bitset;
2287        q.rt_waiter = &rt_waiter;
2288        q.requeue_pi_key = &key2;
2289
2290        /*
2291         * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2292         * count.
2293         */
2294        ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2295        if (ret)
2296                goto out_key2;
2297
2298        /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2299        futex_wait_queue_me(hb, &q, to);
2300
2301        spin_lock(&hb->lock);
2302        ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2303        spin_unlock(&hb->lock);
2304        if (ret)
2305                goto out_put_keys;
2306
2307        /*
2308         * In order for us to be here, we know our q.key == key2, and since
2309         * we took the hb->lock above, we also know that futex_requeue() has
2310         * completed and we no longer have to concern ourselves with a wakeup
2311         * race with the atomic proxy lock acquisition by the requeue code. The
2312         * futex_requeue dropped our key1 reference and incremented our key2
2313         * reference count.
2314         */
2315
2316        /* Check if the requeue code acquired the second futex for us. */
2317        if (!q.rt_waiter) {
2318                /*
2319                 * Got the lock. We might not be the anticipated owner if we
2320                 * did a lock-steal - fix up the PI-state in that case.
2321                 */
2322                if (q.pi_state && (q.pi_state->owner != current)) {
2323                        spin_lock(q.lock_ptr);
2324                        ret = fixup_pi_state_owner(uaddr2, &q, current);
2325                        spin_unlock(q.lock_ptr);
2326                }
2327        } else {
2328                /*
2329                 * We have been woken up by futex_unlock_pi(), a timeout, or a
2330                 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
2331                 * the pi_state.
2332                 */
2333                WARN_ON(!&q.pi_state);
2334                pi_mutex = &q.pi_state->pi_mutex;
2335                ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2336                debug_rt_mutex_free_waiter(&rt_waiter);
2337
2338                spin_lock(q.lock_ptr);
2339                /*
2340                 * Fixup the pi_state owner and possibly acquire the lock if we
2341                 * haven't already.
2342                 */
2343                res = fixup_owner(uaddr2, &q, !ret);
2344                /*
2345                 * If fixup_owner() returned an error, proprogate that.  If it
2346                 * acquired the lock, clear -ETIMEDOUT or -EINTR.
2347                 */
2348                if (res)
2349                        ret = (res < 0) ? res : 0;
2350
2351                /* Unqueue and drop the lock. */
2352                unqueue_me_pi(&q);
2353        }
2354
2355        /*
2356         * If fixup_pi_state_owner() faulted and was unable to handle the
2357         * fault, unlock the rt_mutex and return the fault to userspace.
2358         */
2359        if (ret == -EFAULT) {
2360                if (rt_mutex_owner(pi_mutex) == current)
2361                        rt_mutex_unlock(pi_mutex);
2362        } else if (ret == -EINTR) {
2363                /*
2364                 * We've already been requeued, but cannot restart by calling
2365                 * futex_lock_pi() directly. We could restart this syscall, but
2366                 * it would detect that the user space "val" changed and return
2367                 * -EWOULDBLOCK.  Save the overhead of the restart and return
2368                 * -EWOULDBLOCK directly.
2369                 */
2370                ret = -EWOULDBLOCK;
2371        }
2372
2373out_put_keys:
2374        put_futex_key(&q.key);
2375out_key2:
2376        put_futex_key(&key2);
2377
2378out:
2379        if (to) {
2380                hrtimer_cancel(&to->timer);
2381                destroy_hrtimer_on_stack(&to->timer);
2382        }
2383        return ret;
2384}
2385
2386/*
2387 * Support for robust futexes: the kernel cleans up held futexes at
2388 * thread exit time.
2389 *
2390 * Implementation: user-space maintains a per-thread list of locks it
2391 * is holding. Upon do_exit(), the kernel carefully walks this list,
2392 * and marks all locks that are owned by this thread with the
2393 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
2394 * always manipulated with the lock held, so the list is private and
2395 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
2396 * field, to allow the kernel to clean up if the thread dies after
2397 * acquiring the lock, but just before it could have added itself to
2398 * the list. There can only be one such pending lock.
2399 */
2400
2401/**
2402 * sys_set_robust_list() - Set the robust-futex list head of a task
2403 * @head:       pointer to the list-head
2404 * @len:        length of the list-head, as userspace expects
2405 */
2406SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2407                size_t, len)
2408{
2409        if (!futex_cmpxchg_enabled)
2410                return -ENOSYS;
2411        /*
2412         * The kernel knows only one size for now:
2413         */
2414        if (unlikely(len != sizeof(*head)))
2415                return -EINVAL;
2416
2417        current->robust_list = head;
2418
2419        return 0;
2420}
2421
2422/**
2423 * sys_get_robust_list() - Get the robust-futex list head of a task
2424 * @pid:        pid of the process [zero for current task]
2425 * @head_ptr:   pointer to a list-head pointer, the kernel fills it in
2426 * @len_ptr:    pointer to a length field, the kernel fills in the header size
2427 */
2428SYSCALL_DEFINE3(get_robust_list, int, pid,
2429                struct robust_list_head __user * __user *, head_ptr,
2430                size_t __user *, len_ptr)
2431{
2432        struct robust_list_head __user *head;
2433        unsigned long ret;
2434        const struct cred *cred = current_cred(), *pcred;
2435
2436        if (!futex_cmpxchg_enabled)
2437                return -ENOSYS;
2438
2439        if (!pid)
2440                head = current->robust_list;
2441        else {
2442                struct task_struct *p;
2443
2444                ret = -ESRCH;
2445                rcu_read_lock();
2446                p = find_task_by_vpid(pid);
2447                if (!p)
2448                        goto err_unlock;
2449                ret = -EPERM;
2450                pcred = __task_cred(p);
2451                /* If victim is in different user_ns, then uids are not
2452                   comparable, so we must have CAP_SYS_PTRACE */
2453                if (cred->user->user_ns != pcred->user->user_ns) {
2454                        if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2455                                goto err_unlock;
2456                        goto ok;
2457                }
2458                /* If victim is in same user_ns, then uids are comparable */
2459                if (cred->euid != pcred->euid &&
2460                    cred->euid != pcred->uid &&
2461                    !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2462                        goto err_unlock;
2463ok:
2464                head = p->robust_list;
2465                rcu_read_unlock();
2466        }
2467
2468        if (put_user(sizeof(*head), len_ptr))
2469                return -EFAULT;
2470        return put_user(head, head_ptr);
2471
2472err_unlock:
2473        rcu_read_unlock();
2474
2475        return ret;
2476}
2477
2478/*
2479 * Process a futex-list entry, check whether it's owned by the
2480 * dying task, and do notification if so:
2481 */
2482int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2483{
2484        u32 uval, nval, mval;
2485
2486retry:
2487        if (get_user(uval, uaddr))
2488                return -1;
2489
2490        if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
2491                /*
2492                 * Ok, this dying thread is truly holding a futex
2493                 * of interest. Set the OWNER_DIED bit atomically
2494                 * via cmpxchg, and if the value had FUTEX_WAITERS
2495                 * set, wake up a waiter (if any). (We have to do a
2496                 * futex_wake() even if OWNER_DIED is already set -
2497                 * to handle the rare but possible case of recursive
2498                 * thread-death.) The rest of the cleanup is done in
2499                 * userspace.
2500                 */
2501                mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2502                /*
2503                 * We are not holding a lock here, but we want to have
2504                 * the pagefault_disable/enable() protection because
2505                 * we want to handle the fault gracefully. If the
2506                 * access fails we try to fault in the futex with R/W
2507                 * verification via get_user_pages. get_user() above
2508                 * does not guarantee R/W access. If that fails we
2509                 * give up and leave the futex locked.
2510                 */
2511                if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2512                        if (fault_in_user_writeable(uaddr))
2513                                return -1;
2514                        goto retry;
2515                }
2516                if (nval != uval)
2517                        goto retry;
2518
2519                /*
2520                 * Wake robust non-PI futexes here. The wakeup of
2521                 * PI futexes happens in exit_pi_state():
2522                 */
2523                if (!pi && (uval & FUTEX_WAITERS))
2524                        futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
2525        }
2526        return 0;
2527}
2528
2529/*
2530 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
2531 */
2532static inline int fetch_robust_entry(struct robust_list __user **entry,
2533                                     struct robust_list __user * __user *head,
2534                                     unsigned int *pi)
2535{
2536        unsigned long uentry;
2537
2538        if (get_user(uentry, (unsigned long __user *)head))
2539                return -EFAULT;
2540
2541        *entry = (void __user *)(uentry & ~1UL);
2542        *pi = uentry & 1;
2543
2544        return 0;
2545}
2546
2547/*
2548 * Walk curr->robust_list (very carefully, it's a userspace list!)
2549 * and mark any locks found there dead, and notify any waiters.
2550 *
2551 * We silently return on any sign of list-walking problem.
2552 */
2553void exit_robust_list(struct task_struct *curr)
2554{
2555        struct robust_list_head __user *head = curr->robust_list;
2556        struct robust_list __user *entry, *next_entry, *pending;
2557        unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2558        unsigned int uninitialized_var(next_pi);
2559        unsigned long futex_offset;
2560        int rc;
2561
2562        if (!futex_cmpxchg_enabled)
2563                return;
2564
2565        /*
2566         * Fetch the list head (which was registered earlier, via
2567         * sys_set_robust_list()):
2568         */
2569        if (fetch_robust_entry(&entry, &head->list.next, &pi))
2570                return;
2571        /*
2572         * Fetch the relative futex offset:
2573         */
2574        if (get_user(futex_offset, &head->futex_offset))
2575                return;
2576        /*
2577         * Fetch any possibly pending lock-add first, and handle it
2578         * if it exists:
2579         */
2580        if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
2581                return;
2582
2583        next_entry = NULL;      /* avoid warning with gcc */
2584        while (entry != &head->list) {
2585                /*
2586                 * Fetch the next entry in the list before calling
2587                 * handle_futex_death:
2588                 */
2589                rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
2590                /*
2591                 * A pending lock might already be on the list, so
2592                 * don't process it twice:
2593                 */
2594                if (entry != pending)
2595                        if (handle_futex_death((void __user *)entry + futex_offset,
2596                                                curr, pi))
2597                                return;
2598                if (rc)
2599                        return;
2600                entry = next_entry;
2601                pi = next_pi;
2602                /*
2603                 * Avoid excessively long or circular lists:
2604                 */
2605                if (!--limit)
2606                        break;
2607
2608                cond_resched();
2609        }
2610
2611        if (pending)
2612                handle_futex_death((void __user *)pending + futex_offset,
2613                                   curr, pip);
2614}
2615
2616long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2617                u32 __user *uaddr2, u32 val2, u32 val3)
2618{
2619        int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
2620        unsigned int flags = 0;
2621
2622        if (!(op & FUTEX_PRIVATE_FLAG))
2623                flags |= FLAGS_SHARED;
2624
2625        if (op & FUTEX_CLOCK_REALTIME) {
2626                flags |= FLAGS_CLOCKRT;
2627                if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2628                        return -ENOSYS;
2629        }
2630
2631        switch (cmd) {
2632        case FUTEX_WAIT:
2633                val3 = FUTEX_BITSET_MATCH_ANY;
2634        case FUTEX_WAIT_BITSET:
2635                ret = futex_wait(uaddr, flags, val, timeout, val3);
2636                break;
2637        case FUTEX_WAKE:
2638                val3 = FUTEX_BITSET_MATCH_ANY;
2639        case FUTEX_WAKE_BITSET:
2640                ret = futex_wake(uaddr, flags, val, val3);
2641                break;
2642        case FUTEX_REQUEUE:
2643                ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2644                break;
2645        case FUTEX_CMP_REQUEUE:
2646                ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2647                break;
2648        case FUTEX_WAKE_OP:
2649                ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2650                break;
2651        case FUTEX_LOCK_PI:
2652                if (futex_cmpxchg_enabled)
2653                        ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2654                break;
2655        case FUTEX_UNLOCK_PI:
2656                if (futex_cmpxchg_enabled)
2657                        ret = futex_unlock_pi(uaddr, flags);
2658                break;
2659        case FUTEX_TRYLOCK_PI:
2660                if (futex_cmpxchg_enabled)
2661                        ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2662                break;
2663        case FUTEX_WAIT_REQUEUE_PI:
2664                val3 = FUTEX_BITSET_MATCH_ANY;
2665                ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2666                                            uaddr2);
2667                break;
2668        case FUTEX_CMP_REQUEUE_PI:
2669                ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2670                break;
2671        default:
2672                ret = -ENOSYS;
2673        }
2674        return ret;
2675}
2676
2677
2678SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2679                struct timespec __user *, utime, u32 __user *, uaddr2,
2680                u32, val3)
2681{
2682        struct timespec ts;
2683        ktime_t t, *tp = NULL;
2684        u32 val2 = 0;
2685        int cmd = op & FUTEX_CMD_MASK;
2686
2687        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
2688                      cmd == FUTEX_WAIT_BITSET ||
2689                      cmd == FUTEX_WAIT_REQUEUE_PI)) {
2690                if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2691                        return -EFAULT;
2692                if (!timespec_valid(&ts))
2693                        return -EINVAL;
2694
2695                t = timespec_to_ktime(ts);
2696                if (cmd == FUTEX_WAIT)
2697                        t = ktime_add_safe(ktime_get(), t);
2698                tp = &t;
2699        }
2700        /*
2701         * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
2702         * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
2703         */
2704        if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
2705            cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
2706                val2 = (u32) (unsigned long) utime;
2707
2708        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
2709}
2710
2711static int __init futex_init(void)
2712{
2713        u32 curval;
2714        int i;
2715
2716        /*
2717         * This will fail and we want it. Some arch implementations do
2718         * runtime detection of the futex_atomic_cmpxchg_inatomic()
2719         * functionality. We want to know that before we call in any
2720         * of the complex code paths. Also we want to prevent
2721         * registration of robust lists in that case. NULL is
2722         * guaranteed to fault and we get -EFAULT on functional
2723         * implementation, the non-functional ones will return
2724         * -ENOSYS.
2725         */
2726        if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2727                futex_cmpxchg_enabled = 1;
2728
2729        for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
2730                plist_head_init(&futex_queues[i].chain);
2731                spin_lock_init(&futex_queues[i].lock);
2732        }
2733
2734        return 0;
2735}
2736__initcall(futex_init);
2737