linux/kernel/futex.c
   1/*
   2 *  Fast Userspace Mutexes (which I call "Futexes!").
   3 *  (C) Rusty Russell, IBM 2002
   4 *
   5 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
   6 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
   7 *
   8 *  Removed page pinning, fix privately mapped COW pages and other cleanups
   9 *  (C) Copyright 2003, 2004 Jamie Lokier
  10 *
  11 *  Robust futex support started by Ingo Molnar
  12 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  13 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  14 *
  15 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
  16 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  17 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  18 *
  19 *  PRIVATE futexes by Eric Dumazet
  20 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
  21 *
  22 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
  23 *  Copyright (C) IBM Corporation, 2009
  24 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
  25 *
  26 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  27 *  enough at me, Linus for the original (flawed) idea, Matthew
  28 *  Kirkwood for proof-of-concept implementation.
  29 *
  30 *  "The futexes are also cursed."
  31 *  "But they come in a choice of three flavours!"
  32 *
  33 *  This program is free software; you can redistribute it and/or modify
  34 *  it under the terms of the GNU General Public License as published by
  35 *  the Free Software Foundation; either version 2 of the License, or
  36 *  (at your option) any later version.
  37 *
  38 *  This program is distributed in the hope that it will be useful,
  39 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  40 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  41 *  GNU General Public License for more details.
  42 *
  43 *  You should have received a copy of the GNU General Public License
  44 *  along with this program; if not, write to the Free Software
  45 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  46 */
  47#include <linux/slab.h>
  48#include <linux/poll.h>
  49#include <linux/fs.h>
  50#include <linux/file.h>
  51#include <linux/jhash.h>
  52#include <linux/init.h>
  53#include <linux/futex.h>
  54#include <linux/mount.h>
  55#include <linux/pagemap.h>
  56#include <linux/syscalls.h>
  57#include <linux/signal.h>
  58#include <linux/export.h>
  59#include <linux/magic.h>
  60#include <linux/pid.h>
  61#include <linux/nsproxy.h>
  62
  63#include <asm/futex.h>
  64
  65#include "rtmutex_common.h"
  66
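     /*
      * Set at boot by futex_init() if the architecture provides a working
      * atomic cmpxchg on user space futex words; the robust and PI futex
      * operations depend on it.
      */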
  67int __read_mostly futex_cmpxchg_enabled;
  68
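     /* futex_queues[] below has 1 << FUTEX_HASHBITS buckets: 16 with CONFIG_BASE_SMALL, 256 otherwise. */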
  69#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
  70
  71/*
  72 * Futex flags used to encode options to functions and preserve them across
  73 * restarts.
  74 */
  75#define FLAGS_SHARED            0x01
  76#define FLAGS_CLOCKRT           0x02
  77#define FLAGS_HAS_TIMEOUT       0x04
  78
  79/*
  80 * Priority Inheritance state:
  81 */
  82struct futex_pi_state {
  83        /*
  84         * list of 'owned' pi_state instances - these have to be
  85         * cleaned up in do_exit() if the task exits prematurely:
  86         */
  87        struct list_head list;
  88
  89        /*
  90         * The PI object:
  91         */
  92        struct rt_mutex pi_mutex;
  93
  94        struct task_struct *owner;
  95        atomic_t refcount;
  96
  97        union futex_key key;
  98};
  99
 100/**
 101 * struct futex_q - The hashed futex queue entry, one per waiting task
 102 * @list:               priority-sorted list of tasks waiting on this futex
 103 * @task:               the task waiting on the futex
 104 * @lock_ptr:           the hash bucket lock
 105 * @key:                the key the futex is hashed on
 106 * @pi_state:           optional priority inheritance state
 107 * @rt_waiter:          rt_waiter storage for use with requeue_pi
 108 * @requeue_pi_key:     the requeue_pi target futex key
 109 * @bitset:             bitset for the optional bitmasked wakeup
 110 *
 111 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
 112 * we can wake only the relevant ones (hashed queues may be shared).
 113 *
 114 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 115 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 116 * The order of wakeup is always to make the first condition true, then
 117 * the second.
 118 *
 119 * PI futexes are typically woken before they are removed from the hash list via
 120 * the rt_mutex code. See unqueue_me_pi().
 121 */
 122struct futex_q {
 123        struct plist_node list;
 124
 125        struct task_struct *task;
 126        spinlock_t *lock_ptr;
 127        union futex_key key;
 128        struct futex_pi_state *pi_state;
 129        struct rt_mutex_waiter *rt_waiter;
 130        union futex_key *requeue_pi_key;
 131        u32 bitset;
 132};
 133
 134static const struct futex_q futex_q_init = {
  135        /* list gets initialized in queue_me() */
 136        .key = FUTEX_KEY_INIT,
 137        .bitset = FUTEX_BITSET_MATCH_ANY
 138};
 139
 140/*
 141 * Hash buckets are shared by all the futex_keys that hash to the same
 142 * location.  Each key may have multiple futex_q structures, one for each task
 143 * waiting on a futex.
 144 */
 145struct futex_hash_bucket {
 146        spinlock_t lock;
 147        struct plist_head chain;
 148};
 149
 150static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
 151
 152/*
 153 * We hash on the keys returned from get_futex_key (see below).
 154 */
 155static struct futex_hash_bucket *hash_futex(union futex_key *key)
 156{
 157        u32 hash = jhash2((u32*)&key->both.word,
 158                          (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
 159                          key->both.offset);
 160        return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
 161}
 162
 163/*
 164 * Return 1 if two futex_keys are equal, 0 otherwise.
 165 */
 166static inline int match_futex(union futex_key *key1, union futex_key *key2)
 167{
 168        return (key1 && key2
 169                && key1->both.word == key2->both.word
 170                && key1->both.ptr == key2->both.ptr
 171                && key1->both.offset == key2->both.offset);
 172}
 173
 174/*
 175 * Take a reference to the resource addressed by a key.
 176 * Can be called while holding spinlocks.
 177 *
 178 */
 179static void get_futex_key_refs(union futex_key *key)
 180{
 181        if (!key->both.ptr)
 182                return;
 183
 184        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 185        case FUT_OFF_INODE:
 186                ihold(key->shared.inode);
 187                break;
 188        case FUT_OFF_MMSHARED:
 189                atomic_inc(&key->private.mm->mm_count);
 190                break;
 191        }
 192}
 193
 194/*
 195 * Drop a reference to the resource addressed by a key.
 196 * The hash bucket spinlock must not be held.
 197 */
 198static void drop_futex_key_refs(union futex_key *key)
 199{
 200        if (!key->both.ptr) {
 201                /* If we're here then we tried to put a key we failed to get */
 202                WARN_ON_ONCE(1);
 203                return;
 204        }
 205
 206        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 207        case FUT_OFF_INODE:
 208                iput(key->shared.inode);
 209                break;
 210        case FUT_OFF_MMSHARED:
 211                mmdrop(key->private.mm);
 212                break;
 213        }
 214}
 215
 216/**
 217 * get_futex_key() - Get parameters which are the keys for a futex
 218 * @uaddr:      virtual address of the futex
 219 * @fshared:    0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
 220 * @key:        address where result is stored.
 221 * @rw:         mapping needs to be read/write (values: VERIFY_READ,
 222 *              VERIFY_WRITE)
 223 *
 224 * Returns a negative error code or 0
 225 * The key words are stored in *key on success.
 226 *
 227 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
 228 * offset_within_page).  For private mappings, it's (uaddr, current->mm).
 229 * We can usually work out the index without swapping in the page.
 230 *
 231 * lock_page() might sleep, the caller should not hold a spinlock.
 232 */
 233static int
 234get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 235{
 236        unsigned long address = (unsigned long)uaddr;
 237        struct mm_struct *mm = current->mm;
 238        struct page *page, *page_head;
 239        int err, ro = 0;
 240
 241        /*
 242         * The futex address must be "naturally" aligned.
 243         */
 244        key->both.offset = address % PAGE_SIZE;
 245        if (unlikely((address % sizeof(u32)) != 0))
 246                return -EINVAL;
 247        address -= key->both.offset;
 248
 249        /*
 250         * PROCESS_PRIVATE futexes are fast.
  251         * As the mm cannot disappear under us and the 'key' only needs
  252         * the virtual address, we don't even have to find the underlying vma.
  253         * Note: We do have to check that 'uaddr' is a valid user address,
  254         *       but access_ok() should be faster than find_vma().
 255         */
 256        if (!fshared) {
 257                if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
 258                        return -EFAULT;
 259                key->private.mm = mm;
 260                key->private.address = address;
 261                get_futex_key_refs(key);
 262                return 0;
 263        }
 264
 265again:
 266        err = get_user_pages_fast(address, 1, 1, &page);
 267        /*
  268         * If write access is not required (e.g. FUTEX_WAIT), try
  269         * to get read-only access.
 270         */
 271        if (err == -EFAULT && rw == VERIFY_READ) {
 272                err = get_user_pages_fast(address, 1, 0, &page);
 273                ro = 1;
 274        }
 275        if (err < 0)
 276                return err;
 277        else
 278                err = 0;
 279
 280#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 281        page_head = page;
 282        if (unlikely(PageTail(page))) {
 283                put_page(page);
 284                /* serialize against __split_huge_page_splitting() */
 285                local_irq_disable();
 286                if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
 287                        page_head = compound_head(page);
 288                        /*
 289                         * page_head is valid pointer but we must pin
 290                         * it before taking the PG_lock and/or
 291                         * PG_compound_lock. The moment we re-enable
 292                         * irqs __split_huge_page_splitting() can
 293                         * return and the head page can be freed from
 294                         * under us. We can't take the PG_lock and/or
 295                         * PG_compound_lock on a page that could be
 296                         * freed from under us.
 297                         */
 298                        if (page != page_head) {
 299                                get_page(page_head);
 300                                put_page(page);
 301                        }
 302                        local_irq_enable();
 303                } else {
 304                        local_irq_enable();
 305                        goto again;
 306                }
 307        }
 308#else
 309        page_head = compound_head(page);
 310        if (page != page_head) {
 311                get_page(page_head);
 312                put_page(page);
 313        }
 314#endif
 315
 316        lock_page(page_head);
 317
 318        /*
 319         * If page_head->mapping is NULL, then it cannot be a PageAnon
 320         * page; but it might be the ZERO_PAGE or in the gate area or
 321         * in a special mapping (all cases which we are happy to fail);
 322         * or it may have been a good file page when get_user_pages_fast
 323         * found it, but truncated or holepunched or subjected to
 324         * invalidate_complete_page2 before we got the page lock (also
 325         * cases which we are happy to fail).  And we hold a reference,
 326         * so refcount care in invalidate_complete_page's remove_mapping
 327         * prevents drop_caches from setting mapping to NULL beneath us.
 328         *
 329         * The case we do have to guard against is when memory pressure made
 330         * shmem_writepage move it from filecache to swapcache beneath us:
 331         * an unlikely race, but we do need to retry for page_head->mapping.
 332         */
 333        if (!page_head->mapping) {
 334                int shmem_swizzled = PageSwapCache(page_head);
 335                unlock_page(page_head);
 336                put_page(page_head);
 337                if (shmem_swizzled)
 338                        goto again;
 339                return -EFAULT;
 340        }
 341
 342        /*
 343         * Private mappings are handled in a simple way.
 344         *
 345         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 346         * it's a read-only handle, it's expected that futexes attach to
 347         * the object not the particular process.
 348         */
 349        if (PageAnon(page_head)) {
 350                /*
 351                 * A RO anonymous page will never change and thus doesn't make
 352                 * sense for futex operations.
 353                 */
 354                if (ro) {
 355                        err = -EFAULT;
 356                        goto out;
 357                }
 358
 359                key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 360                key->private.mm = mm;
 361                key->private.address = address;
 362        } else {
 363                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 364                key->shared.inode = page_head->mapping->host;
 365                key->shared.pgoff = page_head->index;
 366        }
 367
 368        get_futex_key_refs(key);
 369
 370out:
 371        unlock_page(page_head);
 372        put_page(page_head);
 373        return err;
 374}
 375
 376static inline void put_futex_key(union futex_key *key)
 377{
 378        drop_futex_key_refs(key);
 379}
 380
 381/**
 382 * fault_in_user_writeable() - Fault in user address and verify RW access
 383 * @uaddr:      pointer to faulting user space address
 384 *
 385 * Slow path to fixup the fault we just took in the atomic write
 386 * access to @uaddr.
 387 *
 388 * We have no generic implementation of a non-destructive write to the
  389 * user address. We know that we faulted in the atomic pagefault
  390 * disabled section so we might as well avoid the #PF overhead by
  391 * calling fixup_user_fault() right away.
 392 */
 393static int fault_in_user_writeable(u32 __user *uaddr)
 394{
 395        struct mm_struct *mm = current->mm;
 396        int ret;
 397
 398        down_read(&mm->mmap_sem);
 399        ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
 400                               FAULT_FLAG_WRITE);
 401        up_read(&mm->mmap_sem);
 402
 403        return ret < 0 ? ret : 0;
 404}
 405
 406/**
 407 * futex_top_waiter() - Return the highest priority waiter on a futex
 408 * @hb:         the hash bucket the futex_q's reside in
  409 * @key:        the futex key (to distinguish it from other futexes' futex_q's)
 410 *
 411 * Must be called with the hb lock held.
 412 */
 413static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
 414                                        union futex_key *key)
 415{
 416        struct futex_q *this;
 417
 418        plist_for_each_entry(this, &hb->chain, list) {
 419                if (match_futex(&this->key, key))
 420                        return this;
 421        }
 422        return NULL;
 423}
 424
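     /*
      * The two helpers below access the user space futex word while the hash
      * bucket lock is held, so page faults are disabled around the access.
      * On a fault they return an error and the caller must drop its locks,
      * fault the page in and retry.
      */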
 425static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
 426                                      u32 uval, u32 newval)
 427{
 428        int ret;
 429
 430        pagefault_disable();
 431        ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
 432        pagefault_enable();
 433
 434        return ret;
 435}
 436
 437static int get_futex_value_locked(u32 *dest, u32 __user *from)
 438{
 439        int ret;
 440
 441        pagefault_disable();
 442        ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
 443        pagefault_enable();
 444
 445        return ret ? -EFAULT : 0;
 446}
 447
 448
 449/*
 450 * PI code:
 451 */
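     /*
      * Make sure current has a pre-allocated pi_state cached before any hash
      * bucket locks are taken; alloc_pi_state() later hands it out without
      * sleeping.
      */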
 452static int refill_pi_state_cache(void)
 453{
 454        struct futex_pi_state *pi_state;
 455
 456        if (likely(current->pi_state_cache))
 457                return 0;
 458
 459        pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
 460
 461        if (!pi_state)
 462                return -ENOMEM;
 463
 464        INIT_LIST_HEAD(&pi_state->list);
 465        /* pi_mutex gets initialized later */
 466        pi_state->owner = NULL;
 467        atomic_set(&pi_state->refcount, 1);
 468        pi_state->key = FUTEX_KEY_INIT;
 469
 470        current->pi_state_cache = pi_state;
 471
 472        return 0;
 473}
 474
 475static struct futex_pi_state * alloc_pi_state(void)
 476{
 477        struct futex_pi_state *pi_state = current->pi_state_cache;
 478
 479        WARN_ON(!pi_state);
 480        current->pi_state_cache = NULL;
 481
 482        return pi_state;
 483}
 484
 485static void free_pi_state(struct futex_pi_state *pi_state)
 486{
 487        if (!atomic_dec_and_test(&pi_state->refcount))
 488                return;
 489
 490        /*
 491         * If pi_state->owner is NULL, the owner is most probably dying
 492         * and has cleaned up the pi_state already
 493         */
 494        if (pi_state->owner) {
 495                raw_spin_lock_irq(&pi_state->owner->pi_lock);
 496                list_del_init(&pi_state->list);
 497                raw_spin_unlock_irq(&pi_state->owner->pi_lock);
 498
 499                rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
 500        }
 501
 502        if (current->pi_state_cache)
 503                kfree(pi_state);
 504        else {
 505                /*
 506                 * pi_state->list is already empty.
 507                 * clear pi_state->owner.
 508                 * refcount is at 0 - put it back to 1.
 509                 */
 510                pi_state->owner = NULL;
 511                atomic_set(&pi_state->refcount, 1);
 512                current->pi_state_cache = pi_state;
 513        }
 514}
 515
 516/*
 517 * Look up the task based on what TID userspace gave us.
  518 * We don't trust it.
 519 */
 520static struct task_struct * futex_find_get_task(pid_t pid)
 521{
 522        struct task_struct *p;
 523
 524        rcu_read_lock();
 525        p = find_task_by_vpid(pid);
 526        if (p)
 527                get_task_struct(p);
 528
 529        rcu_read_unlock();
 530
 531        return p;
 532}
 533
 534/*
 535 * This task is holding PI mutexes at exit time => bad.
 536 * Kernel cleans up PI-state, but userspace is likely hosed.
 537 * (Robust-futex cleanup is separate and might save the day for userspace.)
 538 */
 539void exit_pi_state_list(struct task_struct *curr)
 540{
 541        struct list_head *next, *head = &curr->pi_state_list;
 542        struct futex_pi_state *pi_state;
 543        struct futex_hash_bucket *hb;
 544        union futex_key key = FUTEX_KEY_INIT;
 545
 546        if (!futex_cmpxchg_enabled)
 547                return;
 548        /*
 549         * We are a ZOMBIE and nobody can enqueue itself on
 550         * pi_state_list anymore, but we have to be careful
 551         * versus waiters unqueueing themselves:
 552         */
 553        raw_spin_lock_irq(&curr->pi_lock);
 554        while (!list_empty(head)) {
 555
 556                next = head->next;
 557                pi_state = list_entry(next, struct futex_pi_state, list);
 558                key = pi_state->key;
 559                hb = hash_futex(&key);
 560                raw_spin_unlock_irq(&curr->pi_lock);
 561
 562                spin_lock(&hb->lock);
 563
 564                raw_spin_lock_irq(&curr->pi_lock);
 565                /*
 566                 * We dropped the pi-lock, so re-check whether this
 567                 * task still owns the PI-state:
 568                 */
 569                if (head->next != next) {
 570                        spin_unlock(&hb->lock);
 571                        continue;
 572                }
 573
 574                WARN_ON(pi_state->owner != curr);
 575                WARN_ON(list_empty(&pi_state->list));
 576                list_del_init(&pi_state->list);
 577                pi_state->owner = NULL;
 578                raw_spin_unlock_irq(&curr->pi_lock);
 579
 580                rt_mutex_unlock(&pi_state->pi_mutex);
 581
 582                spin_unlock(&hb->lock);
 583
 584                raw_spin_lock_irq(&curr->pi_lock);
 585        }
 586        raw_spin_unlock_irq(&curr->pi_lock);
 587}
 588
 589static int
 590lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 591                union futex_key *key, struct futex_pi_state **ps)
 592{
 593        struct futex_pi_state *pi_state = NULL;
 594        struct futex_q *this, *next;
 595        struct plist_head *head;
 596        struct task_struct *p;
 597        pid_t pid = uval & FUTEX_TID_MASK;
 598
 599        head = &hb->chain;
 600
 601        plist_for_each_entry_safe(this, next, head, list) {
 602                if (match_futex(&this->key, key)) {
 603                        /*
 604                         * Another waiter already exists - bump up
 605                         * the refcount and return its pi_state:
 606                         */
 607                        pi_state = this->pi_state;
 608                        /*
 609                         * Userspace might have messed up non-PI and PI futexes
 610                         */
 611                        if (unlikely(!pi_state))
 612                                return -EINVAL;
 613
 614                        WARN_ON(!atomic_read(&pi_state->refcount));
 615
 616                        /*
 617                         * When pi_state->owner is NULL then the owner died
 618                         * and another waiter is on the fly. pi_state->owner
 619                         * is fixed up by the task which acquires
 620                         * pi_state->rt_mutex.
 621                         *
 622                         * We do not check for pid == 0 which can happen when
 623                         * the owner died and robust_list_exit() cleared the
 624                         * TID.
 625                         */
 626                        if (pid && pi_state->owner) {
 627                                /*
 628                                 * Bail out if user space manipulated the
 629                                 * futex value.
 630                                 */
 631                                if (pid != task_pid_vnr(pi_state->owner))
 632                                        return -EINVAL;
 633                        }
 634
 635                        atomic_inc(&pi_state->refcount);
 636                        *ps = pi_state;
 637
 638                        return 0;
 639                }
 640        }
 641
 642        /*
 643         * We are the first waiter - try to look up the real owner and attach
 644         * the new pi_state to it, but bail out when TID = 0
 645         */
 646        if (!pid)
 647                return -ESRCH;
 648        p = futex_find_get_task(pid);
 649        if (!p)
 650                return -ESRCH;
 651
 652        /*
  653         * We need to look at the task state flags to figure out
  654         * whether the task is exiting. To protect against do_exit()
  655         * changing the task flags, we do this check while holding
  656         * p->pi_lock:
 657         */
 658        raw_spin_lock_irq(&p->pi_lock);
 659        if (unlikely(p->flags & PF_EXITING)) {
 660                /*
 661                 * The task is on the way out. When PF_EXITPIDONE is
 662                 * set, we know that the task has finished the
 663                 * cleanup:
 664                 */
 665                int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
 666
 667                raw_spin_unlock_irq(&p->pi_lock);
 668                put_task_struct(p);
 669                return ret;
 670        }
 671
 672        pi_state = alloc_pi_state();
 673
 674        /*
 675         * Initialize the pi_mutex in locked state and make 'p'
 676         * the owner of it:
 677         */
 678        rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
 679
 680        /* Store the key for possible exit cleanups: */
 681        pi_state->key = *key;
 682
 683        WARN_ON(!list_empty(&pi_state->list));
 684        list_add(&pi_state->list, &p->pi_state_list);
 685        pi_state->owner = p;
 686        raw_spin_unlock_irq(&p->pi_lock);
 687
 688        put_task_struct(p);
 689
 690        *ps = pi_state;
 691
 692        return 0;
 693}
 694
 695/**
 696 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 697 * @uaddr:              the pi futex user address
 698 * @hb:                 the pi futex hash bucket
 699 * @key:                the futex key associated with uaddr and hb
 700 * @ps:                 the pi_state pointer where we store the result of the
 701 *                      lookup
 702 * @task:               the task to perform the atomic lock work for.  This will
 703 *                      be "current" except in the case of requeue pi.
 704 * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
 705 *
 706 * Returns:
 707 *  0 - ready to wait
 708 *  1 - acquired the lock
 709 * <0 - error
 710 *
 711 * The hb->lock and futex_key refs shall be held by the caller.
 712 */
 713static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 714                                union futex_key *key,
 715                                struct futex_pi_state **ps,
 716                                struct task_struct *task, int set_waiters)
 717{
 718        int lock_taken, ret, ownerdied = 0;
 719        u32 uval, newval, curval, vpid = task_pid_vnr(task);
 720
 721retry:
 722        ret = lock_taken = 0;
 723
 724        /*
 725         * To avoid races, we attempt to take the lock here again
 726         * (by doing a 0 -> TID atomic cmpxchg), while holding all
 727         * the locks. It will most likely not succeed.
 728         */
 729        newval = vpid;
 730        if (set_waiters)
 731                newval |= FUTEX_WAITERS;
 732
 733        if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
 734                return -EFAULT;
 735
 736        /*
 737         * Detect deadlocks.
 738         */
 739        if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
 740                return -EDEADLK;
 741
 742        /*
 743         * Surprise - we got the lock. Just return to userspace:
 744         */
 745        if (unlikely(!curval))
 746                return 1;
 747
 748        uval = curval;
 749
 750        /*
 751         * Set the FUTEX_WAITERS flag, so the owner will know it has someone
 752         * to wake at the next unlock.
 753         */
 754        newval = curval | FUTEX_WAITERS;
 755
 756        /*
  757         * There are two cases where we take over the futex: either we
  758         * observed the OWNER_DIED bit earlier (ownerdied is set), or the
  759         * owner TID is 0 because robust-futex cleanup already cleared it.
  760         * In both cases the previous owner is gone for good.
  761         *
  762         * This is safe as we are protected by the hash bucket lock!
 763         */
 764        if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
 765                /* Keep the OWNER_DIED bit */
 766                newval = (curval & ~FUTEX_TID_MASK) | vpid;
 767                ownerdied = 0;
 768                lock_taken = 1;
 769        }
 770
 771        if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
 772                return -EFAULT;
 773        if (unlikely(curval != uval))
 774                goto retry;
 775
 776        /*
  777         * We took the lock due to an owner-died takeover.
 778         */
 779        if (unlikely(lock_taken))
 780                return 1;
 781
 782        /*
  783         * We don't have the lock. Look up the PI state (or create it if
 784         * we are the first waiter):
 785         */
 786        ret = lookup_pi_state(uval, hb, key, ps);
 787
 788        if (unlikely(ret)) {
 789                switch (ret) {
 790                case -ESRCH:
 791                        /*
 792                         * No owner found for this futex. Check if the
 793                         * OWNER_DIED bit is set to figure out whether
 794                         * this is a robust futex or not.
 795                         */
 796                        if (get_futex_value_locked(&curval, uaddr))
 797                                return -EFAULT;
 798
 799                        /*
 800                         * We simply start over in case of a robust
 801                         * futex. The code above will take the futex
 802                         * and return happy.
 803                         */
 804                        if (curval & FUTEX_OWNER_DIED) {
 805                                ownerdied = 1;
 806                                goto retry;
 807                        }
 808                default:
 809                        break;
 810                }
 811        }
 812
 813        return ret;
 814}
 815
 816/**
 817 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
 818 * @q:  The futex_q to unqueue
 819 *
 820 * The q->lock_ptr must not be NULL and must be held by the caller.
 821 */
 822static void __unqueue_futex(struct futex_q *q)
 823{
 824        struct futex_hash_bucket *hb;
 825
 826        if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
 827            || WARN_ON(plist_node_empty(&q->list)))
 828                return;
 829
 830        hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
 831        plist_del(&q->list, &hb->chain);
 832}
 833
 834/*
 835 * The hash bucket lock must be held when this is called.
 836 * Afterwards, the futex_q must not be accessed.
 837 */
 838static void wake_futex(struct futex_q *q)
 839{
 840        struct task_struct *p = q->task;
 841
 842        /*
 843         * We set q->lock_ptr = NULL _before_ we wake up the task. If
 844         * a non-futex wake up happens on another CPU then the task
 845         * might exit and p would dereference a non-existing task
 846         * struct. Prevent this by holding a reference on p across the
 847         * wake up.
 848         */
 849        get_task_struct(p);
 850
 851        __unqueue_futex(q);
 852        /*
 853         * The waiting task can free the futex_q as soon as
 854         * q->lock_ptr = NULL is written, without taking any locks. A
 855         * memory barrier is required here to prevent the following
 856         * store to lock_ptr from getting ahead of the plist_del.
 857         */
 858        smp_wmb();
 859        q->lock_ptr = NULL;
 860
 861        wake_up_state(p, TASK_NORMAL);
 862        put_task_struct(p);
 863}
 864
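     /*
      * Called with the hash bucket lock held: pick the highest priority
      * rt_mutex waiter as the new owner, update the user space futex word
      * (unless OWNER_DIED is set), transfer the pi_state to the new owner
      * and unlock the rt_mutex to wake it.
      */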
 865static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 866{
 867        struct task_struct *new_owner;
 868        struct futex_pi_state *pi_state = this->pi_state;
 869        u32 uninitialized_var(curval), newval;
 870
 871        if (!pi_state)
 872                return -EINVAL;
 873
 874        /*
 875         * If current does not own the pi_state then the futex is
 876         * inconsistent and user space fiddled with the futex value.
 877         */
 878        if (pi_state->owner != current)
 879                return -EINVAL;
 880
 881        raw_spin_lock(&pi_state->pi_mutex.wait_lock);
 882        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 883
 884        /*
 885         * It is possible that the next waiter (the one that brought
 886         * this owner to the kernel) timed out and is no longer
 887         * waiting on the lock.
 888         */
 889        if (!new_owner)
 890                new_owner = this->task;
 891
 892        /*
 893         * We pass it to the next owner. (The WAITERS bit is always
 894         * kept enabled while there is PI state around. We must also
 895         * preserve the owner died bit.)
 896         */
 897        if (!(uval & FUTEX_OWNER_DIED)) {
 898                int ret = 0;
 899
 900                newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 901
 902                if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
 903                        ret = -EFAULT;
 904                else if (curval != uval)
 905                        ret = -EINVAL;
 906                if (ret) {
 907                        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
 908                        return ret;
 909                }
 910        }
 911
 912        raw_spin_lock_irq(&pi_state->owner->pi_lock);
 913        WARN_ON(list_empty(&pi_state->list));
 914        list_del_init(&pi_state->list);
 915        raw_spin_unlock_irq(&pi_state->owner->pi_lock);
 916
 917        raw_spin_lock_irq(&new_owner->pi_lock);
 918        WARN_ON(!list_empty(&pi_state->list));
 919        list_add(&pi_state->list, &new_owner->pi_state_list);
 920        pi_state->owner = new_owner;
 921        raw_spin_unlock_irq(&new_owner->pi_lock);
 922
 923        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
 924        rt_mutex_unlock(&pi_state->pi_mutex);
 925
 926        return 0;
 927}
 928
 929static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
 930{
 931        u32 uninitialized_var(oldval);
 932
 933        /*
 934         * There is no waiter, so we unlock the futex. The owner died
  935         * bit does not need to be preserved here. We are the owner:
 936         */
 937        if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
 938                return -EFAULT;
 939        if (oldval != uval)
 940                return -EAGAIN;
 941
 942        return 0;
 943}
 944
 945/*
  946 * Lock both hash buckets in address order to avoid ABBA deadlocks and express the locking dependencies for lockdep:
 947 */
 948static inline void
 949double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 950{
 951        if (hb1 <= hb2) {
 952                spin_lock(&hb1->lock);
 953                if (hb1 < hb2)
 954                        spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
 955        } else { /* hb1 > hb2 */
 956                spin_lock(&hb2->lock);
 957                spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
 958        }
 959}
 960
 961static inline void
 962double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 963{
 964        spin_unlock(&hb1->lock);
 965        if (hb1 != hb2)
 966                spin_unlock(&hb2->lock);
 967}
 968
 969/*
 970 * Wake up waiters matching bitset queued on this futex (uaddr).
 971 */
 972static int
 973futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 974{
 975        struct futex_hash_bucket *hb;
 976        struct futex_q *this, *next;
 977        struct plist_head *head;
 978        union futex_key key = FUTEX_KEY_INIT;
 979        int ret;
 980
 981        if (!bitset)
 982                return -EINVAL;
 983
 984        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
 985        if (unlikely(ret != 0))
 986                goto out;
 987
 988        hb = hash_futex(&key);
 989        spin_lock(&hb->lock);
 990        head = &hb->chain;
 991
 992        plist_for_each_entry_safe(this, next, head, list) {
 993                if (match_futex (&this->key, &key)) {
 994                        if (this->pi_state || this->rt_waiter) {
 995                                ret = -EINVAL;
 996                                break;
 997                        }
 998
 999                        /* Check if one of the bits is set in both bitsets */
1000                        if (!(this->bitset & bitset))
1001                                continue;
1002
1003                        wake_futex(this);
1004                        if (++ret >= nr_wake)
1005                                break;
1006                }
1007        }
1008
1009        spin_unlock(&hb->lock);
1010        put_futex_key(&key);
1011out:
1012        return ret;
1013}
1014
1015/*
 1016 * Wake up waiters on uaddr1, apply the encoded operation to *uaddr2 and,
 1017 * if the encoded comparison succeeds, wake waiters on uaddr2 as well:
1018 */
1019static int
1020futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1021              int nr_wake, int nr_wake2, int op)
1022{
1023        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1024        struct futex_hash_bucket *hb1, *hb2;
1025        struct plist_head *head;
1026        struct futex_q *this, *next;
1027        int ret, op_ret;
1028
1029retry:
1030        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1031        if (unlikely(ret != 0))
1032                goto out;
1033        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
1034        if (unlikely(ret != 0))
1035                goto out_put_key1;
1036
1037        hb1 = hash_futex(&key1);
1038        hb2 = hash_futex(&key2);
1039
1040retry_private:
1041        double_lock_hb(hb1, hb2);
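             /*
              * futex_atomic_op_inuser() applies the encoded FUTEX_OP_*
              * operation to *uaddr2 and returns the result of the encoded
              * comparison: > 0 if it holds, 0 if not, < 0 on a fault/error.
              */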
1042        op_ret = futex_atomic_op_inuser(op, uaddr2);
1043        if (unlikely(op_ret < 0)) {
1044
1045                double_unlock_hb(hb1, hb2);
1046
1047#ifndef CONFIG_MMU
1048                /*
1049                 * we don't get EFAULT from MMU faults if we don't have an MMU,
1050                 * but we might get them from range checking
1051                 */
1052                ret = op_ret;
1053                goto out_put_keys;
1054#endif
1055
1056                if (unlikely(op_ret != -EFAULT)) {
1057                        ret = op_ret;
1058                        goto out_put_keys;
1059                }
1060
1061                ret = fault_in_user_writeable(uaddr2);
1062                if (ret)
1063                        goto out_put_keys;
1064
1065                if (!(flags & FLAGS_SHARED))
1066                        goto retry_private;
1067
1068                put_futex_key(&key2);
1069                put_futex_key(&key1);
1070                goto retry;
1071        }
1072
1073        head = &hb1->chain;
1074
1075        plist_for_each_entry_safe(this, next, head, list) {
1076                if (match_futex (&this->key, &key1)) {
1077                        wake_futex(this);
1078                        if (++ret >= nr_wake)
1079                                break;
1080                }
1081        }
1082
1083        if (op_ret > 0) {
1084                head = &hb2->chain;
1085
1086                op_ret = 0;
1087                plist_for_each_entry_safe(this, next, head, list) {
1088                        if (match_futex (&this->key, &key2)) {
1089                                wake_futex(this);
1090                                if (++op_ret >= nr_wake2)
1091                                        break;
1092                        }
1093                }
1094                ret += op_ret;
1095        }
1096
1097        double_unlock_hb(hb1, hb2);
1098out_put_keys:
1099        put_futex_key(&key2);
1100out_put_key1:
1101        put_futex_key(&key1);
1102out:
1103        return ret;
1104}
1105
1106/**
1107 * requeue_futex() - Requeue a futex_q from one hb to another
1108 * @q:          the futex_q to requeue
1109 * @hb1:        the source hash_bucket
1110 * @hb2:        the target hash_bucket
1111 * @key2:       the new key for the requeued futex_q
1112 */
1113static inline
1114void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1115                   struct futex_hash_bucket *hb2, union futex_key *key2)
1116{
1117
1118        /*
1119         * If key1 and key2 hash to the same bucket, no need to
1120         * requeue.
1121         */
1122        if (likely(&hb1->chain != &hb2->chain)) {
1123                plist_del(&q->list, &hb1->chain);
1124                plist_add(&q->list, &hb2->chain);
1125                q->lock_ptr = &hb2->lock;
1126        }
1127        get_futex_key_refs(key2);
1128        q->key = *key2;
1129}
1130
1131/**
1132 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1133 * @q:          the futex_q
1134 * @key:        the key of the requeue target futex
1135 * @hb:         the hash_bucket of the requeue target futex
1136 *
1137 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1138 * target futex if it is uncontended or via a lock steal.  Set the futex_q key
1139 * to the requeue target futex so the waiter can detect the wakeup on the right
1140 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1141 * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
1142 * to protect access to the pi_state to fixup the owner later.  Must be called
1143 * with both q->lock_ptr and hb->lock held.
1144 */
1145static inline
1146void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1147                           struct futex_hash_bucket *hb)
1148{
1149        get_futex_key_refs(key);
1150        q->key = *key;
1151
1152        __unqueue_futex(q);
1153
1154        WARN_ON(!q->rt_waiter);
1155        q->rt_waiter = NULL;
1156
1157        q->lock_ptr = &hb->lock;
1158
1159        wake_up_state(q->task, TASK_NORMAL);
1160}
1161
1162/**
1163 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1164 * @pifutex:            the user address of the to futex
1165 * @hb1:                the from futex hash bucket, must be locked by the caller
1166 * @hb2:                the to futex hash bucket, must be locked by the caller
1167 * @key1:               the from futex key
1168 * @key2:               the to futex key
1169 * @ps:                 address to store the pi_state pointer
1170 * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
1171 *
1172 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1173 * Wake the top waiter if we succeed.  If the caller specified set_waiters,
1174 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1175 * hb1 and hb2 must be held by the caller.
1176 *
1177 * Returns:
 1178 *  0 - failed to acquire the lock atomically
1179 *  1 - acquired the lock
1180 * <0 - error
1181 */
1182static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1183                                 struct futex_hash_bucket *hb1,
1184                                 struct futex_hash_bucket *hb2,
1185                                 union futex_key *key1, union futex_key *key2,
1186                                 struct futex_pi_state **ps, int set_waiters)
1187{
1188        struct futex_q *top_waiter = NULL;
1189        u32 curval;
1190        int ret;
1191
1192        if (get_futex_value_locked(&curval, pifutex))
1193                return -EFAULT;
1194
1195        /*
1196         * Find the top_waiter and determine if there are additional waiters.
1197         * If the caller intends to requeue more than 1 waiter to pifutex,
1198         * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1199         * as we have means to handle the possible fault.  If not, don't set
 1200         * the bit unnecessarily as it will force the subsequent unlock to enter
1201         * the kernel.
1202         */
1203        top_waiter = futex_top_waiter(hb1, key1);
1204
1205        /* There are no waiters, nothing for us to do. */
1206        if (!top_waiter)
1207                return 0;
1208
1209        /* Ensure we requeue to the expected futex. */
1210        if (!match_futex(top_waiter->requeue_pi_key, key2))
1211                return -EINVAL;
1212
1213        /*
1214         * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
1215         * the contended case or if set_waiters is 1.  The pi_state is returned
1216         * in ps in contended cases.
1217         */
1218        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1219                                   set_waiters);
1220        if (ret == 1)
1221                requeue_pi_wake_futex(top_waiter, key2, hb2);
1222
1223        return ret;
1224}
1225
1226/**
1227 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1228 * @uaddr1:     source futex user address
1229 * @flags:      futex flags (FLAGS_SHARED, etc.)
1230 * @uaddr2:     target futex user address
1231 * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
1232 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1233 * @cmpval:     @uaddr1 expected value (or %NULL)
1234 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1235 *              pi futex (pi to pi requeue is not supported)
1236 *
1237 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1238 * uaddr2 atomically on behalf of the top waiter.
1239 *
1240 * Returns:
1241 * >=0 - on success, the number of tasks requeued or woken
1242 *  <0 - on error
1243 */
1244static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1245                         u32 __user *uaddr2, int nr_wake, int nr_requeue,
1246                         u32 *cmpval, int requeue_pi)
1247{
1248        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1249        int drop_count = 0, task_count = 0, ret;
1250        struct futex_pi_state *pi_state = NULL;
1251        struct futex_hash_bucket *hb1, *hb2;
1252        struct plist_head *head1;
1253        struct futex_q *this, *next;
1254        u32 curval2;
1255
1256        if (requeue_pi) {
1257                /*
1258                 * requeue_pi requires a pi_state, try to allocate it now
1259                 * without any locks in case it fails.
1260                 */
1261                if (refill_pi_state_cache())
1262                        return -ENOMEM;
1263                /*
1264                 * requeue_pi must wake as many tasks as it can, up to nr_wake
1265                 * + nr_requeue, since it acquires the rt_mutex prior to
1266                 * returning to userspace, so as to not leave the rt_mutex with
1267                 * waiters and no owner.  However, second and third wake-ups
1268                 * cannot be predicted as they involve race conditions with the
1269                 * first wake and a fault while looking up the pi_state.  Both
1270                 * pthread_cond_signal() and pthread_cond_broadcast() should
1271                 * use nr_wake=1.
1272                 */
1273                if (nr_wake != 1)
1274                        return -EINVAL;
1275        }
1276
1277retry:
1278        if (pi_state != NULL) {
1279                /*
1280                 * We will have to lookup the pi_state again, so free this one
1281                 * to keep the accounting correct.
1282                 */
1283                free_pi_state(pi_state);
1284                pi_state = NULL;
1285        }
1286
1287        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1288        if (unlikely(ret != 0))
1289                goto out;
1290        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1291                            requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1292        if (unlikely(ret != 0))
1293                goto out_put_key1;
1294
1295        hb1 = hash_futex(&key1);
1296        hb2 = hash_futex(&key2);
1297
1298retry_private:
1299        double_lock_hb(hb1, hb2);
1300
1301        if (likely(cmpval != NULL)) {
1302                u32 curval;
1303
1304                ret = get_futex_value_locked(&curval, uaddr1);
1305
1306                if (unlikely(ret)) {
1307                        double_unlock_hb(hb1, hb2);
1308
1309                        ret = get_user(curval, uaddr1);
1310                        if (ret)
1311                                goto out_put_keys;
1312
1313                        if (!(flags & FLAGS_SHARED))
1314                                goto retry_private;
1315
1316                        put_futex_key(&key2);
1317                        put_futex_key(&key1);
1318                        goto retry;
1319                }
1320                if (curval != *cmpval) {
1321                        ret = -EAGAIN;
1322                        goto out_unlock;
1323                }
1324        }
1325
1326        if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1327                /*
1328                 * Attempt to acquire uaddr2 and wake the top waiter. If we
1329                 * intend to requeue waiters, force setting the FUTEX_WAITERS
1330                 * bit.  We force this here where we are able to easily handle
 1331                 * faults rather than in the requeue loop below.
1332                 */
1333                ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1334                                                 &key2, &pi_state, nr_requeue);
1335
1336                /*
1337                 * At this point the top_waiter has either taken uaddr2 or is
1338                 * waiting on it.  If the former, then the pi_state will not
1339                 * exist yet, look it up one more time to ensure we have a
1340                 * reference to it.
1341                 */
1342                if (ret == 1) {
1343                        WARN_ON(pi_state);
1344                        drop_count++;
1345                        task_count++;
1346                        ret = get_futex_value_locked(&curval2, uaddr2);
1347                        if (!ret)
1348                                ret = lookup_pi_state(curval2, hb2, &key2,
1349                                                      &pi_state);
1350                }
1351
1352                switch (ret) {
1353                case 0:
1354                        break;
1355                case -EFAULT:
1356                        double_unlock_hb(hb1, hb2);
1357                        put_futex_key(&key2);
1358                        put_futex_key(&key1);
1359                        ret = fault_in_user_writeable(uaddr2);
1360                        if (!ret)
1361                                goto retry;
1362                        goto out;
1363                case -EAGAIN:
1364                        /* The owner was exiting, try again. */
1365                        double_unlock_hb(hb1, hb2);
1366                        put_futex_key(&key2);
1367                        put_futex_key(&key1);
1368                        cond_resched();
1369                        goto retry;
1370                default:
1371                        goto out_unlock;
1372                }
1373        }
1374
1375        head1 = &hb1->chain;
1376        plist_for_each_entry_safe(this, next, head1, list) {
1377                if (task_count - nr_wake >= nr_requeue)
1378                        break;
1379
1380                if (!match_futex(&this->key, &key1))
1381                        continue;
1382
1383                /*
 1384                 * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1385                 * be paired with each other and no other futex ops.
1386                 */
1387                if ((requeue_pi && !this->rt_waiter) ||
1388                    (!requeue_pi && this->rt_waiter)) {
1389                        ret = -EINVAL;
1390                        break;
1391                }
1392
1393                /*
1394                 * Wake nr_wake waiters.  For requeue_pi, if we acquired the
1395                 * lock, we already woke the top_waiter.  If not, it will be
1396                 * woken by futex_unlock_pi().
1397                 */
1398                if (++task_count <= nr_wake && !requeue_pi) {
1399                        wake_futex(this);
1400                        continue;
1401                }
1402
1403                /* Ensure we requeue to the expected futex for requeue_pi. */
1404                if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1405                        ret = -EINVAL;
1406                        break;
1407                }
1408
1409                /*
1410                 * Requeue nr_requeue waiters and possibly one more in the case
1411                 * of requeue_pi if we couldn't acquire the lock atomically.
1412                 */
1413                if (requeue_pi) {
1414                        /* Prepare the waiter to take the rt_mutex. */
1415                        atomic_inc(&pi_state->refcount);
1416                        this->pi_state = pi_state;
1417                        ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1418                                                        this->rt_waiter,
1419                                                        this->task, 1);
1420                        if (ret == 1) {
1421                                /* We got the lock. */
1422                                requeue_pi_wake_futex(this, &key2, hb2);
1423                                drop_count++;
1424                                continue;
1425                        } else if (ret) {
1426                                /* -EDEADLK */
1427                                this->pi_state = NULL;
1428                                free_pi_state(pi_state);
1429                                goto out_unlock;
1430                        }
1431                }
1432                requeue_futex(this, hb1, hb2, &key2);
1433                drop_count++;
1434        }
1435
1436out_unlock:
1437        double_unlock_hb(hb1, hb2);
1438
1439        /*
1440         * drop_futex_key_refs() must be called outside the spinlocks. During
1441         * the requeue we moved futex_q's from the hash bucket at key1 to the
1442         * one at key2 and updated their key pointer.  We no longer need to
1443         * hold the references to key1.
1444         */
1445        while (--drop_count >= 0)
1446                drop_futex_key_refs(&key1);
1447
1448out_put_keys:
1449        put_futex_key(&key2);
1450out_put_key1:
1451        put_futex_key(&key1);
1452out:
1453        if (pi_state != NULL)
1454                free_pi_state(pi_state);
1455        return ret ? ret : task_count;
1456}
1457
1458/* The key must be already stored in q->key. */
1459static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1460        __acquires(&hb->lock)
1461{
1462        struct futex_hash_bucket *hb;
1463
1464        hb = hash_futex(&q->key);
1465        q->lock_ptr = &hb->lock;
1466
1467        spin_lock(&hb->lock);
1468        return hb;
1469}
1470
1471static inline void
1472queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1473        __releases(&hb->lock)
1474{
1475        spin_unlock(&hb->lock);
1476}
1477
1478/**
1479 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1480 * @q:  The futex_q to enqueue
1481 * @hb: The destination hash bucket
1482 *
1483 * The hb->lock must be held by the caller, and is released here. A call to
1484 * queue_me() is typically paired with exactly one call to unqueue_me().  The
1485 * exceptions involve the PI related operations, which may use unqueue_me_pi()
1486 * or nothing if the unqueue is done as part of the wake process and the unqueue
1487 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
1488 * an example).
1489 */
1490static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1491        __releases(&hb->lock)
1492{
1493        int prio;
1494
1495        /*
1496         * The priority used to register this element is
1497         * - either the real thread-priority for the real-time threads
1498         * (i.e. threads with a priority lower than MAX_RT_PRIO)
1499         * - or MAX_RT_PRIO for non-RT threads.
1500         * Thus, all RT-threads are woken first in priority order, and
1501         * the others are woken last, in FIFO order.
1502         */
1503        prio = min(current->normal_prio, MAX_RT_PRIO);
1504
1505        plist_node_init(&q->list, prio);
1506        plist_add(&q->list, &hb->chain);
1507        q->task = current;
1508        spin_unlock(&hb->lock);
1509}
1510
1511/**
1512 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1513 * @q:  The futex_q to unqueue
1514 *
1515 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1516 * be paired with exactly one earlier call to queue_me().
1517 *
1518 * Returns:
1519 *   1 - if the futex_q was still queued (and we unqueued it)
1520 *   0 - if the futex_q was already removed by the waking thread
1521 */
1522static int unqueue_me(struct futex_q *q)
1523{
1524        spinlock_t *lock_ptr;
1525        int ret = 0;
1526
1527        /* In the common case we don't take the spinlock, which is nice. */
1528retry:
1529        lock_ptr = q->lock_ptr;
1530        barrier();
1531        if (lock_ptr != NULL) {
1532                spin_lock(lock_ptr);
1533                /*
1534                 * q->lock_ptr can change between reading it and
1535                 * spin_lock(), causing us to take the wrong lock.  This
1536                 * corrects the race condition.
1537                 *
1538                 * Reasoning goes like this: if we have the wrong lock,
1539                 * q->lock_ptr must have changed (maybe several times)
1540                 * between reading it and the spin_lock().  It can
1541                 * change again after the spin_lock() but only if it was
1542                 * already changed before the spin_lock().  It cannot,
1543                 * however, change back to the original value.  Therefore
1544                 * we can detect whether we acquired the correct lock.
1545                 */
1546                if (unlikely(lock_ptr != q->lock_ptr)) {
1547                        spin_unlock(lock_ptr);
1548                        goto retry;
1549                }
1550                __unqueue_futex(q);
1551
1552                BUG_ON(q->pi_state);
1553
1554                spin_unlock(lock_ptr);
1555                ret = 1;
1556        }
1557
1558        drop_futex_key_refs(&q->key);
1559        return ret;
1560}
1561
1562/*
1563 * PI futexes cannot be requeued and must remove themselves from the
1564 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1565 * and dropped here.
1566 */
1567static void unqueue_me_pi(struct futex_q *q)
1568        __releases(q->lock_ptr)
1569{
1570        __unqueue_futex(q);
1571
1572        BUG_ON(!q->pi_state);
1573        free_pi_state(q->pi_state);
1574        q->pi_state = NULL;
1575
1576        spin_unlock(q->lock_ptr);
1577}
1578
1579/*
1580 * Fixup the pi_state owner with the new owner.
1581 *
1582 * Must be called with hash bucket lock held and mm->sem held for non
1583 * private futexes.
1584 */
1585static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1586                                struct task_struct *newowner)
1587{
1588        u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1589        struct futex_pi_state *pi_state = q->pi_state;
1590        struct task_struct *oldowner = pi_state->owner;
1591        u32 uval, uninitialized_var(curval), newval;
1592        int ret;
1593
1594        /* Owner died? */
1595        if (!pi_state->owner)
1596                newtid |= FUTEX_OWNER_DIED;
1597
1598        /*
1599         * We are here either because we stole the rtmutex from the
1600         * previous highest priority waiter or we are the highest priority
1601         * waiter but failed to get the rtmutex the first time.
1602         * We have to replace the newowner TID in the user space variable.
1603         * This must be atomic as we have to preserve the owner died bit here.
1604         *
1605         * Note: We write the user space value _before_ changing the pi_state
1606         * because we can fault here. Imagine swapped out pages or a fork
1607         * that marked all the anonymous memory readonly for cow.
1608         *
1609         * Modifying pi_state _before_ the user space value would
1610         * leave the pi_state in an inconsistent state when we fault
1611         * here, because we need to drop the hash bucket lock to
1612         * handle the fault. This might be observed in the PID check
1613         * in lookup_pi_state.
1614         */
1615retry:
1616        if (get_futex_value_locked(&uval, uaddr))
1617                goto handle_fault;
1618
1619        while (1) {
1620                newval = (uval & FUTEX_OWNER_DIED) | newtid;
1621
1622                if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1623                        goto handle_fault;
1624                if (curval == uval)
1625                        break;
1626                uval = curval;
1627        }
1628
1629        /*
1630         * We fixed up user space. Now we need to fix the pi_state
1631         * itself.
1632         */
1633        if (pi_state->owner != NULL) {
1634                raw_spin_lock_irq(&pi_state->owner->pi_lock);
1635                WARN_ON(list_empty(&pi_state->list));
1636                list_del_init(&pi_state->list);
1637                raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1638        }
1639
1640        pi_state->owner = newowner;
1641
1642        raw_spin_lock_irq(&newowner->pi_lock);
1643        WARN_ON(!list_empty(&pi_state->list));
1644        list_add(&pi_state->list, &newowner->pi_state_list);
1645        raw_spin_unlock_irq(&newowner->pi_lock);
1646        return 0;
1647
1648        /*
1649         * To handle the page fault we need to drop the hash bucket
1650         * lock here. That gives the other task (either the highest priority
1651         * waiter itself or the task which stole the rtmutex) the
1652         * chance to try the fixup of the pi_state. So once we are
1653         * back from handling the fault we need to check the pi_state
1654         * after reacquiring the hash bucket lock and before trying to
1655         * do another fixup. When the fixup has been done already we
1656         * simply return.
1657         */
1658handle_fault:
1659        spin_unlock(q->lock_ptr);
1660
1661        ret = fault_in_user_writeable(uaddr);
1662
1663        spin_lock(q->lock_ptr);
1664
1665        /*
1666         * Check if someone else fixed it for us:
1667         */
1668        if (pi_state->owner != oldowner)
1669                return 0;
1670
1671        if (ret)
1672                return ret;
1673
1674        goto retry;
1675}
1676
1677static long futex_wait_restart(struct restart_block *restart);
1678
1679/**
1680 * fixup_owner() - Post lock pi_state and corner case management
1681 * @uaddr:      user address of the futex
1682 * @q:          futex_q (contains pi_state and access to the rt_mutex)
1683 * @locked:     if the attempt to take the rt_mutex succeeded (1) or not (0)
1684 *
1685 * After attempting to lock an rt_mutex, this function is called to cleanup
1686 * the pi_state owner as well as handle race conditions that may allow us to
1687 * acquire the lock. Must be called with the hb lock held.
1688 *
1689 * Returns:
1690 *  1 - success, lock taken
1691 *  0 - success, lock not taken
1692 * <0 - on error (-EFAULT)
1693 */
1694static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1695{
1696        struct task_struct *owner;
1697        int ret = 0;
1698
1699        if (locked) {
1700                /*
1701                 * Got the lock. We might not be the anticipated owner if we
1702                 * did a lock-steal - fix up the PI-state in that case:
1703                 */
1704                if (q->pi_state->owner != current)
1705                        ret = fixup_pi_state_owner(uaddr, q, current);
1706                goto out;
1707        }
1708
1709        /*
1710         * Catch the rare case, where the lock was released when we were on the
1711         * way back before we locked the hash bucket.
1712         */
1713        if (q->pi_state->owner == current) {
1714                /*
1715                 * Try to get the rt_mutex now. This might fail as some other
1716                 * task acquired the rt_mutex after we removed ourself from the
1717                 * rt_mutex waiters list.
1718                 */
1719                if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1720                        locked = 1;
1721                        goto out;
1722                }
1723
1724                /*
1725                 * pi_state is incorrect, some other task did a lock steal and
1726                 * we returned due to timeout or signal without taking the
1727                 * rt_mutex. Too late.
1728                 */
1729                raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1730                owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1731                if (!owner)
1732                        owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1733                raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1734                ret = fixup_pi_state_owner(uaddr, q, owner);
1735                goto out;
1736        }
1737
1738        /*
1739         * Paranoia check. If we did not take the lock, then we should not be
1740         * the owner of the rt_mutex.
1741         */
1742        if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1743                printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1744                                "pi-state %p\n", ret,
1745                                q->pi_state->pi_mutex.owner,
1746                                q->pi_state->owner);
1747
1748out:
1749        return ret ? ret : locked;
1750}
1751
1752/**
1753 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1754 * @hb:         the futex hash bucket, must be locked by the caller
1755 * @q:          the futex_q to queue up on
1756 * @timeout:    the prepared hrtimer_sleeper, or null for no timeout
1757 */
1758static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1759                                struct hrtimer_sleeper *timeout)
1760{
1761        /*
1762         * The task state is guaranteed to be set before another task can
1763         * wake it. set_current_state() is implemented using set_mb() and
1764         * queue_me() calls spin_unlock() upon completion, both serializing
1765         * access to the hash list and forcing another memory barrier.
1766         */
1767        set_current_state(TASK_INTERRUPTIBLE);
1768        queue_me(q, hb);
1769
1770        /* Arm the timer */
1771        if (timeout) {
1772                hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1773                if (!hrtimer_active(&timeout->timer))
1774                        timeout->task = NULL;
1775        }
1776
1777        /*
1778         * If we have been removed from the hash list, then another task
1779         * has tried to wake us, and we can skip the call to schedule().
1780         */
1781        if (likely(!plist_node_empty(&q->list))) {
1782                /*
1783                 * If the timer has already expired, current will already be
1784                 * flagged for rescheduling. Only call schedule if there
1785                 * is no timeout, or if it has yet to expire.
1786                 */
1787                if (!timeout || timeout->task)
1788                        schedule();
1789        }
1790        __set_current_state(TASK_RUNNING);
1791}
1792
1793/**
1794 * futex_wait_setup() - Prepare to wait on a futex
1795 * @uaddr:      the futex userspace address
1796 * @val:        the expected value
1797 * @flags:      futex flags (FLAGS_SHARED, etc.)
1798 * @q:          the associated futex_q
1799 * @hb:         storage for hash_bucket pointer to be returned to caller
1800 *
1801 * Setup the futex_q and locate the hash_bucket.  Get the futex value and
1802 * compare it with the expected value.  Handle atomic faults internally.
1803 * Return with the hb lock held and a q.key reference on success, and unlocked
1804 * with no q.key reference on failure.
1805 *
1806 * Returns:
1807 *  0 - uaddr contains val and hb has been locked
1808 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1809 */
1810static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1811                           struct futex_q *q, struct futex_hash_bucket **hb)
1812{
1813        u32 uval;
1814        int ret;
1815
1816        /*
1817         * Access the page AFTER the hash-bucket is locked.
1818         * Order is important:
1819         *
1820         *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
1821         *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
1822         *
1823         * The basic logical guarantee of a futex is that it blocks ONLY
1824         * if cond(var) is known to be true at the time of blocking, for
1825         * any cond.  If we locked the hash-bucket after testing *uaddr, that
1826         * would open a race condition where we could block indefinitely with
1827         * cond(var) false, which would violate the guarantee.
1828         *
1829         * On the other hand, we insert q and release the hash-bucket only
1830         * after testing *uaddr.  This guarantees that futex_wait() will NOT
1831         * absorb a wakeup if *uaddr does not match the desired value
1832         * while the syscall executes.
1833         */
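        /*
         * For illustration only, a minimal userspace sketch of the
         * waiter/waker pattern above, written against the raw syscall
         * (assumes <linux/futex.h>, <sys/syscall.h> and <unistd.h>; the
         * helper names futex_wait()/futex_wake() are made up here):
         *
         *   static int futex_wait(unsigned int *uaddr, unsigned int val)
         *   {
         *           // Sleeps only while *uaddr still equals val, which is
         *           // exactly the test done below under the hb lock.
         *           return syscall(SYS_futex, uaddr, FUTEX_WAIT, val,
         *                          NULL, NULL, 0);
         *   }
         *
         *   static int futex_wake(unsigned int *uaddr, int nr)
         *   {
         *           // Wakes up to nr tasks blocked on uaddr.
         *           return syscall(SYS_futex, uaddr, FUTEX_WAKE, nr,
         *                          NULL, NULL, 0);
         *   }
         */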
1834retry:
1835        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
1836        if (unlikely(ret != 0))
1837                return ret;
1838
1839retry_private:
1840        *hb = queue_lock(q);
1841
1842        ret = get_futex_value_locked(&uval, uaddr);
1843
1844        if (ret) {
1845                queue_unlock(q, *hb);
1846
1847                ret = get_user(uval, uaddr);
1848                if (ret)
1849                        goto out;
1850
1851                if (!(flags & FLAGS_SHARED))
1852                        goto retry_private;
1853
1854                put_futex_key(&q->key);
1855                goto retry;
1856        }
1857
1858        if (uval != val) {
1859                queue_unlock(q, *hb);
1860                ret = -EWOULDBLOCK;
1861        }
1862
1863out:
1864        if (ret)
1865                put_futex_key(&q->key);
1866        return ret;
1867}
1868
1869static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
1870                      ktime_t *abs_time, u32 bitset)
1871{
1872        struct hrtimer_sleeper timeout, *to = NULL;
1873        struct restart_block *restart;
1874        struct futex_hash_bucket *hb;
1875        struct futex_q q = futex_q_init;
1876        int ret;
1877
1878        if (!bitset)
1879                return -EINVAL;
1880        q.bitset = bitset;
1881
1882        if (abs_time) {
1883                to = &timeout;
1884
1885                hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
1886                                      CLOCK_REALTIME : CLOCK_MONOTONIC,
1887                                      HRTIMER_MODE_ABS);
1888                hrtimer_init_sleeper(to, current);
1889                hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1890                                             current->timer_slack_ns);
1891        }
1892
1893retry:
1894        /*
1895         * Prepare to wait on uaddr. On success, holds hb lock and increments
1896         * q.key refs.
1897         */
1898        ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
1899        if (ret)
1900                goto out;
1901
1902        /* queue_me and wait for wakeup, timeout, or a signal. */
1903        futex_wait_queue_me(hb, &q, to);
1904
1905        /* If we were woken (and unqueued), we succeeded, whatever. */
1906        ret = 0;
1907        /* unqueue_me() drops q.key ref */
1908        if (!unqueue_me(&q))
1909                goto out;
1910        ret = -ETIMEDOUT;
1911        if (to && !to->task)
1912                goto out;
1913
1914        /*
1915         * We expect signal_pending(current), but we might be the
1916         * victim of a spurious wakeup as well.
1917         */
1918        if (!signal_pending(current))
1919                goto retry;
1920
1921        ret = -ERESTARTSYS;
1922        if (!abs_time)
1923                goto out;
1924
1925        restart = &current_thread_info()->restart_block;
1926        restart->fn = futex_wait_restart;
1927        restart->futex.uaddr = uaddr;
1928        restart->futex.val = val;
1929        restart->futex.time = abs_time->tv64;
1930        restart->futex.bitset = bitset;
1931        restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
1932
1933        ret = -ERESTART_RESTARTBLOCK;
1934
1935out:
1936        if (to) {
1937                hrtimer_cancel(&to->timer);
1938                destroy_hrtimer_on_stack(&to->timer);
1939        }
1940        return ret;
1941}
1942
1943
1944static long futex_wait_restart(struct restart_block *restart)
1945{
1946        u32 __user *uaddr = restart->futex.uaddr;
1947        ktime_t t, *tp = NULL;
1948
1949        if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
1950                t.tv64 = restart->futex.time;
1951                tp = &t;
1952        }
1953        restart->fn = do_no_restart_syscall;
1954
1955        return (long)futex_wait(uaddr, restart->futex.flags,
1956                                restart->futex.val, tp, restart->futex.bitset);
1957}
1958
1959
1960/*
1961 * Userspace tried a 0 -> TID atomic transition of the futex value
1962 * and failed. The kernel side here does the whole locking operation:
1963 * if there are waiters then it will block, it does PI, etc. (Due to
1964 * races the kernel might see a 0 value of the futex too.)
1965 */
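/*
 * For illustration only, a sketch of the userspace fast path that leads
 * here (assumes <linux/futex.h>, <sys/syscall.h>, <unistd.h>, a gettid()
 * wrapper and the GCC/Clang __atomic builtins; pi_lock() is a made-up
 * helper name):
 *
 *   static void pi_lock(unsigned int *uaddr)
 *   {
 *           unsigned int zero = 0;
 *           unsigned int tid = gettid();
 *
 *           // Uncontended case: 0 -> TID entirely in userspace.
 *           if (__atomic_compare_exchange_n(uaddr, &zero, tid, 0,
 *                                           __ATOMIC_ACQUIRE,
 *                                           __ATOMIC_RELAXED))
 *                   return;
 *
 *           // Contended or racy case: the kernel does the rest.
 *           syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *   }
 */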
1966static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1967                         ktime_t *time, int trylock)
1968{
1969        struct hrtimer_sleeper timeout, *to = NULL;
1970        struct futex_hash_bucket *hb;
1971        struct futex_q q = futex_q_init;
1972        int res, ret;
1973
1974        if (refill_pi_state_cache())
1975                return -ENOMEM;
1976
1977        if (time) {
1978                to = &timeout;
1979                hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
1980                                      HRTIMER_MODE_ABS);
1981                hrtimer_init_sleeper(to, current);
1982                hrtimer_set_expires(&to->timer, *time);
1983        }
1984
1985retry:
1986        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
1987        if (unlikely(ret != 0))
1988                goto out;
1989
1990retry_private:
1991        hb = queue_lock(&q);
1992
1993        ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
1994        if (unlikely(ret)) {
1995                switch (ret) {
1996                case 1:
1997                        /* We got the lock. */
1998                        ret = 0;
1999                        goto out_unlock_put_key;
2000                case -EFAULT:
2001                        goto uaddr_faulted;
2002                case -EAGAIN:
2003                        /*
2004                         * Task is exiting and we just wait for the
2005                         * exit to complete.
2006                         */
2007                        queue_unlock(&q, hb);
2008                        put_futex_key(&q.key);
2009                        cond_resched();
2010                        goto retry;
2011                default:
2012                        goto out_unlock_put_key;
2013                }
2014        }
2015
2016        /*
2017         * Only actually queue now that the atomic ops are done:
2018         */
2019        queue_me(&q, hb);
2020
2021        WARN_ON(!q.pi_state);
2022        /*
2023         * Block on the PI mutex:
2024         */
2025        if (!trylock)
2026                ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
2027        else {
2028                ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2029                /* Fixup the trylock return value: */
2030                ret = ret ? 0 : -EWOULDBLOCK;
2031        }
2032
2033        spin_lock(q.lock_ptr);
2034        /*
2035         * Fixup the pi_state owner and possibly acquire the lock if we
2036         * haven't already.
2037         */
2038        res = fixup_owner(uaddr, &q, !ret);
2039        /*
2040         * If fixup_owner() returned an error, propagate that.  If it acquired
2041         * the lock, clear our -ETIMEDOUT or -EINTR.
2042         */
2043        if (res)
2044                ret = (res < 0) ? res : 0;
2045
2046        /*
2047         * If fixup_owner() faulted and was unable to handle the fault, unlock
2048         * it and return the fault to userspace.
2049         */
2050        if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
2051                rt_mutex_unlock(&q.pi_state->pi_mutex);
2052
2053        /* Unqueue and drop the lock */
2054        unqueue_me_pi(&q);
2055
2056        goto out_put_key;
2057
2058out_unlock_put_key:
2059        queue_unlock(&q, hb);
2060
2061out_put_key:
2062        put_futex_key(&q.key);
2063out:
2064        if (to)
2065                destroy_hrtimer_on_stack(&to->timer);
2066        return ret != -EINTR ? ret : -ERESTARTNOINTR;
2067
2068uaddr_faulted:
2069        queue_unlock(&q, hb);
2070
2071        ret = fault_in_user_writeable(uaddr);
2072        if (ret)
2073                goto out_put_key;
2074
2075        if (!(flags & FLAGS_SHARED))
2076                goto retry_private;
2077
2078        put_futex_key(&q.key);
2079        goto retry;
2080}
2081
2082/*
2083 * Userspace attempted a TID -> 0 atomic transition, and failed.
2084 * This is the in-kernel slowpath: we look up the PI state (if any),
2085 * and do the rt-mutex unlock.
2086 */
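/*
 * For illustration only, the matching userspace unlock fast path (same
 * assumptions as the pi_lock() sketch above; pi_unlock() is a made-up
 * helper name):
 *
 *   static void pi_unlock(unsigned int *uaddr)
 *   {
 *           unsigned int tid = gettid();
 *
 *           // Uncontended case: TID -> 0 succeeds only while neither
 *           // FUTEX_WAITERS nor FUTEX_OWNER_DIED is set.
 *           if (__atomic_compare_exchange_n(uaddr, &tid, 0, 0,
 *                                           __ATOMIC_RELEASE,
 *                                           __ATOMIC_RELAXED))
 *                   return;
 *
 *           // Waiters are queued in the kernel: take the slowpath below.
 *           syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 *   }
 */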
2087static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2088{
2089        struct futex_hash_bucket *hb;
2090        struct futex_q *this, *next;
2091        struct plist_head *head;
2092        union futex_key key = FUTEX_KEY_INIT;
2093        u32 uval, vpid = task_pid_vnr(current);
2094        int ret;
2095
2096retry:
2097        if (get_user(uval, uaddr))
2098                return -EFAULT;
2099        /*
2100         * We release only a lock we actually own:
2101         */
2102        if ((uval & FUTEX_TID_MASK) != vpid)
2103                return -EPERM;
2104
2105        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2106        if (unlikely(ret != 0))
2107                goto out;
2108
2109        hb = hash_futex(&key);
2110        spin_lock(&hb->lock);
2111
2112        /*
2113         * To avoid races, try to do the TID -> 0 atomic transition
2114         * again. If it succeeds then we can return without waking
2115         * anyone else up:
2116         */
2117        if (!(uval & FUTEX_OWNER_DIED) &&
2118            cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2119                goto pi_faulted;
2120        /*
2121         * Rare case: we managed to release the lock atomically,
2122         * no need to wake anyone else up:
2123         */
2124        if (unlikely(uval == vpid))
2125                goto out_unlock;
2126
2127        /*
2128         * Ok, other tasks may need to be woken up - check waiters
2129         * and do the wakeup if necessary:
2130         */
2131        head = &hb->chain;
2132
2133        plist_for_each_entry_safe(this, next, head, list) {
2134                if (!match_futex(&this->key, &key))
2135                        continue;
2136                ret = wake_futex_pi(uaddr, uval, this);
2137                /*
2138                 * The atomic access to the futex value
2139                 * generated a pagefault, so retry the
2140                 * user-access and the wakeup:
2141                 */
2142                if (ret == -EFAULT)
2143                        goto pi_faulted;
2144                goto out_unlock;
2145        }
2146        /*
2147         * No waiters - kernel unlocks the futex:
2148         */
2149        if (!(uval & FUTEX_OWNER_DIED)) {
2150                ret = unlock_futex_pi(uaddr, uval);
2151                if (ret == -EFAULT)
2152                        goto pi_faulted;
2153        }
2154
2155out_unlock:
2156        spin_unlock(&hb->lock);
2157        put_futex_key(&key);
2158
2159out:
2160        return ret;
2161
2162pi_faulted:
2163        spin_unlock(&hb->lock);
2164        put_futex_key(&key);
2165
2166        ret = fault_in_user_writeable(uaddr);
2167        if (!ret)
2168                goto retry;
2169
2170        return ret;
2171}
2172
2173/**
2174 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2175 * @hb:         the hash_bucket futex_q was originally enqueued on
2176 * @q:          the futex_q woken while waiting to be requeued
2177 * @key2:       the futex_key of the requeue target futex
2178 * @timeout:    the timeout associated with the wait (NULL if none)
2179 *
2180 * Detect if the task was woken on the initial futex as opposed to the requeue
2181 * target futex.  If so, determine if it was a timeout or a signal that caused
2182 * the wakeup and return the appropriate error code to the caller.  Must be
2183 * called with the hb lock held.
2184 *
2185 * Returns:
2186 *  0 - no early wakeup detected
2187 * <0 - -ETIMEDOUT or -ERESTARTNOINTR
2188 */
2189static inline
2190int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2191                                   struct futex_q *q, union futex_key *key2,
2192                                   struct hrtimer_sleeper *timeout)
2193{
2194        int ret = 0;
2195
2196        /*
2197         * With the hb lock held, we avoid races while we process the wakeup.
2198         * We only need to hold hb (and not hb2) to ensure atomicity as the
2199         * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2200         * It can't be requeued from uaddr2 to something else since we don't
2201         * support a PI aware source futex for requeue.
2202         */
2203        if (!match_futex(&q->key, key2)) {
2204                WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2205                /*
2206                 * We were woken prior to requeue by a timeout or a signal.
2207                 * Unqueue the futex_q and determine which it was.
2208                 */
2209                plist_del(&q->list, &hb->chain);
2210
2211                /* Handle spurious wakeups gracefully */
2212                ret = -EWOULDBLOCK;
2213                if (timeout && !timeout->task)
2214                        ret = -ETIMEDOUT;
2215                else if (signal_pending(current))
2216                        ret = -ERESTARTNOINTR;
2217        }
2218        return ret;
2219}
2220
2221/**
2222 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2223 * @uaddr:      the futex we initially wait on (non-pi)
2224 * @flags:      futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.); uaddr and uaddr2
2225 *              must be futexes of the same type (no requeueing from private to shared)
2226 * @val:        the expected value of uaddr
2227 * @abs_time:   absolute timeout
2228 * @bitset:     32 bit wakeup bitset set by userspace, defaults to all
2230 * @uaddr2:     the pi futex we will take prior to returning to user-space
2231 *
2232 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2233 * uaddr2 which must be PI aware.  Normal wakeup will wake on uaddr2 and
2234 * complete the acquisition of the rt_mutex prior to returning to userspace.
2235 * This ensures the rt_mutex maintains an owner when it has waiters; without
2236 * one, the pi logic wouldn't know which task to boost/deboost, if there was a
2237 * need to.
2238 *
2239 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2240 * via the following:
2241 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2242 * 2) wakeup on uaddr2 after a requeue
2243 * 3) signal
2244 * 4) timeout
2245 *
2246 * If 3, clean up and return -ERESTARTNOINTR.
2247 *
2248 * If 2, we may then block on trying to take the rt_mutex and return via:
2249 * 5) successful lock
2250 * 6) signal
2251 * 7) timeout
2252 * 8) other lock acquisition failure
2253 *
2254 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2255 *
2256 * If 4 or 7, we clean up and return -ETIMEDOUT.
2257 *
2258 * Returns:
2259 *  0 - On success
2260 * <0 - On error
2261 */
2262static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2263                                 u32 val, ktime_t *abs_time, u32 bitset,
2264                                 u32 __user *uaddr2)
2265{
2266        struct hrtimer_sleeper timeout, *to = NULL;
2267        struct rt_mutex_waiter rt_waiter;
2268        struct rt_mutex *pi_mutex = NULL;
2269        struct futex_hash_bucket *hb;
2270        union futex_key key2 = FUTEX_KEY_INIT;
2271        struct futex_q q = futex_q_init;
2272        int res, ret;
2273
2274        if (!bitset)
2275                return -EINVAL;
2276
2277        if (abs_time) {
2278                to = &timeout;
2279                hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2280                                      CLOCK_REALTIME : CLOCK_MONOTONIC,
2281                                      HRTIMER_MODE_ABS);
2282                hrtimer_init_sleeper(to, current);
2283                hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2284                                             current->timer_slack_ns);
2285        }
2286
2287        /*
2288         * The waiter is allocated on our stack, manipulated by the requeue
2289         * code while we sleep on uaddr.
2290         */
2291        debug_rt_mutex_init_waiter(&rt_waiter);
2292        rt_waiter.task = NULL;
2293
2294        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2295        if (unlikely(ret != 0))
2296                goto out;
2297
2298        q.bitset = bitset;
2299        q.rt_waiter = &rt_waiter;
2300        q.requeue_pi_key = &key2;
2301
2302        /*
2303         * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2304         * count.
2305         */
2306        ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2307        if (ret)
2308                goto out_key2;
2309
2310        /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2311        futex_wait_queue_me(hb, &q, to);
2312
2313        spin_lock(&hb->lock);
2314        ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2315        spin_unlock(&hb->lock);
2316        if (ret)
2317                goto out_put_keys;
2318
2319        /*
2320         * In order for us to be here, we know our q.key == key2, and since
2321         * we took the hb->lock above, we also know that futex_requeue() has
2322         * completed and we no longer have to concern ourselves with a wakeup
2323         * race with the atomic proxy lock acquisition by the requeue code. The
2324         * futex_requeue dropped our key1 reference and incremented our key2
2325         * reference count.
2326         */
2327
2328        /* Check if the requeue code acquired the second futex for us. */
2329        if (!q.rt_waiter) {
2330                /*
2331                 * Got the lock. We might not be the anticipated owner if we
2332                 * did a lock-steal - fix up the PI-state in that case.
2333                 */
2334                if (q.pi_state && (q.pi_state->owner != current)) {
2335                        spin_lock(q.lock_ptr);
2336                        ret = fixup_pi_state_owner(uaddr2, &q, current);
2337                        spin_unlock(q.lock_ptr);
2338                }
2339        } else {
2340                /*
2341                 * We have been woken up by futex_unlock_pi(), a timeout, or a
2342                 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
2343                 * the pi_state.
2344                 */
2345                WARN_ON(!q.pi_state);
2346                pi_mutex = &q.pi_state->pi_mutex;
2347                ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2348                debug_rt_mutex_free_waiter(&rt_waiter);
2349
2350                spin_lock(q.lock_ptr);
2351                /*
2352                 * Fixup the pi_state owner and possibly acquire the lock if we
2353                 * haven't already.
2354                 */
2355                res = fixup_owner(uaddr2, &q, !ret);
2356                /*
2357                 * If fixup_owner() returned an error, propagate that.  If it
2358                 * acquired the lock, clear -ETIMEDOUT or -EINTR.
2359                 */
2360                if (res)
2361                        ret = (res < 0) ? res : 0;
2362
2363                /* Unqueue and drop the lock. */
2364                unqueue_me_pi(&q);
2365        }
2366
2367        /*
2368         * If fixup_pi_state_owner() faulted and was unable to handle the
2369         * fault, unlock the rt_mutex and return the fault to userspace.
2370         */
2371        if (ret == -EFAULT) {
2372                if (rt_mutex_owner(pi_mutex) == current)
2373                        rt_mutex_unlock(pi_mutex);
2374        } else if (ret == -EINTR) {
2375                /*
2376                 * We've already been requeued, but cannot restart by calling
2377                 * futex_lock_pi() directly. We could restart this syscall, but
2378                 * it would detect that the user space "val" changed and return
2379                 * -EWOULDBLOCK.  Save the overhead of the restart and return
2380                 * -EWOULDBLOCK directly.
2381                 */
2382                ret = -EWOULDBLOCK;
2383        }
2384
2385out_put_keys:
2386        put_futex_key(&q.key);
2387out_key2:
2388        put_futex_key(&key2);
2389
2390out:
2391        if (to) {
2392                hrtimer_cancel(&to->timer);
2393                destroy_hrtimer_on_stack(&to->timer);
2394        }
2395        return ret;
2396}
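
/*
 * For illustration only, a sketch of how userspace might pair
 * FUTEX_WAIT_REQUEUE_PI with FUTEX_CMP_REQUEUE_PI as described in the
 * comment above futex_wait_requeue_pi() (assumes the usual futex/syscall
 * includes plus <limits.h>; the cond_* helper names and the two futex
 * words are made up for the example):
 *
 *   // Waiter: block on cond_seq; the kernel requeues us onto the PI
 *   // futex 'mutex' and, on a 0 return, we come back owning it.
 *   static long cond_wait_pi(unsigned int *cond_seq, unsigned int *mutex,
 *                            unsigned int seq)
 *   {
 *           return syscall(SYS_futex, cond_seq, FUTEX_WAIT_REQUEUE_PI,
 *                          seq, NULL, mutex, 0);
 *   }
 *
 *   // Waker: wake one waiter onto 'mutex' and requeue up to INT_MAX
 *   // others.  nr_requeue travels in the timeout argument slot, and
 *   // 'seq' must still match *cond_seq or the kernel returns -EAGAIN.
 *   static long cond_requeue_pi(unsigned int *cond_seq, unsigned int *mutex,
 *                               unsigned int seq)
 *   {
 *           return syscall(SYS_futex, cond_seq, FUTEX_CMP_REQUEUE_PI,
 *                          1, (void *)(unsigned long)INT_MAX, mutex, seq);
 *   }
 */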
2397
2398/*
2399 * Support for robust futexes: the kernel cleans up held futexes at
2400 * thread exit time.
2401 *
2402 * Implementation: user-space maintains a per-thread list of locks it
2403 * is holding. Upon do_exit(), the kernel carefully walks this list,
2404 * and marks all locks that are owned by this thread with the
2405 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
2406 * always manipulated with the lock held, so the list is private and
2407 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
2408 * field, to allow the kernel to clean up if the thread dies after
2409 * acquiring the lock, but just before it could have added itself to
2410 * the list. There can only be one such pending lock.
2411 */
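
/*
 * For illustration only, the minimal per-thread userspace registration
 * that the cleanup below relies on (assumes <linux/futex.h>,
 * <sys/syscall.h>, <unistd.h> and <stddef.h>; struct robust_lock and
 * register_robust_list() are made up for the example, and the linking
 * and unlinking of entries around each lock/unlock, with list_op_pending
 * set across the update, is omitted):
 *
 *   // One lock as userspace might lay it out; the kernel finds the
 *   // futex word at (list entry address + futex_offset).
 *   struct robust_lock {
 *           struct robust_list list;
 *           unsigned int futex;
 *   };
 *
 *   static __thread struct robust_list_head robust_head;
 *
 *   static int register_robust_list(void)
 *   {
 *           // Per-thread setup, done before taking any robust lock.
 *           robust_head.list.next       = &robust_head.list;  // empty list
 *           robust_head.futex_offset    = offsetof(struct robust_lock, futex);
 *           robust_head.list_op_pending = NULL;
 *           return syscall(SYS_set_robust_list, &robust_head,
 *                          sizeof(robust_head));
 *   }
 */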
2412
2413/**
2414 * sys_set_robust_list() - Set the robust-futex list head of a task
2415 * @head:       pointer to the list-head
2416 * @len:        length of the list-head, as userspace expects
2417 */
2418SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2419                size_t, len)
2420{
2421        if (!futex_cmpxchg_enabled)
2422                return -ENOSYS;
2423        /*
2424         * The kernel knows only one size for now:
2425         */
2426        if (unlikely(len != sizeof(*head)))
2427                return -EINVAL;
2428
2429        current->robust_list = head;
2430
2431        return 0;
2432}
2433
2434/**
2435 * sys_get_robust_list() - Get the robust-futex list head of a task
2436 * @pid:        pid of the process [zero for current task]
2437 * @head_ptr:   pointer to a list-head pointer, the kernel fills it in
2438 * @len_ptr:    pointer to a length field, the kernel fills in the header size
2439 */
2440SYSCALL_DEFINE3(get_robust_list, int, pid,
2441                struct robust_list_head __user * __user *, head_ptr,
2442                size_t __user *, len_ptr)
2443{
2444        struct robust_list_head __user *head;
2445        unsigned long ret;
2446        const struct cred *cred = current_cred(), *pcred;
2447
2448        if (!futex_cmpxchg_enabled)
2449                return -ENOSYS;
2450
2451        if (!pid)
2452                head = current->robust_list;
2453        else {
2454                struct task_struct *p;
2455
2456                ret = -ESRCH;
2457                rcu_read_lock();
2458                p = find_task_by_vpid(pid);
2459                if (!p)
2460                        goto err_unlock;
2461                ret = -EPERM;
2462                pcred = __task_cred(p);
2463                /* If the victim is in a different user_ns, the uids are not
2464                   comparable, so we must have CAP_SYS_PTRACE. */
2465                if (cred->user->user_ns != pcred->user->user_ns) {
2466                        if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2467                                goto err_unlock;
2468                        goto ok;
2469                }
2470                /* If the victim is in the same user_ns, the uids are comparable */
2471                if (cred->euid != pcred->euid &&
2472                    cred->euid != pcred->uid &&
2473                    !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
2474                        goto err_unlock;
2475ok:
2476                head = p->robust_list;
2477                rcu_read_unlock();
2478        }
2479
2480        if (put_user(sizeof(*head), len_ptr))
2481                return -EFAULT;
2482        return put_user(head, head_ptr);
2483
2484err_unlock:
2485        rcu_read_unlock();
2486
2487        return ret;
2488}
2489
2490/*
2491 * Process a futex-list entry, check whether it's owned by the
2492 * dying task, and do notification if so:
2493 */
2494int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2495{
2496        u32 uval, uninitialized_var(nval), mval;
2497
2498retry:
2499        if (get_user(uval, uaddr))
2500                return -1;
2501
2502        if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
2503                /*
2504                 * Ok, this dying thread is truly holding a futex
2505                 * of interest. Set the OWNER_DIED bit atomically
2506                 * via cmpxchg, and if the value had FUTEX_WAITERS
2507                 * set, wake up a waiter (if any). (We have to do a
2508                 * futex_wake() even if OWNER_DIED is already set -
2509                 * to handle the rare but possible case of recursive
2510                 * thread-death.) The rest of the cleanup is done in
2511                 * userspace.
2512                 */
2513                mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2514                /*
2515                 * We are not holding a lock here, but we want to have
2516                 * the pagefault_disable/enable() protection because
2517                 * we want to handle the fault gracefully. If the
2518                 * access fails we try to fault in the futex with R/W
2519                 * verification via get_user_pages. get_user() above
2520                 * does not guarantee R/W access. If that fails we
2521                 * give up and leave the futex locked.
2522                 */
2523                if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2524                        if (fault_in_user_writeable(uaddr))
2525                                return -1;
2526                        goto retry;
2527                }
2528                if (nval != uval)
2529                        goto retry;
2530
2531                /*
2532                 * Wake robust non-PI futexes here. The wakeup of
2533                 * PI futexes happens in exit_pi_state():
2534                 */
2535                if (!pi && (uval & FUTEX_WAITERS))
2536                        futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
2537        }
2538        return 0;
2539}
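
/*
 * For illustration only, what the OWNER_DIED handling above looks like
 * from userspace when it is reached through glibc's robust mutexes
 * (assumes <pthread.h> and <errno.h>; recover_shared_lock() is a made-up
 * helper, and the mutex must have been initialized with
 * pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST)):
 *
 *   static int recover_shared_lock(pthread_mutex_t *m)
 *   {
 *           int err = pthread_mutex_lock(m);
 *
 *           if (err == EOWNERDEAD) {
 *                   // The previous owner died holding the lock: the kernel
 *                   // set FUTEX_OWNER_DIED and woke us.  Repair the data
 *                   // the lock protects, then mark the mutex usable again.
 *                   pthread_mutex_consistent(m);
 *                   err = 0;
 *           }
 *           return err;
 *   }
 */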
2540
2541/*
2542 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
2543 */
2544static inline int fetch_robust_entry(struct robust_list __user **entry,
2545                                     struct robust_list __user * __user *head,
2546                                     unsigned int *pi)
2547{
2548        unsigned long uentry;
2549
2550        if (get_user(uentry, (unsigned long __user *)head))
2551                return -EFAULT;
2552
2553        *entry = (void __user *)(uentry & ~1UL);
2554        *pi = uentry & 1;
2555
2556        return 0;
2557}
2558
2559/*
2560 * Walk curr->robust_list (very carefully, it's a userspace list!)
2561 * and mark any locks found there dead, and notify any waiters.
2562 *
2563 * We silently return on any sign of list-walking problem.
2564 */
2565void exit_robust_list(struct task_struct *curr)
2566{
2567        struct robust_list_head __user *head = curr->robust_list;
2568        struct robust_list __user *entry, *next_entry, *pending;
2569        unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2570        unsigned int uninitialized_var(next_pi);
2571        unsigned long futex_offset;
2572        int rc;
2573
2574        if (!futex_cmpxchg_enabled)
2575                return;
2576
2577        /*
2578         * Fetch the list head (which was registered earlier, via
2579         * sys_set_robust_list()):
2580         */
2581        if (fetch_robust_entry(&entry, &head->list.next, &pi))
2582                return;
2583        /*
2584         * Fetch the relative futex offset:
2585         */
2586        if (get_user(futex_offset, &head->futex_offset))
2587                return;
2588        /*
2589         * Fetch any possibly pending lock-add first, and handle it
2590         * if it exists:
2591         */
2592        if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
2593                return;
2594
2595        next_entry = NULL;      /* avoid warning with gcc */
2596        while (entry != &head->list) {
2597                /*
2598                 * Fetch the next entry in the list before calling
2599                 * handle_futex_death:
2600                 */
2601                rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
2602                /*
2603                 * A pending lock might already be on the list, so
2604                 * don't process it twice:
2605                 */
2606                if (entry != pending)
2607                        if (handle_futex_death((void __user *)entry + futex_offset,
2608                                                curr, pi))
2609                                return;
2610                if (rc)
2611                        return;
2612                entry = next_entry;
2613                pi = next_pi;
2614                /*
2615                 * Avoid excessively long or circular lists:
2616                 */
2617                if (!--limit)
2618                        break;
2619
2620                cond_resched();
2621        }
2622
2623        if (pending)
2624                handle_futex_death((void __user *)pending + futex_offset,
2625                                   curr, pip);
2626}
2627
2628long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2629                u32 __user *uaddr2, u32 val2, u32 val3)
2630{
2631        int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
2632        unsigned int flags = 0;
2633
2634        if (!(op & FUTEX_PRIVATE_FLAG))
2635                flags |= FLAGS_SHARED;
2636
2637        if (op & FUTEX_CLOCK_REALTIME) {
2638                flags |= FLAGS_CLOCKRT;
2639                if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2640                        return -ENOSYS;
2641        }
2642
2643        switch (cmd) {
2644        case FUTEX_WAIT:
2645                val3 = FUTEX_BITSET_MATCH_ANY;
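                /* fall through */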
2646        case FUTEX_WAIT_BITSET:
2647                ret = futex_wait(uaddr, flags, val, timeout, val3);
2648                break;
2649        case FUTEX_WAKE:
2650                val3 = FUTEX_BITSET_MATCH_ANY;
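                /* fall through */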
2651        case FUTEX_WAKE_BITSET:
2652                ret = futex_wake(uaddr, flags, val, val3);
2653                break;
2654        case FUTEX_REQUEUE:
2655                ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2656                break;
2657        case FUTEX_CMP_REQUEUE:
2658                ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2659                break;
2660        case FUTEX_WAKE_OP:
2661                ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2662                break;
2663        case FUTEX_LOCK_PI:
2664                if (futex_cmpxchg_enabled)
2665                        ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2666                break;
2667        case FUTEX_UNLOCK_PI:
2668                if (futex_cmpxchg_enabled)
2669                        ret = futex_unlock_pi(uaddr, flags);
2670                break;
2671        case FUTEX_TRYLOCK_PI:
2672                if (futex_cmpxchg_enabled)
2673                        ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2674                break;
2675        case FUTEX_WAIT_REQUEUE_PI:
2676                val3 = FUTEX_BITSET_MATCH_ANY;
2677                ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2678                                            uaddr2);
2679                break;
2680        case FUTEX_CMP_REQUEUE_PI:
2681                ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2682                break;
2683        default:
2684                ret = -ENOSYS;
2685        }
2686        return ret;
2687}
2688
2689
2690SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2691                struct timespec __user *, utime, u32 __user *, uaddr2,
2692                u32, val3)
2693{
2694        struct timespec ts;
2695        ktime_t t, *tp = NULL;
2696        u32 val2 = 0;
2697        int cmd = op & FUTEX_CMD_MASK;
2698
2699        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
2700                      cmd == FUTEX_WAIT_BITSET ||
2701                      cmd == FUTEX_WAIT_REQUEUE_PI)) {
2702                if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2703                        return -EFAULT;
2704                if (!timespec_valid(&ts))
2705                        return -EINVAL;
2706
2707                t = timespec_to_ktime(ts);
2708                if (cmd == FUTEX_WAIT)
2709                        t = ktime_add_safe(ktime_get(), t);
2710                tp = &t;
2711        }
2712        /*
2713         * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
2714         * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
2715         */
2716        if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
2717            cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
2718                val2 = (u32) (unsigned long) utime;
2719
2720        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
2721}
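
/*
 * For illustration only, how a raw caller passes the integer val2 through
 * the timeout slot for the requeue commands handled above (assumes the
 * usual futex/syscall includes plus <limits.h>; requeue_all() is a
 * made-up helper name):
 *
 *   // Wake one waiter on 'from' and move the rest to 'to', provided
 *   // *from still equals 'expected' (otherwise the kernel returns -EAGAIN).
 *   static long requeue_all(unsigned int *from, unsigned int *to,
 *                           unsigned int expected)
 *   {
 *           return syscall(SYS_futex, from, FUTEX_CMP_REQUEUE, 1,
 *                          (void *)(unsigned long)INT_MAX, to, expected);
 *   }
 */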
2722
2723static int __init futex_init(void)
2724{
2725        u32 curval;
2726        int i;
2727
2728        /*
2729         * This will fail and we want it. Some arch implementations do
2730         * runtime detection of the futex_atomic_cmpxchg_inatomic()
2731         * functionality. We want to know that before we call in any
2732         * of the complex code paths. Also we want to prevent
2733         * registration of robust lists in that case. NULL is
2734         * guaranteed to fault and we get -EFAULT on functional
2735         * implementation, the non-functional ones will return
2736         * -ENOSYS.
2737         */
2738        if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2739                futex_cmpxchg_enabled = 1;
2740
2741        for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
2742                plist_head_init(&futex_queues[i].chain);
2743                spin_lock_init(&futex_queues[i].lock);
2744        }
2745
2746        return 0;
2747}
2748__initcall(futex_init);
2749