linux/kernel/futex.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *  Fast Userspace Mutexes (which I call "Futexes!").
   4 *  (C) Rusty Russell, IBM 2002
   5 *
   6 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
   7 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
   8 *
   9 *  Removed page pinning, fix privately mapped COW pages and other cleanups
  10 *  (C) Copyright 2003, 2004 Jamie Lokier
  11 *
  12 *  Robust futex support started by Ingo Molnar
  13 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  14 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  15 *
  16 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
  17 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  18 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  19 *
  20 *  PRIVATE futexes by Eric Dumazet
  21 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
  22 *
  23 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
  24 *  Copyright (C) IBM Corporation, 2009
  25 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
  26 *
  27 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  28 *  enough at me, Linus for the original (flawed) idea, Matthew
  29 *  Kirkwood for proof-of-concept implementation.
  30 *
  31 *  "The futexes are also cursed."
  32 *  "But they come in a choice of three flavours!"
  33 */
  34#include <linux/compat.h>
  35#include <linux/jhash.h>
  36#include <linux/pagemap.h>
  37#include <linux/syscalls.h>
  38#include <linux/hugetlb.h>
  39#include <linux/freezer.h>
  40#include <linux/memblock.h>
  41#include <linux/fault-inject.h>
  42#include <linux/time_namespace.h>
  43
  44#include <asm/futex.h>
  45
  46#include "locking/rtmutex_common.h"
  47
  48/*
  49 * READ this before attempting to hack on futexes!
  50 *
  51 * Basic futex operation and ordering guarantees
  52 * =============================================
  53 *
  54 * The waiter reads the futex value in user space and calls
  55 * futex_wait(). This function computes the hash bucket and acquires
  56 * the hash bucket lock. After that it reads the futex user space value
  57 * again and verifies that the data has not changed. If it has not changed
  58 * it enqueues itself into the hash bucket, releases the hash bucket lock
  59 * and schedules.
  60 *
  61 * The waker side modifies the user space value of the futex and calls
  62 * futex_wake(). This function computes the hash bucket and acquires the
  63 * hash bucket lock. Then it looks for waiters on that futex in the hash
  64 * bucket and wakes them.
  65 *
  66 * In futex wake up scenarios where no tasks are blocked on a futex, taking
  67 * the hb spinlock can be avoided and simply return. In order for this
  68 * optimization to work, ordering guarantees must exist so that the waiter
  69 * being added to the list is acknowledged when the list is concurrently being
  70 * checked by the waker, avoiding scenarios like the following:
  71 *
  72 * CPU 0                               CPU 1
  73 * val = *futex;
  74 * sys_futex(WAIT, futex, val);
  75 *   futex_wait(futex, val);
  76 *   uval = *futex;
  77 *                                     *futex = newval;
  78 *                                     sys_futex(WAKE, futex);
  79 *                                       futex_wake(futex);
  80 *                                       if (queue_empty())
  81 *                                         return;
  82 *   if (uval == val)
  83 *      lock(hash_bucket(futex));
  84 *      queue();
  85 *     unlock(hash_bucket(futex));
  86 *     schedule();
  87 *
  88 * This would cause the waiter on CPU 0 to wait forever because it
  89 * missed the transition of the user space value from val to newval
  90 * and the waker did not find the waiter in the hash bucket queue.
  91 *
  92 * The correct serialization ensures that a waiter either observes
  93 * the changed user space value before blocking or is woken by a
  94 * concurrent waker:
  95 *
  96 * CPU 0                                 CPU 1
  97 * val = *futex;
  98 * sys_futex(WAIT, futex, val);
  99 *   futex_wait(futex, val);
 100 *
 101 *   waiters++; (a)
 102 *   smp_mb(); (A) <-- paired with -.
 103 *                                  |
 104 *   lock(hash_bucket(futex));      |
 105 *                                  |
 106 *   uval = *futex;                 |
 107 *                                  |        *futex = newval;
 108 *                                  |        sys_futex(WAKE, futex);
 109 *                                  |          futex_wake(futex);
 110 *                                  |
 111 *                                  `--------> smp_mb(); (B)
 112 *   if (uval == val)
 113 *     queue();
 114 *     unlock(hash_bucket(futex));
 115 *     schedule();                         if (waiters)
 116 *                                           lock(hash_bucket(futex));
 117 *   else                                    wake_waiters(futex);
 118 *     waiters--; (b)                        unlock(hash_bucket(futex));
 119 *
 120 * Where (A) orders the waiters increment and the futex value read through
 121 * atomic operations (see hb_waiters_inc) and where (B) orders the write
 122 * to futex and the waiters read (see hb_waiters_pending()).
 123 *
 124 * This yields the following case (where X:=waiters, Y:=futex):
 125 *
 126 *      X = Y = 0
 127 *
 128 *      w[X]=1          w[Y]=1
 129 *      MB              MB
 130 *      r[Y]=y          r[X]=x
 131 *
 132 * Which guarantees that x==0 && y==0 is impossible; which translates back into
 133 * the guarantee that we cannot both miss the futex variable change and the
 134 * enqueue.
 135 *
 136 * Note that a new waiter is accounted for in (a) even when it is possible that
 137 * the wait call can return error, in which case we backtrack from it in (b).
 138 * Refer to the comment in queue_lock().
 139 *
 140 * Similarly, in order to account for waiters being requeued on another
 141 * address we always increment the waiters for the destination bucket before
 142 * acquiring the lock. It then decrements them again  after releasing it -
 143 * the code that actually moves the futex(es) between hash buckets (requeue_futex)
 144 * will do the additional required waiter count housekeeping. This is done for
 145 * double_lock_hb() and double_unlock_hb(), respectively.
 146 */
 147
 148#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
 149#define futex_cmpxchg_enabled 1
 150#else
 151static int  __read_mostly futex_cmpxchg_enabled;
 152#endif
 153
 154/*
 155 * Futex flags used to encode options to functions and preserve them across
 156 * restarts.
 157 */
 158#ifdef CONFIG_MMU
 159# define FLAGS_SHARED           0x01
 160#else
 161/*
 162 * NOMMU does not have per process address space. Let the compiler optimize
 163 * code away.
 164 */
 165# define FLAGS_SHARED           0x00
 166#endif
 167#define FLAGS_CLOCKRT           0x02
 168#define FLAGS_HAS_TIMEOUT       0x04
 169
 170/*
 171 * Priority Inheritance state:
 172 */
 173struct futex_pi_state {
 174        /*
 175         * list of 'owned' pi_state instances - these have to be
 176         * cleaned up in do_exit() if the task exits prematurely:
 177         */
 178        struct list_head list;
 179
 180        /*
 181         * The PI object:
 182         */
 183        struct rt_mutex pi_mutex;
 184
 185        struct task_struct *owner;
 186        refcount_t refcount;
 187
 188        union futex_key key;
 189} __randomize_layout;
 190
 191/**
 192 * struct futex_q - The hashed futex queue entry, one per waiting task
 193 * @list:               priority-sorted list of tasks waiting on this futex
 194 * @task:               the task waiting on the futex
 195 * @lock_ptr:           the hash bucket lock
 196 * @key:                the key the futex is hashed on
 197 * @pi_state:           optional priority inheritance state
 198 * @rt_waiter:          rt_waiter storage for use with requeue_pi
 199 * @requeue_pi_key:     the requeue_pi target futex key
 200 * @bitset:             bitset for the optional bitmasked wakeup
 201 *
 202 * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
 203 * we can wake only the relevant ones (hashed queues may be shared).
 204 *
 205 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 206 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 207 * The order of wakeup is always to make the first condition true, then
 208 * the second.
 209 *
 210 * PI futexes are typically woken before they are removed from the hash list via
 211 * the rt_mutex code. See unqueue_me_pi().
 212 */
 213struct futex_q {
 214        struct plist_node list;
 215
 216        struct task_struct *task;
 217        spinlock_t *lock_ptr;
 218        union futex_key key;
 219        struct futex_pi_state *pi_state;
 220        struct rt_mutex_waiter *rt_waiter;
 221        union futex_key *requeue_pi_key;
 222        u32 bitset;
 223} __randomize_layout;
 224
 225static const struct futex_q futex_q_init = {
 226        /* list gets initialized in queue_me()*/
 227        .key = FUTEX_KEY_INIT,
 228        .bitset = FUTEX_BITSET_MATCH_ANY
 229};
 230
 231/*
 232 * Hash buckets are shared by all the futex_keys that hash to the same
 233 * location.  Each key may have multiple futex_q structures, one for each task
 234 * waiting on a futex.
 235 */
 236struct futex_hash_bucket {
 237        atomic_t waiters;
 238        spinlock_t lock;
 239        struct plist_head chain;
 240} ____cacheline_aligned_in_smp;
 241
 242/*
 243 * The base of the bucket array and its size are always used together
 244 * (after initialization only in hash_futex()), so ensure that they
 245 * reside in the same cacheline.
 246 */
 247static struct {
 248        struct futex_hash_bucket *queues;
 249        unsigned long            hashsize;
 250} __futex_data __read_mostly __aligned(2*sizeof(long));
 251#define futex_queues   (__futex_data.queues)
 252#define futex_hashsize (__futex_data.hashsize)
 253
 254
 255/*
 256 * Fault injections for futexes.
 257 */
 258#ifdef CONFIG_FAIL_FUTEX
 259
 260static struct {
 261        struct fault_attr attr;
 262
 263        bool ignore_private;
 264} fail_futex = {
 265        .attr = FAULT_ATTR_INITIALIZER,
 266        .ignore_private = false,
 267};
 268
 269static int __init setup_fail_futex(char *str)
 270{
 271        return setup_fault_attr(&fail_futex.attr, str);
 272}
 273__setup("fail_futex=", setup_fail_futex);
 274
 275static bool should_fail_futex(bool fshared)
 276{
 277        if (fail_futex.ignore_private && !fshared)
 278                return false;
 279
 280        return should_fail(&fail_futex.attr, 1);
 281}
 282
 283#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 284
 285static int __init fail_futex_debugfs(void)
 286{
 287        umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
 288        struct dentry *dir;
 289
 290        dir = fault_create_debugfs_attr("fail_futex", NULL,
 291                                        &fail_futex.attr);
 292        if (IS_ERR(dir))
 293                return PTR_ERR(dir);
 294
 295        debugfs_create_bool("ignore-private", mode, dir,
 296                            &fail_futex.ignore_private);
 297        return 0;
 298}
 299
 300late_initcall(fail_futex_debugfs);
 301
 302#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
 303
 304#else
 305static inline bool should_fail_futex(bool fshared)
 306{
 307        return false;
 308}
 309#endif /* CONFIG_FAIL_FUTEX */
 310
 311#ifdef CONFIG_COMPAT
 312static void compat_exit_robust_list(struct task_struct *curr);
 313#else
 314static inline void compat_exit_robust_list(struct task_struct *curr) { }
 315#endif
 316
 317/*
 318 * Reflects a new waiter being added to the waitqueue.
 319 */
 320static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
 321{
 322#ifdef CONFIG_SMP
 323        atomic_inc(&hb->waiters);
 324        /*
 325         * Full barrier (A), see the ordering comment above.
 326         */
 327        smp_mb__after_atomic();
 328#endif
 329}
 330
 331/*
 332 * Reflects a waiter being removed from the waitqueue by wakeup
 333 * paths.
 334 */
 335static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
 336{
 337#ifdef CONFIG_SMP
 338        atomic_dec(&hb->waiters);
 339#endif
 340}
 341
 342static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
 343{
 344#ifdef CONFIG_SMP
 345        /*
 346         * Full barrier (B), see the ordering comment above.
 347         */
 348        smp_mb();
 349        return atomic_read(&hb->waiters);
 350#else
 351        return 1;
 352#endif
 353}
 354
 355/**
 356 * hash_futex - Return the hash bucket in the global hash
 357 * @key:        Pointer to the futex key for which the hash is calculated
 358 *
 359 * We hash on the keys returned from get_futex_key (see below) and return the
 360 * corresponding hash bucket in the global hash.
 361 */
 362static struct futex_hash_bucket *hash_futex(union futex_key *key)
 363{
 364        u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
 365                          key->both.offset);
 366
 367        return &futex_queues[hash & (futex_hashsize - 1)];
 368}
 369
 370
 371/**
 372 * match_futex - Check whether two futex keys are equal
 373 * @key1:       Pointer to key1
 374 * @key2:       Pointer to key2
 375 *
 376 * Return 1 if two futex_keys are equal, 0 otherwise.
 377 */
 378static inline int match_futex(union futex_key *key1, union futex_key *key2)
 379{
 380        return (key1 && key2
 381                && key1->both.word == key2->both.word
 382                && key1->both.ptr == key2->both.ptr
 383                && key1->both.offset == key2->both.offset);
 384}
 385
 386enum futex_access {
 387        FUTEX_READ,
 388        FUTEX_WRITE
 389};
 390
 391/**
 392 * futex_setup_timer - set up the sleeping hrtimer.
 393 * @time:       ptr to the given timeout value
 394 * @timeout:    the hrtimer_sleeper structure to be set up
 395 * @flags:      futex flags
 396 * @range_ns:   optional range in ns
 397 *
 398 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
 399 *         value given
 400 */
 401static inline struct hrtimer_sleeper *
 402futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
 403                  int flags, u64 range_ns)
 404{
 405        if (!time)
 406                return NULL;
 407
 408        hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
 409                                      CLOCK_REALTIME : CLOCK_MONOTONIC,
 410                                      HRTIMER_MODE_ABS);
 411        /*
 412         * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
 413         * effectively the same as calling hrtimer_set_expires().
 414         */
 415        hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
 416
 417        return timeout;
 418}
 419
 420/*
 421 * Generate a machine wide unique identifier for this inode.
 422 *
 423 * This relies on u64 not wrapping in the life-time of the machine; which with
 424 * 1ns resolution means almost 585 years.
 425 *
 426 * This further relies on the fact that a well formed program will not unmap
 427 * the file while it has a (shared) futex waiting on it. This mapping will have
 428 * a file reference which pins the mount and inode.
 429 *
 430 * If for some reason an inode gets evicted and read back in again, it will get
 431 * a new sequence number and will _NOT_ match, even though it is the exact same
 432 * file.
 433 *
 434 * It is important that match_futex() will never have a false-positive, esp.
 435 * for PI futexes that can mess up the state. The above argues that false-negatives
 436 * are only possible for malformed programs.
 437 */
 438static u64 get_inode_sequence_number(struct inode *inode)
 439{
 440        static atomic64_t i_seq;
 441        u64 old;
 442
 443        /* Does the inode already have a sequence number? */
 444        old = atomic64_read(&inode->i_sequence);
 445        if (likely(old))
 446                return old;
 447
 448        for (;;) {
 449                u64 new = atomic64_add_return(1, &i_seq);
 450                if (WARN_ON_ONCE(!new))
 451                        continue;
 452
 453                old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
 454                if (old)
 455                        return old;
 456                return new;
 457        }
 458}
 459
 460/**
 461 * get_futex_key() - Get parameters which are the keys for a futex
 462 * @uaddr:      virtual address of the futex
 463 * @fshared:    false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
 464 * @key:        address where result is stored.
 465 * @rw:         mapping needs to be read/write (values: FUTEX_READ,
 466 *              FUTEX_WRITE)
 467 *
 468 * Return: a negative error code or 0
 469 *
 470 * The key words are stored in @key on success.
 471 *
 472 * For shared mappings (when @fshared), the key is:
 473 *
 474 *   ( inode->i_sequence, page->index, offset_within_page )
 475 *
 476 * [ also see get_inode_sequence_number() ]
 477 *
 478 * For private mappings (or when !@fshared), the key is:
 479 *
 480 *   ( current->mm, address, 0 )
 481 *
 482 * This allows (cross process, where applicable) identification of the futex
 483 * without keeping the page pinned for the duration of the FUTEX_WAIT.
 484 *
 485 * lock_page() might sleep, the caller should not hold a spinlock.
 486 */
 487static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
 488                         enum futex_access rw)
 489{
 490        unsigned long address = (unsigned long)uaddr;
 491        struct mm_struct *mm = current->mm;
 492        struct page *page, *tail;
 493        struct address_space *mapping;
 494        int err, ro = 0;
 495
 496        /*
 497         * The futex address must be "naturally" aligned.
 498         */
 499        key->both.offset = address % PAGE_SIZE;
 500        if (unlikely((address % sizeof(u32)) != 0))
 501                return -EINVAL;
 502        address -= key->both.offset;
 503
 504        if (unlikely(!access_ok(uaddr, sizeof(u32))))
 505                return -EFAULT;
 506
 507        if (unlikely(should_fail_futex(fshared)))
 508                return -EFAULT;
 509
 510        /*
 511         * PROCESS_PRIVATE futexes are fast.
 512         * As the mm cannot disappear under us and the 'key' only needs
 513         * virtual address, we dont even have to find the underlying vma.
 514         * Note : We do have to check 'uaddr' is a valid user address,
 515         *        but access_ok() should be faster than find_vma()
 516         */
 517        if (!fshared) {
 518                key->private.mm = mm;
 519                key->private.address = address;
 520                return 0;
 521        }
 522
 523again:
 524        /* Ignore any VERIFY_READ mapping (futex common case) */
 525        if (unlikely(should_fail_futex(true)))
 526                return -EFAULT;
 527
 528        err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
 529        /*
 530         * If write access is not required (eg. FUTEX_WAIT), try
 531         * and get read-only access.
 532         */
 533        if (err == -EFAULT && rw == FUTEX_READ) {
 534                err = get_user_pages_fast(address, 1, 0, &page);
 535                ro = 1;
 536        }
 537        if (err < 0)
 538                return err;
 539        else
 540                err = 0;
 541
 542        /*
 543         * The treatment of mapping from this point on is critical. The page
 544         * lock protects many things but in this context the page lock
 545         * stabilizes mapping, prevents inode freeing in the shared
 546         * file-backed region case and guards against movement to swap cache.
 547         *
 548         * Strictly speaking the page lock is not needed in all cases being
 549         * considered here and page lock forces unnecessarily serialization
 550         * From this point on, mapping will be re-verified if necessary and
 551         * page lock will be acquired only if it is unavoidable
 552         *
 553         * Mapping checks require the head page for any compound page so the
 554         * head page and mapping is looked up now. For anonymous pages, it
 555         * does not matter if the page splits in the future as the key is
 556         * based on the address. For filesystem-backed pages, the tail is
 557         * required as the index of the page determines the key. For
 558         * base pages, there is no tail page and tail == page.
 559         */
 560        tail = page;
 561        page = compound_head(page);
 562        mapping = READ_ONCE(page->mapping);
 563
 564        /*
 565         * If page->mapping is NULL, then it cannot be a PageAnon
 566         * page; but it might be the ZERO_PAGE or in the gate area or
 567         * in a special mapping (all cases which we are happy to fail);
 568         * or it may have been a good file page when get_user_pages_fast
 569         * found it, but truncated or holepunched or subjected to
 570         * invalidate_complete_page2 before we got the page lock (also
 571         * cases which we are happy to fail).  And we hold a reference,
 572         * so refcount care in invalidate_complete_page's remove_mapping
 573         * prevents drop_caches from setting mapping to NULL beneath us.
 574         *
 575         * The case we do have to guard against is when memory pressure made
 576         * shmem_writepage move it from filecache to swapcache beneath us:
 577         * an unlikely race, but we do need to retry for page->mapping.
 578         */
 579        if (unlikely(!mapping)) {
 580                int shmem_swizzled;
 581
 582                /*
 583                 * Page lock is required to identify which special case above
 584                 * applies. If this is really a shmem page then the page lock
 585                 * will prevent unexpected transitions.
 586                 */
 587                lock_page(page);
 588                shmem_swizzled = PageSwapCache(page) || page->mapping;
 589                unlock_page(page);
 590                put_page(page);
 591
 592                if (shmem_swizzled)
 593                        goto again;
 594
 595                return -EFAULT;
 596        }
 597
 598        /*
 599         * Private mappings are handled in a simple way.
 600         *
 601         * If the futex key is stored on an anonymous page, then the associated
 602         * object is the mm which is implicitly pinned by the calling process.
 603         *
 604         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 605         * it's a read-only handle, it's expected that futexes attach to
 606         * the object not the particular process.
 607         */
 608        if (PageAnon(page)) {
 609                /*
 610                 * A RO anonymous page will never change and thus doesn't make
 611                 * sense for futex operations.
 612                 */
 613                if (unlikely(should_fail_futex(true)) || ro) {
 614                        err = -EFAULT;
 615                        goto out;
 616                }
 617
 618                key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 619                key->private.mm = mm;
 620                key->private.address = address;
 621
 622        } else {
 623                struct inode *inode;
 624
 625                /*
 626                 * The associated futex object in this case is the inode and
 627                 * the page->mapping must be traversed. Ordinarily this should
 628                 * be stabilised under page lock but it's not strictly
 629                 * necessary in this case as we just want to pin the inode, not
 630                 * update the radix tree or anything like that.
 631                 *
 632                 * The RCU read lock is taken as the inode is finally freed
 633                 * under RCU. If the mapping still matches expectations then the
 634                 * mapping->host can be safely accessed as being a valid inode.
 635                 */
 636                rcu_read_lock();
 637
 638                if (READ_ONCE(page->mapping) != mapping) {
 639                        rcu_read_unlock();
 640                        put_page(page);
 641
 642                        goto again;
 643                }
 644
 645                inode = READ_ONCE(mapping->host);
 646                if (!inode) {
 647                        rcu_read_unlock();
 648                        put_page(page);
 649
 650                        goto again;
 651                }
 652
 653                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 654                key->shared.i_seq = get_inode_sequence_number(inode);
 655                key->shared.pgoff = basepage_index(tail);
 656                rcu_read_unlock();
 657        }
 658
 659out:
 660        put_page(page);
 661        return err;
 662}
 663
 664/**
 665 * fault_in_user_writeable() - Fault in user address and verify RW access
 666 * @uaddr:      pointer to faulting user space address
 667 *
 668 * Slow path to fixup the fault we just took in the atomic write
 669 * access to @uaddr.
 670 *
 671 * We have no generic implementation of a non-destructive write to the
 672 * user address. We know that we faulted in the atomic pagefault
 673 * disabled section so we can as well avoid the #PF overhead by
 674 * calling get_user_pages() right away.
 675 */
 676static int fault_in_user_writeable(u32 __user *uaddr)
 677{
 678        struct mm_struct *mm = current->mm;
 679        int ret;
 680
 681        mmap_read_lock(mm);
 682        ret = fixup_user_fault(mm, (unsigned long)uaddr,
 683                               FAULT_FLAG_WRITE, NULL);
 684        mmap_read_unlock(mm);
 685
 686        return ret < 0 ? ret : 0;
 687}
 688
 689/**
 690 * futex_top_waiter() - Return the highest priority waiter on a futex
 691 * @hb:         the hash bucket the futex_q's reside in
 692 * @key:        the futex key (to distinguish it from other futex futex_q's)
 693 *
 694 * Must be called with the hb lock held.
 695 */
 696static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
 697                                        union futex_key *key)
 698{
 699        struct futex_q *this;
 700
 701        plist_for_each_entry(this, &hb->chain, list) {
 702                if (match_futex(&this->key, key))
 703                        return this;
 704        }
 705        return NULL;
 706}
 707
 708static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
 709                                      u32 uval, u32 newval)
 710{
 711        int ret;
 712
 713        pagefault_disable();
 714        ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
 715        pagefault_enable();
 716
 717        return ret;
 718}
 719
 720static int get_futex_value_locked(u32 *dest, u32 __user *from)
 721{
 722        int ret;
 723
 724        pagefault_disable();
 725        ret = __get_user(*dest, from);
 726        pagefault_enable();
 727
 728        return ret ? -EFAULT : 0;
 729}
 730
 731
 732/*
 733 * PI code:
 734 */
 735static int refill_pi_state_cache(void)
 736{
 737        struct futex_pi_state *pi_state;
 738
 739        if (likely(current->pi_state_cache))
 740                return 0;
 741
 742        pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
 743
 744        if (!pi_state)
 745                return -ENOMEM;
 746
 747        INIT_LIST_HEAD(&pi_state->list);
 748        /* pi_mutex gets initialized later */
 749        pi_state->owner = NULL;
 750        refcount_set(&pi_state->refcount, 1);
 751        pi_state->key = FUTEX_KEY_INIT;
 752
 753        current->pi_state_cache = pi_state;
 754
 755        return 0;
 756}
 757
 758static struct futex_pi_state *alloc_pi_state(void)
 759{
 760        struct futex_pi_state *pi_state = current->pi_state_cache;
 761
 762        WARN_ON(!pi_state);
 763        current->pi_state_cache = NULL;
 764
 765        return pi_state;
 766}
 767
 768static void get_pi_state(struct futex_pi_state *pi_state)
 769{
 770        WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
 771}
 772
 773/*
 774 * Drops a reference to the pi_state object and frees or caches it
 775 * when the last reference is gone.
 776 */
 777static void put_pi_state(struct futex_pi_state *pi_state)
 778{
 779        if (!pi_state)
 780                return;
 781
 782        if (!refcount_dec_and_test(&pi_state->refcount))
 783                return;
 784
 785        /*
 786         * If pi_state->owner is NULL, the owner is most probably dying
 787         * and has cleaned up the pi_state already
 788         */
 789        if (pi_state->owner) {
 790                struct task_struct *owner;
 791                unsigned long flags;
 792
 793                raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
 794                owner = pi_state->owner;
 795                if (owner) {
 796                        raw_spin_lock(&owner->pi_lock);
 797                        list_del_init(&pi_state->list);
 798                        raw_spin_unlock(&owner->pi_lock);
 799                }
 800                rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
 801                raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
 802        }
 803
 804        if (current->pi_state_cache) {
 805                kfree(pi_state);
 806        } else {
 807                /*
 808                 * pi_state->list is already empty.
 809                 * clear pi_state->owner.
 810                 * refcount is at 0 - put it back to 1.
 811                 */
 812                pi_state->owner = NULL;
 813                refcount_set(&pi_state->refcount, 1);
 814                current->pi_state_cache = pi_state;
 815        }
 816}
 817
 818#ifdef CONFIG_FUTEX_PI
 819
 820/*
 821 * This task is holding PI mutexes at exit time => bad.
 822 * Kernel cleans up PI-state, but userspace is likely hosed.
 823 * (Robust-futex cleanup is separate and might save the day for userspace.)
 824 */
 825static void exit_pi_state_list(struct task_struct *curr)
 826{
 827        struct list_head *next, *head = &curr->pi_state_list;
 828        struct futex_pi_state *pi_state;
 829        struct futex_hash_bucket *hb;
 830        union futex_key key = FUTEX_KEY_INIT;
 831
 832        if (!futex_cmpxchg_enabled)
 833                return;
 834        /*
 835         * We are a ZOMBIE and nobody can enqueue itself on
 836         * pi_state_list anymore, but we have to be careful
 837         * versus waiters unqueueing themselves:
 838         */
 839        raw_spin_lock_irq(&curr->pi_lock);
 840        while (!list_empty(head)) {
 841                next = head->next;
 842                pi_state = list_entry(next, struct futex_pi_state, list);
 843                key = pi_state->key;
 844                hb = hash_futex(&key);
 845
 846                /*
 847                 * We can race against put_pi_state() removing itself from the
 848                 * list (a waiter going away). put_pi_state() will first
 849                 * decrement the reference count and then modify the list, so
 850                 * its possible to see the list entry but fail this reference
 851                 * acquire.
 852                 *
 853                 * In that case; drop the locks to let put_pi_state() make
 854                 * progress and retry the loop.
 855                 */
 856                if (!refcount_inc_not_zero(&pi_state->refcount)) {
 857                        raw_spin_unlock_irq(&curr->pi_lock);
 858                        cpu_relax();
 859                        raw_spin_lock_irq(&curr->pi_lock);
 860                        continue;
 861                }
 862                raw_spin_unlock_irq(&curr->pi_lock);
 863
 864                spin_lock(&hb->lock);
 865                raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 866                raw_spin_lock(&curr->pi_lock);
 867                /*
 868                 * We dropped the pi-lock, so re-check whether this
 869                 * task still owns the PI-state:
 870                 */
 871                if (head->next != next) {
 872                        /* retain curr->pi_lock for the loop invariant */
 873                        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
 874                        spin_unlock(&hb->lock);
 875                        put_pi_state(pi_state);
 876                        continue;
 877                }
 878
 879                WARN_ON(pi_state->owner != curr);
 880                WARN_ON(list_empty(&pi_state->list));
 881                list_del_init(&pi_state->list);
 882                pi_state->owner = NULL;
 883
 884                raw_spin_unlock(&curr->pi_lock);
 885                raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 886                spin_unlock(&hb->lock);
 887
 888                rt_mutex_futex_unlock(&pi_state->pi_mutex);
 889                put_pi_state(pi_state);
 890
 891                raw_spin_lock_irq(&curr->pi_lock);
 892        }
 893        raw_spin_unlock_irq(&curr->pi_lock);
 894}
 895#else
 896static inline void exit_pi_state_list(struct task_struct *curr) { }
 897#endif
 898
 899/*
 900 * We need to check the following states:
 901 *
 902 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 903 *
 904 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 905 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 906 *
 907 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 908 *
 909 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 910 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 911 *
 912 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 913 *
 914 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 915 *
 916 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 917 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 918 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 919 *
 920 * [1]  Indicates that the kernel can acquire the futex atomically. We
 921 *      came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 922 *
 923 * [2]  Valid, if TID does not belong to a kernel thread. If no matching
 924 *      thread is found then it indicates that the owner TID has died.
 925 *
 926 * [3]  Invalid. The waiter is queued on a non PI futex
 927 *
 928 * [4]  Valid state after exit_robust_list(), which sets the user space
 929 *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 930 *
 931 * [5]  The user space value got manipulated between exit_robust_list()
 932 *      and exit_pi_state_list()
 933 *
 934 * [6]  Valid state after exit_pi_state_list() which sets the new owner in
 935 *      the pi_state but cannot access the user space value.
 936 *
 937 * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 938 *
 939 * [8]  Owner and user space value match
 940 *
 941 * [9]  There is no transient state which sets the user space TID to 0
 942 *      except exit_robust_list(), but this is indicated by the
 943 *      FUTEX_OWNER_DIED bit. See [4]
 944 *
 945 * [10] There is no transient state which leaves owner and user space
 946 *      TID out of sync.
 947 *
 948 *
 949 * Serialization and lifetime rules:
 950 *
 951 * hb->lock:
 952 *
 953 *      hb -> futex_q, relation
 954 *      futex_q -> pi_state, relation
 955 *
 956 *      (cannot be raw because hb can contain arbitrary amount
 957 *       of futex_q's)
 958 *
 959 * pi_mutex->wait_lock:
 960 *
 961 *      {uval, pi_state}
 962 *
 963 *      (and pi_mutex 'obviously')
 964 *
 965 * p->pi_lock:
 966 *
 967 *      p->pi_state_list -> pi_state->list, relation
 968 *
 969 * pi_state->refcount:
 970 *
 971 *      pi_state lifetime
 972 *
 973 *
 974 * Lock order:
 975 *
 976 *   hb->lock
 977 *     pi_mutex->wait_lock
 978 *       p->pi_lock
 979 *
 980 */
 981
 982/*
 983 * Validate that the existing waiter has a pi_state and sanity check
 984 * the pi_state against the user space value. If correct, attach to
 985 * it.
 986 */
 987static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
 988                              struct futex_pi_state *pi_state,
 989                              struct futex_pi_state **ps)
 990{
 991        pid_t pid = uval & FUTEX_TID_MASK;
 992        u32 uval2;
 993        int ret;
 994
 995        /*
 996         * Userspace might have messed up non-PI and PI futexes [3]
 997         */
 998        if (unlikely(!pi_state))
 999                return -EINVAL;
1000
1001        /*
1002         * We get here with hb->lock held, and having found a
1003         * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
1004         * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
1005         * which in turn means that futex_lock_pi() still has a reference on
1006         * our pi_state.
1007         *
1008         * The waiter holding a reference on @pi_state also protects against
1009         * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
1010         * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
1011         * free pi_state before we can take a reference ourselves.
1012         */
1013        WARN_ON(!refcount_read(&pi_state->refcount));
1014
1015        /*
1016         * Now that we have a pi_state, we can acquire wait_lock
1017         * and do the state validation.
1018         */
1019        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1020
1021        /*
1022         * Since {uval, pi_state} is serialized by wait_lock, and our current
1023         * uval was read without holding it, it can have changed. Verify it
1024         * still is what we expect it to be, otherwise retry the entire
1025         * operation.
1026         */
1027        if (get_futex_value_locked(&uval2, uaddr))
1028                goto out_efault;
1029
1030        if (uval != uval2)
1031                goto out_eagain;
1032
1033        /*
1034         * Handle the owner died case:
1035         */
1036        if (uval & FUTEX_OWNER_DIED) {
1037                /*
1038                 * exit_pi_state_list sets owner to NULL and wakes the
1039                 * topmost waiter. The task which acquires the
1040                 * pi_state->rt_mutex will fixup owner.
1041                 */
1042                if (!pi_state->owner) {
1043                        /*
1044                         * No pi state owner, but the user space TID
1045                         * is not 0. Inconsistent state. [5]
1046                         */
1047                        if (pid)
1048                                goto out_einval;
1049                        /*
1050                         * Take a ref on the state and return success. [4]
1051                         */
1052                        goto out_attach;
1053                }
1054
1055                /*
1056                 * If TID is 0, then either the dying owner has not
1057                 * yet executed exit_pi_state_list() or some waiter
1058                 * acquired the rtmutex in the pi state, but did not
1059                 * yet fixup the TID in user space.
1060                 *
1061                 * Take a ref on the state and return success. [6]
1062                 */
1063                if (!pid)
1064                        goto out_attach;
1065        } else {
1066                /*
1067                 * If the owner died bit is not set, then the pi_state
1068                 * must have an owner. [7]
1069                 */
1070                if (!pi_state->owner)
1071                        goto out_einval;
1072        }
1073
1074        /*
1075         * Bail out if user space manipulated the futex value. If pi
1076         * state exists then the owner TID must be the same as the
1077         * user space TID. [9/10]
1078         */
1079        if (pid != task_pid_vnr(pi_state->owner))
1080                goto out_einval;
1081
1082out_attach:
1083        get_pi_state(pi_state);
1084        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1085        *ps = pi_state;
1086        return 0;
1087
1088out_einval:
1089        ret = -EINVAL;
1090        goto out_error;
1091
1092out_eagain:
1093        ret = -EAGAIN;
1094        goto out_error;
1095
1096out_efault:
1097        ret = -EFAULT;
1098        goto out_error;
1099
1100out_error:
1101        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1102        return ret;
1103}
1104
1105/**
1106 * wait_for_owner_exiting - Block until the owner has exited
1107 * @ret: owner's current futex lock status
1108 * @exiting:    Pointer to the exiting task
1109 *
1110 * Caller must hold a refcount on @exiting.
1111 */
1112static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
1113{
1114        if (ret != -EBUSY) {
1115                WARN_ON_ONCE(exiting);
1116                return;
1117        }
1118
1119        if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
1120                return;
1121
1122        mutex_lock(&exiting->futex_exit_mutex);
1123        /*
1124         * No point in doing state checking here. If the waiter got here
1125         * while the task was in exec()->exec_futex_release() then it can
1126         * have any FUTEX_STATE_* value when the waiter has acquired the
1127         * mutex. OK, if running, EXITING or DEAD if it reached exit()
1128         * already. Highly unlikely and not a problem. Just one more round
1129         * through the futex maze.
1130         */
1131        mutex_unlock(&exiting->futex_exit_mutex);
1132
1133        put_task_struct(exiting);
1134}
1135
1136static int handle_exit_race(u32 __user *uaddr, u32 uval,
1137                            struct task_struct *tsk)
1138{
1139        u32 uval2;
1140
1141        /*
1142         * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
1143         * caller that the alleged owner is busy.
1144         */
1145        if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
1146                return -EBUSY;
1147
1148        /*
1149         * Reread the user space value to handle the following situation:
1150         *
1151         * CPU0                         CPU1
1152         *
1153         * sys_exit()                   sys_futex()
1154         *  do_exit()                    futex_lock_pi()
1155         *                                futex_lock_pi_atomic()
1156         *   exit_signals(tsk)              No waiters:
1157         *    tsk->flags |= PF_EXITING;     *uaddr == 0x00000PID
1158         *  mm_release(tsk)                 Set waiter bit
1159         *   exit_robust_list(tsk) {        *uaddr = 0x80000PID;
1160         *      Set owner died              attach_to_pi_owner() {
1161         *    *uaddr = 0xC0000000;           tsk = get_task(PID);
1162         *   }                               if (!tsk->flags & PF_EXITING) {
1163         *  ...                                attach();
1164         *  tsk->futex_state =               } else {
1165         *      FUTEX_STATE_DEAD;              if (tsk->futex_state !=
1166         *                                        FUTEX_STATE_DEAD)
1167         *                                       return -EAGAIN;
1168         *                                     return -ESRCH; <--- FAIL
1169         *                                   }
1170         *
1171         * Returning ESRCH unconditionally is wrong here because the
1172         * user space value has been changed by the exiting task.
1173         *
1174         * The same logic applies to the case where the exiting task is
1175         * already gone.
1176         */
1177        if (get_futex_value_locked(&uval2, uaddr))
1178                return -EFAULT;
1179
1180        /* If the user space value has changed, try again. */
1181        if (uval2 != uval)
1182                return -EAGAIN;
1183
1184        /*
1185         * The exiting task did not have a robust list, the robust list was
1186         * corrupted or the user space value in *uaddr is simply bogus.
1187         * Give up and tell user space.
1188         */
1189        return -ESRCH;
1190}
1191
1192/*
1193 * Lookup the task for the TID provided from user space and attach to
1194 * it after doing proper sanity checks.
1195 */
1196static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
1197                              struct futex_pi_state **ps,
1198                              struct task_struct **exiting)
1199{
1200        pid_t pid = uval & FUTEX_TID_MASK;
1201        struct futex_pi_state *pi_state;
1202        struct task_struct *p;
1203
1204        /*
1205         * We are the first waiter - try to look up the real owner and attach
1206         * the new pi_state to it, but bail out when TID = 0 [1]
1207         *
1208         * The !pid check is paranoid. None of the call sites should end up
1209         * with pid == 0, but better safe than sorry. Let the caller retry
1210         */
1211        if (!pid)
1212                return -EAGAIN;
1213        p = find_get_task_by_vpid(pid);
1214        if (!p)
1215                return handle_exit_race(uaddr, uval, NULL);
1216
1217        if (unlikely(p->flags & PF_KTHREAD)) {
1218                put_task_struct(p);
1219                return -EPERM;
1220        }
1221
1222        /*
1223         * We need to look at the task state to figure out, whether the
1224         * task is exiting. To protect against the change of the task state
1225         * in futex_exit_release(), we do this protected by p->pi_lock:
1226         */
1227        raw_spin_lock_irq(&p->pi_lock);
1228        if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
1229                /*
1230                 * The task is on the way out. When the futex state is
1231                 * FUTEX_STATE_DEAD, we know that the task has finished
1232                 * the cleanup:
1233                 */
1234                int ret = handle_exit_race(uaddr, uval, p);
1235
1236                raw_spin_unlock_irq(&p->pi_lock);
1237                /*
1238                 * If the owner task is between FUTEX_STATE_EXITING and
1239                 * FUTEX_STATE_DEAD then store the task pointer and keep
1240                 * the reference on the task struct. The calling code will
1241                 * drop all locks, wait for the task to reach
1242                 * FUTEX_STATE_DEAD and then drop the refcount. This is
1243                 * required to prevent a live lock when the current task
1244                 * preempted the exiting task between the two states.
1245                 */
1246                if (ret == -EBUSY)
1247                        *exiting = p;
1248                else
1249                        put_task_struct(p);
1250                return ret;
1251        }
1252
1253        /*
1254         * No existing pi state. First waiter. [2]
1255         *
1256         * This creates pi_state, we have hb->lock held, this means nothing can
1257         * observe this state, wait_lock is irrelevant.
1258         */
1259        pi_state = alloc_pi_state();
1260
1261        /*
1262         * Initialize the pi_mutex in locked state and make @p
1263         * the owner of it:
1264         */
1265        rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
1266
1267        /* Store the key for possible exit cleanups: */
1268        pi_state->key = *key;
1269
1270        WARN_ON(!list_empty(&pi_state->list));
1271        list_add(&pi_state->list, &p->pi_state_list);
1272        /*
1273         * Assignment without holding pi_state->pi_mutex.wait_lock is safe
1274         * because there is no concurrency as the object is not published yet.
1275         */
1276        pi_state->owner = p;
1277        raw_spin_unlock_irq(&p->pi_lock);
1278
1279        put_task_struct(p);
1280
1281        *ps = pi_state;
1282
1283        return 0;
1284}
1285
1286static int lookup_pi_state(u32 __user *uaddr, u32 uval,
1287                           struct futex_hash_bucket *hb,
1288                           union futex_key *key, struct futex_pi_state **ps,
1289                           struct task_struct **exiting)
1290{
1291        struct futex_q *top_waiter = futex_top_waiter(hb, key);
1292
1293        /*
1294         * If there is a waiter on that futex, validate it and
1295         * attach to the pi_state when the validation succeeds.
1296         */
1297        if (top_waiter)
1298                return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
1299
1300        /*
1301         * We are the first waiter - try to look up the owner based on
1302         * @uval and attach to it.
1303         */
1304        return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
1305}
1306
1307static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
1308{
1309        int err;
1310        u32 curval;
1311
1312        if (unlikely(should_fail_futex(true)))
1313                return -EFAULT;
1314
1315        err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
1316        if (unlikely(err))
1317                return err;
1318
1319        /* If user space value changed, let the caller retry */
1320        return curval != uval ? -EAGAIN : 0;
1321}
1322
1323/**
1324 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
1325 * @uaddr:              the pi futex user address
1326 * @hb:                 the pi futex hash bucket
1327 * @key:                the futex key associated with uaddr and hb
1328 * @ps:                 the pi_state pointer where we store the result of the
1329 *                      lookup
1330 * @task:               the task to perform the atomic lock work for.  This will
1331 *                      be "current" except in the case of requeue pi.
1332 * @exiting:            Pointer to store the task pointer of the owner task
1333 *                      which is in the middle of exiting
1334 * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
1335 *
1336 * Return:
1337 *  -  0 - ready to wait;
1338 *  -  1 - acquired the lock;
1339 *  - <0 - error
1340 *
1341 * The hb->lock and futex_key refs shall be held by the caller.
1342 *
1343 * @exiting is only set when the return value is -EBUSY. If so, this holds
1344 * a refcount on the exiting task on return and the caller needs to drop it
1345 * after waiting for the exit to complete.
1346 */
1347static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
1348                                union futex_key *key,
1349                                struct futex_pi_state **ps,
1350                                struct task_struct *task,
1351                                struct task_struct **exiting,
1352                                int set_waiters)
1353{
1354        u32 uval, newval, vpid = task_pid_vnr(task);
1355        struct futex_q *top_waiter;
1356        int ret;
1357
1358        /*
1359         * Read the user space value first so we can validate a few
1360         * things before proceeding further.
1361         */
1362        if (get_futex_value_locked(&uval, uaddr))
1363                return -EFAULT;
1364
1365        if (unlikely(should_fail_futex(true)))
1366                return -EFAULT;
1367
1368        /*
1369         * Detect deadlocks.
1370         */
1371        if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
1372                return -EDEADLK;
1373
1374        if ((unlikely(should_fail_futex(true))))
1375                return -EDEADLK;
1376
1377        /*
1378         * Lookup existing state first. If it exists, try to attach to
1379         * its pi_state.
1380         */
1381        top_waiter = futex_top_waiter(hb, key);
1382        if (top_waiter)
1383                return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
1384
1385        /*
1386         * No waiter and user TID is 0. We are here because the
1387         * waiters or the owner died bit is set or called from
1388         * requeue_cmp_pi or for whatever reason something took the
1389         * syscall.
1390         */
1391        if (!(uval & FUTEX_TID_MASK)) {
1392                /*
1393                 * We take over the futex. No other waiters and the user space
1394                 * TID is 0. We preserve the owner died bit.
1395                 */
1396                newval = uval & FUTEX_OWNER_DIED;
1397                newval |= vpid;
1398
1399                /* The futex requeue_pi code can enforce the waiters bit */
1400                if (set_waiters)
1401                        newval |= FUTEX_WAITERS;
1402
1403                ret = lock_pi_update_atomic(uaddr, uval, newval);
1404                /* If the take over worked, return 1 */
1405                return ret < 0 ? ret : 1;
1406        }
1407
1408        /*
1409         * First waiter. Set the waiters bit before attaching ourself to
1410         * the owner. If owner tries to unlock, it will be forced into
1411         * the kernel and blocked on hb->lock.
1412         */
1413        newval = uval | FUTEX_WAITERS;
1414        ret = lock_pi_update_atomic(uaddr, uval, newval);
1415        if (ret)
1416                return ret;
1417        /*
1418         * If the update of the user space value succeeded, we try to
1419         * attach to the owner. If that fails, no harm done, we only
1420         * set the FUTEX_WAITERS bit in the user space variable.
1421         */
1422        return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
1423}
1424
1425/**
1426 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
1427 * @q:  The futex_q to unqueue
1428 *
1429 * The q->lock_ptr must not be NULL and must be held by the caller.
1430 */
1431static void __unqueue_futex(struct futex_q *q)
1432{
1433        struct futex_hash_bucket *hb;
1434
1435        if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
1436                return;
1437        lockdep_assert_held(q->lock_ptr);
1438
1439        hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
1440        plist_del(&q->list, &hb->chain);
1441        hb_waiters_dec(hb);
1442}
1443
1444/*
1445 * The hash bucket lock must be held when this is called.
1446 * Afterwards, the futex_q must not be accessed. Callers
1447 * must ensure to later call wake_up_q() for the actual
1448 * wakeups to occur.
1449 */
1450static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
1451{
1452        struct task_struct *p = q->task;
1453
1454        if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
1455                return;
1456
1457        get_task_struct(p);
1458        __unqueue_futex(q);
1459        /*
1460         * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
1461         * is written, without taking any locks. This is possible in the event
1462         * of a spurious wakeup, for example. A memory barrier is required here
1463         * to prevent the following store to lock_ptr from getting ahead of the
1464         * plist_del in __unqueue_futex().
1465         */
1466        smp_store_release(&q->lock_ptr, NULL);
1467
1468        /*
1469         * Queue the task for later wakeup for after we've released
1470         * the hb->lock.
1471         */
1472        wake_q_add_safe(wake_q, p);
1473}
1474
1475/*
1476 * Caller must hold a reference on @pi_state.
1477 */
1478static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
1479{
1480        u32 curval, newval;
1481        struct task_struct *new_owner;
1482        bool postunlock = false;
1483        DEFINE_WAKE_Q(wake_q);
1484        int ret = 0;
1485
1486        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
1487        if (WARN_ON_ONCE(!new_owner)) {
1488                /*
1489                 * As per the comment in futex_unlock_pi() this should not happen.
1490                 *
1491                 * When this happens, give up our locks and try again, giving
1492                 * the futex_lock_pi() instance time to complete, either by
1493                 * waiting on the rtmutex or removing itself from the futex
1494                 * queue.
1495                 */
1496                ret = -EAGAIN;
1497                goto out_unlock;
1498        }
1499
1500        /*
1501         * We pass it to the next owner. The WAITERS bit is always kept
1502         * enabled while there is PI state around. We cleanup the owner
1503         * died bit, because we are the owner.
1504         */
1505        newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1506
1507        if (unlikely(should_fail_futex(true))) {
1508                ret = -EFAULT;
1509                goto out_unlock;
1510        }
1511
1512        ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
1513        if (!ret && (curval != uval)) {
1514                /*
1515                 * If a unconditional UNLOCK_PI operation (user space did not
1516                 * try the TID->0 transition) raced with a waiter setting the
1517                 * FUTEX_WAITERS flag between get_user() and locking the hash
1518                 * bucket lock, retry the operation.
1519                 */
1520                if ((FUTEX_TID_MASK & curval) == uval)
1521                        ret = -EAGAIN;
1522                else
1523                        ret = -EINVAL;
1524        }
1525
1526        if (ret)
1527                goto out_unlock;
1528
1529        /*
1530         * This is a point of no return; once we modify the uval there is no
1531         * going back and subsequent operations must not fail.
1532         */
1533
1534        raw_spin_lock(&pi_state->owner->pi_lock);
1535        WARN_ON(list_empty(&pi_state->list));
1536        list_del_init(&pi_state->list);
1537        raw_spin_unlock(&pi_state->owner->pi_lock);
1538
1539        raw_spin_lock(&new_owner->pi_lock);
1540        WARN_ON(!list_empty(&pi_state->list));
1541        list_add(&pi_state->list, &new_owner->pi_state_list);
1542        pi_state->owner = new_owner;
1543        raw_spin_unlock(&new_owner->pi_lock);
1544
1545        postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
1546
1547out_unlock:
1548        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1549
1550        if (postunlock)
1551                rt_mutex_postunlock(&wake_q);
1552
1553        return ret;
1554}
1555
1556/*
1557 * Express the locking dependencies for lockdep:
1558 */
1559static inline void
1560double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1561{
1562        if (hb1 <= hb2) {
1563                spin_lock(&hb1->lock);
1564                if (hb1 < hb2)
1565                        spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
1566        } else { /* hb1 > hb2 */
1567                spin_lock(&hb2->lock);
1568                spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
1569        }
1570}
1571
1572static inline void
1573double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1574{
1575        spin_unlock(&hb1->lock);
1576        if (hb1 != hb2)
1577                spin_unlock(&hb2->lock);
1578}
1579
1580/*
1581 * Wake up waiters matching bitset queued on this futex (uaddr).
1582 */
1583static int
1584futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1585{
1586        struct futex_hash_bucket *hb;
1587        struct futex_q *this, *next;
1588        union futex_key key = FUTEX_KEY_INIT;
1589        int ret;
1590        DEFINE_WAKE_Q(wake_q);
1591
1592        if (!bitset)
1593                return -EINVAL;
1594
1595        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
1596        if (unlikely(ret != 0))
1597                return ret;
1598
1599        hb = hash_futex(&key);
1600
1601        /* Make sure we really have tasks to wakeup */
1602        if (!hb_waiters_pending(hb))
1603                return ret;
1604
1605        spin_lock(&hb->lock);
1606
1607        plist_for_each_entry_safe(this, next, &hb->chain, list) {
1608                if (match_futex (&this->key, &key)) {
1609                        if (this->pi_state || this->rt_waiter) {
1610                                ret = -EINVAL;
1611                                break;
1612                        }
1613
1614                        /* Check if one of the bits is set in both bitsets */
1615                        if (!(this->bitset & bitset))
1616                                continue;
1617
1618                        mark_wake_futex(&wake_q, this);
1619                        if (++ret >= nr_wake)
1620                                break;
1621                }
1622        }
1623
1624        spin_unlock(&hb->lock);
1625        wake_up_q(&wake_q);
1626        return ret;
1627}
1628
1629static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
1630{
1631        unsigned int op =         (encoded_op & 0x70000000) >> 28;
1632        unsigned int cmp =        (encoded_op & 0x0f000000) >> 24;
1633        int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
1634        int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
1635        int oldval, ret;
1636
1637        if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
1638                if (oparg < 0 || oparg > 31) {
1639                        char comm[sizeof(current->comm)];
1640                        /*
1641                         * kill this print and return -EINVAL when userspace
1642                         * is sane again
1643                         */
1644                        pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
1645                                        get_task_comm(comm, current), oparg);
1646                        oparg &= 31;
1647                }
1648                oparg = 1 << oparg;
1649        }
1650
1651        pagefault_disable();
1652        ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
1653        pagefault_enable();
1654        if (ret)
1655                return ret;
1656
1657        switch (cmp) {
1658        case FUTEX_OP_CMP_EQ:
1659                return oldval == cmparg;
1660        case FUTEX_OP_CMP_NE:
1661                return oldval != cmparg;
1662        case FUTEX_OP_CMP_LT:
1663                return oldval < cmparg;
1664        case FUTEX_OP_CMP_GE:
1665                return oldval >= cmparg;
1666        case FUTEX_OP_CMP_LE:
1667                return oldval <= cmparg;
1668        case FUTEX_OP_CMP_GT:
1669                return oldval > cmparg;
1670        default:
1671                return -ENOSYS;
1672        }
1673}
1674
1675/*
1676 * Wake up all waiters hashed on the physical page that is mapped
1677 * to this virtual address:
1678 */
1679static int
1680futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1681              int nr_wake, int nr_wake2, int op)
1682{
1683        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1684        struct futex_hash_bucket *hb1, *hb2;
1685        struct futex_q *this, *next;
1686        int ret, op_ret;
1687        DEFINE_WAKE_Q(wake_q);
1688
1689retry:
1690        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
1691        if (unlikely(ret != 0))
1692                return ret;
1693        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
1694        if (unlikely(ret != 0))
1695                return ret;
1696
1697        hb1 = hash_futex(&key1);
1698        hb2 = hash_futex(&key2);
1699
1700retry_private:
1701        double_lock_hb(hb1, hb2);
1702        op_ret = futex_atomic_op_inuser(op, uaddr2);
1703        if (unlikely(op_ret < 0)) {
1704                double_unlock_hb(hb1, hb2);
1705
1706                if (!IS_ENABLED(CONFIG_MMU) ||
1707                    unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
1708                        /*
1709                         * we don't get EFAULT from MMU faults if we don't have
1710                         * an MMU, but we might get them from range checking
1711                         */
1712                        ret = op_ret;
1713                        return ret;
1714                }
1715
1716                if (op_ret == -EFAULT) {
1717                        ret = fault_in_user_writeable(uaddr2);
1718                        if (ret)
1719                                return ret;
1720                }
1721
1722                if (!(flags & FLAGS_SHARED)) {
1723                        cond_resched();
1724                        goto retry_private;
1725                }
1726
1727                cond_resched();
1728                goto retry;
1729        }
1730
1731        plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1732                if (match_futex (&this->key, &key1)) {
1733                        if (this->pi_state || this->rt_waiter) {
1734                                ret = -EINVAL;
1735                                goto out_unlock;
1736                        }
1737                        mark_wake_futex(&wake_q, this);
1738                        if (++ret >= nr_wake)
1739                                break;
1740                }
1741        }
1742
1743        if (op_ret > 0) {
1744                op_ret = 0;
1745                plist_for_each_entry_safe(this, next, &hb2->chain, list) {
1746                        if (match_futex (&this->key, &key2)) {
1747                                if (this->pi_state || this->rt_waiter) {
1748                                        ret = -EINVAL;
1749                                        goto out_unlock;
1750                                }
1751                                mark_wake_futex(&wake_q, this);
1752                                if (++op_ret >= nr_wake2)
1753                                        break;
1754                        }
1755                }
1756                ret += op_ret;
1757        }
1758
1759out_unlock:
1760        double_unlock_hb(hb1, hb2);
1761        wake_up_q(&wake_q);
1762        return ret;
1763}
1764
1765/**
1766 * requeue_futex() - Requeue a futex_q from one hb to another
1767 * @q:          the futex_q to requeue
1768 * @hb1:        the source hash_bucket
1769 * @hb2:        the target hash_bucket
1770 * @key2:       the new key for the requeued futex_q
1771 */
1772static inline
1773void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1774                   struct futex_hash_bucket *hb2, union futex_key *key2)
1775{
1776
1777        /*
1778         * If key1 and key2 hash to the same bucket, no need to
1779         * requeue.
1780         */
1781        if (likely(&hb1->chain != &hb2->chain)) {
1782                plist_del(&q->list, &hb1->chain);
1783                hb_waiters_dec(hb1);
1784                hb_waiters_inc(hb2);
1785                plist_add(&q->list, &hb2->chain);
1786                q->lock_ptr = &hb2->lock;
1787        }
1788        q->key = *key2;
1789}
1790
1791/**
1792 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1793 * @q:          the futex_q
1794 * @key:        the key of the requeue target futex
1795 * @hb:         the hash_bucket of the requeue target futex
1796 *
1797 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1798 * target futex if it is uncontended or via a lock steal.  Set the futex_q key
1799 * to the requeue target futex so the waiter can detect the wakeup on the right
1800 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1801 * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
1802 * to protect access to the pi_state to fixup the owner later.  Must be called
1803 * with both q->lock_ptr and hb->lock held.
1804 */
1805static inline
1806void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1807                           struct futex_hash_bucket *hb)
1808{
1809        q->key = *key;
1810
1811        __unqueue_futex(q);
1812
1813        WARN_ON(!q->rt_waiter);
1814        q->rt_waiter = NULL;
1815
1816        q->lock_ptr = &hb->lock;
1817
1818        wake_up_state(q->task, TASK_NORMAL);
1819}
1820
1821/**
1822 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1823 * @pifutex:            the user address of the to futex
1824 * @hb1:                the from futex hash bucket, must be locked by the caller
1825 * @hb2:                the to futex hash bucket, must be locked by the caller
1826 * @key1:               the from futex key
1827 * @key2:               the to futex key
1828 * @ps:                 address to store the pi_state pointer
1829 * @exiting:            Pointer to store the task pointer of the owner task
1830 *                      which is in the middle of exiting
1831 * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
1832 *
1833 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1834 * Wake the top waiter if we succeed.  If the caller specified set_waiters,
1835 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1836 * hb1 and hb2 must be held by the caller.
1837 *
1838 * @exiting is only set when the return value is -EBUSY. If so, this holds
1839 * a refcount on the exiting task on return and the caller needs to drop it
1840 * after waiting for the exit to complete.
1841 *
1842 * Return:
1843 *  -  0 - failed to acquire the lock atomically;
1844 *  - >0 - acquired the lock, return value is vpid of the top_waiter
1845 *  - <0 - error
1846 */
1847static int
1848futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
1849                           struct futex_hash_bucket *hb2, union futex_key *key1,
1850                           union futex_key *key2, struct futex_pi_state **ps,
1851                           struct task_struct **exiting, int set_waiters)
1852{
1853        struct futex_q *top_waiter = NULL;
1854        u32 curval;
1855        int ret, vpid;
1856
1857        if (get_futex_value_locked(&curval, pifutex))
1858                return -EFAULT;
1859
1860        if (unlikely(should_fail_futex(true)))
1861                return -EFAULT;
1862
1863        /*
1864         * Find the top_waiter and determine if there are additional waiters.
1865         * If the caller intends to requeue more than 1 waiter to pifutex,
1866         * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1867         * as we have means to handle the possible fault.  If not, don't set
1868         * the bit unecessarily as it will force the subsequent unlock to enter
1869         * the kernel.
1870         */
1871        top_waiter = futex_top_waiter(hb1, key1);
1872
1873        /* There are no waiters, nothing for us to do. */
1874        if (!top_waiter)
1875                return 0;
1876
1877        /* Ensure we requeue to the expected futex. */
1878        if (!match_futex(top_waiter->requeue_pi_key, key2))
1879                return -EINVAL;
1880
1881        /*
1882         * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
1883         * the contended case or if set_waiters is 1.  The pi_state is returned
1884         * in ps in contended cases.
1885         */
1886        vpid = task_pid_vnr(top_waiter->task);
1887        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1888                                   exiting, set_waiters);
1889        if (ret == 1) {
1890                requeue_pi_wake_futex(top_waiter, key2, hb2);
1891                return vpid;
1892        }
1893        return ret;
1894}
1895
1896/**
1897 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1898 * @uaddr1:     source futex user address
1899 * @flags:      futex flags (FLAGS_SHARED, etc.)
1900 * @uaddr2:     target futex user address
1901 * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
1902 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1903 * @cmpval:     @uaddr1 expected value (or %NULL)
1904 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1905 *              pi futex (pi to pi requeue is not supported)
1906 *
1907 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1908 * uaddr2 atomically on behalf of the top waiter.
1909 *
1910 * Return:
1911 *  - >=0 - on success, the number of tasks requeued or woken;
1912 *  -  <0 - on error
1913 */
1914static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1915                         u32 __user *uaddr2, int nr_wake, int nr_requeue,
1916                         u32 *cmpval, int requeue_pi)
1917{
1918        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1919        int task_count = 0, ret;
1920        struct futex_pi_state *pi_state = NULL;
1921        struct futex_hash_bucket *hb1, *hb2;
1922        struct futex_q *this, *next;
1923        DEFINE_WAKE_Q(wake_q);
1924
1925        if (nr_wake < 0 || nr_requeue < 0)
1926                return -EINVAL;
1927
1928        /*
1929         * When PI not supported: return -ENOSYS if requeue_pi is true,
1930         * consequently the compiler knows requeue_pi is always false past
1931         * this point which will optimize away all the conditional code
1932         * further down.
1933         */
1934        if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
1935                return -ENOSYS;
1936
1937        if (requeue_pi) {
1938                /*
1939                 * Requeue PI only works on two distinct uaddrs. This
1940                 * check is only valid for private futexes. See below.
1941                 */
1942                if (uaddr1 == uaddr2)
1943                        return -EINVAL;
1944
1945                /*
1946                 * requeue_pi requires a pi_state, try to allocate it now
1947                 * without any locks in case it fails.
1948                 */
1949                if (refill_pi_state_cache())
1950                        return -ENOMEM;
1951                /*
1952                 * requeue_pi must wake as many tasks as it can, up to nr_wake
1953                 * + nr_requeue, since it acquires the rt_mutex prior to
1954                 * returning to userspace, so as to not leave the rt_mutex with
1955                 * waiters and no owner.  However, second and third wake-ups
1956                 * cannot be predicted as they involve race conditions with the
1957                 * first wake and a fault while looking up the pi_state.  Both
1958                 * pthread_cond_signal() and pthread_cond_broadcast() should
1959                 * use nr_wake=1.
1960                 */
1961                if (nr_wake != 1)
1962                        return -EINVAL;
1963        }
1964
1965retry:
1966        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
1967        if (unlikely(ret != 0))
1968                return ret;
1969        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1970                            requeue_pi ? FUTEX_WRITE : FUTEX_READ);
1971        if (unlikely(ret != 0))
1972                return ret;
1973
1974        /*
1975         * The check above which compares uaddrs is not sufficient for
1976         * shared futexes. We need to compare the keys:
1977         */
1978        if (requeue_pi && match_futex(&key1, &key2))
1979                return -EINVAL;
1980
1981        hb1 = hash_futex(&key1);
1982        hb2 = hash_futex(&key2);
1983
1984retry_private:
1985        hb_waiters_inc(hb2);
1986        double_lock_hb(hb1, hb2);
1987
1988        if (likely(cmpval != NULL)) {
1989                u32 curval;
1990
1991                ret = get_futex_value_locked(&curval, uaddr1);
1992
1993                if (unlikely(ret)) {
1994                        double_unlock_hb(hb1, hb2);
1995                        hb_waiters_dec(hb2);
1996
1997                        ret = get_user(curval, uaddr1);
1998                        if (ret)
1999                                return ret;
2000
2001                        if (!(flags & FLAGS_SHARED))
2002                                goto retry_private;
2003
2004                        goto retry;
2005                }
2006                if (curval != *cmpval) {
2007                        ret = -EAGAIN;
2008                        goto out_unlock;
2009                }
2010        }
2011
2012        if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
2013                struct task_struct *exiting = NULL;
2014
2015                /*
2016                 * Attempt to acquire uaddr2 and wake the top waiter. If we
2017                 * intend to requeue waiters, force setting the FUTEX_WAITERS
2018                 * bit.  We force this here where we are able to easily handle
2019                 * faults rather in the requeue loop below.
2020                 */
2021                ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
2022                                                 &key2, &pi_state,
2023                                                 &exiting, nr_requeue);
2024
2025                /*
2026                 * At this point the top_waiter has either taken uaddr2 or is
2027                 * waiting on it.  If the former, then the pi_state will not
2028                 * exist yet, look it up one more time to ensure we have a
2029                 * reference to it. If the lock was taken, ret contains the
2030                 * vpid of the top waiter task.
2031                 * If the lock was not taken, we have pi_state and an initial
2032                 * refcount on it. In case of an error we have nothing.
2033                 */
2034                if (ret > 0) {
2035                        WARN_ON(pi_state);
2036                        task_count++;
2037                        /*
2038                         * If we acquired the lock, then the user space value
2039                         * of uaddr2 should be vpid. It cannot be changed by
2040                         * the top waiter as it is blocked on hb2 lock if it
2041                         * tries to do so. If something fiddled with it behind
2042                         * our back the pi state lookup might unearth it. So
2043                         * we rather use the known value than rereading and
2044                         * handing potential crap to lookup_pi_state.
2045                         *
2046                         * If that call succeeds then we have pi_state and an
2047                         * initial refcount on it.
2048                         */
2049                        ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
2050                                              &pi_state, &exiting);
2051                }
2052
2053                switch (ret) {
2054                case 0:
2055                        /* We hold a reference on the pi state. */
2056                        break;
2057
2058                        /* If the above failed, then pi_state is NULL */
2059                case -EFAULT:
2060                        double_unlock_hb(hb1, hb2);
2061                        hb_waiters_dec(hb2);
2062                        ret = fault_in_user_writeable(uaddr2);
2063                        if (!ret)
2064                                goto retry;
2065                        return ret;
2066                case -EBUSY:
2067                case -EAGAIN:
2068                        /*
2069                         * Two reasons for this:
2070                         * - EBUSY: Owner is exiting and we just wait for the
2071                         *   exit to complete.
2072                         * - EAGAIN: The user space value changed.
2073                         */
2074                        double_unlock_hb(hb1, hb2);
2075                        hb_waiters_dec(hb2);
2076                        /*
2077                         * Handle the case where the owner is in the middle of
2078                         * exiting. Wait for the exit to complete otherwise
2079                         * this task might loop forever, aka. live lock.
2080                         */
2081                        wait_for_owner_exiting(ret, exiting);
2082                        cond_resched();
2083                        goto retry;
2084                default:
2085                        goto out_unlock;
2086                }
2087        }
2088
2089        plist_for_each_entry_safe(this, next, &hb1->chain, list) {
2090                if (task_count - nr_wake >= nr_requeue)
2091                        break;
2092
2093                if (!match_futex(&this->key, &key1))
2094                        continue;
2095
2096                /*
2097                 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
2098                 * be paired with each other and no other futex ops.
2099                 *
2100                 * We should never be requeueing a futex_q with a pi_state,
2101                 * which is awaiting a futex_unlock_pi().
2102                 */
2103                if ((requeue_pi && !this->rt_waiter) ||
2104                    (!requeue_pi && this->rt_waiter) ||
2105                    this->pi_state) {
2106                        ret = -EINVAL;
2107                        break;
2108                }
2109
2110                /*
2111                 * Wake nr_wake waiters.  For requeue_pi, if we acquired the
2112                 * lock, we already woke the top_waiter.  If not, it will be
2113                 * woken by futex_unlock_pi().
2114                 */
2115                if (++task_count <= nr_wake && !requeue_pi) {
2116                        mark_wake_futex(&wake_q, this);
2117                        continue;
2118                }
2119
2120                /* Ensure we requeue to the expected futex for requeue_pi. */
2121                if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
2122                        ret = -EINVAL;
2123                        break;
2124                }
2125
2126                /*
2127                 * Requeue nr_requeue waiters and possibly one more in the case
2128                 * of requeue_pi if we couldn't acquire the lock atomically.
2129                 */
2130                if (requeue_pi) {
2131                        /*
2132                         * Prepare the waiter to take the rt_mutex. Take a
2133                         * refcount on the pi_state and store the pointer in
2134                         * the futex_q object of the waiter.
2135                         */
2136                        get_pi_state(pi_state);
2137                        this->pi_state = pi_state;
2138                        ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
2139                                                        this->rt_waiter,
2140                                                        this->task);
2141                        if (ret == 1) {
2142                                /*
2143                                 * We got the lock. We do neither drop the
2144                                 * refcount on pi_state nor clear
2145                                 * this->pi_state because the waiter needs the
2146                                 * pi_state for cleaning up the user space
2147                                 * value. It will drop the refcount after
2148                                 * doing so.
2149                                 */
2150                                requeue_pi_wake_futex(this, &key2, hb2);
2151                                continue;
2152                        } else if (ret) {
2153                                /*
2154                                 * rt_mutex_start_proxy_lock() detected a
2155                                 * potential deadlock when we tried to queue
2156                                 * that waiter. Drop the pi_state reference
2157                                 * which we took above and remove the pointer
2158                                 * to the state from the waiters futex_q
2159                                 * object.
2160                                 */
2161                                this->pi_state = NULL;
2162                                put_pi_state(pi_state);
2163                                /*
2164                                 * We stop queueing more waiters and let user
2165                                 * space deal with the mess.
2166                                 */
2167                                break;
2168                        }
2169                }
2170                requeue_futex(this, hb1, hb2, &key2);
2171        }
2172
2173        /*
2174         * We took an extra initial reference to the pi_state either
2175         * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
2176         * need to drop it here again.
2177         */
2178        put_pi_state(pi_state);
2179
2180out_unlock:
2181        double_unlock_hb(hb1, hb2);
2182        wake_up_q(&wake_q);
2183        hb_waiters_dec(hb2);
2184        return ret ? ret : task_count;
2185}
2186
2187/* The key must be already stored in q->key. */
2188static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
2189        __acquires(&hb->lock)
2190{
2191        struct futex_hash_bucket *hb;
2192
2193        hb = hash_futex(&q->key);
2194
2195        /*
2196         * Increment the counter before taking the lock so that
2197         * a potential waker won't miss a to-be-slept task that is
2198         * waiting for the spinlock. This is safe as all queue_lock()
2199         * users end up calling queue_me(). Similarly, for housekeeping,
2200         * decrement the counter at queue_unlock() when some error has
2201         * occurred and we don't end up adding the task to the list.
2202         */
2203        hb_waiters_inc(hb); /* implies smp_mb(); (A) */
2204
2205        q->lock_ptr = &hb->lock;
2206
2207        spin_lock(&hb->lock);
2208        return hb;
2209}
2210
2211static inline void
2212queue_unlock(struct futex_hash_bucket *hb)
2213        __releases(&hb->lock)
2214{
2215        spin_unlock(&hb->lock);
2216        hb_waiters_dec(hb);
2217}
2218
2219static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2220{
2221        int prio;
2222
2223        /*
2224         * The priority used to register this element is
2225         * - either the real thread-priority for the real-time threads
2226         * (i.e. threads with a priority lower than MAX_RT_PRIO)
2227         * - or MAX_RT_PRIO for non-RT threads.
2228         * Thus, all RT-threads are woken first in priority order, and
2229         * the others are woken last, in FIFO order.
2230         */
2231        prio = min(current->normal_prio, MAX_RT_PRIO);
2232
2233        plist_node_init(&q->list, prio);
2234        plist_add(&q->list, &hb->chain);
2235        q->task = current;
2236}
2237
2238/**
2239 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
2240 * @q:  The futex_q to enqueue
2241 * @hb: The destination hash bucket
2242 *
2243 * The hb->lock must be held by the caller, and is released here. A call to
2244 * queue_me() is typically paired with exactly one call to unqueue_me().  The
2245 * exceptions involve the PI related operations, which may use unqueue_me_pi()
2246 * or nothing if the unqueue is done as part of the wake process and the unqueue
2247 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
2248 * an example).
2249 */
2250static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2251        __releases(&hb->lock)
2252{
2253        __queue_me(q, hb);
2254        spin_unlock(&hb->lock);
2255}
2256
2257/**
2258 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
2259 * @q:  The futex_q to unqueue
2260 *
2261 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
2262 * be paired with exactly one earlier call to queue_me().
2263 *
2264 * Return:
2265 *  - 1 - if the futex_q was still queued (and we removed unqueued it);
2266 *  - 0 - if the futex_q was already removed by the waking thread
2267 */
2268static int unqueue_me(struct futex_q *q)
2269{
2270        spinlock_t *lock_ptr;
2271        int ret = 0;
2272
2273        /* In the common case we don't take the spinlock, which is nice. */
2274retry:
2275        /*
2276         * q->lock_ptr can change between this read and the following spin_lock.
2277         * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
2278         * optimizing lock_ptr out of the logic below.
2279         */
2280        lock_ptr = READ_ONCE(q->lock_ptr);
2281        if (lock_ptr != NULL) {
2282                spin_lock(lock_ptr);
2283                /*
2284                 * q->lock_ptr can change between reading it and
2285                 * spin_lock(), causing us to take the wrong lock.  This
2286                 * corrects the race condition.
2287                 *
2288                 * Reasoning goes like this: if we have the wrong lock,
2289                 * q->lock_ptr must have changed (maybe several times)
2290                 * between reading it and the spin_lock().  It can
2291                 * change again after the spin_lock() but only if it was
2292                 * already changed before the spin_lock().  It cannot,
2293                 * however, change back to the original value.  Therefore
2294                 * we can detect whether we acquired the correct lock.
2295                 */
2296                if (unlikely(lock_ptr != q->lock_ptr)) {
2297                        spin_unlock(lock_ptr);
2298                        goto retry;
2299                }
2300                __unqueue_futex(q);
2301
2302                BUG_ON(q->pi_state);
2303
2304                spin_unlock(lock_ptr);
2305                ret = 1;
2306        }
2307
2308        return ret;
2309}
2310
2311/*
2312 * PI futexes can not be requeued and must remove themself from the
2313 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
2314 * and dropped here.
2315 */
2316static void unqueue_me_pi(struct futex_q *q)
2317        __releases(q->lock_ptr)
2318{
2319        __unqueue_futex(q);
2320
2321        BUG_ON(!q->pi_state);
2322        put_pi_state(q->pi_state);
2323        q->pi_state = NULL;
2324
2325        spin_unlock(q->lock_ptr);
2326}
2327
2328static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2329                                struct task_struct *argowner)
2330{
2331        struct futex_pi_state *pi_state = q->pi_state;
2332        u32 uval, curval, newval;
2333        struct task_struct *oldowner, *newowner;
2334        u32 newtid;
2335        int ret, err = 0;
2336
2337        lockdep_assert_held(q->lock_ptr);
2338
2339        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2340
2341        oldowner = pi_state->owner;
2342
2343        /*
2344         * We are here because either:
2345         *
2346         *  - we stole the lock and pi_state->owner needs updating to reflect
2347         *    that (@argowner == current),
2348         *
2349         * or:
2350         *
2351         *  - someone stole our lock and we need to fix things to point to the
2352         *    new owner (@argowner == NULL).
2353         *
2354         * Either way, we have to replace the TID in the user space variable.
2355         * This must be atomic as we have to preserve the owner died bit here.
2356         *
2357         * Note: We write the user space value _before_ changing the pi_state
2358         * because we can fault here. Imagine swapped out pages or a fork
2359         * that marked all the anonymous memory readonly for cow.
2360         *
2361         * Modifying pi_state _before_ the user space value would leave the
2362         * pi_state in an inconsistent state when we fault here, because we
2363         * need to drop the locks to handle the fault. This might be observed
2364         * in the PID check in lookup_pi_state.
2365         */
2366retry:
2367        if (!argowner) {
2368                if (oldowner != current) {
2369                        /*
2370                         * We raced against a concurrent self; things are
2371                         * already fixed up. Nothing to do.
2372                         */
2373                        ret = 0;
2374                        goto out_unlock;
2375                }
2376
2377                if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
2378                        /* We got the lock after all, nothing to fix. */
2379                        ret = 0;
2380                        goto out_unlock;
2381                }
2382
2383                /*
2384                 * The trylock just failed, so either there is an owner or
2385                 * there is a higher priority waiter than this one.
2386                 */
2387                newowner = rt_mutex_owner(&pi_state->pi_mutex);
2388                /*
2389                 * If the higher priority waiter has not yet taken over the
2390                 * rtmutex then newowner is NULL. We can't return here with
2391                 * that state because it's inconsistent vs. the user space
2392                 * state. So drop the locks and try again. It's a valid
2393                 * situation and not any different from the other retry
2394                 * conditions.
2395                 */
2396                if (unlikely(!newowner)) {
2397                        err = -EAGAIN;
2398                        goto handle_err;
2399                }
2400        } else {
2401                WARN_ON_ONCE(argowner != current);
2402                if (oldowner == current) {
2403                        /*
2404                         * We raced against a concurrent self; things are
2405                         * already fixed up. Nothing to do.
2406                         */
2407                        ret = 0;
2408                        goto out_unlock;
2409                }
2410                newowner = argowner;
2411        }
2412
2413        newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2414        /* Owner died? */
2415        if (!pi_state->owner)
2416                newtid |= FUTEX_OWNER_DIED;
2417
2418        err = get_futex_value_locked(&uval, uaddr);
2419        if (err)
2420                goto handle_err;
2421
2422        for (;;) {
2423                newval = (uval & FUTEX_OWNER_DIED) | newtid;
2424
2425                err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
2426                if (err)
2427                        goto handle_err;
2428
2429                if (curval == uval)
2430                        break;
2431                uval = curval;
2432        }
2433
2434        /*
2435         * We fixed up user space. Now we need to fix the pi_state
2436         * itself.
2437         */
2438        if (pi_state->owner != NULL) {
2439                raw_spin_lock(&pi_state->owner->pi_lock);
2440                WARN_ON(list_empty(&pi_state->list));
2441                list_del_init(&pi_state->list);
2442                raw_spin_unlock(&pi_state->owner->pi_lock);
2443        }
2444
2445        pi_state->owner = newowner;
2446
2447        raw_spin_lock(&newowner->pi_lock);
2448        WARN_ON(!list_empty(&pi_state->list));
2449        list_add(&pi_state->list, &newowner->pi_state_list);
2450        raw_spin_unlock(&newowner->pi_lock);
2451        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2452
2453        return 0;
2454
2455        /*
2456         * In order to reschedule or handle a page fault, we need to drop the
2457         * locks here. In the case of a fault, this gives the other task
2458         * (either the highest priority waiter itself or the task which stole
2459         * the rtmutex) the chance to try the fixup of the pi_state. So once we
2460         * are back from handling the fault we need to check the pi_state after
2461         * reacquiring the locks and before trying to do another fixup. When
2462         * the fixup has been done already we simply return.
2463         *
2464         * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
2465         * drop hb->lock since the caller owns the hb -> futex_q relation.
2466         * Dropping the pi_mutex->wait_lock requires the state revalidate.
2467         */
2468handle_err:
2469        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2470        spin_unlock(q->lock_ptr);
2471
2472        switch (err) {
2473        case -EFAULT:
2474                ret = fault_in_user_writeable(uaddr);
2475                break;
2476
2477        case -EAGAIN:
2478                cond_resched();
2479                ret = 0;
2480                break;
2481
2482        default:
2483                WARN_ON_ONCE(1);
2484                ret = err;
2485                break;
2486        }
2487
2488        spin_lock(q->lock_ptr);
2489        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2490
2491        /*
2492         * Check if someone else fixed it for us:
2493         */
2494        if (pi_state->owner != oldowner) {
2495                ret = 0;
2496                goto out_unlock;
2497        }
2498
2499        if (ret)
2500                goto out_unlock;
2501
2502        goto retry;
2503
2504out_unlock:
2505        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2506        return ret;
2507}
2508
2509static long futex_wait_restart(struct restart_block *restart);
2510
2511/**
2512 * fixup_owner() - Post lock pi_state and corner case management
2513 * @uaddr:      user address of the futex
2514 * @q:          futex_q (contains pi_state and access to the rt_mutex)
2515 * @locked:     if the attempt to take the rt_mutex succeeded (1) or not (0)
2516 *
2517 * After attempting to lock an rt_mutex, this function is called to cleanup
2518 * the pi_state owner as well as handle race conditions that may allow us to
2519 * acquire the lock. Must be called with the hb lock held.
2520 *
2521 * Return:
2522 *  -  1 - success, lock taken;
2523 *  -  0 - success, lock not taken;
2524 *  - <0 - on error (-EFAULT)
2525 */
2526static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2527{
2528        int ret = 0;
2529
2530        if (locked) {
2531                /*
2532                 * Got the lock. We might not be the anticipated owner if we
2533                 * did a lock-steal - fix up the PI-state in that case:
2534                 *
2535                 * Speculative pi_state->owner read (we don't hold wait_lock);
2536                 * since we own the lock pi_state->owner == current is the
2537                 * stable state, anything else needs more attention.
2538                 */
2539                if (q->pi_state->owner != current)
2540                        ret = fixup_pi_state_owner(uaddr, q, current);
2541                return ret ? ret : locked;
2542        }
2543
2544        /*
2545         * If we didn't get the lock; check if anybody stole it from us. In
2546         * that case, we need to fix up the uval to point to them instead of
2547         * us, otherwise bad things happen. [10]
2548         *
2549         * Another speculative read; pi_state->owner == current is unstable
2550         * but needs our attention.
2551         */
2552        if (q->pi_state->owner == current) {
2553                ret = fixup_pi_state_owner(uaddr, q, NULL);
2554                return ret;
2555        }
2556
2557        /*
2558         * Paranoia check. If we did not take the lock, then we should not be
2559         * the owner of the rt_mutex.
2560         */
2561        if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
2562                printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
2563                                "pi-state %p\n", ret,
2564                                q->pi_state->pi_mutex.owner,
2565                                q->pi_state->owner);
2566        }
2567
2568        return ret;
2569}
2570
2571/**
2572 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
2573 * @hb:         the futex hash bucket, must be locked by the caller
2574 * @q:          the futex_q to queue up on
2575 * @timeout:    the prepared hrtimer_sleeper, or null for no timeout
2576 */
2577static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
2578                                struct hrtimer_sleeper *timeout)
2579{
2580        /*
2581         * The task state is guaranteed to be set before another task can
2582         * wake it. set_current_state() is implemented using smp_store_mb() and
2583         * queue_me() calls spin_unlock() upon completion, both serializing
2584         * access to the hash list and forcing another memory barrier.
2585         */
2586        set_current_state(TASK_INTERRUPTIBLE);
2587        queue_me(q, hb);
2588
2589        /* Arm the timer */
2590        if (timeout)
2591                hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
2592
2593        /*
2594         * If we have been removed from the hash list, then another task
2595         * has tried to wake us, and we can skip the call to schedule().
2596         */
2597        if (likely(!plist_node_empty(&q->list))) {
2598                /*
2599                 * If the timer has already expired, current will already be
2600                 * flagged for rescheduling. Only call schedule if there
2601                 * is no timeout, or if it has yet to expire.
2602                 */
2603                if (!timeout || timeout->task)
2604                        freezable_schedule();
2605        }
2606        __set_current_state(TASK_RUNNING);
2607}
2608
2609/**
2610 * futex_wait_setup() - Prepare to wait on a futex
2611 * @uaddr:      the futex userspace address
2612 * @val:        the expected value
2613 * @flags:      futex flags (FLAGS_SHARED, etc.)
2614 * @q:          the associated futex_q
2615 * @hb:         storage for hash_bucket pointer to be returned to caller
2616 *
2617 * Setup the futex_q and locate the hash_bucket.  Get the futex value and
2618 * compare it with the expected value.  Handle atomic faults internally.
2619 * Return with the hb lock held and a q.key reference on success, and unlocked
2620 * with no q.key reference on failure.
2621 *
2622 * Return:
2623 *  -  0 - uaddr contains val and hb has been locked;
2624 *  - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
2625 */
2626static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
2627                           struct futex_q *q, struct futex_hash_bucket **hb)
2628{
2629        u32 uval;
2630        int ret;
2631
2632        /*
2633         * Access the page AFTER the hash-bucket is locked.
2634         * Order is important:
2635         *
2636         *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
2637         *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
2638         *
2639         * The basic logical guarantee of a futex is that it blocks ONLY
2640         * if cond(var) is known to be true at the time of blocking, for
2641         * any cond.  If we locked the hash-bucket after testing *uaddr, that
2642         * would open a race condition where we could block indefinitely with
2643         * cond(var) false, which would violate the guarantee.
2644         *
2645         * On the other hand, we insert q and release the hash-bucket only
2646         * after testing *uaddr.  This guarantees that futex_wait() will NOT
2647         * absorb a wakeup if *uaddr does not match the desired values
2648         * while the syscall executes.
2649         */
2650retry:
2651        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
2652        if (unlikely(ret != 0))
2653                return ret;
2654
2655retry_private:
2656        *hb = queue_lock(q);
2657
2658        ret = get_futex_value_locked(&uval, uaddr);
2659
2660        if (ret) {
2661                queue_unlock(*hb);
2662
2663                ret = get_user(uval, uaddr);
2664                if (ret)
2665                        return ret;
2666
2667                if (!(flags & FLAGS_SHARED))
2668                        goto retry_private;
2669
2670                goto retry;
2671        }
2672
2673        if (uval != val) {
2674                queue_unlock(*hb);
2675                ret = -EWOULDBLOCK;
2676        }
2677
2678        return ret;
2679}
2680
2681static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2682                      ktime_t *abs_time, u32 bitset)
2683{
2684        struct hrtimer_sleeper timeout, *to;
2685        struct restart_block *restart;
2686        struct futex_hash_bucket *hb;
2687        struct futex_q q = futex_q_init;
2688        int ret;
2689
2690        if (!bitset)
2691                return -EINVAL;
2692        q.bitset = bitset;
2693
2694        to = futex_setup_timer(abs_time, &timeout, flags,
2695                               current->timer_slack_ns);
2696retry:
2697        /*
2698         * Prepare to wait on uaddr. On success, holds hb lock and increments
2699         * q.key refs.
2700         */
2701        ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2702        if (ret)
2703                goto out;
2704
2705        /* queue_me and wait for wakeup, timeout, or a signal. */
2706        futex_wait_queue_me(hb, &q, to);
2707
2708        /* If we were woken (and unqueued), we succeeded, whatever. */
2709        ret = 0;
2710        /* unqueue_me() drops q.key ref */
2711        if (!unqueue_me(&q))
2712                goto out;
2713        ret = -ETIMEDOUT;
2714        if (to && !to->task)
2715                goto out;
2716
2717        /*
2718         * We expect signal_pending(current), but we might be the
2719         * victim of a spurious wakeup as well.
2720         */
2721        if (!signal_pending(current))
2722                goto retry;
2723
2724        ret = -ERESTARTSYS;
2725        if (!abs_time)
2726                goto out;
2727
2728        restart = &current->restart_block;
2729        restart->fn = futex_wait_restart;
2730        restart->futex.uaddr = uaddr;
2731        restart->futex.val = val;
2732        restart->futex.time = *abs_time;
2733        restart->futex.bitset = bitset;
2734        restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
2735
2736        ret = -ERESTART_RESTARTBLOCK;
2737
2738out:
2739        if (to) {
2740                hrtimer_cancel(&to->timer);
2741                destroy_hrtimer_on_stack(&to->timer);
2742        }
2743        return ret;
2744}
2745
2746
2747static long futex_wait_restart(struct restart_block *restart)
2748{
2749        u32 __user *uaddr = restart->futex.uaddr;
2750        ktime_t t, *tp = NULL;
2751
2752        if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
2753                t = restart->futex.time;
2754                tp = &t;
2755        }
2756        restart->fn = do_no_restart_syscall;
2757
2758        return (long)futex_wait(uaddr, restart->futex.flags,
2759                                restart->futex.val, tp, restart->futex.bitset);
2760}
2761
2762
2763/*
2764 * Userspace tried a 0 -> TID atomic transition of the futex value
2765 * and failed. The kernel side here does the whole locking operation:
2766 * if there are waiters then it will block as a consequence of relying
2767 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
2768 * a 0 value of the futex too.).
2769 *
2770 * Also serves as futex trylock_pi()'ing, and due semantics.
2771 */
2772static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2773                         ktime_t *time, int trylock)
2774{
2775        struct hrtimer_sleeper timeout, *to;
2776        struct futex_pi_state *pi_state = NULL;
2777        struct task_struct *exiting = NULL;
2778        struct rt_mutex_waiter rt_waiter;
2779        struct futex_hash_bucket *hb;
2780        struct futex_q q = futex_q_init;
2781        int res, ret;
2782
2783        if (!IS_ENABLED(CONFIG_FUTEX_PI))
2784                return -ENOSYS;
2785
2786        if (refill_pi_state_cache())
2787                return -ENOMEM;
2788
2789        to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);
2790
2791retry:
2792        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
2793        if (unlikely(ret != 0))
2794                goto out;
2795
2796retry_private:
2797        hb = queue_lock(&q);
2798
2799        ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
2800                                   &exiting, 0);
2801        if (unlikely(ret)) {
2802                /*
2803                 * Atomic work succeeded and we got the lock,
2804                 * or failed. Either way, we do _not_ block.
2805                 */
2806                switch (ret) {
2807                case 1:
2808                        /* We got the lock. */
2809                        ret = 0;
2810                        goto out_unlock_put_key;
2811                case -EFAULT:
2812                        goto uaddr_faulted;
2813                case -EBUSY:
2814                case -EAGAIN:
2815                        /*
2816                         * Two reasons for this:
2817                         * - EBUSY: Task is exiting and we just wait for the
2818                         *   exit to complete.
2819                         * - EAGAIN: The user space value changed.
2820                         */
2821                        queue_unlock(hb);
2822                        /*
2823                         * Handle the case where the owner is in the middle of
2824                         * exiting. Wait for the exit to complete otherwise
2825                         * this task might loop forever, aka. live lock.
2826                         */
2827                        wait_for_owner_exiting(ret, exiting);
2828                        cond_resched();
2829                        goto retry;
2830                default:
2831                        goto out_unlock_put_key;
2832                }
2833        }
2834
2835        WARN_ON(!q.pi_state);
2836
2837        /*
2838         * Only actually queue now that the atomic ops are done:
2839         */
2840        __queue_me(&q, hb);
2841
2842        if (trylock) {
2843                ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
2844                /* Fixup the trylock return value: */
2845                ret = ret ? 0 : -EWOULDBLOCK;
2846                goto no_block;
2847        }
2848
2849        rt_mutex_init_waiter(&rt_waiter);
2850
2851        /*
2852         * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
2853         * hold it while doing rt_mutex_start_proxy(), because then it will
2854         * include hb->lock in the blocking chain, even through we'll not in
2855         * fact hold it while blocking. This will lead it to report -EDEADLK
2856         * and BUG when futex_unlock_pi() interleaves with this.
2857         *
2858         * Therefore acquire wait_lock while holding hb->lock, but drop the
2859         * latter before calling __rt_mutex_start_proxy_lock(). This
2860         * interleaves with futex_unlock_pi() -- which does a similar lock
2861         * handoff -- such that the latter can observe the futex_q::pi_state
2862         * before __rt_mutex_start_proxy_lock() is done.
2863         */
2864        raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
2865        spin_unlock(q.lock_ptr);
2866        /*
2867         * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
2868         * such that futex_unlock_pi() is guaranteed to observe the waiter when
2869         * it sees the futex_q::pi_state.
2870         */
2871        ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
2872        raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
2873
2874        if (ret) {
2875                if (ret == 1)
2876                        ret = 0;
2877                goto cleanup;
2878        }
2879
2880        if (unlikely(to))
2881                hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
2882
2883        ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
2884
2885cleanup:
2886        spin_lock(q.lock_ptr);
2887        /*
2888         * If we failed to acquire the lock (deadlock/signal/timeout), we must
2889         * first acquire the hb->lock before removing the lock from the
2890         * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
2891         * lists consistent.
2892         *
2893         * In particular; it is important that futex_unlock_pi() can not
2894         * observe this inconsistency.
2895         */
2896        if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
2897                ret = 0;
2898
2899no_block:
2900        /*
2901         * Fixup the pi_state owner and possibly acquire the lock if we
2902         * haven't already.
2903         */
2904        res = fixup_owner(uaddr, &q, !ret);
2905        /*
2906         * If fixup_owner() returned an error, proprogate that.  If it acquired
2907         * the lock, clear our -ETIMEDOUT or -EINTR.
2908         */
2909        if (res)
2910                ret = (res < 0) ? res : 0;
2911
2912        /*
2913         * If fixup_owner() faulted and was unable to handle the fault, unlock
2914         * it and return the fault to userspace.
2915         */
2916        if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
2917                pi_state = q.pi_state;
2918                get_pi_state(pi_state);
2919        }
2920
2921        /* Unqueue and drop the lock */
2922        unqueue_me_pi(&q);
2923
2924        if (pi_state) {
2925                rt_mutex_futex_unlock(&pi_state->pi_mutex);
2926                put_pi_state(pi_state);
2927        }
2928
2929        goto out;
2930
2931out_unlock_put_key:
2932        queue_unlock(hb);
2933
2934out:
2935        if (to) {
2936                hrtimer_cancel(&to->timer);
2937                destroy_hrtimer_on_stack(&to->timer);
2938        }
2939        return ret != -EINTR ? ret : -ERESTARTNOINTR;
2940
2941uaddr_faulted:
2942        queue_unlock(hb);
2943
2944        ret = fault_in_user_writeable(uaddr);
2945        if (ret)
2946                goto out;
2947
2948        if (!(flags & FLAGS_SHARED))
2949                goto retry_private;
2950
2951        goto retry;
2952}
2953
2954/*
2955 * Userspace attempted a TID -> 0 atomic transition, and failed.
2956 * This is the in-kernel slowpath: we look up the PI state (if any),
2957 * and do the rt-mutex unlock.
2958 */
2959static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2960{
2961        u32 curval, uval, vpid = task_pid_vnr(current);
2962        union futex_key key = FUTEX_KEY_INIT;
2963        struct futex_hash_bucket *hb;
2964        struct futex_q *top_waiter;
2965        int ret;
2966
2967        if (!IS_ENABLED(CONFIG_FUTEX_PI))
2968                return -ENOSYS;
2969
2970retry:
2971        if (get_user(uval, uaddr))
2972                return -EFAULT;
2973        /*
2974         * We release only a lock we actually own:
2975         */
2976        if ((uval & FUTEX_TID_MASK) != vpid)
2977                return -EPERM;
2978
2979        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
2980        if (ret)
2981                return ret;
2982
2983        hb = hash_futex(&key);
2984        spin_lock(&hb->lock);
2985
2986        /*
2987         * Check waiters first. We do not trust user space values at
2988         * all and we at least want to know if user space fiddled
2989         * with the futex value instead of blindly unlocking.
2990         */
2991        top_waiter = futex_top_waiter(hb, &key);
2992        if (top_waiter) {
2993                struct futex_pi_state *pi_state = top_waiter->pi_state;
2994
2995                ret = -EINVAL;
2996                if (!pi_state)
2997                        goto out_unlock;
2998
2999                /*
3000                 * If current does not own the pi_state then the futex is
3001                 * inconsistent and user space fiddled with the futex value.
3002                 */
3003                if (pi_state->owner != current)
3004                        goto out_unlock;
3005
3006                get_pi_state(pi_state);
3007                /*
3008                 * By taking wait_lock while still holding hb->lock, we ensure
3009                 * there is no point where we hold neither; and therefore
3010                 * wake_futex_pi() must observe a state consistent with what we
3011                 * observed.
3012                 *
3013                 * In particular; this forces __rt_mutex_start_proxy() to
3014                 * complete such that we're guaranteed to observe the
3015                 * rt_waiter. Also see the WARN in wake_futex_pi().
3016                 */
3017                raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
3018                spin_unlock(&hb->lock);
3019
3020                /* drops pi_state->pi_mutex.wait_lock */
3021                ret = wake_futex_pi(uaddr, uval, pi_state);
3022
3023                put_pi_state(pi_state);
3024
3025                /*
3026                 * Success, we're done! No tricky corner cases.
3027                 */
3028                if (!ret)
3029                        goto out_putkey;
3030                /*
3031                 * The atomic access to the futex value generated a
3032                 * pagefault, so retry the user-access and the wakeup:
3033                 */
3034                if (ret == -EFAULT)
3035                        goto pi_faulted;
3036                /*
3037                 * A unconditional UNLOCK_PI op raced against a waiter
3038                 * setting the FUTEX_WAITERS bit. Try again.
3039                 */
3040                if (ret == -EAGAIN)
3041                        goto pi_retry;
3042                /*
3043                 * wake_futex_pi has detected invalid state. Tell user
3044                 * space.
3045                 */
3046                goto out_putkey;
3047        }
3048
3049        /*
3050         * We have no kernel internal state, i.e. no waiters in the
3051         * kernel. Waiters which are about to queue themselves are stuck
3052         * on hb->lock. So we can safely ignore them. We do neither
3053         * preserve the WAITERS bit not the OWNER_DIED one. We are the
3054         * owner.
3055         */
3056        if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
3057                spin_unlock(&hb->lock);
3058                switch (ret) {
3059                case -EFAULT:
3060                        goto pi_faulted;
3061
3062                case -EAGAIN:
3063                        goto pi_retry;
3064
3065                default:
3066                        WARN_ON_ONCE(1);
3067                        goto out_putkey;
3068                }
3069        }
3070
3071        /*
3072         * If uval has changed, let user space handle it.
3073         */
3074        ret = (curval == uval) ? 0 : -EAGAIN;
3075
3076out_unlock:
3077        spin_unlock(&hb->lock);
3078out_putkey:
3079        return ret;
3080
3081pi_retry:
3082        cond_resched();
3083        goto retry;
3084
3085pi_faulted:
3086
3087        ret = fault_in_user_writeable(uaddr);
3088        if (!ret)
3089                goto retry;
3090
3091        return ret;
3092}
3093
3094/**
3095 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
3096 * @hb:         the hash_bucket futex_q was original enqueued on
3097 * @q:          the futex_q woken while waiting to be requeued
3098 * @key2:       the futex_key of the requeue target futex
3099 * @timeout:    the timeout associated with the wait (NULL if none)
3100 *
3101 * Detect if the task was woken on the initial futex as opposed to the requeue
3102 * target futex.  If so, determine if it was a timeout or a signal that caused
3103 * the wakeup and return the appropriate error code to the caller.  Must be
3104 * called with the hb lock held.
3105 *
3106 * Return:
3107 *  -  0 = no early wakeup detected;
3108 *  - <0 = -ETIMEDOUT or -ERESTARTNOINTR
3109 */
3110static inline
3111int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
3112                                   struct futex_q *q, union futex_key *key2,
3113                                   struct hrtimer_sleeper *timeout)
3114{
3115        int ret = 0;
3116
3117        /*
3118         * With the hb lock held, we avoid races while we process the wakeup.
3119         * We only need to hold hb (and not hb2) to ensure atomicity as the
3120         * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
3121         * It can't be requeued from uaddr2 to something else since we don't
3122         * support a PI aware source futex for requeue.
3123         */
3124        if (!match_futex(&q->key, key2)) {
3125                WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
3126                /*
3127                 * We were woken prior to requeue by a timeout or a signal.
3128                 * Unqueue the futex_q and determine which it was.
3129                 */
3130                plist_del(&q->list, &hb->chain);
3131                hb_waiters_dec(hb);
3132
3133                /* Handle spurious wakeups gracefully */
3134                ret = -EWOULDBLOCK;
3135                if (timeout && !timeout->task)
3136                        ret = -ETIMEDOUT;
3137                else if (signal_pending(current))
3138                        ret = -ERESTARTNOINTR;
3139        }
3140        return ret;
3141}
3142
3143/**
3144 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
3145 * @uaddr:      the futex we initially wait on (non-pi)
3146 * @flags:      futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
3147 *              the same type, no requeueing from private to shared, etc.
3148 * @val:        the expected value of uaddr
3149 * @abs_time:   absolute timeout
3150 * @bitset:     32 bit wakeup bitset set by userspace, defaults to all
3151 * @uaddr2:     the pi futex we will take prior to returning to user-space
3152 *
3153 * The caller will wait on uaddr and will be requeued by futex_requeue() to
3154 * uaddr2 which must be PI aware and unique from uaddr.  Normal wakeup will wake
3155 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
3156 * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
3157 * without one, the pi logic would not know which task to boost/deboost, if
3158 * there was a need to.
3159 *
3160 * We call schedule in futex_wait_queue_me() when we enqueue and return there
3161 * via the following--
3162 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
3163 * 2) wakeup on uaddr2 after a requeue
3164 * 3) signal
3165 * 4) timeout
3166 *
3167 * If 3, cleanup and return -ERESTARTNOINTR.
3168 *
3169 * If 2, we may then block on trying to take the rt_mutex and return via:
3170 * 5) successful lock
3171 * 6) signal
3172 * 7) timeout
3173 * 8) other lock acquisition failure
3174 *
3175 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
3176 *
3177 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
3178 *
3179 * Return:
3180 *  -  0 - On success;
3181 *  - <0 - On error
3182 */
3183static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
3184                                 u32 val, ktime_t *abs_time, u32 bitset,
3185                                 u32 __user *uaddr2)
3186{
3187        struct hrtimer_sleeper timeout, *to;
3188        struct futex_pi_state *pi_state = NULL;
3189        struct rt_mutex_waiter rt_waiter;
3190        struct futex_hash_bucket *hb;
3191        union futex_key key2 = FUTEX_KEY_INIT;
3192        struct futex_q q = futex_q_init;
3193        int res, ret;
3194
3195        if (!IS_ENABLED(CONFIG_FUTEX_PI))
3196                return -ENOSYS;
3197
3198        if (uaddr == uaddr2)
3199                return -EINVAL;
3200
3201        if (!bitset)
3202                return -EINVAL;
3203
3204        to = futex_setup_timer(abs_time, &timeout, flags,
3205                               current->timer_slack_ns);
3206
3207        /*
3208         * The waiter is allocated on our stack, manipulated by the requeue
3209         * code while we sleep on uaddr.
3210         */
3211        rt_mutex_init_waiter(&rt_waiter);
3212
3213        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
3214        if (unlikely(ret != 0))
3215                goto out;
3216
3217        q.bitset = bitset;
3218        q.rt_waiter = &rt_waiter;
3219        q.requeue_pi_key = &key2;
3220
3221        /*
3222         * Prepare to wait on uaddr. On success, increments q.key (key1) ref
3223         * count.
3224         */
3225        ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
3226        if (ret)
3227                goto out;
3228
3229        /*
3230         * The check above which compares uaddrs is not sufficient for
3231         * shared futexes. We need to compare the keys:
3232         */
3233        if (match_futex(&q.key, &key2)) {
3234                queue_unlock(hb);
3235                ret = -EINVAL;
3236                goto out;
3237        }
3238
3239        /* Queue the futex_q, drop the hb lock, wait for wakeup. */
3240        futex_wait_queue_me(hb, &q, to);
3241
3242        spin_lock(&hb->lock);
3243        ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
3244        spin_unlock(&hb->lock);
3245        if (ret)
3246                goto out;
3247
3248        /*
3249         * In order for us to be here, we know our q.key == key2, and since
3250         * we took the hb->lock above, we also know that futex_requeue() has
3251         * completed and we no longer have to concern ourselves with a wakeup
3252         * race with the atomic proxy lock acquisition by the requeue code. The
3253         * futex_requeue dropped our key1 reference and incremented our key2
3254         * reference count.
3255         */
3256
3257        /* Check if the requeue code acquired the second futex for us. */
3258        if (!q.rt_waiter) {
3259                /*
3260                 * Got the lock. We might not be the anticipated owner if we
3261                 * did a lock-steal - fix up the PI-state in that case.
3262                 */
3263                if (q.pi_state && (q.pi_state->owner != current)) {
3264                        spin_lock(q.lock_ptr);
3265                        ret = fixup_pi_state_owner(uaddr2, &q, current);
3266                        if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
3267                                pi_state = q.pi_state;
3268                                get_pi_state(pi_state);
3269                        }
3270                        /*
3271                         * Drop the reference to the pi state which
3272                         * the requeue_pi() code acquired for us.
3273                         */
3274                        put_pi_state(q.pi_state);
3275                        spin_unlock(q.lock_ptr);
3276                }
3277        } else {
3278                struct rt_mutex *pi_mutex;
3279
3280                /*
3281                 * We have been woken up by futex_unlock_pi(), a timeout, or a
3282                 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
3283                 * the pi_state.
3284                 */
3285                WARN_ON(!q.pi_state);
3286                pi_mutex = &q.pi_state->pi_mutex;
3287                ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
3288
3289                spin_lock(q.lock_ptr);
3290                if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
3291                        ret = 0;
3292
3293                debug_rt_mutex_free_waiter(&rt_waiter);
3294                /*
3295                 * Fixup the pi_state owner and possibly acquire the lock if we
3296                 * haven't already.
3297                 */
3298                res = fixup_owner(uaddr2, &q, !ret);
3299                /*
3300                 * If fixup_owner() returned an error, proprogate that.  If it
3301                 * acquired the lock, clear -ETIMEDOUT or -EINTR.
3302                 */
3303                if (res)
3304                        ret = (res < 0) ? res : 0;
3305
3306                /*
3307                 * If fixup_pi_state_owner() faulted and was unable to handle
3308                 * the fault, unlock the rt_mutex and return the fault to
3309                 * userspace.
3310                 */
3311                if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
3312                        pi_state = q.pi_state;
3313                        get_pi_state(pi_state);
3314                }
3315
3316                /* Unqueue and drop the lock. */
3317                unqueue_me_pi(&q);
3318        }
3319
3320        if (pi_state) {
3321                rt_mutex_futex_unlock(&pi_state->pi_mutex);
3322                put_pi_state(pi_state);
3323        }
3324
3325        if (ret == -EINTR) {
3326                /*
3327                 * We've already been requeued, but cannot restart by calling
3328                 * futex_lock_pi() directly. We could restart this syscall, but
3329                 * it would detect that the user space "val" changed and return
3330                 * -EWOULDBLOCK.  Save the overhead of the restart and return
3331                 * -EWOULDBLOCK directly.
3332                 */
3333                ret = -EWOULDBLOCK;
3334        }
3335
3336out:
3337        if (to) {
3338                hrtimer_cancel(&to->timer);
3339                destroy_hrtimer_on_stack(&to->timer);
3340        }
3341        return ret;
3342}
3343
3344/*
3345 * Support for robust futexes: the kernel cleans up held futexes at
3346 * thread exit time.
3347 *
3348 * Implementation: user-space maintains a per-thread list of locks it
3349 * is holding. Upon do_exit(), the kernel carefully walks this list,
3350 * and marks all locks that are owned by this thread with the
3351 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
3352 * always manipulated with the lock held, so the list is private and
3353 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
3354 * field, to allow the kernel to clean up if the thread dies after
3355 * acquiring the lock, but just before it could have added itself to
3356 * the list. There can only be one such pending lock.
3357 */
3358
3359/**
3360 * sys_set_robust_list() - Set the robust-futex list head of a task
3361 * @head:       pointer to the list-head
3362 * @len:        length of the list-head, as userspace expects
3363 */
3364SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
3365                size_t, len)
3366{
3367        if (!futex_cmpxchg_enabled)
3368                return -ENOSYS;
3369        /*
3370         * The kernel knows only one size for now:
3371         */
3372        if (unlikely(len != sizeof(*head)))
3373                return -EINVAL;
3374
3375        current->robust_list = head;
3376
3377        return 0;
3378}
3379
3380/**
3381 * sys_get_robust_list() - Get the robust-futex list head of a task
3382 * @pid:        pid of the process [zero for current task]
3383 * @head_ptr:   pointer to a list-head pointer, the kernel fills it in
3384 * @len_ptr:    pointer to a length field, the kernel fills in the header size
3385 */
3386SYSCALL_DEFINE3(get_robust_list, int, pid,
3387                struct robust_list_head __user * __user *, head_ptr,
3388                size_t __user *, len_ptr)
3389{
3390        struct robust_list_head __user *head;
3391        unsigned long ret;
3392        struct task_struct *p;
3393
3394        if (!futex_cmpxchg_enabled)
3395                return -ENOSYS;
3396
3397        rcu_read_lock();
3398
3399        ret = -ESRCH;
3400        if (!pid)
3401                p = current;
3402        else {
3403                p = find_task_by_vpid(pid);
3404                if (!p)
3405                        goto err_unlock;
3406        }
3407
3408        ret = -EPERM;
3409        if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
3410                goto err_unlock;
3411
3412        head = p->robust_list;
3413        rcu_read_unlock();
3414
3415        if (put_user(sizeof(*head), len_ptr))
3416                return -EFAULT;
3417        return put_user(head, head_ptr);
3418
3419err_unlock:
3420        rcu_read_unlock();
3421
3422        return ret;
3423}
3424
3425/* Constants for the pending_op argument of handle_futex_death */
3426#define HANDLE_DEATH_PENDING    true
3427#define HANDLE_DEATH_LIST       false
3428
3429/*
3430 * Process a futex-list entry, check whether it's owned by the
3431 * dying task, and do notification if so:
3432 */
3433static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
3434                              bool pi, bool pending_op)
3435{
3436        u32 uval, nval, mval;
3437        int err;
3438
3439        /* Futex address must be 32bit aligned */
3440        if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
3441                return -1;
3442
3443retry:
3444        if (get_user(uval, uaddr))
3445                return -1;
3446
3447        /*
3448         * Special case for regular (non PI) futexes. The unlock path in
3449         * user space has two race scenarios:
3450         *
3451         * 1. The unlock path releases the user space futex value and
3452         *    before it can execute the futex() syscall to wake up
3453         *    waiters it is killed.
3454         *
3455         * 2. A woken up waiter is killed before it can acquire the
3456         *    futex in user space.
3457         *
3458         * In both cases the TID validation below prevents a wakeup of
3459         * potential waiters which can cause these waiters to block
3460         * forever.
3461         *
3462         * In both cases the following conditions are met:
3463         *
3464         *      1) task->robust_list->list_op_pending != NULL
3465         *         @pending_op == true
3466         *      2) User space futex value == 0
3467         *      3) Regular futex: @pi == false
3468         *
3469         * If these conditions are met, it is safe to attempt waking up a
3470         * potential waiter without touching the user space futex value and
3471         * trying to set the OWNER_DIED bit. The user space futex value is
3472         * uncontended and the rest of the user space mutex state is
3473         * consistent, so a woken waiter will just take over the
3474         * uncontended futex. Setting the OWNER_DIED bit would create
3475         * inconsistent state and malfunction of the user space owner died
3476         * handling.
3477         */
3478        if (pending_op && !pi && !uval) {
3479                futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3480                return 0;
3481        }
3482
3483        if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
3484                return 0;
3485
3486        /*
3487         * Ok, this dying thread is truly holding a futex
3488         * of interest. Set the OWNER_DIED bit atomically
3489         * via cmpxchg, and if the value had FUTEX_WAITERS
3490         * set, wake up a waiter (if any). (We have to do a
3491         * futex_wake() even if OWNER_DIED is already set -
3492         * to handle the rare but possible case of recursive
3493         * thread-death.) The rest of the cleanup is done in
3494         * userspace.
3495         */
3496        mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
3497
3498        /*
3499         * We are not holding a lock here, but we want to have
3500         * the pagefault_disable/enable() protection because
3501         * we want to handle the fault gracefully. If the
3502         * access fails we try to fault in the futex with R/W
3503         * verification via get_user_pages. get_user() above
3504         * does not guarantee R/W access. If that fails we
3505         * give up and leave the futex locked.
3506         */
3507        if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
3508                switch (err) {
3509                case -EFAULT:
3510                        if (fault_in_user_writeable(uaddr))
3511                                return -1;
3512                        goto retry;
3513
3514                case -EAGAIN:
3515                        cond_resched();
3516                        goto retry;
3517
3518                default:
3519                        WARN_ON_ONCE(1);
3520                        return err;
3521                }
3522        }
3523
3524        if (nval != uval)
3525                goto retry;
3526
3527        /*
3528         * Wake robust non-PI futexes here. The wakeup of
3529         * PI futexes happens in exit_pi_state():
3530         */
3531        if (!pi && (uval & FUTEX_WAITERS))
3532                futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3533
3534        return 0;
3535}
3536
3537/*
3538 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
3539 */
3540static inline int fetch_robust_entry(struct robust_list __user **entry,
3541                                     struct robust_list __user * __user *head,
3542                                     unsigned int *pi)
3543{
3544        unsigned long uentry;
3545
3546        if (get_user(uentry, (unsigned long __user *)head))
3547                return -EFAULT;
3548
3549        *entry = (void __user *)(uentry & ~1UL);
3550        *pi = uentry & 1;
3551
3552        return 0;
3553}
3554
3555/*
3556 * Walk curr->robust_list (very carefully, it's a userspace list!)
3557 * and mark any locks found there dead, and notify any waiters.
3558 *
3559 * We silently return on any sign of list-walking problem.
3560 */
3561static void exit_robust_list(struct task_struct *curr)
3562{
3563        struct robust_list_head __user *head = curr->robust_list;
3564        struct robust_list __user *entry, *next_entry, *pending;
3565        unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
3566        unsigned int next_pi;
3567        unsigned long futex_offset;
3568        int rc;
3569
3570        if (!futex_cmpxchg_enabled)
3571                return;
3572
3573        /*
3574         * Fetch the list head (which was registered earlier, via
3575         * sys_set_robust_list()):
3576         */
3577        if (fetch_robust_entry(&entry, &head->list.next, &pi))
3578                return;
3579        /*
3580         * Fetch the relative futex offset:
3581         */
3582        if (get_user(futex_offset, &head->futex_offset))
3583                return;
3584        /*
3585         * Fetch any possibly pending lock-add first, and handle it
3586         * if it exists:
3587         */
3588        if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
3589                return;
3590
3591        next_entry = NULL;      /* avoid warning with gcc */
3592        while (entry != &head->list) {
3593                /*
3594                 * Fetch the next entry in the list before calling
3595                 * handle_futex_death:
3596                 */
3597                rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
3598                /*
3599                 * A pending lock might already be on the list, so
3600                 * don't process it twice:
3601                 */
3602                if (entry != pending) {
3603                        if (handle_futex_death((void __user *)entry + futex_offset,
3604                                                curr, pi, HANDLE_DEATH_LIST))
3605                                return;
3606                }
3607                if (rc)
3608                        return;
3609                entry = next_entry;
3610                pi = next_pi;
3611                /*
3612                 * Avoid excessively long or circular lists:
3613                 */
3614                if (!--limit)
3615                        break;
3616
3617                cond_resched();
3618        }
3619
3620        if (pending) {
3621                handle_futex_death((void __user *)pending + futex_offset,
3622                                   curr, pip, HANDLE_DEATH_PENDING);
3623        }
3624}
3625
3626static void futex_cleanup(struct task_struct *tsk)
3627{
3628        if (unlikely(tsk->robust_list)) {
3629                exit_robust_list(tsk);
3630                tsk->robust_list = NULL;
3631        }
3632
3633#ifdef CONFIG_COMPAT
3634        if (unlikely(tsk->compat_robust_list)) {
3635                compat_exit_robust_list(tsk);
3636                tsk->compat_robust_list = NULL;
3637        }
3638#endif
3639
3640        if (unlikely(!list_empty(&tsk->pi_state_list)))
3641                exit_pi_state_list(tsk);
3642}
3643
3644/**
3645 * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
3646 * @tsk:        task to set the state on
3647 *
3648 * Set the futex exit state of the task lockless. The futex waiter code
3649 * observes that state when a task is exiting and loops until the task has
3650 * actually finished the futex cleanup. The worst case for this is that the
3651 * waiter runs through the wait loop until the state becomes visible.
3652 *
3653 * This is called from the recursive fault handling path in do_exit().
3654 *
3655 * This is best effort. Either the futex exit code has run already or
3656 * not. If the OWNER_DIED bit has been set on the futex then the waiter can
3657 * take it over. If not, the problem is pushed back to user space. If the
3658 * futex exit code did not run yet, then an already queued waiter might
3659 * block forever, but there is nothing which can be done about that.
3660 */
3661void futex_exit_recursive(struct task_struct *tsk)
3662{
3663        /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
3664        if (tsk->futex_state == FUTEX_STATE_EXITING)
3665                mutex_unlock(&tsk->futex_exit_mutex);
3666        tsk->futex_state = FUTEX_STATE_DEAD;
3667}
3668
3669static void futex_cleanup_begin(struct task_struct *tsk)
3670{
3671        /*
3672         * Prevent various race issues against a concurrent incoming waiter
3673         * including live locks by forcing the waiter to block on
3674         * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
3675         * attach_to_pi_owner().
3676         */
3677        mutex_lock(&tsk->futex_exit_mutex);
3678
3679        /*
3680         * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
3681         *
3682         * This ensures that all subsequent checks of tsk->futex_state in
3683         * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
3684         * tsk->pi_lock held.
3685         *
3686         * It guarantees also that a pi_state which was queued right before
3687         * the state change under tsk->pi_lock by a concurrent waiter must
3688         * be observed in exit_pi_state_list().
3689         */
3690        raw_spin_lock_irq(&tsk->pi_lock);
3691        tsk->futex_state = FUTEX_STATE_EXITING;
3692        raw_spin_unlock_irq(&tsk->pi_lock);
3693}
3694
3695static void futex_cleanup_end(struct task_struct *tsk, int state)
3696{
3697        /*
3698         * Lockless store. The only side effect is that an observer might
3699         * take another loop until it becomes visible.
3700         */
3701        tsk->futex_state = state;
3702        /*
3703         * Drop the exit protection. This unblocks waiters which observed
3704         * FUTEX_STATE_EXITING to reevaluate the state.
3705         */
3706        mutex_unlock(&tsk->futex_exit_mutex);
3707}
3708
3709void futex_exec_release(struct task_struct *tsk)
3710{
3711        /*
3712         * The state handling is done for consistency, but in the case of
3713         * exec() there is no way to prevent futher damage as the PID stays
3714         * the same. But for the unlikely and arguably buggy case that a
3715         * futex is held on exec(), this provides at least as much state
3716         * consistency protection which is possible.
3717         */
3718        futex_cleanup_begin(tsk);
3719        futex_cleanup(tsk);
3720        /*
3721         * Reset the state to FUTEX_STATE_OK. The task is alive and about
3722         * exec a new binary.
3723         */
3724        futex_cleanup_end(tsk, FUTEX_STATE_OK);
3725}
3726
3727void futex_exit_release(struct task_struct *tsk)
3728{
3729        futex_cleanup_begin(tsk);
3730        futex_cleanup(tsk);
3731        futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
3732}
3733
3734long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
3735                u32 __user *uaddr2, u32 val2, u32 val3)
3736{
3737        int cmd = op & FUTEX_CMD_MASK;
3738        unsigned int flags = 0;
3739
3740        if (!(op & FUTEX_PRIVATE_FLAG))
3741                flags |= FLAGS_SHARED;
3742
3743        if (op & FUTEX_CLOCK_REALTIME) {
3744                flags |= FLAGS_CLOCKRT;
3745                if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \
3746                    cmd != FUTEX_WAIT_REQUEUE_PI)
3747                        return -ENOSYS;
3748        }
3749
3750        switch (cmd) {
3751        case FUTEX_LOCK_PI:
3752        case FUTEX_UNLOCK_PI:
3753        case FUTEX_TRYLOCK_PI:
3754        case FUTEX_WAIT_REQUEUE_PI:
3755        case FUTEX_CMP_REQUEUE_PI:
3756                if (!futex_cmpxchg_enabled)
3757                        return -ENOSYS;
3758        }
3759
3760        switch (cmd) {
3761        case FUTEX_WAIT:
3762                val3 = FUTEX_BITSET_MATCH_ANY;
3763                fallthrough;
3764        case FUTEX_WAIT_BITSET:
3765                return futex_wait(uaddr, flags, val, timeout, val3);
3766        case FUTEX_WAKE:
3767                val3 = FUTEX_BITSET_MATCH_ANY;
3768                fallthrough;
3769        case FUTEX_WAKE_BITSET:
3770                return futex_wake(uaddr, flags, val, val3);
3771        case FUTEX_REQUEUE:
3772                return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
3773        case FUTEX_CMP_REQUEUE:
3774                return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
3775        case FUTEX_WAKE_OP:
3776                return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
3777        case FUTEX_LOCK_PI:
3778                return futex_lock_pi(uaddr, flags, timeout, 0);
3779        case FUTEX_UNLOCK_PI:
3780                return futex_unlock_pi(uaddr, flags);
3781        case FUTEX_TRYLOCK_PI:
3782                return futex_lock_pi(uaddr, flags, NULL, 1);
3783        case FUTEX_WAIT_REQUEUE_PI:
3784                val3 = FUTEX_BITSET_MATCH_ANY;
3785                return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
3786                                             uaddr2);
3787        case FUTEX_CMP_REQUEUE_PI:
3788                return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
3789        }
3790        return -ENOSYS;
3791}
3792
3793
3794SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
3795                struct __kernel_timespec __user *, utime, u32 __user *, uaddr2,
3796                u32, val3)
3797{
3798        struct timespec64 ts;
3799        ktime_t t, *tp = NULL;
3800        u32 val2 = 0;
3801        int cmd = op & FUTEX_CMD_MASK;
3802
3803        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
3804                      cmd == FUTEX_WAIT_BITSET ||
3805                      cmd == FUTEX_WAIT_REQUEUE_PI)) {
3806                if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
3807                        return -EFAULT;
3808                if (get_timespec64(&ts, utime))
3809                        return -EFAULT;
3810                if (!timespec64_valid(&ts))
3811                        return -EINVAL;
3812
3813                t = timespec64_to_ktime(ts);
3814                if (cmd == FUTEX_WAIT)
3815                        t = ktime_add_safe(ktime_get(), t);
3816                else if (!(op & FUTEX_CLOCK_REALTIME))
3817                        t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
3818                tp = &t;
3819        }
3820        /*
3821         * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
3822         * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
3823         */
3824        if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
3825            cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
3826                val2 = (u32) (unsigned long) utime;
3827
3828        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
3829}
3830
3831#ifdef CONFIG_COMPAT
3832/*
3833 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
3834 */
3835static inline int
3836compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
3837                   compat_uptr_t __user *head, unsigned int *pi)
3838{
3839        if (get_user(*uentry, head))
3840                return -EFAULT;
3841
3842        *entry = compat_ptr((*uentry) & ~1);
3843        *pi = (unsigned int)(*uentry) & 1;
3844
3845        return 0;
3846}
3847
3848static void __user *futex_uaddr(struct robust_list __user *entry,
3849                                compat_long_t futex_offset)
3850{
3851        compat_uptr_t base = ptr_to_compat(entry);
3852        void __user *uaddr = compat_ptr(base + futex_offset);
3853
3854        return uaddr;
3855}
3856
3857/*
3858 * Walk curr->robust_list (very carefully, it's a userspace list!)
3859 * and mark any locks found there dead, and notify any waiters.
3860 *
3861 * We silently return on any sign of list-walking problem.
3862 */
3863static void compat_exit_robust_list(struct task_struct *curr)
3864{
3865        struct compat_robust_list_head __user *head = curr->compat_robust_list;
3866        struct robust_list __user *entry, *next_entry, *pending;
3867        unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
3868        unsigned int next_pi;
3869        compat_uptr_t uentry, next_uentry, upending;
3870        compat_long_t futex_offset;
3871        int rc;
3872
3873        if (!futex_cmpxchg_enabled)
3874                return;
3875
3876        /*
3877         * Fetch the list head (which was registered earlier, via
3878         * sys_set_robust_list()):
3879         */
3880        if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
3881                return;
3882        /*
3883         * Fetch the relative futex offset:
3884         */
3885        if (get_user(futex_offset, &head->futex_offset))
3886                return;
3887        /*
3888         * Fetch any possibly pending lock-add first, and handle it
3889         * if it exists:
3890         */
3891        if (compat_fetch_robust_entry(&upending, &pending,
3892                               &head->list_op_pending, &pip))
3893                return;
3894
3895        next_entry = NULL;      /* avoid warning with gcc */
3896        while (entry != (struct robust_list __user *) &head->list) {
3897                /*
3898                 * Fetch the next entry in the list before calling
3899                 * handle_futex_death:
3900                 */
3901                rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
3902                        (compat_uptr_t __user *)&entry->next, &next_pi);
3903                /*
3904                 * A pending lock might already be on the list, so
3905                 * dont process it twice:
3906                 */
3907                if (entry != pending) {
3908                        void __user *uaddr = futex_uaddr(entry, futex_offset);
3909
3910                        if (handle_futex_death(uaddr, curr, pi,
3911                                               HANDLE_DEATH_LIST))
3912                                return;
3913                }
3914                if (rc)
3915                        return;
3916                uentry = next_uentry;
3917                entry = next_entry;
3918                pi = next_pi;
3919                /*
3920                 * Avoid excessively long or circular lists:
3921                 */
3922                if (!--limit)
3923                        break;
3924
3925                cond_resched();
3926        }
3927        if (pending) {
3928                void __user *uaddr = futex_uaddr(pending, futex_offset);
3929
3930                handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
3931        }
3932}
3933
3934COMPAT_SYSCALL_DEFINE2(set_robust_list,
3935                struct compat_robust_list_head __user *, head,
3936                compat_size_t, len)
3937{
3938        if (!futex_cmpxchg_enabled)
3939                return -ENOSYS;
3940
3941        if (unlikely(len != sizeof(*head)))
3942                return -EINVAL;
3943
3944        current->compat_robust_list = head;
3945
3946        return 0;
3947}
3948
3949COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
3950                        compat_uptr_t __user *, head_ptr,
3951                        compat_size_t __user *, len_ptr)
3952{
3953        struct compat_robust_list_head __user *head;
3954        unsigned long ret;
3955        struct task_struct *p;
3956
3957        if (!futex_cmpxchg_enabled)
3958                return -ENOSYS;
3959
3960        rcu_read_lock();
3961
3962        ret = -ESRCH;
3963        if (!pid)
3964                p = current;
3965        else {
3966                p = find_task_by_vpid(pid);
3967                if (!p)
3968                        goto err_unlock;
3969        }
3970
3971        ret = -EPERM;
3972        if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
3973                goto err_unlock;
3974
3975        head = p->compat_robust_list;
3976        rcu_read_unlock();
3977
3978        if (put_user(sizeof(*head), len_ptr))
3979                return -EFAULT;
3980        return put_user(ptr_to_compat(head), head_ptr);
3981
3982err_unlock:
3983        rcu_read_unlock();
3984
3985        return ret;
3986}
3987#endif /* CONFIG_COMPAT */
3988
3989#ifdef CONFIG_COMPAT_32BIT_TIME
3990SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
3991                struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
3992                u32, val3)
3993{
3994        struct timespec64 ts;
3995        ktime_t t, *tp = NULL;
3996        int val2 = 0;
3997        int cmd = op & FUTEX_CMD_MASK;
3998
3999        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
4000                      cmd == FUTEX_WAIT_BITSET ||
4001                      cmd == FUTEX_WAIT_REQUEUE_PI)) {
4002                if (get_old_timespec32(&ts, utime))
4003                        return -EFAULT;
4004                if (!timespec64_valid(&ts))
4005                        return -EINVAL;
4006
4007                t = timespec64_to_ktime(ts);
4008                if (cmd == FUTEX_WAIT)
4009                        t = ktime_add_safe(ktime_get(), t);
4010                else if (!(op & FUTEX_CLOCK_REALTIME))
4011                        t = timens_ktime_to_host(CLOCK_MONOTONIC, t);
4012                tp = &t;
4013        }
4014        if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
4015            cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
4016                val2 = (int) (unsigned long) utime;
4017
4018        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
4019}
4020#endif /* CONFIG_COMPAT_32BIT_TIME */
4021
4022static void __init futex_detect_cmpxchg(void)
4023{
4024#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
4025        u32 curval;
4026
4027        /*
4028         * This will fail and we want it. Some arch implementations do
4029         * runtime detection of the futex_atomic_cmpxchg_inatomic()
4030         * functionality. We want to know that before we call in any
4031         * of the complex code paths. Also we want to prevent
4032         * registration of robust lists in that case. NULL is
4033         * guaranteed to fault and we get -EFAULT on functional
4034         * implementation, the non-functional ones will return
4035         * -ENOSYS.
4036         */
4037        if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
4038                futex_cmpxchg_enabled = 1;
4039#endif
4040}
4041
4042static int __init futex_init(void)
4043{
4044        unsigned int futex_shift;
4045        unsigned long i;
4046
4047#if CONFIG_BASE_SMALL
4048        futex_hashsize = 16;
4049#else
4050        futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
4051#endif
4052
4053        futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
4054                                               futex_hashsize, 0,
4055                                               futex_hashsize < 256 ? HASH_SMALL : 0,
4056                                               &futex_shift, NULL,
4057                                               futex_hashsize, futex_hashsize);
4058        futex_hashsize = 1UL << futex_shift;
4059
4060        futex_detect_cmpxchg();
4061
4062        for (i = 0; i < futex_hashsize; i++) {
4063                atomic_set(&futex_queues[i].waiters, 0);
4064                plist_head_init(&futex_queues[i].chain);
4065                spin_lock_init(&futex_queues[i].lock);
4066        }
4067
4068        return 0;
4069}
4070core_initcall(futex_init);
4071