linux/kernel/futex.c
   1/*
   2 *  Fast Userspace Mutexes (which I call "Futexes!").
   3 *  (C) Rusty Russell, IBM 2002
   4 *
   5 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
   6 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
   7 *
   8 *  Removed page pinning, fix privately mapped COW pages and other cleanups
   9 *  (C) Copyright 2003, 2004 Jamie Lokier
  10 *
  11 *  Robust futex support started by Ingo Molnar
  12 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  13 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  14 *
  15 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
  16 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  17 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  18 *
  19 *  PRIVATE futexes by Eric Dumazet
  20 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
  21 *
  22 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
  23 *  Copyright (C) IBM Corporation, 2009
  24 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
  25 *
  26 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  27 *  enough at me, Linus for the original (flawed) idea, Matthew
  28 *  Kirkwood for proof-of-concept implementation.
  29 *
  30 *  "The futexes are also cursed."
  31 *  "But they come in a choice of three flavours!"
  32 *
  33 *  This program is free software; you can redistribute it and/or modify
  34 *  it under the terms of the GNU General Public License as published by
  35 *  the Free Software Foundation; either version 2 of the License, or
  36 *  (at your option) any later version.
  37 *
  38 *  This program is distributed in the hope that it will be useful,
  39 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  40 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  41 *  GNU General Public License for more details.
  42 *
  43 *  You should have received a copy of the GNU General Public License
  44 *  along with this program; if not, write to the Free Software
  45 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  46 */
  47#include <linux/compat.h>
  48#include <linux/slab.h>
  49#include <linux/poll.h>
  50#include <linux/fs.h>
  51#include <linux/file.h>
  52#include <linux/jhash.h>
  53#include <linux/init.h>
  54#include <linux/futex.h>
  55#include <linux/mount.h>
  56#include <linux/pagemap.h>
  57#include <linux/syscalls.h>
  58#include <linux/signal.h>
  59#include <linux/export.h>
  60#include <linux/magic.h>
  61#include <linux/pid.h>
  62#include <linux/nsproxy.h>
  63#include <linux/ptrace.h>
  64#include <linux/sched/rt.h>
  65#include <linux/sched/wake_q.h>
  66#include <linux/sched/mm.h>
  67#include <linux/hugetlb.h>
  68#include <linux/freezer.h>
  69#include <linux/memblock.h>
  70#include <linux/fault-inject.h>
  71#include <linux/refcount.h>
  72
  73#include <asm/futex.h>
  74
  75#include "locking/rtmutex_common.h"
  76
  77/*
  78 * READ this before attempting to hack on futexes!
  79 *
  80 * Basic futex operation and ordering guarantees
  81 * =============================================
  82 *
  83 * The waiter reads the futex value in user space and calls
  84 * futex_wait(). This function computes the hash bucket and acquires
  85 * the hash bucket lock. After that it reads the futex user space value
  86 * again and verifies that the data has not changed. If it has not changed
  87 * it enqueues itself into the hash bucket, releases the hash bucket lock
  88 * and schedules.
  89 *
  90 * The waker side modifies the user space value of the futex and calls
  91 * futex_wake(). This function computes the hash bucket and acquires the
  92 * hash bucket lock. Then it looks for waiters on that futex in the hash
  93 * bucket and wakes them.
  94 *
  95 * In futex wake up scenarios where no tasks are blocked on a futex, taking
   96 * the hb spinlock can be avoided and futex_wake() can simply return. For this
  97 * optimization to work, ordering guarantees must exist so that the waiter
  98 * being added to the list is acknowledged when the list is concurrently being
  99 * checked by the waker, avoiding scenarios like the following:
 100 *
 101 * CPU 0                               CPU 1
 102 * val = *futex;
 103 * sys_futex(WAIT, futex, val);
 104 *   futex_wait(futex, val);
 105 *   uval = *futex;
 106 *                                     *futex = newval;
 107 *                                     sys_futex(WAKE, futex);
 108 *                                       futex_wake(futex);
 109 *                                       if (queue_empty())
 110 *                                         return;
 111 *   if (uval == val)
 112 *      lock(hash_bucket(futex));
 113 *      queue();
 114 *     unlock(hash_bucket(futex));
 115 *     schedule();
 116 *
 117 * This would cause the waiter on CPU 0 to wait forever because it
 118 * missed the transition of the user space value from val to newval
 119 * and the waker did not find the waiter in the hash bucket queue.
 120 *
 121 * The correct serialization ensures that a waiter either observes
 122 * the changed user space value before blocking or is woken by a
 123 * concurrent waker:
 124 *
 125 * CPU 0                                 CPU 1
 126 * val = *futex;
 127 * sys_futex(WAIT, futex, val);
 128 *   futex_wait(futex, val);
 129 *
 130 *   waiters++; (a)
 131 *   smp_mb(); (A) <-- paired with -.
 132 *                                  |
 133 *   lock(hash_bucket(futex));      |
 134 *                                  |
 135 *   uval = *futex;                 |
 136 *                                  |        *futex = newval;
 137 *                                  |        sys_futex(WAKE, futex);
 138 *                                  |          futex_wake(futex);
 139 *                                  |
 140 *                                  `--------> smp_mb(); (B)
 141 *   if (uval == val)
 142 *     queue();
 143 *     unlock(hash_bucket(futex));
 144 *     schedule();                         if (waiters)
 145 *                                           lock(hash_bucket(futex));
 146 *   else                                    wake_waiters(futex);
 147 *     waiters--; (b)                        unlock(hash_bucket(futex));
 148 *
 149 * Where (A) orders the waiters increment and the futex value read through
  150 * atomic operations (see hb_waiters_inc()) and where (B) orders the write
 151 * to futex and the waiters read (see hb_waiters_pending()).
 152 *
 153 * This yields the following case (where X:=waiters, Y:=futex):
 154 *
 155 *      X = Y = 0
 156 *
 157 *      w[X]=1          w[Y]=1
 158 *      MB              MB
 159 *      r[Y]=y          r[X]=x
 160 *
 161 * Which guarantees that x==0 && y==0 is impossible; which translates back into
 162 * the guarantee that we cannot both miss the futex variable change and the
 163 * enqueue.
 164 *
  165 * Note that a new waiter is accounted for in (a) even when the wait call
  166 * may return an error, in which case we backtrack from it in (b).
 167 * Refer to the comment in queue_lock().
 168 *
 169 * Similarly, in order to account for waiters being requeued on another
 170 * address we always increment the waiters for the destination bucket before
  171 * acquiring the lock and decrement them again after releasing it -
 172 * the code that actually moves the futex(es) between hash buckets (requeue_futex)
 173 * will do the additional required waiter count housekeeping. This is done for
 174 * double_lock_hb() and double_unlock_hb(), respectively.
 175 */
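/*
 * Illustrative user-space counterpart of the protocol above (a sketch, not
 * part of this file's logic; the helper names futex_wait_sketch() and
 * futex_wake_sketch() are made up, only SYS_futex, FUTEX_WAIT and
 * FUTEX_WAKE are real interfaces):
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <stdatomic.h>
 *
 *	// Block until *uaddr no longer contains val. The kernel re-checks
 *	// the value under the hash bucket lock, which closes the race
 *	// illustrated in the first diagram above.
 *	static int futex_wait_sketch(atomic_uint *uaddr, unsigned int val)
 *	{
 *		return syscall(SYS_futex, uaddr, FUTEX_WAIT, val, NULL, NULL, 0);
 *	}
 *
 *	// Publish the new value, then wake at most one waiter. The store
 *	// preceding the syscall mirrors the waker side of barrier (B).
 *	static int futex_wake_sketch(atomic_uint *uaddr, unsigned int newval)
 *	{
 *		atomic_store(uaddr, newval);
 *		return syscall(SYS_futex, uaddr, FUTEX_WAKE, 1, NULL, NULL, 0);
 *	}
 */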
 176
 177#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
 178#define futex_cmpxchg_enabled 1
 179#else
 180static int  __read_mostly futex_cmpxchg_enabled;
 181#endif
 182
 183/*
 184 * Futex flags used to encode options to functions and preserve them across
 185 * restarts.
 186 */
 187#ifdef CONFIG_MMU
 188# define FLAGS_SHARED           0x01
 189#else
 190/*
 191 * NOMMU does not have per process address space. Let the compiler optimize
 192 * code away.
 193 */
 194# define FLAGS_SHARED           0x00
 195#endif
 196#define FLAGS_CLOCKRT           0x02
 197#define FLAGS_HAS_TIMEOUT       0x04
 198
 199/*
 200 * Priority Inheritance state:
 201 */
 202struct futex_pi_state {
 203        /*
 204         * list of 'owned' pi_state instances - these have to be
 205         * cleaned up in do_exit() if the task exits prematurely:
 206         */
 207        struct list_head list;
 208
 209        /*
 210         * The PI object:
 211         */
 212        struct rt_mutex pi_mutex;
 213
 214        struct task_struct *owner;
 215        refcount_t refcount;
 216
 217        union futex_key key;
 218} __randomize_layout;
 219
 220/**
 221 * struct futex_q - The hashed futex queue entry, one per waiting task
 222 * @list:               priority-sorted list of tasks waiting on this futex
 223 * @task:               the task waiting on the futex
 224 * @lock_ptr:           the hash bucket lock
 225 * @key:                the key the futex is hashed on
 226 * @pi_state:           optional priority inheritance state
 227 * @rt_waiter:          rt_waiter storage for use with requeue_pi
 228 * @requeue_pi_key:     the requeue_pi target futex key
 229 * @bitset:             bitset for the optional bitmasked wakeup
 230 *
 231 * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
 232 * we can wake only the relevant ones (hashed queues may be shared).
 233 *
 234 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 235 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 236 * The order of wakeup is always to make the first condition true, then
 237 * the second.
 238 *
 239 * PI futexes are typically woken before they are removed from the hash list via
 240 * the rt_mutex code. See unqueue_me_pi().
 241 */
 242struct futex_q {
 243        struct plist_node list;
 244
 245        struct task_struct *task;
 246        spinlock_t *lock_ptr;
 247        union futex_key key;
 248        struct futex_pi_state *pi_state;
 249        struct rt_mutex_waiter *rt_waiter;
 250        union futex_key *requeue_pi_key;
 251        u32 bitset;
 252} __randomize_layout;
 253
 254static const struct futex_q futex_q_init = {
 255        /* list gets initialized in queue_me()*/
 256        .key = FUTEX_KEY_INIT,
 257        .bitset = FUTEX_BITSET_MATCH_ANY
 258};
 259
 260/*
 261 * Hash buckets are shared by all the futex_keys that hash to the same
 262 * location.  Each key may have multiple futex_q structures, one for each task
 263 * waiting on a futex.
 264 */
 265struct futex_hash_bucket {
 266        atomic_t waiters;
 267        spinlock_t lock;
 268        struct plist_head chain;
 269} ____cacheline_aligned_in_smp;
 270
 271/*
 272 * The base of the bucket array and its size are always used together
 273 * (after initialization only in hash_futex()), so ensure that they
 274 * reside in the same cacheline.
 275 */
 276static struct {
 277        struct futex_hash_bucket *queues;
 278        unsigned long            hashsize;
 279} __futex_data __read_mostly __aligned(2*sizeof(long));
 280#define futex_queues   (__futex_data.queues)
 281#define futex_hashsize (__futex_data.hashsize)
 282
 283
 284/*
 285 * Fault injections for futexes.
 286 */
 287#ifdef CONFIG_FAIL_FUTEX
 288
 289static struct {
 290        struct fault_attr attr;
 291
 292        bool ignore_private;
 293} fail_futex = {
 294        .attr = FAULT_ATTR_INITIALIZER,
 295        .ignore_private = false,
 296};
 297
 298static int __init setup_fail_futex(char *str)
 299{
 300        return setup_fault_attr(&fail_futex.attr, str);
 301}
 302__setup("fail_futex=", setup_fail_futex);
 303
 304static bool should_fail_futex(bool fshared)
 305{
 306        if (fail_futex.ignore_private && !fshared)
 307                return false;
 308
 309        return should_fail(&fail_futex.attr, 1);
 310}
 311
 312#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 313
 314static int __init fail_futex_debugfs(void)
 315{
 316        umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
 317        struct dentry *dir;
 318
 319        dir = fault_create_debugfs_attr("fail_futex", NULL,
 320                                        &fail_futex.attr);
 321        if (IS_ERR(dir))
 322                return PTR_ERR(dir);
 323
 324        debugfs_create_bool("ignore-private", mode, dir,
 325                            &fail_futex.ignore_private);
 326        return 0;
 327}
 328
 329late_initcall(fail_futex_debugfs);
 330
 331#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
 332
 333#else
 334static inline bool should_fail_futex(bool fshared)
 335{
 336        return false;
 337}
 338#endif /* CONFIG_FAIL_FUTEX */
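/*
 * Usage sketch (assuming the common fault-attr interfaces from
 * lib/fault-inject.c): with CONFIG_FAIL_FUTEX=y the attributes can be
 * seeded at boot with
 *
 *	fail_futex=<interval>,<probability>,<space>,<times>
 *
 * and, with CONFIG_FAULT_INJECTION_DEBUG_FS, adjusted at run time under
 * /sys/kernel/debug/fail_futex/, e.g.:
 *
 *	echo 1  > /sys/kernel/debug/fail_futex/ignore-private
 *	echo 10 > /sys/kernel/debug/fail_futex/probability
 *	echo -1 > /sys/kernel/debug/fail_futex/times
 */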
 339
 340#ifdef CONFIG_COMPAT
 341static void compat_exit_robust_list(struct task_struct *curr);
 342#else
 343static inline void compat_exit_robust_list(struct task_struct *curr) { }
 344#endif
 345
 346/*
 347 * Reflects a new waiter being added to the waitqueue.
 348 */
 349static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
 350{
 351#ifdef CONFIG_SMP
 352        atomic_inc(&hb->waiters);
 353        /*
 354         * Full barrier (A), see the ordering comment above.
 355         */
 356        smp_mb__after_atomic();
 357#endif
 358}
 359
 360/*
 361 * Reflects a waiter being removed from the waitqueue by wakeup
 362 * paths.
 363 */
 364static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
 365{
 366#ifdef CONFIG_SMP
 367        atomic_dec(&hb->waiters);
 368#endif
 369}
 370
 371static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
 372{
 373#ifdef CONFIG_SMP
 374        /*
 375         * Full barrier (B), see the ordering comment above.
 376         */
 377        smp_mb();
 378        return atomic_read(&hb->waiters);
 379#else
 380        return 1;
 381#endif
 382}
 383
 384/**
 385 * hash_futex - Return the hash bucket in the global hash
 386 * @key:        Pointer to the futex key for which the hash is calculated
 387 *
 388 * We hash on the keys returned from get_futex_key (see below) and return the
 389 * corresponding hash bucket in the global hash.
 390 */
 391static struct futex_hash_bucket *hash_futex(union futex_key *key)
 392{
 393        u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
 394                          key->both.offset);
 395
 396        return &futex_queues[hash & (futex_hashsize - 1)];
 397}
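/*
 * Note: futex_hashsize is set up as a power of two at boot (futex_init()),
 * so the mask above is equivalent to "hash % futex_hashsize"; e.g. with
 * futex_hashsize == 256 any 32-bit hash selects one of buckets 0..255.
 */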
 398
 399
 400/**
 401 * match_futex - Check whether two futex keys are equal
 402 * @key1:       Pointer to key1
 403 * @key2:       Pointer to key2
 404 *
 405 * Return 1 if two futex_keys are equal, 0 otherwise.
 406 */
 407static inline int match_futex(union futex_key *key1, union futex_key *key2)
 408{
 409        return (key1 && key2
 410                && key1->both.word == key2->both.word
 411                && key1->both.ptr == key2->both.ptr
 412                && key1->both.offset == key2->both.offset);
 413}
 414
 415enum futex_access {
 416        FUTEX_READ,
 417        FUTEX_WRITE
 418};
 419
 420/**
 421 * futex_setup_timer - set up the sleeping hrtimer.
 422 * @time:       ptr to the given timeout value
 423 * @timeout:    the hrtimer_sleeper structure to be set up
 424 * @flags:      futex flags
 425 * @range_ns:   optional range in ns
 426 *
 427 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
 428 *         value given
 429 */
 430static inline struct hrtimer_sleeper *
 431futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
 432                  int flags, u64 range_ns)
 433{
 434        if (!time)
 435                return NULL;
 436
 437        hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
 438                                      CLOCK_REALTIME : CLOCK_MONOTONIC,
 439                                      HRTIMER_MODE_ABS);
 440        /*
 441         * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
 442         * effectively the same as calling hrtimer_set_expires().
 443         */
 444        hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
 445
 446        return timeout;
 447}
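/*
 * Sketch of how the wait paths in this file typically use this helper
 * ('abs_time' and 'flags' stand for the caller's arguments):
 *
 *	struct hrtimer_sleeper timeout, *to;
 *
 *	to = futex_setup_timer(abs_time, &timeout, flags,
 *			       current->timer_slack_ns);
 *	...
 *	if (to) {
 *		hrtimer_cancel(&to->timer);
 *		destroy_hrtimer_on_stack(&to->timer);
 *	}
 */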
 448
 449/*
 450 * Generate a machine wide unique identifier for this inode.
 451 *
 452 * This relies on u64 not wrapping in the life-time of the machine; which with
 453 * 1ns resolution means almost 585 years.
 454 *
 455 * This further relies on the fact that a well formed program will not unmap
 456 * the file while it has a (shared) futex waiting on it. This mapping will have
 457 * a file reference which pins the mount and inode.
 458 *
 459 * If for some reason an inode gets evicted and read back in again, it will get
 460 * a new sequence number and will _NOT_ match, even though it is the exact same
 461 * file.
 462 *
 463 * It is important that match_futex() will never have a false-positive, esp.
 464 * for PI futexes that can mess up the state. The above argues that false-negatives
 465 * are only possible for malformed programs.
 466 */
 467static u64 get_inode_sequence_number(struct inode *inode)
 468{
 469        static atomic64_t i_seq;
 470        u64 old;
 471
 472        /* Does the inode already have a sequence number? */
 473        old = atomic64_read(&inode->i_sequence);
 474        if (likely(old))
 475                return old;
 476
 477        for (;;) {
 478                u64 new = atomic64_add_return(1, &i_seq);
 479                if (WARN_ON_ONCE(!new))
 480                        continue;
 481
 482                old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
 483                if (old)
 484                        return old;
 485                return new;
 486        }
 487}
 488
 489/**
 490 * get_futex_key() - Get parameters which are the keys for a futex
 491 * @uaddr:      virtual address of the futex
 492 * @fshared:    0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
 493 * @key:        address where result is stored.
 494 * @rw:         mapping needs to be read/write (values: FUTEX_READ,
 495 *              FUTEX_WRITE)
 496 *
 497 * Return: a negative error code or 0
 498 *
 499 * The key words are stored in @key on success.
 500 *
 501 * For shared mappings (when @fshared), the key is:
 502 *   ( inode->i_sequence, page->index, offset_within_page )
 503 * [ also see get_inode_sequence_number() ]
 504 *
 505 * For private mappings (or when !@fshared), the key is:
 506 *   ( current->mm, address, 0 )
 507 *
 508 * This allows (cross process, where applicable) identification of the futex
 509 * without keeping the page pinned for the duration of the FUTEX_WAIT.
 510 *
 511 * lock_page() might sleep, the caller should not hold a spinlock.
 512 */
 513static int
 514get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
 515{
 516        unsigned long address = (unsigned long)uaddr;
 517        struct mm_struct *mm = current->mm;
 518        struct page *page, *tail;
 519        struct address_space *mapping;
 520        int err, ro = 0;
 521
 522        /*
 523         * The futex address must be "naturally" aligned.
 524         */
 525        key->both.offset = address % PAGE_SIZE;
 526        if (unlikely((address % sizeof(u32)) != 0))
 527                return -EINVAL;
 528        address -= key->both.offset;
 529
 530        if (unlikely(!access_ok(uaddr, sizeof(u32))))
 531                return -EFAULT;
 532
 533        if (unlikely(should_fail_futex(fshared)))
 534                return -EFAULT;
 535
 536        /*
 537         * PROCESS_PRIVATE futexes are fast.
 538         * As the mm cannot disappear under us and the 'key' only needs
  539         * virtual address, we don't even have to find the underlying vma.
  540         * Note: We do have to check that 'uaddr' is a valid user address,
  541         *        but access_ok() should be faster than find_vma().
 542         */
 543        if (!fshared) {
 544                key->private.mm = mm;
 545                key->private.address = address;
 546                return 0;
 547        }
 548
 549again:
 550        /* Ignore any VERIFY_READ mapping (futex common case) */
 551        if (unlikely(should_fail_futex(fshared)))
 552                return -EFAULT;
 553
 554        err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
 555        /*
 556         * If write access is not required (eg. FUTEX_WAIT), try
 557         * and get read-only access.
 558         */
 559        if (err == -EFAULT && rw == FUTEX_READ) {
 560                err = get_user_pages_fast(address, 1, 0, &page);
 561                ro = 1;
 562        }
 563        if (err < 0)
 564                return err;
 565        else
 566                err = 0;
 567
 568        /*
 569         * The treatment of mapping from this point on is critical. The page
 570         * lock protects many things but in this context the page lock
 571         * stabilizes mapping, prevents inode freeing in the shared
 572         * file-backed region case and guards against movement to swap cache.
 573         *
  574         * Strictly speaking the page lock is not needed in all cases being
  575         * considered here, and the page lock forces unnecessary serialization.
  576         * From this point on, mapping will be re-verified if necessary and the
  577         * page lock will be acquired only if it is unavoidable.
 578         *
 579         * Mapping checks require the head page for any compound page so the
  580         * head page and mapping are looked up now. For anonymous pages, it
 581         * does not matter if the page splits in the future as the key is
 582         * based on the address. For filesystem-backed pages, the tail is
 583         * required as the index of the page determines the key. For
 584         * base pages, there is no tail page and tail == page.
 585         */
 586        tail = page;
 587        page = compound_head(page);
 588        mapping = READ_ONCE(page->mapping);
 589
 590        /*
 591         * If page->mapping is NULL, then it cannot be a PageAnon
 592         * page; but it might be the ZERO_PAGE or in the gate area or
 593         * in a special mapping (all cases which we are happy to fail);
 594         * or it may have been a good file page when get_user_pages_fast
 595         * found it, but truncated or holepunched or subjected to
 596         * invalidate_complete_page2 before we got the page lock (also
 597         * cases which we are happy to fail).  And we hold a reference,
 598         * so refcount care in invalidate_complete_page's remove_mapping
 599         * prevents drop_caches from setting mapping to NULL beneath us.
 600         *
 601         * The case we do have to guard against is when memory pressure made
 602         * shmem_writepage move it from filecache to swapcache beneath us:
 603         * an unlikely race, but we do need to retry for page->mapping.
 604         */
 605        if (unlikely(!mapping)) {
 606                int shmem_swizzled;
 607
 608                /*
 609                 * Page lock is required to identify which special case above
 610                 * applies. If this is really a shmem page then the page lock
 611                 * will prevent unexpected transitions.
 612                 */
 613                lock_page(page);
 614                shmem_swizzled = PageSwapCache(page) || page->mapping;
 615                unlock_page(page);
 616                put_page(page);
 617
 618                if (shmem_swizzled)
 619                        goto again;
 620
 621                return -EFAULT;
 622        }
 623
 624        /*
 625         * Private mappings are handled in a simple way.
 626         *
 627         * If the futex key is stored on an anonymous page, then the associated
 628         * object is the mm which is implicitly pinned by the calling process.
 629         *
 630         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 631         * it's a read-only handle, it's expected that futexes attach to
 632         * the object not the particular process.
 633         */
 634        if (PageAnon(page)) {
 635                /*
 636                 * A RO anonymous page will never change and thus doesn't make
 637                 * sense for futex operations.
 638                 */
 639                if (unlikely(should_fail_futex(fshared)) || ro) {
 640                        err = -EFAULT;
 641                        goto out;
 642                }
 643
 644                key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 645                key->private.mm = mm;
 646                key->private.address = address;
 647
 648        } else {
 649                struct inode *inode;
 650
 651                /*
 652                 * The associated futex object in this case is the inode and
 653                 * the page->mapping must be traversed. Ordinarily this should
 654                 * be stabilised under page lock but it's not strictly
 655                 * necessary in this case as we just want to pin the inode, not
 656                 * update the radix tree or anything like that.
 657                 *
 658                 * The RCU read lock is taken as the inode is finally freed
 659                 * under RCU. If the mapping still matches expectations then the
 660                 * mapping->host can be safely accessed as being a valid inode.
 661                 */
 662                rcu_read_lock();
 663
 664                if (READ_ONCE(page->mapping) != mapping) {
 665                        rcu_read_unlock();
 666                        put_page(page);
 667
 668                        goto again;
 669                }
 670
 671                inode = READ_ONCE(mapping->host);
 672                if (!inode) {
 673                        rcu_read_unlock();
 674                        put_page(page);
 675
 676                        goto again;
 677                }
 678
 679                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 680                key->shared.i_seq = get_inode_sequence_number(inode);
 681                key->shared.pgoff = basepage_index(tail);
 682                rcu_read_unlock();
 683        }
 684
 685out:
 686        put_page(page);
 687        return err;
 688}
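/*
 * User-space view of the two key types above (illustrative sketch; 'fd'
 * would be a descriptor for a file mapped into several processes):
 *
 *	// Private futex: fshared == 0, key is (current->mm, address).
 *	uint32_t word = 0;
 *	syscall(SYS_futex, &word, FUTEX_WAIT_PRIVATE, 0, NULL, NULL, 0);
 *
 *	// Shared futex: fshared == 1, key is (inode sequence, page index,
 *	// offset) for the file-backed MAP_SHARED mapping.
 *	uint32_t *shared = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *				MAP_SHARED, fd, 0);
 *	syscall(SYS_futex, shared, FUTEX_WAIT, 0, NULL, NULL, 0);
 */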
 689
 690static inline void put_futex_key(union futex_key *key)
 691{
 692}
 693
 694/**
 695 * fault_in_user_writeable() - Fault in user address and verify RW access
 696 * @uaddr:      pointer to faulting user space address
 697 *
 698 * Slow path to fixup the fault we just took in the atomic write
 699 * access to @uaddr.
 700 *
 701 * We have no generic implementation of a non-destructive write to the
 702 * user address. We know that we faulted in the atomic pagefault
 703 * disabled section so we can as well avoid the #PF overhead by
  704 * calling fixup_user_fault() right away.
 705 */
 706static int fault_in_user_writeable(u32 __user *uaddr)
 707{
 708        struct mm_struct *mm = current->mm;
 709        int ret;
 710
 711        down_read(&mm->mmap_sem);
 712        ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
 713                               FAULT_FLAG_WRITE, NULL);
 714        up_read(&mm->mmap_sem);
 715
 716        return ret < 0 ? ret : 0;
 717}
 718
 719/**
 720 * futex_top_waiter() - Return the highest priority waiter on a futex
 721 * @hb:         the hash bucket the futex_q's reside in
 722 * @key:        the futex key (to distinguish it from other futex futex_q's)
 723 *
 724 * Must be called with the hb lock held.
 725 */
 726static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
 727                                        union futex_key *key)
 728{
 729        struct futex_q *this;
 730
 731        plist_for_each_entry(this, &hb->chain, list) {
 732                if (match_futex(&this->key, key))
 733                        return this;
 734        }
 735        return NULL;
 736}
 737
 738static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
 739                                      u32 uval, u32 newval)
 740{
 741        int ret;
 742
 743        pagefault_disable();
 744        ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
 745        pagefault_enable();
 746
 747        return ret;
 748}
 749
 750static int get_futex_value_locked(u32 *dest, u32 __user *from)
 751{
 752        int ret;
 753
 754        pagefault_disable();
 755        ret = __get_user(*dest, from);
 756        pagefault_enable();
 757
 758        return ret ? -EFAULT : 0;
 759}
 760
 761
 762/*
 763 * PI code:
 764 */
 765static int refill_pi_state_cache(void)
 766{
 767        struct futex_pi_state *pi_state;
 768
 769        if (likely(current->pi_state_cache))
 770                return 0;
 771
 772        pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
 773
 774        if (!pi_state)
 775                return -ENOMEM;
 776
 777        INIT_LIST_HEAD(&pi_state->list);
 778        /* pi_mutex gets initialized later */
 779        pi_state->owner = NULL;
 780        refcount_set(&pi_state->refcount, 1);
 781        pi_state->key = FUTEX_KEY_INIT;
 782
 783        current->pi_state_cache = pi_state;
 784
 785        return 0;
 786}
 787
 788static struct futex_pi_state *alloc_pi_state(void)
 789{
 790        struct futex_pi_state *pi_state = current->pi_state_cache;
 791
 792        WARN_ON(!pi_state);
 793        current->pi_state_cache = NULL;
 794
 795        return pi_state;
 796}
 797
 798static void get_pi_state(struct futex_pi_state *pi_state)
 799{
 800        WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
 801}
 802
 803/*
 804 * Drops a reference to the pi_state object and frees or caches it
 805 * when the last reference is gone.
 806 */
 807static void put_pi_state(struct futex_pi_state *pi_state)
 808{
 809        if (!pi_state)
 810                return;
 811
 812        if (!refcount_dec_and_test(&pi_state->refcount))
 813                return;
 814
 815        /*
 816         * If pi_state->owner is NULL, the owner is most probably dying
 817         * and has cleaned up the pi_state already
 818         */
 819        if (pi_state->owner) {
 820                struct task_struct *owner;
 821
 822                raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 823                owner = pi_state->owner;
 824                if (owner) {
 825                        raw_spin_lock(&owner->pi_lock);
 826                        list_del_init(&pi_state->list);
 827                        raw_spin_unlock(&owner->pi_lock);
 828                }
 829                rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
 830                raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 831        }
 832
 833        if (current->pi_state_cache) {
 834                kfree(pi_state);
 835        } else {
 836                /*
 837                 * pi_state->list is already empty.
 838                 * clear pi_state->owner.
 839                 * refcount is at 0 - put it back to 1.
 840                 */
 841                pi_state->owner = NULL;
 842                refcount_set(&pi_state->refcount, 1);
 843                current->pi_state_cache = pi_state;
 844        }
 845}
 846
 847#ifdef CONFIG_FUTEX_PI
 848
 849/*
 850 * This task is holding PI mutexes at exit time => bad.
 851 * Kernel cleans up PI-state, but userspace is likely hosed.
 852 * (Robust-futex cleanup is separate and might save the day for userspace.)
 853 */
 854static void exit_pi_state_list(struct task_struct *curr)
 855{
 856        struct list_head *next, *head = &curr->pi_state_list;
 857        struct futex_pi_state *pi_state;
 858        struct futex_hash_bucket *hb;
 859        union futex_key key = FUTEX_KEY_INIT;
 860
 861        if (!futex_cmpxchg_enabled)
 862                return;
 863        /*
 864         * We are a ZOMBIE and nobody can enqueue itself on
 865         * pi_state_list anymore, but we have to be careful
 866         * versus waiters unqueueing themselves:
 867         */
 868        raw_spin_lock_irq(&curr->pi_lock);
 869        while (!list_empty(head)) {
 870                next = head->next;
 871                pi_state = list_entry(next, struct futex_pi_state, list);
 872                key = pi_state->key;
 873                hb = hash_futex(&key);
 874
 875                /*
 876                 * We can race against put_pi_state() removing itself from the
 877                 * list (a waiter going away). put_pi_state() will first
 878                 * decrement the reference count and then modify the list, so
  879                  * it's possible to see the list entry but fail this reference
 880                 * acquire.
 881                 *
  882                  * In that case, drop the locks to let put_pi_state() make
 883                 * progress and retry the loop.
 884                 */
 885                if (!refcount_inc_not_zero(&pi_state->refcount)) {
 886                        raw_spin_unlock_irq(&curr->pi_lock);
 887                        cpu_relax();
 888                        raw_spin_lock_irq(&curr->pi_lock);
 889                        continue;
 890                }
 891                raw_spin_unlock_irq(&curr->pi_lock);
 892
 893                spin_lock(&hb->lock);
 894                raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 895                raw_spin_lock(&curr->pi_lock);
 896                /*
 897                 * We dropped the pi-lock, so re-check whether this
 898                 * task still owns the PI-state:
 899                 */
 900                if (head->next != next) {
 901                        /* retain curr->pi_lock for the loop invariant */
 902                        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
 903                        spin_unlock(&hb->lock);
 904                        put_pi_state(pi_state);
 905                        continue;
 906                }
 907
 908                WARN_ON(pi_state->owner != curr);
 909                WARN_ON(list_empty(&pi_state->list));
 910                list_del_init(&pi_state->list);
 911                pi_state->owner = NULL;
 912
 913                raw_spin_unlock(&curr->pi_lock);
 914                raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 915                spin_unlock(&hb->lock);
 916
 917                rt_mutex_futex_unlock(&pi_state->pi_mutex);
 918                put_pi_state(pi_state);
 919
 920                raw_spin_lock_irq(&curr->pi_lock);
 921        }
 922        raw_spin_unlock_irq(&curr->pi_lock);
 923}
 924#else
 925static inline void exit_pi_state_list(struct task_struct *curr) { }
 926#endif
 927
 928/*
 929 * We need to check the following states:
 930 *
 931 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 932 *
 933 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 934 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 935 *
 936 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 937 *
 938 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 939 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 940 *
 941 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 942 *
 943 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 944 *
 945 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 946 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 947 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 948 *
 949 * [1]  Indicates that the kernel can acquire the futex atomically. We
  950 *      came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 951 *
 952 * [2]  Valid, if TID does not belong to a kernel thread. If no matching
 953 *      thread is found then it indicates that the owner TID has died.
 954 *
 955 * [3]  Invalid. The waiter is queued on a non PI futex
 956 *
 957 * [4]  Valid state after exit_robust_list(), which sets the user space
 958 *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 959 *
 960 * [5]  The user space value got manipulated between exit_robust_list()
 961 *      and exit_pi_state_list()
 962 *
 963 * [6]  Valid state after exit_pi_state_list() which sets the new owner in
 964 *      the pi_state but cannot access the user space value.
 965 *
 966 * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 967 *
 968 * [8]  Owner and user space value match
 969 *
 970 * [9]  There is no transient state which sets the user space TID to 0
 971 *      except exit_robust_list(), but this is indicated by the
 972 *      FUTEX_OWNER_DIED bit. See [4]
 973 *
 974 * [10] There is no transient state which leaves owner and user space
 975 *      TID out of sync.
 976 *
 977 *
 978 * Serialization and lifetime rules:
 979 *
 980 * hb->lock:
 981 *
 982 *      hb -> futex_q, relation
 983 *      futex_q -> pi_state, relation
 984 *
 985 *      (cannot be raw because hb can contain arbitrary amount
 986 *       of futex_q's)
 987 *
 988 * pi_mutex->wait_lock:
 989 *
 990 *      {uval, pi_state}
 991 *
 992 *      (and pi_mutex 'obviously')
 993 *
 994 * p->pi_lock:
 995 *
 996 *      p->pi_state_list -> pi_state->list, relation
 997 *
 998 * pi_state->refcount:
 999 *
1000 *      pi_state lifetime
1001 *
1002 *
1003 * Lock order:
1004 *
1005 *   hb->lock
1006 *     pi_mutex->wait_lock
1007 *       p->pi_lock
1008 *
1009 */
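/*
 * The lock nesting above in code form, as used for instance by
 * exit_pi_state_list():
 *
 *	spin_lock(&hb->lock);
 *	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 *	raw_spin_lock(&owner->pi_lock);
 *	...
 *	raw_spin_unlock(&owner->pi_lock);
 *	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 *	spin_unlock(&hb->lock);
 */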
1010
1011/*
1012 * Validate that the existing waiter has a pi_state and sanity check
1013 * the pi_state against the user space value. If correct, attach to
1014 * it.
1015 */
1016static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
1017                              struct futex_pi_state *pi_state,
1018                              struct futex_pi_state **ps)
1019{
1020        pid_t pid = uval & FUTEX_TID_MASK;
1021        u32 uval2;
1022        int ret;
1023
1024        /*
1025         * Userspace might have messed up non-PI and PI futexes [3]
1026         */
1027        if (unlikely(!pi_state))
1028                return -EINVAL;
1029
1030        /*
1031         * We get here with hb->lock held, and having found a
1032         * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
1033         * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
1034         * which in turn means that futex_lock_pi() still has a reference on
1035         * our pi_state.
1036         *
1037         * The waiter holding a reference on @pi_state also protects against
1038         * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
1039         * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
1040         * free pi_state before we can take a reference ourselves.
1041         */
1042        WARN_ON(!refcount_read(&pi_state->refcount));
1043
1044        /*
1045         * Now that we have a pi_state, we can acquire wait_lock
1046         * and do the state validation.
1047         */
1048        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1049
1050        /*
1051         * Since {uval, pi_state} is serialized by wait_lock, and our current
1052         * uval was read without holding it, it can have changed. Verify it
1053         * still is what we expect it to be, otherwise retry the entire
1054         * operation.
1055         */
1056        if (get_futex_value_locked(&uval2, uaddr))
1057                goto out_efault;
1058
1059        if (uval != uval2)
1060                goto out_eagain;
1061
1062        /*
1063         * Handle the owner died case:
1064         */
1065        if (uval & FUTEX_OWNER_DIED) {
1066                /*
1067                 * exit_pi_state_list sets owner to NULL and wakes the
1068                 * topmost waiter. The task which acquires the
1069                 * pi_state->rt_mutex will fixup owner.
1070                 */
1071                if (!pi_state->owner) {
1072                        /*
1073                         * No pi state owner, but the user space TID
1074                         * is not 0. Inconsistent state. [5]
1075                         */
1076                        if (pid)
1077                                goto out_einval;
1078                        /*
1079                         * Take a ref on the state and return success. [4]
1080                         */
1081                        goto out_attach;
1082                }
1083
1084                /*
1085                 * If TID is 0, then either the dying owner has not
1086                 * yet executed exit_pi_state_list() or some waiter
1087                 * acquired the rtmutex in the pi state, but did not
1088                 * yet fixup the TID in user space.
1089                 *
1090                 * Take a ref on the state and return success. [6]
1091                 */
1092                if (!pid)
1093                        goto out_attach;
1094        } else {
1095                /*
1096                 * If the owner died bit is not set, then the pi_state
1097                 * must have an owner. [7]
1098                 */
1099                if (!pi_state->owner)
1100                        goto out_einval;
1101        }
1102
1103        /*
1104         * Bail out if user space manipulated the futex value. If pi
1105         * state exists then the owner TID must be the same as the
1106         * user space TID. [9/10]
1107         */
1108        if (pid != task_pid_vnr(pi_state->owner))
1109                goto out_einval;
1110
1111out_attach:
1112        get_pi_state(pi_state);
1113        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1114        *ps = pi_state;
1115        return 0;
1116
1117out_einval:
1118        ret = -EINVAL;
1119        goto out_error;
1120
1121out_eagain:
1122        ret = -EAGAIN;
1123        goto out_error;
1124
1125out_efault:
1126        ret = -EFAULT;
1127        goto out_error;
1128
1129out_error:
1130        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1131        return ret;
1132}
1133
1134/**
1135 * wait_for_owner_exiting - Block until the owner has exited
1136 * @ret: owner's current futex lock status
1137 * @exiting:    Pointer to the exiting task
1138 *
1139 * Caller must hold a refcount on @exiting.
1140 */
1141static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
1142{
1143        if (ret != -EBUSY) {
1144                WARN_ON_ONCE(exiting);
1145                return;
1146        }
1147
1148        if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
1149                return;
1150
1151        mutex_lock(&exiting->futex_exit_mutex);
1152        /*
1153         * No point in doing state checking here. If the waiter got here
1154         * while the task was in exec()->exec_futex_release() then it can
1155         * have any FUTEX_STATE_* value when the waiter has acquired the
1156         * mutex. OK, if running, EXITING or DEAD if it reached exit()
1157         * already. Highly unlikely and not a problem. Just one more round
1158         * through the futex maze.
1159         */
1160        mutex_unlock(&exiting->futex_exit_mutex);
1161
1162        put_task_struct(exiting);
1163}
1164
1165static int handle_exit_race(u32 __user *uaddr, u32 uval,
1166                            struct task_struct *tsk)
1167{
1168        u32 uval2;
1169
1170        /*
1171         * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
1172         * caller that the alleged owner is busy.
1173         */
1174        if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
1175                return -EBUSY;
1176
1177        /*
1178         * Reread the user space value to handle the following situation:
1179         *
1180         * CPU0                         CPU1
1181         *
1182         * sys_exit()                   sys_futex()
1183         *  do_exit()                    futex_lock_pi()
1184         *                                futex_lock_pi_atomic()
1185         *   exit_signals(tsk)              No waiters:
1186         *    tsk->flags |= PF_EXITING;     *uaddr == 0x00000PID
1187         *  mm_release(tsk)                 Set waiter bit
1188         *   exit_robust_list(tsk) {        *uaddr = 0x80000PID;
1189         *      Set owner died              attach_to_pi_owner() {
1190         *    *uaddr = 0xC0000000;           tsk = get_task(PID);
1191         *   }                               if (!tsk->flags & PF_EXITING) {
1192         *  ...                                attach();
1193         *  tsk->futex_state =               } else {
1194         *      FUTEX_STATE_DEAD;              if (tsk->futex_state !=
1195         *                                        FUTEX_STATE_DEAD)
1196         *                                       return -EAGAIN;
1197         *                                     return -ESRCH; <--- FAIL
1198         *                                   }
1199         *
1200         * Returning ESRCH unconditionally is wrong here because the
1201         * user space value has been changed by the exiting task.
1202         *
1203         * The same logic applies to the case where the exiting task is
1204         * already gone.
1205         */
1206        if (get_futex_value_locked(&uval2, uaddr))
1207                return -EFAULT;
1208
1209        /* If the user space value has changed, try again. */
1210        if (uval2 != uval)
1211                return -EAGAIN;
1212
1213        /*
1214         * The exiting task did not have a robust list, the robust list was
1215         * corrupted or the user space value in *uaddr is simply bogus.
1216         * Give up and tell user space.
1217         */
1218        return -ESRCH;
1219}
1220
1221/*
1222 * Lookup the task for the TID provided from user space and attach to
1223 * it after doing proper sanity checks.
1224 */
1225static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
1226                              struct futex_pi_state **ps,
1227                              struct task_struct **exiting)
1228{
1229        pid_t pid = uval & FUTEX_TID_MASK;
1230        struct futex_pi_state *pi_state;
1231        struct task_struct *p;
1232
1233        /*
1234         * We are the first waiter - try to look up the real owner and attach
1235         * the new pi_state to it, but bail out when TID = 0 [1]
1236         *
1237         * The !pid check is paranoid. None of the call sites should end up
1238         * with pid == 0, but better safe than sorry. Let the caller retry
1239         */
1240        if (!pid)
1241                return -EAGAIN;
1242        p = find_get_task_by_vpid(pid);
1243        if (!p)
1244                return handle_exit_race(uaddr, uval, NULL);
1245
1246        if (unlikely(p->flags & PF_KTHREAD)) {
1247                put_task_struct(p);
1248                return -EPERM;
1249        }
1250
1251        /*
 1252         * We need to look at the task state to figure out whether the
1253         * task is exiting. To protect against the change of the task state
1254         * in futex_exit_release(), we do this protected by p->pi_lock:
1255         */
1256        raw_spin_lock_irq(&p->pi_lock);
1257        if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
1258                /*
1259                 * The task is on the way out. When the futex state is
1260                 * FUTEX_STATE_DEAD, we know that the task has finished
1261                 * the cleanup:
1262                 */
1263                int ret = handle_exit_race(uaddr, uval, p);
1264
1265                raw_spin_unlock_irq(&p->pi_lock);
1266                /*
1267                 * If the owner task is between FUTEX_STATE_EXITING and
1268                 * FUTEX_STATE_DEAD then store the task pointer and keep
1269                 * the reference on the task struct. The calling code will
1270                 * drop all locks, wait for the task to reach
1271                 * FUTEX_STATE_DEAD and then drop the refcount. This is
1272                 * required to prevent a live lock when the current task
1273                 * preempted the exiting task between the two states.
1274                 */
1275                if (ret == -EBUSY)
1276                        *exiting = p;
1277                else
1278                        put_task_struct(p);
1279                return ret;
1280        }
1281
1282        /*
1283         * No existing pi state. First waiter. [2]
1284         *
1285         * This creates pi_state, we have hb->lock held, this means nothing can
1286         * observe this state, wait_lock is irrelevant.
1287         */
1288        pi_state = alloc_pi_state();
1289
1290        /*
1291         * Initialize the pi_mutex in locked state and make @p
1292         * the owner of it:
1293         */
1294        rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
1295
1296        /* Store the key for possible exit cleanups: */
1297        pi_state->key = *key;
1298
1299        WARN_ON(!list_empty(&pi_state->list));
1300        list_add(&pi_state->list, &p->pi_state_list);
1301        /*
1302         * Assignment without holding pi_state->pi_mutex.wait_lock is safe
1303         * because there is no concurrency as the object is not published yet.
1304         */
1305        pi_state->owner = p;
1306        raw_spin_unlock_irq(&p->pi_lock);
1307
1308        put_task_struct(p);
1309
1310        *ps = pi_state;
1311
1312        return 0;
1313}
1314
1315static int lookup_pi_state(u32 __user *uaddr, u32 uval,
1316                           struct futex_hash_bucket *hb,
1317                           union futex_key *key, struct futex_pi_state **ps,
1318                           struct task_struct **exiting)
1319{
1320        struct futex_q *top_waiter = futex_top_waiter(hb, key);
1321
1322        /*
1323         * If there is a waiter on that futex, validate it and
1324         * attach to the pi_state when the validation succeeds.
1325         */
1326        if (top_waiter)
1327                return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
1328
1329        /*
1330         * We are the first waiter - try to look up the owner based on
1331         * @uval and attach to it.
1332         */
1333        return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
1334}
1335
1336static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
1337{
1338        int err;
1339        u32 uninitialized_var(curval);
1340
1341        if (unlikely(should_fail_futex(true)))
1342                return -EFAULT;
1343
1344        err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
1345        if (unlikely(err))
1346                return err;
1347
1348        /* If user space value changed, let the caller retry */
1349        return curval != uval ? -EAGAIN : 0;
1350}
1351
1352/**
1353 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
1354 * @uaddr:              the pi futex user address
1355 * @hb:                 the pi futex hash bucket
1356 * @key:                the futex key associated with uaddr and hb
1357 * @ps:                 the pi_state pointer where we store the result of the
1358 *                      lookup
1359 * @task:               the task to perform the atomic lock work for.  This will
1360 *                      be "current" except in the case of requeue pi.
1361 * @exiting:            Pointer to store the task pointer of the owner task
1362 *                      which is in the middle of exiting
1363 * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
1364 *
1365 * Return:
1366 *  -  0 - ready to wait;
1367 *  -  1 - acquired the lock;
1368 *  - <0 - error
1369 *
1370 * The hb->lock and futex_key refs shall be held by the caller.
1371 *
1372 * @exiting is only set when the return value is -EBUSY. If so, this holds
1373 * a refcount on the exiting task on return and the caller needs to drop it
1374 * after waiting for the exit to complete.
1375 */
1376static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
1377                                union futex_key *key,
1378                                struct futex_pi_state **ps,
1379                                struct task_struct *task,
1380                                struct task_struct **exiting,
1381                                int set_waiters)
1382{
1383        u32 uval, newval, vpid = task_pid_vnr(task);
1384        struct futex_q *top_waiter;
1385        int ret;
1386
1387        /*
1388         * Read the user space value first so we can validate a few
1389         * things before proceeding further.
1390         */
1391        if (get_futex_value_locked(&uval, uaddr))
1392                return -EFAULT;
1393
1394        if (unlikely(should_fail_futex(true)))
1395                return -EFAULT;
1396
1397        /*
1398         * Detect deadlocks.
1399         */
1400        if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
1401                return -EDEADLK;
1402
1403        if ((unlikely(should_fail_futex(true))))
1404                return -EDEADLK;
1405
1406        /*
1407         * Lookup existing state first. If it exists, try to attach to
1408         * its pi_state.
1409         */
1410        top_waiter = futex_top_waiter(hb, key);
1411        if (top_waiter)
1412                return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
1413
1414        /*
1415         * No waiter and user TID is 0. We are here because the
1416         * waiters or the owner died bit is set or called from
1417         * requeue_cmp_pi or for whatever reason something took the
1418         * syscall.
1419         */
1420        if (!(uval & FUTEX_TID_MASK)) {
1421                /*
1422                 * We take over the futex. No other waiters and the user space
1423                 * TID is 0. We preserve the owner died bit.
1424                 */
1425                newval = uval & FUTEX_OWNER_DIED;
1426                newval |= vpid;
1427
1428                /* The futex requeue_pi code can enforce the waiters bit */
1429                if (set_waiters)
1430                        newval |= FUTEX_WAITERS;
1431
1432                ret = lock_pi_update_atomic(uaddr, uval, newval);
1433                /* If the take over worked, return 1 */
1434                return ret < 0 ? ret : 1;
1435        }
1436
1437        /*
1438         * First waiter. Set the waiters bit before attaching ourself to
1439         * the owner. If owner tries to unlock, it will be forced into
1440         * the kernel and blocked on hb->lock.
1441         */
1442        newval = uval | FUTEX_WAITERS;
1443        ret = lock_pi_update_atomic(uaddr, uval, newval);
1444        if (ret)
1445                return ret;
1446        /*
1447         * If the update of the user space value succeeded, we try to
1448         * attach to the owner. If that fails, no harm done, we only
1449         * set the FUTEX_WAITERS bit in the user space variable.
1450         */
1451        return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
1452}
1453
1454/**
1455 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
1456 * @q:  The futex_q to unqueue
1457 *
1458 * The q->lock_ptr must not be NULL and must be held by the caller.
1459 */
1460static void __unqueue_futex(struct futex_q *q)
1461{
1462        struct futex_hash_bucket *hb;
1463
1464        if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
1465                return;
1466        lockdep_assert_held(q->lock_ptr);
1467
1468        hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
1469        plist_del(&q->list, &hb->chain);
1470        hb_waiters_dec(hb);
1471}
1472
1473/*
1474 * The hash bucket lock must be held when this is called.
1475 * Afterwards, the futex_q must not be accessed. Callers
1476 * must ensure to later call wake_up_q() for the actual
1477 * wakeups to occur.
1478 */
1479static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
1480{
1481        struct task_struct *p = q->task;
1482
1483        if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
1484                return;
1485
1486        get_task_struct(p);
1487        __unqueue_futex(q);
1488        /*
1489         * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
1490         * is written, without taking any locks. This is possible in the event
1491         * of a spurious wakeup, for example. A memory barrier is required here
1492         * to prevent the following store to lock_ptr from getting ahead of the
1493         * plist_del in __unqueue_futex().
1494         */
1495        smp_store_release(&q->lock_ptr, NULL);
1496
1497        /*
1498         * Queue the task for later wakeup, once we've released
1499         * the hb->lock.
1500         */
1501        wake_q_add_safe(wake_q, p);
1502}
1503
1504/*
1505 * Caller must hold a reference on @pi_state.
1506 */
1507static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
1508{
1509        u32 uninitialized_var(curval), newval;
1510        struct task_struct *new_owner;
1511        bool postunlock = false;
1512        DEFINE_WAKE_Q(wake_q);
1513        int ret = 0;
1514
1515        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
1516        if (WARN_ON_ONCE(!new_owner)) {
1517                /*
1518                 * As per the comment in futex_unlock_pi() this should not happen.
1519                 *
1520                 * When this happens, give up our locks and try again, giving
1521                 * the futex_lock_pi() instance time to complete, either by
1522                 * waiting on the rtmutex or removing itself from the futex
1523                 * queue.
1524                 */
1525                ret = -EAGAIN;
1526                goto out_unlock;
1527        }
1528
1529        /*
1530         * We pass it to the next owner. The WAITERS bit is always kept
1531         * enabled while there is PI state around. We clean up the owner
1532         * died bit, because we are the owner.
1533         */
1534        newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1535
1536        if (unlikely(should_fail_futex(true)))
1537                ret = -EFAULT;
1538
1539        ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
1540        if (!ret && (curval != uval)) {
1541                /*
1542                 * If an unconditional UNLOCK_PI operation (user space did not
1543                 * try the TID->0 transition) raced with a waiter setting the
1544                 * FUTEX_WAITERS flag between get_user() and locking the hash
1545                 * bucket lock, retry the operation.
1546                 */
1547                if ((FUTEX_TID_MASK & curval) == uval)
1548                        ret = -EAGAIN;
1549                else
1550                        ret = -EINVAL;
1551        }
1552
1553        if (ret)
1554                goto out_unlock;
1555
1556        /*
1557         * This is a point of no return; once we modify the uval there is no
1558         * going back and subsequent operations must not fail.
1559         */
1560
1561        raw_spin_lock(&pi_state->owner->pi_lock);
1562        WARN_ON(list_empty(&pi_state->list));
1563        list_del_init(&pi_state->list);
1564        raw_spin_unlock(&pi_state->owner->pi_lock);
1565
1566        raw_spin_lock(&new_owner->pi_lock);
1567        WARN_ON(!list_empty(&pi_state->list));
1568        list_add(&pi_state->list, &new_owner->pi_state_list);
1569        pi_state->owner = new_owner;
1570        raw_spin_unlock(&new_owner->pi_lock);
1571
1572        postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
1573
1574out_unlock:
1575        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1576
1577        if (postunlock)
1578                rt_mutex_postunlock(&wake_q);
1579
1580        return ret;
1581}
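/*
 * Editor's note (illustrative, not part of the kernel source): the user space
 * counterpart of the transition handled in wake_futex_pi() is the PI unlock
 * fast path: cmpxchg the futex word from TID to 0, and only enter the kernel
 * via FUTEX_UNLOCK_PI when that fails because FUTEX_WAITERS (or OWNER_DIED)
 * is set. A minimal sketch, assuming a raw futex word; the helper name is
 * made up:
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static void pi_unlock(unsigned int *futex_word)
 *	{
 *		unsigned int tid = (unsigned int)syscall(SYS_gettid);
 *
 *		// Fast path: uncontended, TID -> 0 entirely in user space.
 *		if (__atomic_compare_exchange_n(futex_word, &tid, 0, 0,
 *						__ATOMIC_RELEASE, __ATOMIC_RELAXED))
 *			return;
 *
 *		// Slow path: waiters exist, let the kernel hand the lock to
 *		// the top waiter (ends up in futex_unlock_pi()/wake_futex_pi()).
 *		syscall(SYS_futex, futex_word, FUTEX_UNLOCK_PI_PRIVATE,
 *			0, NULL, NULL, 0);
 *	}
 */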
1582
1583/*
1584 * Express the locking dependencies for lockdep:
1585 */
1586static inline void
1587double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1588{
1589        if (hb1 <= hb2) {
1590                spin_lock(&hb1->lock);
1591                if (hb1 < hb2)
1592                        spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
1593        } else { /* hb1 > hb2 */
1594                spin_lock(&hb2->lock);
1595                spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
1596        }
1597}
1598
1599static inline void
1600double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1601{
1602        spin_unlock(&hb1->lock);
1603        if (hb1 != hb2)
1604                spin_unlock(&hb2->lock);
1605}
1606
1607/*
1608 * Wake up waiters matching bitset queued on this futex (uaddr).
1609 */
1610static int
1611futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1612{
1613        struct futex_hash_bucket *hb;
1614        struct futex_q *this, *next;
1615        union futex_key key = FUTEX_KEY_INIT;
1616        int ret;
1617        DEFINE_WAKE_Q(wake_q);
1618
1619        if (!bitset)
1620                return -EINVAL;
1621
1622        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
1623        if (unlikely(ret != 0))
1624                goto out;
1625
1626        hb = hash_futex(&key);
1627
1628        /* Make sure we really have tasks to wakeup */
1629        if (!hb_waiters_pending(hb))
1630                goto out_put_key;
1631
1632        spin_lock(&hb->lock);
1633
1634        plist_for_each_entry_safe(this, next, &hb->chain, list) {
1635                if (match_futex(&this->key, &key)) {
1636                        if (this->pi_state || this->rt_waiter) {
1637                                ret = -EINVAL;
1638                                break;
1639                        }
1640
1641                        /* Check if one of the bits is set in both bitsets */
1642                        if (!(this->bitset & bitset))
1643                                continue;
1644
1645                        mark_wake_futex(&wake_q, this);
1646                        if (++ret >= nr_wake)
1647                                break;
1648                }
1649        }
1650
1651        spin_unlock(&hb->lock);
1652        wake_up_q(&wake_q);
1653out_put_key:
1654        put_futex_key(&key);
1655out:
1656        return ret;
1657}
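/*
 * Editor's illustration (not kernel code): futex_wake() above backs both
 * FUTEX_WAKE and FUTEX_WAKE_BITSET. A minimal user space sketch of the bitset
 * variant, assuming a plain futex word; the helper name is made up:
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static long wake_matching(unsigned int *futex_word, int nr,
 *				  unsigned int bitset)
 *	{
 *		// Wakes up to @nr waiters whose FUTEX_WAIT_BITSET mask
 *		// intersects @bitset (the this->bitset check above).
 *		return syscall(SYS_futex, futex_word, FUTEX_WAKE_BITSET_PRIVATE,
 *			       nr, NULL, NULL, bitset);
 *	}
 *
 * Plain FUTEX_WAKE behaves as if bitset were FUTEX_BITSET_MATCH_ANY.
 */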
1658
1659static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
1660{
1661        unsigned int op =         (encoded_op & 0x70000000) >> 28;
1662        unsigned int cmp =        (encoded_op & 0x0f000000) >> 24;
1663        int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
1664        int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
1665        int oldval, ret;
1666
1667        if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
1668                if (oparg < 0 || oparg > 31) {
1669                        char comm[sizeof(current->comm)];
1670                        /*
1671                         * kill this print and return -EINVAL when userspace
1672                         * is sane again
1673                         */
1674                        pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
1675                                        get_task_comm(comm, current), oparg);
1676                        oparg &= 31;
1677                }
1678                oparg = 1 << oparg;
1679        }
1680
1681        if (!access_ok(uaddr, sizeof(u32)))
1682                return -EFAULT;
1683
1684        ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
1685        if (ret)
1686                return ret;
1687
1688        switch (cmp) {
1689        case FUTEX_OP_CMP_EQ:
1690                return oldval == cmparg;
1691        case FUTEX_OP_CMP_NE:
1692                return oldval != cmparg;
1693        case FUTEX_OP_CMP_LT:
1694                return oldval < cmparg;
1695        case FUTEX_OP_CMP_GE:
1696                return oldval >= cmparg;
1697        case FUTEX_OP_CMP_LE:
1698                return oldval <= cmparg;
1699        case FUTEX_OP_CMP_GT:
1700                return oldval > cmparg;
1701        default:
1702                return -ENOSYS;
1703        }
1704}
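/*
 * Editor's note (illustrative): the encoded_op decoded above is built in user
 * space with the FUTEX_OP() macro from the uapi header. For example,
 * FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0) packs op=ADD, oparg=1,
 * cmp=GT, cmparg=0 into the 4/4/12/12-bit layout read back out above, i.e.
 * "add 1 to *uaddr2 and report whether the old value was > 0". The packing,
 * matching the shifts used in the decode:
 *
 *	#define FUTEX_OP(op, oparg, cmp, cmparg)		\
 *		(((op & 0xf) << 28) | ((cmp & 0xf) << 24) |	\
 *		 ((oparg & 0xfff) << 12) | (cmparg & 0xfff))
 */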
1705
1706/*
1707 * Wake up all waiters hashed on the physical page that is mapped
1708 * to this virtual address:
1709 */
1710static int
1711futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1712              int nr_wake, int nr_wake2, int op)
1713{
1714        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1715        struct futex_hash_bucket *hb1, *hb2;
1716        struct futex_q *this, *next;
1717        int ret, op_ret;
1718        DEFINE_WAKE_Q(wake_q);
1719
1720retry:
1721        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
1722        if (unlikely(ret != 0))
1723                goto out;
1724        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
1725        if (unlikely(ret != 0))
1726                goto out_put_key1;
1727
1728        hb1 = hash_futex(&key1);
1729        hb2 = hash_futex(&key2);
1730
1731retry_private:
1732        double_lock_hb(hb1, hb2);
1733        op_ret = futex_atomic_op_inuser(op, uaddr2);
1734        if (unlikely(op_ret < 0)) {
1735                double_unlock_hb(hb1, hb2);
1736
1737                if (!IS_ENABLED(CONFIG_MMU) ||
1738                    unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
1739                        /*
1740                         * we don't get EFAULT from MMU faults if we don't have
1741                         * an MMU, but we might get them from range checking
1742                         */
1743                        ret = op_ret;
1744                        goto out_put_keys;
1745                }
1746
1747                if (op_ret == -EFAULT) {
1748                        ret = fault_in_user_writeable(uaddr2);
1749                        if (ret)
1750                                goto out_put_keys;
1751                }
1752
1753                if (!(flags & FLAGS_SHARED)) {
1754                        cond_resched();
1755                        goto retry_private;
1756                }
1757
1758                put_futex_key(&key2);
1759                put_futex_key(&key1);
1760                cond_resched();
1761                goto retry;
1762        }
1763
1764        plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1765                if (match_futex(&this->key, &key1)) {
1766                        if (this->pi_state || this->rt_waiter) {
1767                                ret = -EINVAL;
1768                                goto out_unlock;
1769                        }
1770                        mark_wake_futex(&wake_q, this);
1771                        if (++ret >= nr_wake)
1772                                break;
1773                }
1774        }
1775
1776        if (op_ret > 0) {
1777                op_ret = 0;
1778                plist_for_each_entry_safe(this, next, &hb2->chain, list) {
1779                        if (match_futex(&this->key, &key2)) {
1780                                if (this->pi_state || this->rt_waiter) {
1781                                        ret = -EINVAL;
1782                                        goto out_unlock;
1783                                }
1784                                mark_wake_futex(&wake_q, this);
1785                                if (++op_ret >= nr_wake2)
1786                                        break;
1787                        }
1788                }
1789                ret += op_ret;
1790        }
1791
1792out_unlock:
1793        double_unlock_hb(hb1, hb2);
1794        wake_up_q(&wake_q);
1795out_put_keys:
1796        put_futex_key(&key2);
1797out_put_key1:
1798        put_futex_key(&key1);
1799out:
1800        return ret;
1801}
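/*
 * Editor's illustration (not kernel code): a user space call that exercises
 * futex_wake_op() above. It wakes up to @nr_wake waiters on @word1 and, if the
 * encoded operation on *@word2 evaluates true, up to @nr_wake2 waiters on
 * @word2 as well. Helper name and arguments are hypothetical:
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static long wake_op(unsigned int *word1, unsigned int *word2,
 *			    int nr_wake, int nr_wake2, unsigned int encoded_op)
 *	{
 *		// nr_wake2 travels in the timeout slot, cast to a pointer.
 *		return syscall(SYS_futex, word1, FUTEX_WAKE_OP_PRIVATE, nr_wake,
 *			       (void *)(unsigned long)nr_wake2, word2, encoded_op);
 *	}
 */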
1802
1803/**
1804 * requeue_futex() - Requeue a futex_q from one hb to another
1805 * @q:          the futex_q to requeue
1806 * @hb1:        the source hash_bucket
1807 * @hb2:        the target hash_bucket
1808 * @key2:       the new key for the requeued futex_q
1809 */
1810static inline
1811void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1812                   struct futex_hash_bucket *hb2, union futex_key *key2)
1813{
1814
1815        /*
1816         * If key1 and key2 hash to the same bucket, no need to
1817         * requeue.
1818         */
1819        if (likely(&hb1->chain != &hb2->chain)) {
1820                plist_del(&q->list, &hb1->chain);
1821                hb_waiters_dec(hb1);
1822                hb_waiters_inc(hb2);
1823                plist_add(&q->list, &hb2->chain);
1824                q->lock_ptr = &hb2->lock;
1825        }
1826        q->key = *key2;
1827}
1828
1829/**
1830 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1831 * @q:          the futex_q
1832 * @key:        the key of the requeue target futex
1833 * @hb:         the hash_bucket of the requeue target futex
1834 *
1835 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1836 * target futex if it is uncontended or via a lock steal.  Set the futex_q key
1837 * to the requeue target futex so the waiter can detect the wakeup on the right
1838 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1839 * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
1840 * to protect access to the pi_state to fixup the owner later.  Must be called
1841 * with both q->lock_ptr and hb->lock held.
1842 */
1843static inline
1844void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1845                           struct futex_hash_bucket *hb)
1846{
1847        q->key = *key;
1848
1849        __unqueue_futex(q);
1850
1851        WARN_ON(!q->rt_waiter);
1852        q->rt_waiter = NULL;
1853
1854        q->lock_ptr = &hb->lock;
1855
1856        wake_up_state(q->task, TASK_NORMAL);
1857}
1858
1859/**
1860 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1861 * @pifutex:            the user address of the to futex
1862 * @hb1:                the from futex hash bucket, must be locked by the caller
1863 * @hb2:                the to futex hash bucket, must be locked by the caller
1864 * @key1:               the from futex key
1865 * @key2:               the to futex key
1866 * @ps:                 address to store the pi_state pointer
1867 * @exiting:            Pointer to store the task pointer of the owner task
1868 *                      which is in the middle of exiting
1869 * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
1870 *
1871 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1872 * Wake the top waiter if we succeed.  If the caller specified set_waiters,
1873 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1874 * hb1 and hb2 must be held by the caller.
1875 *
1876 * @exiting is only set when the return value is -EBUSY. If so, this holds
1877 * a refcount on the exiting task on return and the caller needs to drop it
1878 * after waiting for the exit to complete.
1879 *
1880 * Return:
1881 *  -  0 - failed to acquire the lock atomically;
1882 *  - >0 - acquired the lock, return value is vpid of the top_waiter
1883 *  - <0 - error
1884 */
1885static int
1886futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
1887                           struct futex_hash_bucket *hb2, union futex_key *key1,
1888                           union futex_key *key2, struct futex_pi_state **ps,
1889                           struct task_struct **exiting, int set_waiters)
1890{
1891        struct futex_q *top_waiter = NULL;
1892        u32 curval;
1893        int ret, vpid;
1894
1895        if (get_futex_value_locked(&curval, pifutex))
1896                return -EFAULT;
1897
1898        if (unlikely(should_fail_futex(true)))
1899                return -EFAULT;
1900
1901        /*
1902         * Find the top_waiter and determine if there are additional waiters.
1903         * If the caller intends to requeue more than 1 waiter to pifutex,
1904         * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1905         * as we have means to handle the possible fault.  If not, don't set
1906         * the bit unnecessarily as it will force the subsequent unlock to enter
1907         * the kernel.
1908         */
1909        top_waiter = futex_top_waiter(hb1, key1);
1910
1911        /* There are no waiters, nothing for us to do. */
1912        if (!top_waiter)
1913                return 0;
1914
1915        /* Ensure we requeue to the expected futex. */
1916        if (!match_futex(top_waiter->requeue_pi_key, key2))
1917                return -EINVAL;
1918
1919        /*
1920         * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
1921         * the contended case or if set_waiters is 1.  The pi_state is returned
1922         * in ps in contended cases.
1923         */
1924        vpid = task_pid_vnr(top_waiter->task);
1925        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1926                                   exiting, set_waiters);
1927        if (ret == 1) {
1928                requeue_pi_wake_futex(top_waiter, key2, hb2);
1929                return vpid;
1930        }
1931        return ret;
1932}
1933
1934/**
1935 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1936 * @uaddr1:     source futex user address
1937 * @flags:      futex flags (FLAGS_SHARED, etc.)
1938 * @uaddr2:     target futex user address
1939 * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
1940 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1941 * @cmpval:     @uaddr1 expected value (or %NULL)
1942 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1943 *              pi futex (pi to pi requeue is not supported)
1944 *
1945 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1946 * uaddr2 atomically on behalf of the top waiter.
1947 *
1948 * Return:
1949 *  - >=0 - on success, the number of tasks requeued or woken;
1950 *  -  <0 - on error
1951 */
1952static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1953                         u32 __user *uaddr2, int nr_wake, int nr_requeue,
1954                         u32 *cmpval, int requeue_pi)
1955{
1956        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1957        int task_count = 0, ret;
1958        struct futex_pi_state *pi_state = NULL;
1959        struct futex_hash_bucket *hb1, *hb2;
1960        struct futex_q *this, *next;
1961        DEFINE_WAKE_Q(wake_q);
1962
1963        if (nr_wake < 0 || nr_requeue < 0)
1964                return -EINVAL;
1965
1966        /*
1967         * When PI is not supported: return -ENOSYS if requeue_pi is true;
1968         * consequently the compiler knows requeue_pi is always false past
1969         * this point, which lets it optimize away all the conditional code
1970         * further down.
1971         */
1972        if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
1973                return -ENOSYS;
1974
1975        if (requeue_pi) {
1976                /*
1977                 * Requeue PI only works on two distinct uaddrs. This
1978                 * check is only valid for private futexes. See below.
1979                 */
1980                if (uaddr1 == uaddr2)
1981                        return -EINVAL;
1982
1983                /*
1984                 * requeue_pi requires a pi_state, try to allocate it now
1985                 * without any locks in case it fails.
1986                 */
1987                if (refill_pi_state_cache())
1988                        return -ENOMEM;
1989                /*
1990                 * requeue_pi must wake as many tasks as it can, up to nr_wake
1991                 * + nr_requeue, since it acquires the rt_mutex prior to
1992                 * returning to userspace, so as to not leave the rt_mutex with
1993                 * waiters and no owner.  However, second and third wake-ups
1994                 * cannot be predicted as they involve race conditions with the
1995                 * first wake and a fault while looking up the pi_state.  Both
1996                 * pthread_cond_signal() and pthread_cond_broadcast() should
1997                 * use nr_wake=1.
1998                 */
1999                if (nr_wake != 1)
2000                        return -EINVAL;
2001        }
2002
2003retry:
2004        ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
2005        if (unlikely(ret != 0))
2006                goto out;
2007        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
2008                            requeue_pi ? FUTEX_WRITE : FUTEX_READ);
2009        if (unlikely(ret != 0))
2010                goto out_put_key1;
2011
2012        /*
2013         * The check above which compares uaddrs is not sufficient for
2014         * shared futexes. We need to compare the keys:
2015         */
2016        if (requeue_pi && match_futex(&key1, &key2)) {
2017                ret = -EINVAL;
2018                goto out_put_keys;
2019        }
2020
2021        hb1 = hash_futex(&key1);
2022        hb2 = hash_futex(&key2);
2023
2024retry_private:
2025        hb_waiters_inc(hb2);
2026        double_lock_hb(hb1, hb2);
2027
2028        if (likely(cmpval != NULL)) {
2029                u32 curval;
2030
2031                ret = get_futex_value_locked(&curval, uaddr1);
2032
2033                if (unlikely(ret)) {
2034                        double_unlock_hb(hb1, hb2);
2035                        hb_waiters_dec(hb2);
2036
2037                        ret = get_user(curval, uaddr1);
2038                        if (ret)
2039                                goto out_put_keys;
2040
2041                        if (!(flags & FLAGS_SHARED))
2042                                goto retry_private;
2043
2044                        put_futex_key(&key2);
2045                        put_futex_key(&key1);
2046                        goto retry;
2047                }
2048                if (curval != *cmpval) {
2049                        ret = -EAGAIN;
2050                        goto out_unlock;
2051                }
2052        }
2053
2054        if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
2055                struct task_struct *exiting = NULL;
2056
2057                /*
2058                 * Attempt to acquire uaddr2 and wake the top waiter. If we
2059                 * intend to requeue waiters, force setting the FUTEX_WAITERS
2060                 * bit.  We force this here where we are able to easily handle
2061                 * faults rather in the requeue loop below.
2062                 * faults rather than in the requeue loop below.
2063                ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
2064                                                 &key2, &pi_state,
2065                                                 &exiting, nr_requeue);
2066
2067                /*
2068                 * At this point the top_waiter has either taken uaddr2 or is
2069                 * waiting on it.  If the former, then the pi_state will not
2070                 * exist yet, look it up one more time to ensure we have a
2071                 * reference to it. If the lock was taken, ret contains the
2072                 * vpid of the top waiter task.
2073                 * If the lock was not taken, we have pi_state and an initial
2074                 * refcount on it. In case of an error we have nothing.
2075                 */
2076                if (ret > 0) {
2077                        WARN_ON(pi_state);
2078                        task_count++;
2079                        /*
2080                         * If we acquired the lock, then the user space value
2081                         * of uaddr2 should be vpid. It cannot be changed by
2082                         * the top waiter as it is blocked on hb2 lock if it
2083                         * tries to do so. If something fiddled with it behind
2084                         * our back the pi state lookup might unearth it. So
2085                         * we rather use the known value than rereading and
2086                         * handing potential crap to lookup_pi_state.
2087                         *
2088                         * If that call succeeds then we have pi_state and an
2089                         * initial refcount on it.
2090                         */
2091                        ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
2092                                              &pi_state, &exiting);
2093                }
2094
2095                switch (ret) {
2096                case 0:
2097                        /* We hold a reference on the pi state. */
2098                        break;
2099
2100                        /* If the above failed, then pi_state is NULL */
2101                case -EFAULT:
2102                        double_unlock_hb(hb1, hb2);
2103                        hb_waiters_dec(hb2);
2104                        put_futex_key(&key2);
2105                        put_futex_key(&key1);
2106                        ret = fault_in_user_writeable(uaddr2);
2107                        if (!ret)
2108                                goto retry;
2109                        goto out;
2110                case -EBUSY:
2111                case -EAGAIN:
2112                        /*
2113                         * Two reasons for this:
2114                         * - EBUSY: Owner is exiting and we just wait for the
2115                         *   exit to complete.
2116                         * - EAGAIN: The user space value changed.
2117                         */
2118                        double_unlock_hb(hb1, hb2);
2119                        hb_waiters_dec(hb2);
2120                        put_futex_key(&key2);
2121                        put_futex_key(&key1);
2122                        /*
2123                         * Handle the case where the owner is in the middle of
2124                         * exiting. Wait for the exit to complete otherwise
2125                         * this task might loop forever, aka. live lock.
2126                         */
2127                        wait_for_owner_exiting(ret, exiting);
2128                        cond_resched();
2129                        goto retry;
2130                default:
2131                        goto out_unlock;
2132                }
2133        }
2134
2135        plist_for_each_entry_safe(this, next, &hb1->chain, list) {
2136                if (task_count - nr_wake >= nr_requeue)
2137                        break;
2138
2139                if (!match_futex(&this->key, &key1))
2140                        continue;
2141
2142                /*
2143                 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
2144                 * be paired with each other and no other futex ops.
2145                 *
2146                 * We should never be requeueing a futex_q with a pi_state,
2147                 * which is awaiting a futex_unlock_pi().
2148                 */
2149                if ((requeue_pi && !this->rt_waiter) ||
2150                    (!requeue_pi && this->rt_waiter) ||
2151                    this->pi_state) {
2152                        ret = -EINVAL;
2153                        break;
2154                }
2155
2156                /*
2157                 * Wake nr_wake waiters.  For requeue_pi, if we acquired the
2158                 * lock, we already woke the top_waiter.  If not, it will be
2159                 * woken by futex_unlock_pi().
2160                 */
2161                if (++task_count <= nr_wake && !requeue_pi) {
2162                        mark_wake_futex(&wake_q, this);
2163                        continue;
2164                }
2165
2166                /* Ensure we requeue to the expected futex for requeue_pi. */
2167                if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
2168                        ret = -EINVAL;
2169                        break;
2170                }
2171
2172                /*
2173                 * Requeue nr_requeue waiters and possibly one more in the case
2174                 * of requeue_pi if we couldn't acquire the lock atomically.
2175                 */
2176                if (requeue_pi) {
2177                        /*
2178                         * Prepare the waiter to take the rt_mutex. Take a
2179                         * refcount on the pi_state and store the pointer in
2180                         * the futex_q object of the waiter.
2181                         */
2182                        get_pi_state(pi_state);
2183                        this->pi_state = pi_state;
2184                        ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
2185                                                        this->rt_waiter,
2186                                                        this->task);
2187                        if (ret == 1) {
2188                                /*
2189                                 * We got the lock. We do neither drop the
2190                                 * refcount on pi_state nor clear
2191                                 * this->pi_state because the waiter needs the
2192                                 * pi_state for cleaning up the user space
2193                                 * value. It will drop the refcount after
2194                                 * doing so.
2195                                 */
2196                                requeue_pi_wake_futex(this, &key2, hb2);
2197                                continue;
2198                        } else if (ret) {
2199                                /*
2200                                 * rt_mutex_start_proxy_lock() detected a
2201                                 * potential deadlock when we tried to queue
2202                                 * that waiter. Drop the pi_state reference
2203                                 * which we took above and remove the pointer
2204                                 * to the state from the waiters futex_q
2205                                 * object.
2206                                 */
2207                                this->pi_state = NULL;
2208                                put_pi_state(pi_state);
2209                                /*
2210                                 * We stop queueing more waiters and let user
2211                                 * space deal with the mess.
2212                                 */
2213                                break;
2214                        }
2215                }
2216                requeue_futex(this, hb1, hb2, &key2);
2217        }
2218
2219        /*
2220         * We took an extra initial reference to the pi_state either
2221         * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
2222         * need to drop it here again.
2223         */
2224        put_pi_state(pi_state);
2225
2226out_unlock:
2227        double_unlock_hb(hb1, hb2);
2228        wake_up_q(&wake_q);
2229        hb_waiters_dec(hb2);
2230
2231out_put_keys:
2232        put_futex_key(&key2);
2233out_put_key1:
2234        put_futex_key(&key1);
2235out:
2236        return ret ? ret : task_count;
2237}
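/*
 * Editor's illustration (not kernel code): the non-PI path of futex_requeue()
 * above backs FUTEX_CMP_REQUEUE. A hedged user space sketch, with a made-up
 * helper name, that wakes one waiter on @from and moves up to @nr_requeue of
 * the remaining waiters onto @to, provided *@from still equals @expected
 * (otherwise the call fails with EAGAIN, matching the cmpval check above):
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static long requeue_waiters(unsigned int *from, unsigned int *to,
 *				    int nr_requeue, unsigned int expected)
 *	{
 *		// val2 (nr_requeue) is passed through the timeout slot.
 *		return syscall(SYS_futex, from, FUTEX_CMP_REQUEUE_PRIVATE, 1,
 *			       (void *)(unsigned long)nr_requeue, to, expected);
 *	}
 */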
2238
2239/* The key must be already stored in q->key. */
2240static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
2241        __acquires(&hb->lock)
2242{
2243        struct futex_hash_bucket *hb;
2244
2245        hb = hash_futex(&q->key);
2246
2247        /*
2248         * Increment the counter before taking the lock so that
2249         * a potential waker won't miss a task that is about to sleep
2250         * while waiting for the spinlock. This is safe as all queue_lock()
2251         * users end up calling queue_me(). Similarly, for housekeeping,
2252         * decrement the counter at queue_unlock() when some error has
2253         * occurred and we don't end up adding the task to the list.
2254         */
2255        hb_waiters_inc(hb); /* implies smp_mb(); (A) */
2256
2257        q->lock_ptr = &hb->lock;
2258
2259        spin_lock(&hb->lock);
2260        return hb;
2261}
2262
2263static inline void
2264queue_unlock(struct futex_hash_bucket *hb)
2265        __releases(&hb->lock)
2266{
2267        spin_unlock(&hb->lock);
2268        hb_waiters_dec(hb);
2269}
2270
2271static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2272{
2273        int prio;
2274
2275        /*
2276         * The priority used to register this element is
2277         * - either the real thread-priority for the real-time threads
2278         * (i.e. threads with a priority lower than MAX_RT_PRIO)
2279         * - or MAX_RT_PRIO for non-RT threads.
2280         * Thus, all RT-threads are woken first in priority order, and
2281         * the others are woken last, in FIFO order.
2282         */
2283        prio = min(current->normal_prio, MAX_RT_PRIO);
2284
2285        plist_node_init(&q->list, prio);
2286        plist_add(&q->list, &hb->chain);
2287        q->task = current;
2288}
2289
2290/**
2291 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
2292 * @q:  The futex_q to enqueue
2293 * @hb: The destination hash bucket
2294 *
2295 * The hb->lock must be held by the caller, and is released here. A call to
2296 * queue_me() is typically paired with exactly one call to unqueue_me().  The
2297 * exceptions involve the PI related operations, which may use unqueue_me_pi()
2298 * or nothing if the unqueue is done as part of the wake process and the unqueue
2299 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
2300 * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for
2301 */
2302static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
2303        __releases(&hb->lock)
2304{
2305        __queue_me(q, hb);
2306        spin_unlock(&hb->lock);
2307}
2308
2309/**
2310 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
2311 * @q:  The futex_q to unqueue
2312 *
2313 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
2314 * be paired with exactly one earlier call to queue_me().
2315 *
2316 * Return:
2317 *  - 1 - if the futex_q was still queued (and we unqueued it);
2318 *  - 0 - if the futex_q was already removed by the waking thread
2319 */
2320static int unqueue_me(struct futex_q *q)
2321{
2322        spinlock_t *lock_ptr;
2323        int ret = 0;
2324
2325        /* In the common case we don't take the spinlock, which is nice. */
2326retry:
2327        /*
2328         * q->lock_ptr can change between this read and the following spin_lock.
2329         * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
2330         * optimizing lock_ptr out of the logic below.
2331         */
2332        lock_ptr = READ_ONCE(q->lock_ptr);
2333        if (lock_ptr != NULL) {
2334                spin_lock(lock_ptr);
2335                /*
2336                 * q->lock_ptr can change between reading it and
2337                 * spin_lock(), causing us to take the wrong lock.  This
2338                 * corrects the race condition.
2339                 *
2340                 * Reasoning goes like this: if we have the wrong lock,
2341                 * q->lock_ptr must have changed (maybe several times)
2342                 * between reading it and the spin_lock().  It can
2343                 * change again after the spin_lock() but only if it was
2344                 * already changed before the spin_lock().  It cannot,
2345                 * however, change back to the original value.  Therefore
2346                 * we can detect whether we acquired the correct lock.
2347                 */
2348                if (unlikely(lock_ptr != q->lock_ptr)) {
2349                        spin_unlock(lock_ptr);
2350                        goto retry;
2351                }
2352                __unqueue_futex(q);
2353
2354                BUG_ON(q->pi_state);
2355
2356                spin_unlock(lock_ptr);
2357                ret = 1;
2358        }
2359
2360        return ret;
2361}
2362
2363/*
2364 * PI futexes cannot be requeued and must remove themselves from the
2365 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
2366 * and dropped here.
2367 */
2368static void unqueue_me_pi(struct futex_q *q)
2369        __releases(q->lock_ptr)
2370{
2371        __unqueue_futex(q);
2372
2373        BUG_ON(!q->pi_state);
2374        put_pi_state(q->pi_state);
2375        q->pi_state = NULL;
2376
2377        spin_unlock(q->lock_ptr);
2378}
2379
2380static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2381                                struct task_struct *argowner)
2382{
2383        struct futex_pi_state *pi_state = q->pi_state;
2384        u32 uval, uninitialized_var(curval), newval;
2385        struct task_struct *oldowner, *newowner;
2386        u32 newtid;
2387        int ret, err = 0;
2388
2389        lockdep_assert_held(q->lock_ptr);
2390
2391        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2392
2393        oldowner = pi_state->owner;
2394
2395        /*
2396         * We are here because either:
2397         *
2398         *  - we stole the lock and pi_state->owner needs updating to reflect
2399         *    that (@argowner == current),
2400         *
2401         * or:
2402         *
2403         *  - someone stole our lock and we need to fix things to point to the
2404         *    new owner (@argowner == NULL).
2405         *
2406         * Either way, we have to replace the TID in the user space variable.
2407         * This must be atomic as we have to preserve the owner died bit here.
2408         *
2409         * Note: We write the user space value _before_ changing the pi_state
2410         * because we can fault here. Imagine swapped out pages or a fork
2411         * that marked all the anonymous memory readonly for cow.
2412         *
2413         * Modifying pi_state _before_ the user space value would leave the
2414         * pi_state in an inconsistent state when we fault here, because we
2415         * need to drop the locks to handle the fault. This might be observed
2416         * in the PID check in lookup_pi_state.
2417         */
2418retry:
2419        if (!argowner) {
2420                if (oldowner != current) {
2421                        /*
2422                         * We raced against a concurrent self; things are
2423                         * already fixed up. Nothing to do.
2424                         */
2425                        ret = 0;
2426                        goto out_unlock;
2427                }
2428
2429                if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
2430                        /* We got the lock after all, nothing to fix. */
2431                        ret = 0;
2432                        goto out_unlock;
2433                }
2434
2435                /*
2436                 * Since we just failed the trylock, there must be an owner.
2437                 */
2438                newowner = rt_mutex_owner(&pi_state->pi_mutex);
2439                BUG_ON(!newowner);
2440        } else {
2441                WARN_ON_ONCE(argowner != current);
2442                if (oldowner == current) {
2443                        /*
2444                         * We raced against a concurrent self; things are
2445                         * already fixed up. Nothing to do.
2446                         */
2447                        ret = 0;
2448                        goto out_unlock;
2449                }
2450                newowner = argowner;
2451        }
2452
2453        newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2454        /* Owner died? */
2455        if (!pi_state->owner)
2456                newtid |= FUTEX_OWNER_DIED;
2457
2458        err = get_futex_value_locked(&uval, uaddr);
2459        if (err)
2460                goto handle_err;
2461
2462        for (;;) {
2463                newval = (uval & FUTEX_OWNER_DIED) | newtid;
2464
2465                err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
2466                if (err)
2467                        goto handle_err;
2468
2469                if (curval == uval)
2470                        break;
2471                uval = curval;
2472        }
2473
2474        /*
2475         * We fixed up user space. Now we need to fix the pi_state
2476         * itself.
2477         */
2478        if (pi_state->owner != NULL) {
2479                raw_spin_lock(&pi_state->owner->pi_lock);
2480                WARN_ON(list_empty(&pi_state->list));
2481                list_del_init(&pi_state->list);
2482                raw_spin_unlock(&pi_state->owner->pi_lock);
2483        }
2484
2485        pi_state->owner = newowner;
2486
2487        raw_spin_lock(&newowner->pi_lock);
2488        WARN_ON(!list_empty(&pi_state->list));
2489        list_add(&pi_state->list, &newowner->pi_state_list);
2490        raw_spin_unlock(&newowner->pi_lock);
2491        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2492
2493        return 0;
2494
2495        /*
2496         * In order to reschedule or handle a page fault, we need to drop the
2497         * locks here. In the case of a fault, this gives the other task
2498         * (either the highest priority waiter itself or the task which stole
2499         * the rtmutex) the chance to try the fixup of the pi_state. So once we
2500         * are back from handling the fault we need to check the pi_state after
2501         * reacquiring the locks and before trying to do another fixup. When
2502         * the fixup has been done already we simply return.
2503         *
2504         * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
2505         * drop hb->lock since the caller owns the hb -> futex_q relation.
2506         * Dropping the pi_mutex->wait_lock requires re-validating the state.
2507         */
2508handle_err:
2509        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2510        spin_unlock(q->lock_ptr);
2511
2512        switch (err) {
2513        case -EFAULT:
2514                ret = fault_in_user_writeable(uaddr);
2515                break;
2516
2517        case -EAGAIN:
2518                cond_resched();
2519                ret = 0;
2520                break;
2521
2522        default:
2523                WARN_ON_ONCE(1);
2524                ret = err;
2525                break;
2526        }
2527
2528        spin_lock(q->lock_ptr);
2529        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2530
2531        /*
2532         * Check if someone else fixed it for us:
2533         */
2534        if (pi_state->owner != oldowner) {
2535                ret = 0;
2536                goto out_unlock;
2537        }
2538
2539        if (ret)
2540                goto out_unlock;
2541
2542        goto retry;
2543
2544out_unlock:
2545        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2546        return ret;
2547}
2548
2549static long futex_wait_restart(struct restart_block *restart);
2550
2551/**
2552 * fixup_owner() - Post lock pi_state and corner case management
2553 * @uaddr:      user address of the futex
2554 * @q:          futex_q (contains pi_state and access to the rt_mutex)
2555 * @locked:     if the attempt to take the rt_mutex succeeded (1) or not (0)
2556 *
2557 * After attempting to lock an rt_mutex, this function is called to cleanup
2558 * the pi_state owner as well as handle race conditions that may allow us to
2559 * acquire the lock. Must be called with the hb lock held.
2560 *
2561 * Return:
2562 *  -  1 - success, lock taken;
2563 *  -  0 - success, lock not taken;
2564 *  - <0 - on error (-EFAULT)
2565 */
2566static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2567{
2568        int ret = 0;
2569
2570        if (locked) {
2571                /*
2572                 * Got the lock. We might not be the anticipated owner if we
2573                 * did a lock-steal - fix up the PI-state in that case:
2574                 *
2575                 * Speculative pi_state->owner read (we don't hold wait_lock);
2576                 * since we own the lock pi_state->owner == current is the
2577                 * stable state, anything else needs more attention.
2578                 */
2579                if (q->pi_state->owner != current)
2580                        ret = fixup_pi_state_owner(uaddr, q, current);
2581                goto out;
2582        }
2583
2584        /*
2585         * If we didn't get the lock; check if anybody stole it from us. In
2586         * that case, we need to fix up the uval to point to them instead of
2587         * us, otherwise bad things happen. [10]
2588         *
2589         * Another speculative read; pi_state->owner == current is unstable
2590         * but needs our attention.
2591         */
2592        if (q->pi_state->owner == current) {
2593                ret = fixup_pi_state_owner(uaddr, q, NULL);
2594                goto out;
2595        }
2596
2597        /*
2598         * Paranoia check. If we did not take the lock, then we should not be
2599         * the owner of the rt_mutex.
2600         */
2601        if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
2602                printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
2603                                "pi-state %p\n", ret,
2604                                q->pi_state->pi_mutex.owner,
2605                                q->pi_state->owner);
2606        }
2607
2608out:
2609        return ret ? ret : locked;
2610}
2611
2612/**
2613 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
2614 * @hb:         the futex hash bucket, must be locked by the caller
2615 * @q:          the futex_q to queue up on
2616 * @timeout:    the prepared hrtimer_sleeper, or null for no timeout
2617 */
2618static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
2619                                struct hrtimer_sleeper *timeout)
2620{
2621        /*
2622         * The task state is guaranteed to be set before another task can
2623         * wake it. set_current_state() is implemented using smp_store_mb() and
2624         * queue_me() calls spin_unlock() upon completion, both serializing
2625         * access to the hash list and forcing another memory barrier.
2626         */
2627        set_current_state(TASK_INTERRUPTIBLE);
2628        queue_me(q, hb);
2629
2630        /* Arm the timer */
2631        if (timeout)
2632                hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
2633
2634        /*
2635         * If we have been removed from the hash list, then another task
2636         * has tried to wake us, and we can skip the call to schedule().
2637         */
2638        if (likely(!plist_node_empty(&q->list))) {
2639                /*
2640                 * If the timer has already expired, current will already be
2641                 * flagged for rescheduling. Only call schedule if there
2642                 * is no timeout, or if it has yet to expire.
2643                 */
2644                if (!timeout || timeout->task)
2645                        freezable_schedule();
2646        }
2647        __set_current_state(TASK_RUNNING);
2648}
2649
2650/**
2651 * futex_wait_setup() - Prepare to wait on a futex
2652 * @uaddr:      the futex userspace address
2653 * @val:        the expected value
2654 * @flags:      futex flags (FLAGS_SHARED, etc.)
2655 * @q:          the associated futex_q
2656 * @hb:         storage for hash_bucket pointer to be returned to caller
2657 *
2658 * Setup the futex_q and locate the hash_bucket.  Get the futex value and
2659 * compare it with the expected value.  Handle atomic faults internally.
2660 * Return with the hb lock held and a q.key reference on success, and unlocked
2661 * with no q.key reference on failure.
2662 *
2663 * Return:
2664 *  -  0 - uaddr contains val and hb has been locked;
2665 *  - <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
2666 */
2667static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
2668                           struct futex_q *q, struct futex_hash_bucket **hb)
2669{
2670        u32 uval;
2671        int ret;
2672
2673        /*
2674         * Access the page AFTER the hash-bucket is locked.
2675         * Order is important:
2676         *
2677         *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
2678         *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
2679         *
2680         * The basic logical guarantee of a futex is that it blocks ONLY
2681         * if cond(var) is known to be true at the time of blocking, for
2682         * any cond.  If we locked the hash-bucket after testing *uaddr, that
2683         * would open a race condition where we could block indefinitely with
2684         * cond(var) false, which would violate the guarantee.
2685         *
2686         * On the other hand, we insert q and release the hash-bucket only
2687         * after testing *uaddr.  This guarantees that futex_wait() will NOT
2688         * absorb a wakeup if *uaddr does not match the desired values
2689         * while the syscall executes.
2690         */
2691retry:
2692        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
2693        if (unlikely(ret != 0))
2694                return ret;
2695
2696retry_private:
2697        *hb = queue_lock(q);
2698
2699        ret = get_futex_value_locked(&uval, uaddr);
2700
2701        if (ret) {
2702                queue_unlock(*hb);
2703
2704                ret = get_user(uval, uaddr);
2705                if (ret)
2706                        goto out;
2707
2708                if (!(flags & FLAGS_SHARED))
2709                        goto retry_private;
2710
2711                put_futex_key(&q->key);
2712                goto retry;
2713        }
2714
2715        if (uval != val) {
2716                queue_unlock(*hb);
2717                ret = -EWOULDBLOCK;
2718        }
2719
2720out:
2721        if (ret)
2722                put_futex_key(&q->key);
2723        return ret;
2724}
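/*
 * Editor's illustration (not kernel code): the user space side of the
 * waiter/waker ordering described in futex_wait_setup(). A minimal sketch of a
 * wait loop, assuming @word is a shared 32-bit variable; the helper name is
 * made up. FUTEX_WAIT only blocks while *word still equals @val, which is
 * exactly the re-check done under the hash bucket lock above:
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <errno.h>
 *
 *	static void wait_while_equal(unsigned int *word, unsigned int val)
 *	{
 *		while (__atomic_load_n(word, __ATOMIC_ACQUIRE) == val) {
 *			long r = syscall(SYS_futex, word, FUTEX_WAIT_PRIVATE,
 *					 val, NULL, NULL, 0);
 *			if (r == -1 && errno != EAGAIN && errno != EINTR)
 *				break;	// unexpected error, give up
 *		}
 *	}
 */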
2725
2726static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2727                      ktime_t *abs_time, u32 bitset)
2728{
2729        struct hrtimer_sleeper timeout, *to;
2730        struct restart_block *restart;
2731        struct futex_hash_bucket *hb;
2732        struct futex_q q = futex_q_init;
2733        int ret;
2734
2735        if (!bitset)
2736                return -EINVAL;
2737        q.bitset = bitset;
2738
2739        to = futex_setup_timer(abs_time, &timeout, flags,
2740                               current->timer_slack_ns);
2741retry:
2742        /*
2743         * Prepare to wait on uaddr. On success, holds hb lock and increments
2744         * q.key refs.
2745         */
2746        ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2747        if (ret)
2748                goto out;
2749
2750        /* queue_me and wait for wakeup, timeout, or a signal. */
2751        futex_wait_queue_me(hb, &q, to);
2752
2753        /* If we were woken (and unqueued), we succeeded, whatever. */
2754        ret = 0;
2755        /* unqueue_me() drops q.key ref */
2756        if (!unqueue_me(&q))
2757                goto out;
2758        ret = -ETIMEDOUT;
2759        if (to && !to->task)
2760                goto out;
2761
2762        /*
2763         * We expect signal_pending(current), but we might be the
2764         * victim of a spurious wakeup as well.
2765         */
2766        if (!signal_pending(current))
2767                goto retry;
2768
2769        ret = -ERESTARTSYS;
2770        if (!abs_time)
2771                goto out;
2772
2773        restart = &current->restart_block;
2774        restart->fn = futex_wait_restart;
2775        restart->futex.uaddr = uaddr;
2776        restart->futex.val = val;
2777        restart->futex.time = *abs_time;
2778        restart->futex.bitset = bitset;
2779        restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
2780
2781        ret = -ERESTART_RESTARTBLOCK;
2782
2783out:
2784        if (to) {
2785                hrtimer_cancel(&to->timer);
2786                destroy_hrtimer_on_stack(&to->timer);
2787        }
2788        return ret;
2789}
2790
2791
2792static long futex_wait_restart(struct restart_block *restart)
2793{
2794        u32 __user *uaddr = restart->futex.uaddr;
2795        ktime_t t, *tp = NULL;
2796
2797        if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
2798                t = restart->futex.time;
2799                tp = &t;
2800        }
2801        restart->fn = do_no_restart_syscall;
2802
2803        return (long)futex_wait(uaddr, restart->futex.flags,
2804                                restart->futex.val, tp, restart->futex.bitset);
2805}
2806
2807
2808/*
2809 * Userspace tried a 0 -> TID atomic transition of the futex value
2810 * and failed. The kernel side here does the whole locking operation:
2811 * if there are waiters then it will block as a consequence of relying
2812 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
2813 * a 0 value of the futex too.)
2814 *
2815 * This also serves as the futex trylock_pi() operation, with corresponding semantics.
2816 */
2817static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2818                         ktime_t *time, int trylock)
2819{
2820        struct hrtimer_sleeper timeout, *to;
2821        struct futex_pi_state *pi_state = NULL;
2822        struct task_struct *exiting = NULL;
2823        struct rt_mutex_waiter rt_waiter;
2824        struct futex_hash_bucket *hb;
2825        struct futex_q q = futex_q_init;
2826        int res, ret;
2827
2828        if (!IS_ENABLED(CONFIG_FUTEX_PI))
2829                return -ENOSYS;
2830
2831        if (refill_pi_state_cache())
2832                return -ENOMEM;
2833
2834        to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);
2835
2836retry:
2837        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
2838        if (unlikely(ret != 0))
2839                goto out;
2840
2841retry_private:
2842        hb = queue_lock(&q);
2843
2844        ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
2845                                   &exiting, 0);
2846        if (unlikely(ret)) {
2847                /*
2848                 * Atomic work succeeded and we got the lock,
2849                 * or failed. Either way, we do _not_ block.
2850                 */
2851                switch (ret) {
2852                case 1:
2853                        /* We got the lock. */
2854                        ret = 0;
2855                        goto out_unlock_put_key;
2856                case -EFAULT:
2857                        goto uaddr_faulted;
2858                case -EBUSY:
2859                case -EAGAIN:
2860                        /*
2861                         * Two reasons for this:
2862                         * - EBUSY: Task is exiting and we just wait for the
2863                         *   exit to complete.
2864                         * - EAGAIN: The user space value changed.
2865                         */
2866                        queue_unlock(hb);
2867                        put_futex_key(&q.key);
2868                        /*
2869                         * Handle the case where the owner is in the middle of
2870                         * exiting. Wait for the exit to complete otherwise
2871                         * this task might loop forever, aka. live lock.
2872                         */
2873                        wait_for_owner_exiting(ret, exiting);
2874                        cond_resched();
2875                        goto retry;
2876                default:
2877                        goto out_unlock_put_key;
2878                }
2879        }
2880
2881        WARN_ON(!q.pi_state);
2882
2883        /*
2884         * Only actually queue now that the atomic ops are done:
2885         */
2886        __queue_me(&q, hb);
2887
2888        if (trylock) {
2889                ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
2890                /* Fixup the trylock return value: */
2891                ret = ret ? 0 : -EWOULDBLOCK;
2892                goto no_block;
2893        }
2894
2895        rt_mutex_init_waiter(&rt_waiter);
2896
2897        /*
2898         * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
2899         * hold it while doing rt_mutex_start_proxy(), because then it will
2900         * include hb->lock in the blocking chain, even though we'll not in
2901         * fact hold it while blocking. This will lead it to report -EDEADLK
2902         * and BUG when futex_unlock_pi() interleaves with this.
2903         *
2904         * Therefore acquire wait_lock while holding hb->lock, but drop the
2905         * latter before calling __rt_mutex_start_proxy_lock(). This
2906         * interleaves with futex_unlock_pi() -- which does a similar lock
2907         * handoff -- such that the latter can observe the futex_q::pi_state
2908         * before __rt_mutex_start_proxy_lock() is done.
2909         */
2910        raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
2911        spin_unlock(q.lock_ptr);
2912        /*
2913         * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
2914         * such that futex_unlock_pi() is guaranteed to observe the waiter when
2915         * it sees the futex_q::pi_state.
2916         */
2917        ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
2918        raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
2919
2920        if (ret) {
2921                if (ret == 1)
2922                        ret = 0;
2923                goto cleanup;
2924        }
2925
2926        if (unlikely(to))
2927                hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
2928
2929        ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
2930
2931cleanup:
2932        spin_lock(q.lock_ptr);
2933        /*
2934         * If we failed to acquire the lock (deadlock/signal/timeout), we must
2935         * first acquire the hb->lock before removing the lock from the
2936         * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
2937         * lists consistent.
2938         *
2939         * In particular, it is important that futex_unlock_pi() cannot
2940         * observe this inconsistency.
2941         */
2942        if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
2943                ret = 0;
2944
2945no_block:
2946        /*
2947         * Fixup the pi_state owner and possibly acquire the lock if we
2948         * haven't already.
2949         */
2950        res = fixup_owner(uaddr, &q, !ret);
2951        /*
2952         * If fixup_owner() returned an error, propagate that.  If it acquired
2953         * the lock, clear our -ETIMEDOUT or -EINTR.
2954         */
2955        if (res)
2956                ret = (res < 0) ? res : 0;
2957
2958        /*
2959         * If fixup_owner() faulted and was unable to handle the fault, unlock
2960         * it and return the fault to userspace.
2961         */
2962        if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
2963                pi_state = q.pi_state;
2964                get_pi_state(pi_state);
2965        }
2966
2967        /* Unqueue and drop the lock */
2968        unqueue_me_pi(&q);
2969
2970        if (pi_state) {
2971                rt_mutex_futex_unlock(&pi_state->pi_mutex);
2972                put_pi_state(pi_state);
2973        }
2974
2975        goto out_put_key;
2976
2977out_unlock_put_key:
2978        queue_unlock(hb);
2979
2980out_put_key:
2981        put_futex_key(&q.key);
2982out:
2983        if (to) {
2984                hrtimer_cancel(&to->timer);
2985                destroy_hrtimer_on_stack(&to->timer);
2986        }
2987        return ret != -EINTR ? ret : -ERESTARTNOINTR;
2988
2989uaddr_faulted:
2990        queue_unlock(hb);
2991
2992        ret = fault_in_user_writeable(uaddr);
2993        if (ret)
2994                goto out_put_key;
2995
2996        if (!(flags & FLAGS_SHARED))
2997                goto retry_private;
2998
2999        put_futex_key(&q.key);
3000        goto retry;
3001}
3002
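/*
 * An illustrative userspace sketch, not kernel code: the fast paths that
 * futex_lock_pi() above and futex_unlock_pi() below back up. "tid" is the
 * caller's thread id and cmpxchg() stands for any atomic compare-and-exchange
 * on the futex word returning the old value; both are stand-in names used
 * only for this example.
 *
 *	// lock fast path: uncontended 0 -> TID transition, no syscall needed
 *	if (cmpxchg(uaddr, 0, tid) == 0)
 *		return 0;
 *	return syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, timeout, NULL, 0);
 *
 *	// unlock fast path: TID -> 0 succeeds only while no waiters are
 *	// recorded (FUTEX_WAITERS clear); otherwise ask the kernel to hand
 *	// the lock to the top waiter.
 *	if (cmpxchg(uaddr, tid, 0) == tid)
 *		return 0;
 *	return syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 */
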
3003/*
3004 * Userspace attempted a TID -> 0 atomic transition, and failed.
3005 * This is the in-kernel slowpath: we look up the PI state (if any),
3006 * and do the rt-mutex unlock.
3007 */
3008static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
3009{
3010        u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
3011        union futex_key key = FUTEX_KEY_INIT;
3012        struct futex_hash_bucket *hb;
3013        struct futex_q *top_waiter;
3014        int ret;
3015
3016        if (!IS_ENABLED(CONFIG_FUTEX_PI))
3017                return -ENOSYS;
3018
3019retry:
3020        if (get_user(uval, uaddr))
3021                return -EFAULT;
3022        /*
3023         * We release only a lock we actually own:
3024         */
3025        if ((uval & FUTEX_TID_MASK) != vpid)
3026                return -EPERM;
3027
3028        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
3029        if (ret)
3030                return ret;
3031
3032        hb = hash_futex(&key);
3033        spin_lock(&hb->lock);
3034
3035        /*
3036         * Check waiters first. We do not trust user space values at
3037         * all and we at least want to know if user space fiddled
3038         * with the futex value instead of blindly unlocking.
3039         */
3040        top_waiter = futex_top_waiter(hb, &key);
3041        if (top_waiter) {
3042                struct futex_pi_state *pi_state = top_waiter->pi_state;
3043
3044                ret = -EINVAL;
3045                if (!pi_state)
3046                        goto out_unlock;
3047
3048                /*
3049                 * If current does not own the pi_state then the futex is
3050                 * inconsistent and user space fiddled with the futex value.
3051                 */
3052                if (pi_state->owner != current)
3053                        goto out_unlock;
3054
3055                get_pi_state(pi_state);
3056                /*
3057                 * By taking wait_lock while still holding hb->lock, we ensure
3058                 * there is no point where we hold neither; and therefore
3059                 * wake_futex_pi() must observe a state consistent with what we
3060                 * observed.
3061                 *
3062                 * In particular, this forces __rt_mutex_start_proxy_lock() to
3063                 * complete such that we're guaranteed to observe the
3064                 * rt_waiter. Also see the WARN in wake_futex_pi().
3065                 */
3066                raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
3067                spin_unlock(&hb->lock);
3068
3069                /* drops pi_state->pi_mutex.wait_lock */
3070                ret = wake_futex_pi(uaddr, uval, pi_state);
3071
3072                put_pi_state(pi_state);
3073
3074                /*
3075                 * Success, we're done! No tricky corner cases.
3076                 */
3077                if (!ret)
3078                        goto out_putkey;
3079                /*
3080                 * The atomic access to the futex value generated a
3081                 * pagefault, so retry the user-access and the wakeup:
3082                 */
3083                if (ret == -EFAULT)
3084                        goto pi_faulted;
3085                /*
3086                 * An unconditional UNLOCK_PI op raced against a waiter
3087                 * setting the FUTEX_WAITERS bit. Try again.
3088                 */
3089                if (ret == -EAGAIN)
3090                        goto pi_retry;
3091                /*
3092                 * wake_futex_pi has detected invalid state. Tell user
3093                 * space.
3094                 */
3095                goto out_putkey;
3096        }
3097
3098        /*
3099         * We have no kernel internal state, i.e. no waiters in the
3100         * kernel. Waiters which are about to queue themselves are stuck
3101         * on hb->lock. So we can safely ignore them. We preserve neither
3102         * the WAITERS bit nor the OWNER_DIED one. We are the
3103         * owner.
3104         */
3105        if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
3106                spin_unlock(&hb->lock);
3107                switch (ret) {
3108                case -EFAULT:
3109                        goto pi_faulted;
3110
3111                case -EAGAIN:
3112                        goto pi_retry;
3113
3114                default:
3115                        WARN_ON_ONCE(1);
3116                        goto out_putkey;
3117                }
3118        }
3119
3120        /*
3121         * If uval has changed, let user space handle it.
3122         */
3123        ret = (curval == uval) ? 0 : -EAGAIN;
3124
3125out_unlock:
3126        spin_unlock(&hb->lock);
3127out_putkey:
3128        put_futex_key(&key);
3129        return ret;
3130
3131pi_retry:
3132        put_futex_key(&key);
3133        cond_resched();
3134        goto retry;
3135
3136pi_faulted:
3137        put_futex_key(&key);
3138
3139        ret = fault_in_user_writeable(uaddr);
3140        if (!ret)
3141                goto retry;
3142
3143        return ret;
3144}
3145
3146/**
3147 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
3148 * @hb:         the hash_bucket futex_q was originally enqueued on
3149 * @q:          the futex_q woken while waiting to be requeued
3150 * @key2:       the futex_key of the requeue target futex
3151 * @timeout:    the timeout associated with the wait (NULL if none)
3152 *
3153 * Detect if the task was woken on the initial futex as opposed to the requeue
3154 * target futex.  If so, determine if it was a timeout or a signal that caused
3155 * the wakeup and return the appropriate error code to the caller.  Must be
3156 * called with the hb lock held.
3157 *
3158 * Return:
3159 *  -  0 = no early wakeup detected;
3160 *  - <0 = -ETIMEDOUT or -ERESTARTNOINTR
3161 */
3162static inline
3163int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
3164                                   struct futex_q *q, union futex_key *key2,
3165                                   struct hrtimer_sleeper *timeout)
3166{
3167        int ret = 0;
3168
3169        /*
3170         * With the hb lock held, we avoid races while we process the wakeup.
3171         * We only need to hold hb (and not hb2) to ensure atomicity as the
3172         * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
3173         * It can't be requeued from uaddr2 to something else since we don't
3174         * support a PI aware source futex for requeue.
3175         */
3176        if (!match_futex(&q->key, key2)) {
3177                WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
3178                /*
3179                 * We were woken prior to requeue by a timeout or a signal.
3180                 * Unqueue the futex_q and determine which it was.
3181                 */
3182                plist_del(&q->list, &hb->chain);
3183                hb_waiters_dec(hb);
3184
3185                /* Handle spurious wakeups gracefully */
3186                ret = -EWOULDBLOCK;
3187                if (timeout && !timeout->task)
3188                        ret = -ETIMEDOUT;
3189                else if (signal_pending(current))
3190                        ret = -ERESTARTNOINTR;
3191        }
3192        return ret;
3193}
3194
3195/**
3196 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
3197 * @uaddr:      the futex we initially wait on (non-pi)
3198 * @flags:      futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.); both futexes
3199 *              must be the same type, no requeueing from private to shared, etc.
3200 * @val:        the expected value of uaddr
3201 * @abs_time:   absolute timeout
3202 * @bitset:     32 bit wakeup bitset set by userspace, defaults to all
3203 * @uaddr2:     the pi futex we will take prior to returning to user-space
3204 *
3205 * The caller will wait on uaddr and will be requeued by futex_requeue() to
3206 * uaddr2 which must be PI aware and distinct from uaddr.  Normal wakeup will wake
3207 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
3208 * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
3209 * without one, the pi logic would not know which task to boost/deboost, if
3210 * there was a need to.
3211 *
3212 * We call schedule in futex_wait_queue_me() when we enqueue and return there
3213 * via one of the following:
3214 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
3215 * 2) wakeup on uaddr2 after a requeue
3216 * 3) signal
3217 * 4) timeout
3218 *
3219 * If 3, cleanup and return -ERESTARTNOINTR.
3220 *
3221 * If 2, we may then block on trying to take the rt_mutex and return via:
3222 * 5) successful lock
3223 * 6) signal
3224 * 7) timeout
3225 * 8) other lock acquisition failure
3226 *
3227 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
3228 *
3229 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
3230 *
3231 * Return:
3232 *  -  0 - On success;
3233 *  - <0 - On error
3234 */
3235static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
3236                                 u32 val, ktime_t *abs_time, u32 bitset,
3237                                 u32 __user *uaddr2)
3238{
3239        struct hrtimer_sleeper timeout, *to;
3240        struct futex_pi_state *pi_state = NULL;
3241        struct rt_mutex_waiter rt_waiter;
3242        struct futex_hash_bucket *hb;
3243        union futex_key key2 = FUTEX_KEY_INIT;
3244        struct futex_q q = futex_q_init;
3245        int res, ret;
3246
3247        if (!IS_ENABLED(CONFIG_FUTEX_PI))
3248                return -ENOSYS;
3249
3250        if (uaddr == uaddr2)
3251                return -EINVAL;
3252
3253        if (!bitset)
3254                return -EINVAL;
3255
3256        to = futex_setup_timer(abs_time, &timeout, flags,
3257                               current->timer_slack_ns);
3258
3259        /*
3260         * The waiter is allocated on our stack, manipulated by the requeue
3261         * code while we sleep on uaddr.
3262         */
3263        rt_mutex_init_waiter(&rt_waiter);
3264
3265        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
3266        if (unlikely(ret != 0))
3267                goto out;
3268
3269        q.bitset = bitset;
3270        q.rt_waiter = &rt_waiter;
3271        q.requeue_pi_key = &key2;
3272
3273        /*
3274         * Prepare to wait on uaddr. On success, increments q.key (key1) ref
3275         * count.
3276         */
3277        ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
3278        if (ret)
3279                goto out_key2;
3280
3281        /*
3282         * The check above which compares uaddrs is not sufficient for
3283         * shared futexes. We need to compare the keys:
3284         */
3285        if (match_futex(&q.key, &key2)) {
3286                queue_unlock(hb);
3287                ret = -EINVAL;
3288                goto out_put_keys;
3289        }
3290
3291        /* Queue the futex_q, drop the hb lock, wait for wakeup. */
3292        futex_wait_queue_me(hb, &q, to);
3293
3294        spin_lock(&hb->lock);
3295        ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
3296        spin_unlock(&hb->lock);
3297        if (ret)
3298                goto out_put_keys;
3299
3300        /*
3301         * In order for us to be here, we know our q.key == key2, and since
3302         * we took the hb->lock above, we also know that futex_requeue() has
3303         * completed and we no longer have to concern ourselves with a wakeup
3304         * race with the atomic proxy lock acquisition by the requeue code. The
3305         * futex_requeue dropped our key1 reference and incremented our key2
3306         * reference count.
3307         */
3308
3309        /* Check if the requeue code acquired the second futex for us. */
3310        if (!q.rt_waiter) {
3311                /*
3312                 * Got the lock. We might not be the anticipated owner if we
3313                 * did a lock-steal - fix up the PI-state in that case.
3314                 */
3315                if (q.pi_state && (q.pi_state->owner != current)) {
3316                        spin_lock(q.lock_ptr);
3317                        ret = fixup_pi_state_owner(uaddr2, &q, current);
3318                        if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
3319                                pi_state = q.pi_state;
3320                                get_pi_state(pi_state);
3321                        }
3322                        /*
3323                         * Drop the reference to the pi state which
3324                         * the requeue_pi() code acquired for us.
3325                         */
3326                        put_pi_state(q.pi_state);
3327                        spin_unlock(q.lock_ptr);
3328                }
3329        } else {
3330                struct rt_mutex *pi_mutex;
3331
3332                /*
3333                 * We have been woken up by futex_unlock_pi(), a timeout, or a
3334                 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
3335                 * the pi_state.
3336                 */
3337                WARN_ON(!q.pi_state);
3338                pi_mutex = &q.pi_state->pi_mutex;
3339                ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
3340
3341                spin_lock(q.lock_ptr);
3342                if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
3343                        ret = 0;
3344
3345                debug_rt_mutex_free_waiter(&rt_waiter);
3346                /*
3347                 * Fixup the pi_state owner and possibly acquire the lock if we
3348                 * haven't already.
3349                 */
3350                res = fixup_owner(uaddr2, &q, !ret);
3351                /*
3352         * If fixup_owner() returned an error, propagate that.  If it
3353                 * acquired the lock, clear -ETIMEDOUT or -EINTR.
3354                 */
3355                if (res)
3356                        ret = (res < 0) ? res : 0;
3357
3358                /*
3359                 * If fixup_pi_state_owner() faulted and was unable to handle
3360                 * the fault, unlock the rt_mutex and return the fault to
3361                 * userspace.
3362                 */
3363                if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
3364                        pi_state = q.pi_state;
3365                        get_pi_state(pi_state);
3366                }
3367
3368                /* Unqueue and drop the lock. */
3369                unqueue_me_pi(&q);
3370        }
3371
3372        if (pi_state) {
3373                rt_mutex_futex_unlock(&pi_state->pi_mutex);
3374                put_pi_state(pi_state);
3375        }
3376
3377        if (ret == -EINTR) {
3378                /*
3379                 * We've already been requeued, but cannot restart by calling
3380                 * futex_lock_pi() directly. We could restart this syscall, but
3381                 * it would detect that the user space "val" changed and return
3382                 * -EWOULDBLOCK.  Save the overhead of the restart and return
3383                 * -EWOULDBLOCK directly.
3384                 */
3385                ret = -EWOULDBLOCK;
3386        }
3387
3388out_put_keys:
3389        put_futex_key(&q.key);
3390out_key2:
3391        put_futex_key(&key2);
3392
3393out:
3394        if (to) {
3395                hrtimer_cancel(&to->timer);
3396                destroy_hrtimer_on_stack(&to->timer);
3397        }
3398        return ret;
3399}
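
/*
 * An illustrative userspace sketch, not kernel code: the condvar-style
 * pattern futex_wait_requeue_pi() and FUTEX_CMP_REQUEUE_PI are built for
 * (roughly what a PI-aware pthread_cond_wait()/pthread_cond_broadcast()
 * pair does). The cond/mutex field names and local variables are invented
 * for the example; only the syscall ABI is taken from this file.
 *
 *	// waiter: snapshot the condvar word under the mutex, drop the mutex,
 *	// then let the kernel move us onto (and acquire) the PI mutex futex.
 *	val = cond->seq;
 *	pthread_mutex_unlock(mutex);
 *	syscall(SYS_futex, &cond->seq, FUTEX_WAIT_REQUEUE_PI, val,
 *		abs_timeout, &mutex->futex_word, 0);
 *
 *	// broadcaster: wake one waiter, requeue the rest onto the PI mutex;
 *	// the requeue count travels in the timeout argument slot.
 *	cond->seq++;
 *	syscall(SYS_futex, &cond->seq, FUTEX_CMP_REQUEUE_PI, 1,
 *		(void *)(unsigned long)INT_MAX, &mutex->futex_word, cond->seq);
 */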
3400
3401/*
3402 * Support for robust futexes: the kernel cleans up held futexes at
3403 * thread exit time.
3404 *
3405 * Implementation: user-space maintains a per-thread list of locks it
3406 * is holding. Upon do_exit(), the kernel carefully walks this list,
3407 * and marks all locks that are owned by this thread with the
3408 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
3409 * always manipulated with the lock held, so the list is private and
3410 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
3411 * field, to allow the kernel to clean up if the thread dies after
3412 * acquiring the lock, but just before it could have added itself to
3413 * the list. There can only be one such pending lock.
3414 */
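
/*
 * An illustrative userspace sketch, not kernel code: how a mutex library
 * might register and maintain the robust list described above. struct
 * my_mutex, rhead, m and lock_futex_word() are invented names for this
 * example; struct robust_list_head and the syscall are real.
 *
 *	struct my_mutex {
 *		struct robust_list	entry;		// links held mutexes
 *		uint32_t		futex_word;	// 0 / TID / TID|WAITERS
 *	};
 *
 *	static __thread struct robust_list_head rhead;
 *
 *	// once per thread:
 *	rhead.list.next       = &rhead.list;		// empty circular list
 *	rhead.futex_offset    = offsetof(struct my_mutex, futex_word) -
 *				offsetof(struct my_mutex, entry);
 *	rhead.list_op_pending = NULL;
 *	syscall(SYS_set_robust_list, &rhead, sizeof(rhead));
 *
 *	// lock: announce the acquisition first, so the kernel can clean up
 *	// even if we die between taking the futex word and linking the entry.
 *	rhead.list_op_pending = &m->entry;
 *	lock_futex_word(&m->futex_word);	// cmpxchg 0 -> TID
 *	m->entry.next   = rhead.list.next;	// link after the list head
 *	rhead.list.next = &m->entry;
 *	rhead.list_op_pending = NULL;
 */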
3415
3416/**
3417 * sys_set_robust_list() - Set the robust-futex list head of a task
3418 * @head:       pointer to the list-head
3419 * @len:        length of the list-head, as userspace expects
3420 */
3421SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
3422                size_t, len)
3423{
3424        if (!futex_cmpxchg_enabled)
3425                return -ENOSYS;
3426        /*
3427         * The kernel knows only one size for now:
3428         */
3429        if (unlikely(len != sizeof(*head)))
3430                return -EINVAL;
3431
3432        current->robust_list = head;
3433
3434        return 0;
3435}
3436
3437/**
3438 * sys_get_robust_list() - Get the robust-futex list head of a task
3439 * @pid:        pid of the process [zero for current task]
3440 * @head_ptr:   pointer to a list-head pointer, the kernel fills it in
3441 * @len_ptr:    pointer to a length field, the kernel fills in the header size
3442 */
3443SYSCALL_DEFINE3(get_robust_list, int, pid,
3444                struct robust_list_head __user * __user *, head_ptr,
3445                size_t __user *, len_ptr)
3446{
3447        struct robust_list_head __user *head;
3448        unsigned long ret;
3449        struct task_struct *p;
3450
3451        if (!futex_cmpxchg_enabled)
3452                return -ENOSYS;
3453
3454        rcu_read_lock();
3455
3456        ret = -ESRCH;
3457        if (!pid)
3458                p = current;
3459        else {
3460                p = find_task_by_vpid(pid);
3461                if (!p)
3462                        goto err_unlock;
3463        }
3464
3465        ret = -EPERM;
3466        if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
3467                goto err_unlock;
3468
3469        head = p->robust_list;
3470        rcu_read_unlock();
3471
3472        if (put_user(sizeof(*head), len_ptr))
3473                return -EFAULT;
3474        return put_user(head, head_ptr);
3475
3476err_unlock:
3477        rcu_read_unlock();
3478
3479        return ret;
3480}
3481
3482/* Constants for the pending_op argument of handle_futex_death */
3483#define HANDLE_DEATH_PENDING    true
3484#define HANDLE_DEATH_LIST       false
3485
3486/*
3487 * Process a futex-list entry, check whether it's owned by the
3488 * dying task, and do notification if so:
3489 */
3490static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
3491                              bool pi, bool pending_op)
3492{
3493        u32 uval, uninitialized_var(nval), mval;
3494        int err;
3495
3496        /* Futex address must be 32-bit aligned */
3497        if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
3498                return -1;
3499
3500retry:
3501        if (get_user(uval, uaddr))
3502                return -1;
3503
3504        /*
3505         * Special case for regular (non PI) futexes. The unlock path in
3506         * user space has two race scenarios:
3507         *
3508         * 1. The unlock path releases the user space futex value and
3509         *    before it can execute the futex() syscall to wake up
3510         *    waiters it is killed.
3511         *
3512         * 2. A woken up waiter is killed before it can acquire the
3513         *    futex in user space.
3514         *
3515         * In both cases the TID validation below prevents a wakeup of
3516         * potential waiters which can cause these waiters to block
3517         * forever.
3518         *
3519         * In both cases the following conditions are met:
3520         *
3521         *      1) task->robust_list->list_op_pending != NULL
3522         *         @pending_op == true
3523         *      2) User space futex value == 0
3524         *      3) Regular futex: @pi == false
3525         *
3526         * If these conditions are met, it is safe to attempt waking up a
3527         * potential waiter without touching the user space futex value and
3528         * trying to set the OWNER_DIED bit. The user space futex value is
3529         * uncontended and the rest of the user space mutex state is
3530         * consistent, so a woken waiter will just take over the
3531         * uncontended futex. Setting the OWNER_DIED bit would create
3532         * inconsistent state and break the user space owner-died
3533         * handling.
3534         */
3535        if (pending_op && !pi && !uval) {
3536                futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3537                return 0;
3538        }
3539
3540        if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
3541                return 0;
3542
3543        /*
3544         * Ok, this dying thread is truly holding a futex
3545         * of interest. Set the OWNER_DIED bit atomically
3546         * via cmpxchg, and if the value had FUTEX_WAITERS
3547         * set, wake up a waiter (if any). (We have to do a
3548         * futex_wake() even if OWNER_DIED is already set -
3549         * to handle the rare but possible case of recursive
3550         * thread-death.) The rest of the cleanup is done in
3551         * userspace.
3552         */
3553        mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
3554
3555        /*
3556         * We are not holding a lock here, but we want to have
3557         * the pagefault_disable/enable() protection because
3558         * we want to handle the fault gracefully. If the
3559         * access fails we try to fault in the futex with R/W
3560         * verification via get_user_pages. get_user() above
3561         * does not guarantee R/W access. If that fails we
3562         * give up and leave the futex locked.
3563         */
3564        if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
3565                switch (err) {
3566                case -EFAULT:
3567                        if (fault_in_user_writeable(uaddr))
3568                                return -1;
3569                        goto retry;
3570
3571                case -EAGAIN:
3572                        cond_resched();
3573                        goto retry;
3574
3575                default:
3576                        WARN_ON_ONCE(1);
3577                        return err;
3578                }
3579        }
3580
3581        if (nval != uval)
3582                goto retry;
3583
3584        /*
3585         * Wake robust non-PI futexes here. The wakeup of
3586         * PI futexes happens in exit_pi_state():
3587         */
3588        if (!pi && (uval & FUTEX_WAITERS))
3589                futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3590
3591        return 0;
3592}
3593
3594/*
3595 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
3596 */
3597static inline int fetch_robust_entry(struct robust_list __user **entry,
3598                                     struct robust_list __user * __user *head,
3599                                     unsigned int *pi)
3600{
3601        unsigned long uentry;
3602
3603        if (get_user(uentry, (unsigned long __user *)head))
3604                return -EFAULT;
3605
3606        *entry = (void __user *)(uentry & ~1UL);
3607        *pi = uentry & 1;
3608
3609        return 0;
3610}
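
/*
 * Illustrative only, continuing the userspace sketch near the robust-futex
 * comment above (rhead and m are invented names): a PI mutex is marked by
 * setting bit 0 of the pointer that links to its list entry, which is
 * exactly what fetch_robust_entry() strips into *pi:
 *
 *	m->entry.next   = rhead.list.next;
 *	rhead.list.next = (struct robust_list *)((uintptr_t)&m->entry | 1UL);
 *
 * exit_robust_list() below then passes that bit to handle_futex_death()
 * as @pi.
 */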
3611
3612/*
3613 * Walk curr->robust_list (very carefully, it's a userspace list!)
3614 * and mark any locks found there dead, and notify any waiters.
3615 *
3616 * We silently return on any sign of list-walking problem.
3617 */
3618static void exit_robust_list(struct task_struct *curr)
3619{
3620        struct robust_list_head __user *head = curr->robust_list;
3621        struct robust_list __user *entry, *next_entry, *pending;
3622        unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
3623        unsigned int uninitialized_var(next_pi);
3624        unsigned long futex_offset;
3625        int rc;
3626
3627        if (!futex_cmpxchg_enabled)
3628                return;
3629
3630        /*
3631         * Fetch the list head (which was registered earlier, via
3632         * sys_set_robust_list()):
3633         */
3634        if (fetch_robust_entry(&entry, &head->list.next, &pi))
3635                return;
3636        /*
3637         * Fetch the relative futex offset:
3638         */
3639        if (get_user(futex_offset, &head->futex_offset))
3640                return;
3641        /*
3642         * Fetch any possibly pending lock-add first, and handle it
3643         * if it exists:
3644         */
3645        if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
3646                return;
3647
3648        next_entry = NULL;      /* avoid warning with gcc */
3649        while (entry != &head->list) {
3650                /*
3651                 * Fetch the next entry in the list before calling
3652                 * handle_futex_death:
3653                 */
3654                rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
3655                /*
3656                 * A pending lock might already be on the list, so
3657                 * don't process it twice:
3658                 */
3659                if (entry != pending) {
3660                        if (handle_futex_death((void __user *)entry + futex_offset,
3661                                                curr, pi, HANDLE_DEATH_LIST))
3662                                return;
3663                }
3664                if (rc)
3665                        return;
3666                entry = next_entry;
3667                pi = next_pi;
3668                /*
3669                 * Avoid excessively long or circular lists:
3670                 */
3671                if (!--limit)
3672                        break;
3673
3674                cond_resched();
3675        }
3676
3677        if (pending) {
3678                handle_futex_death((void __user *)pending + futex_offset,
3679                                   curr, pip, HANDLE_DEATH_PENDING);
3680        }
3681}
3682
3683static void futex_cleanup(struct task_struct *tsk)
3684{
3685        if (unlikely(tsk->robust_list)) {
3686                exit_robust_list(tsk);
3687                tsk->robust_list = NULL;
3688        }
3689
3690#ifdef CONFIG_COMPAT
3691        if (unlikely(tsk->compat_robust_list)) {
3692                compat_exit_robust_list(tsk);
3693                tsk->compat_robust_list = NULL;
3694        }
3695#endif
3696
3697        if (unlikely(!list_empty(&tsk->pi_state_list)))
3698                exit_pi_state_list(tsk);
3699}
3700
3701/**
3702 * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
3703 * @tsk:        task to set the state on
3704 *
3705 * Set the futex exit state of the task lockless. The futex waiter code
3706 * observes that state when a task is exiting and loops until the task has
3707 * actually finished the futex cleanup. The worst case for this is that the
3708 * waiter runs through the wait loop until the state becomes visible.
3709 *
3710 * This is called from the recursive fault handling path in do_exit().
3711 *
3712 * This is best effort. Either the futex exit code has run already or
3713 * not. If the OWNER_DIED bit has been set on the futex then the waiter can
3714 * take it over. If not, the problem is pushed back to user space. If the
3715 * futex exit code did not run yet, then an already queued waiter might
3716 * block forever, but there is nothing which can be done about that.
3717 */
3718void futex_exit_recursive(struct task_struct *tsk)
3719{
3720        /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
3721        if (tsk->futex_state == FUTEX_STATE_EXITING)
3722                mutex_unlock(&tsk->futex_exit_mutex);
3723        tsk->futex_state = FUTEX_STATE_DEAD;
3724}
3725
3726static void futex_cleanup_begin(struct task_struct *tsk)
3727{
3728        /*
3729         * Prevent various race issues against a concurrent incoming waiter
3730         * including live locks by forcing the waiter to block on
3731         * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
3732         * attach_to_pi_owner().
3733         */
3734        mutex_lock(&tsk->futex_exit_mutex);
3735
3736        /*
3737         * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
3738         *
3739         * This ensures that all subsequent checks of tsk->futex_state in
3740         * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
3741         * tsk->pi_lock held.
3742         *
3743         * It guarantees also that a pi_state which was queued right before
3744         * the state change under tsk->pi_lock by a concurrent waiter must
3745         * be observed in exit_pi_state_list().
3746         */
3747        raw_spin_lock_irq(&tsk->pi_lock);
3748        tsk->futex_state = FUTEX_STATE_EXITING;
3749        raw_spin_unlock_irq(&tsk->pi_lock);
3750}
3751
3752static void futex_cleanup_end(struct task_struct *tsk, int state)
3753{
3754        /*
3755         * Lockless store. The only side effect is that an observer might
3756         * take another loop until it becomes visible.
3757         */
3758        tsk->futex_state = state;
3759        /*
3760         * Drop the exit protection. This unblocks waiters which observed
3761         * FUTEX_STATE_EXITING to reevaluate the state.
3762         */
3763        mutex_unlock(&tsk->futex_exit_mutex);
3764}
3765
3766void futex_exec_release(struct task_struct *tsk)
3767{
3768        /*
3769         * The state handling is done for consistency, but in the case of
3770         * exec() there is no way to prevent further damage as the PID stays
3771         * the same. But for the unlikely and arguably buggy case that a
3772         * futex is held across exec(), this provides at least as much state
3773         * consistency protection as is possible.
3774         */
3775        futex_cleanup_begin(tsk);
3776        futex_cleanup(tsk);
3777        /*
3778         * Reset the state to FUTEX_STATE_OK. The task is alive and about to
3779         * exec a new binary.
3780         */
3781        futex_cleanup_end(tsk, FUTEX_STATE_OK);
3782}
3783
3784void futex_exit_release(struct task_struct *tsk)
3785{
3786        futex_cleanup_begin(tsk);
3787        futex_cleanup(tsk);
3788        futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
3789}
3790
3791long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
3792                u32 __user *uaddr2, u32 val2, u32 val3)
3793{
3794        int cmd = op & FUTEX_CMD_MASK;
3795        unsigned int flags = 0;
3796
3797        if (!(op & FUTEX_PRIVATE_FLAG))
3798                flags |= FLAGS_SHARED;
3799
3800        if (op & FUTEX_CLOCK_REALTIME) {
3801                flags |= FLAGS_CLOCKRT;
3802                if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET &&
3803                    cmd != FUTEX_WAIT_REQUEUE_PI)
3804                        return -ENOSYS;
3805        }
3806
3807        switch (cmd) {
3808        case FUTEX_LOCK_PI:
3809        case FUTEX_UNLOCK_PI:
3810        case FUTEX_TRYLOCK_PI:
3811        case FUTEX_WAIT_REQUEUE_PI:
3812        case FUTEX_CMP_REQUEUE_PI:
3813                if (!futex_cmpxchg_enabled)
3814                        return -ENOSYS;
3815        }
3816
3817        switch (cmd) {
3818        case FUTEX_WAIT:
3819                val3 = FUTEX_BITSET_MATCH_ANY;
3820                /* fall through */
3821        case FUTEX_WAIT_BITSET:
3822                return futex_wait(uaddr, flags, val, timeout, val3);
3823        case FUTEX_WAKE:
3824                val3 = FUTEX_BITSET_MATCH_ANY;
3825                /* fall through */
3826        case FUTEX_WAKE_BITSET:
3827                return futex_wake(uaddr, flags, val, val3);
3828        case FUTEX_REQUEUE:
3829                return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
3830        case FUTEX_CMP_REQUEUE:
3831                return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
3832        case FUTEX_WAKE_OP:
3833                return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
3834        case FUTEX_LOCK_PI:
3835                return futex_lock_pi(uaddr, flags, timeout, 0);
3836        case FUTEX_UNLOCK_PI:
3837                return futex_unlock_pi(uaddr, flags);
3838        case FUTEX_TRYLOCK_PI:
3839                return futex_lock_pi(uaddr, flags, NULL, 1);
3840        case FUTEX_WAIT_REQUEUE_PI:
3841                val3 = FUTEX_BITSET_MATCH_ANY;
3842                return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
3843                                             uaddr2);
3844        case FUTEX_CMP_REQUEUE_PI:
3845                return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
3846        }
3847        return -ENOSYS;
3848}
3849
3850
3851SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
3852                struct timespec __user *, utime, u32 __user *, uaddr2,
3853                u32, val3)
3854{
3855        struct timespec ts;
3856        ktime_t t, *tp = NULL;
3857        u32 val2 = 0;
3858        int cmd = op & FUTEX_CMD_MASK;
3859
3860        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
3861                      cmd == FUTEX_WAIT_BITSET ||
3862                      cmd == FUTEX_WAIT_REQUEUE_PI)) {
3863                if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
3864                        return -EFAULT;
3865                if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
3866                        return -EFAULT;
3867                if (!timespec_valid(&ts))
3868                        return -EINVAL;
3869
3870                t = timespec_to_ktime(ts);
3871                if (cmd == FUTEX_WAIT)
3872                        t = ktime_add_safe(ktime_get(), t);
3873                tp = &t;
3874        }
3875        /*
3876         * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
3877         * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
3878         */
3879        if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
3880            cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
3881                val2 = (u32) (unsigned long) utime;
3882
3883        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
3884}
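
/*
 * An illustrative userspace sketch, not kernel code: how the multiplexed
 * arguments decoded above map onto futex() invocations. The futex_syscall()
 * wrapper name is invented; the ABI it exercises is the one handled by
 * sys_futex()/do_futex().
 *
 *	static long futex_syscall(uint32_t *uaddr, int op, uint32_t val,
 *				  const struct timespec *utime,
 *				  uint32_t *uaddr2, uint32_t val3)
 *	{
 *		return syscall(SYS_futex, uaddr, op, val, utime, uaddr2, val3);
 *	}
 *
 *	// wait while the futex word still holds 'expected', relative timeout:
 *	futex_syscall(uaddr, FUTEX_WAIT_PRIVATE, expected, &ts, NULL, 0);
 *
 *	// wake at most one waiter:
 *	futex_syscall(uaddr, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
 *
 *	// FUTEX_CMP_REQUEUE: wake nr_wake waiters, requeue up to nr_requeue
 *	// more onto uaddr2; nr_requeue travels in the timeout argument slot,
 *	// exactly as the val2 assignment above decodes it.
 *	futex_syscall(uaddr, FUTEX_CMP_REQUEUE_PRIVATE, nr_wake,
 *		      (void *)(unsigned long)nr_requeue, uaddr2, expected);
 */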
3885
3886#ifdef CONFIG_COMPAT
3887/*
3888 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
3889 */
3890static inline int
3891compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
3892                   compat_uptr_t __user *head, unsigned int *pi)
3893{
3894        if (get_user(*uentry, head))
3895                return -EFAULT;
3896
3897        *entry = compat_ptr((*uentry) & ~1);
3898        *pi = (unsigned int)(*uentry) & 1;
3899
3900        return 0;
3901}
3902
3903static void __user *futex_uaddr(struct robust_list __user *entry,
3904                                compat_long_t futex_offset)
3905{
3906        compat_uptr_t base = ptr_to_compat(entry);
3907        void __user *uaddr = compat_ptr(base + futex_offset);
3908
3909        return uaddr;
3910}
3911
3912/*
3913 * Walk curr->robust_list (very carefully, it's a userspace list!)
3914 * and mark any locks found there dead, and notify any waiters.
3915 *
3916 * We silently return on any sign of list-walking problem.
3917 */
3918static void compat_exit_robust_list(struct task_struct *curr)
3919{
3920        struct compat_robust_list_head __user *head = curr->compat_robust_list;
3921        struct robust_list __user *entry, *next_entry, *pending;
3922        unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
3923        unsigned int uninitialized_var(next_pi);
3924        compat_uptr_t uentry, next_uentry, upending;
3925        compat_long_t futex_offset;
3926        int rc;
3927
3928        if (!futex_cmpxchg_enabled)
3929                return;
3930
3931        /*
3932         * Fetch the list head (which was registered earlier, via
3933         * sys_set_robust_list()):
3934         */
3935        if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
3936                return;
3937        /*
3938         * Fetch the relative futex offset:
3939         */
3940        if (get_user(futex_offset, &head->futex_offset))
3941                return;
3942        /*
3943         * Fetch any possibly pending lock-add first, and handle it
3944         * if it exists:
3945         */
3946        if (compat_fetch_robust_entry(&upending, &pending,
3947                               &head->list_op_pending, &pip))
3948                return;
3949
3950        next_entry = NULL;      /* avoid warning with gcc */
3951        while (entry != (struct robust_list __user *) &head->list) {
3952                /*
3953                 * Fetch the next entry in the list before calling
3954                 * handle_futex_death:
3955                 */
3956                rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
3957                        (compat_uptr_t __user *)&entry->next, &next_pi);
3958                /*
3959                 * A pending lock might already be on the list, so
3960                 * don't process it twice:
3961                 */
3962                if (entry != pending) {
3963                        void __user *uaddr = futex_uaddr(entry, futex_offset);
3964
3965                        if (handle_futex_death(uaddr, curr, pi,
3966                                               HANDLE_DEATH_LIST))
3967                                return;
3968                }
3969                if (rc)
3970                        return;
3971                uentry = next_uentry;
3972                entry = next_entry;
3973                pi = next_pi;
3974                /*
3975                 * Avoid excessively long or circular lists:
3976                 */
3977                if (!--limit)
3978                        break;
3979
3980                cond_resched();
3981        }
3982        if (pending) {
3983                void __user *uaddr = futex_uaddr(pending, futex_offset);
3984
3985                handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
3986        }
3987}
3988
3989COMPAT_SYSCALL_DEFINE2(set_robust_list,
3990                struct compat_robust_list_head __user *, head,
3991                compat_size_t, len)
3992{
3993        if (!futex_cmpxchg_enabled)
3994                return -ENOSYS;
3995
3996        if (unlikely(len != sizeof(*head)))
3997                return -EINVAL;
3998
3999        current->compat_robust_list = head;
4000
4001        return 0;
4002}
4003
4004COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
4005                        compat_uptr_t __user *, head_ptr,
4006                        compat_size_t __user *, len_ptr)
4007{
4008        struct compat_robust_list_head __user *head;
4009        unsigned long ret;
4010        struct task_struct *p;
4011
4012        if (!futex_cmpxchg_enabled)
4013                return -ENOSYS;
4014
4015        rcu_read_lock();
4016
4017        ret = -ESRCH;
4018        if (!pid)
4019                p = current;
4020        else {
4021                p = find_task_by_vpid(pid);
4022                if (!p)
4023                        goto err_unlock;
4024        }
4025
4026        ret = -EPERM;
4027        if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
4028                goto err_unlock;
4029
4030        head = p->compat_robust_list;
4031        rcu_read_unlock();
4032
4033        if (put_user(sizeof(*head), len_ptr))
4034                return -EFAULT;
4035        return put_user(ptr_to_compat(head), head_ptr);
4036
4037err_unlock:
4038        rcu_read_unlock();
4039
4040        return ret;
4041}
4042
4043COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
4044                struct compat_timespec __user *, utime, u32 __user *, uaddr2,
4045                u32, val3)
4046{
4047        struct timespec ts;
4048        ktime_t t, *tp = NULL;
4049        int val2 = 0;
4050        int cmd = op & FUTEX_CMD_MASK;
4051
4052        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
4053                      cmd == FUTEX_WAIT_BITSET ||
4054                      cmd == FUTEX_WAIT_REQUEUE_PI)) {
4055                if (compat_get_timespec(&ts, utime))
4056                        return -EFAULT;
4057                if (!timespec_valid(&ts))
4058                        return -EINVAL;
4059
4060                t = timespec_to_ktime(ts);
4061                if (cmd == FUTEX_WAIT)
4062                        t = ktime_add_safe(ktime_get(), t);
4063                tp = &t;
4064        }
4065        if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
4066            cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
4067                val2 = (int) (unsigned long) utime;
4068
4069        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
4070}
4071#endif /* CONFIG_COMPAT */
4072
4073static void __init futex_detect_cmpxchg(void)
4074{
4075#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
4076        u32 curval;
4077
4078        /*
4079         * This will fail and we want it. Some arch implementations do
4080         * runtime detection of the futex_atomic_cmpxchg_inatomic()
4081         * functionality. We want to know that before we call in any
4082         * of the complex code paths. Also we want to prevent
4083         * registration of robust lists in that case. NULL is
4084         * guaranteed to fault and we get -EFAULT on a functional
4085         * implementation, while the non-functional ones will return
4086         * -ENOSYS.
4087         */
4088        if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
4089                futex_cmpxchg_enabled = 1;
4090#endif
4091}
4092
4093static int __init futex_init(void)
4094{
4095        unsigned int futex_shift;
4096        unsigned long i;
4097
4098#if CONFIG_BASE_SMALL
4099        futex_hashsize = 16;
4100#else
4101        futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
4102#endif
4103
4104        futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
4105                                               futex_hashsize, 0,
4106                                               futex_hashsize < 256 ? HASH_SMALL : 0,
4107                                               &futex_shift, NULL,
4108                                               futex_hashsize, futex_hashsize);
4109        futex_hashsize = 1UL << futex_shift;
4110
4111        futex_detect_cmpxchg();
4112
4113        for (i = 0; i < futex_hashsize; i++) {
4114                atomic_set(&futex_queues[i].waiters, 0);
4115                plist_head_init(&futex_queues[i].chain);
4116                spin_lock_init(&futex_queues[i].lock);
4117        }
4118
4119        return 0;
4120}
4121core_initcall(futex_init);
4122