linux/kernel/locking/rtmutex.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * RT-Mutexes: simple blocking mutual exclusion locks with PI support
   4 *
   5 * started by Ingo Molnar and Thomas Gleixner.
   6 *
   7 *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   8 *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
   9 *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
  10 *  Copyright (C) 2006 Esben Nielsen
  11 *
  12 *  See Documentation/locking/rt-mutex-design.rst for details.
  13 */
  14#include <linux/spinlock.h>
  15#include <linux/export.h>
  16#include <linux/sched/signal.h>
  17#include <linux/sched/rt.h>
  18#include <linux/sched/deadline.h>
  19#include <linux/sched/wake_q.h>
  20#include <linux/sched/debug.h>
  21#include <linux/timer.h>
  22
  23#include "rtmutex_common.h"
  24
  25/*
  26 * lock->owner state tracking:
  27 *
  28 * lock->owner holds the task_struct pointer of the owner. Bit 0
  29 * is used to keep track of the "lock has waiters" state.
  30 *
  31 * owner        bit0
  32 * NULL         0       lock is free (fast acquire possible)
  33 * NULL         1       lock is free and has waiters and the top waiter
  34 *                              is going to take the lock*
  35 * taskpointer  0       lock is held (fast release possible)
  36 * taskpointer  1       lock is held and has waiters**
  37 *
  38 * The fast atomic compare exchange based acquire and release is only
  39 * possible when bit 0 of lock->owner is 0.
  40 *
  41 * (*) It can also be a transitional state when grabbing the lock
  42 * while ->wait_lock is held. To prevent any fast path cmpxchg of the lock,
  43 * we need to set bit 0 before looking at the lock, and the owner may be
  44 * NULL in this small window, hence this can be a transitional state.
  45 *
  46 * (**) There is a small window when bit 0 is set but there are no
  47 * waiters. This can happen when grabbing the lock in the slow path.
  48 * To prevent a cmpxchg of the owner releasing the lock, we need to
  49 * set this bit before looking at the lock.
  50 */
  51
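/*
 * Editor's sketch (not part of the upstream file): how the table above is
 * decoded. The real helper is rt_mutex_owner() in rtmutex_common.h; this
 * hypothetical copy only illustrates the bit 0 encoding.
 */
static inline struct task_struct *example_decode_owner(struct rt_mutex *lock)
{
        unsigned long val = (unsigned long)READ_ONCE(lock->owner);

        /* Bit 0 is the "has waiters" flag, the rest is the owner pointer. */
        return (struct task_struct *)(val & ~RT_MUTEX_HAS_WAITERS);
}
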
  52static __always_inline void
  53rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
  54{
  55        unsigned long val = (unsigned long)owner;
  56
  57        if (rt_mutex_has_waiters(lock))
  58                val |= RT_MUTEX_HAS_WAITERS;
  59
  60        WRITE_ONCE(lock->owner, (struct task_struct *)val);
  61}
  62
  63static __always_inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
  64{
  65        lock->owner = (struct task_struct *)
  66                        ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
  67}
  68
  69static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex *lock)
  70{
  71        unsigned long owner, *p = (unsigned long *) &lock->owner;
  72
  73        if (rt_mutex_has_waiters(lock))
  74                return;
  75
  76        /*
  77         * The rbtree has no waiters enqueued. Now check whether the
  78         * lock->owner still has the waiters bit set before clearing it
  79         * with a plain store, otherwise the following can happen:
  80         *
  81         * CPU 0        CPU 1           CPU2
  82         * l->owner=T1
  83         *              rt_mutex_lock(l)
  84         *              lock(l->lock)
  85         *              l->owner = T1 | HAS_WAITERS;
  86         *              enqueue(T2)
  87         *              boost()
  88         *                unlock(l->lock)
  89         *              block()
  90         *
  91         *                              rt_mutex_lock(l)
  92         *                              lock(l->lock)
  93         *                              l->owner = T1 | HAS_WAITERS;
  94         *                              enqueue(T3)
  95         *                              boost()
  96         *                                unlock(l->lock)
  97         *                              block()
  98         *              signal(->T2)    signal(->T3)
  99         *              lock(l->lock)
 100         *              dequeue(T2)
 101         *              deboost()
 102         *                unlock(l->lock)
 103         *                              lock(l->lock)
 104         *                              dequeue(T3)
 105         *                               ==> wait list is empty
 106         *                              deboost()
 107         *                               unlock(l->lock)
 108         *              lock(l->lock)
 109         *              fixup_rt_mutex_waiters()
 110         *                if (wait_list_empty(l)) {
 111         *                  owner = l->owner & ~HAS_WAITERS;
 112         *                  l->owner = owner
 113         *                    ==> l->owner = T1
 114         *                }
 115         *                              lock(l->lock)
 116         * rt_mutex_unlock(l)           fixup_rt_mutex_waiters()
 117         *                                if (wait_list_empty(l)) {
 118         *                                  owner = l->owner & ~HAS_WAITERS;
 119         * cmpxchg(l->owner, T1, NULL)
 120         *  ===> Success (l->owner = NULL)
 121         *
 122         *                                  l->owner = owner
 123         *                                    ==> l->owner = T1
 124         *                                }
 125         *
 126         * With the check for the waiter bit in place T3 on CPU2 will not
 127         * overwrite l->owner. All tasks fiddling with the waiters bit are
 128         * serialized by l->lock, so nothing else can modify the waiters
 129         * bit. If the bit is set then nothing can change l->owner either
 130         * so the simple RMW is safe. The cmpxchg() will simply fail if it
 131         * happens in the middle of the RMW because the waiters bit is
 132         * still set.
 133         */
 134        owner = READ_ONCE(*p);
 135        if (owner & RT_MUTEX_HAS_WAITERS)
 136                WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
 137}
 138
 139/*
 140 * We can speed up the acquire/release, if there's no debugging state to be
 141 * set up.
 142 */
 143#ifndef CONFIG_DEBUG_RT_MUTEXES
 144# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
 145# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
 146
 147/*
 148 * Callers must hold the ->wait_lock -- which is the whole purpose as we force
 149 * all future threads that attempt to [Rmw] the lock to the slowpath. As such
 150 * relaxed semantics suffice.
 151 */
 152static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 153{
 154        unsigned long owner, *p = (unsigned long *) &lock->owner;
 155
 156        do {
 157                owner = *p;
 158        } while (cmpxchg_relaxed(p, owner,
 159                                 owner | RT_MUTEX_HAS_WAITERS) != owner);
 160}
 161
 162/*
 163 * Safe fastpath aware unlock:
 164 * 1) Clear the waiters bit
 165 * 2) Drop lock->wait_lock
 166 * 3) Try to unlock the lock with cmpxchg
 167 */
 168static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
 169                                                 unsigned long flags)
 170        __releases(lock->wait_lock)
 171{
 172        struct task_struct *owner = rt_mutex_owner(lock);
 173
 174        clear_rt_mutex_waiters(lock);
 175        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 176        /*
 177         * If a new waiter comes in between the unlock and the cmpxchg
 178         * we have two situations:
 179         *
 180         * unlock(wait_lock);
 181         *                                      lock(wait_lock);
 182         * cmpxchg(p, owner, 0) == owner
 183         *                                      mark_rt_mutex_waiters(lock);
 184         *                                      acquire(lock);
 185         * or:
 186         *
 187         * unlock(wait_lock);
 188         *                                      lock(wait_lock);
 189         *                                      mark_rt_mutex_waiters(lock);
 190         *
 191         * cmpxchg(p, owner, 0) != owner
 192         *                                      enqueue_waiter();
 193         *                                      unlock(wait_lock);
 194         * lock(wait_lock);
 195         * wake waiter();
 196         * unlock(wait_lock);
 197         *                                      lock(wait_lock);
 198         *                                      acquire(lock);
 199         */
 200        return rt_mutex_cmpxchg_release(lock, owner, NULL);
 201}
 202
 203#else
 204# define rt_mutex_cmpxchg_acquire(l,c,n)        (0)
 205# define rt_mutex_cmpxchg_release(l,c,n)        (0)
 206
 207static __always_inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 208{
 209        lock->owner = (struct task_struct *)
 210                        ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
 211}
 212
 213/*
 214 * Simple slow path only version: lock->owner is protected by lock->wait_lock.
 215 */
 216static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
 217                                                 unsigned long flags)
 218        __releases(lock->wait_lock)
 219{
 220        lock->owner = NULL;
 221        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 222        return true;
 223}
 224#endif
 225
 226/*
 227 * Only use with rt_mutex_waiter_{less,equal}()
 228 */
 229#define task_to_waiter(p)       \
 230        &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
 231
 232static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
 233                                                struct rt_mutex_waiter *right)
 234{
 235        if (left->prio < right->prio)
 236                return 1;
 237
 238        /*
 239         * If both waiters have dl_prio(), we check the deadlines of the
 240         * associated tasks.
 241         * If left waiter has a dl_prio(), and we didn't return 1 above,
 242         * then right waiter has a dl_prio() too.
 243         */
 244        if (dl_prio(left->prio))
 245                return dl_time_before(left->deadline, right->deadline);
 246
 247        return 0;
 248}
 249
 250static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
 251                                                 struct rt_mutex_waiter *right)
 252{
 253        if (left->prio != right->prio)
 254                return 0;
 255
 256        /*
 257         * If both waiters have dl_prio(), we check the deadlines of the
 258         * associated tasks.
 259         * If left waiter has a dl_prio(), and we didn't return 0 above,
 260         * then right waiter has a dl_prio() too.
 261         */
 262        if (dl_prio(left->prio))
 263                return left->deadline == right->deadline;
 264
 265        return 1;
 266}
 267
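/*
 * Editor's illustration (hypothetical helper, not part of the upstream file):
 * the comparators above implement "lower ->prio value wins, earlier deadline
 * breaks ties between deadline tasks". Combined with task_to_waiter() they
 * answer questions such as "would @task beat the current top waiter?", the
 * same comparison try_to_take_rt_mutex() performs for lock stealing.
 */
static inline bool example_task_beats_top_waiter(struct rt_mutex *lock,
                                                 struct task_struct *task)
{
        return rt_mutex_has_waiters(lock) &&
               rt_mutex_waiter_less(task_to_waiter(task),
                                    rt_mutex_top_waiter(lock));
}
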
 268#define __node_2_waiter(node) \
 269        rb_entry((node), struct rt_mutex_waiter, tree_entry)
 270
 271static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
 272{
 273        return rt_mutex_waiter_less(__node_2_waiter(a), __node_2_waiter(b));
 274}
 275
 276static __always_inline void
 277rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
 278{
 279        rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less);
 280}
 281
 282static __always_inline void
 283rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
 284{
 285        if (RB_EMPTY_NODE(&waiter->tree_entry))
 286                return;
 287
 288        rb_erase_cached(&waiter->tree_entry, &lock->waiters);
 289        RB_CLEAR_NODE(&waiter->tree_entry);
 290}
 291
 292#define __node_2_pi_waiter(node) \
 293        rb_entry((node), struct rt_mutex_waiter, pi_tree_entry)
 294
 295static __always_inline bool
 296__pi_waiter_less(struct rb_node *a, const struct rb_node *b)
 297{
 298        return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b));
 299}
 300
 301static __always_inline void
 302rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
 303{
 304        rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less);
 305}
 306
 307static __always_inline void
 308rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
 309{
 310        if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
 311                return;
 312
 313        rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
 314        RB_CLEAR_NODE(&waiter->pi_tree_entry);
 315}
 316
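/*
 * Editor's note on the helper below: rt_mutex_setprio() accepts a NULL donor
 * task, in which case the scheduler falls back to @p's normal (non-boosted)
 * priority, so a single call covers both the boost and the deboost case.
 */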
 317static __always_inline void rt_mutex_adjust_prio(struct task_struct *p)
 318{
 319        struct task_struct *pi_task = NULL;
 320
 321        lockdep_assert_held(&p->pi_lock);
 322
 323        if (task_has_pi_waiters(p))
 324                pi_task = task_top_pi_waiter(p)->task;
 325
 326        rt_mutex_setprio(p, pi_task);
 327}
 328
 329/*
 330 * Deadlock detection is conditional:
 331 *
 332 * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
 333 * if the detect argument is == RT_MUTEX_FULL_CHAINWALK.
 334 *
 335 * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always
 336 * conducted independent of the detect argument.
 337 *
 338 * If the waiter argument is NULL this indicates the deboost path and
 339 * deadlock detection is disabled independent of the detect argument
 340 * and the config settings.
 341 */
 342static __always_inline bool
 343rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
 344                              enum rtmutex_chainwalk chwalk)
 345{
 346        if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
 347                return waiter != NULL;
 348        return chwalk == RT_MUTEX_FULL_CHAINWALK;
 349}
 350
 351/*
 352 * Max number of times we'll walk the boosting chain:
 353 */
 354int max_lock_depth = 1024;
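/*
 * Editor's note: this limit is tunable at run time through the
 * kernel.max_lock_depth sysctl (/proc/sys/kernel/max_lock_depth).
 */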
 355
 356static __always_inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
 357{
 358        return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
 359}
 360
 361/*
 362 * Adjust the priority chain. Also used for deadlock detection.
 363 * Drops the task's reference count by one - may thus free the task.
 364 *
 365 * @task:       the task owning the mutex (owner) for which a chain walk is
 366 *              probably needed
 367 * @chwalk:     do we have to carry out deadlock detection?
 368 * @orig_lock:  the mutex (can be NULL if we are walking the chain to recheck
 369 *              things for a task that has just got its priority adjusted, and
 370 *              is waiting on a mutex)
 371 * @next_lock:  the mutex on which the owner of @orig_lock was blocked before
 372 *              we dropped its pi_lock. Is never dereferenced, only used for
 373 *              comparison to detect lock chain changes.
 374 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
 375 *              its priority to the mutex owner (can be NULL in the case
 376 *              depicted above or if the top waiter has gone away and we are
 377 *              actually deboosting the owner)
 378 * @top_task:   the current top waiter
 379 *
 380 * Returns 0 or -EDEADLK.
 381 *
 382 * Chain walk basics and protection scope
 383 *
 384 * [R] refcount on task
 385 * [P] task->pi_lock held
 386 * [L] rtmutex->wait_lock held
 387 *
 388 * Step Description                             Protected by
 389 *      function arguments:
 390 *      @task                                   [R]
 391 *      @orig_lock if != NULL                   @top_task is blocked on it
 392 *      @next_lock                              Unprotected. Cannot be
 393 *                                              dereferenced. Only used for
 394 *                                              comparison.
 395 *      @orig_waiter if != NULL                 @top_task is blocked on it
 396 *      @top_task                               current, or in case of proxy
 397 *                                              locking protected by calling
 398 *                                              code
 399 *      again:
 400 *        loop_sanity_check();
 401 *      retry:
 402 * [1]    lock(task->pi_lock);                  [R] acquire [P]
 403 * [2]    waiter = task->pi_blocked_on;         [P]
 404 * [3]    check_exit_conditions_1();            [P]
 405 * [4]    lock = waiter->lock;                  [P]
 406 * [5]    if (!try_lock(lock->wait_lock)) {     [P] try to acquire [L]
 407 *          unlock(task->pi_lock);              release [P]
 408 *          goto retry;
 409 *        }
 410 * [6]    check_exit_conditions_2();            [P] + [L]
 411 * [7]    requeue_lock_waiter(lock, waiter);    [P] + [L]
 412 * [8]    unlock(task->pi_lock);                release [P]
 413 *        put_task_struct(task);                release [R]
 414 * [9]    check_exit_conditions_3();            [L]
 415 * [10]   task = owner(lock);                   [L]
 416 *        get_task_struct(task);                [L] acquire [R]
 417 *        lock(task->pi_lock);                  [L] acquire [P]
 418 * [11]   requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
 419 * [12]   check_exit_conditions_4();            [P] + [L]
 420 * [13]   unlock(task->pi_lock);                release [P]
 421 *        unlock(lock->wait_lock);              release [L]
 422 *        goto again;
 423 */
 424static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
 425                                              enum rtmutex_chainwalk chwalk,
 426                                              struct rt_mutex *orig_lock,
 427                                              struct rt_mutex *next_lock,
 428                                              struct rt_mutex_waiter *orig_waiter,
 429                                              struct task_struct *top_task)
 430{
 431        struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
 432        struct rt_mutex_waiter *prerequeue_top_waiter;
 433        int ret = 0, depth = 0;
 434        struct rt_mutex *lock;
 435        bool detect_deadlock;
 436        bool requeue = true;
 437
 438        detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
 439
 440        /*
 441         * The (de)boosting is a step-by-step approach with a lot of
 442         * pitfalls. We want this to be preemptible and we want to hold a
 443         * maximum of two locks per step. So we have to check
 444         * carefully whether things change under us.
 445         */
 446 again:
 447        /*
 448         * We limit the lock chain length for each invocation.
 449         */
 450        if (++depth > max_lock_depth) {
 451                static int prev_max;
 452
 453                /*
 454                 * Print this only once. If the admin changes the limit,
 455                 * print a new message when reaching the limit again.
 456                 */
 457                if (prev_max != max_lock_depth) {
 458                        prev_max = max_lock_depth;
 459                        printk(KERN_WARNING "Maximum lock depth %d reached "
 460                               "task: %s (%d)\n", max_lock_depth,
 461                               top_task->comm, task_pid_nr(top_task));
 462                }
 463                put_task_struct(task);
 464
 465                return -EDEADLK;
 466        }
 467
 468        /*
 469         * We are fully preemptible here and only hold the refcount on
 470         * @task. So everything can have changed under us since the
 471         * caller or our own code below (goto retry/again) dropped all
 472         * locks.
 473         */
 474 retry:
 475        /*
 476         * [1] Task cannot go away as we did a get_task_struct() on it before!
 477         */
 478        raw_spin_lock_irq(&task->pi_lock);
 479
 480        /*
 481         * [2] Get the waiter on which @task is blocked.
 482         */
 483        waiter = task->pi_blocked_on;
 484
 485        /*
 486         * [3] check_exit_conditions_1() protected by task->pi_lock.
 487         */
 488
 489        /*
 490         * Check whether the end of the boosting chain has been
 491         * reached or the state of the chain has changed while we
 492         * dropped the locks.
 493         */
 494        if (!waiter)
 495                goto out_unlock_pi;
 496
 497        /*
 498         * Check the orig_waiter state. After we dropped the locks,
 499         * the previous owner of the lock might have released the lock.
 500         */
 501        if (orig_waiter && !rt_mutex_owner(orig_lock))
 502                goto out_unlock_pi;
 503
 504        /*
 505         * We dropped all locks after taking a refcount on @task, so
 506         * the task might have moved on in the lock chain or even left
 507         * the chain completely and may now block on an unrelated lock or
 508         * on @orig_lock.
 509         *
 510         * We stored the lock on which @task was blocked in @next_lock,
 511         * so we can detect the chain change.
 512         */
 513        if (next_lock != waiter->lock)
 514                goto out_unlock_pi;
 515
 516        /*
 517         * Drop out when the task has no waiters. Note that
 518         * top_waiter can be NULL when we are in deboost
 519         * mode!
 520         */
 521        if (top_waiter) {
 522                if (!task_has_pi_waiters(task))
 523                        goto out_unlock_pi;
 524                /*
 525                 * If deadlock detection is off, we stop here if we
 526                 * are not the top pi waiter of the task. If deadlock
 527                 * detection is enabled we continue, but stop the
 528                 * requeueing in the chain walk.
 529                 */
 530                if (top_waiter != task_top_pi_waiter(task)) {
 531                        if (!detect_deadlock)
 532                                goto out_unlock_pi;
 533                        else
 534                                requeue = false;
 535                }
 536        }
 537
 538        /*
 539         * If the waiter priority is the same as the task priority
 540         * then there is no further priority adjustment necessary.  If
 541         * deadlock detection is off, we stop the chain walk. If it's
 542         * enabled, we continue, but stop the requeueing in the chain
 543         * walk.
 544         */
 545        if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
 546                if (!detect_deadlock)
 547                        goto out_unlock_pi;
 548                else
 549                        requeue = false;
 550        }
 551
 552        /*
 553         * [4] Get the next lock
 554         */
 555        lock = waiter->lock;
 556        /*
 557         * [5] We need to trylock here as we are holding task->pi_lock,
 558         * which is the reverse lock order versus the other rtmutex
 559         * operations.
 560         */
 561        if (!raw_spin_trylock(&lock->wait_lock)) {
 562                raw_spin_unlock_irq(&task->pi_lock);
 563                cpu_relax();
 564                goto retry;
 565        }
 566
 567        /*
 568         * [6] check_exit_conditions_2() protected by task->pi_lock and
 569         * lock->wait_lock.
 570         *
 571         * Deadlock detection. If the lock is the same as the original
 572         * lock which caused us to walk the lock chain or if the
 573         * current lock is owned by the task which initiated the chain
 574         * walk, we detected a deadlock.
 575         */
 576        if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
 577                raw_spin_unlock(&lock->wait_lock);
 578                ret = -EDEADLK;
 579                goto out_unlock_pi;
 580        }
 581
 582        /*
 583         * If we just follow the lock chain for deadlock detection, no
 584         * need to do all the requeue operations. To avoid a truckload
 585         * of conditionals around the various places below, just do the
 586         * minimum chain walk checks.
 587         */
 588        if (!requeue) {
 589                /*
 590                 * No requeue [7] here. Just release @task [8].
 591                 */
 592                raw_spin_unlock(&task->pi_lock);
 593                put_task_struct(task);
 594
 595                /*
 596                 * [9] check_exit_conditions_3 protected by lock->wait_lock.
 597                 * If there is no owner of the lock, end of chain.
 598                 */
 599                if (!rt_mutex_owner(lock)) {
 600                        raw_spin_unlock_irq(&lock->wait_lock);
 601                        return 0;
 602                }
 603
 604                /* [10] Grab the next task, i.e. owner of @lock */
 605                task = get_task_struct(rt_mutex_owner(lock));
 606                raw_spin_lock(&task->pi_lock);
 607
 608                /*
 609                 * No requeue [11] here. We just do deadlock detection.
 610                 *
 611                 * [12] Store whether owner is blocked
 612                 * itself. Decision is made after dropping the locks
 613                 */
 614                next_lock = task_blocked_on_lock(task);
 615                /*
 616                 * Get the top waiter for the next iteration
 617                 */
 618                top_waiter = rt_mutex_top_waiter(lock);
 619
 620                /* [13] Drop locks */
 621                raw_spin_unlock(&task->pi_lock);
 622                raw_spin_unlock_irq(&lock->wait_lock);
 623
 624                /* If owner is not blocked, end of chain. */
 625                if (!next_lock)
 626                        goto out_put_task;
 627                goto again;
 628        }
 629
 630        /*
 631         * Store the current top waiter before doing the requeue
 632         * operation on @lock. We need it for the boost/deboost
 633         * decision below.
 634         */
 635        prerequeue_top_waiter = rt_mutex_top_waiter(lock);
 636
 637        /* [7] Requeue the waiter in the lock waiter tree. */
 638        rt_mutex_dequeue(lock, waiter);
 639
 640        /*
 641         * Update the waiter prio fields now that we're dequeued.
 642         *
 643         * These values can have changed through either:
 644         *
 645         *   sys_sched_set_scheduler() / sys_sched_setattr()
 646         *
 647         * or
 648         *
 649         *   DL CBS enforcement advancing the effective deadline.
 650         *
 651         * Even though pi_waiters also uses these fields, and that tree is only
 652         * updated in [11], we can do this here, since we hold [L], which
 653         * serializes all pi_waiters access and rb_erase() does not care about
 654         * the values of the node being removed.
 655         */
 656        waiter->prio = task->prio;
 657        waiter->deadline = task->dl.deadline;
 658
 659        rt_mutex_enqueue(lock, waiter);
 660
 661        /* [8] Release the task */
 662        raw_spin_unlock(&task->pi_lock);
 663        put_task_struct(task);
 664
 665        /*
 666         * [9] check_exit_conditions_3 protected by lock->wait_lock.
 667         *
 668         * We must abort the chain walk if there is no lock owner even
 669         * in the deadlock detection case, as we have nothing to
 670         * follow here. This is the end of the chain we are walking.
 671         */
 672        if (!rt_mutex_owner(lock)) {
 673                /*
 674                 * If the requeue [7] above changed the top waiter,
 675                 * then we need to wake the new top waiter up to try
 676                 * to get the lock.
 677                 */
 678                if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
 679                        wake_up_process(rt_mutex_top_waiter(lock)->task);
 680                raw_spin_unlock_irq(&lock->wait_lock);
 681                return 0;
 682        }
 683
 684        /* [10] Grab the next task, i.e. the owner of @lock */
 685        task = get_task_struct(rt_mutex_owner(lock));
 686        raw_spin_lock(&task->pi_lock);
 687
 688        /* [11] requeue the pi waiters if necessary */
 689        if (waiter == rt_mutex_top_waiter(lock)) {
 690                /*
 691                 * The waiter became the new top (highest priority)
 692                 * waiter on the lock. Replace the previous top waiter
 693                 * in the owner task's pi waiters tree with this waiter
 694                 * and adjust the priority of the owner.
 695                 */
 696                rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
 697                rt_mutex_enqueue_pi(task, waiter);
 698                rt_mutex_adjust_prio(task);
 699
 700        } else if (prerequeue_top_waiter == waiter) {
 701                /*
 702                 * The waiter was the top waiter on the lock, but is
 703                 * no longer the top priority waiter. Replace waiter in
 704                 * the owner task's pi waiters tree with the new top
 705                 * (highest priority) waiter and adjust the priority
 706                 * of the owner.
 707                 * The new top waiter is stored in @waiter so that
 708                 * @waiter == @top_waiter evaluates to true below and
 709                 * we continue to deboost the rest of the chain.
 710                 */
 711                rt_mutex_dequeue_pi(task, waiter);
 712                waiter = rt_mutex_top_waiter(lock);
 713                rt_mutex_enqueue_pi(task, waiter);
 714                rt_mutex_adjust_prio(task);
 715        } else {
 716                /*
 717                 * Nothing changed. No need to do any priority
 718                 * adjustment.
 719                 */
 720        }
 721
 722        /*
 723         * [12] check_exit_conditions_4() protected by task->pi_lock
 724         * and lock->wait_lock. The actual decisions are made after we
 725         * dropped the locks.
 726         *
 727         * Check whether the task which owns the current lock is pi
 728         * blocked itself. If yes we store a pointer to the lock for
 729         * the lock chain change detection above. After we dropped
 730         * task->pi_lock next_lock cannot be dereferenced anymore.
 731         */
 732        next_lock = task_blocked_on_lock(task);
 733        /*
 734         * Store the top waiter of @lock for the end of chain walk
 735         * decision below.
 736         */
 737        top_waiter = rt_mutex_top_waiter(lock);
 738
 739        /* [13] Drop the locks */
 740        raw_spin_unlock(&task->pi_lock);
 741        raw_spin_unlock_irq(&lock->wait_lock);
 742
 743        /*
 744         * Make the actual exit decisions [12], based on the stored
 745         * values.
 746         *
 747         * We reached the end of the lock chain. Stop right here. No
 748         * point in going back just to figure that out.
 749         */
 750        if (!next_lock)
 751                goto out_put_task;
 752
 753        /*
 754         * If the current waiter is not the top waiter on the lock,
 755         * then we can stop the chain walk here if we are not in full
 756         * deadlock detection mode.
 757         */
 758        if (!detect_deadlock && waiter != top_waiter)
 759                goto out_put_task;
 760
 761        goto again;
 762
 763 out_unlock_pi:
 764        raw_spin_unlock_irq(&task->pi_lock);
 765 out_put_task:
 766        put_task_struct(task);
 767
 768        return ret;
 769}
 770
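/*
 * Editor's sketch (not part of the upstream file): the classic ABBA pattern
 * which the deadlock check at step [6] above catches. Task A owns l1 and
 * blocks on l2, task B owns l2 and then blocks on l1:
 *
 *      Task A                  Task B
 *      lock(l1)
 *                              lock(l2)
 *      lock(l2)  // blocks, boosts B
 *                              lock(l1)  // blocks and walks the chain:
 *                                        // A is blocked on l2 and l2 is
 *                                        // owned by B (== top_task)
 *                                        //   => -EDEADLK
 *
 * For longer cycles the walk eventually arrives back at @orig_lock and the
 * lock == orig_lock test fires instead.
 */
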
 771/*
 772 * Try to take an rt-mutex
 773 *
 774 * Must be called with lock->wait_lock held and interrupts disabled
 775 *
 776 * @lock:   The lock to be acquired.
 777 * @task:   The task which wants to acquire the lock
 778 * @waiter: The waiter that is queued to the lock's wait tree if the
 779 *          callsite called task_blocks_on_rt_mutex(), otherwise NULL
 780 */
 781static int __sched
 782try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
 783                     struct rt_mutex_waiter *waiter)
 784{
 785        lockdep_assert_held(&lock->wait_lock);
 786
 787        /*
 788         * Before testing whether we can acquire @lock, we set the
 789         * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
 790         * other tasks which try to modify @lock into the slow path
 791         * and they serialize on @lock->wait_lock.
 792         *
 793         * The RT_MUTEX_HAS_WAITERS bit can have a transitional state
 794         * as explained at the top of this file if and only if:
 795         *
 796         * - There is a lock owner. The caller must fix up the
 797         *   transient state if it does a trylock or leaves the lock
 798         *   function due to a signal or timeout.
 799         *
 800         * - @task acquires the lock and there are no other
 801         *   waiters. This is undone in rt_mutex_set_owner(@task) at
 802         *   the end of this function.
 803         */
 804        mark_rt_mutex_waiters(lock);
 805
 806        /*
 807         * If @lock has an owner, give up.
 808         */
 809        if (rt_mutex_owner(lock))
 810                return 0;
 811
 812        /*
 813         * If @waiter != NULL, @task has already enqueued the waiter
 814         * into @lock's waiter tree. If @waiter == NULL then this is a
 815         * trylock attempt.
 816         */
 817        if (waiter) {
 818                /*
 819                 * If waiter is not the highest priority waiter of
 820                 * @lock, give up.
 821                 */
 822                if (waiter != rt_mutex_top_waiter(lock))
 823                        return 0;
 824
 825                /*
 826                 * We can acquire the lock. Remove the waiter from the
 827                 * lock waiters tree.
 828                 */
 829                rt_mutex_dequeue(lock, waiter);
 830
 831        } else {
 832                /*
 833                 * If the lock has waiters already we check whether @task is
 834                 * eligible to take over the lock.
 835                 *
 836                 * If there are no other waiters, @task can acquire
 837                 * the lock.  @task->pi_blocked_on is NULL, so it does
 838                 * not need to be dequeued.
 839                 */
 840                if (rt_mutex_has_waiters(lock)) {
 841                        /*
 842                         * If @task->prio is greater than or equal to
 843                         * the top waiter priority (kernel view),
 844                         * @task lost.
 845                         */
 846                        if (!rt_mutex_waiter_less(task_to_waiter(task),
 847                                                  rt_mutex_top_waiter(lock)))
 848                                return 0;
 849
 850                        /*
 851                         * The current top waiter stays enqueued. We
 852                         * don't have to change anything in the lock
 853                         * waiters order.
 854                         */
 855                } else {
 856                        /*
 857                         * No waiters. Take the lock without the
 858                          * pi_lock dance. @task->pi_blocked_on is NULL
 859                          * and we have no waiters to enqueue in @task's
 860                          * pi waiters tree.
 861                         */
 862                        goto takeit;
 863                }
 864        }
 865
 866        /*
 867         * Clear @task->pi_blocked_on. Requires protection by
 868         * @task->pi_lock. Redundant operation for the @waiter == NULL
 869         * case, but conditionals are more expensive than a redundant
 870         * store.
 871         */
 872        raw_spin_lock(&task->pi_lock);
 873        task->pi_blocked_on = NULL;
 874        /*
 875         * Finish the lock acquisition. @task is the new owner. If
 876         * other waiters exist we have to insert the highest priority
 877         * waiter into @task->pi_waiters tree.
 878         */
 879        if (rt_mutex_has_waiters(lock))
 880                rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
 881        raw_spin_unlock(&task->pi_lock);
 882
 883takeit:
 884        /*
 885         * This either preserves the RT_MUTEX_HAS_WAITERS bit if there
 886         * are still waiters or clears it.
 887         */
 888        rt_mutex_set_owner(lock, task);
 889
 890        return 1;
 891}
 892
 893/*
 894 * Task blocks on lock.
 895 *
 896 * Prepare waiter and propagate pi chain
 897 *
 898 * This must be called with lock->wait_lock held and interrupts disabled
 899 */
 900static int __sched task_blocks_on_rt_mutex(struct rt_mutex *lock,
 901                                           struct rt_mutex_waiter *waiter,
 902                                           struct task_struct *task,
 903                                           enum rtmutex_chainwalk chwalk)
 904{
 905        struct task_struct *owner = rt_mutex_owner(lock);
 906        struct rt_mutex_waiter *top_waiter = waiter;
 907        struct rt_mutex *next_lock;
 908        int chain_walk = 0, res;
 909
 910        lockdep_assert_held(&lock->wait_lock);
 911
 912        /*
 913         * Early deadlock detection. We really don't want the task to
 914         * enqueue on itself just to untangle the mess later. It's not
 915         * only an optimization. We drop the locks, so another waiter
 916         * can come in before the chain walk detects the deadlock. So
 917         * the other waiter would detect the deadlock and return -EDEADLK,
 918         * which is wrong, as that waiter is not actually in a deadlock
 919         * situation.
 920         */
 921        if (owner == task)
 922                return -EDEADLK;
 923
 924        raw_spin_lock(&task->pi_lock);
 925        waiter->task = task;
 926        waiter->lock = lock;
 927        waiter->prio = task->prio;
 928        waiter->deadline = task->dl.deadline;
 929
 930        /* Get the top priority waiter on the lock */
 931        if (rt_mutex_has_waiters(lock))
 932                top_waiter = rt_mutex_top_waiter(lock);
 933        rt_mutex_enqueue(lock, waiter);
 934
 935        task->pi_blocked_on = waiter;
 936
 937        raw_spin_unlock(&task->pi_lock);
 938
 939        if (!owner)
 940                return 0;
 941
 942        raw_spin_lock(&owner->pi_lock);
 943        if (waiter == rt_mutex_top_waiter(lock)) {
 944                rt_mutex_dequeue_pi(owner, top_waiter);
 945                rt_mutex_enqueue_pi(owner, waiter);
 946
 947                rt_mutex_adjust_prio(owner);
 948                if (owner->pi_blocked_on)
 949                        chain_walk = 1;
 950        } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
 951                chain_walk = 1;
 952        }
 953
 954        /* Store the lock on which owner is blocked or NULL */
 955        next_lock = task_blocked_on_lock(owner);
 956
 957        raw_spin_unlock(&owner->pi_lock);
 958        /*
 959         * Even if full deadlock detection is on, if the owner is not
 960         * blocked itself, we can avoid finding this out in the chain
 961         * walk.
 962         */
 963        if (!chain_walk || !next_lock)
 964                return 0;
 965
 966        /*
 967         * The owner can't disappear while holding a lock,
 968         * so the owner struct is protected by wait_lock.
 969         * Gets dropped in rt_mutex_adjust_prio_chain()!
 970         */
 971        get_task_struct(owner);
 972
 973        raw_spin_unlock_irq(&lock->wait_lock);
 974
 975        res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
 976                                         next_lock, waiter, task);
 977
 978        raw_spin_lock_irq(&lock->wait_lock);
 979
 980        return res;
 981}
 982
 983/*
 984 * Remove the top waiter from the current task's pi waiters tree and
 985 * queue it up for wakeup.
 986 *
 987 * Called with lock->wait_lock held and interrupts disabled.
 988 */
 989static void __sched mark_wakeup_next_waiter(struct wake_q_head *wake_q,
 990                                            struct rt_mutex *lock)
 991{
 992        struct rt_mutex_waiter *waiter;
 993
 994        raw_spin_lock(&current->pi_lock);
 995
 996        waiter = rt_mutex_top_waiter(lock);
 997
 998        /*
 999         * Remove it from current->pi_waiters and deboost.
1000         *
1001         * We must in fact deboost here in order to ensure we call
1002         * rt_mutex_setprio() to update p->pi_top_task before the
1003         * task unblocks.
1004         */
1005        rt_mutex_dequeue_pi(current, waiter);
1006        rt_mutex_adjust_prio(current);
1007
1008        /*
1009         * As we are waking up the top waiter, and the waiter stays
1010         * queued on the lock until it gets the lock, this lock
1011         * obviously has waiters. Just set the bit here and this has
1012         * the added benefit of forcing all new tasks into the
1013         * slow path making sure no task of lower priority than
1014         * the top waiter can steal this lock.
1015         */
1016        lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
1017
1018        /*
1019         * We deboosted before waking the top waiter task such that we don't
1020         * run two tasks with the 'same' priority (and ensure the
1021         * p->pi_top_task pointer points to a blocked task). This however can
1022         * lead to priority inversion if we would get preempted after the
1023         * deboost but before waking our donor task, hence the preempt_disable()
1024         * before unlock.
1025         *
1026         * Pairs with preempt_enable() in rt_mutex_postunlock();
1027         */
1028        preempt_disable();
1029        wake_q_add(wake_q, waiter->task);
1030        raw_spin_unlock(&current->pi_lock);
1031}
1032
1033/*
1034 * Remove a waiter from a lock and give up
1035 *
1036 * Must be called with lock->wait_lock held and interrupts disabled. The
1037 * caller must have just failed to take the lock via try_to_take_rt_mutex().
1038 */
1039static void __sched remove_waiter(struct rt_mutex *lock,
1040                                  struct rt_mutex_waiter *waiter)
1041{
1042        bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
1043        struct task_struct *owner = rt_mutex_owner(lock);
1044        struct rt_mutex *next_lock;
1045
1046        lockdep_assert_held(&lock->wait_lock);
1047
1048        raw_spin_lock(&current->pi_lock);
1049        rt_mutex_dequeue(lock, waiter);
1050        current->pi_blocked_on = NULL;
1051        raw_spin_unlock(&current->pi_lock);
1052
1053        /*
1054         * Only update priority if the waiter was the highest priority
1055         * waiter of the lock and there is an owner to update.
1056         */
1057        if (!owner || !is_top_waiter)
1058                return;
1059
1060        raw_spin_lock(&owner->pi_lock);
1061
1062        rt_mutex_dequeue_pi(owner, waiter);
1063
1064        if (rt_mutex_has_waiters(lock))
1065                rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
1066
1067        rt_mutex_adjust_prio(owner);
1068
1069        /* Store the lock on which owner is blocked or NULL */
1070        next_lock = task_blocked_on_lock(owner);
1071
1072        raw_spin_unlock(&owner->pi_lock);
1073
1074        /*
1075         * Don't walk the chain, if the owner task is not blocked
1076         * itself.
1077         */
1078        if (!next_lock)
1079                return;
1080
1081        /* gets dropped in rt_mutex_adjust_prio_chain()! */
1082        get_task_struct(owner);
1083
1084        raw_spin_unlock_irq(&lock->wait_lock);
1085
1086        rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
1087                                   next_lock, NULL, current);
1088
1089        raw_spin_lock_irq(&lock->wait_lock);
1090}
1091
1092/*
1093 * Recheck the pi chain, in case the task got a new priority setting
1094 *
1095 * Called from sched_setscheduler
1096 */
1097void __sched rt_mutex_adjust_pi(struct task_struct *task)
1098{
1099        struct rt_mutex_waiter *waiter;
1100        struct rt_mutex *next_lock;
1101        unsigned long flags;
1102
1103        raw_spin_lock_irqsave(&task->pi_lock, flags);
1104
1105        waiter = task->pi_blocked_on;
1106        if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
1107                raw_spin_unlock_irqrestore(&task->pi_lock, flags);
1108                return;
1109        }
1110        next_lock = waiter->lock;
1111        raw_spin_unlock_irqrestore(&task->pi_lock, flags);
1112
1113        /* gets dropped in rt_mutex_adjust_prio_chain()! */
1114        get_task_struct(task);
1115
1116        rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
1117                                   next_lock, NULL, task);
1118}
1119
1120void __sched rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
1121{
1122        debug_rt_mutex_init_waiter(waiter);
1123        RB_CLEAR_NODE(&waiter->pi_tree_entry);
1124        RB_CLEAR_NODE(&waiter->tree_entry);
1125        waiter->task = NULL;
1126}
1127
1128/**
1129 * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
1130 * @lock:                the rt_mutex to take
1131 * @state:               the state the task should block in (TASK_INTERRUPTIBLE
1132 *                       or TASK_UNINTERRUPTIBLE)
1133 * @timeout:             the pre-initialized and started timer, or NULL for none
1134 * @waiter:              the pre-initialized rt_mutex_waiter
1135 *
1136 * Must be called with lock->wait_lock held and interrupts disabled
1137 */
1138static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state,
1139                                       struct hrtimer_sleeper *timeout,
1140                                       struct rt_mutex_waiter *waiter)
1141{
1142        int ret = 0;
1143
1144        for (;;) {
1145                /* Try to acquire the lock: */
1146                if (try_to_take_rt_mutex(lock, current, waiter))
1147                        break;
1148
1149                if (timeout && !timeout->task) {
1150                        ret = -ETIMEDOUT;
1151                        break;
1152                }
1153                if (signal_pending_state(state, current)) {
1154                        ret = -EINTR;
1155                        break;
1156                }
1157
1158                raw_spin_unlock_irq(&lock->wait_lock);
1159
1160                schedule();
1161
1162                raw_spin_lock_irq(&lock->wait_lock);
1163                set_current_state(state);
1164        }
1165
1166        __set_current_state(TASK_RUNNING);
1167        return ret;
1168}
1169
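/*
 * Editor's note on the helper below: when the caller did not ask for deadlock
 * detection, returning -EDEADLK would be unexpected, so the helper warns
 * loudly and parks the task instead, keeping the deadlocked task visible to
 * debugging tools rather than papering over the problem.
 */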
1170static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
1171                                             struct rt_mutex_waiter *w)
1172{
1173        /*
1174         * If the result is not -EDEADLOCK or the caller requested
1175         * deadlock detection, nothing to do here.
1176         */
1177        if (res != -EDEADLOCK || detect_deadlock)
1178                return;
1179
1180        /*
1181         * Yell loudly and stop the task right here.
1182         */
1183        WARN(1, "rtmutex deadlock detected\n");
1184        while (1) {
1185                set_current_state(TASK_INTERRUPTIBLE);
1186                schedule();
1187        }
1188}
1189
1190/*
1191 * Slow path lock function:
1192 */
1193static int __sched rt_mutex_slowlock(struct rt_mutex *lock, unsigned int state,
1194                                     struct hrtimer_sleeper *timeout,
1195                                     enum rtmutex_chainwalk chwalk)
1196{
1197        struct rt_mutex_waiter waiter;
1198        unsigned long flags;
1199        int ret = 0;
1200
1201        rt_mutex_init_waiter(&waiter);
1202
1203        /*
1204         * Technically we could use raw_spin_[un]lock_irq() here, but this can
1205         * be called in early boot if the cmpxchg() fast path is disabled
1206         * (debug, no architecture support). In this case we will acquire the
1207         * rtmutex with lock->wait_lock held. But we cannot unconditionally
1208         * enable interrupts in that early boot case. So we need to use the
1209         * irqsave/restore variants.
1210         */
1211        raw_spin_lock_irqsave(&lock->wait_lock, flags);
1212
1213        /* Try to acquire the lock again: */
1214        if (try_to_take_rt_mutex(lock, current, NULL)) {
1215                raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1216                return 0;
1217        }
1218
1219        set_current_state(state);
1220
1221        /* Setup the timer, when timeout != NULL */
1222        if (unlikely(timeout))
1223                hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1224
1225        ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
1226
1227        if (likely(!ret))
1228                /* sleep on the mutex */
1229                ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
1230
1231        if (unlikely(ret)) {
1232                __set_current_state(TASK_RUNNING);
1233                remove_waiter(lock, &waiter);
1234                rt_mutex_handle_deadlock(ret, chwalk, &waiter);
1235        }
1236
1237        /*
1238         * try_to_take_rt_mutex() sets the waiter bit
1239         * unconditionally. We might have to fix that up.
1240         */
1241        fixup_rt_mutex_waiters(lock);
1242
1243        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1244
1245        /* Remove pending timer: */
1246        if (unlikely(timeout))
1247                hrtimer_cancel(&timeout->timer);
1248
1249        debug_rt_mutex_free_waiter(&waiter);
1250
1251        return ret;
1252}
1253
1254static int __sched __rt_mutex_slowtrylock(struct rt_mutex *lock)
1255{
1256        int ret = try_to_take_rt_mutex(lock, current, NULL);
1257
1258        /*
1259         * try_to_take_rt_mutex() sets the lock waiters bit
1260         * unconditionally. Clean this up.
1261         */
1262        fixup_rt_mutex_waiters(lock);
1263
1264        return ret;
1265}
1266
1267/*
1268 * Slow path try-lock function:
1269 */
1270static int __sched rt_mutex_slowtrylock(struct rt_mutex *lock)
1271{
1272        unsigned long flags;
1273        int ret;
1274
1275        /*
1276         * If the lock already has an owner we fail to get the lock.
1277         * This can be done without taking the @lock->wait_lock as
1278         * it is only being read, and this is a trylock anyway.
1279         */
1280        if (rt_mutex_owner(lock))
1281                return 0;
1282
1283        /*
1284         * The mutex currently has no owner. Lock the wait lock and try to
1285         * acquire the lock. We use irqsave here to support early boot calls.
1286         */
1287        raw_spin_lock_irqsave(&lock->wait_lock, flags);
1288
1289        ret = __rt_mutex_slowtrylock(lock);
1290
1291        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1292
1293        return ret;
1294}
1295
1296/*
1297 * Performs the wakeup of the top-waiter and re-enables preemption.
1298 */
1299void __sched rt_mutex_postunlock(struct wake_q_head *wake_q)
1300{
1301        wake_up_q(wake_q);
1302
1303        /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
1304        preempt_enable();
1305}
1306
1307/*
1308 * Slow path to release a rt-mutex.
1309 *
1310 * Performs the wakeup of the top waiter via rt_mutex_postunlock() itself.
1311 */
1312static void __sched rt_mutex_slowunlock(struct rt_mutex *lock)
1313{
1314        DEFINE_WAKE_Q(wake_q);
1315        unsigned long flags;
1316
1317        /* irqsave required to support early boot calls */
1318        raw_spin_lock_irqsave(&lock->wait_lock, flags);
1319
1320        debug_rt_mutex_unlock(lock);
1321
1322        /*
1323         * We must be careful here if the fast path is enabled. If we
1324         * have no waiters queued we cannot set owner to NULL here
1325         * because of:
1326         *
1327         * foo->lock->owner = NULL;
1328         *                      rtmutex_lock(foo->lock);   <- fast path
1329         *                      free = atomic_dec_and_test(foo->refcnt);
1330         *                      rtmutex_unlock(foo->lock); <- fast path
1331         *                      if (free)
1332         *                              kfree(foo);
1333         * raw_spin_unlock(foo->lock->wait_lock);
1334         *
1335         * So for the fastpath enabled kernel:
1336         *
1337         * Nothing can set the waiters bit as long as we hold
1338         * lock->wait_lock. So we do the following sequence:
1339         *
1340         *      owner = rt_mutex_owner(lock);
1341         *      clear_rt_mutex_waiters(lock);
1342         *      raw_spin_unlock(&lock->wait_lock);
1343         *      if (cmpxchg(&lock->owner, owner, 0) == owner)
1344         *              return;
1345         *      goto retry;
1346         *
1347         * The fastpath disabled variant is simple as all access to
1348         * lock->owner is serialized by lock->wait_lock:
1349         *
1350         *      lock->owner = NULL;
1351         *      raw_spin_unlock(&lock->wait_lock);
1352         */
1353        while (!rt_mutex_has_waiters(lock)) {
1354                /* Drops lock->wait_lock! */
1355                if (unlock_rt_mutex_safe(lock, flags))
1356                        return;
1357                /* Relock the rtmutex and try again */
1358                raw_spin_lock_irqsave(&lock->wait_lock, flags);
1359        }
1360
1361        /*
1362         * The wakeup next waiter path does not suffer from the above
1363         * race. See the comments there.
1364         *
1365         * Queue the next waiter for wakeup once we release the wait_lock.
1366         */
1367        mark_wakeup_next_waiter(&wake_q, lock);
1368        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1369
1370        rt_mutex_postunlock(&wake_q);
1371}
1372
1373/*
1374 * debug aware fast / slowpath lock, trylock, unlock
1375 *
1376 * The atomic acquire/release ops are compiled away, when either the
1377 * architecture does not support cmpxchg or when debugging is enabled.
1378 */
1379static __always_inline int __rt_mutex_lock(struct rt_mutex *lock, long state,
1380                                           unsigned int subclass)
1381{
1382        int ret;
1383
1384        might_sleep();
1385        mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
1386
1387        if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
1388                return 0;
1389
1390        ret = rt_mutex_slowlock(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
1391        if (ret)
1392                mutex_release(&lock->dep_map, _RET_IP_);
1393        return ret;
1394}
1395
1396#ifdef CONFIG_DEBUG_LOCK_ALLOC
1397/**
1398 * rt_mutex_lock_nested - lock a rt_mutex
1399 *
1400 * @lock: the rt_mutex to be locked
1401 * @subclass: the lockdep subclass
1402 */
1403void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
1404{
1405        __rt_mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass);
1406}
1407EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
1408
1409#else /* !CONFIG_DEBUG_LOCK_ALLOC */
1410
1411/**
1412 * rt_mutex_lock - lock a rt_mutex
1413 *
1414 * @lock: the rt_mutex to be locked
1415 */
1416void __sched rt_mutex_lock(struct rt_mutex *lock)
1417{
1418        __rt_mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0);
1419}
1420EXPORT_SYMBOL_GPL(rt_mutex_lock);
1421#endif
1422
1423/**
1424 * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
1425 *
1426 * @lock:               the rt_mutex to be locked
1427 *
1428 * Returns:
1429 *  0           on success
1430 * -EINTR       when interrupted by a signal
1431 */
1432int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
1433{
1434        return __rt_mutex_lock(lock, TASK_INTERRUPTIBLE, 0);
1435}
1436EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
1437
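/*
 * Editor's usage sketch (hypothetical names, not part of the upstream file):
 * the interruptible variant with the -EINTR case handled by the caller.
 */
static int __maybe_unused example_lock_interruptible(struct rt_mutex *lock)
{
        int ret = rt_mutex_lock_interruptible(lock);

        if (ret)                /* -EINTR: a signal arrived while blocked */
                return ret;
        /* ... critical section ... */
        rt_mutex_unlock(lock);
        return 0;
}
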
1438/**
1439 * rt_mutex_trylock - try to lock a rt_mutex
1440 *
1441 * @lock:       the rt_mutex to be locked
1442 *
1443 * This function can only be called in thread context. It's safe to call it
1444 * from atomic regions, but not from hard or soft interrupt context.
1445 *
1446 * Returns:
1447 *  1 on success
1448 *  0 on contention
1449 */
1450int __sched rt_mutex_trylock(struct rt_mutex *lock)
1451{
1452        int ret;
1453
1454        if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
1455                return 0;
1456
1457        /*
1458         * No lockdep annotation required because lockdep disables the fast
1459         * path.
1460         */
1461        if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
1462                return 1;
1463
1464        ret = rt_mutex_slowtrylock(lock);
1465        if (ret)
1466                mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
1467
1468        return ret;
1469}
1470EXPORT_SYMBOL_GPL(rt_mutex_trylock);
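
/*
 * Illustrative usage sketch (editor's addition): rt_mutex_trylock() suits
 * opportunistic paths that must not block; on contention the caller picks a
 * fallback instead of sleeping. dev, my_dev_flush() and the delayed work are
 * hypothetical.
 *
 *	if (rt_mutex_trylock(&dev->lock)) {
 *		my_dev_flush(dev);
 *		rt_mutex_unlock(&dev->lock);
 *	} else {
 *		schedule_delayed_work(&dev->flush_work, 1);
 *	}
 *
 * Note the restriction documented above: thread context only, never hard or
 * soft interrupt context.
 */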
1471
1472/**
1473 * rt_mutex_unlock - unlock a rt_mutex
1474 *
1475 * @lock: the rt_mutex to be unlocked
1476 */
1477void __sched rt_mutex_unlock(struct rt_mutex *lock)
1478{
1479        mutex_release(&lock->dep_map, _RET_IP_);
1480        if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
1481                return;
1482
1483        rt_mutex_slowunlock(lock);
1484}
1485EXPORT_SYMBOL_GPL(rt_mutex_unlock);
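
/*
 * Illustrative usage sketch (editor's addition): the common pattern simply
 * brackets a critical section; priority inheritance is handled entirely in
 * the slow paths above, so callers look just like plain mutex users. dev and
 * my_dev_update() are hypothetical.
 *
 *	rt_mutex_lock(&dev->lock);
 *	my_dev_update(dev);
 *	rt_mutex_unlock(&dev->lock);
 */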
1486
1487/*
1488 * Futex variants; they must not use the fastpath.
1489 */
1490int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
1491{
1492        return rt_mutex_slowtrylock(lock);
1493}
1494
1495int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
1496{
1497        return __rt_mutex_slowtrylock(lock);
1498}
1499
1500/**
1501 * __rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock. Since futex
1502 * variants do not use the fast path, it can be simple and need not retry.
1503 *
1504 * @lock:       The rt_mutex to be unlocked
1505 * @wake_q:     The wake queue head from which to get the next lock waiter
1506 */
1507bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
1508                                     struct wake_q_head *wake_q)
1509{
1510        lockdep_assert_held(&lock->wait_lock);
1511
1512        debug_rt_mutex_unlock(lock);
1513
1514        if (!rt_mutex_has_waiters(lock)) {
1515                lock->owner = NULL;
1516                return false; /* done */
1517        }
1518
1519        /*
1520         * We've already deboosted, mark_wakeup_next_waiter() will
1521         * retain preempt_disabled when we drop the wait_lock, to
1522         * avoid inversion prior to the wakeup.  preempt_disable()
1523         * therein pairs with rt_mutex_postunlock().
1524         */
1525        mark_wakeup_next_waiter(wake_q, lock);
1526
1527        return true; /* call postunlock() */
1528}
1529
1530void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
1531{
1532        DEFINE_WAKE_Q(wake_q);
1533        unsigned long flags;
1534        bool postunlock;
1535
1536        raw_spin_lock_irqsave(&lock->wait_lock, flags);
1537        postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
1538        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1539
1540        if (postunlock)
1541                rt_mutex_postunlock(&wake_q);
1542}
1543
1544/**
1545 * __rt_mutex_init - initialize the rt_mutex
1546 *
1547 * @lock:       The rt_mutex to be initialized
1548 * @name:       The lock name used for debugging
1549 * @key:        The lock class key used for debugging
1550 *
1551 * Initialize the rt_mutex to the unlocked state.
1552 *
1553 * Initializing a locked rt_mutex is not allowed.
1554 */
1555void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name,
1556                     struct lock_class_key *key)
1557{
1558        debug_check_no_locks_freed((void *)lock, sizeof(*lock));
1559        lockdep_init_map(&lock->dep_map, name, key, 0);
1560
1561        __rt_mutex_basic_init(lock);
1562}
1563EXPORT_SYMBOL_GPL(__rt_mutex_init);
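
/*
 * Illustrative usage sketch (editor's addition): callers normally reach
 * __rt_mutex_init() through the rt_mutex_init() wrapper, which supplies a
 * lock_class_key per init site; statically allocated locks can use
 * DEFINE_RT_MUTEX(). struct my_dev and my_dev_probe() are hypothetical.
 *
 *	struct my_dev {
 *		struct rt_mutex lock;
 *	};
 *
 *	static int my_dev_probe(struct my_dev *dev)
 *	{
 *		rt_mutex_init(&dev->lock);
 *		return 0;
 *	}
 */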
1564
1565/**
1566 * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
1567 *                              proxy owner
1568 *
1569 * @lock:       the rt_mutex to be locked
1570 * @proxy_owner: the task to set as owner
1571 *
1572 * No locking. The caller has to do the serializing itself.
1573 *
1574 * Special API call for PI-futex support. This initializes the rtmutex and
1575 * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
1576 * possible at this point because the pi_state which contains the rtmutex
1577 * is not yet visible to other tasks.
1578 */
1579void __sched rt_mutex_init_proxy_locked(struct rt_mutex *lock,
1580                                        struct task_struct *proxy_owner)
1581{
1582        __rt_mutex_basic_init(lock);
1583        rt_mutex_set_owner(lock, proxy_owner);
1584}
1585
1586/**
1587 * rt_mutex_proxy_unlock - release a lock on behalf of owner
1588 *
1589 * @lock:       the rt_mutex to be unlocked
1590 *
1591 * No locking. The caller has to do the serializing itself.
1592 *
1593 * Special API call for PI-futex support. This merrily cleans up the rtmutex
1594 * (debugging) state. Concurrent operations on this rt_mutex are not
1595 * possible because it belongs to the pi_state which is about to be freed
1596 * and it is no longer visible to other tasks.
1597 */
1598void __sched rt_mutex_proxy_unlock(struct rt_mutex *lock)
1599{
1600        debug_rt_mutex_proxy_unlock(lock);
1601        rt_mutex_set_owner(lock, NULL);
1602}
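
/*
 * Illustrative pairing (editor's sketch): the PI-futex code initializes the
 * rtmutex embedded in a pi_state with a proxy owner before that pi_state is
 * published, and cleans it up again once it is private and about to be
 * freed. pi_state and owner below are stand-ins for the futex-side objects.
 *
 *	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, owner);
 *	...
 *	rt_mutex_proxy_unlock(&pi_state->pi_mutex);
 */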
1603
1604/**
1605 * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
1606 * @lock:               the rt_mutex to take
1607 * @waiter:             the pre-initialized rt_mutex_waiter
1608 * @task:               the task to prepare
1609 *
1610 * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
1611 * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
1612 *
1613 * NOTE: does _NOT_ remove the @waiter on failure; the caller must either call
1614 * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
1615 *
1616 * Returns:
1617 *  0 - task blocked on lock
1618 *  1 - acquired the lock for task, caller should wake it up
1619 * <0 - error
1620 *
1621 * Special API call for PI-futex support.
1622 */
1623int __sched __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1624                                        struct rt_mutex_waiter *waiter,
1625                                        struct task_struct *task)
1626{
1627        int ret;
1628
1629        lockdep_assert_held(&lock->wait_lock);
1630
1631        if (try_to_take_rt_mutex(lock, task, NULL))
1632                return 1;
1633
1634        /* We enforce deadlock detection for futexes */
1635        ret = task_blocks_on_rt_mutex(lock, waiter, task,
1636                                      RT_MUTEX_FULL_CHAINWALK);
1637
1638        if (ret && !rt_mutex_owner(lock)) {
1639                /*
1640                 * Reset the return value. We might have
1641                 * returned with -EDEADLK and the owner
1642                 * released the lock while we were walking the
1643                 * pi chain.  Let the waiter sort it out.
1644                 */
1645                ret = 0;
1646        }
1647
1648        return ret;
1649}
1650
1651/**
1652 * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
1653 * @lock:               the rt_mutex to take
1654 * @waiter:             the pre-initialized rt_mutex_waiter
1655 * @task:               the task to prepare
1656 *
1657 * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
1658 * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
1659 *
1660 * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
1661 * on failure.
1662 *
1663 * Returns:
1664 *  0 - task blocked on lock
1665 *  1 - acquired the lock for task, caller should wake it up
1666 * <0 - error
1667 *
1668 * Special API call for PI-futex support.
1669 */
1670int __sched rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1671                                      struct rt_mutex_waiter *waiter,
1672                                      struct task_struct *task)
1673{
1674        int ret;
1675
1676        raw_spin_lock_irq(&lock->wait_lock);
1677        ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
1678        if (unlikely(ret))
1679                remove_waiter(lock, waiter);
1680        raw_spin_unlock_irq(&lock->wait_lock);
1681
1682        return ret;
1683}
1684
1685/**
1686 * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
1687 * @lock:               the rt_mutex we were woken on
1688 * @to:                 the timeout, NULL if none. The hrtimer should already
1689 *                      have been started.
1690 * @waiter:             the pre-initialized rt_mutex_waiter
1691 *
1692 * Wait for the lock acquisition started on our behalf by
1693 * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
1694 * rt_mutex_cleanup_proxy_lock().
1695 *
1696 * Returns:
1697 *  0 - success
1698 * <0 - error, one of -EINTR, -ETIMEDOUT
1699 *
1700 * Special API call for PI-futex support
1701 */
1702int __sched rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
1703                                     struct hrtimer_sleeper *to,
1704                                     struct rt_mutex_waiter *waiter)
1705{
1706        int ret;
1707
1708        raw_spin_lock_irq(&lock->wait_lock);
1709        /* sleep on the mutex */
1710        set_current_state(TASK_INTERRUPTIBLE);
1711        ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1712        /*
1713         * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
1714         * have to fix that up.
1715         */
1716        fixup_rt_mutex_waiters(lock);
1717        raw_spin_unlock_irq(&lock->wait_lock);
1718
1719        return ret;
1720}
1721
1722/**
1723 * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
1724 * @lock:               the rt_mutex we were woken on
1725 * @waiter:             the pre-initialized rt_mutex_waiter
1726 *
1727 * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
1728 * rt_mutex_wait_proxy_lock().
1729 *
1730 * Unless we acquired the lock, we're still enqueued on the wait-list and can
1731 * in fact still be granted ownership until we're removed. Therefore we can
1732 * find we are in fact the owner and must disregard the
1733 * rt_mutex_wait_proxy_lock() failure.
1734 *
1735 * Returns:
1736 *  true  - did the cleanup, we are done.
1737 *  false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
1738 *          the caller should disregard its return value.
1739 *
1740 * Special API call for PI-futex support
1741 */
1742bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
1743                                         struct rt_mutex_waiter *waiter)
1744{
1745        bool cleanup = false;
1746
1747        raw_spin_lock_irq(&lock->wait_lock);
1748        /*
1749         * Do an unconditional try-lock, this deals with the lock stealing
1750         * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
1751         * sets a NULL owner.
1752         *
1753         * We're not interested in the return value, because the subsequent
1754         * test on rt_mutex_owner() will infer that. If the trylock succeeded,
1755         * we will own the lock and it will have removed the waiter. If we
1756         * failed the trylock, we're still not owner and we need to remove
1757         * ourselves.
1758         */
1759        try_to_take_rt_mutex(lock, current, waiter);
1760        /*
1761         * Unless we're the owner, we're still enqueued on the wait_list.
1762         * So check if we became the owner; if not, take us off the wait_list.
1763         */
1764        if (rt_mutex_owner(lock) != current) {
1765                remove_waiter(lock, waiter);
1766                cleanup = true;
1767        }
1768        /*
1769         * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
1770         * have to fix that up.
1771         */
1772        fixup_rt_mutex_waiters(lock);
1773
1774        raw_spin_unlock_irq(&lock->wait_lock);
1775
1776        return cleanup;
1777}
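
/*
 * Illustrative sequence (editor's sketch): a requeue-PI style user consumes
 * the proxy API above in three steps. Error handling is elided; "waiter" is
 * a caller-allocated, pre-initialized rt_mutex_waiter and "task" is the task
 * being proxied.
 *
 *	raw_spin_lock_irq(&lock->wait_lock);
 *	ret = __rt_mutex_start_proxy_lock(lock, &waiter, task);
 *	raw_spin_unlock_irq(&lock->wait_lock);
 *
 *	... later, running in the context of "task", assuming it blocked ...
 *
 *	ret = rt_mutex_wait_proxy_lock(lock, to, &waiter);
 *	if (ret && !rt_mutex_cleanup_proxy_lock(lock, &waiter))
 *		ret = 0;	(we raced and own the lock after all)
 *
 * As the kerneldoc above notes, a failed __rt_mutex_start_proxy_lock() must
 * still be followed by rt_mutex_wait_proxy_lock() or
 * rt_mutex_cleanup_proxy_lock().
 */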
1778
1779#ifdef CONFIG_DEBUG_RT_MUTEXES
1780void rt_mutex_debug_task_free(struct task_struct *task)
1781{
1782        DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
1783        DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
1784}
1785#endif
1786