linux/kernel/rcutree.h
/*
 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
 * Internal non-public definitions.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright IBM Corporation, 2008
 *
 * Author: Ingo Molnar <mingo@elte.hu>
 *         Paul E. McKenney <paulmck@linux.vnet.ibm.com>
 */

#include <linux/cache.h>
#include <linux/spinlock.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>

/*
 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
 * In theory, it should be possible to add more levels straightforwardly.
 * In practice, this did work well going from three levels to four.
 * Of course, your mileage may vary.
 */
#define MAX_RCU_LVLS 4
#if CONFIG_RCU_FANOUT > 16
#define RCU_FANOUT_LEAF       16
#else /* #if CONFIG_RCU_FANOUT > 16 */
#define RCU_FANOUT_LEAF       (CONFIG_RCU_FANOUT)
#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
#define RCU_FANOUT_1          (RCU_FANOUT_LEAF)
#define RCU_FANOUT_2          (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_3          (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
#define RCU_FANOUT_4          (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)

#if NR_CPUS <= RCU_FANOUT_1
#  define NUM_RCU_LVLS        1
#  define NUM_RCU_LVL_0       1
#  define NUM_RCU_LVL_1       (NR_CPUS)
#  define NUM_RCU_LVL_2       0
#  define NUM_RCU_LVL_3       0
#  define NUM_RCU_LVL_4       0
#elif NR_CPUS <= RCU_FANOUT_2
#  define NUM_RCU_LVLS        2
#  define NUM_RCU_LVL_0       1
#  define NUM_RCU_LVL_1       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
#  define NUM_RCU_LVL_2       (NR_CPUS)
#  define NUM_RCU_LVL_3       0
#  define NUM_RCU_LVL_4       0
#elif NR_CPUS <= RCU_FANOUT_3
#  define NUM_RCU_LVLS        3
#  define NUM_RCU_LVL_0       1
#  define NUM_RCU_LVL_1       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
#  define NUM_RCU_LVL_2       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
#  define NUM_RCU_LVL_3       (NR_CPUS)
#  define NUM_RCU_LVL_4       0
#elif NR_CPUS <= RCU_FANOUT_4
#  define NUM_RCU_LVLS        4
#  define NUM_RCU_LVL_0       1
#  define NUM_RCU_LVL_1       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
#  define NUM_RCU_LVL_2       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
#  define NUM_RCU_LVL_3       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
#  define NUM_RCU_LVL_4       (NR_CPUS)
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */

#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
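
/*
 * Worked example (editorial illustration, not in the original source):
 * with NR_CPUS == 64 and CONFIG_RCU_FANOUT == 16, RCU_FANOUT_1 == 16
 * and RCU_FANOUT_2 == 256, so the NR_CPUS <= RCU_FANOUT_2 branch above
 * applies:
 *
 *      NUM_RCU_LVLS  == 2
 *      NUM_RCU_LVL_0 == 1                            (single root rcu_node)
 *      NUM_RCU_LVL_1 == DIV_ROUND_UP(64, 16) == 4    (leaf rcu_nodes)
 *      NUM_RCU_LVL_2 == 64                           (one rcu_data per CPU)
 *      RCU_SUM       == 1 + 4 + 64 == 69
 *      NUM_RCU_NODES == 69 - 64 == 5                 (1 root + 4 leaves)
 */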

/*
 * Dynticks per-CPU state.
 */
struct rcu_dynticks {
        int dynticks_nesting;   /* Track nesting level, sort of. */
        int dynticks;           /* Even value for dynticks-idle, else odd. */
        int dynticks_nmi;       /* Even value for either dynticks-idle or */
                                /*  not in nmi handler, else odd.  So this */
                                /*  remains even for nmi from irq handler. */
};
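
/*
 * Illustrative sketch (not part of this header): how the ->dynticks
 * parity rule works.  The real transitions live in rcu_enter_nohz()
 * and rcu_exit_nohz() in rcutree.c, which also manage
 * ->dynticks_nesting, interrupts, and memory ordering; the example_*
 * helpers below are hypothetical and never compiled.
 */
#if 0
static void example_enter_dynticks_idle(struct rcu_dynticks *rdtp)
{
        rdtp->dynticks++;                       /* Odd -> even: idle. */
        WARN_ON_ONCE(rdtp->dynticks & 0x1);     /* Must now be even. */
}

static void example_exit_dynticks_idle(struct rcu_dynticks *rdtp)
{
        rdtp->dynticks++;                       /* Even -> odd: active. */
        WARN_ON_ONCE(!(rdtp->dynticks & 0x1));  /* Must now be odd. */
}
#endif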

/*
 * Definition for node within the RCU grace-period-detection hierarchy.
 */
struct rcu_node {
        raw_spinlock_t lock;    /* Root rcu_node's lock protects some */
                                /*  rcu_state fields as well as following. */
        unsigned long gpnum;    /* Current grace period for this node. */
                                /*  This will either be equal to or one */
                                /*  behind the root rcu_node's gpnum. */
        unsigned long completed; /* Last GP completed for this node. */
                                /*  This will either be equal to or one */
                                /*  behind the root rcu_node's completed. */
        unsigned long qsmask;   /* CPUs or groups that need to switch in */
                                /*  order for current grace period to proceed.*/
                                /*  In leaf rcu_node, each bit corresponds to */
                                /*  an rcu_data structure, otherwise, each */
                                /*  bit corresponds to a child rcu_node */
                                /*  structure. */
        unsigned long expmask;  /* Groups that have ->blocked_tasks[] */
                                /*  elements that need to drain to allow the */
                                /*  current expedited grace period to */
                                /*  complete (only for TREE_PREEMPT_RCU). */
        unsigned long qsmaskinit;
                                /* Per-GP initial value for qsmask & expmask. */
        unsigned long grpmask;  /* Mask to apply to parent qsmask. */
                                /*  Only one bit will be set in this mask. */
        int     grplo;          /* Lowest-numbered CPU or group here. */
        int     grphi;          /* Highest-numbered CPU or group here. */
        u8      grpnum;         /* CPU/group number for next level up. */
        u8      level;          /* Root is at level 0. */
        struct rcu_node *parent;
        struct list_head blocked_tasks[4];
                                /* Tasks blocked in RCU read-side critsect. */
                                /*  Grace period number x is blocked by the */
                                /*  tasks queued on element (x & 0x1) of */
                                /*  this array. */
} ____cacheline_internodealigned_in_smp;
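
/*
 * Illustrative sketch (not part of this header): how quiescent states
 * propagate up the tree via ->qsmask.  This mirrors the core loop of
 * rcu_report_qs_rnp() in rcutree.c but omits the per-node locking and
 * the grace-period-number checks; example_report_qs() is hypothetical
 * and never compiled.
 */
#if 0
static void example_report_qs(struct rcu_node *rnp, unsigned long mask)
{
        for (;;) {
                rnp->qsmask &= ~mask;   /* These CPUs/groups are done. */
                if (rnp->qsmask != 0 || rnp->parent == NULL)
                        break;          /* Still waiting, or at root. */
                mask = rnp->grpmask;    /* This node's bit in its parent. */
                rnp = rnp->parent;      /* All children done: go up. */
        }
}
#endif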

/*
 * Do a full breadth-first scan of the rcu_node structures for the
 * specified rcu_state structure.
 */
#define rcu_for_each_node_breadth_first(rsp, rnp) \
        for ((rnp) = &(rsp)->node[0]; \
             (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)

/*
 * Do a breadth-first scan of the non-leaf rcu_node structures for the
 * specified rcu_state structure.  Note that if the rcu_node tree
 * consists of a single rcu_node structure, this loop is a no-op.
 */
#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
        for ((rnp) = &(rsp)->node[0]; \
             (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)

/*
 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
 * structure.  Note that if the rcu_node tree consists of a single
 * rcu_node structure, this loop -will- visit that structure: it is
 * still a leaf node, even though it is also the root node.
 */
#define rcu_for_each_leaf_node(rsp, rnp) \
        for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
             (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
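
/*
 * Illustrative sketch (not part of this header): typical use of the
 * traversal macros above.  struct rcu_state is defined later in this
 * file; example_scan_leaves() is hypothetical and never compiled.
 */
#if 0
static void example_scan_leaves(struct rcu_state *rsp)
{
        struct rcu_node *rnp;

        rcu_for_each_leaf_node(rsp, rnp)
                if (rnp->qsmask != 0)   /* Some CPU here still owes a QS. */
                        printk(KERN_INFO "CPUs %d-%d holding up the GP\n",
                               rnp->grplo, rnp->grphi);
}
#endif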

/* Index values for nxttail array in struct rcu_data. */
#define RCU_DONE_TAIL           0       /* Also RCU_WAIT head. */
#define RCU_WAIT_TAIL           1       /* Also RCU_NEXT_READY head. */
#define RCU_NEXT_READY_TAIL     2       /* Also RCU_NEXT head. */
#define RCU_NEXT_TAIL           3
#define RCU_NEXT_SIZE           4

/* Per-CPU data for read-copy update. */
struct rcu_data {
        /* 1) quiescent-state and grace-period handling: */
        unsigned long   completed;      /* Track rsp->completed gp number */
                                        /*  in order to detect GP end. */
        unsigned long   gpnum;          /* Highest gp number that this CPU */
                                        /*  is aware of having started. */
        unsigned long   passed_quiesc_completed;
                                        /* Value of completed at time of qs. */
        bool            passed_quiesc;  /* User-mode/idle loop etc. */
        bool            qs_pending;     /* Core waits for quiesc state. */
        bool            beenonline;     /* CPU online at least once. */
        bool            preemptable;    /* Preemptable RCU? */
        struct rcu_node *mynode;        /* This CPU's leaf of hierarchy. */
        unsigned long grpmask;          /* Mask to apply to leaf qsmask. */

        /* 2) batch handling */
        /*
         * If nxtlist is not NULL, it is partitioned as follows.
         * Any of the partitions might be empty, in which case the
         * pointer to that partition will be equal to the pointer for
         * the following partition.  When the list is empty, all of
         * the nxttail elements point to the ->nxtlist pointer itself,
         * which in that case is NULL.
         *
         * [nxtlist, *nxttail[RCU_DONE_TAIL]):
         *      Entries whose batch # <= ->completed.
         *      The grace period for these entries has completed, and
         *      the other grace-period-completed entries may be moved
         *      here temporarily in rcu_process_callbacks().
         * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
         *      Entries whose batch # <= ->completed - 1: waiting for
         *      the current GP.
         * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
         *      Entries known to have arrived before the current GP ended.
         * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
         *      Entries that might have arrived after the current GP ended.
         *      Note that the value of *nxttail[RCU_NEXT_TAIL] will
         *      always be NULL, as this is the end of the list.
         *
         * (A simplified enqueue sketch follows this structure.)
         */
        struct rcu_head *nxtlist;
        struct rcu_head **nxttail[RCU_NEXT_SIZE];
        long            qlen;           /* # of queued callbacks */
        long            qlen_last_fqs_check;
                                        /* qlen at last check for QS forcing */
        unsigned long   n_cbs_invoked;  /* Count of RCU cbs invoked. */
        unsigned long   n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
        unsigned long   n_cbs_adopted;  /* RCU cbs adopted from dying CPU */
        unsigned long   n_force_qs_snap;
                                        /* Did other CPU force QS recently? */
        long            blimit;         /* Upper limit on a processed batch */

#ifdef CONFIG_NO_HZ
        /* 3) dynticks interface. */
        struct rcu_dynticks *dynticks;  /* Shared per-CPU dynticks state. */
        int dynticks_snap;              /* Per-GP tracking for dynticks. */
        int dynticks_nmi_snap;          /* Per-GP tracking for dynticks_nmi. */
#endif /* #ifdef CONFIG_NO_HZ */

        /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
#ifdef CONFIG_NO_HZ
        unsigned long dynticks_fqs;     /* Kicked due to dynticks idle. */
#endif /* #ifdef CONFIG_NO_HZ */
        unsigned long offline_fqs;      /* Kicked due to being offline. */
        unsigned long resched_ipi;      /* Sent a resched IPI. */

        /* 5) __rcu_pending() statistics. */
        unsigned long n_rcu_pending;    /* rcu_pending() calls since boot. */
        unsigned long n_rp_qs_pending;
        unsigned long n_rp_report_qs;
        unsigned long n_rp_cb_ready;
        unsigned long n_rp_cpu_needs_gp;
        unsigned long n_rp_gp_completed;
        unsigned long n_rp_gp_started;
        unsigned long n_rp_need_fqs;
        unsigned long n_rp_need_nothing;

        int cpu;
};
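
/*
 * Illustrative sketch (not part of this header): the simplified
 * callback enqueue referenced in the ->nxtlist comment above.  New
 * callbacks always go at the very end of the list, in the RCU_NEXT
 * segment, as __call_rcu() does in rcutree.c; this version omits
 * interrupt disabling and the checks that may start or force a grace
 * period.  example_enqueue() is hypothetical and never compiled.
 */
#if 0
static void example_enqueue(struct rcu_data *rdp, struct rcu_head *head)
{
        head->next = NULL;
        *rdp->nxttail[RCU_NEXT_TAIL] = head;       /* Append at list end. */
        rdp->nxttail[RCU_NEXT_TAIL] = &head->next; /* Advance the tail. */
        rdp->qlen++;
}
#endif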

/* Values for signaled field in struct rcu_state. */
#define RCU_GP_IDLE             0       /* No grace period in progress. */
#define RCU_GP_INIT             1       /* Grace period being initialized. */
#define RCU_SAVE_DYNTICK        2       /* Need to scan dyntick state. */
#define RCU_FORCE_QS            3       /* Need to force quiescent state. */
#ifdef CONFIG_NO_HZ
#define RCU_SIGNAL_INIT         RCU_SAVE_DYNTICK
#else /* #ifdef CONFIG_NO_HZ */
#define RCU_SIGNAL_INIT         RCU_FORCE_QS
#endif /* #else #ifdef CONFIG_NO_HZ */
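
/*
 * Rough lifecycle of ->signaled (editorial summary): it is RCU_GP_IDLE
 * while no grace period is in progress and RCU_GP_INIT while the
 * rcu_node tree is being initialized for a new one, after which it is
 * set to RCU_SIGNAL_INIT.  Under CONFIG_NO_HZ, force_quiescent_state()
 * first snapshots dyntick state (RCU_SAVE_DYNTICK) and then escalates
 * to RCU_FORCE_QS; without CONFIG_NO_HZ it starts at RCU_FORCE_QS.
 */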

#define RCU_JIFFIES_TILL_FORCE_QS        3      /* for rsp->jiffies_force_qs */
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR

#ifdef CONFIG_PROVE_RCU
#define RCU_STALL_DELAY_DELTA          (5 * HZ)
#else
#define RCU_STALL_DELAY_DELTA          0
#endif

#define RCU_SECONDS_TILL_STALL_CHECK   (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
                                        RCU_STALL_DELAY_DELTA)
                                                /* for rsp->jiffies_stall */
#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
                                                /* for rsp->jiffies_stall */
#define RCU_STALL_RAT_DELAY             2       /* Allow other CPUs time */
                                                /*  to take at least one */
                                                /*  scheduling clock irq */
                                                /*  before ratting on them. */

#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
#define RCU_CPU_STALL_SUPPRESS_INIT 0
#else
#define RCU_CPU_STALL_SUPPRESS_INIT 1
#endif

#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
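
/*
 * Worked example (editorial illustration): with a hypothetical
 * CONFIG_RCU_CPU_STALL_TIMEOUT of 60 and CONFIG_PROVE_RCU unset,
 * RCU_SECONDS_TILL_STALL_CHECK is 60 * HZ (first stall warning after
 * one minute) and RCU_SECONDS_TILL_STALL_RECHECK is 3 * 60 * HZ + 30,
 * so later warnings repeat roughly every three minutes, plus 30
 * jiffies of slack.
 */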

/*
 * RCU global state, including node hierarchy.  This hierarchy is
 * represented in "heap" form in a dense array.  The root (first level)
 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
 * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
 * and the third level in ->node[m+1] and following (->node[m+1] referenced
 * by ->level[2]).  The number of levels is determined by the number of
 * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
 * consisting of a single rcu_node.
 */
struct rcu_state {
        struct rcu_node node[NUM_RCU_NODES];    /* Hierarchy. */
        struct rcu_node *level[NUM_RCU_LVLS];   /* Hierarchy levels. */
        u32 levelcnt[MAX_RCU_LVLS + 1];         /* # nodes in each level. */
        u8 levelspread[NUM_RCU_LVLS];           /* kids/node in each level. */
        struct rcu_data __percpu *rda;          /* Pointer to per-CPU rcu_data. */

        /* The following fields are guarded by the root rcu_node's lock. */

        u8      signaled ____cacheline_internodealigned_in_smp;
                                                /* Force QS state. */
        u8      fqs_active;                     /* force_quiescent_state() */
                                                /*  is running. */
        u8      fqs_need_gp;                    /* A CPU was prevented from */
                                                /*  starting a new grace */
                                                /*  period because */
                                                /*  force_quiescent_state() */
                                                /*  was running. */
        unsigned long gpnum;                    /* Current gp number. */
        unsigned long completed;                /* # of last completed gp. */

        /* End of fields guarded by root rcu_node's lock. */

        raw_spinlock_t onofflock;               /* Exclude on/offline and */
                                                /*  starting new GP. */
        raw_spinlock_t fqslock;                 /* Only one task forcing */
                                                /*  quiescent states. */
        unsigned long jiffies_force_qs;         /* Time at which to invoke */
                                                /*  force_quiescent_state(). */
        unsigned long n_force_qs;               /* Number of calls to */
                                                /*  force_quiescent_state(). */
        unsigned long n_force_qs_lh;            /* ~Number of calls leaving */
                                                /*  due to lock unavailable. */
        unsigned long n_force_qs_ngp;           /* Number of calls leaving */
                                                /*  due to no GP active. */
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
        unsigned long gp_start;                 /* Time at which GP started, */
                                                /*  in jiffies. */
        unsigned long jiffies_stall;            /* Time at which to check */
                                                /*  for CPU stalls. */
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
        char *name;                             /* Name of structure. */
};
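
/*
 * Illustrative sketch (not part of this header): how ->level[] indexes
 * into the dense ->node[] array.  For the balanced 64-CPU, fanout-16
 * example earlier in this file, ->level[0] points at node[0] (the root)
 * and ->level[1] at node[1] (the first of four leaves), so a CPU's leaf
 * follows by simple arithmetic.  The kernel instead caches this result
 * in rcu_data->mynode at init time; example_leaf_for() is hypothetical
 * and never compiled.
 */
#if 0
static struct rcu_node *example_leaf_for(struct rcu_state *rsp, int cpu)
{
        /* First leaf, plus this CPU's leaf index (16 CPUs per leaf). */
        return rsp->level[NUM_RCU_LVLS - 1] + cpu / RCU_FANOUT_LEAF;
}
#endif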

/* Return values for rcu_preempt_offline_tasks(). */

#define RCU_OFL_TASKS_NORM_GP   0x1             /* Tasks blocking normal */
                                                /*  GP were moved to root. */
#define RCU_OFL_TASKS_EXP_GP    0x2             /* Tasks blocking expedited */
                                                /*  GP were moved to root. */

/*
 * RCU implementation internal declarations:
 */
extern struct rcu_state rcu_sched_state;
DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);

extern struct rcu_state rcu_bh_state;
DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);

#ifdef CONFIG_TREE_PREEMPT_RCU
extern struct rcu_state rcu_preempt_state;
DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */

#ifndef RCU_TREE_NONCORE

/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
long rcu_batches_completed(void);
static void rcu_preempt_note_context_switch(int cpu);
static int rcu_preempted_readers(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
                                      unsigned long flags);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static void rcu_print_task_stall(struct rcu_node *rnp);
static void rcu_preempt_stall_reset(void);
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
                                     struct rcu_node *rnp,
                                     struct rcu_data *rdp);
static void rcu_preempt_offline_cpu(int cpu);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_preempt_check_callbacks(int cpu);
static void rcu_preempt_process_callbacks(void);
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
static int rcu_preempt_pending(int cpu);
static int rcu_preempt_needs_cpu(int cpu);
static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
static void rcu_preempt_send_cbs_to_online(void);
static void __init __rcu_init_preempt(void);
static void rcu_needs_cpu_flush(void);

#endif /* #ifndef RCU_TREE_NONCORE */