linux/block/bfq-wf2q.c
/*
 * Hierarchical Budget Worst-case Fair Weighted Fair Queueing
 * (B-WF2Q+): hierarchical scheduling algorithm by which the BFQ I/O
 * scheduler schedules generic entities. The latter can represent
 * either single bfq queues (associated with processes) or groups of
 * bfq queues (associated with cgroups).
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License as
 *  published by the Free Software Foundation; either version 2 of the
 *  License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 */
#include "bfq-iosched.h"

/**
 * bfq_gt - compare two timestamps.
 * @a: first ts.
 * @b: second ts.
 *
 * Return @a > @b, dealing with wrapping correctly.
 */
static int bfq_gt(u64 a, u64 b)
{
	return (s64)(a - b) > 0;
}
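
/*
 * Illustrative note (not in the original source): the signed
 * subtraction above makes the comparison robust to timestamp
 * wraparound. For example, if b == U64_MAX - 1 and a == 2 (i.e., a
 * was computed after the 64-bit timestamp space wrapped), then
 * a - b == 4, which is positive as an s64, so bfq_gt() correctly
 * reports a > b even though a < b as plain unsigned values.
 */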

static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
{
	struct rb_node *node = tree->rb_node;

	return rb_entry(node, struct bfq_entity, rb_node);
}

static unsigned int bfq_class_idx(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	return bfqq ? bfqq->ioprio_class - 1 :
		BFQ_DEFAULT_GRP_CLASS - 1;
}

static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
						 bool expiration);

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);

/**
 * bfq_update_next_in_service - update sd->next_in_service
 * @sd: sched_data for which to perform the update.
 * @new_entity: if not NULL, pointer to the entity whose activation,
 *              requeueing or repositioning triggered the invocation of
 *              this function.
 * @expiration: if true, this function is being invoked after the
 *             expiration of the in-service entity
 *
 * This function is called to update sd->next_in_service, which, in
 * its turn, may change as a consequence of the insertion or
 * extraction of an entity into/from one of the active trees of
 * sd. These insertions/extractions occur as a consequence of
 * activations/deactivations of entities, with some activations being
 * 'true' activations, and other activations being requeueings (i.e.,
 * implementing the second, requeueing phase of the mechanism used to
 * reposition an entity in its active tree; see comments on
 * __bfq_activate_entity and __bfq_requeue_entity for details). In
 * both the last two activation sub-cases, new_entity points to the
 * just activated or requeued entity.
 *
 * Returns true if sd->next_in_service changes in such a way that
 * entity->parent may become the next_in_service entity for its parent
 * entity.
 */
static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
				       struct bfq_entity *new_entity,
				       bool expiration)
{
	struct bfq_entity *next_in_service = sd->next_in_service;
	bool parent_sched_may_change = false;
	bool change_without_lookup = false;

	/*
	 * If this update is triggered by the activation, requeueing
	 * or repositioning of an entity that does not coincide with
	 * sd->next_in_service, then a full lookup in the active tree
	 * can be avoided. In fact, it is enough to check whether the
	 * just-modified entity has the same priority as
	 * sd->next_in_service, is eligible and has a lower virtual
	 * finish time than sd->next_in_service. If this compound
	 * condition holds, then the new entity becomes the new
	 * next_in_service. Otherwise no change is needed.
	 */
	if (new_entity && new_entity != sd->next_in_service) {
		/*
		 * Flag used to decide whether to replace
		 * sd->next_in_service with new_entity. Tentatively
		 * set to true, and left as true if
		 * sd->next_in_service is NULL.
		 */
		change_without_lookup = true;

		/*
		 * If there is already a next_in_service candidate
		 * entity, then compare timestamps to decide whether
		 * to replace sd->next_in_service with new_entity.
		 */
		if (next_in_service) {
			unsigned int new_entity_class_idx =
				bfq_class_idx(new_entity);
			struct bfq_service_tree *st =
				sd->service_tree + new_entity_class_idx;

			change_without_lookup =
				(new_entity_class_idx ==
				 bfq_class_idx(next_in_service)
				 &&
				 !bfq_gt(new_entity->start, st->vtime)
				 &&
				 bfq_gt(next_in_service->finish,
					new_entity->finish));
		}

		if (change_without_lookup)
			next_in_service = new_entity;
	}

	if (!change_without_lookup) /* lookup needed */
		next_in_service = bfq_lookup_next_entity(sd, expiration);

	if (next_in_service) {
		bool new_budget_triggers_change =
			bfq_update_parent_budget(next_in_service);

		parent_sched_may_change = !sd->next_in_service ||
			new_budget_triggers_change;
	}

	sd->next_in_service = next_in_service;

	return parent_sched_may_change;
}
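
/*
 * Illustrative scenario (not in the original source): suppose
 * sd->next_in_service has (start, finish) == (10, 40) at some class
 * index, and an entity of the same class is requeued with
 * (start, finish) == (8, 30) while st->vtime == 12. The requeued
 * entity is eligible (8 <= 12) and finishes earlier (30 < 40), so
 * the fast path above replaces sd->next_in_service without a full
 * lookup in the active tree.
 */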

#ifdef CONFIG_BFQ_GROUP_IOSCHED

struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
{
	struct bfq_entity *group_entity = bfqq->entity.parent;

	if (!group_entity)
		group_entity = &bfqq->bfqd->root_group->entity;

	return container_of(group_entity, struct bfq_group, entity);
}

/*
 * Returns true if this budget change may let next_in_service->parent
 * become the next_in_service entity for its parent entity.
 */
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	struct bfq_entity *bfqg_entity;
	struct bfq_group *bfqg;
	struct bfq_sched_data *group_sd;
	bool ret = false;

	group_sd = next_in_service->sched_data;

	bfqg = container_of(group_sd, struct bfq_group, sched_data);
	/*
	 * bfq_group's my_entity field is not NULL only if the group
	 * is not the root group. We must not touch the root entity
	 * as it must never become an in-service entity.
	 */
	bfqg_entity = bfqg->my_entity;
	if (bfqg_entity) {
		if (bfqg_entity->budget > next_in_service->budget)
			ret = true;
		bfqg_entity->budget = next_in_service->budget;
	}

	return ret;
}

/*
 * This function tells whether entity stops being a candidate for next
 * service, according to the restrictive definition of the field
 * next_in_service. In particular, this function is invoked for an
 * entity that is about to be set in service.
 *
 * If entity is a queue, then the entity is no longer a candidate for
 * next service according to that definition, because entity is
 * about to become the in-service queue. This function then returns
 * true if entity is a queue.
 *
 * In contrast, entity could still be a candidate for next service if
 * it is not a queue, and has more than one active child. In fact,
 * even if one of its children is about to be set in service, other
 * active children may still be the next to serve, for the parent
 * entity, even according to the above definition. As a consequence, a
 * non-queue entity is not a candidate for next-service only if it has
 * only one active child. And only if this condition holds, then this
 * function returns true for a non-queue entity.
 */
static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	struct bfq_group *bfqg;

	if (bfq_entity_to_bfqq(entity))
		return true;

	bfqg = container_of(entity, struct bfq_group, entity);

	/*
	 * The field active_entities does not always contain the
	 * actual number of active children entities: it happens to
	 * not account for the in-service entity in case the latter is
	 * removed from its active tree (which may get done after
	 * invoking the function bfq_no_longer_next_in_service in
	 * bfq_get_next_queue). Fortunately, here, i.e., while
	 * bfq_no_longer_next_in_service is not yet completed in
	 * bfq_get_next_queue, bfq_active_extract has not yet been
	 * invoked, and thus active_entities still coincides with the
	 * actual number of active entities.
	 */
	if (bfqg->active_entities == 1)
		return true;

	return false;
}

#else /* CONFIG_BFQ_GROUP_IOSCHED */

struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
{
	return bfqq->bfqd->root_group;
}

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	return false;
}

static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	return true;
}

#endif /* CONFIG_BFQ_GROUP_IOSCHED */

/*
 * Shift for timestamp calculations.  This actually limits the maximum
 * service allowed in one timestamp delta (small shift values increase it),
 * the maximum total weight that can be used for the queues in the system
 * (big shift values increase it), and the period of virtual time
 * wraparounds.
 */
#define WFQ_SERVICE_SHIFT	22

struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = NULL;

	if (!entity->my_sched_data)
		bfqq = container_of(entity, struct bfq_queue, entity);

	return bfqq;
}

/**
 * bfq_delta - map service into the virtual time domain.
 * @service: amount of service.
 * @weight: scale factor (weight of an entity or weight sum).
 */
static u64 bfq_delta(unsigned long service, unsigned long weight)
{
	u64 d = (u64)service << WFQ_SERVICE_SHIFT;

	do_div(d, weight);
	return d;
}
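
/*
 * Worked example (illustrative values, not from the original source):
 * charging service = 100 sectors to an entity with weight = 100
 * advances its virtual time by (100 << 22) / 100 = 1 << 22 units,
 * whereas the same service charged at weight = 200 advances it by
 * half as much. Virtual time thus grows inversely with weight, which
 * is what lets higher-weight entities receive proportionally more
 * service.
 */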

/**
 * bfq_calc_finish - assign the finish time to an entity.
 * @entity: the entity to act upon.
 * @service: the service to be charged to the entity.
 */
static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->finish = entity->start +
		bfq_delta(service, entity->weight);

	if (bfqq) {
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: serv %lu, w %d",
			service, entity->weight);
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: start %llu, finish %llu, delta %llu",
			entity->start, entity->finish,
			bfq_delta(service, entity->weight));
	}
}

/**
 * bfq_entity_of - get an entity from a node.
 * @node: the node field of the entity.
 *
 * Convert a node pointer to the relative entity.  This is used only
 * to simplify the logic of some functions and not as the generic
 * conversion mechanism because, e.g., in the tree walking functions,
 * the check for a %NULL value would be redundant.
 */
struct bfq_entity *bfq_entity_of(struct rb_node *node)
{
	struct bfq_entity *entity = NULL;

	if (node)
		entity = rb_entry(node, struct bfq_entity, rb_node);

	return entity;
}

/**
 * bfq_extract - remove an entity from a tree.
 * @root: the tree root.
 * @entity: the entity to remove.
 */
static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
{
	entity->tree = NULL;
	rb_erase(&entity->rb_node, root);
}

/**
 * bfq_idle_extract - extract an entity from the idle tree.
 * @st: the service tree of the owning @entity.
 * @entity: the entity being removed.
 */
static void bfq_idle_extract(struct bfq_service_tree *st,
			     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *next;

	if (entity == st->first_idle) {
		next = rb_next(&entity->rb_node);
		st->first_idle = bfq_entity_of(next);
	}

	if (entity == st->last_idle) {
		next = rb_prev(&entity->rb_node);
		st->last_idle = bfq_entity_of(next);
	}

	bfq_extract(&st->idle, entity);

	if (bfqq)
		list_del(&bfqq->bfqq_list);
}

/**
 * bfq_insert - generic tree insertion.
 * @root: tree root.
 * @entity: entity to insert.
 *
 * This is used for the idle and the active tree, since they are both
 * ordered by finish time.
 */
static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
{
	struct bfq_entity *entry;
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*node) {
		parent = *node;
		entry = rb_entry(parent, struct bfq_entity, rb_node);

		if (bfq_gt(entry->finish, entity->finish))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}

	rb_link_node(&entity->rb_node, parent, node);
	rb_insert_color(&entity->rb_node, root);

	entity->tree = root;
}

/**
 * bfq_update_min - update the min_start field of an entity.
 * @entity: the entity to update.
 * @node: one of its children.
 *
 * This function is called when @entity may store an invalid value for
 * min_start due to updates to the active tree.  The function assumes
 * that the subtree rooted at @node (which may be its left or its right
 * child) has a valid min_start value.
 */
static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
{
	struct bfq_entity *child;

	if (node) {
		child = rb_entry(node, struct bfq_entity, rb_node);
		if (bfq_gt(entity->min_start, child->min_start))
			entity->min_start = child->min_start;
	}
}

/**
 * bfq_update_active_node - recalculate min_start.
 * @node: the node to update.
 *
 * @node may have changed position or one of its children may have moved,
 * this function updates its min_start value.  The left and right subtrees
 * are assumed to hold a correct min_start value.
 */
static void bfq_update_active_node(struct rb_node *node)
{
	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);

	entity->min_start = entity->start;
	bfq_update_min(entity, node->rb_right);
	bfq_update_min(entity, node->rb_left);
}

/**
 * bfq_update_active_tree - update min_start for the whole active tree.
 * @node: the starting node.
 *
 * @node must be the deepest modified node after an update.  This function
 * updates its min_start using the values held by its children, assuming
 * that they did not change, and then updates all the nodes that may have
 * changed in the path to the root.  The only nodes that may have changed
 * are the ones in the path or their siblings.
 */
static void bfq_update_active_tree(struct rb_node *node)
{
	struct rb_node *parent;

up:
	bfq_update_active_node(node);

	parent = rb_parent(node);
	if (!parent)
		return;

	if (node == parent->rb_left && parent->rb_right)
		bfq_update_active_node(parent->rb_right);
	else if (parent->rb_left)
		bfq_update_active_node(parent->rb_left);

	node = parent;
	goto up;
}
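
/*
 * Illustrative example (not in the original source): suppose the
 * active tree holds three entities, ordered by finish time, with
 * (start, finish) pairs (10, 50) at the root, (5, 30) at the left
 * child and (20, 70) at the right child. Then min_start is 5 at the
 * left leaf, 20 at the right leaf, and min(10, 5, 20) = 5 at the
 * root. Thanks to this augmentation, a lookup such as
 * bfq_first_active_entity() can tell in O(1), at each node, whether
 * an eligible entity (start <= vtime) exists in a subtree, and thus
 * find the eligible entity with the smallest finish time in O(log N).
 */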

/**
 * bfq_active_insert - insert an entity in the active tree of its
 *                     group/device.
 * @st: the service tree of the entity.
 * @entity: the entity being inserted.
 *
 * The active tree is ordered by finish time, but an extra key is kept
 * per each node, containing the minimum value for the start times of
 * its children (and the node itself), so it's possible to search for
 * the eligible node with the lowest finish time in logarithmic time.
 */
static void bfq_active_insert(struct bfq_service_tree *st,
			      struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node = &entity->rb_node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	bfq_insert(&st->active, entity);

	if (node->rb_left)
		node = node->rb_left;
	else if (node->rb_right)
		node = node->rb_right;

	bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	if (bfqg != bfqd->root_group)
		bfqg->active_entities++;
#endif
}

/**
 * bfq_ioprio_to_weight - calc a weight from an ioprio.
 * @ioprio: the ioprio value to convert.
 */
unsigned short bfq_ioprio_to_weight(int ioprio)
{
	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
}
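
/*
 * Illustrative mapping (assuming IOPRIO_BE_NR == 8 and
 * BFQ_WEIGHT_CONVERSION_COEFF == 10, which are plausible values but
 * should be checked against the headers): ioprio 0, the highest
 * best-effort priority, maps to weight 80, while ioprio 7, the
 * lowest, maps to weight 10. Lower ioprio values thus receive
 * proportionally larger shares of the device capacity.
 */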

/**
 * bfq_weight_to_ioprio - calc an ioprio from a weight.
 * @weight: the weight value to convert.
 *
 * To preserve as much as possible the old only-ioprio user interface,
 * 0 is used as an escape ioprio value for weights (numerically) equal or
 * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
 */
static unsigned short bfq_weight_to_ioprio(int weight)
{
	return max_t(int, 0,
		     IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
}

static void bfq_get_entity(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	if (bfqq) {
		bfqq->ref++;
		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
			     bfqq, bfqq->ref);
	}
}

/**
 * bfq_find_deepest - find the deepest node that an extraction can modify.
 * @node: the node being removed.
 *
 * Do the first step of an extraction in an rb tree, looking for the
 * node that will replace @node, and returning the deepest node that
 * the following modifications to the tree can touch.  If @node is the
 * last node in the tree return %NULL.
 */
static struct rb_node *bfq_find_deepest(struct rb_node *node)
{
	struct rb_node *deepest;

	if (!node->rb_right && !node->rb_left)
		deepest = rb_parent(node);
	else if (!node->rb_right)
		deepest = node->rb_left;
	else if (!node->rb_left)
		deepest = node->rb_right;
	else {
		deepest = rb_next(node);
		if (deepest->rb_right)
			deepest = deepest->rb_right;
		else if (rb_parent(deepest) != node)
			deepest = rb_parent(deepest);
	}

	return deepest;
}

/**
 * bfq_active_extract - remove an entity from the active tree.
 * @st: the service_tree containing the tree.
 * @entity: the entity being removed.
 */
static void bfq_active_extract(struct bfq_service_tree *st,
			       struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	node = bfq_find_deepest(&entity->rb_node);
	bfq_extract(&st->active, entity);

	if (node)
		bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_del(&bfqq->bfqq_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	if (bfqg != bfqd->root_group)
		bfqg->active_entities--;
#endif
}

/**
 * bfq_idle_insert - insert an entity into the idle tree.
 * @st: the service tree containing the tree.
 * @entity: the entity to insert.
 */
static void bfq_idle_insert(struct bfq_service_tree *st,
			    struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
		st->first_idle = entity;
	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
		st->last_idle = entity;

	bfq_insert(&st->idle, entity);

	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
}

/**
 * bfq_forget_entity - do not consider entity any longer for scheduling
 * @st: the service tree.
 * @entity: the entity being removed.
 * @is_in_service: true if entity is currently the in-service entity.
 *
 * Forget everything about @entity. In addition, if entity represents
 * a queue, and the latter is not in service, then release the service
 * reference to the queue (the one taken through bfq_get_entity). In
 * fact, in this case, there is really no more service reference to
 * the queue, as the latter is also outside any service tree. If,
 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
 * will take care of putting the reference when the queue finally
 * stops being served.
 */
static void bfq_forget_entity(struct bfq_service_tree *st,
			      struct bfq_entity *entity,
			      bool is_in_service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->on_st = false;
	st->wsum -= entity->weight;
	if (bfqq && !is_in_service)
		bfq_put_queue(bfqq);
}

/**
 * bfq_put_idle_entity - release the idle tree ref of an entity.
 * @st: service tree for the entity.
 * @entity: the entity being released.
 */
void bfq_put_idle_entity(struct bfq_service_tree *st, struct bfq_entity *entity)
{
	bfq_idle_extract(st, entity);
	bfq_forget_entity(st, entity,
			  entity == entity->sched_data->in_service_entity);
}

/**
 * bfq_forget_idle - update the idle tree if necessary.
 * @st: the service tree to act upon.
 *
 * To preserve the global O(log N) complexity we only remove one entry here;
 * as the idle tree will not grow indefinitely this can be done safely.
 */
static void bfq_forget_idle(struct bfq_service_tree *st)
{
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
	    !bfq_gt(last_idle->finish, st->vtime)) {
		/*
		 * Forget the whole idle tree, increasing the vtime past
		 * the last finish time of idle entities.
		 */
		st->vtime = last_idle->finish;
	}

	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
		bfq_put_idle_entity(st, first_idle);
}

struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity)
{
	struct bfq_sched_data *sched_data = entity->sched_data;
	unsigned int idx = bfq_class_idx(entity);

	return sched_data->service_tree + idx;
}

/*
 * Update weight and priority of entity. If update_class_too is true,
 * then update the ioprio_class of entity too.
 *
 * The reason why the update of ioprio_class is controlled through the
 * last parameter is as follows. Changing the ioprio class of an
 * entity implies changing the destination service trees for that
 * entity. If such a change occurred when the entity is already on one
 * of the service trees for its previous class, then the state of the
 * entity would become more complex: none of the new possible service
 * trees for the entity, according to bfq_entity_service_tree(), would
 * match any of the possible service trees on which the entity
 * is. Complex operations involving these trees, such as entity
 * activations and deactivations, should take into account this
 * additional complexity.  To avoid this issue, this function is
 * invoked with update_class_too unset in the points in the code where
 * entity may happen to be on some tree.
 */
struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
				struct bfq_entity *entity,
				bool update_class_too)
{
	struct bfq_service_tree *new_st = old_st;

	if (entity->prio_changed) {
		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
		unsigned int prev_weight, new_weight;
		struct bfq_data *bfqd = NULL;
		struct rb_root *root;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		struct bfq_sched_data *sd;
		struct bfq_group *bfqg;
#endif

		if (bfqq)
			bfqd = bfqq->bfqd;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		else {
			sd = entity->my_sched_data;
			bfqg = container_of(sd, struct bfq_group, sched_data);
			bfqd = (struct bfq_data *)bfqg->bfqd;
		}
#endif

		old_st->wsum -= entity->weight;

		if (entity->new_weight != entity->orig_weight) {
			if (entity->new_weight < BFQ_MIN_WEIGHT ||
			    entity->new_weight > BFQ_MAX_WEIGHT) {
				pr_crit("update_weight_prio: new_weight %d\n",
					entity->new_weight);
				if (entity->new_weight < BFQ_MIN_WEIGHT)
					entity->new_weight = BFQ_MIN_WEIGHT;
				else
					entity->new_weight = BFQ_MAX_WEIGHT;
			}
			entity->orig_weight = entity->new_weight;
			if (bfqq)
				bfqq->ioprio =
				  bfq_weight_to_ioprio(entity->orig_weight);
		}

		if (bfqq && update_class_too)
			bfqq->ioprio_class = bfqq->new_ioprio_class;

		/*
		 * Reset prio_changed only if the ioprio_class change
		 * is not pending any longer.
		 */
		if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class)
			entity->prio_changed = 0;

		/*
		 * NOTE: here we may be changing the weight too early,
		 * this will cause unfairness.  The correct approach
		 * would have required additional complexity to defer
		 * weight changes to the proper time instants (i.e.,
		 * when entity->finish <= old_st->vtime).
		 */
		new_st = bfq_entity_service_tree(entity);

		prev_weight = entity->weight;
		new_weight = entity->orig_weight *
			     (bfqq ? bfqq->wr_coeff : 1);
		/*
		 * If the weight of the entity changes, remove the entity
		 * from its old weight counter (if there is a counter
		 * associated with the entity), and add it to the counter
		 * associated with its new weight.
		 */
		if (prev_weight != new_weight) {
			root = bfqq ? &bfqd->queue_weights_tree :
				      &bfqd->group_weights_tree;
			__bfq_weights_tree_remove(bfqd, entity, root);
		}
		entity->weight = new_weight;
		/*
		 * Add the entity to its weights tree only if it is
		 * not associated with a weight-raised queue.
		 */
		if (prev_weight != new_weight &&
		    (bfqq ? bfqq->wr_coeff == 1 : 1))
			/* If we get here, root has been initialized. */
			bfq_weights_tree_add(bfqd, entity, root);

		new_st->wsum += entity->weight;

		if (new_st != old_st)
			entity->start = new_st->vtime;
	}

	return new_st;
}

/**
 * bfq_bfqq_served - update the scheduler status after selection for
 *                   service.
 * @bfqq: the queue being served.
 * @served: bytes to transfer.
 *
 * NOTE: this can be optimized, as the timestamps of upper level entities
 * are synchronized every time a new bfqq is selected for service.  By now,
 * we keep it to better check consistency.
 */
void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct bfq_service_tree *st;

	if (!bfqq->service_from_backlogged)
		bfqq->first_IO_time = jiffies;

	if (bfqq->wr_coeff > 1)
		bfqq->service_from_wr += served;

	bfqq->service_from_backlogged += served;
	for_each_entity(entity) {
		st = bfq_entity_service_tree(entity);

		entity->service += served;

		st->vtime += bfq_delta(served, st->wsum);
		bfq_forget_idle(st);
	}
	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
}

/**
 * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
 *                        of the time interval during which bfqq has been in
 *                        service.
 * @bfqd: the device
 * @bfqq: the queue that needs a service update.
 * @time_ms: the amount of time during which the queue has received service
 *
 * If a queue does not consume its budget fast enough, then providing
 * the queue with service fairness may impair throughput, more or less
 * severely. For this reason, queues that consume their budget slowly
 * are provided with time fairness instead of service fairness. This
 * goal is achieved through the BFQ scheduling engine, even if such an
 * engine works in the service, and not in the time domain. The trick
 * is charging these queues with an inflated amount of service, equal
 * to the amount of service that they would have received during their
 * service slot if they had been fast, i.e., if their requests had
 * been dispatched at a rate equal to the estimated peak rate.
 *
 * It is worth noting that time fairness can cause important
 * distortions in terms of bandwidth distribution, on devices with
 * internal queueing. The reason is that I/O requests dispatched
 * during the service slot of a queue may be served after that service
 * slot is finished, and may have a total processing time loosely
 * correlated with the duration of the service slot. This is
 * especially true for short service slots.
 */
void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  unsigned long time_ms)
{
	struct bfq_entity *entity = &bfqq->entity;
	unsigned long timeout_ms = jiffies_to_msecs(bfq_timeout);
	unsigned long bounded_time_ms = min(time_ms, timeout_ms);
	int serv_to_charge_for_time =
		(bfqd->bfq_max_budget * bounded_time_ms) / timeout_ms;
	int tot_serv_to_charge = max(serv_to_charge_for_time, entity->service);

	/* Increase budget to avoid inconsistencies */
	if (tot_serv_to_charge > entity->budget)
		entity->budget = tot_serv_to_charge;

	bfq_bfqq_served(bfqq,
			max_t(int, 0, tot_serv_to_charge - entity->service));
}
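
/*
 * Worked example (illustrative values, not from the original source):
 * with bfqd->bfq_max_budget == 16384 sectors and a timeout of 125 ms,
 * a queue that stayed in service for 25 ms is charged
 * (16384 * 25) / 125 == 3276 sectors (integer division), or its
 * already accumulated service if that is larger: i.e., the service a
 * fast queue would have received in the same interval at the
 * estimated peak rate.
 */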

static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
					struct bfq_service_tree *st,
					bool backshifted)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	/*
	 * When this function is invoked, entity is not in any service
	 * tree, then it is safe to invoke next function with the last
	 * parameter set (see the comments on the function).
	 */
	st = __bfq_entity_update_weight_prio(st, entity, true);
	bfq_calc_finish(entity, entity->budget);

	/*
	 * If some queues enjoy backshifting for a while, then their
	 * (virtual) finish timestamps may happen to become lower and
	 * lower than the system virtual time.  In particular, if
	 * these queues often happen to be idle for short time
	 * periods, and during such time periods other queues with
	 * higher timestamps happen to be busy, then the backshifted
	 * timestamps of the former queues can become much lower than
	 * the system virtual time. In fact, to serve the queues with
	 * higher timestamps while the ones with lower timestamps are
	 * idle, the system virtual time may be pushed-up to much
	 * higher values than the finish timestamps of the idle
	 * queues. As a consequence, the finish timestamps of all new
	 * or newly activated queues may end up being much larger than
	 * those of lucky queues with backshifted timestamps. The
	 * latter queues may then monopolize the device for a lot of
	 * time. This would simply break service guarantees.
	 *
	 * To reduce this problem, push up a little bit the
	 * backshifted timestamps of the queue associated with this
	 * entity (only a queue can happen to have the backshifted
	 * flag set): just enough to let the finish timestamp of the
	 * queue be equal to the current value of the system virtual
	 * time. This may introduce a little unfairness among queues
	 * with backshifted timestamps, but it does not break
	 * worst-case fairness guarantees.
	 *
	 * As a special case, if bfqq is weight-raised, push up
	 * timestamps much less, to keep very low the probability that
	 * this push up causes the backshifted finish timestamps of
	 * weight-raised queues to become higher than the backshifted
	 * finish timestamps of non weight-raised queues.
	 */
	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
		unsigned long delta = st->vtime - entity->finish;

		if (bfqq)
			delta /= bfqq->wr_coeff;

		entity->start += delta;
		entity->finish += delta;
	}

	bfq_active_insert(st, entity);
}

/**
 * __bfq_activate_entity - handle activation of entity.
 * @entity: the entity being activated.
 * @non_blocking_wait_rq: true if entity was waiting for a request
 *
 * Called for a 'true' activation, i.e., if entity is not active and
 * one of its children receives a new request.
 *
 * Basically, this function updates the timestamps of entity and
 * inserts entity into its active tree, after possibly extracting it
 * from its idle tree.
 */
static void __bfq_activate_entity(struct bfq_entity *entity,
				  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	bool backshifted = false;
	unsigned long long min_vstart;

	/* See comments on bfq_bfqq_update_budg_for_activation */
	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
		backshifted = true;
		min_vstart = entity->finish;
	} else
		min_vstart = st->vtime;

	if (entity->tree == &st->idle) {
		/*
		 * Must be on the idle tree, bfq_idle_extract() will
		 * check for that.
		 */
		bfq_idle_extract(st, entity);
		entity->start = bfq_gt(min_vstart, entity->finish) ?
			min_vstart : entity->finish;
	} else {
		/*
		 * The finish time of the entity may be invalid, and
		 * it is in the past for sure, otherwise the queue
		 * would have been on the idle tree.
		 */
		entity->start = min_vstart;
		st->wsum += entity->weight;
		/*
		 * entity is about to be inserted into a service tree,
		 * and then set in service: get a reference to make
		 * sure entity does not disappear until it is no
		 * longer in service or scheduled for service.
		 */
		bfq_get_entity(entity);

		entity->on_st = true;
	}

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
		struct bfq_group *bfqg =
			container_of(entity, struct bfq_group, entity);
		struct bfq_data *bfqd = (struct bfq_data *)bfqg->bfqd;

		bfq_weights_tree_add(bfqd, entity,
				     &bfqd->group_weights_tree);
	}
#endif

	bfq_update_fin_time_enqueue(entity, st, backshifted);
}

/**
 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
 * @entity: the entity being requeued or repositioned.
 *
 * Requeueing is needed if this entity stops being served, which
 * happens if a leaf descendant entity has expired. On the other hand,
 * repositioning is needed if the next_in_service entity for the child
 * entity has changed. See the comments inside the function for
 * details.
 *
 * Basically, this function: 1) removes entity from its active tree if
 * present there, 2) updates the timestamps of entity and 3) inserts
 * entity back into its active tree (in the new, right position for
 * the new values of the timestamps).
 */
static void __bfq_requeue_entity(struct bfq_entity *entity)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (entity == sd->in_service_entity) {
		/*
		 * We are requeueing the current in-service entity,
		 * which may have to be done for one of the following
		 * reasons:
		 * - entity represents the in-service queue, and the
		 *   in-service queue is being requeued after an
		 *   expiration;
		 * - entity represents a group, and its budget has
		 *   changed because one of its child entities has
		 *   just been either activated or requeued for some
		 *   reason; the timestamps of the entity need then to
		 *   be updated, and the entity needs to be enqueued
		 *   or repositioned accordingly.
		 *
		 * In particular, before requeueing, the start time of
		 * the entity must be moved forward to account for the
		 * service that the entity has received while in
		 * service. This is done by the next instructions. The
		 * finish time will then be updated according to this
		 * new value of the start time, and to the budget of
		 * the entity.
		 */
		bfq_calc_finish(entity, entity->service);
		entity->start = entity->finish;
		/*
		 * In addition, if the entity had more than one child
		 * when set in service, then it was not extracted from
		 * the active tree. This implies that the position of
		 * the entity in the active tree may need to be
		 * changed now, because we have just updated the start
		 * time of the entity, and we will update its finish
		 * time in a moment (the requeueing is then, more
		 * precisely, a repositioning in this case). To
		 * implement this repositioning, we: 1) dequeue the
		 * entity here, 2) update the finish time and requeue
		 * the entity according to the new timestamps below.
		 */
		if (entity->tree)
			bfq_active_extract(st, entity);
	} else { /* The entity is already active, and not in service */
		/*
		 * In this case, this function gets called only if the
		 * next_in_service entity below this entity has
		 * changed, and this change has caused the budget of
		 * this entity to change, which, finally implies that
		 * the finish time of this entity must be
		 * updated. Such an update may cause the scheduling,
		 * i.e., the position in the active tree, of this
		 * entity to change. We handle this change by: 1)
		 * dequeueing the entity here, 2) updating the finish
		 * time and requeueing the entity according to the new
		 * timestamps below. This is the same approach as the
		 * non-extracted-entity sub-case above.
		 */
		bfq_active_extract(st, entity);
	}

	bfq_update_fin_time_enqueue(entity, st, false);
}

static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
					  struct bfq_sched_data *sd,
					  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (sd->in_service_entity == entity || entity->tree == &st->active)
		 /*
		  * in service or already queued on the active tree,
		  * requeue or reposition
		  */
		__bfq_requeue_entity(entity);
	else
		/*
		 * Not in service and not queued on its active tree:
		 * the entity is idle and this is a true activation.
		 */
		__bfq_activate_entity(entity, non_blocking_wait_rq);
}

/**
 * bfq_activate_requeue_entity - activate or requeue an entity representing a
 *                               bfq_queue, and activate, requeue or reposition
 *                               all ancestors for which such an update becomes
 *                               necessary.
 * @entity: the entity to activate.
 * @non_blocking_wait_rq: true if this entity was waiting for a request
 * @requeue: true if this is a requeue, which implies that bfqq is
 *           being expired; thus ALL its ancestors stop being served and must
 *           therefore be requeued
 * @expiration: true if this function is being invoked in the expiration path
 *             of the in-service queue
 */
static void bfq_activate_requeue_entity(struct bfq_entity *entity,
					bool non_blocking_wait_rq,
					bool requeue, bool expiration)
{
	struct bfq_sched_data *sd;

	for_each_entity(entity) {
		sd = entity->sched_data;
		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);

		if (!bfq_update_next_in_service(sd, entity, expiration) &&
		    !requeue)
			break;
	}
}

/**
 * __bfq_deactivate_entity - deactivate an entity from its service tree.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: if false, the entity will not be put into the
 *                      idle tree.
 *
 * Deactivates an entity, independently of its previous state.  Must
 * be invoked only if entity is on a service tree. Extracts the entity
 * from that tree, and if necessary and allowed, puts it into the idle
 * tree.
 */
bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st;
	bool is_in_service;

	if (!entity->on_st) /* entity never activated, or already inactive */
		return false;

	/*
	 * If we get here, then entity is active, which implies that
	 * bfq_group_set_parent has already been invoked for the group
	 * represented by entity. Therefore, the field
	 * entity->sched_data has been set, and we can safely use it.
	 */
	st = bfq_entity_service_tree(entity);
	is_in_service = entity == sd->in_service_entity;

	if (is_in_service) {
		bfq_calc_finish(entity, entity->service);
		sd->in_service_entity = NULL;
	}

	if (entity->tree == &st->active)
		bfq_active_extract(st, entity);
	else if (!is_in_service && entity->tree == &st->idle)
		bfq_idle_extract(st, entity);

	if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
		bfq_forget_entity(st, entity, is_in_service);
	else
		bfq_idle_insert(st, entity);

	return true;
}

/**
 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: true if the entity can be put into the idle tree
 * @expiration: true if this function is being invoked in the expiration path
 *             of the in-service queue
 */
static void bfq_deactivate_entity(struct bfq_entity *entity,
				  bool ins_into_idle_tree,
				  bool expiration)
{
	struct bfq_sched_data *sd;
	struct bfq_entity *parent = NULL;

	for_each_entity_safe(entity, parent) {
		sd = entity->sched_data;

		if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
			/*
			 * entity is not in any tree any more, so
			 * this deactivation is a no-op, and there is
			 * nothing to change for upper-level entities
			 * (in case of expiration, this can never
			 * happen).
			 */
			return;
		}

		if (sd->next_in_service == entity)
			/*
			 * entity was the next_in_service entity,
			 * then, since entity has just been
			 * deactivated, a new one must be found.
			 */
			bfq_update_next_in_service(sd, NULL, expiration);

		if (sd->next_in_service || sd->in_service_entity) {
			/*
			 * The parent entity is still active, because
			 * either next_in_service or in_service_entity
			 * is not NULL. So, no further upwards
			 * deactivation must be performed.  Yet,
			 * next_in_service has changed. Then the
			 * schedule does need to be updated upwards.
			 *
			 * NOTE If in_service_entity is not NULL, then
			 * next_in_service may happen to be NULL,
			 * although the parent entity is evidently
			 * active. This happens if 1) the entity
			 * pointed by in_service_entity is the only
			 * active entity in the parent entity, and 2)
			 * according to the definition of
			 * next_in_service, the in_service_entity
			 * cannot be considered as
			 * next_in_service. See the comments on the
			 * definition of next_in_service for details.
			 */
			break;
		}

		/*
		 * If we get here, then the parent is no more
		 * backlogged and we need to propagate the
		 * deactivation upwards. Thus let the loop go on.
		 */

		/*
		 * Also let parent be queued into the idle tree on
		 * deactivation, to preserve service guarantees, and
		 * assuming that who invoked this function does not
		 * need parent entities too to be removed completely.
		 */
		ins_into_idle_tree = true;
	}

	/*
	 * If the deactivation loop is fully executed, then there are
	 * no more entities to touch and next loop is not executed at
	 * all. Otherwise, requeue remaining entities if they are
	 * about to stop receiving service, or reposition them if this
	 * is not the case.
	 */
	entity = parent;
	for_each_entity(entity) {
		/*
		 * Invoke __bfq_requeue_entity on entity, even if
		 * already active, to requeue/reposition it in the
		 * active tree (because sd->next_in_service has
		 * changed)
		 */
		__bfq_requeue_entity(entity);

		sd = entity->sched_data;
		if (!bfq_update_next_in_service(sd, entity, expiration) &&
		    !expiration)
			/*
			 * next_in_service unchanged or not causing
			 * any change in entity->parent->sd, and no
			 * requeueing needed for expiration: stop
			 * here.
			 */
			break;
	}
}

/**
 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
 *                       if needed, to have at least one entity eligible.
 * @st: the service tree to act upon.
 *
 * Assumes that st is not empty.
 */
static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
{
	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);

	if (bfq_gt(root_entity->min_start, st->vtime))
		return root_entity->min_start;

	return st->vtime;
}
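
/*
 * Illustrative example (not in the original source): if
 * st->vtime == 90 but the smallest start time among the active
 * entities is root_entity->min_start == 100, then no entity is
 * eligible at vtime 90, and the value 100 returned above lets the
 * caller jump the virtual time forward just enough to make at least
 * one entity eligible.
 */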

static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
{
	if (new_value > st->vtime) {
		st->vtime = new_value;
		bfq_forget_idle(st);
	}
}
1331
1332/**
1333 * bfq_first_active_entity - find the eligible entity with
1334 *                           the smallest finish time
1335 * @st: the service tree to select from.
1336 * @vtime: the system virtual to use as a reference for eligibility
1337 *
1338 * This function searches the first schedulable entity, starting from the
1339 * root of the tree and going on the left every time on this side there is
1340 * a subtree with at least one eligible (start <= vtime) entity. The path on
1341 * the right is followed only if a) the left subtree contains no eligible
1342 * entities and b) no eligible entity has been found yet.
1343 */
1344static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
1345                                                  u64 vtime)
1346{
1347        struct bfq_entity *entry, *first = NULL;
1348        struct rb_node *node = st->active.rb_node;
1349
1350        while (node) {
1351                entry = rb_entry(node, struct bfq_entity, rb_node);
1352left:
1353                if (!bfq_gt(entry->start, vtime))
1354                        first = entry;
1355
1356                if (node->rb_left) {
1357                        entry = rb_entry(node->rb_left,
1358                                         struct bfq_entity, rb_node);
1359                        if (!bfq_gt(entry->min_start, vtime)) {
1360                                node = node->rb_left;
1361                                goto left;
1362                        }
1363                }
1364                if (first)
1365                        break;
1366                node = node->rb_right;
1367        }
1368
1369        return first;
1370}
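
/*
 * Illustrative run with a hypothetical tree and vtime == 10. Nodes are
 * annotated start/min_start, where min_start caches the minimum start
 * time in the node's subtree:
 *
 *	        A: 12/5
 *	       /       \
 *	   B: 15/5      C: 13/13
 *	     /
 *	 D: 5/5
 *
 * At A: start 12 > 10, so A is not recorded; B's min_start 5 <= 10
 * promises an eligible entity on the left, so descend. At B: start
 * 15 > 10, again not recorded; D's min_start 5 <= 10, descend. At D:
 * start 5 <= 10, so D is recorded as first; D has no left child and
 * first is set, so the loop stops. D is returned: being leftmost in a
 * tree ordered by finish times, it has the smallest finish time among
 * the eligible entities.
 */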
1371
1372/**
1373 * __bfq_lookup_next_entity - return the first eligible entity in @st.
1374 * @st: the service tree.
1375 *
1376 * If there is no in-service entity for the sched_data st belongs to,
1377 * then return the entity that will be set in service if:
1378 * 1) the parent entity this st belongs to is set in service;
1379 * 2) no entity belonging to such parent entity undergoes a state change
1380 * that would influence the timestamps of the entity (e.g., becomes idle,
1381 * becomes backlogged, changes its budget, ...).
1382 *
1383 * In this first case, update the virtual time in @st too (see the
1384 * comments on this update inside the function).
1385 *
1386 * In contrast, if there is an in-service entity, then return the
1387 * entity that would be set in service if not only the above
1388 * conditions, but also the next one held true: the currently
1389 * in-service entity, on expiration,
1390 * 1) gets a finish time equal to the current one, or
1391 * 2) is not eligible any more, or
1392 * 3) is idle.
1393 */
1394static struct bfq_entity *
1395__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
1396{
1397        struct bfq_entity *entity;
1398        u64 new_vtime;
1399
1400        if (RB_EMPTY_ROOT(&st->active))
1401                return NULL;
1402
1403        /*
1404         * Get the value of the system virtual time for which at
1405         * least one entity is eligible.
1406         */
1407        new_vtime = bfq_calc_vtime_jump(st);
1408
1409        /*
1410         * If there is no in-service entity for the sched_data this
1411         * active tree belongs to, then push the system virtual time
1412         * up to the value that guarantees that at least one entity is
1413         * eligible. If, instead, there is an in-service entity, then
1414         * do not make any such update, because there is already an
1415         * eligible entity, namely the in-service one (even if the
1416         * entity is not on st, because it was extracted when set in
1417         * service).
1418         */
1419        if (!in_service)
1420                bfq_update_vtime(st, new_vtime);
1421
1422        entity = bfq_first_active_entity(st, new_vtime);
1423
1424        return entity;
1425}
1426
1427/**
1428 * bfq_lookup_next_entity - return the first eligible entity in @sd.
1429 * @sd: the sched_data.
1430 * @expiration: true if we are on the expiration path of the in-service queue
1431 *
1432 * This function is invoked when there has been a change in the trees
1433 * for sd, and we need to know what is the new next entity to serve
1434 * after this change.
1435 */
1436static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
1437                                                 bool expiration)
1438{
1439        struct bfq_service_tree *st = sd->service_tree;
1440        struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
1441        struct bfq_entity *entity = NULL;
1442        int class_idx = 0;
1443
1444        /*
1445         * Choose from the idle class, if needed to guarantee a minimum
1446         * bandwidth to this class (and if there is some active entity
1447         * in the idle class). This should also mitigate
1448         * priority-inversion problems in case a low priority task is
1449         * holding file system resources.
1450         */
1451        if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
1452                                   BFQ_CL_IDLE_TIMEOUT)) {
1453                if (!RB_EMPTY_ROOT(&idle_class_st->active))
1454                        class_idx = BFQ_IOPRIO_CLASSES - 1;
1455                /* About to be served if backlogged, or not yet backlogged */
1456                sd->bfq_class_idle_last_service = jiffies;
1457        }
1458
1459        /*
1460         * Find the next entity to serve for the highest-priority
1461         * class, unless the idle class needs to be served.
1462         */
1463        for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
1464                /*
1465                 * If expiration is true, then bfq_lookup_next_entity
1466                 * is being invoked as a part of the expiration path
1467                 * of the in-service queue. In this case, even if
1468                 * sd->in_service_entity is not NULL,
1469                 * sd->in_service_entity at this point is actually not
1470                 * in service any more, and, if needed, has already
1471                 * been properly queued or requeued into the right
1472                 * tree. The reason why sd->in_service_entity is still
1473                 * not NULL here, even if expiration is true, is that
1474                 * sd->in_service_entity is reset as the last step in the
1475                 * expiration path. So, if expiration is true, tell
1476                 * __bfq_lookup_next_entity that there is no
1477                 * sd->in_service_entity.
1478                 */
1479                entity = __bfq_lookup_next_entity(st + class_idx,
1480                                                  sd->in_service_entity &&
1481                                                  !expiration);
1482
1483                if (entity)
1484                        break;
1485        }
1486
1487        return entity;
1491}
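
/*
 * A note on the loop above, with a hypothetical scenario: the service
 * trees are ordered RT, BE, IDLE, so with class_idx == 0 the first
 * class with an eligible entity wins, in priority order. If, however,
 * the IDLE class has gone unserved for BFQ_CL_IDLE_TIMEOUT jiffies and
 * has an active entity, class_idx starts directly at the IDLE tree,
 * granting that class its minimum share of service despite its low
 * priority.
 */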
1492
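/*
 * Tell whether, at the root group, the queue that would be served next
 * differs from the one currently in service, so that callers can
 * decide whether preemption is worth triggering. A hypothetical
 * caller-side sketch (the actual preemption policy belongs to the
 * callers):
 *
 *	if (bfqd->in_service_queue && next_queue_may_preempt(bfqd))
 *		... expire the in-service queue and reschedule ...
 */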
1493bool next_queue_may_preempt(struct bfq_data *bfqd)
1494{
1495        struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
1496
1497        return sd->next_in_service != sd->in_service_entity;
1498}
1499
1500/*
1501 * Get next queue for service.
1502 */
1503struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
1504{
1505        struct bfq_entity *entity = NULL;
1506        struct bfq_sched_data *sd;
1507        struct bfq_queue *bfqq;
1508
1509        if (bfqd->busy_queues == 0)
1510                return NULL;
1511
1512        /*
1513         * Traverse the path from the root to the leaf entity to
1514         * serve. Set in service all the entities visited along the
1515         * way.
1516         */
1517        sd = &bfqd->root_group->sched_data;
1518        for (; sd ; sd = entity->my_sched_data) {
1519                /*
1520                 * WARNING. We are about to set the in-service entity
1521                 * to sd->next_in_service, i.e., to the (cached) value
1522                 * returned by bfq_lookup_next_entity(sd) the last
1523                 * time it was invoked, i.e., the last time when the
1524                 * service order in sd changed as a consequence of the
1525                 * activation or deactivation of an entity. In this
1526                 * respect, if we execute bfq_lookup_next_entity(sd)
1527                 * in this very moment, it may, although with low
1528                 * probability, yield a different entity than that
1529                 * pointed to by sd->next_in_service. This rare event
1530                 * happens in case there was no CLASS_IDLE entity to
1531                 * serve for sd when bfq_lookup_next_entity(sd) was
1532                 * invoked for the last time, while there is now one
1533                 * such entity.
1534                 *
1535                 * If the above event happens, then the scheduling of
1536                 * such entity in CLASS_IDLE is postponed until the
1537                 * service of the sd->next_in_service entity
1538                 * finishes. In fact, when the latter is expired,
1539                 * bfq_lookup_next_entity(sd) gets called again,
1540                 * exactly to update sd->next_in_service.
1541                 */
1542
1543                /* Make next_in_service entity become in_service_entity */
1544                entity = sd->next_in_service;
1545                sd->in_service_entity = entity;
1546
1547                /*
1548                 * If entity is no longer a candidate for next
1549                 * service, then it must be extracted from its active
1550                 * tree, so as to make sure that it won't be
1551                 * considered when computing next_in_service. See the
1552                 * comments on the function
1553                 * bfq_no_longer_next_in_service() for details.
1554                 */
1555                if (bfq_no_longer_next_in_service(entity))
1556                        bfq_active_extract(bfq_entity_service_tree(entity),
1557                                           entity);
1558
1559                /*
1560                 * Even if entity is not to be extracted according to
1561                 * the above check, a descendant entity may get
1562                 * extracted in one of the next iterations of this
1563                 * loop. Such an event could cause a change in
1564                 * next_in_service for the level of the descendant
1565                 * entity, and thus possibly back to this level.
1566                 *
1567                 * However, we cannot perform the resulting needed
1568                 * update of next_in_service for this level before the
1569                 * end of the whole loop, because, to know which is
1570                 * the correct next-to-serve candidate entity for each
1571                 * level, we need first to find the leaf entity to set
1572                 * in service. In fact, only after we know which is
1573                 * the next-to-serve leaf entity, we can discover
1574                 * whether the parent entity of the leaf entity
1575                 * becomes the next-to-serve, and so on.
1576                 */
1577        }
1578
1579        bfqq = bfq_entity_to_bfqq(entity);
1580
1581        /*
1582         * We can finally update all next-to-serve entities along the
1583         * path from the leaf entity just set in service to the root.
1584         */
1585        for_each_entity(entity) {
1586                struct bfq_sched_data *sd = entity->sched_data;
1587
1588                if (!bfq_update_next_in_service(sd, NULL, false))
1589                        break;
1590        }
1591
1592        return bfqq;
1593}
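
/*
 * Illustrative walk-through with a hypothetical two-level hierarchy,
 * root sched_data -> group entity G -> queue entity Q:
 *
 * 1) The descent loop sets in_service_entity = next_in_service at the
 *    root (choosing G), then inside G's sched_data (choosing Q),
 *    extracting from its active tree any entity that is no longer a
 *    candidate for next service.
 * 2) bfqq is then the queue associated with the leaf entity Q.
 * 3) The final loop climbs from Q back towards the root, recomputing
 *    next_in_service at each level, and stops at the first level whose
 *    next_in_service does not change in a way that matters upwards.
 */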
1594
1595void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
1596{
1597        struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
1598        struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
1599        struct bfq_entity *entity = in_serv_entity;
1600
1601        bfq_clear_bfqq_wait_request(in_serv_bfqq);
1602        hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
1603        bfqd->in_service_queue = NULL;
1604
1605        /*
1606         * When this function is called, all in-service entities have
1607         * been properly deactivated or requeued, so we can safely
1608         * execute the final step: reset in_service_entity along the
1609         * path from entity to the root.
1610         */
1611        for_each_entity(entity)
1612                entity->sched_data->in_service_entity = NULL;
1613
1614        /*
1615         * in_serv_entity is no longer in service, so, if it is not on
1616         * any service tree either, release the service reference to
1617         * the queue it represents (taken with bfq_get_entity).
1618         */
1619        if (!in_serv_entity->on_st)
1620                bfq_put_queue(in_serv_bfqq);
1621}
1622
1623void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1624                         bool ins_into_idle_tree, bool expiration)
1625{
1626        struct bfq_entity *entity = &bfqq->entity;
1627
1628        bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
1629}
1630
1631void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1632{
1633        struct bfq_entity *entity = &bfqq->entity;
1634
1635        bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
1636                                    false, false);
1637        bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
1638}
1639
1640void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1641                      bool expiration)
1642{
1643        struct bfq_entity *entity = &bfqq->entity;
1644
1645        bfq_activate_requeue_entity(entity, false,
1646                                    bfqq == bfqd->in_service_queue, expiration);
1647}
1648
1649/*
1650 * Called when the bfqq no longer has requests pending; removes it from
1651 * the service tree. As a special case, it can be invoked during an
1652 * expiration.
1653 */
1654void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
1655                       bool expiration)
1656{
1657        bfq_log_bfqq(bfqd, bfqq, "del from busy");
1658
1659        bfq_clear_bfqq_busy(bfqq);
1660
1661        bfqd->busy_queues--;
1662
1663        if (!bfqq->dispatched)
1664                bfq_weights_tree_remove(bfqd, bfqq);
1665
1666        if (bfqq->wr_coeff > 1)
1667                bfqd->wr_busy_queues--;
1668
1669        bfqg_stats_update_dequeue(bfqq_group(bfqq));
1670
1671        bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
1672}
1673
1674/*
1675 * Called when an inactive queue receives a new request.
1676 */
1677void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1678{
1679        bfq_log_bfqq(bfqd, bfqq, "add to busy");
1680
1681        bfq_activate_bfqq(bfqd, bfqq);
1682
1683        bfq_mark_bfqq_busy(bfqq);
1684        bfqd->busy_queues++;
1685
1686        if (!bfqq->dispatched && bfqq->wr_coeff == 1)
1687                bfq_weights_tree_add(bfqd, &bfqq->entity,
1688                                     &bfqd->queue_weights_tree);
1690
1691        if (bfqq->wr_coeff > 1)
1692                bfqd->wr_busy_queues++;
1693}
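
/*
 * Bookkeeping sketch for bfq_add_bfqq_busy/bfq_del_bfqq_busy (the
 * invariants below are inferred from the code above, for illustration
 * only): after bfq_add_bfqq_busy, busy_queues counts bfqq,
 * wr_busy_queues counts it iff bfqq->wr_coeff > 1, and
 * queue_weights_tree holds its weight iff the queue is not
 * weight-raised and has no requests being dispatched.
 * bfq_del_bfqq_busy reverses these updates before deactivating the
 * queue's entity.
 */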
1694