linux/fs/btrfs/reada.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "transaction.h"
#include "dev-replace.h"
#include "block-group.h"

#undef DEBUG

/*
 * This is the implementation for the generic read ahead framework.
 *
 * To trigger a readahead, btrfs_reada_add must be called. It will start
 * a read ahead for the given range [key_start, key_end) on the given tree
 * root. The returned handle can either be used to wait on the readahead to
 * finish (btrfs_reada_wait), or to send it to the background
 * (btrfs_reada_detach).
 *
 * The read ahead works as follows:
 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
 * reada_start_machine will then search for extents to prefetch and trigger
 * some reads. When a read finishes for a node, all contained node/leaf
 * pointers that lie in the given range will also be enqueued. The reads will
 * be triggered in sequential order, thus giving a big win over a naive
 * enumeration. It will also make use of multi-device layouts. Each disk
 * will have its own read pointer and all disks will be utilized in parallel.
 * Also, no two disks will read both sides of a mirror simultaneously, as this
 * would waste seeking capacity. Instead both disks will read different parts
 * of the filesystem.
 * Any number of readaheads can be started in parallel. The read order will be
 * determined globally, i.e. 2 parallel readaheads will normally finish faster
 * than the same 2 started one after another.
 */
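
/*
 * Typical usage, as a minimal sketch of the interface below (error handling
 * elided); the returned handle is either waited on (btrfs_reada_wait) or
 * sent to the background (btrfs_reada_detach):
 *
 *	struct reada_control *rc;
 *
 *	rc = btrfs_reada_add(root, &key_start, &key_end);
 *	if (!IS_ERR(rc))
 *		btrfs_reada_wait(rc);
 */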

#define MAX_IN_FLIGHT 6 /* max in-flight readahead requests per device */

struct reada_extctl {
        struct list_head        list;
        struct reada_control    *rc;
        u64                     generation;
};

struct reada_extent {
        u64                     logical;
        u64                     owner_root;
        struct btrfs_key        top;
        struct list_head        extctl;
        int                     refcnt;
        spinlock_t              lock;
        struct reada_zone       *zones[BTRFS_MAX_MIRRORS];
        int                     nzones;
        int                     scheduled;
        int                     level;
};
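
/*
 * A note on indexing, derived from reada_find_extent() below: each
 * reada_extent is tracked both in fs_info->reada_tree and in the
 * reada_extents tree of every participating device, keyed in all cases by
 * logical >> fs_info->sectorsize_bits.
 */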

struct reada_zone {
        u64                     start;
        u64                     end;
        u64                     elems;
        struct list_head        list;
        spinlock_t              lock;
        int                     locked;
        struct btrfs_device     *device;
        struct btrfs_device     *devs[BTRFS_MAX_MIRRORS]; /* full list, incl
                                                           * self */
        int                     ndevs;
        struct kref             refcnt;
};
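
/*
 * Likewise derived from reada_find_zone() below: zones live in the
 * per-device radix tree dev->reada_zones, keyed by
 * zone->end >> fs_info->sectorsize_bits, so a gang lookup starting at a
 * logical address finds the zone that covers it, if one exists.
 */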

struct reada_machine_work {
        struct btrfs_work       work;
        struct btrfs_fs_info    *fs_info;
};

static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
static void reada_control_release(struct kref *kref);
static void reada_zone_release(struct kref *kref);
static void reada_start_machine(struct btrfs_fs_info *fs_info);
static void __reada_start_machine(struct btrfs_fs_info *fs_info);

static int reada_add_block(struct reada_control *rc, u64 logical,
                           struct btrfs_key *top, u64 owner_root,
                           u64 generation, int level);

/* recurses */
/* in case of error, eb may be NULL */
static void __readahead_hook(struct btrfs_fs_info *fs_info,
                             struct reada_extent *re, struct extent_buffer *eb,
                             int err)
{
        int nritems;
        int i;
        u64 bytenr;
        u64 generation;
        struct list_head list;

        spin_lock(&re->lock);
        /*
         * Just take the full list from the extent; afterwards we
         * don't need the lock anymore.
         */
        list_replace_init(&re->extctl, &list);
        re->scheduled = 0;
        spin_unlock(&re->lock);

        /*
         * This is the error case: the extent buffer has not been
         * read correctly. We won't access anything from it and
         * just clean up our data structures. Effectively this will
         * cut the branch below this node from read ahead.
         */
        if (err)
                goto cleanup;

        /*
         * FIXME: currently we just set nritems to 0 if this is a leaf,
         * effectively ignoring the content. As a next step we could
         * trigger more readahead depending on the content, e.g.
         * fetch the checksums for the extents in the leaf.
         */
        if (!btrfs_header_level(eb))
                goto cleanup;

        nritems = btrfs_header_nritems(eb);
        generation = btrfs_header_generation(eb);
        for (i = 0; i < nritems; i++) {
                struct reada_extctl *rec;
                u64 n_gen;
                struct btrfs_key key;
                struct btrfs_key next_key;

                btrfs_node_key_to_cpu(eb, &key, i);
                if (i + 1 < nritems)
                        btrfs_node_key_to_cpu(eb, &next_key, i + 1);
                else
                        next_key = re->top;
                bytenr = btrfs_node_blockptr(eb, i);
                n_gen = btrfs_node_ptr_generation(eb, i);

                list_for_each_entry(rec, &list, list) {
                        struct reada_control *rc = rec->rc;

                        /*
                         * if the generation doesn't match, just ignore this
                         * extctl. This will probably cut off a branch from
                         * prefetch. Alternatively one could start a new (sub-)
                         * prefetch for this branch, starting again from root.
                         * FIXME: move the generation check out of this loop
                         */
#ifdef DEBUG
                        if (rec->generation != generation) {
                                btrfs_debug(fs_info,
                                            "generation mismatch for (%llu,%d,%llu) %llu != %llu",
                                            key.objectid, key.type, key.offset,
                                            rec->generation, generation);
                        }
#endif
                        if (rec->generation == generation &&
                            btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
                            btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
                                reada_add_block(rc, bytenr, &next_key,
                                                btrfs_header_owner(eb), n_gen,
                                                btrfs_header_level(eb) - 1);
                }
        }

cleanup:
        /*
         * free extctl records
         */
        while (!list_empty(&list)) {
                struct reada_control *rc;
                struct reada_extctl *rec;

                rec = list_first_entry(&list, struct reada_extctl, list);
                list_del(&rec->list);
                rc = rec->rc;
                kfree(rec);

                kref_get(&rc->refcnt);
                if (atomic_dec_and_test(&rc->elems)) {
                        kref_put(&rc->refcnt, reada_control_release);
                        wake_up(&rc->wait);
                }
                kref_put(&rc->refcnt, reada_control_release);

                reada_extent_put(fs_info, re);  /* one ref for each entry */
        }
}

int btree_readahead_hook(struct extent_buffer *eb, int err)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        int ret = 0;
        struct reada_extent *re;

        /* find extent */
        spin_lock(&fs_info->reada_lock);
        re = radix_tree_lookup(&fs_info->reada_tree,
                               eb->start >> fs_info->sectorsize_bits);
        if (re)
                re->refcnt++;
        spin_unlock(&fs_info->reada_lock);
        if (!re) {
                ret = -1;
                goto start_machine;
        }

        __readahead_hook(fs_info, re, eb, err);
        reada_extent_put(fs_info, re);  /* our ref */

start_machine:
        reada_start_machine(fs_info);
        return ret;
}

static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
                                          struct btrfs_bio *bbio)
{
        struct btrfs_fs_info *fs_info = dev->fs_info;
        int ret;
        struct reada_zone *zone;
        struct btrfs_block_group *cache = NULL;
        u64 start;
        u64 end;
        int i;

        zone = NULL;
        spin_lock(&fs_info->reada_lock);
        ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
                                     logical >> fs_info->sectorsize_bits, 1);
        if (ret == 1 && logical >= zone->start && logical <= zone->end) {
                kref_get(&zone->refcnt);
                spin_unlock(&fs_info->reada_lock);
                return zone;
        }

        spin_unlock(&fs_info->reada_lock);

        cache = btrfs_lookup_block_group(fs_info, logical);
        if (!cache)
                return NULL;

        start = cache->start;
        end = start + cache->length - 1;
        btrfs_put_block_group(cache);

        zone = kzalloc(sizeof(*zone), GFP_KERNEL);
        if (!zone)
                return NULL;

        ret = radix_tree_preload(GFP_KERNEL);
        if (ret) {
                kfree(zone);
                return NULL;
        }

        zone->start = start;
        zone->end = end;
        INIT_LIST_HEAD(&zone->list);
        spin_lock_init(&zone->lock);
        zone->locked = 0;
        kref_init(&zone->refcnt);
        zone->elems = 0;
        zone->device = dev; /* our device always sits at index 0 */
        for (i = 0; i < bbio->num_stripes; ++i) {
                /* bounds have already been checked */
                zone->devs[i] = bbio->stripes[i].dev;
        }
        zone->ndevs = bbio->num_stripes;

        spin_lock(&fs_info->reada_lock);
        ret = radix_tree_insert(&dev->reada_zones,
                        (unsigned long)(zone->end >> fs_info->sectorsize_bits),
                        zone);

        if (ret == -EEXIST) {
                kfree(zone);
                ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
                                        logical >> fs_info->sectorsize_bits, 1);
                if (ret == 1 && logical >= zone->start && logical <= zone->end)
                        kref_get(&zone->refcnt);
                else
                        zone = NULL;
        }
        spin_unlock(&fs_info->reada_lock);
        radix_tree_preload_end();

        return zone;
}

static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
                                              u64 logical,
                                              struct btrfs_key *top,
                                              u64 owner_root, int level)
{
        int ret;
        struct reada_extent *re = NULL;
        struct reada_extent *re_exist = NULL;
        struct btrfs_bio *bbio = NULL;
        struct btrfs_device *dev;
        struct btrfs_device *prev_dev;
        u64 length;
        int real_stripes;
        int nzones = 0;
        unsigned long index = logical >> fs_info->sectorsize_bits;
        int dev_replace_is_ongoing;
        int have_zone = 0;

        spin_lock(&fs_info->reada_lock);
        re = radix_tree_lookup(&fs_info->reada_tree, index);
        if (re)
                re->refcnt++;
        spin_unlock(&fs_info->reada_lock);

        if (re)
                return re;

        re = kzalloc(sizeof(*re), GFP_KERNEL);
        if (!re)
                return NULL;

        re->logical = logical;
        re->top = *top;
        INIT_LIST_HEAD(&re->extctl);
        spin_lock_init(&re->lock);
        re->refcnt = 1;
        re->owner_root = owner_root;
        re->level = level;

        /*
         * map block
         */
        length = fs_info->nodesize;
        ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
                        &length, &bbio, 0);
        if (ret || !bbio || length < fs_info->nodesize)
                goto error;

        if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
                btrfs_err(fs_info,
                           "readahead: more than %d copies not supported",
                           BTRFS_MAX_MIRRORS);
                goto error;
        }

        real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
        for (nzones = 0; nzones < real_stripes; ++nzones) {
                struct reada_zone *zone;

                dev = bbio->stripes[nzones].dev;

                /* cannot read ahead on missing device. */
                if (!dev->bdev)
                        continue;

                zone = reada_find_zone(dev, logical, bbio);
                if (!zone)
                        continue;

                re->zones[re->nzones++] = zone;
                spin_lock(&zone->lock);
                if (!zone->elems)
                        kref_get(&zone->refcnt);
                ++zone->elems;
                spin_unlock(&zone->lock);
                spin_lock(&fs_info->reada_lock);
                kref_put(&zone->refcnt, reada_zone_release);
                spin_unlock(&fs_info->reada_lock);
        }
        if (re->nzones == 0) {
                /* not a single zone found, error and out */
                goto error;
        }

        /* Insert extent in reada tree + all per-device trees, all or nothing */
        down_read(&fs_info->dev_replace.rwsem);
        ret = radix_tree_preload(GFP_KERNEL);
        if (ret) {
                up_read(&fs_info->dev_replace.rwsem);
                goto error;
        }

        spin_lock(&fs_info->reada_lock);
        ret = radix_tree_insert(&fs_info->reada_tree, index, re);
        if (ret == -EEXIST) {
                re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
                re_exist->refcnt++;
                spin_unlock(&fs_info->reada_lock);
                radix_tree_preload_end();
                up_read(&fs_info->dev_replace.rwsem);
                goto error;
        }
        if (ret) {
                spin_unlock(&fs_info->reada_lock);
                radix_tree_preload_end();
                up_read(&fs_info->dev_replace.rwsem);
                goto error;
        }
        radix_tree_preload_end();
        prev_dev = NULL;
        dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
                        &fs_info->dev_replace);
        for (nzones = 0; nzones < re->nzones; ++nzones) {
                dev = re->zones[nzones]->device;

                if (dev == prev_dev) {
                        /*
                         * in case of DUP, just add the first zone. As both
                         * are on the same device, there's nothing to gain
                         * from adding both.
                         * Also, it wouldn't work, as the tree is per device
                         * and adding would fail with EEXIST
                         */
                        continue;
                }
                if (!dev->bdev)
                        continue;

                if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state))
                        continue;

                if (dev_replace_is_ongoing &&
                    dev == fs_info->dev_replace.tgtdev) {
                        /*
                         * as this device is selected for reading only as
                         * a last resort, skip it for read ahead.
                         */
                        continue;
                }
                prev_dev = dev;
                ret = radix_tree_insert(&dev->reada_extents, index, re);
                if (ret) {
                        while (--nzones >= 0) {
                                dev = re->zones[nzones]->device;
                                BUG_ON(dev == NULL);
                                /* ignore whether the entry was inserted */
                                radix_tree_delete(&dev->reada_extents, index);
                        }
                        radix_tree_delete(&fs_info->reada_tree, index);
                        spin_unlock(&fs_info->reada_lock);
                        up_read(&fs_info->dev_replace.rwsem);
                        goto error;
                }
                have_zone = 1;
        }
        if (!have_zone)
                radix_tree_delete(&fs_info->reada_tree, index);
        spin_unlock(&fs_info->reada_lock);
        up_read(&fs_info->dev_replace.rwsem);

        if (!have_zone)
                goto error;

        btrfs_put_bbio(bbio);
        return re;

error:
        for (nzones = 0; nzones < re->nzones; ++nzones) {
                struct reada_zone *zone;

                zone = re->zones[nzones];
                kref_get(&zone->refcnt);
                spin_lock(&zone->lock);
                --zone->elems;
                if (zone->elems == 0) {
                        /*
                         * no fs_info->reada_lock needed, as this can't be
                         * the last ref
                         */
                        kref_put(&zone->refcnt, reada_zone_release);
                }
                spin_unlock(&zone->lock);

                spin_lock(&fs_info->reada_lock);
                kref_put(&zone->refcnt, reada_zone_release);
                spin_unlock(&fs_info->reada_lock);
        }
        btrfs_put_bbio(bbio);
        kfree(re);
        return re_exist;
}

static void reada_extent_put(struct btrfs_fs_info *fs_info,
                             struct reada_extent *re)
{
        int i;
        unsigned long index = re->logical >> fs_info->sectorsize_bits;

        spin_lock(&fs_info->reada_lock);
        if (--re->refcnt) {
                spin_unlock(&fs_info->reada_lock);
                return;
        }

        radix_tree_delete(&fs_info->reada_tree, index);
        for (i = 0; i < re->nzones; ++i) {
                struct reada_zone *zone = re->zones[i];

                radix_tree_delete(&zone->device->reada_extents, index);
        }

        spin_unlock(&fs_info->reada_lock);

        for (i = 0; i < re->nzones; ++i) {
                struct reada_zone *zone = re->zones[i];

                kref_get(&zone->refcnt);
                spin_lock(&zone->lock);
                --zone->elems;
                if (zone->elems == 0) {
                        /*
                         * no fs_info->reada_lock needed, as this can't be
                         * the last ref
                         */
                        kref_put(&zone->refcnt, reada_zone_release);
                }
                spin_unlock(&zone->lock);

                spin_lock(&fs_info->reada_lock);
                kref_put(&zone->refcnt, reada_zone_release);
                spin_unlock(&fs_info->reada_lock);
        }

        kfree(re);
}

static void reada_zone_release(struct kref *kref)
{
        struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
        struct btrfs_fs_info *fs_info = zone->device->fs_info;

        lockdep_assert_held(&fs_info->reada_lock);

        radix_tree_delete(&zone->device->reada_zones,
                          zone->end >> fs_info->sectorsize_bits);

        kfree(zone);
}

static void reada_control_release(struct kref *kref)
{
        struct reada_control *rc = container_of(kref, struct reada_control,
                                                refcnt);

        kfree(rc);
}

static int reada_add_block(struct reada_control *rc, u64 logical,
                           struct btrfs_key *top, u64 owner_root,
                           u64 generation, int level)
{
        struct btrfs_fs_info *fs_info = rc->fs_info;
        struct reada_extent *re;
        struct reada_extctl *rec;

        /* takes one ref */
        re = reada_find_extent(fs_info, logical, top, owner_root, level);
        if (!re)
                return -1;

        rec = kzalloc(sizeof(*rec), GFP_KERNEL);
        if (!rec) {
                reada_extent_put(fs_info, re);
                return -ENOMEM;
        }

        rec->rc = rc;
        rec->generation = generation;
        atomic_inc(&rc->elems);

        spin_lock(&re->lock);
        list_add_tail(&rec->list, &re->extctl);
        spin_unlock(&re->lock);

        /* leave the ref on the extent */

        return 0;
}
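
/*
 * Note: the reference taken by reada_find_extent() above is deliberately
 * kept; it is dropped in __readahead_hook() when the queued extctl record
 * is consumed (one ref per entry on the extctl list).
 */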

/*
 * called with fs_info->reada_lock held
 */
static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
{
        int i;
        unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits;

        for (i = 0; i < zone->ndevs; ++i) {
                struct reada_zone *peer;

                peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
                if (peer && peer->device != zone->device)
                        peer->locked = lock;
        }
}

/*
 * called with fs_info->reada_lock held
 */
static int reada_pick_zone(struct btrfs_device *dev)
{
        struct reada_zone *top_zone = NULL;
        struct reada_zone *top_locked_zone = NULL;
        u64 top_elems = 0;
        u64 top_locked_elems = 0;
        unsigned long index = 0;
        int ret;

        if (dev->reada_curr_zone) {
                reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
                kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
                dev->reada_curr_zone = NULL;
        }
        /* pick the zone with the most elements */
        while (1) {
                struct reada_zone *zone;

                ret = radix_tree_gang_lookup(&dev->reada_zones,
                                             (void **)&zone, index, 1);
                if (ret == 0)
                        break;
                index = (zone->end >> dev->fs_info->sectorsize_bits) + 1;
                if (zone->locked) {
                        if (zone->elems > top_locked_elems) {
                                top_locked_elems = zone->elems;
                                top_locked_zone = zone;
                        }
                } else {
                        if (zone->elems > top_elems) {
                                top_elems = zone->elems;
                                top_zone = zone;
                        }
                }
        }
        if (top_zone)
                dev->reada_curr_zone = top_zone;
        else if (top_locked_zone)
                dev->reada_curr_zone = top_locked_zone;
        else
                return 0;

        dev->reada_next = dev->reada_curr_zone->start;
        kref_get(&dev->reada_curr_zone->refcnt);
        reada_peer_zones_set_lock(dev->reada_curr_zone, 1);

        return 1;
}

static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
                                    u64 owner_root, int level, int mirror_num,
                                    struct extent_buffer **eb)
{
        struct extent_buffer *buf = NULL;
        int ret;

        buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
        if (IS_ERR(buf))
                return 0;

        set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);

        ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
        if (ret) {
                free_extent_buffer_stale(buf);
                return ret;
        }

        if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
                free_extent_buffer_stale(buf);
                return -EIO;
        } else if (extent_buffer_uptodate(buf)) {
                *eb = buf;
        } else {
                free_extent_buffer(buf);
        }
        return 0;
}

static int reada_start_machine_dev(struct btrfs_device *dev)
{
        struct btrfs_fs_info *fs_info = dev->fs_info;
        struct reada_extent *re = NULL;
        int mirror_num = 0;
        struct extent_buffer *eb = NULL;
        u64 logical;
        int ret;
        int i;

        spin_lock(&fs_info->reada_lock);
        if (dev->reada_curr_zone == NULL) {
                ret = reada_pick_zone(dev);
                if (!ret) {
                        spin_unlock(&fs_info->reada_lock);
                        return 0;
                }
        }
        /*
         * FIXME currently we issue the reads one extent at a time. If we have
         * a contiguous block of extents, we could also coalesce them or use
         * plugging to speed things up
         */
        ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
                                dev->reada_next >> fs_info->sectorsize_bits, 1);
        if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
                ret = reada_pick_zone(dev);
                if (!ret) {
                        spin_unlock(&fs_info->reada_lock);
                        return 0;
                }
                re = NULL;
                ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
                                dev->reada_next >> fs_info->sectorsize_bits, 1);
        }
        if (ret == 0) {
                spin_unlock(&fs_info->reada_lock);
                return 0;
        }
        dev->reada_next = re->logical + fs_info->nodesize;
        re->refcnt++;

        spin_unlock(&fs_info->reada_lock);

        spin_lock(&re->lock);
        if (re->scheduled || list_empty(&re->extctl)) {
                spin_unlock(&re->lock);
                reada_extent_put(fs_info, re);
                return 0;
        }
        re->scheduled = 1;
        spin_unlock(&re->lock);

        /*
         * find mirror num
         */
        for (i = 0; i < re->nzones; ++i) {
                if (re->zones[i]->device == dev) {
                        mirror_num = i + 1;
                        break;
                }
        }
        logical = re->logical;

        atomic_inc(&dev->reada_in_flight);
        ret = reada_tree_block_flagged(fs_info, logical, re->owner_root,
                                       re->level, mirror_num, &eb);
        if (ret)
                __readahead_hook(fs_info, re, NULL, ret);
        else if (eb)
                __readahead_hook(fs_info, re, eb, ret);

        if (eb)
                free_extent_buffer(eb);

        atomic_dec(&dev->reada_in_flight);
        reada_extent_put(fs_info, re);

        return 1;
}

static void reada_start_machine_worker(struct btrfs_work *work)
{
        struct reada_machine_work *rmw;
        int old_ioprio;

        rmw = container_of(work, struct reada_machine_work, work);

        old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
                                       task_nice_ioprio(current));
        set_task_ioprio(current, BTRFS_IOPRIO_READA);
        __reada_start_machine(rmw->fs_info);
        set_task_ioprio(current, old_ioprio);

        atomic_dec(&rmw->fs_info->reada_works_cnt);

        kfree(rmw);
}

/* Try to start up to 10k READA requests for a group of devices */
static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
{
        u64 enqueued;
        u64 total = 0;
        struct btrfs_device *device;

        do {
                enqueued = 0;
                list_for_each_entry(device, &fs_devices->devices, dev_list) {
                        if (atomic_read(&device->reada_in_flight) <
                            MAX_IN_FLIGHT)
                                enqueued += reada_start_machine_dev(device);
                }
                total += enqueued;
        } while (enqueued && total < 10000);

        return total;
}

static void __reada_start_machine(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
        int i;
        u64 enqueued = 0;

        mutex_lock(&fs_devices->device_list_mutex);

        enqueued += reada_start_for_fsdevs(fs_devices);
        list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
                enqueued += reada_start_for_fsdevs(seed_devs);

        mutex_unlock(&fs_devices->device_list_mutex);
        if (enqueued == 0)
                return;

        /*
         * If everything is already in the cache, this is effectively single
         * threaded. To a) not hold the caller for too long and b) utilize
         * more cores, we break the loop above after 10000 iterations and
         * enqueue the remainder to workers. This distributes the load across
         * the cores.
         */
        for (i = 0; i < 2; ++i) {
                reada_start_machine(fs_info);
                if (atomic_read(&fs_info->reada_works_cnt) >
                    BTRFS_MAX_MIRRORS * 2)
                        break;
        }
}

static void reada_start_machine(struct btrfs_fs_info *fs_info)
{
        struct reada_machine_work *rmw;

        rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
        if (!rmw) {
                /* FIXME we cannot handle this properly right now */
                BUG();
        }
        btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
        rmw->fs_info = fs_info;

        btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
        atomic_inc(&fs_info->reada_works_cnt);
}

#ifdef DEBUG
static void dump_devs(struct btrfs_fs_info *fs_info, int all)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        unsigned long index;
        int ret;
        int i;
        int j;
        int cnt;

        spin_lock(&fs_info->reada_lock);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                btrfs_debug(fs_info, "dev %lld has %d in flight", device->devid,
                        atomic_read(&device->reada_in_flight));
                index = 0;
                while (1) {
                        struct reada_zone *zone;

                        ret = radix_tree_gang_lookup(&device->reada_zones,
                                                     (void **)&zone, index, 1);
                        if (ret == 0)
                                break;
                        pr_debug("  zone %llu-%llu elems %llu locked %d devs",
                                    zone->start, zone->end, zone->elems,
                                    zone->locked);
                        for (j = 0; j < zone->ndevs; ++j) {
                                pr_cont(" %lld",
                                        zone->devs[j]->devid);
                        }
                        if (device->reada_curr_zone == zone)
                                pr_cont(" curr off %llu",
                                        device->reada_next - zone->start);
                        pr_cont("\n");
                        index = (zone->end >> fs_info->sectorsize_bits) + 1;
                }
                cnt = 0;
                index = 0;
                while (all) {
                        struct reada_extent *re = NULL;

                        ret = radix_tree_gang_lookup(&device->reada_extents,
                                                     (void **)&re, index, 1);
                        if (ret == 0)
                                break;
                        pr_debug("  re: logical %llu size %u empty %d scheduled %d",
                                re->logical, fs_info->nodesize,
                                list_empty(&re->extctl), re->scheduled);

                        for (i = 0; i < re->nzones; ++i) {
                                pr_cont(" zone %llu-%llu devs",
                                        re->zones[i]->start,
                                        re->zones[i]->end);
                                for (j = 0; j < re->zones[i]->ndevs; ++j) {
                                        pr_cont(" %lld",
                                                re->zones[i]->devs[j]->devid);
                                }
                        }
                        pr_cont("\n");
                        index = (re->logical >> fs_info->sectorsize_bits) + 1;
                        if (++cnt > 15)
                                break;
                }
        }

        index = 0;
        cnt = 0;
        while (all) {
                struct reada_extent *re = NULL;

                ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
                                             index, 1);
                if (ret == 0)
                        break;
                if (!re->scheduled) {
                        index = (re->logical >> fs_info->sectorsize_bits) + 1;
                        continue;
                }
                pr_debug("re: logical %llu size %u list empty %d scheduled %d",
                        re->logical, fs_info->nodesize,
                        list_empty(&re->extctl), re->scheduled);
                for (i = 0; i < re->nzones; ++i) {
                        pr_cont(" zone %llu-%llu devs",
                                re->zones[i]->start,
                                re->zones[i]->end);
                        for (j = 0; j < re->zones[i]->ndevs; ++j) {
                                pr_cont(" %lld",
                                       re->zones[i]->devs[j]->devid);
                        }
                }
                pr_cont("\n");
                index = (re->logical >> fs_info->sectorsize_bits) + 1;
        }
        spin_unlock(&fs_info->reada_lock);
}
#endif

/*
 * interface
 */
struct reada_control *btrfs_reada_add(struct btrfs_root *root,
                        struct btrfs_key *key_start, struct btrfs_key *key_end)
{
        struct reada_control *rc;
        u64 start;
        u64 generation;
        int ret;
        int level;
        struct extent_buffer *node;
        static struct btrfs_key max_key = {
                .objectid = (u64)-1,
                .type = (u8)-1,
                .offset = (u64)-1
        };

        rc = kzalloc(sizeof(*rc), GFP_KERNEL);
        if (!rc)
                return ERR_PTR(-ENOMEM);

        rc->fs_info = root->fs_info;
        rc->key_start = *key_start;
        rc->key_end = *key_end;
        atomic_set(&rc->elems, 0);
        init_waitqueue_head(&rc->wait);
        kref_init(&rc->refcnt);
        kref_get(&rc->refcnt); /* one ref for having elements */

        node = btrfs_root_node(root);
        start = node->start;
        generation = btrfs_header_generation(node);
        level = btrfs_header_level(node);
        free_extent_buffer(node);

        ret = reada_add_block(rc, start, &max_key, root->root_key.objectid,
                              generation, level);
        if (ret) {
                kfree(rc);
                return ERR_PTR(ret);
        }

        reada_start_machine(root->fs_info);

        return rc;
}

#ifdef DEBUG
int btrfs_reada_wait(void *handle)
{
        struct reada_control *rc = handle;
        struct btrfs_fs_info *fs_info = rc->fs_info;

        while (atomic_read(&rc->elems)) {
                if (!atomic_read(&fs_info->reada_works_cnt))
                        reada_start_machine(fs_info);
                wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
                                   5 * HZ);
                dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
        }

        dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);

        kref_put(&rc->refcnt, reada_control_release);

        return 0;
}
#else
int btrfs_reada_wait(void *handle)
{
        struct reada_control *rc = handle;
        struct btrfs_fs_info *fs_info = rc->fs_info;

        while (atomic_read(&rc->elems)) {
                if (!atomic_read(&fs_info->reada_works_cnt))
                        reada_start_machine(fs_info);
                wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
                                   (HZ + 9) / 10);
        }

        kref_put(&rc->refcnt, reada_control_release);

        return 0;
}
#endif

void btrfs_reada_detach(void *handle)
{
        struct reada_control *rc = handle;

        kref_put(&rc->refcnt, reada_control_release);
}

/*
 * Before removing a device (device replace or device remove ioctls), call this
 * function to wait for all existing readahead requests on the device and to
 * make sure no one queues more readahead requests for the device.
 *
 * Must be called while holding neither the device list mutex nor the device
 * replace semaphore, otherwise it will deadlock.
 */
void btrfs_reada_remove_dev(struct btrfs_device *dev)
{
        struct btrfs_fs_info *fs_info = dev->fs_info;

        /* Serialize with readahead extent creation at reada_find_extent(). */
        spin_lock(&fs_info->reada_lock);
        set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
        spin_unlock(&fs_info->reada_lock);

        /*
         * There might be readahead requests added to the radix trees which
         * were not yet added to the readahead work queue. We need to start
         * them and wait for their completion, otherwise we can end up with
         * use-after-free problems when dropping the last reference on the
         * readahead extents and their zones, as they need to access the
         * device structure.
         */
        reada_start_machine(fs_info);
        btrfs_flush_workqueue(fs_info->readahead_workers);
}

/*
 * If an error happens while removing a device (device replace or device
 * remove ioctls) after calling btrfs_reada_remove_dev(), call this to undo
 * what that function did. This is safe to call even if
 * btrfs_reada_remove_dev() was not called before.
 */
void btrfs_reada_undo_remove_dev(struct btrfs_device *dev)
{
        spin_lock(&dev->fs_info->reada_lock);
        clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
        spin_unlock(&dev->fs_info->reada_lock);
}
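
/*
 * A sketch of the caller pattern the two functions above are designed for;
 * do_removal() is a hypothetical stand-in for the actual removal step:
 *
 *	btrfs_reada_remove_dev(dev);
 *	ret = do_removal(dev);
 *	if (ret)
 *		btrfs_reada_undo_remove_dev(dev);
 */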