linux/fs/btrfs/reada.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2011 STRATO.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/pagemap.h>
   8#include <linux/writeback.h>
   9#include <linux/blkdev.h>
  10#include <linux/slab.h>
  11#include <linux/workqueue.h>
  12#include "ctree.h"
  13#include "volumes.h"
  14#include "disk-io.h"
  15#include "transaction.h"
  16#include "dev-replace.h"
  17#include "block-group.h"
  18
  19#undef DEBUG
  20
  21/*
  22 * This is the implementation for the generic read ahead framework.
  23 *
  24 * To trigger a readahead, btrfs_reada_add must be called. It will start
  25 * a read ahead for the given range [start, end) on tree root. The returned
  26 * handle can either be used to wait on the readahead to finish
  27 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
  28 *
  29 * The read ahead works as follows:
  30 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
  31 * reada_start_machine will then search for extents to prefetch and trigger
  32 * some reads. When a read finishes for a node, all contained node/leaf
  33 * pointers that lie in the given range will also be enqueued. The reads will
  34 * be triggered in sequential order, thus giving a big win over a naive
  35 * enumeration. It will also make use of multi-device layouts. Each disk
  36 * will have its on read pointer and all disks will by utilized in parallel.
  37 * Also will no two disks read both sides of a mirror simultaneously, as this
  38 * would waste seeking capacity. Instead both disks will read different parts
  39 * of the filesystem.
  40 * Any number of readaheads can be started in parallel. The read order will be
  41 * determined globally, i.e. 2 parallel readaheads will normally finish faster
  42 * than the 2 started one after another.
  43 */
  44
  45#define MAX_IN_FLIGHT 6
  46
  47struct reada_extctl {
  48        struct list_head        list;
  49        struct reada_control    *rc;
  50        u64                     generation;
  51};
  52
  53struct reada_extent {
  54        u64                     logical;
  55        struct btrfs_key        top;
  56        struct list_head        extctl;
  57        int                     refcnt;
  58        spinlock_t              lock;
  59        struct reada_zone       *zones[BTRFS_MAX_MIRRORS];
  60        int                     nzones;
  61        int                     scheduled;
  62};
  63
  64struct reada_zone {
  65        u64                     start;
  66        u64                     end;
  67        u64                     elems;
  68        struct list_head        list;
  69        spinlock_t              lock;
  70        int                     locked;
  71        struct btrfs_device     *device;
  72        struct btrfs_device     *devs[BTRFS_MAX_MIRRORS]; /* full list, incl
  73                                                           * self */
  74        int                     ndevs;
  75        struct kref             refcnt;
  76};
  77
  78struct reada_machine_work {
  79        struct btrfs_work       work;
  80        struct btrfs_fs_info    *fs_info;
  81};
  82
  83static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
  84static void reada_control_release(struct kref *kref);
  85static void reada_zone_release(struct kref *kref);
  86static void reada_start_machine(struct btrfs_fs_info *fs_info);
  87static void __reada_start_machine(struct btrfs_fs_info *fs_info);
  88
  89static int reada_add_block(struct reada_control *rc, u64 logical,
  90                           struct btrfs_key *top, u64 generation);
  91
  92/* recurses */
  93/* in case of err, eb might be NULL */
  94static void __readahead_hook(struct btrfs_fs_info *fs_info,
  95                             struct reada_extent *re, struct extent_buffer *eb,
  96                             int err)
  97{
  98        int nritems;
  99        int i;
 100        u64 bytenr;
 101        u64 generation;
 102        struct list_head list;
 103
 104        spin_lock(&re->lock);
 105        /*
 106         * just take the full list from the extent. afterwards we
 107         * don't need the lock anymore
 108         */
 109        list_replace_init(&re->extctl, &list);
 110        re->scheduled = 0;
 111        spin_unlock(&re->lock);
 112
 113        /*
 114         * this is the error case, the extent buffer has not been
 115         * read correctly. We won't access anything from it and
 116         * just cleanup our data structures. Effectively this will
 117         * cut the branch below this node from read ahead.
 118         */
 119        if (err)
 120                goto cleanup;
 121
 122        /*
 123         * FIXME: currently we just set nritems to 0 if this is a leaf,
 124         * effectively ignoring the content. In a next step we could
 125         * trigger more readahead depending from the content, e.g.
 126         * fetch the checksums for the extents in the leaf.
 127         */
 128        if (!btrfs_header_level(eb))
 129                goto cleanup;
 130
 131        nritems = btrfs_header_nritems(eb);
 132        generation = btrfs_header_generation(eb);
 133        for (i = 0; i < nritems; i++) {
 134                struct reada_extctl *rec;
 135                u64 n_gen;
 136                struct btrfs_key key;
 137                struct btrfs_key next_key;
 138
 139                btrfs_node_key_to_cpu(eb, &key, i);
 140                if (i + 1 < nritems)
 141                        btrfs_node_key_to_cpu(eb, &next_key, i + 1);
 142                else
 143                        next_key = re->top;
 144                bytenr = btrfs_node_blockptr(eb, i);
 145                n_gen = btrfs_node_ptr_generation(eb, i);
 146
 147                list_for_each_entry(rec, &list, list) {
 148                        struct reada_control *rc = rec->rc;
 149
 150                        /*
 151                         * if the generation doesn't match, just ignore this
 152                         * extctl. This will probably cut off a branch from
 153                         * prefetch. Alternatively one could start a new (sub-)
 154                         * prefetch for this branch, starting again from root.
 155                         * FIXME: move the generation check out of this loop
 156                         */
 157#ifdef DEBUG
 158                        if (rec->generation != generation) {
 159                                btrfs_debug(fs_info,
 160                                            "generation mismatch for (%llu,%d,%llu) %llu != %llu",
 161                                            key.objectid, key.type, key.offset,
 162                                            rec->generation, generation);
 163                        }
 164#endif
 165                        if (rec->generation == generation &&
 166                            btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
 167                            btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
 168                                reada_add_block(rc, bytenr, &next_key, n_gen);
 169                }
 170        }
 171
 172cleanup:
 173        /*
 174         * free extctl records
 175         */
 176        while (!list_empty(&list)) {
 177                struct reada_control *rc;
 178                struct reada_extctl *rec;
 179
 180                rec = list_first_entry(&list, struct reada_extctl, list);
 181                list_del(&rec->list);
 182                rc = rec->rc;
 183                kfree(rec);
 184
 185                kref_get(&rc->refcnt);
 186                if (atomic_dec_and_test(&rc->elems)) {
 187                        kref_put(&rc->refcnt, reada_control_release);
 188                        wake_up(&rc->wait);
 189                }
 190                kref_put(&rc->refcnt, reada_control_release);
 191
 192                reada_extent_put(fs_info, re);  /* one ref for each entry */
 193        }
 194
 195        return;
 196}
 197
 198int btree_readahead_hook(struct extent_buffer *eb, int err)
 199{
 200        struct btrfs_fs_info *fs_info = eb->fs_info;
 201        int ret = 0;
 202        struct reada_extent *re;
 203
 204        /* find extent */
 205        spin_lock(&fs_info->reada_lock);
 206        re = radix_tree_lookup(&fs_info->reada_tree,
 207                               eb->start >> PAGE_SHIFT);
 208        if (re)
 209                re->refcnt++;
 210        spin_unlock(&fs_info->reada_lock);
 211        if (!re) {
 212                ret = -1;
 213                goto start_machine;
 214        }
 215
 216        __readahead_hook(fs_info, re, eb, err);
 217        reada_extent_put(fs_info, re);  /* our ref */
 218
 219start_machine:
 220        reada_start_machine(fs_info);
 221        return ret;
 222}
 223
 224static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
 225                                          struct btrfs_bio *bbio)
 226{
 227        struct btrfs_fs_info *fs_info = dev->fs_info;
 228        int ret;
 229        struct reada_zone *zone;
 230        struct btrfs_block_group *cache = NULL;
 231        u64 start;
 232        u64 end;
 233        int i;
 234
 235        zone = NULL;
 236        spin_lock(&fs_info->reada_lock);
 237        ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
 238                                     logical >> PAGE_SHIFT, 1);
 239        if (ret == 1 && logical >= zone->start && logical <= zone->end) {
 240                kref_get(&zone->refcnt);
 241                spin_unlock(&fs_info->reada_lock);
 242                return zone;
 243        }
 244
 245        spin_unlock(&fs_info->reada_lock);
 246
 247        cache = btrfs_lookup_block_group(fs_info, logical);
 248        if (!cache)
 249                return NULL;
 250
 251        start = cache->start;
 252        end = start + cache->length - 1;
 253        btrfs_put_block_group(cache);
 254
 255        zone = kzalloc(sizeof(*zone), GFP_KERNEL);
 256        if (!zone)
 257                return NULL;
 258
 259        ret = radix_tree_preload(GFP_KERNEL);
 260        if (ret) {
 261                kfree(zone);
 262                return NULL;
 263        }
 264
 265        zone->start = start;
 266        zone->end = end;
 267        INIT_LIST_HEAD(&zone->list);
 268        spin_lock_init(&zone->lock);
 269        zone->locked = 0;
 270        kref_init(&zone->refcnt);
 271        zone->elems = 0;
 272        zone->device = dev; /* our device always sits at index 0 */
 273        for (i = 0; i < bbio->num_stripes; ++i) {
 274                /* bounds have already been checked */
 275                zone->devs[i] = bbio->stripes[i].dev;
 276        }
 277        zone->ndevs = bbio->num_stripes;
 278
 279        spin_lock(&fs_info->reada_lock);
 280        ret = radix_tree_insert(&dev->reada_zones,
 281                                (unsigned long)(zone->end >> PAGE_SHIFT),
 282                                zone);
 283
 284        if (ret == -EEXIST) {
 285                kfree(zone);
 286                ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
 287                                             logical >> PAGE_SHIFT, 1);
 288                if (ret == 1 && logical >= zone->start && logical <= zone->end)
 289                        kref_get(&zone->refcnt);
 290                else
 291                        zone = NULL;
 292        }
 293        spin_unlock(&fs_info->reada_lock);
 294        radix_tree_preload_end();
 295
 296        return zone;
 297}
 298
 299static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
 300                                              u64 logical,
 301                                              struct btrfs_key *top)
 302{
 303        int ret;
 304        struct reada_extent *re = NULL;
 305        struct reada_extent *re_exist = NULL;
 306        struct btrfs_bio *bbio = NULL;
 307        struct btrfs_device *dev;
 308        struct btrfs_device *prev_dev;
 309        u64 length;
 310        int real_stripes;
 311        int nzones = 0;
 312        unsigned long index = logical >> PAGE_SHIFT;
 313        int dev_replace_is_ongoing;
 314        int have_zone = 0;
 315
 316        spin_lock(&fs_info->reada_lock);
 317        re = radix_tree_lookup(&fs_info->reada_tree, index);
 318        if (re)
 319                re->refcnt++;
 320        spin_unlock(&fs_info->reada_lock);
 321
 322        if (re)
 323                return re;
 324
 325        re = kzalloc(sizeof(*re), GFP_KERNEL);
 326        if (!re)
 327                return NULL;
 328
 329        re->logical = logical;
 330        re->top = *top;
 331        INIT_LIST_HEAD(&re->extctl);
 332        spin_lock_init(&re->lock);
 333        re->refcnt = 1;
 334
 335        /*
 336         * map block
 337         */
 338        length = fs_info->nodesize;
 339        ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
 340                        &length, &bbio, 0);
 341        if (ret || !bbio || length < fs_info->nodesize)
 342                goto error;
 343
 344        if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
 345                btrfs_err(fs_info,
 346                           "readahead: more than %d copies not supported",
 347                           BTRFS_MAX_MIRRORS);
 348                goto error;
 349        }
 350
 351        real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
 352        for (nzones = 0; nzones < real_stripes; ++nzones) {
 353                struct reada_zone *zone;
 354
 355                dev = bbio->stripes[nzones].dev;
 356
 357                /* cannot read ahead on missing device. */
 358                if (!dev->bdev)
 359                        continue;
 360
 361                zone = reada_find_zone(dev, logical, bbio);
 362                if (!zone)
 363                        continue;
 364
 365                re->zones[re->nzones++] = zone;
 366                spin_lock(&zone->lock);
 367                if (!zone->elems)
 368                        kref_get(&zone->refcnt);
 369                ++zone->elems;
 370                spin_unlock(&zone->lock);
 371                spin_lock(&fs_info->reada_lock);
 372                kref_put(&zone->refcnt, reada_zone_release);
 373                spin_unlock(&fs_info->reada_lock);
 374        }
 375        if (re->nzones == 0) {
 376                /* not a single zone found, error and out */
 377                goto error;
 378        }
 379
 380        /* Insert extent in reada tree + all per-device trees, all or nothing */
 381        down_read(&fs_info->dev_replace.rwsem);
 382        ret = radix_tree_preload(GFP_KERNEL);
 383        if (ret) {
 384                up_read(&fs_info->dev_replace.rwsem);
 385                goto error;
 386        }
 387
 388        spin_lock(&fs_info->reada_lock);
 389        ret = radix_tree_insert(&fs_info->reada_tree, index, re);
 390        if (ret == -EEXIST) {
 391                re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
 392                re_exist->refcnt++;
 393                spin_unlock(&fs_info->reada_lock);
 394                radix_tree_preload_end();
 395                up_read(&fs_info->dev_replace.rwsem);
 396                goto error;
 397        }
 398        if (ret) {
 399                spin_unlock(&fs_info->reada_lock);
 400                radix_tree_preload_end();
 401                up_read(&fs_info->dev_replace.rwsem);
 402                goto error;
 403        }
 404        radix_tree_preload_end();
 405        prev_dev = NULL;
 406        dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
 407                        &fs_info->dev_replace);
 408        for (nzones = 0; nzones < re->nzones; ++nzones) {
 409                dev = re->zones[nzones]->device;
 410
 411                if (dev == prev_dev) {
 412                        /*
 413                         * in case of DUP, just add the first zone. As both
 414                         * are on the same device, there's nothing to gain
 415                         * from adding both.
 416                         * Also, it wouldn't work, as the tree is per device
 417                         * and adding would fail with EEXIST
 418                         */
 419                        continue;
 420                }
 421                if (!dev->bdev)
 422                        continue;
 423
 424                if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state))
 425                        continue;
 426
 427                if (dev_replace_is_ongoing &&
 428                    dev == fs_info->dev_replace.tgtdev) {
 429                        /*
 430                         * as this device is selected for reading only as
 431                         * a last resort, skip it for read ahead.
 432                         */
 433                        continue;
 434                }
 435                prev_dev = dev;
 436                ret = radix_tree_insert(&dev->reada_extents, index, re);
 437                if (ret) {
 438                        while (--nzones >= 0) {
 439                                dev = re->zones[nzones]->device;
 440                                BUG_ON(dev == NULL);
 441                                /* ignore whether the entry was inserted */
 442                                radix_tree_delete(&dev->reada_extents, index);
 443                        }
 444                        radix_tree_delete(&fs_info->reada_tree, index);
 445                        spin_unlock(&fs_info->reada_lock);
 446                        up_read(&fs_info->dev_replace.rwsem);
 447                        goto error;
 448                }
 449                have_zone = 1;
 450        }
 451        if (!have_zone)
 452                radix_tree_delete(&fs_info->reada_tree, index);
 453        spin_unlock(&fs_info->reada_lock);
 454        up_read(&fs_info->dev_replace.rwsem);
 455
 456        if (!have_zone)
 457                goto error;
 458
 459        btrfs_put_bbio(bbio);
 460        return re;
 461
 462error:
 463        for (nzones = 0; nzones < re->nzones; ++nzones) {
 464                struct reada_zone *zone;
 465
 466                zone = re->zones[nzones];
 467                kref_get(&zone->refcnt);
 468                spin_lock(&zone->lock);
 469                --zone->elems;
 470                if (zone->elems == 0) {
 471                        /*
 472                         * no fs_info->reada_lock needed, as this can't be
 473                         * the last ref
 474                         */
 475                        kref_put(&zone->refcnt, reada_zone_release);
 476                }
 477                spin_unlock(&zone->lock);
 478
 479                spin_lock(&fs_info->reada_lock);
 480                kref_put(&zone->refcnt, reada_zone_release);
 481                spin_unlock(&fs_info->reada_lock);
 482        }
 483        btrfs_put_bbio(bbio);
 484        kfree(re);
 485        return re_exist;
 486}
 487
 488static void reada_extent_put(struct btrfs_fs_info *fs_info,
 489                             struct reada_extent *re)
 490{
 491        int i;
 492        unsigned long index = re->logical >> PAGE_SHIFT;
 493
 494        spin_lock(&fs_info->reada_lock);
 495        if (--re->refcnt) {
 496                spin_unlock(&fs_info->reada_lock);
 497                return;
 498        }
 499
 500        radix_tree_delete(&fs_info->reada_tree, index);
 501        for (i = 0; i < re->nzones; ++i) {
 502                struct reada_zone *zone = re->zones[i];
 503
 504                radix_tree_delete(&zone->device->reada_extents, index);
 505        }
 506
 507        spin_unlock(&fs_info->reada_lock);
 508
 509        for (i = 0; i < re->nzones; ++i) {
 510                struct reada_zone *zone = re->zones[i];
 511
 512                kref_get(&zone->refcnt);
 513                spin_lock(&zone->lock);
 514                --zone->elems;
 515                if (zone->elems == 0) {
 516                        /* no fs_info->reada_lock needed, as this can't be
 517                         * the last ref */
 518                        kref_put(&zone->refcnt, reada_zone_release);
 519                }
 520                spin_unlock(&zone->lock);
 521
 522                spin_lock(&fs_info->reada_lock);
 523                kref_put(&zone->refcnt, reada_zone_release);
 524                spin_unlock(&fs_info->reada_lock);
 525        }
 526
 527        kfree(re);
 528}
 529
 530static void reada_zone_release(struct kref *kref)
 531{
 532        struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
 533
 534        radix_tree_delete(&zone->device->reada_zones,
 535                          zone->end >> PAGE_SHIFT);
 536
 537        kfree(zone);
 538}
 539
 540static void reada_control_release(struct kref *kref)
 541{
 542        struct reada_control *rc = container_of(kref, struct reada_control,
 543                                                refcnt);
 544
 545        kfree(rc);
 546}
 547
 548static int reada_add_block(struct reada_control *rc, u64 logical,
 549                           struct btrfs_key *top, u64 generation)
 550{
 551        struct btrfs_fs_info *fs_info = rc->fs_info;
 552        struct reada_extent *re;
 553        struct reada_extctl *rec;
 554
 555        /* takes one ref */
 556        re = reada_find_extent(fs_info, logical, top);
 557        if (!re)
 558                return -1;
 559
 560        rec = kzalloc(sizeof(*rec), GFP_KERNEL);
 561        if (!rec) {
 562                reada_extent_put(fs_info, re);
 563                return -ENOMEM;
 564        }
 565
 566        rec->rc = rc;
 567        rec->generation = generation;
 568        atomic_inc(&rc->elems);
 569
 570        spin_lock(&re->lock);
 571        list_add_tail(&rec->list, &re->extctl);
 572        spin_unlock(&re->lock);
 573
 574        /* leave the ref on the extent */
 575
 576        return 0;
 577}
 578
 579/*
 580 * called with fs_info->reada_lock held
 581 */
 582static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
 583{
 584        int i;
 585        unsigned long index = zone->end >> PAGE_SHIFT;
 586
 587        for (i = 0; i < zone->ndevs; ++i) {
 588                struct reada_zone *peer;
 589                peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
 590                if (peer && peer->device != zone->device)
 591                        peer->locked = lock;
 592        }
 593}
 594
 595/*
 596 * called with fs_info->reada_lock held
 597 */
 598static int reada_pick_zone(struct btrfs_device *dev)
 599{
 600        struct reada_zone *top_zone = NULL;
 601        struct reada_zone *top_locked_zone = NULL;
 602        u64 top_elems = 0;
 603        u64 top_locked_elems = 0;
 604        unsigned long index = 0;
 605        int ret;
 606
 607        if (dev->reada_curr_zone) {
 608                reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
 609                kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
 610                dev->reada_curr_zone = NULL;
 611        }
 612        /* pick the zone with the most elements */
 613        while (1) {
 614                struct reada_zone *zone;
 615
 616                ret = radix_tree_gang_lookup(&dev->reada_zones,
 617                                             (void **)&zone, index, 1);
 618                if (ret == 0)
 619                        break;
 620                index = (zone->end >> PAGE_SHIFT) + 1;
 621                if (zone->locked) {
 622                        if (zone->elems > top_locked_elems) {
 623                                top_locked_elems = zone->elems;
 624                                top_locked_zone = zone;
 625                        }
 626                } else {
 627                        if (zone->elems > top_elems) {
 628                                top_elems = zone->elems;
 629                                top_zone = zone;
 630                        }
 631                }
 632        }
 633        if (top_zone)
 634                dev->reada_curr_zone = top_zone;
 635        else if (top_locked_zone)
 636                dev->reada_curr_zone = top_locked_zone;
 637        else
 638                return 0;
 639
 640        dev->reada_next = dev->reada_curr_zone->start;
 641        kref_get(&dev->reada_curr_zone->refcnt);
 642        reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
 643
 644        return 1;
 645}
 646
 647static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
 648                                    int mirror_num, struct extent_buffer **eb)
 649{
 650        struct extent_buffer *buf = NULL;
 651        int ret;
 652
 653        buf = btrfs_find_create_tree_block(fs_info, bytenr);
 654        if (IS_ERR(buf))
 655                return 0;
 656
 657        set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
 658
 659        ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
 660        if (ret) {
 661                free_extent_buffer_stale(buf);
 662                return ret;
 663        }
 664
 665        if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
 666                free_extent_buffer_stale(buf);
 667                return -EIO;
 668        } else if (extent_buffer_uptodate(buf)) {
 669                *eb = buf;
 670        } else {
 671                free_extent_buffer(buf);
 672        }
 673        return 0;
 674}
 675
 676static int reada_start_machine_dev(struct btrfs_device *dev)
 677{
 678        struct btrfs_fs_info *fs_info = dev->fs_info;
 679        struct reada_extent *re = NULL;
 680        int mirror_num = 0;
 681        struct extent_buffer *eb = NULL;
 682        u64 logical;
 683        int ret;
 684        int i;
 685
 686        spin_lock(&fs_info->reada_lock);
 687        if (dev->reada_curr_zone == NULL) {
 688                ret = reada_pick_zone(dev);
 689                if (!ret) {
 690                        spin_unlock(&fs_info->reada_lock);
 691                        return 0;
 692                }
 693        }
 694        /*
 695         * FIXME currently we issue the reads one extent at a time. If we have
 696         * a contiguous block of extents, we could also coagulate them or use
 697         * plugging to speed things up
 698         */
 699        ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
 700                                     dev->reada_next >> PAGE_SHIFT, 1);
 701        if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
 702                ret = reada_pick_zone(dev);
 703                if (!ret) {
 704                        spin_unlock(&fs_info->reada_lock);
 705                        return 0;
 706                }
 707                re = NULL;
 708                ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
 709                                        dev->reada_next >> PAGE_SHIFT, 1);
 710        }
 711        if (ret == 0) {
 712                spin_unlock(&fs_info->reada_lock);
 713                return 0;
 714        }
 715        dev->reada_next = re->logical + fs_info->nodesize;
 716        re->refcnt++;
 717
 718        spin_unlock(&fs_info->reada_lock);
 719
 720        spin_lock(&re->lock);
 721        if (re->scheduled || list_empty(&re->extctl)) {
 722                spin_unlock(&re->lock);
 723                reada_extent_put(fs_info, re);
 724                return 0;
 725        }
 726        re->scheduled = 1;
 727        spin_unlock(&re->lock);
 728
 729        /*
 730         * find mirror num
 731         */
 732        for (i = 0; i < re->nzones; ++i) {
 733                if (re->zones[i]->device == dev) {
 734                        mirror_num = i + 1;
 735                        break;
 736                }
 737        }
 738        logical = re->logical;
 739
 740        atomic_inc(&dev->reada_in_flight);
 741        ret = reada_tree_block_flagged(fs_info, logical, mirror_num, &eb);
 742        if (ret)
 743                __readahead_hook(fs_info, re, NULL, ret);
 744        else if (eb)
 745                __readahead_hook(fs_info, re, eb, ret);
 746
 747        if (eb)
 748                free_extent_buffer(eb);
 749
 750        atomic_dec(&dev->reada_in_flight);
 751        reada_extent_put(fs_info, re);
 752
 753        return 1;
 754
 755}
 756
 757static void reada_start_machine_worker(struct btrfs_work *work)
 758{
 759        struct reada_machine_work *rmw;
 760        int old_ioprio;
 761
 762        rmw = container_of(work, struct reada_machine_work, work);
 763
 764        old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
 765                                       task_nice_ioprio(current));
 766        set_task_ioprio(current, BTRFS_IOPRIO_READA);
 767        __reada_start_machine(rmw->fs_info);
 768        set_task_ioprio(current, old_ioprio);
 769
 770        atomic_dec(&rmw->fs_info->reada_works_cnt);
 771
 772        kfree(rmw);
 773}
 774
 775/* Try to start up to 10k READA requests for a group of devices */
 776static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
 777{
 778        u64 enqueued;
 779        u64 total = 0;
 780        struct btrfs_device *device;
 781
 782        do {
 783                enqueued = 0;
 784                list_for_each_entry(device, &fs_devices->devices, dev_list) {
 785                        if (atomic_read(&device->reada_in_flight) <
 786                            MAX_IN_FLIGHT)
 787                                enqueued += reada_start_machine_dev(device);
 788                }
 789                total += enqueued;
 790        } while (enqueued && total < 10000);
 791
 792        return total;
 793}
 794
 795static void __reada_start_machine(struct btrfs_fs_info *fs_info)
 796{
 797        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
 798        int i;
 799        u64 enqueued = 0;
 800
 801        mutex_lock(&fs_devices->device_list_mutex);
 802
 803        enqueued += reada_start_for_fsdevs(fs_devices);
 804        list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
 805                enqueued += reada_start_for_fsdevs(seed_devs);
 806
 807        mutex_unlock(&fs_devices->device_list_mutex);
 808        if (enqueued == 0)
 809                return;
 810
 811        /*
 812         * If everything is already in the cache, this is effectively single
 813         * threaded. To a) not hold the caller for too long and b) to utilize
 814         * more cores, we broke the loop above after 10000 iterations and now
 815         * enqueue to workers to finish it. This will distribute the load to
 816         * the cores.
 817         */
 818        for (i = 0; i < 2; ++i) {
 819                reada_start_machine(fs_info);
 820                if (atomic_read(&fs_info->reada_works_cnt) >
 821                    BTRFS_MAX_MIRRORS * 2)
 822                        break;
 823        }
 824}
 825
 826static void reada_start_machine(struct btrfs_fs_info *fs_info)
 827{
 828        struct reada_machine_work *rmw;
 829
 830        rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
 831        if (!rmw) {
 832                /* FIXME we cannot handle this properly right now */
 833                BUG();
 834        }
 835        btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
 836        rmw->fs_info = fs_info;
 837
 838        btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
 839        atomic_inc(&fs_info->reada_works_cnt);
 840}
 841
 842#ifdef DEBUG
 843static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 844{
 845        struct btrfs_device *device;
 846        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 847        unsigned long index;
 848        int ret;
 849        int i;
 850        int j;
 851        int cnt;
 852
 853        spin_lock(&fs_info->reada_lock);
 854        list_for_each_entry(device, &fs_devices->devices, dev_list) {
 855                btrfs_debug(fs_info, "dev %lld has %d in flight", device->devid,
 856                        atomic_read(&device->reada_in_flight));
 857                index = 0;
 858                while (1) {
 859                        struct reada_zone *zone;
 860                        ret = radix_tree_gang_lookup(&device->reada_zones,
 861                                                     (void **)&zone, index, 1);
 862                        if (ret == 0)
 863                                break;
 864                        pr_debug("  zone %llu-%llu elems %llu locked %d devs",
 865                                    zone->start, zone->end, zone->elems,
 866                                    zone->locked);
 867                        for (j = 0; j < zone->ndevs; ++j) {
 868                                pr_cont(" %lld",
 869                                        zone->devs[j]->devid);
 870                        }
 871                        if (device->reada_curr_zone == zone)
 872                                pr_cont(" curr off %llu",
 873                                        device->reada_next - zone->start);
 874                        pr_cont("\n");
 875                        index = (zone->end >> PAGE_SHIFT) + 1;
 876                }
 877                cnt = 0;
 878                index = 0;
 879                while (all) {
 880                        struct reada_extent *re = NULL;
 881
 882                        ret = radix_tree_gang_lookup(&device->reada_extents,
 883                                                     (void **)&re, index, 1);
 884                        if (ret == 0)
 885                                break;
 886                        pr_debug("  re: logical %llu size %u empty %d scheduled %d",
 887                                re->logical, fs_info->nodesize,
 888                                list_empty(&re->extctl), re->scheduled);
 889
 890                        for (i = 0; i < re->nzones; ++i) {
 891                                pr_cont(" zone %llu-%llu devs",
 892                                        re->zones[i]->start,
 893                                        re->zones[i]->end);
 894                                for (j = 0; j < re->zones[i]->ndevs; ++j) {
 895                                        pr_cont(" %lld",
 896                                                re->zones[i]->devs[j]->devid);
 897                                }
 898                        }
 899                        pr_cont("\n");
 900                        index = (re->logical >> PAGE_SHIFT) + 1;
 901                        if (++cnt > 15)
 902                                break;
 903                }
 904        }
 905
 906        index = 0;
 907        cnt = 0;
 908        while (all) {
 909                struct reada_extent *re = NULL;
 910
 911                ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
 912                                             index, 1);
 913                if (ret == 0)
 914                        break;
 915                if (!re->scheduled) {
 916                        index = (re->logical >> PAGE_SHIFT) + 1;
 917                        continue;
 918                }
 919                pr_debug("re: logical %llu size %u list empty %d scheduled %d",
 920                        re->logical, fs_info->nodesize,
 921                        list_empty(&re->extctl), re->scheduled);
 922                for (i = 0; i < re->nzones; ++i) {
 923                        pr_cont(" zone %llu-%llu devs",
 924                                re->zones[i]->start,
 925                                re->zones[i]->end);
 926                        for (j = 0; j < re->zones[i]->ndevs; ++j) {
 927                                pr_cont(" %lld",
 928                                       re->zones[i]->devs[j]->devid);
 929                        }
 930                }
 931                pr_cont("\n");
 932                index = (re->logical >> PAGE_SHIFT) + 1;
 933        }
 934        spin_unlock(&fs_info->reada_lock);
 935}
 936#endif
 937
 938/*
 939 * interface
 940 */
 941struct reada_control *btrfs_reada_add(struct btrfs_root *root,
 942                        struct btrfs_key *key_start, struct btrfs_key *key_end)
 943{
 944        struct reada_control *rc;
 945        u64 start;
 946        u64 generation;
 947        int ret;
 948        struct extent_buffer *node;
 949        static struct btrfs_key max_key = {
 950                .objectid = (u64)-1,
 951                .type = (u8)-1,
 952                .offset = (u64)-1
 953        };
 954
 955        rc = kzalloc(sizeof(*rc), GFP_KERNEL);
 956        if (!rc)
 957                return ERR_PTR(-ENOMEM);
 958
 959        rc->fs_info = root->fs_info;
 960        rc->key_start = *key_start;
 961        rc->key_end = *key_end;
 962        atomic_set(&rc->elems, 0);
 963        init_waitqueue_head(&rc->wait);
 964        kref_init(&rc->refcnt);
 965        kref_get(&rc->refcnt); /* one ref for having elements */
 966
 967        node = btrfs_root_node(root);
 968        start = node->start;
 969        generation = btrfs_header_generation(node);
 970        free_extent_buffer(node);
 971
 972        ret = reada_add_block(rc, start, &max_key, generation);
 973        if (ret) {
 974                kfree(rc);
 975                return ERR_PTR(ret);
 976        }
 977
 978        reada_start_machine(root->fs_info);
 979
 980        return rc;
 981}
 982
 983#ifdef DEBUG
 984int btrfs_reada_wait(void *handle)
 985{
 986        struct reada_control *rc = handle;
 987        struct btrfs_fs_info *fs_info = rc->fs_info;
 988
 989        while (atomic_read(&rc->elems)) {
 990                if (!atomic_read(&fs_info->reada_works_cnt))
 991                        reada_start_machine(fs_info);
 992                wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
 993                                   5 * HZ);
 994                dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
 995        }
 996
 997        dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
 998
 999        kref_put(&rc->refcnt, reada_control_release);
1000
1001        return 0;
1002}
1003#else
1004int btrfs_reada_wait(void *handle)
1005{
1006        struct reada_control *rc = handle;
1007        struct btrfs_fs_info *fs_info = rc->fs_info;
1008
1009        while (atomic_read(&rc->elems)) {
1010                if (!atomic_read(&fs_info->reada_works_cnt))
1011                        reada_start_machine(fs_info);
1012                wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
1013                                   (HZ + 9) / 10);
1014        }
1015
1016        kref_put(&rc->refcnt, reada_control_release);
1017
1018        return 0;
1019}
1020#endif
1021
1022void btrfs_reada_detach(void *handle)
1023{
1024        struct reada_control *rc = handle;
1025
1026        kref_put(&rc->refcnt, reada_control_release);
1027}
1028
1029/*
1030 * Before removing a device (device replace or device remove ioctls), call this
1031 * function to wait for all existing readahead requests on the device and to
1032 * make sure no one queues more readahead requests for the device.
1033 *
1034 * Must be called without holding neither the device list mutex nor the device
1035 * replace semaphore, otherwise it will deadlock.
1036 */
1037void btrfs_reada_remove_dev(struct btrfs_device *dev)
1038{
1039        struct btrfs_fs_info *fs_info = dev->fs_info;
1040
1041        /* Serialize with readahead extent creation at reada_find_extent(). */
1042        spin_lock(&fs_info->reada_lock);
1043        set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
1044        spin_unlock(&fs_info->reada_lock);
1045
1046        /*
1047         * There might be readahead requests added to the radix trees which
1048         * were not yet added to the readahead work queue. We need to start
1049         * them and wait for their completion, otherwise we can end up with
1050         * use-after-free problems when dropping the last reference on the
1051         * readahead extents and their zones, as they need to access the
1052         * device structure.
1053         */
1054        reada_start_machine(fs_info);
1055        btrfs_flush_workqueue(fs_info->readahead_workers);
1056}
1057
1058/*
1059 * If when removing a device (device replace or device remove ioctls) an error
1060 * happens after calling btrfs_reada_remove_dev(), call this to undo what that
1061 * function did. This is safe to call even if btrfs_reada_remove_dev() was not
1062 * called before.
1063 */
1064void btrfs_reada_undo_remove_dev(struct btrfs_device *dev)
1065{
1066        spin_lock(&dev->fs_info->reada_lock);
1067        clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
1068        spin_unlock(&dev->fs_info->reada_lock);
1069}
1070