linux/fs/btrfs/reada.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2011 STRATO.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/pagemap.h>
   8#include <linux/writeback.h>
   9#include <linux/blkdev.h>
  10#include <linux/slab.h>
  11#include <linux/workqueue.h>
  12#include "ctree.h"
  13#include "volumes.h"
  14#include "disk-io.h"
  15#include "transaction.h"
  16#include "dev-replace.h"
  17#include "block-group.h"
  18
  19#undef DEBUG
  20
  21/*
  22 * This is the implementation for the generic read ahead framework.
  23 *
  24 * To trigger a readahead, btrfs_reada_add must be called. It will start
  25 * a read ahead for the given range [start, end) on tree root. The returned
  26 * handle can either be used to wait on the readahead to finish
  27 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
  28 *
  29 * The read ahead works as follows:
  30 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
  31 * reada_start_machine will then search for extents to prefetch and trigger
  32 * some reads. When a read finishes for a node, all contained node/leaf
  33 * pointers that lie in the given range will also be enqueued. The reads will
  34 * be triggered in sequential order, thus giving a big win over a naive
  35 * enumeration. It will also make use of multi-device layouts. Each disk
  36 * will have its on read pointer and all disks will by utilized in parallel.
  37 * Also will no two disks read both sides of a mirror simultaneously, as this
  38 * would waste seeking capacity. Instead both disks will read different parts
  39 * of the filesystem.
  40 * Any number of readaheads can be started in parallel. The read order will be
  41 * determined globally, i.e. 2 parallel readaheads will normally finish faster
  42 * than the 2 started one after another.
  43 */
  44
  45#define MAX_IN_FLIGHT 6
  46
  47struct reada_extctl {
  48        struct list_head        list;
  49        struct reada_control    *rc;
  50        u64                     generation;
  51};
  52
  53struct reada_extent {
  54        u64                     logical;
  55        struct btrfs_key        top;
  56        struct list_head        extctl;
  57        int                     refcnt;
  58        spinlock_t              lock;
  59        struct reada_zone       *zones[BTRFS_MAX_MIRRORS];
  60        int                     nzones;
  61        int                     scheduled;
  62};
  63
  64struct reada_zone {
  65        u64                     start;
  66        u64                     end;
  67        u64                     elems;
  68        struct list_head        list;
  69        spinlock_t              lock;
  70        int                     locked;
  71        struct btrfs_device     *device;
  72        struct btrfs_device     *devs[BTRFS_MAX_MIRRORS]; /* full list, incl
  73                                                           * self */
  74        int                     ndevs;
  75        struct kref             refcnt;
  76};
  77
  78struct reada_machine_work {
  79        struct btrfs_work       work;
  80        struct btrfs_fs_info    *fs_info;
  81};
  82
  83static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
  84static void reada_control_release(struct kref *kref);
  85static void reada_zone_release(struct kref *kref);
  86static void reada_start_machine(struct btrfs_fs_info *fs_info);
  87static void __reada_start_machine(struct btrfs_fs_info *fs_info);
  88
  89static int reada_add_block(struct reada_control *rc, u64 logical,
  90                           struct btrfs_key *top, u64 generation);
  91
  92/* recurses */
  93/* in case of err, eb might be NULL */
  94static void __readahead_hook(struct btrfs_fs_info *fs_info,
  95                             struct reada_extent *re, struct extent_buffer *eb,
  96                             int err)
  97{
  98        int nritems;
  99        int i;
 100        u64 bytenr;
 101        u64 generation;
 102        struct list_head list;
 103
 104        spin_lock(&re->lock);
 105        /*
 106         * just take the full list from the extent. afterwards we
 107         * don't need the lock anymore
 108         */
 109        list_replace_init(&re->extctl, &list);
 110        re->scheduled = 0;
 111        spin_unlock(&re->lock);
 112
 113        /*
 114         * this is the error case, the extent buffer has not been
 115         * read correctly. We won't access anything from it and
 116         * just cleanup our data structures. Effectively this will
 117         * cut the branch below this node from read ahead.
 118         */
 119        if (err)
 120                goto cleanup;
 121
 122        /*
 123         * FIXME: currently we just set nritems to 0 if this is a leaf,
 124         * effectively ignoring the content. In a next step we could
 125         * trigger more readahead depending from the content, e.g.
 126         * fetch the checksums for the extents in the leaf.
 127         */
 128        if (!btrfs_header_level(eb))
 129                goto cleanup;
 130
 131        nritems = btrfs_header_nritems(eb);
 132        generation = btrfs_header_generation(eb);
 133        for (i = 0; i < nritems; i++) {
 134                struct reada_extctl *rec;
 135                u64 n_gen;
 136                struct btrfs_key key;
 137                struct btrfs_key next_key;
 138
 139                btrfs_node_key_to_cpu(eb, &key, i);
 140                if (i + 1 < nritems)
 141                        btrfs_node_key_to_cpu(eb, &next_key, i + 1);
 142                else
 143                        next_key = re->top;
 144                bytenr = btrfs_node_blockptr(eb, i);
 145                n_gen = btrfs_node_ptr_generation(eb, i);
 146
 147                list_for_each_entry(rec, &list, list) {
 148                        struct reada_control *rc = rec->rc;
 149
 150                        /*
 151                         * if the generation doesn't match, just ignore this
 152                         * extctl. This will probably cut off a branch from
 153                         * prefetch. Alternatively one could start a new (sub-)
 154                         * prefetch for this branch, starting again from root.
 155                         * FIXME: move the generation check out of this loop
 156                         */
 157#ifdef DEBUG
 158                        if (rec->generation != generation) {
 159                                btrfs_debug(fs_info,
 160                                            "generation mismatch for (%llu,%d,%llu) %llu != %llu",
 161                                            key.objectid, key.type, key.offset,
 162                                            rec->generation, generation);
 163                        }
 164#endif
 165                        if (rec->generation == generation &&
 166                            btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
 167                            btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
 168                                reada_add_block(rc, bytenr, &next_key, n_gen);
 169                }
 170        }
 171
 172cleanup:
 173        /*
 174         * free extctl records
 175         */
 176        while (!list_empty(&list)) {
 177                struct reada_control *rc;
 178                struct reada_extctl *rec;
 179
 180                rec = list_first_entry(&list, struct reada_extctl, list);
 181                list_del(&rec->list);
 182                rc = rec->rc;
 183                kfree(rec);
 184
 185                kref_get(&rc->refcnt);
 186                if (atomic_dec_and_test(&rc->elems)) {
 187                        kref_put(&rc->refcnt, reada_control_release);
 188                        wake_up(&rc->wait);
 189                }
 190                kref_put(&rc->refcnt, reada_control_release);
 191
 192                reada_extent_put(fs_info, re);  /* one ref for each entry */
 193        }
 194
 195        return;
 196}
 197
 198int btree_readahead_hook(struct extent_buffer *eb, int err)
 199{
 200        struct btrfs_fs_info *fs_info = eb->fs_info;
 201        int ret = 0;
 202        struct reada_extent *re;
 203
 204        /* find extent */
 205        spin_lock(&fs_info->reada_lock);
 206        re = radix_tree_lookup(&fs_info->reada_tree,
 207                               eb->start >> PAGE_SHIFT);
 208        if (re)
 209                re->refcnt++;
 210        spin_unlock(&fs_info->reada_lock);
 211        if (!re) {
 212                ret = -1;
 213                goto start_machine;
 214        }
 215
 216        __readahead_hook(fs_info, re, eb, err);
 217        reada_extent_put(fs_info, re);  /* our ref */
 218
 219start_machine:
 220        reada_start_machine(fs_info);
 221        return ret;
 222}
 223
 224static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
 225                                          struct btrfs_bio *bbio)
 226{
 227        struct btrfs_fs_info *fs_info = dev->fs_info;
 228        int ret;
 229        struct reada_zone *zone;
 230        struct btrfs_block_group *cache = NULL;
 231        u64 start;
 232        u64 end;
 233        int i;
 234
 235        zone = NULL;
 236        spin_lock(&fs_info->reada_lock);
 237        ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
 238                                     logical >> PAGE_SHIFT, 1);
 239        if (ret == 1 && logical >= zone->start && logical <= zone->end) {
 240                kref_get(&zone->refcnt);
 241                spin_unlock(&fs_info->reada_lock);
 242                return zone;
 243        }
 244
 245        spin_unlock(&fs_info->reada_lock);
 246
 247        cache = btrfs_lookup_block_group(fs_info, logical);
 248        if (!cache)
 249                return NULL;
 250
 251        start = cache->start;
 252        end = start + cache->length - 1;
 253        btrfs_put_block_group(cache);
 254
 255        zone = kzalloc(sizeof(*zone), GFP_KERNEL);
 256        if (!zone)
 257                return NULL;
 258
 259        ret = radix_tree_preload(GFP_KERNEL);
 260        if (ret) {
 261                kfree(zone);
 262                return NULL;
 263        }
 264
 265        zone->start = start;
 266        zone->end = end;
 267        INIT_LIST_HEAD(&zone->list);
 268        spin_lock_init(&zone->lock);
 269        zone->locked = 0;
 270        kref_init(&zone->refcnt);
 271        zone->elems = 0;
 272        zone->device = dev; /* our device always sits at index 0 */
 273        for (i = 0; i < bbio->num_stripes; ++i) {
 274                /* bounds have already been checked */
 275                zone->devs[i] = bbio->stripes[i].dev;
 276        }
 277        zone->ndevs = bbio->num_stripes;
 278
 279        spin_lock(&fs_info->reada_lock);
 280        ret = radix_tree_insert(&dev->reada_zones,
 281                                (unsigned long)(zone->end >> PAGE_SHIFT),
 282                                zone);
 283
 284        if (ret == -EEXIST) {
 285                kfree(zone);
 286                ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
 287                                             logical >> PAGE_SHIFT, 1);
 288                if (ret == 1 && logical >= zone->start && logical <= zone->end)
 289                        kref_get(&zone->refcnt);
 290                else
 291                        zone = NULL;
 292        }
 293        spin_unlock(&fs_info->reada_lock);
 294        radix_tree_preload_end();
 295
 296        return zone;
 297}
 298
 299static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
 300                                              u64 logical,
 301                                              struct btrfs_key *top)
 302{
 303        int ret;
 304        struct reada_extent *re = NULL;
 305        struct reada_extent *re_exist = NULL;
 306        struct btrfs_bio *bbio = NULL;
 307        struct btrfs_device *dev;
 308        struct btrfs_device *prev_dev;
 309        u64 length;
 310        int real_stripes;
 311        int nzones = 0;
 312        unsigned long index = logical >> PAGE_SHIFT;
 313        int dev_replace_is_ongoing;
 314        int have_zone = 0;
 315
 316        spin_lock(&fs_info->reada_lock);
 317        re = radix_tree_lookup(&fs_info->reada_tree, index);
 318        if (re)
 319                re->refcnt++;
 320        spin_unlock(&fs_info->reada_lock);
 321
 322        if (re)
 323                return re;
 324
 325        re = kzalloc(sizeof(*re), GFP_KERNEL);
 326        if (!re)
 327                return NULL;
 328
 329        re->logical = logical;
 330        re->top = *top;
 331        INIT_LIST_HEAD(&re->extctl);
 332        spin_lock_init(&re->lock);
 333        re->refcnt = 1;
 334
 335        /*
 336         * map block
 337         */
 338        length = fs_info->nodesize;
 339        ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
 340                        &length, &bbio, 0);
 341        if (ret || !bbio || length < fs_info->nodesize)
 342                goto error;
 343
 344        if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
 345                btrfs_err(fs_info,
 346                           "readahead: more than %d copies not supported",
 347                           BTRFS_MAX_MIRRORS);
 348                goto error;
 349        }
 350
 351        real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
 352        for (nzones = 0; nzones < real_stripes; ++nzones) {
 353                struct reada_zone *zone;
 354
 355                dev = bbio->stripes[nzones].dev;
 356
 357                /* cannot read ahead on missing device. */
 358                if (!dev->bdev)
 359                        continue;
 360
 361                zone = reada_find_zone(dev, logical, bbio);
 362                if (!zone)
 363                        continue;
 364
 365                re->zones[re->nzones++] = zone;
 366                spin_lock(&zone->lock);
 367                if (!zone->elems)
 368                        kref_get(&zone->refcnt);
 369                ++zone->elems;
 370                spin_unlock(&zone->lock);
 371                spin_lock(&fs_info->reada_lock);
 372                kref_put(&zone->refcnt, reada_zone_release);
 373                spin_unlock(&fs_info->reada_lock);
 374        }
 375        if (re->nzones == 0) {
 376                /* not a single zone found, error and out */
 377                goto error;
 378        }
 379
 380        /* Insert extent in reada tree + all per-device trees, all or nothing */
 381        down_read(&fs_info->dev_replace.rwsem);
 382        ret = radix_tree_preload(GFP_KERNEL);
 383        if (ret) {
 384                up_read(&fs_info->dev_replace.rwsem);
 385                goto error;
 386        }
 387
 388        spin_lock(&fs_info->reada_lock);
 389        ret = radix_tree_insert(&fs_info->reada_tree, index, re);
 390        if (ret == -EEXIST) {
 391                re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
 392                re_exist->refcnt++;
 393                spin_unlock(&fs_info->reada_lock);
 394                radix_tree_preload_end();
 395                up_read(&fs_info->dev_replace.rwsem);
 396                goto error;
 397        }
 398        if (ret) {
 399                spin_unlock(&fs_info->reada_lock);
 400                radix_tree_preload_end();
 401                up_read(&fs_info->dev_replace.rwsem);
 402                goto error;
 403        }
 404        radix_tree_preload_end();
 405        prev_dev = NULL;
 406        dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
 407                        &fs_info->dev_replace);
 408        for (nzones = 0; nzones < re->nzones; ++nzones) {
 409                dev = re->zones[nzones]->device;
 410
 411                if (dev == prev_dev) {
 412                        /*
 413                         * in case of DUP, just add the first zone. As both
 414                         * are on the same device, there's nothing to gain
 415                         * from adding both.
 416                         * Also, it wouldn't work, as the tree is per device
 417                         * and adding would fail with EEXIST
 418                         */
 419                        continue;
 420                }
 421                if (!dev->bdev)
 422                        continue;
 423
 424                if (dev_replace_is_ongoing &&
 425                    dev == fs_info->dev_replace.tgtdev) {
 426                        /*
 427                         * as this device is selected for reading only as
 428                         * a last resort, skip it for read ahead.
 429                         */
 430                        continue;
 431                }
 432                prev_dev = dev;
 433                ret = radix_tree_insert(&dev->reada_extents, index, re);
 434                if (ret) {
 435                        while (--nzones >= 0) {
 436                                dev = re->zones[nzones]->device;
 437                                BUG_ON(dev == NULL);
 438                                /* ignore whether the entry was inserted */
 439                                radix_tree_delete(&dev->reada_extents, index);
 440                        }
 441                        radix_tree_delete(&fs_info->reada_tree, index);
 442                        spin_unlock(&fs_info->reada_lock);
 443                        up_read(&fs_info->dev_replace.rwsem);
 444                        goto error;
 445                }
 446                have_zone = 1;
 447        }
 448        spin_unlock(&fs_info->reada_lock);
 449        up_read(&fs_info->dev_replace.rwsem);
 450
 451        if (!have_zone)
 452                goto error;
 453
 454        btrfs_put_bbio(bbio);
 455        return re;
 456
 457error:
 458        for (nzones = 0; nzones < re->nzones; ++nzones) {
 459                struct reada_zone *zone;
 460
 461                zone = re->zones[nzones];
 462                kref_get(&zone->refcnt);
 463                spin_lock(&zone->lock);
 464                --zone->elems;
 465                if (zone->elems == 0) {
 466                        /*
 467                         * no fs_info->reada_lock needed, as this can't be
 468                         * the last ref
 469                         */
 470                        kref_put(&zone->refcnt, reada_zone_release);
 471                }
 472                spin_unlock(&zone->lock);
 473
 474                spin_lock(&fs_info->reada_lock);
 475                kref_put(&zone->refcnt, reada_zone_release);
 476                spin_unlock(&fs_info->reada_lock);
 477        }
 478        btrfs_put_bbio(bbio);
 479        kfree(re);
 480        return re_exist;
 481}
 482
 483static void reada_extent_put(struct btrfs_fs_info *fs_info,
 484                             struct reada_extent *re)
 485{
 486        int i;
 487        unsigned long index = re->logical >> PAGE_SHIFT;
 488
 489        spin_lock(&fs_info->reada_lock);
 490        if (--re->refcnt) {
 491                spin_unlock(&fs_info->reada_lock);
 492                return;
 493        }
 494
 495        radix_tree_delete(&fs_info->reada_tree, index);
 496        for (i = 0; i < re->nzones; ++i) {
 497                struct reada_zone *zone = re->zones[i];
 498
 499                radix_tree_delete(&zone->device->reada_extents, index);
 500        }
 501
 502        spin_unlock(&fs_info->reada_lock);
 503
 504        for (i = 0; i < re->nzones; ++i) {
 505                struct reada_zone *zone = re->zones[i];
 506
 507                kref_get(&zone->refcnt);
 508                spin_lock(&zone->lock);
 509                --zone->elems;
 510                if (zone->elems == 0) {
 511                        /* no fs_info->reada_lock needed, as this can't be
 512                         * the last ref */
 513                        kref_put(&zone->refcnt, reada_zone_release);
 514                }
 515                spin_unlock(&zone->lock);
 516
 517                spin_lock(&fs_info->reada_lock);
 518                kref_put(&zone->refcnt, reada_zone_release);
 519                spin_unlock(&fs_info->reada_lock);
 520        }
 521
 522        kfree(re);
 523}
 524
 525static void reada_zone_release(struct kref *kref)
 526{
 527        struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
 528
 529        radix_tree_delete(&zone->device->reada_zones,
 530                          zone->end >> PAGE_SHIFT);
 531
 532        kfree(zone);
 533}
 534
 535static void reada_control_release(struct kref *kref)
 536{
 537        struct reada_control *rc = container_of(kref, struct reada_control,
 538                                                refcnt);
 539
 540        kfree(rc);
 541}
 542
 543static int reada_add_block(struct reada_control *rc, u64 logical,
 544                           struct btrfs_key *top, u64 generation)
 545{
 546        struct btrfs_fs_info *fs_info = rc->fs_info;
 547        struct reada_extent *re;
 548        struct reada_extctl *rec;
 549
 550        /* takes one ref */
 551        re = reada_find_extent(fs_info, logical, top);
 552        if (!re)
 553                return -1;
 554
 555        rec = kzalloc(sizeof(*rec), GFP_KERNEL);
 556        if (!rec) {
 557                reada_extent_put(fs_info, re);
 558                return -ENOMEM;
 559        }
 560
 561        rec->rc = rc;
 562        rec->generation = generation;
 563        atomic_inc(&rc->elems);
 564
 565        spin_lock(&re->lock);
 566        list_add_tail(&rec->list, &re->extctl);
 567        spin_unlock(&re->lock);
 568
 569        /* leave the ref on the extent */
 570
 571        return 0;
 572}
 573
 574/*
 575 * called with fs_info->reada_lock held
 576 */
 577static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
 578{
 579        int i;
 580        unsigned long index = zone->end >> PAGE_SHIFT;
 581
 582        for (i = 0; i < zone->ndevs; ++i) {
 583                struct reada_zone *peer;
 584                peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
 585                if (peer && peer->device != zone->device)
 586                        peer->locked = lock;
 587        }
 588}
 589
 590/*
 591 * called with fs_info->reada_lock held
 592 */
 593static int reada_pick_zone(struct btrfs_device *dev)
 594{
 595        struct reada_zone *top_zone = NULL;
 596        struct reada_zone *top_locked_zone = NULL;
 597        u64 top_elems = 0;
 598        u64 top_locked_elems = 0;
 599        unsigned long index = 0;
 600        int ret;
 601
 602        if (dev->reada_curr_zone) {
 603                reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
 604                kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
 605                dev->reada_curr_zone = NULL;
 606        }
 607        /* pick the zone with the most elements */
 608        while (1) {
 609                struct reada_zone *zone;
 610
 611                ret = radix_tree_gang_lookup(&dev->reada_zones,
 612                                             (void **)&zone, index, 1);
 613                if (ret == 0)
 614                        break;
 615                index = (zone->end >> PAGE_SHIFT) + 1;
 616                if (zone->locked) {
 617                        if (zone->elems > top_locked_elems) {
 618                                top_locked_elems = zone->elems;
 619                                top_locked_zone = zone;
 620                        }
 621                } else {
 622                        if (zone->elems > top_elems) {
 623                                top_elems = zone->elems;
 624                                top_zone = zone;
 625                        }
 626                }
 627        }
 628        if (top_zone)
 629                dev->reada_curr_zone = top_zone;
 630        else if (top_locked_zone)
 631                dev->reada_curr_zone = top_locked_zone;
 632        else
 633                return 0;
 634
 635        dev->reada_next = dev->reada_curr_zone->start;
 636        kref_get(&dev->reada_curr_zone->refcnt);
 637        reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
 638
 639        return 1;
 640}
 641
 642static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
 643                                    int mirror_num, struct extent_buffer **eb)
 644{
 645        struct extent_buffer *buf = NULL;
 646        int ret;
 647
 648        buf = btrfs_find_create_tree_block(fs_info, bytenr);
 649        if (IS_ERR(buf))
 650                return 0;
 651
 652        set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
 653
 654        ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
 655        if (ret) {
 656                free_extent_buffer_stale(buf);
 657                return ret;
 658        }
 659
 660        if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
 661                free_extent_buffer_stale(buf);
 662                return -EIO;
 663        } else if (extent_buffer_uptodate(buf)) {
 664                *eb = buf;
 665        } else {
 666                free_extent_buffer(buf);
 667        }
 668        return 0;
 669}
 670
 671static int reada_start_machine_dev(struct btrfs_device *dev)
 672{
 673        struct btrfs_fs_info *fs_info = dev->fs_info;
 674        struct reada_extent *re = NULL;
 675        int mirror_num = 0;
 676        struct extent_buffer *eb = NULL;
 677        u64 logical;
 678        int ret;
 679        int i;
 680
 681        spin_lock(&fs_info->reada_lock);
 682        if (dev->reada_curr_zone == NULL) {
 683                ret = reada_pick_zone(dev);
 684                if (!ret) {
 685                        spin_unlock(&fs_info->reada_lock);
 686                        return 0;
 687                }
 688        }
 689        /*
 690         * FIXME currently we issue the reads one extent at a time. If we have
 691         * a contiguous block of extents, we could also coagulate them or use
 692         * plugging to speed things up
 693         */
 694        ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
 695                                     dev->reada_next >> PAGE_SHIFT, 1);
 696        if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
 697                ret = reada_pick_zone(dev);
 698                if (!ret) {
 699                        spin_unlock(&fs_info->reada_lock);
 700                        return 0;
 701                }
 702                re = NULL;
 703                ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
 704                                        dev->reada_next >> PAGE_SHIFT, 1);
 705        }
 706        if (ret == 0) {
 707                spin_unlock(&fs_info->reada_lock);
 708                return 0;
 709        }
 710        dev->reada_next = re->logical + fs_info->nodesize;
 711        re->refcnt++;
 712
 713        spin_unlock(&fs_info->reada_lock);
 714
 715        spin_lock(&re->lock);
 716        if (re->scheduled || list_empty(&re->extctl)) {
 717                spin_unlock(&re->lock);
 718                reada_extent_put(fs_info, re);
 719                return 0;
 720        }
 721        re->scheduled = 1;
 722        spin_unlock(&re->lock);
 723
 724        /*
 725         * find mirror num
 726         */
 727        for (i = 0; i < re->nzones; ++i) {
 728                if (re->zones[i]->device == dev) {
 729                        mirror_num = i + 1;
 730                        break;
 731                }
 732        }
 733        logical = re->logical;
 734
 735        atomic_inc(&dev->reada_in_flight);
 736        ret = reada_tree_block_flagged(fs_info, logical, mirror_num, &eb);
 737        if (ret)
 738                __readahead_hook(fs_info, re, NULL, ret);
 739        else if (eb)
 740                __readahead_hook(fs_info, re, eb, ret);
 741
 742        if (eb)
 743                free_extent_buffer(eb);
 744
 745        atomic_dec(&dev->reada_in_flight);
 746        reada_extent_put(fs_info, re);
 747
 748        return 1;
 749
 750}
 751
 752static void reada_start_machine_worker(struct btrfs_work *work)
 753{
 754        struct reada_machine_work *rmw;
 755        int old_ioprio;
 756
 757        rmw = container_of(work, struct reada_machine_work, work);
 758
 759        old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
 760                                       task_nice_ioprio(current));
 761        set_task_ioprio(current, BTRFS_IOPRIO_READA);
 762        __reada_start_machine(rmw->fs_info);
 763        set_task_ioprio(current, old_ioprio);
 764
 765        atomic_dec(&rmw->fs_info->reada_works_cnt);
 766
 767        kfree(rmw);
 768}
 769
 770static void __reada_start_machine(struct btrfs_fs_info *fs_info)
 771{
 772        struct btrfs_device *device;
 773        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 774        u64 enqueued;
 775        u64 total = 0;
 776        int i;
 777
 778again:
 779        do {
 780                enqueued = 0;
 781                mutex_lock(&fs_devices->device_list_mutex);
 782                list_for_each_entry(device, &fs_devices->devices, dev_list) {
 783                        if (atomic_read(&device->reada_in_flight) <
 784                            MAX_IN_FLIGHT)
 785                                enqueued += reada_start_machine_dev(device);
 786                }
 787                mutex_unlock(&fs_devices->device_list_mutex);
 788                total += enqueued;
 789        } while (enqueued && total < 10000);
 790        if (fs_devices->seed) {
 791                fs_devices = fs_devices->seed;
 792                goto again;
 793        }
 794
 795        if (enqueued == 0)
 796                return;
 797
 798        /*
 799         * If everything is already in the cache, this is effectively single
 800         * threaded. To a) not hold the caller for too long and b) to utilize
 801         * more cores, we broke the loop above after 10000 iterations and now
 802         * enqueue to workers to finish it. This will distribute the load to
 803         * the cores.
 804         */
 805        for (i = 0; i < 2; ++i) {
 806                reada_start_machine(fs_info);
 807                if (atomic_read(&fs_info->reada_works_cnt) >
 808                    BTRFS_MAX_MIRRORS * 2)
 809                        break;
 810        }
 811}
 812
 813static void reada_start_machine(struct btrfs_fs_info *fs_info)
 814{
 815        struct reada_machine_work *rmw;
 816
 817        rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
 818        if (!rmw) {
 819                /* FIXME we cannot handle this properly right now */
 820                BUG();
 821        }
 822        btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
 823        rmw->fs_info = fs_info;
 824
 825        btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
 826        atomic_inc(&fs_info->reada_works_cnt);
 827}
 828
 829#ifdef DEBUG
 830static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 831{
 832        struct btrfs_device *device;
 833        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 834        unsigned long index;
 835        int ret;
 836        int i;
 837        int j;
 838        int cnt;
 839
 840        spin_lock(&fs_info->reada_lock);
 841        list_for_each_entry(device, &fs_devices->devices, dev_list) {
 842                btrfs_debug(fs_info, "dev %lld has %d in flight", device->devid,
 843                        atomic_read(&device->reada_in_flight));
 844                index = 0;
 845                while (1) {
 846                        struct reada_zone *zone;
 847                        ret = radix_tree_gang_lookup(&device->reada_zones,
 848                                                     (void **)&zone, index, 1);
 849                        if (ret == 0)
 850                                break;
 851                        pr_debug("  zone %llu-%llu elems %llu locked %d devs",
 852                                    zone->start, zone->end, zone->elems,
 853                                    zone->locked);
 854                        for (j = 0; j < zone->ndevs; ++j) {
 855                                pr_cont(" %lld",
 856                                        zone->devs[j]->devid);
 857                        }
 858                        if (device->reada_curr_zone == zone)
 859                                pr_cont(" curr off %llu",
 860                                        device->reada_next - zone->start);
 861                        pr_cont("\n");
 862                        index = (zone->end >> PAGE_SHIFT) + 1;
 863                }
 864                cnt = 0;
 865                index = 0;
 866                while (all) {
 867                        struct reada_extent *re = NULL;
 868
 869                        ret = radix_tree_gang_lookup(&device->reada_extents,
 870                                                     (void **)&re, index, 1);
 871                        if (ret == 0)
 872                                break;
 873                        pr_debug("  re: logical %llu size %u empty %d scheduled %d",
 874                                re->logical, fs_info->nodesize,
 875                                list_empty(&re->extctl), re->scheduled);
 876
 877                        for (i = 0; i < re->nzones; ++i) {
 878                                pr_cont(" zone %llu-%llu devs",
 879                                        re->zones[i]->start,
 880                                        re->zones[i]->end);
 881                                for (j = 0; j < re->zones[i]->ndevs; ++j) {
 882                                        pr_cont(" %lld",
 883                                                re->zones[i]->devs[j]->devid);
 884                                }
 885                        }
 886                        pr_cont("\n");
 887                        index = (re->logical >> PAGE_SHIFT) + 1;
 888                        if (++cnt > 15)
 889                                break;
 890                }
 891        }
 892
 893        index = 0;
 894        cnt = 0;
 895        while (all) {
 896                struct reada_extent *re = NULL;
 897
 898                ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
 899                                             index, 1);
 900                if (ret == 0)
 901                        break;
 902                if (!re->scheduled) {
 903                        index = (re->logical >> PAGE_SHIFT) + 1;
 904                        continue;
 905                }
 906                pr_debug("re: logical %llu size %u list empty %d scheduled %d",
 907                        re->logical, fs_info->nodesize,
 908                        list_empty(&re->extctl), re->scheduled);
 909                for (i = 0; i < re->nzones; ++i) {
 910                        pr_cont(" zone %llu-%llu devs",
 911                                re->zones[i]->start,
 912                                re->zones[i]->end);
 913                        for (j = 0; j < re->zones[i]->ndevs; ++j) {
 914                                pr_cont(" %lld",
 915                                       re->zones[i]->devs[j]->devid);
 916                        }
 917                }
 918                pr_cont("\n");
 919                index = (re->logical >> PAGE_SHIFT) + 1;
 920        }
 921        spin_unlock(&fs_info->reada_lock);
 922}
 923#endif
 924
 925/*
 926 * interface
 927 */
 928struct reada_control *btrfs_reada_add(struct btrfs_root *root,
 929                        struct btrfs_key *key_start, struct btrfs_key *key_end)
 930{
 931        struct reada_control *rc;
 932        u64 start;
 933        u64 generation;
 934        int ret;
 935        struct extent_buffer *node;
 936        static struct btrfs_key max_key = {
 937                .objectid = (u64)-1,
 938                .type = (u8)-1,
 939                .offset = (u64)-1
 940        };
 941
 942        rc = kzalloc(sizeof(*rc), GFP_KERNEL);
 943        if (!rc)
 944                return ERR_PTR(-ENOMEM);
 945
 946        rc->fs_info = root->fs_info;
 947        rc->key_start = *key_start;
 948        rc->key_end = *key_end;
 949        atomic_set(&rc->elems, 0);
 950        init_waitqueue_head(&rc->wait);
 951        kref_init(&rc->refcnt);
 952        kref_get(&rc->refcnt); /* one ref for having elements */
 953
 954        node = btrfs_root_node(root);
 955        start = node->start;
 956        generation = btrfs_header_generation(node);
 957        free_extent_buffer(node);
 958
 959        ret = reada_add_block(rc, start, &max_key, generation);
 960        if (ret) {
 961                kfree(rc);
 962                return ERR_PTR(ret);
 963        }
 964
 965        reada_start_machine(root->fs_info);
 966
 967        return rc;
 968}
 969
 970#ifdef DEBUG
 971int btrfs_reada_wait(void *handle)
 972{
 973        struct reada_control *rc = handle;
 974        struct btrfs_fs_info *fs_info = rc->fs_info;
 975
 976        while (atomic_read(&rc->elems)) {
 977                if (!atomic_read(&fs_info->reada_works_cnt))
 978                        reada_start_machine(fs_info);
 979                wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
 980                                   5 * HZ);
 981                dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
 982        }
 983
 984        dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
 985
 986        kref_put(&rc->refcnt, reada_control_release);
 987
 988        return 0;
 989}
 990#else
 991int btrfs_reada_wait(void *handle)
 992{
 993        struct reada_control *rc = handle;
 994        struct btrfs_fs_info *fs_info = rc->fs_info;
 995
 996        while (atomic_read(&rc->elems)) {
 997                if (!atomic_read(&fs_info->reada_works_cnt))
 998                        reada_start_machine(fs_info);
 999                wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
1000                                   (HZ + 9) / 10);
1001        }
1002
1003        kref_put(&rc->refcnt, reada_control_release);
1004
1005        return 0;
1006}
1007#endif
1008
1009void btrfs_reada_detach(void *handle)
1010{
1011        struct reada_control *rc = handle;
1012
1013        kref_put(&rc->refcnt, reada_control_release);
1014}
1015