linux/drivers/md/bcache/request.c
   1/*
   2 * Main bcache entry point - handle a read or a write request and decide what to
   3 * do with it; the make_request functions are called by the block layer.
   4 *
   5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
   6 * Copyright 2012 Google, Inc.
   7 */
   8
   9#include "bcache.h"
  10#include "btree.h"
  11#include "debug.h"
  12#include "request.h"
  13#include "writeback.h"
  14
  15#include <linux/cgroup.h>
  16#include <linux/module.h>
  17#include <linux/hash.h>
  18#include <linux/random.h>
  19#include "blk-cgroup.h"
  20
  21#include <trace/events/bcache.h>
  22
  23#define CUTOFF_CACHE_ADD        95
  24#define CUTOFF_CACHE_READA      90
  25
  26struct kmem_cache *bch_search_cache;
  27
  28static void check_should_skip(struct cached_dev *, struct search *);
  29
  30/* Cgroup interface */
  31
  32#ifdef CONFIG_CGROUP_BCACHE
  33static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 };
  34
  35static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
  36{
  37        struct cgroup_subsys_state *css;
  38        return cgroup &&
  39                (css = cgroup_subsys_state(cgroup, bcache_subsys_id))
  40                ? container_of(css, struct bch_cgroup, css)
  41                : &bcache_default_cgroup;
  42}
  43
  44struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio)
  45{
  46        struct cgroup_subsys_state *css = bio->bi_css
  47                ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id)
  48                : task_subsys_state(current, bcache_subsys_id);
  49
  50        return css
  51                ? container_of(css, struct bch_cgroup, css)
  52                : &bcache_default_cgroup;
  53}
  54
  55static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
  56                        struct file *file,
  57                        char __user *buf, size_t nbytes, loff_t *ppos)
  58{
  59        char tmp[1024];
  60        int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes,
  61                                          cgroup_to_bcache(cgrp)->cache_mode + 1);
  62
  63        if (len < 0)
  64                return len;
  65
  66        return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
  67}
  68
  69static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
  70                            const char *buf)
  71{
  72        int v = bch_read_string_list(buf, bch_cache_modes);
  73        if (v < 0)
  74                return v;
  75
  76        cgroup_to_bcache(cgrp)->cache_mode = v - 1;
  77        return 0;
  78}
  79
  80static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
  81{
  82        return cgroup_to_bcache(cgrp)->verify;
  83}
  84
  85static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
  86{
  87        cgroup_to_bcache(cgrp)->verify = val;
  88        return 0;
  89}
  90
  91static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
  92{
  93        struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
  94        return atomic_read(&bcachecg->stats.cache_hits);
  95}
  96
  97static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
  98{
  99        struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
 100        return atomic_read(&bcachecg->stats.cache_misses);
 101}
 102
 103static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
 104                                         struct cftype *cft)
 105{
 106        struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
 107        return atomic_read(&bcachecg->stats.cache_bypass_hits);
 108}
 109
 110static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
 111                                           struct cftype *cft)
 112{
 113        struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
 114        return atomic_read(&bcachecg->stats.cache_bypass_misses);
 115}
 116
 117static struct cftype bch_files[] = {
 118        {
 119                .name           = "cache_mode",
 120                .read           = cache_mode_read,
 121                .write_string   = cache_mode_write,
 122        },
 123        {
 124                .name           = "verify",
 125                .read_u64       = bch_verify_read,
 126                .write_u64      = bch_verify_write,
 127        },
 128        {
 129                .name           = "cache_hits",
 130                .read_u64       = bch_cache_hits_read,
 131        },
 132        {
 133                .name           = "cache_misses",
 134                .read_u64       = bch_cache_misses_read,
 135        },
 136        {
 137                .name           = "cache_bypass_hits",
 138                .read_u64       = bch_cache_bypass_hits_read,
 139        },
 140        {
 141                .name           = "cache_bypass_misses",
 142                .read_u64       = bch_cache_bypass_misses_read,
 143        },
 144        { }     /* terminate */
 145};
 146
 147static void init_bch_cgroup(struct bch_cgroup *cg)
 148{
 149        cg->cache_mode = -1;
 150}
 151
 152static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
 153{
 154        struct bch_cgroup *cg;
 155
 156        cg = kzalloc(sizeof(*cg), GFP_KERNEL);
 157        if (!cg)
 158                return ERR_PTR(-ENOMEM);
 159        init_bch_cgroup(cg);
 160        return &cg->css;
 161}
 162
 163static void bcachecg_destroy(struct cgroup *cgroup)
 164{
 165        struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
 166        free_css_id(&bcache_subsys, &cg->css);
 167        kfree(cg);
 168}
 169
 170struct cgroup_subsys bcache_subsys = {
 171        .create         = bcachecg_create,
 172        .destroy        = bcachecg_destroy,
 173        .subsys_id      = bcache_subsys_id,
 174        .name           = "bcache",
 175        .module         = THIS_MODULE,
 176};
 177EXPORT_SYMBOL_GPL(bcache_subsys);
 178#endif
 179
 180static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
 181{
 182#ifdef CONFIG_CGROUP_BCACHE
 183        int r = bch_bio_to_cgroup(bio)->cache_mode;
 184        if (r >= 0)
 185                return r;
 186#endif
 187        return BDEV_CACHE_MODE(&dc->sb);
 188}
 189
 190static bool verify(struct cached_dev *dc, struct bio *bio)
 191{
 192#ifdef CONFIG_CGROUP_BCACHE
 193        if (bch_bio_to_cgroup(bio)->verify)
 194                return true;
 195#endif
 196        return dc->verify;
 197}
 198
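/*
 * Computes a 64 bit checksum over the bio's data and stores it in the slot
 * just past the key's pointers; the top bit is masked off (presumably
 * reserved by the on-disk key format).
 */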
 199static void bio_csum(struct bio *bio, struct bkey *k)
 200{
 201        struct bio_vec *bv;
 202        uint64_t csum = 0;
 203        int i;
 204
 205        bio_for_each_segment(bv, bio, i) {
 206                void *d = kmap(bv->bv_page) + bv->bv_offset;
 207                csum = bch_crc64_update(csum, d, bv->bv_len);
 208                kunmap(bv->bv_page);
 209        }
 210
 211        k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
 212}
 213
 214/* Insert data into cache */
 215
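/*
 * Instead of inserting the data, emit keys with no pointers covering the
 * bio's range so any existing cached data for that region is invalidated.
 * Each key covers at most 1 << 14 sectors; if the keylist can't be grown we
 * stop early and let the journal path insert what we have so far.
 */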
 216static void bio_invalidate(struct closure *cl)
 217{
 218        struct btree_op *op = container_of(cl, struct btree_op, cl);
 219        struct bio *bio = op->cache_bio;
 220
 221        pr_debug("invalidating %i sectors from %llu",
 222                 bio_sectors(bio), (uint64_t) bio->bi_sector);
 223
 224        while (bio_sectors(bio)) {
 225                unsigned len = min(bio_sectors(bio), 1U << 14);
 226
 227                if (bch_keylist_realloc(&op->keys, 0, op->c))
 228                        goto out;
 229
 230                bio->bi_sector  += len;
 231                bio->bi_size    -= len << 9;
 232
 233                bch_keylist_add(&op->keys,
 234                                &KEY(op->inode, bio->bi_sector, len));
 235        }
 236
 237        op->insert_data_done = true;
 238        bio_put(bio);
 239out:
 240        continue_at(cl, bch_journal, bcache_wq);
 241}
 242
 243struct open_bucket {
 244        struct list_head        list;
 245        struct task_struct      *last;
 246        unsigned                sectors_free;
 247        BKEY_PADDED(key);
 248};
 249
 250void bch_open_buckets_free(struct cache_set *c)
 251{
 252        struct open_bucket *b;
 253
 254        while (!list_empty(&c->data_buckets)) {
 255                b = list_first_entry(&c->data_buckets,
 256                                     struct open_bucket, list);
 257                list_del(&b->list);
 258                kfree(b);
 259        }
 260}
 261
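/*
 * Preallocates a small fixed pool (six) of open_buckets that incoming writes
 * are spread across; see the comment above pick_data_bucket() for how a
 * bucket is chosen for a given write.
 */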
 262int bch_open_buckets_alloc(struct cache_set *c)
 263{
 264        int i;
 265
 266        spin_lock_init(&c->data_bucket_lock);
 267
 268        for (i = 0; i < 6; i++) {
 269                struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
 270                if (!b)
 271                        return -ENOMEM;
 272
 273                list_add(&b->list, &c->data_buckets);
 274        }
 275
 276        return 0;
 277}
 278
 279/*
 280 * We keep multiple buckets open for writes, and try to segregate different
 281 * write streams for better cache utilization: first we look for a bucket where
 282 * the last write to it was sequential with the current write, and failing that
 283 * we look for a bucket that was last used by the same task.
 284 *
 285 * The idea is that if you've got multiple tasks pulling data into the cache
 286 * at the same time, you'll get better cache utilization if you try to
 287 * segregate their data and preserve locality.
 288 *
 289 * For example, say you're starting Firefox at the same time you're copying a
 290 * bunch of files. Firefox will likely end up being fairly hot and stay in the
 291 * cache awhile, but the data you copied might not be; if you wrote all that
 292 * data to the same buckets it'd get invalidated at the same time.
 293 *
 294 * Both of those tasks will be doing fairly random IO so we can't rely on
 295 * detecting sequential IO to segregate their data, but going off of the task
 296 * should be a sane heuristic.
 297 */
 298static struct open_bucket *pick_data_bucket(struct cache_set *c,
 299                                            const struct bkey *search,
 300                                            struct task_struct *task,
 301                                            struct bkey *alloc)
 302{
 303        struct open_bucket *ret, *ret_task = NULL;
 304
 305        list_for_each_entry_reverse(ret, &c->data_buckets, list)
 306                if (!bkey_cmp(&ret->key, search))
 307                        goto found;
 308                else if (ret->last == task)
 309                        ret_task = ret;
 310
 311        ret = ret_task ?: list_first_entry(&c->data_buckets,
 312                                           struct open_bucket, list);
 313found:
 314        if (!ret->sectors_free && KEY_PTRS(alloc)) {
 315                ret->sectors_free = c->sb.bucket_size;
 316                bkey_copy(&ret->key, alloc);
 317                bkey_init(alloc);
 318        }
 319
 320        if (!ret->sectors_free)
 321                ret = NULL;
 322
 323        return ret;
 324}
 325
 326/*
 327 * Allocates some space in the cache to write to, sets k to point to the newly
 328 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
 329 * end of the newly allocated space).
 330 *
 331 * May allocate fewer sectors than @sectors; KEY_SIZE(k) indicates how many
 332 * sectors were actually allocated.
 333 *
 334 * If s->writeback is true, will not fail.
 335 */
 336static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
 337                              struct search *s)
 338{
 339        struct cache_set *c = s->op.c;
 340        struct open_bucket *b;
 341        BKEY_PADDED(key) alloc;
 342        struct closure cl, *w = NULL;
 343        unsigned i;
 344
 345        if (s->writeback) {
 346                closure_init_stack(&cl);
 347                w = &cl;
 348        }
 349
 350        /*
 351         * We might have to allocate a new bucket, which we can't do with a
 352         * spinlock held. So if we have to allocate, we drop the lock, allocate
 353         * and then retry. KEY_PTRS() indicates whether alloc points to
 354         * allocated bucket(s).
 355         */
 356
 357        bkey_init(&alloc.key);
 358        spin_lock(&c->data_bucket_lock);
 359
 360        while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) {
 361                unsigned watermark = s->op.write_prio
 362                        ? WATERMARK_MOVINGGC
 363                        : WATERMARK_NONE;
 364
 365                spin_unlock(&c->data_bucket_lock);
 366
 367                if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w))
 368                        return false;
 369
 370                spin_lock(&c->data_bucket_lock);
 371        }
 372
 373        /*
 374         * If we had to allocate, we might race and not need to allocate the
 375         * second time we call pick_data_bucket(). If we allocated a bucket but
 376         * didn't use it, drop the refcount bch_bucket_alloc_set() took:
 377         */
 378        if (KEY_PTRS(&alloc.key))
 379                __bkey_put(c, &alloc.key);
 380
 381        for (i = 0; i < KEY_PTRS(&b->key); i++)
 382                EBUG_ON(ptr_stale(c, &b->key, i));
 383
 384        /* Set up the pointer to the space we're allocating: */
 385
 386        for (i = 0; i < KEY_PTRS(&b->key); i++)
 387                k->ptr[i] = b->key.ptr[i];
 388
 389        sectors = min(sectors, b->sectors_free);
 390
 391        SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
 392        SET_KEY_SIZE(k, sectors);
 393        SET_KEY_PTRS(k, KEY_PTRS(&b->key));
 394
 395        /*
 396         * Move b to the end of the lru, and keep track of what this bucket was
 397         * last used for:
 398         */
 399        list_move_tail(&b->list, &c->data_buckets);
 400        bkey_copy_key(&b->key, k);
 401        b->last = s->task;
 402
 403        b->sectors_free -= sectors;
 404
 405        for (i = 0; i < KEY_PTRS(&b->key); i++) {
 406                SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
 407
 408                atomic_long_add(sectors,
 409                                &PTR_CACHE(c, &b->key, i)->sectors_written);
 410        }
 411
 412        if (b->sectors_free < c->sb.block_size)
 413                b->sectors_free = 0;
 414
 415        /*
 416         * k takes refcounts on the buckets it points to until it's inserted
 417         * into the btree, but if we're done with this bucket we just transfer
 418         * the refcount bch_bucket_alloc_set() took.
 419         */
 420        if (b->sectors_free)
 421                for (i = 0; i < KEY_PTRS(&b->key); i++)
 422                        atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
 423
 424        spin_unlock(&c->data_bucket_lock);
 425        return true;
 426}
 427
 428static void bch_insert_data_error(struct closure *cl)
 429{
 430        struct btree_op *op = container_of(cl, struct btree_op, cl);
 431
 432        /*
 433         * Our data write just errored, which means we've got a bunch of keys to
 434         * insert that point to data that wasn't successfully written.
 435         *
 436         * We don't have to insert those keys but we still have to invalidate
 437         * that region of the cache - so, if we just strip off all the pointers
 438         * from the keys we'll accomplish just that.
 439         */
 440
 441        struct bkey *src = op->keys.bottom, *dst = op->keys.bottom;
 442
 443        while (src != op->keys.top) {
 444                struct bkey *n = bkey_next(src);
 445
 446                SET_KEY_PTRS(src, 0);
 447                bkey_copy(dst, src);
 448
 449                dst = bkey_next(dst);
 450                src = n;
 451        }
 452
 453        op->keys.top = dst;
 454
 455        bch_journal(cl);
 456}
 457
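/*
 * Completion for the bios that write data into the cache: on error, a
 * writeback write has to fail the whole request, a writethrough write
 * switches the closure over to bch_insert_data_error() so the keys are
 * stripped of their pointers before they're inserted, and a cache miss
 * insert is simply abandoned.
 */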
 458static void bch_insert_data_endio(struct bio *bio, int error)
 459{
 460        struct closure *cl = bio->bi_private;
 461        struct btree_op *op = container_of(cl, struct btree_op, cl);
 462        struct search *s = container_of(op, struct search, op);
 463
 464        if (error) {
 465                /* TODO: We could try to recover from this. */
 466                if (s->writeback)
 467                        s->error = error;
 468                else if (s->write)
 469                        set_closure_fn(cl, bch_insert_data_error, bcache_wq);
 470                else
 471                        set_closure_fn(cl, NULL, NULL);
 472        }
 473
 474        bch_bbio_endio(op->c, bio, error, "writing data to cache");
 475}
 476
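/*
 * Core of the data insert path: carve the data bio up into chunks that fit
 * in the open buckets, build a key for each chunk and submit the resulting
 * split bios to the cache device.  If op->skip is set, the region is
 * invalidated instead (see bio_invalidate()).  Also kicks off garbage
 * collection once enough sectors have been written since the last GC.
 */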
 477static void bch_insert_data_loop(struct closure *cl)
 478{
 479        struct btree_op *op = container_of(cl, struct btree_op, cl);
 480        struct search *s = container_of(op, struct search, op);
 481        struct bio *bio = op->cache_bio, *n;
 482
 483        if (op->skip)
 484                return bio_invalidate(cl);
 485
 486        if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
 487                set_gc_sectors(op->c);
 488                bch_queue_gc(op->c);
 489        }
 490
 491        /*
 492         * Journal writes are marked REQ_FLUSH; if the original write was a
 493         * flush, it'll wait on the journal write.
 494         */
 495        bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA);
 496
 497        do {
 498                unsigned i;
 499                struct bkey *k;
 500                struct bio_set *split = s->d
 501                        ? s->d->bio_split : op->c->bio_split;
 502
 503                /* 1 for the device pointer and 1 for the checksum */
 504                if (bch_keylist_realloc(&op->keys,
 505                                        1 + (op->csum ? 1 : 0),
 506                                        op->c))
 507                        continue_at(cl, bch_journal, bcache_wq);
 508
 509                k = op->keys.top;
 510                bkey_init(k);
 511                SET_KEY_INODE(k, op->inode);
 512                SET_KEY_OFFSET(k, bio->bi_sector);
 513
 514                if (!bch_alloc_sectors(k, bio_sectors(bio), s))
 515                        goto err;
 516
 517                n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
 518
 519                n->bi_end_io    = bch_insert_data_endio;
 520                n->bi_private   = cl;
 521
 522                if (s->writeback) {
 523                        SET_KEY_DIRTY(k, true);
 524
 525                        for (i = 0; i < KEY_PTRS(k); i++)
 526                                SET_GC_MARK(PTR_BUCKET(op->c, k, i),
 527                                            GC_MARK_DIRTY);
 528                }
 529
 530                SET_KEY_CSUM(k, op->csum);
 531                if (KEY_CSUM(k))
 532                        bio_csum(n, k);
 533
 534                trace_bcache_cache_insert(k);
 535                bch_keylist_push(&op->keys);
 536
 537                n->bi_rw |= REQ_WRITE;
 538                bch_submit_bbio(n, op->c, k, 0);
 539        } while (n != bio);
 540
 541        op->insert_data_done = true;
 542        continue_at(cl, bch_journal, bcache_wq);
 543err:
 544        /* bch_alloc_sectors() blocks if s->writeback is true */
 545        BUG_ON(s->writeback);
 546
 547        /*
 548         * But if it's not a writeback write, we'd rather just bail out if
 549         * there aren't any buckets ready to write to - it might take a while and
 550         * we might be starving btree writes for gc or something.
 551         */
 552
 553        if (s->write) {
 554                /*
 555                 * Writethrough write: We can't complete the write until we've
 556                 * updated the index. But we don't want to delay the write while
 557                 * we wait for buckets to be freed up, so just invalidate the
 558                 * rest of the write.
 559                 */
 560                op->skip = true;
 561                return bio_invalidate(cl);
 562        } else {
 563                /*
 564                 * From a cache miss, we can just insert the keys for the data
 565                 * we have written or bail out if we didn't do anything.
 566                 */
 567                op->insert_data_done = true;
 568                bio_put(bio);
 569
 570                if (!bch_keylist_empty(&op->keys))
 571                        continue_at(cl, bch_journal, bcache_wq);
 572                else
 573                        closure_return(cl);
 574        }
 575}
 576
 577/**
 578 * bch_insert_data - stick some data in the cache
 579 *
 580 * This is the starting point for any data to end up in a cache device; it could
 581 * be from a normal write, or a writeback write, or a write to a flash only
 582 * volume - it's also used by the moving garbage collector to compact data in
 583 * mostly empty buckets.
 584 *
 585 * It first writes the data to the cache, creating a list of keys to be inserted
 586 * (if the data had to be fragmented there will be multiple keys); after the
 587 * data is written it calls bch_journal, and after the keys have been added to
 588 * the next journal write they're inserted into the btree.
 589 *
 590 * It inserts the data in op->cache_bio; bi_sector is used for the key offset,
 591 * and op->inode is used for the key inode.
 592 *
 593 * If op->skip is true, instead of inserting the data it invalidates the region
 594 * of the cache represented by op->cache_bio and op->inode.
 595 */
 596void bch_insert_data(struct closure *cl)
 597{
 598        struct btree_op *op = container_of(cl, struct btree_op, cl);
 599
 600        bch_keylist_init(&op->keys);
 601        bio_get(op->cache_bio);
 602        bch_insert_data_loop(cl);
 603}
 604
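/*
 * Closure step that inserts the accumulated keys into the btree; if there is
 * still data left to write it loops back into bch_insert_data_loop(),
 * otherwise it frees the keylist and finishes.
 */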
 605void bch_btree_insert_async(struct closure *cl)
 606{
 607        struct btree_op *op = container_of(cl, struct btree_op, cl);
 608        struct search *s = container_of(op, struct search, op);
 609
 610        if (bch_btree_insert(op, op->c)) {
 611                s->error                = -ENOMEM;
 612                op->insert_data_done    = true;
 613        }
 614
 615        if (op->insert_data_done) {
 616                bch_keylist_free(&op->keys);
 617                closure_return(cl);
 618        } else
 619                continue_at(cl, bch_insert_data_loop, bcache_wq);
 620}
 621
 622/* Common code for the make_request functions */
 623
 624static void request_endio(struct bio *bio, int error)
 625{
 626        struct closure *cl = bio->bi_private;
 627
 628        if (error) {
 629                struct search *s = container_of(cl, struct search, cl);
 630                s->error = error;
 631                /* Only cache read errors are recoverable */
 632                s->recoverable = false;
 633        }
 634
 635        bio_put(bio);
 636        closure_put(cl);
 637}
 638
 639void bch_cache_read_endio(struct bio *bio, int error)
 640{
 641        struct bbio *b = container_of(bio, struct bbio, bio);
 642        struct closure *cl = bio->bi_private;
 643        struct search *s = container_of(cl, struct search, cl);
 644
 645        /*
 646         * If the bucket was reused while our bio was in flight, we might have
 647         * read the wrong data. Set s->error but not error so it doesn't get
 648         * counted against the cache device, but we'll still reread the data
 649         * from the backing device.
 650         */
 651
 652        if (error)
 653                s->error = error;
 654        else if (ptr_stale(s->op.c, &b->key, 0)) {
 655                atomic_long_inc(&s->op.c->cache_read_races);
 656                s->error = -EINTR;
 657        }
 658
 659        bch_bbio_endio(s->op.c, bio, error, "reading from cache");
 660}
 661
 662static void bio_complete(struct search *s)
 663{
 664        if (s->orig_bio) {
 665                int cpu, rw = bio_data_dir(s->orig_bio);
 666                unsigned long duration = jiffies - s->start_time;
 667
 668                cpu = part_stat_lock();
 669                part_round_stats(cpu, &s->d->disk->part0);
 670                part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
 671                part_stat_unlock();
 672
 673                trace_bcache_request_end(s, s->orig_bio);
 674                bio_endio(s->orig_bio, s->error);
 675                s->orig_bio = NULL;
 676        }
 677}
 678
 679static void do_bio_hook(struct search *s)
 680{
 681        struct bio *bio = &s->bio.bio;
 682        memcpy(bio, s->orig_bio, sizeof(struct bio));
 683
 684        bio->bi_end_io          = request_endio;
 685        bio->bi_private         = &s->cl;
 686        atomic_set(&bio->bi_cnt, 3);
 687}
 688
 689static void search_free(struct closure *cl)
 690{
 691        struct search *s = container_of(cl, struct search, cl);
 692        bio_complete(s);
 693
 694        if (s->op.cache_bio)
 695                bio_put(s->op.cache_bio);
 696
 697        if (s->unaligned_bvec)
 698                mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
 699
 700        closure_debug_destroy(cl);
 701        mempool_free(s, s->d->c->search);
 702}
 703
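/*
 * Allocates and initializes a struct search for an incoming bio.  If the
 * bio's segments aren't full pages, a private copy of its io_vec is kept
 * (s->unaligned_bvec); request_read_error() uses it to restore the vector
 * from the original bio before retrying from the backing device.
 */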
 704static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
 705{
 706        struct bio_vec *bv;
 707        struct search *s = mempool_alloc(d->c->search, GFP_NOIO);
 708        memset(s, 0, offsetof(struct search, op.keys));
 709
 710        __closure_init(&s->cl, NULL);
 711
 712        s->op.inode             = d->id;
 713        s->op.c                 = d->c;
 714        s->d                    = d;
 715        s->op.lock              = -1;
 716        s->task                 = current;
 717        s->orig_bio             = bio;
 718        s->write                = (bio->bi_rw & REQ_WRITE) != 0;
 719        s->op.flush_journal     = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
 720        s->op.skip              = (bio->bi_rw & REQ_DISCARD) != 0;
 721        s->recoverable          = 1;
 722        s->start_time           = jiffies;
 723        do_bio_hook(s);
 724
 725        if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) {
 726                bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO);
 727                memcpy(bv, bio_iovec(bio),
 728                       sizeof(struct bio_vec) * bio_segments(bio));
 729
 730                s->bio.bio.bi_io_vec    = bv;
 731                s->unaligned_bvec       = 1;
 732        }
 733
 734        return s;
 735}
 736
 737static void btree_read_async(struct closure *cl)
 738{
 739        struct btree_op *op = container_of(cl, struct btree_op, cl);
 740
 741        int ret = btree_root(search_recurse, op->c, op);
 742
 743        if (ret == -EAGAIN)
 744                continue_at(cl, btree_read_async, bcache_wq);
 745
 746        closure_return(cl);
 747}
 748
 749/* Cached devices */
 750
 751static void cached_dev_bio_complete(struct closure *cl)
 752{
 753        struct search *s = container_of(cl, struct search, cl);
 754        struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 755
 756        search_free(cl);
 757        cached_dev_put(dc);
 758}
 759
 760/* Process reads */
 761
 762static void cached_dev_read_complete(struct closure *cl)
 763{
 764        struct search *s = container_of(cl, struct search, cl);
 765
 766        if (s->op.insert_collision)
 767                bch_mark_cache_miss_collision(s);
 768
 769        if (s->op.cache_bio) {
 770                int i;
 771                struct bio_vec *bv;
 772
 773                __bio_for_each_segment(bv, s->op.cache_bio, i, 0)
 774                        __free_page(bv->bv_page);
 775        }
 776
 777        cached_dev_bio_complete(cl);
 778}
 779
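/*
 * A read from the cache failed or raced with a bucket being reused.  If the
 * error is recoverable (i.e. the backing device itself didn't fail), reset
 * the cloned bio and reread everything from the backing device.
 */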
 780static void request_read_error(struct closure *cl)
 781{
 782        struct search *s = container_of(cl, struct search, cl);
 783        struct bio_vec *bv;
 784        int i;
 785
 786        if (s->recoverable) {
 787                /* Retry from the backing device: */
 788                trace_bcache_read_retry(s->orig_bio);
 789
 790                s->error = 0;
 791                bv = s->bio.bio.bi_io_vec;
 792                do_bio_hook(s);
 793                s->bio.bio.bi_io_vec = bv;
 794
 795                if (!s->unaligned_bvec)
 796                        bio_for_each_segment(bv, s->orig_bio, i)
 797                                bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
 798                else
 799                        memcpy(s->bio.bio.bi_io_vec,
 800                               bio_iovec(s->orig_bio),
 801                               sizeof(struct bio_vec) *
 802                               bio_segments(s->orig_bio));
 803
 804                /* XXX: invalidate cache */
 805
 806                closure_bio_submit(&s->bio.bio, &s->cl, s->d);
 807        }
 808
 809        continue_at(cl, cached_dev_read_complete, NULL);
 810}
 811
 812static void request_read_done(struct closure *cl)
 813{
 814        struct search *s = container_of(cl, struct search, cl);
 815        struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 816
 817        /*
 818         * s->op.cache_bio != NULL implies that we had a cache miss; cache_bio now
 819         * contains data ready to be inserted into the cache.
 820         *
 821         * First, we copy the data we just read from cache_bio's bounce buffers
 822         * to the buffers the original bio pointed to:
 823         */
 824
 825        if (s->op.cache_bio) {
 826                bio_reset(s->op.cache_bio);
 827                s->op.cache_bio->bi_sector      = s->cache_miss->bi_sector;
 828                s->op.cache_bio->bi_bdev        = s->cache_miss->bi_bdev;
 829                s->op.cache_bio->bi_size        = s->cache_bio_sectors << 9;
 830                bch_bio_map(s->op.cache_bio, NULL);
 831
 832                bio_copy_data(s->cache_miss, s->op.cache_bio);
 833
 834                bio_put(s->cache_miss);
 835                s->cache_miss = NULL;
 836        }
 837
 838        if (verify(dc, &s->bio.bio) && s->recoverable)
 839                bch_data_verify(s);
 840
 841        bio_complete(s);
 842
 843        if (s->op.cache_bio &&
 844            !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) {
 845                s->op.type = BTREE_REPLACE;
 846                closure_call(&s->op.cl, bch_insert_data, NULL, cl);
 847        }
 848
 849        continue_at(cl, cached_dev_read_complete, NULL);
 850}
 851
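/*
 * Runs in the completion ("bottom half") context: account the hit or miss,
 * then punt to a workqueue if we need to retry, copy bounce buffers or
 * verify data; otherwise complete the request directly.
 */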
 852static void request_read_done_bh(struct closure *cl)
 853{
 854        struct search *s = container_of(cl, struct search, cl);
 855        struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 856
 857        bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);
 858        trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip);
 859
 860        if (s->error)
 861                continue_at_nobarrier(cl, request_read_error, bcache_wq);
 862        else if (s->op.cache_bio || verify(dc, &s->bio.bio))
 863                continue_at_nobarrier(cl, request_read_done, bcache_wq);
 864        else
 865                continue_at_nobarrier(cl, cached_dev_read_complete, NULL);
 866}
 867
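/*
 * Called for the part of a read that wasn't found in the cache: split off the
 * missing range, optionally extend it with readahead (skipped for REQ_RAHEAD
 * and REQ_META requests, or when the cache is nearly full), reserve the range
 * in the btree with a check key, and read it into a bounce bio so it can be
 * inserted into the cache once it completes.
 */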
 868static int cached_dev_cache_miss(struct btree *b, struct search *s,
 869                                 struct bio *bio, unsigned sectors)
 870{
 871        int ret = 0;
 872        unsigned reada;
 873        struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 874        struct bio *miss;
 875
 876        miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
 877        if (miss == bio)
 878                s->op.lookup_done = true;
 879
 880        miss->bi_end_io         = request_endio;
 881        miss->bi_private        = &s->cl;
 882
 883        if (s->cache_miss || s->op.skip)
 884                goto out_submit;
 885
 886        if (miss != bio ||
 887            (bio->bi_rw & REQ_RAHEAD) ||
 888            (bio->bi_rw & REQ_META) ||
 889            s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA)
 890                reada = 0;
 891        else {
 892                reada = min(dc->readahead >> 9,
 893                            sectors - bio_sectors(miss));
 894
 895                if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev))
 896                        reada = bdev_sectors(miss->bi_bdev) -
 897                                bio_end_sector(miss);
 898        }
 899
 900        s->cache_bio_sectors = bio_sectors(miss) + reada;
 901        s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT,
 902                        DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
 903                        dc->disk.bio_split);
 904
 905        if (!s->op.cache_bio)
 906                goto out_submit;
 907
 908        s->op.cache_bio->bi_sector      = miss->bi_sector;
 909        s->op.cache_bio->bi_bdev        = miss->bi_bdev;
 910        s->op.cache_bio->bi_size        = s->cache_bio_sectors << 9;
 911
 912        s->op.cache_bio->bi_end_io      = request_endio;
 913        s->op.cache_bio->bi_private     = &s->cl;
 914
 915        /* btree_search_recurse()'s btree iterator is no good anymore */
 916        ret = -EINTR;
 917        if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio))
 918                goto out_put;
 919
 920        bch_bio_map(s->op.cache_bio, NULL);
 921        if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
 922                goto out_put;
 923
 924        s->cache_miss = miss;
 925        bio_get(s->op.cache_bio);
 926
 927        closure_bio_submit(s->op.cache_bio, &s->cl, s->d);
 928
 929        return ret;
 930out_put:
 931        bio_put(s->op.cache_bio);
 932        s->op.cache_bio = NULL;
 933out_submit:
 934        closure_bio_submit(miss, &s->cl, s->d);
 935        return ret;
 936}
 937
 938static void request_read(struct cached_dev *dc, struct search *s)
 939{
 940        struct closure *cl = &s->cl;
 941
 942        check_should_skip(dc, s);
 943        closure_call(&s->op.cl, btree_read_async, NULL, cl);
 944
 945        continue_at(cl, request_read_done_bh, NULL);
 946}
 947
 948/* Process writes */
 949
 950static void cached_dev_write_complete(struct closure *cl)
 951{
 952        struct search *s = container_of(cl, struct search, cl);
 953        struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 954
 955        up_read_non_owner(&dc->writeback_lock);
 956        cached_dev_bio_complete(cl);
 957}
 958
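/*
 * Decide how to handle a write: writeback (write only to the cache and mark
 * it dirty), writethrough (write to both the cache and the backing device),
 * or bypass (write only to the backing device and invalidate the cached
 * range).  Discards are passed through only if the backing device supports
 * them.
 */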
 959static void request_write(struct cached_dev *dc, struct search *s)
 960{
 961        struct closure *cl = &s->cl;
 962        struct bio *bio = &s->bio.bio;
 963        struct bkey start, end;
 964        start = KEY(dc->disk.id, bio->bi_sector, 0);
 965        end = KEY(dc->disk.id, bio_end_sector(bio), 0);
 966
 967        bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
 968
 969        check_should_skip(dc, s);
 970        down_read_non_owner(&dc->writeback_lock);
 971
 972        if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
 973                s->op.skip      = false;
 974                s->writeback    = true;
 975        }
 976
 977        if (bio->bi_rw & REQ_DISCARD)
 978                goto skip;
 979
 980        if (should_writeback(dc, s->orig_bio,
 981                             cache_mode(dc, bio),
 982                             s->op.skip)) {
 983                s->op.skip = false;
 984                s->writeback = true;
 985        }
 986
 987        if (s->op.skip)
 988                goto skip;
 989
 990        trace_bcache_write(s->orig_bio, s->writeback, s->op.skip);
 991
 992        if (!s->writeback) {
 993                s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
 994                                                   dc->disk.bio_split);
 995
 996                closure_bio_submit(bio, cl, s->d);
 997        } else {
 998                bch_writeback_add(dc);
 999
1000                if (s->op.flush_journal) {
1001                        /* Also need to send a flush to the backing device */
1002                        s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
1003                                                           dc->disk.bio_split);
1004
1005                        bio->bi_size = 0;
1006                        bio->bi_vcnt = 0;
1007                        closure_bio_submit(bio, cl, s->d);
1008                } else {
1009                        s->op.cache_bio = bio;
1010                }
1011        }
1012out:
1013        closure_call(&s->op.cl, bch_insert_data, NULL, cl);
1014        continue_at(cl, cached_dev_write_complete, NULL);
1015skip:
1016        s->op.skip = true;
1017        s->op.cache_bio = s->orig_bio;
1018        bio_get(s->op.cache_bio);
1019
1020        if ((bio->bi_rw & REQ_DISCARD) &&
1021            !blk_queue_discard(bdev_get_queue(dc->bdev)))
1022                goto out;
1023
1024        closure_bio_submit(bio, cl, s->d);
1025        goto out;
1026}
1027
1028static void request_nodata(struct cached_dev *dc, struct search *s)
1029{
1030        struct closure *cl = &s->cl;
1031        struct bio *bio = &s->bio.bio;
1032
1033        if (bio->bi_rw & REQ_DISCARD) {
1034                request_write(dc, s);
1035                return;
1036        }
1037
1038        if (s->op.flush_journal)
1039                bch_journal_meta(s->op.c, cl);
1040
1041        closure_bio_submit(bio, cl, s->d);
1042
1043        continue_at(cl, cached_dev_bio_complete, NULL);
1044}
1045
1046/* Cached devices - read & write stuff */
1047
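/*
 * Returns 0 if congestion tracking is disabled or the cache isn't currently
 * congested; otherwise returns a (randomly jittered) sector count that
 * check_should_skip() compares against the size of the current sequential
 * stream to decide whether to bypass the cache.
 */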
1048unsigned bch_get_congested(struct cache_set *c)
1049{
1050        int i;
1051        long rand;
1052
1053        if (!c->congested_read_threshold_us &&
1054            !c->congested_write_threshold_us)
1055                return 0;
1056
1057        i = (local_clock_us() - c->congested_last_us) / 1024;
1058        if (i < 0)
1059                return 0;
1060
1061        i += atomic_read(&c->congested);
1062        if (i >= 0)
1063                return 0;
1064
1065        i += CONGESTED_MAX;
1066
1067        if (i > 0)
1068                i = fract_exp_two(i, 6);
1069
1070        rand = get_random_int();
1071        i -= bitmap_weight(&rand, BITS_PER_LONG);
1072
1073        return i > 0 ? i : 1;
1074}
1075
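/*
 * Fold the task's just-finished sequential stream into its exponentially
 * weighted moving average and reset the counter for the next stream.
 */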
1076static void add_sequential(struct task_struct *t)
1077{
1078        ewma_add(t->sequential_io_avg,
1079                 t->sequential_io, 8, 0);
1080
1081        t->sequential_io = 0;
1082}
1083
1084static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
1085{
1086        return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
1087}
1088
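/*
 * The cache bypass policy: never cache discards, unaligned IO, IO to a
 * detaching device, or anything once the cache is more than CUTOFF_CACHE_ADD
 * percent full; respect the none/writearound cache modes; and bypass
 * sequential streams longer than sequential_cutoff, or longer than the
 * threshold from bch_get_congested() when the cache device is congested.
 * Everything else is cached, and bucket priorities are rescaled by the
 * amount of IO seen.
 */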
1089static void check_should_skip(struct cached_dev *dc, struct search *s)
1090{
1091        struct cache_set *c = s->op.c;
1092        struct bio *bio = &s->bio.bio;
1093        unsigned mode = cache_mode(dc, bio);
1094        unsigned sectors, congested = bch_get_congested(c);
1095
1096        if (atomic_read(&dc->disk.detaching) ||
1097            c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
1098            (bio->bi_rw & REQ_DISCARD))
1099                goto skip;
1100
1101        if (mode == CACHE_MODE_NONE ||
1102            (mode == CACHE_MODE_WRITEAROUND &&
1103             (bio->bi_rw & REQ_WRITE)))
1104                goto skip;
1105
1106        if (bio->bi_sector   & (c->sb.block_size - 1) ||
1107            bio_sectors(bio) & (c->sb.block_size - 1)) {
1108                pr_debug("skipping unaligned io");
1109                goto skip;
1110        }
1111
1112        if (!congested && !dc->sequential_cutoff)
1113                goto rescale;
1114
1115        if (!congested &&
1116            mode == CACHE_MODE_WRITEBACK &&
1117            (bio->bi_rw & REQ_WRITE) &&
1118            (bio->bi_rw & REQ_SYNC))
1119                goto rescale;
1120
1121        if (dc->sequential_merge) {
1122                struct io *i;
1123
1124                spin_lock(&dc->io_lock);
1125
1126                hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
1127                        if (i->last == bio->bi_sector &&
1128                            time_before(jiffies, i->jiffies))
1129                                goto found;
1130
1131                i = list_first_entry(&dc->io_lru, struct io, lru);
1132
1133                add_sequential(s->task);
1134                i->sequential = 0;
1135found:
1136                if (i->sequential + bio->bi_size > i->sequential)
1137                        i->sequential   += bio->bi_size;
1138
1139                i->last                  = bio_end_sector(bio);
1140                i->jiffies               = jiffies + msecs_to_jiffies(5000);
1141                s->task->sequential_io   = i->sequential;
1142
1143                hlist_del(&i->hash);
1144                hlist_add_head(&i->hash, iohash(dc, i->last));
1145                list_move_tail(&i->lru, &dc->io_lru);
1146
1147                spin_unlock(&dc->io_lock);
1148        } else {
1149                s->task->sequential_io = bio->bi_size;
1150
1151                add_sequential(s->task);
1152        }
1153
1154        sectors = max(s->task->sequential_io,
1155                      s->task->sequential_io_avg) >> 9;
1156
1157        if (dc->sequential_cutoff &&
1158            sectors >= dc->sequential_cutoff >> 9) {
1159                trace_bcache_bypass_sequential(s->orig_bio);
1160                goto skip;
1161        }
1162
1163        if (congested && sectors >= congested) {
1164                trace_bcache_bypass_congested(s->orig_bio);
1165                goto skip;
1166        }
1167
1168rescale:
1169        bch_rescale_priorities(c, bio_sectors(bio));
1170        return;
1171skip:
1172        bch_mark_sectors_bypassed(s, bio_sectors(bio));
1173        s->op.skip = true;
1174}
1175
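/*
 * make_request entry point for cached devices: account the IO, remap the bio
 * onto the backing device, and dispatch it to the read, write or no-data
 * paths.  If the cached_dev reference can't be taken (e.g. the device is
 * being torn down), the bio goes straight to the backing device.
 */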
1176static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
1177{
1178        struct search *s;
1179        struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1180        struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1181        int cpu, rw = bio_data_dir(bio);
1182
1183        cpu = part_stat_lock();
1184        part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1185        part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1186        part_stat_unlock();
1187
1188        bio->bi_bdev = dc->bdev;
1189        bio->bi_sector += dc->sb.data_offset;
1190
1191        if (cached_dev_get(dc)) {
1192                s = search_alloc(bio, d);
1193                trace_bcache_request_start(s, bio);
1194
1195                if (!bio_has_data(bio))
1196                        request_nodata(dc, s);
1197                else if (rw)
1198                        request_write(dc, s);
1199                else
1200                        request_read(dc, s);
1201        } else {
1202                if ((bio->bi_rw & REQ_DISCARD) &&
1203                    !blk_queue_discard(bdev_get_queue(dc->bdev)))
1204                        bio_endio(bio, 0);
1205                else
1206                        bch_generic_make_request(bio, &d->bio_split_hook);
1207        }
1208}
1209
1210static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
1211                            unsigned int cmd, unsigned long arg)
1212{
1213        struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1214        return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
1215}
1216
1217static int cached_dev_congested(void *data, int bits)
1218{
1219        struct bcache_device *d = data;
1220        struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1221        struct request_queue *q = bdev_get_queue(dc->bdev);
1222        int ret = 0;
1223
1224        if (bdi_congested(&q->backing_dev_info, bits))
1225                return 1;
1226
1227        if (cached_dev_get(dc)) {
1228                unsigned i;
1229                struct cache *ca;
1230
1231                for_each_cache(ca, d->c, i) {
1232                        q = bdev_get_queue(ca->bdev);
1233                        ret |= bdi_congested(&q->backing_dev_info, bits);
1234                }
1235
1236                cached_dev_put(dc);
1237        }
1238
1239        return ret;
1240}
1241
1242void bch_cached_dev_request_init(struct cached_dev *dc)
1243{
1244        struct gendisk *g = dc->disk.disk;
1245
1246        g->queue->make_request_fn               = cached_dev_make_request;
1247        g->queue->backing_dev_info.congested_fn = cached_dev_congested;
1248        dc->disk.cache_miss                     = cached_dev_cache_miss;
1249        dc->disk.ioctl                          = cached_dev_ioctl;
1250}
1251
1252/* Flash backed devices */
1253
1254static int flash_dev_cache_miss(struct btree *b, struct search *s,
1255                                struct bio *bio, unsigned sectors)
1256{
1257        struct bio_vec *bv;
1258        int i;
1259
1260        /* Zero fill bio */
1261
1262        bio_for_each_segment(bv, bio, i) {
1263                unsigned j = min(bv->bv_len >> 9, sectors);
1264
1265                void *p = kmap(bv->bv_page);
1266                memset(p + bv->bv_offset, 0, j << 9);
1267                kunmap(bv->bv_page);
1268
1269                sectors -= j;
1270        }
1271
1272        bio_advance(bio, min(sectors << 9, bio->bi_size));
1273
1274        if (!bio->bi_size)
1275                s->op.lookup_done = true;
1276
1277        return 0;
1278}
1279
1280static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1281{
1282        struct search *s;
1283        struct closure *cl;
1284        struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1285        int cpu, rw = bio_data_dir(bio);
1286
1287        cpu = part_stat_lock();
1288        part_stat_inc(cpu, &d->disk->part0, ios[rw]);
1289        part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
1290        part_stat_unlock();
1291
1292        s = search_alloc(bio, d);
1293        cl = &s->cl;
1294        bio = &s->bio.bio;
1295
1296        trace_bcache_request_start(s, bio);
1297
1298        if (bio_has_data(bio) && !rw) {
1299                closure_call(&s->op.cl, btree_read_async, NULL, cl);
1300        } else if (bio_has_data(bio) || s->op.skip) {
1301                bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
1302                                        &KEY(d->id, bio->bi_sector, 0),
1303                                        &KEY(d->id, bio_end_sector(bio), 0));
1304
1305                s->writeback    = true;
1306                s->op.cache_bio = bio;
1307
1308                closure_call(&s->op.cl, bch_insert_data, NULL, cl);
1309        } else {
1310                /* No data - probably a cache flush */
1311                if (s->op.flush_journal)
1312                        bch_journal_meta(s->op.c, cl);
1313        }
1314
1315        continue_at(cl, search_free, NULL);
1316}
1317
1318static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
1319                           unsigned int cmd, unsigned long arg)
1320{
1321        return -ENOTTY;
1322}
1323
1324static int flash_dev_congested(void *data, int bits)
1325{
1326        struct bcache_device *d = data;
1327        struct request_queue *q;
1328        struct cache *ca;
1329        unsigned i;
1330        int ret = 0;
1331
1332        for_each_cache(ca, d->c, i) {
1333                q = bdev_get_queue(ca->bdev);
1334                ret |= bdi_congested(&q->backing_dev_info, bits);
1335        }
1336
1337        return ret;
1338}
1339
1340void bch_flash_dev_request_init(struct bcache_device *d)
1341{
1342        struct gendisk *g = d->disk;
1343
1344        g->queue->make_request_fn               = flash_dev_make_request;
1345        g->queue->backing_dev_info.congested_fn = flash_dev_congested;
1346        d->cache_miss                           = flash_dev_cache_miss;
1347        d->ioctl                                = flash_dev_ioctl;
1348}
1349
1350void bch_request_exit(void)
1351{
1352#ifdef CONFIG_CGROUP_BCACHE
1353        cgroup_unload_subsys(&bcache_subsys);
1354#endif
1355        if (bch_search_cache)
1356                kmem_cache_destroy(bch_search_cache);
1357}
1358
1359int __init bch_request_init(void)
1360{
1361        bch_search_cache = KMEM_CACHE(search, 0);
1362        if (!bch_search_cache)
1363                return -ENOMEM;
1364
1365#ifdef CONFIG_CGROUP_BCACHE
1366        cgroup_load_subsys(&bcache_subsys);
1367        init_bch_cgroup(&bcache_default_cgroup);
1368
1369        cgroup_add_cftypes(&bcache_subsys, bch_files);
1370#endif
1371        return 0;
1372}
1373