linux/drivers/block/drbd/drbd_actlog.c
/*
   drbd_actlog.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/drbd.h>
#include <linux/drbd_limits.h>
#include <linux/dynamic_debug.h>
#include "drbd_int.h"
#include "drbd_wrappers.h"


enum al_transaction_types {
        AL_TR_UPDATE = 0,
        AL_TR_INITIALIZED = 0xffff
};
/* all fields on disk in big endian */
struct __packed al_transaction_on_disk {
        /* don't we all like magic */
        __be32  magic;

        /* to identify the most recent transaction block
         * in the on disk ring buffer */
        __be32  tr_number;

        /* checksum on the full 4k block, with this field set to 0. */
        __be32  crc32c;

        /* type of transaction, special transaction types like:
         * purge-all, set-all-idle, set-all-active, ... to-be-defined
         * see also enum al_transaction_types */
        __be16  transaction_type;

        /* we currently allow only a few thousand extents,
         * so 16bit will be enough for the slot number. */

        /* how many updates in this transaction */
        __be16  n_updates;

        /* maximum slot number, "al-extents" in drbd.conf speak.
         * Having this in each transaction should make reconfiguration
         * of that parameter easier. */
        __be16  context_size;

        /* slot number the context starts with */
        __be16  context_start_slot_nr;

        /* Some reserved bytes.  Expected usage is a 64bit counter of
         * sectors-written since device creation, and other data generation tag
         * supporting usage */
        __be32  __reserved[4];

        /* --- 36 bytes used --- */

        /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes
         * in one transaction, then use the remaining bytes in the 4k block for
         * context information.  A "flexible" number of updates per transaction
         * does not help, as we have to account for the case when all update
         * slots are used anyways, so it would only complicate code without
         * additional benefit.
         */
        __be16  update_slot_nr[AL_UPDATES_PER_TRANSACTION];

        /* but the extent number is 32bit, which at an extent size of 4 MiB
         * allows covering device sizes of up to 2**54 bytes (16 PiB) */
        __be32  update_extent_nr[AL_UPDATES_PER_TRANSACTION];

        /* --- 420 bytes used (36 + 64*6) --- */

        /* 4096 - 420 = 3676 = 919 * 4 */
        __be32  context[AL_CONTEXT_PER_TRANSACTION];
};
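
/* Layout sanity check for the struct above (arithmetic only, matching
 * the inline comments): the fixed header is 3 * 4 + 4 * 2 + 4 * 4 = 36
 * bytes; with AL_UPDATES_PER_TRANSACTION == 64 the two update arrays
 * add 64 * (2 + 4) = 384 bytes, for 420 bytes total; the remaining
 * 4096 - 420 = 3676 bytes hold AL_CONTEXT_PER_TRANSACTION == 919
 * 32bit context slots. */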

struct update_odbm_work {
        struct drbd_work w;
        unsigned int enr;
};

struct update_al_work {
        struct drbd_work w;
        struct completion event;
        int err;
};


void *drbd_md_get_buffer(struct drbd_conf *mdev)
{
        int r;

        wait_event(mdev->misc_wait,
                   (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 ||
                   mdev->state.disk <= D_FAILED);

        return r ? NULL : page_address(mdev->md_io_page);
}

void drbd_md_put_buffer(struct drbd_conf *mdev)
{
        if (atomic_dec_and_test(&mdev->md_io_in_use))
                wake_up(&mdev->misc_wait);
}
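
/* Typical pairing (a usage sketch, not an excerpt from a caller):
 *
 *      void *buffer = drbd_md_get_buffer(mdev);
 *      if (!buffer)
 *              return -ENODEV; // disk failed while we waited
 *      ...use the single 4k md_io page...
 *      drbd_md_put_buffer(mdev);
 *
 * drbd_md_get_buffer() returns NULL when the disk state dropped to
 * D_FAILED or below while waiting for exclusive use of the buffer. */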

void wait_until_done_or_force_detached(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
                                     unsigned int *done)
{
        long dt;

        rcu_read_lock();
        dt = rcu_dereference(bdev->disk_conf)->disk_timeout;
        rcu_read_unlock();
        dt = dt * HZ / 10;
        if (dt == 0)
                dt = MAX_SCHEDULE_TIMEOUT;

        dt = wait_event_timeout(mdev->misc_wait,
                        *done || test_bit(FORCE_DETACH, &mdev->flags), dt);
        if (dt == 0) {
                dev_err(DEV, "meta-data IO operation timed out\n");
                drbd_chk_io_error(mdev, 1, DRBD_FORCE_DETACH);
        }
}

static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
                                 struct drbd_backing_dev *bdev,
                                 struct page *page, sector_t sector,
                                 int rw, int size)
{
        struct bio *bio;
        int err;

        mdev->md_io.done = 0;
        mdev->md_io.error = -ENODEV;

        if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
                rw |= REQ_FUA | REQ_FLUSH;
        rw |= REQ_SYNC;

        bio = bio_alloc_drbd(GFP_NOIO);
        bio->bi_bdev = bdev->md_bdev;
        bio->bi_sector = sector;
        err = -EIO;
        if (bio_add_page(bio, page, size, 0) != size)
                goto out;
        bio->bi_private = &mdev->md_io;
        bio->bi_end_io = drbd_md_io_complete;
        bio->bi_rw = rw;

        if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL)
                /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
                ;
        else if (!get_ldev_if_state(mdev, D_ATTACHING)) {
                /* Corresponding put_ldev in drbd_md_io_complete() */
                dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
                err = -ENODEV;
                goto out;
        }

        bio_get(bio); /* one bio_put() is in the completion handler */
        atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
        if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
                bio_endio(bio, -EIO);
        else
                submit_bio(rw, bio);
        wait_until_done_or_force_detached(mdev, bdev, &mdev->md_io.done);
        if (bio_flagged(bio, BIO_UPTODATE))
                err = mdev->md_io.error;

 out:
        bio_put(bio);
        return err;
}

int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
                         sector_t sector, int rw)
{
        int err;
        struct page *iop = mdev->md_io_page;

        D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1);

        BUG_ON(!bdev->md_bdev);

        dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
             current->comm, current->pid, __func__,
             (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
             (void*)_RET_IP_ );

        if (sector < drbd_md_first_sector(bdev) ||
            sector + 7 > drbd_md_last_sector(bdev))
                dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
                     current->comm, current->pid, __func__,
                     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");

        /* we do all our meta data IO in aligned 4k blocks. */
        err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096);
        if (err) {
                dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
                    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
        }
        return err;
}

static struct bm_extent *find_active_resync_extent(struct drbd_conf *mdev, unsigned int enr)
{
        struct lc_element *tmp;
        tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
        if (unlikely(tmp != NULL)) {
                struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
                if (test_bit(BME_NO_WRITES, &bm_ext->flags))
                        return bm_ext;
        }
        return NULL;
}

static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr, bool nonblock)
{
        struct lc_element *al_ext;
        struct bm_extent *bm_ext;
        int wake;

        spin_lock_irq(&mdev->al_lock);
        bm_ext = find_active_resync_extent(mdev, enr);
        if (bm_ext) {
                wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
                spin_unlock_irq(&mdev->al_lock);
                if (wake)
                        wake_up(&mdev->al_wait);
                return NULL;
        }
        if (nonblock)
                al_ext = lc_try_get(mdev->act_log, enr);
        else
                al_ext = lc_get(mdev->act_log, enr);
        spin_unlock_irq(&mdev->al_lock);
        return al_ext;
}

bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i)
{
        /* for bios crossing activity log extent boundaries,
         * we may need to activate two extents in one go */
        unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
        unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);

        D_ASSERT((unsigned)(last - first) <= 1);
        D_ASSERT(atomic_read(&mdev->local_cnt) > 0);

        /* FIXME figure out a fast path for bios crossing AL extent boundaries */
        if (first != last)
                return false;

        return _al_get(mdev, first, true);
}
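
/* Worked example for the first/last math above, assuming 4 MiB AL
 * extents (AL_EXTENT_SHIFT == 22), so that shifting a 512-byte sector
 * number right by 22 - 9 = 13 yields the AL extent number:
 * a 4 KiB bio at sector 8184 gives first = 8184 >> 13 = 0 and
 * last = (8184 + 8 - 1) >> 13 = 0, so the fastpath applies;
 * the same bio at sector 8190 gives last = 8197 >> 13 = 1 != first,
 * so it crosses an extent boundary and falls back to the slow path. */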

bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i)
{
        /* for bios crossing activity log extent boundaries,
         * we may need to activate two extents in one go */
        unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
        unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
        unsigned enr;
        bool need_transaction = false;

        D_ASSERT(first <= last);
        D_ASSERT(atomic_read(&mdev->local_cnt) > 0);

        for (enr = first; enr <= last; enr++) {
                struct lc_element *al_ext;
                wait_event(mdev->al_wait,
                                (al_ext = _al_get(mdev, enr, false)) != NULL);
                if (al_ext->lc_number != enr)
                        need_transaction = true;
        }
        return need_transaction;
}

static int al_write_transaction(struct drbd_conf *mdev, bool delegate);

/* When called through generic_make_request(), we must delegate
 * activity log I/O to the worker thread: a further request
 * submitted via generic_make_request() within the same task
 * would be queued on current->bio_list, and would only start
 * after this function returns (see generic_make_request()).
 *
 * However, if we *are* the worker, we must not delegate to ourselves.
 */

/*
 * @delegate:   delegate activity log I/O to the worker thread
 */
void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate)
{
        bool locked = false;

        BUG_ON(delegate && current == mdev->tconn->worker.task);

        /* Serialize multiple transactions.
         * This uses test_and_set_bit, memory barrier is implicit.
         */
        wait_event(mdev->al_wait,
                        mdev->act_log->pending_changes == 0 ||
                        (locked = lc_try_lock_for_transaction(mdev->act_log)));

        if (locked) {
                /* Double check: it may have been committed by someone else,
                 * while we have been waiting for the lock. */
                if (mdev->act_log->pending_changes) {
                        bool write_al_updates;

                        rcu_read_lock();
                        write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
                        rcu_read_unlock();

                        if (write_al_updates)
                                al_write_transaction(mdev, delegate);
                        spin_lock_irq(&mdev->al_lock);
                        /* FIXME
                        if (err)
                                we need an "lc_cancel" here;
                        */
                        lc_committed(mdev->act_log);
                        spin_unlock_irq(&mdev->al_lock);
                }
                lc_unlock(mdev->act_log);
                wake_up(&mdev->al_wait);
        }
}

/*
 * @delegate:   delegate activity log I/O to the worker thread
 */
void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate)
{
        BUG_ON(delegate && current == mdev->tconn->worker.task);

        if (drbd_al_begin_io_prepare(mdev, i))
                drbd_al_begin_io_commit(mdev, delegate);
}

int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i)
{
        struct lru_cache *al = mdev->act_log;
        /* for bios crossing activity log extent boundaries,
         * we may need to activate two extents in one go */
        unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
        unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
        unsigned nr_al_extents;
        unsigned available_update_slots;
        unsigned enr;

        D_ASSERT(first <= last);

        nr_al_extents = 1 + last - first; /* worst case: all touched extents are cold. */
        available_update_slots = min(al->nr_elements - al->used,
                                al->max_pending_changes - al->pending_changes);

        /* We want all necessary updates for a given request within the same
         * transaction.  We could first check how many updates are *actually*
         * needed, and use that instead of the worst-case nr_al_extents */
        if (available_update_slots < nr_al_extents)
                return -EWOULDBLOCK;

        /* Is resync active in this area? */
        for (enr = first; enr <= last; enr++) {
                struct lc_element *tmp;
                tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
                if (unlikely(tmp != NULL)) {
                        struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
                        if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
                                if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
                                        return -EBUSY;
                                return -EWOULDBLOCK;
                        }
                }
        }

        /* Check out the refcounts.
         * Given that we checked for available elements and update slots above,
         * this has to be successful. */
        for (enr = first; enr <= last; enr++) {
                struct lc_element *al_ext;
                al_ext = lc_get_cumulative(mdev->act_log, enr);
                if (!al_ext)
                        dev_info(DEV, "LOGIC BUG for enr=%u\n", enr);
        }
        return 0;
}
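
/* Return-value contract of drbd_al_begin_io_nonblock(), as implemented
 * above: 0 means every needed extent got its reference and fits into
 * one transaction; -EWOULDBLOCK means "try again later" (not enough
 * free elements or update slots, or resync already holds priority on
 * the area); -EBUSY means we just set BME_PRIORITY on a resync extent,
 * asking the resync side to step aside first. */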

void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i)
{
        /* for bios crossing activity log extent boundaries,
         * we may need to activate two extents in one go */
        unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
        unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
        unsigned enr;
        struct lc_element *extent;
        unsigned long flags;

        D_ASSERT(first <= last);
        spin_lock_irqsave(&mdev->al_lock, flags);

        for (enr = first; enr <= last; enr++) {
                extent = lc_find(mdev->act_log, enr);
                if (!extent) {
                        dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
                        continue;
                }
                lc_put(mdev->act_log, extent);
        }
        spin_unlock_irqrestore(&mdev->al_lock, flags);
        wake_up(&mdev->al_wait);
}

#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
 * are still coupled, or assume too much about their relation.
 * Code below will not work if this is violated.
 * Will be cleaned up with some followup patch.
 */
# error FIXME
#endif

static unsigned int al_extent_to_bm_page(unsigned int al_enr)
{
        return al_enr >>
                /* bit to page */
                ((PAGE_SHIFT + 3) -
                /* al extent number to bit */
                 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
}
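
/* Worked example, assuming 4 KiB pages (PAGE_SHIFT == 12),
 * BM_BLOCK_SHIFT == 12 (one bit per 4K block) and
 * AL_EXTENT_SHIFT == 22 (4 MiB AL extents): a bitmap page holds
 * 4096 * 8 = 2^15 bits, one AL extent covers 2^(22-12) = 2^10 bits,
 * so we shift by 15 - 10 = 5: 32 AL extents map to one bitmap page. */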

static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
{
        return rs_enr >>
                /* bit to page */
                ((PAGE_SHIFT + 3) -
                /* resync extent number to bit */
                 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
}
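
/* Same math for resync extents, which are 16 MiB each (see the note
 * before drbd_try_clear_on_disk_bm() below), i.e. BM_EXT_SHIFT == 24
 * under the assumptions above: shift by 15 - 12 = 3, so 8 resync
 * extents map to one bitmap page. */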

static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev)
{
        const unsigned int stripes = mdev->ldev->md.al_stripes;
        const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k;

        /* transaction number, modulo on-disk ring buffer wrap around */
        unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k);

        /* ... to aligned 4k on disk block */
        t = ((t % stripes) * stripe_size_4kB) + t/stripes;

        /* ... to 512 byte sector in activity log */
        t *= 8;

        /* ... plus offset to the on disk position */
        return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t;
}
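
/* Worked example for the striping above, with a hypothetical geometry
 * of al_stripes == 4 and al_stripe_size_4k == 8 (so al_size_4k == 32):
 * transactions t = 0, 1, 2, 3 map to 4k blocks 0, 8, 16, 24 (one block
 * into each stripe), and t = 4 wraps to block 1 of the first stripe;
 * the final "t *= 8" converts the 4k block number into a 512-byte
 * sector offset. */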

static int
_al_write_transaction(struct drbd_conf *mdev)
{
        struct al_transaction_on_disk *buffer;
        struct lc_element *e;
        sector_t sector;
        int i, mx;
        unsigned extent_nr;
        unsigned crc = 0;
        int err = 0;

        if (!get_ldev(mdev)) {
                dev_err(DEV, "disk is %s, cannot start al transaction\n",
                        drbd_disk_str(mdev->state.disk));
                return -EIO;
        }

        /* The bitmap write may have failed, causing a state change. */
        if (mdev->state.disk < D_INCONSISTENT) {
                dev_err(DEV,
                        "disk is %s, cannot write al transaction\n",
                        drbd_disk_str(mdev->state.disk));
                put_ldev(mdev);
                return -EIO;
        }

        buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */
        if (!buffer) {
                dev_err(DEV, "disk failed while waiting for md_io buffer\n");
                put_ldev(mdev);
                return -ENODEV;
        }

        memset(buffer, 0, sizeof(*buffer));
        buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
        buffer->tr_number = cpu_to_be32(mdev->al_tr_number);

        i = 0;

        /* Even though no one can start to change this list
         * once we set LC_LOCKED -- from drbd_al_begin_io(),
         * lc_try_lock_for_transaction() --, someone may still
         * be in the process of changing it. */
        spin_lock_irq(&mdev->al_lock);
        list_for_each_entry(e, &mdev->act_log->to_be_changed, list) {
                if (i == AL_UPDATES_PER_TRANSACTION) {
                        i++;
                        break;
                }
                buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
                buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
                if (e->lc_number != LC_FREE)
                        drbd_bm_mark_for_writeout(mdev,
                                        al_extent_to_bm_page(e->lc_number));
                i++;
        }
        spin_unlock_irq(&mdev->al_lock);
        BUG_ON(i > AL_UPDATES_PER_TRANSACTION);

        buffer->n_updates = cpu_to_be16(i);
        for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
                buffer->update_slot_nr[i] = cpu_to_be16(-1);
                buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
        }

        buffer->context_size = cpu_to_be16(mdev->act_log->nr_elements);
        buffer->context_start_slot_nr = cpu_to_be16(mdev->al_tr_cycle);

        mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
                   mdev->act_log->nr_elements - mdev->al_tr_cycle);
        for (i = 0; i < mx; i++) {
                unsigned idx = mdev->al_tr_cycle + i;
                extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
                buffer->context[i] = cpu_to_be32(extent_nr);
        }
        for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
                buffer->context[i] = cpu_to_be32(LC_FREE);

        mdev->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
        if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
                mdev->al_tr_cycle = 0;

        sector = al_tr_number_to_on_disk_sector(mdev);

        crc = crc32c(0, buffer, 4096);
        buffer->crc32c = cpu_to_be32(crc);

        if (drbd_bm_write_hinted(mdev))
                err = -EIO;
        else {
                bool write_al_updates;
                rcu_read_lock();
                write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
                rcu_read_unlock();
                if (write_al_updates) {
                        if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
                                err = -EIO;
                                drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
                        } else {
                                mdev->al_tr_number++;
                                mdev->al_writ_cnt++;
                        }
                }
        }

        drbd_md_put_buffer(mdev);
        put_ldev(mdev);

        return err;
}


static int w_al_write_transaction(struct drbd_work *w, int unused)
{
        struct update_al_work *aw = container_of(w, struct update_al_work, w);
        struct drbd_conf *mdev = w->mdev;
        int err;

        err = _al_write_transaction(mdev);
        aw->err = err;
        complete(&aw->event);

        return err != -EIO ? err : 0;
}

/* Calls from worker context (see w_restart_disk_io()) need to write the
   transaction directly. Others come through generic_make_request() and
   need to delegate it to the worker. */
static int al_write_transaction(struct drbd_conf *mdev, bool delegate)
{
        if (delegate) {
                struct update_al_work al_work;
                init_completion(&al_work.event);
                al_work.w.cb = w_al_write_transaction;
                al_work.w.mdev = mdev;
                drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
                wait_for_completion(&al_work.event);
                return al_work.err;
        } else
                return _al_write_transaction(mdev);
}

static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
{
        int rv;

        spin_lock_irq(&mdev->al_lock);
        rv = (al_ext->refcnt == 0);
        if (likely(rv))
                lc_del(mdev->act_log, al_ext);
        spin_unlock_irq(&mdev->al_lock);

        return rv;
}

/**
 * drbd_al_shrink() - Removes all active extents from the activity log
 * @mdev:       DRBD device.
 *
 * Removes all active extents from the activity log, waiting until
 * the reference count of each entry has dropped to 0 first, of course.
 *
 * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
 */
void drbd_al_shrink(struct drbd_conf *mdev)
{
        struct lc_element *al_ext;
        int i;

        D_ASSERT(test_bit(__LC_LOCKED, &mdev->act_log->flags));

        for (i = 0; i < mdev->act_log->nr_elements; i++) {
                al_ext = lc_element_by_index(mdev->act_log, i);
                if (al_ext->lc_number == LC_FREE)
                        continue;
                wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
        }

        wake_up(&mdev->al_wait);
}

int drbd_initialize_al(struct drbd_conf *mdev, void *buffer)
{
        struct al_transaction_on_disk *al = buffer;
        struct drbd_md *md = &mdev->ldev->md;
        sector_t al_base = md->md_offset + md->al_offset;
        int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
        int i;

        memset(al, 0, 4096);
        al->magic = cpu_to_be32(DRBD_AL_MAGIC);
        al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED);
        al->crc32c = cpu_to_be32(crc32c(0, al, 4096));

        for (i = 0; i < al_size_4k; i++) {
                int err = drbd_md_sync_page_io(mdev, mdev->ldev, al_base + i * 8, WRITE);
                if (err)
                        return err;
        }
        return 0;
}
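
/* Note on the "i * 8" above: all meta-data IO is done in aligned 4k
 * blocks (see drbd_md_sync_page_io()), and one 4k block spans 8
 * 512-byte sectors, so the i-th block of the activity log starts at
 * sector al_base + i * 8. */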

static int w_update_odbm(struct drbd_work *w, int unused)
{
        struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
        struct drbd_conf *mdev = w->mdev;
        struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };

        if (!get_ldev(mdev)) {
                if (__ratelimit(&drbd_ratelimit_state))
                        dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
                kfree(udw);
                return 0;
        }

        drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));
        put_ldev(mdev);

        kfree(udw);

        if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
                switch (mdev->state.conn) {
                case C_SYNC_SOURCE:  case C_SYNC_TARGET:
                case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
                        drbd_resync_finished(mdev);
                default:
                        /* nothing to do */
                        break;
                }
        }
        drbd_bcast_event(mdev, &sib);

        return 0;
}


/* ATTENTION. The AL's extents are 4MB each, while the extents in the
 * resync LRU-cache are 16MB each.
 * The caller of this function has to hold a get_ldev() reference.
 *
 * TODO will be obsoleted once we have a caching lru of the on disk bitmap
 */
static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
                                      int count, int success)
{
        struct lc_element *e;
        struct update_odbm_work *udw;

        unsigned int enr;

        D_ASSERT(atomic_read(&mdev->local_cnt));

        /* I simply assume that a sector/size pair never crosses
         * a 16 MB extent border. (Currently this is true...) */
        enr = BM_SECT_TO_EXT(sector);

        e = lc_get(mdev->resync, enr);
        if (e) {
                struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
                if (ext->lce.lc_number == enr) {
                        if (success)
                                ext->rs_left -= count;
                        else
                                ext->rs_failed += count;
                        if (ext->rs_left < ext->rs_failed) {
                                dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
                                    "rs_failed=%d count=%d cstate=%s\n",
                                     (unsigned long long)sector,
                                     ext->lce.lc_number, ext->rs_left,
                                     ext->rs_failed, count,
                                     drbd_conn_str(mdev->state.conn));

                                /* We don't expect to be able to clear more bits
                                 * than have been set when we originally counted
                                 * the set bits to cache that value in ext->rs_left.
                                 * Whatever the reason (disconnect during resync,
                                 * delayed local completion of an application write),
                                 * try to fix it up by recounting here. */
                                ext->rs_left = drbd_bm_e_weight(mdev, enr);
                        }
                } else {
                        /* Normally this element should be in the cache,
                         * since drbd_rs_begin_io() pulled it in already.
                         *
                         * But maybe an application write finished, and we set
                         * something outside the resync lru_cache in sync.
                         */
                        int rs_left = drbd_bm_e_weight(mdev, enr);
                        if (ext->flags != 0) {
                                dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
                                     " -> %d[%u;00]\n",
                                     ext->lce.lc_number, ext->rs_left,
                                     ext->flags, enr, rs_left);
                                ext->flags = 0;
                        }
                        if (ext->rs_failed) {
                                dev_warn(DEV, "Kicking resync_lru element enr=%u "
                                     "out with rs_failed=%d\n",
                                     ext->lce.lc_number, ext->rs_failed);
                        }
                        ext->rs_left = rs_left;
                        ext->rs_failed = success ? 0 : count;
                        /* we don't keep a persistent log of the resync lru,
                         * we can commit any change right away. */
                        lc_committed(mdev->resync);
                }
                lc_put(mdev->resync, &ext->lce);
                /* no race, we are within the al_lock! */

                if (ext->rs_left == ext->rs_failed) {
                        ext->rs_failed = 0;

                        udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
                        if (udw) {
                                udw->enr = ext->lce.lc_number;
                                udw->w.cb = w_update_odbm;
                                udw->w.mdev = mdev;
                                drbd_queue_work_front(&mdev->tconn->sender_work, &udw->w);
                        } else {
                                dev_warn(DEV, "Could not kmalloc a udw\n");
                        }
                }
        } else {
                dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
                    mdev->resync_locked,
                    mdev->resync->nr_elements,
                    mdev->resync->flags);
        }
}

void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go)
{
        unsigned long now = jiffies;
        unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
        int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
        if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
                if (mdev->rs_mark_left[mdev->rs_last_mark] != still_to_go &&
                    mdev->state.conn != C_PAUSED_SYNC_T &&
                    mdev->state.conn != C_PAUSED_SYNC_S) {
                        mdev->rs_mark_time[next] = now;
                        mdev->rs_mark_left[next] = still_to_go;
                        mdev->rs_last_mark = next;
                }
        }
}

/* clear the bit corresponding to the piece of storage in question:
 * size bytes of data starting from sector.  Only clear bits of the
 * affected one or more _aligned_ BM_BLOCK_SIZE blocks.
 *
 * called by worker on C_SYNC_TARGET and receiver on SyncSource.
 *
 */
void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
                       const char *file, const unsigned int line)
{
        /* Is called from worker and receiver context _only_ */
        unsigned long sbnr, ebnr, lbnr;
        unsigned long count = 0;
        sector_t esector, nr_sectors;
        int wake_up = 0;
        unsigned long flags;

        if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
                dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
                                (unsigned long long)sector, size);
                return;
        }

        if (!get_ldev(mdev))
                return; /* no disk, no metadata, no bitmap to clear bits in */

        nr_sectors = drbd_get_capacity(mdev->this_bdev);
        esector = sector + (size >> 9) - 1;

        if (!expect(sector < nr_sectors))
                goto out;
        if (!expect(esector < nr_sectors))
                esector = nr_sectors - 1;

        lbnr = BM_SECT_TO_BIT(nr_sectors-1);

        /* we clear it (in sync).
         * round up start sector, round down end sector.  we make sure we only
         * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
        if (unlikely(esector < BM_SECT_PER_BIT-1))
                goto out;
        if (unlikely(esector == (nr_sectors-1)))
                ebnr = lbnr;
        else
                ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
        sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);

        if (sbnr > ebnr)
                goto out;

        /*
         * ok, (capacity & 7) != 0 sometimes, but who cares...
         * we count rs_{total,left} in bits, not sectors.
         */
        count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
        if (count) {
                drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev));
                spin_lock_irqsave(&mdev->al_lock, flags);
                drbd_try_clear_on_disk_bm(mdev, sector, count, true);
                spin_unlock_irqrestore(&mdev->al_lock, flags);

                /* just wake_up unconditionally now, various lc_changed(),
                 * lc_put() in drbd_try_clear_on_disk_bm(). */
                wake_up = 1;
        }
out:
        put_ldev(mdev);
        if (wake_up)
                wake_up(&mdev->al_wait);
}
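
/* Worked example for the rounding above, assuming BM_SECT_PER_BIT == 8
 * (one bitmap bit per 4K block, i.e. BM_SECT_TO_BIT(x) == x >> 3):
 * an 8 KiB write at sector 13 covers sectors 13..28;
 * sbnr = (13 + 7) >> 3 = 2 and ebnr = (28 - 7) >> 3 = 2, so only bit 2
 * (sectors 16..23, the one fully covered 4K block) is cleared, and the
 * partially covered blocks at either end stay out of sync. */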

/*
 * this is intended to set one request worth of data out of sync.
 * affects at least 1 bit,
 * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
 *
 * called by tl_clear and drbd_send_dblock (==drbd_make_request).
 * so this can be _any_ process.
 */
int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
                            const char *file, const unsigned int line)
{
        unsigned long sbnr, ebnr, flags;
        sector_t esector, nr_sectors;
        unsigned int enr, count = 0;
        struct lc_element *e;

        /* this should be an empty REQ_FLUSH */
        if (size == 0)
                return 0;

        if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
                dev_err(DEV, "sector: %llus, size: %d\n",
                        (unsigned long long)sector, size);
                return 0;
        }

        if (!get_ldev(mdev))
                return 0; /* no disk, no metadata, no bitmap to set bits in */

        nr_sectors = drbd_get_capacity(mdev->this_bdev);
        esector = sector + (size >> 9) - 1;

        if (!expect(sector < nr_sectors))
                goto out;
        if (!expect(esector < nr_sectors))
                esector = nr_sectors - 1;

        /* we set it out of sync,
         * we do not need to round anything here */
        sbnr = BM_SECT_TO_BIT(sector);
        ebnr = BM_SECT_TO_BIT(esector);

        /* ok, (capacity & 7) != 0 sometimes, but who cares...
         * we count rs_{total,left} in bits, not sectors.  */
        spin_lock_irqsave(&mdev->al_lock, flags);
        count = drbd_bm_set_bits(mdev, sbnr, ebnr);

        enr = BM_SECT_TO_EXT(sector);
        e = lc_find(mdev->resync, enr);
        if (e)
                lc_entry(e, struct bm_extent, lce)->rs_left += count;
        spin_unlock_irqrestore(&mdev->al_lock, flags);

out:
        put_ldev(mdev);

        return count;
}

static
struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
{
        struct lc_element *e;
        struct bm_extent *bm_ext;
        int wakeup = 0;
        unsigned long rs_flags;

        spin_lock_irq(&mdev->al_lock);
        if (mdev->resync_locked > mdev->resync->nr_elements/2) {
                spin_unlock_irq(&mdev->al_lock);
                return NULL;
        }
        e = lc_get(mdev->resync, enr);
        bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
        if (bm_ext) {
                if (bm_ext->lce.lc_number != enr) {
                        bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
                        bm_ext->rs_failed = 0;
                        lc_committed(mdev->resync);
                        wakeup = 1;
                }
                if (bm_ext->lce.refcnt == 1)
                        mdev->resync_locked++;
                set_bit(BME_NO_WRITES, &bm_ext->flags);
        }
        rs_flags = mdev->resync->flags;
        spin_unlock_irq(&mdev->al_lock);
        if (wakeup)
                wake_up(&mdev->al_wait);

        if (!bm_ext) {
                if (rs_flags & LC_STARVING)
                        dev_warn(DEV, "Have to wait for element"
                             " (resync LRU too small?)\n");
                BUG_ON(rs_flags & LC_LOCKED);
        }

        return bm_ext;
}

static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
{
        int rv;

        spin_lock_irq(&mdev->al_lock);
        rv = lc_is_used(mdev->act_log, enr);
        spin_unlock_irq(&mdev->al_lock);

        return rv;
}

/**
 * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
 * @mdev:       DRBD device.
 * @sector:     The sector number.
 *
 * This function sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
 */
int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
        unsigned int enr = BM_SECT_TO_EXT(sector);
        struct bm_extent *bm_ext;
        int i, sig;
        int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait.
                         200 times -> 20 seconds. */

retry:
        sig = wait_event_interruptible(mdev->al_wait,
                        (bm_ext = _bme_get(mdev, enr)));
        if (sig)
                return -EINTR;

        if (test_bit(BME_LOCKED, &bm_ext->flags))
                return 0;

        for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
                sig = wait_event_interruptible(mdev->al_wait,
                                               !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i) ||
                                               test_bit(BME_PRIORITY, &bm_ext->flags));

                if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) {
                        spin_lock_irq(&mdev->al_lock);
                        if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
                                bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
                                mdev->resync_locked--;
                                wake_up(&mdev->al_wait);
                        }
                        spin_unlock_irq(&mdev->al_lock);
                        if (sig)
                                return -EINTR;
                        if (schedule_timeout_interruptible(HZ/10))
                                return -EINTR;
                        if (sa && --sa == 0)
                                dev_warn(DEV, "drbd_rs_begin_io() stepped aside for 20sec. "
                                         "Resync stalled?\n");
                        goto retry;
                }
        }
        set_bit(BME_LOCKED, &bm_ext->flags);
        return 0;
}

/**
 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
 * @mdev:       DRBD device.
 * @sector:     The sector number.
 *
 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
 * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
 * if there is still application IO going on in this area.
 */
int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
        unsigned int enr = BM_SECT_TO_EXT(sector);
        const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
        struct lc_element *e;
        struct bm_extent *bm_ext;
        int i;

        spin_lock_irq(&mdev->al_lock);
        if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
                /* in case you have very heavy scattered io, it may
                 * stall the syncer indefinitely if we give up the ref count
                 * when we try again and requeue.
                 *
                 * if we don't give up the refcount, but the next time
                 * we are scheduled this extent has been "synced" by new
                 * application writes, we'd miss the lc_put on the
                 * extent we keep the refcount on.
                 * so we remembered which extent we had to try again, and
                 * if the next requested one is something else, we do
                 * the lc_put here...
                 * we also have to wake_up
                 */
                e = lc_find(mdev->resync, mdev->resync_wenr);
                bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
                if (bm_ext) {
                        D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
                        D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
                        clear_bit(BME_NO_WRITES, &bm_ext->flags);
                        mdev->resync_wenr = LC_FREE;
                        if (lc_put(mdev->resync, &bm_ext->lce) == 0)
                                mdev->resync_locked--;
                        wake_up(&mdev->al_wait);
                } else {
                        dev_alert(DEV, "LOGIC BUG\n");
                }
        }
        /* TRY. */
        e = lc_try_get(mdev->resync, enr);
        bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
        if (bm_ext) {
                if (test_bit(BME_LOCKED, &bm_ext->flags))
                        goto proceed;
                if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
                        mdev->resync_locked++;
                } else {
                        /* we did set the BME_NO_WRITES,
                         * but then could not set BME_LOCKED,
                         * so we tried again.
                         * drop the extra reference. */
                        bm_ext->lce.refcnt--;
                        D_ASSERT(bm_ext->lce.refcnt > 0);
                }
                goto check_al;
        } else {
                /* do we rather want to try later? */
                if (mdev->resync_locked > mdev->resync->nr_elements-3)
                        goto try_again;
                /* Do or do not. There is no try. -- Yoda */
                e = lc_get(mdev->resync, enr);
                bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
                if (!bm_ext) {
                        const unsigned long rs_flags = mdev->resync->flags;
                        if (rs_flags & LC_STARVING)
                                dev_warn(DEV, "Have to wait for element"
                                     " (resync LRU too small?)\n");
                        BUG_ON(rs_flags & LC_LOCKED);
                        goto try_again;
                }
                if (bm_ext->lce.lc_number != enr) {
                        bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
                        bm_ext->rs_failed = 0;
                        lc_committed(mdev->resync);
                        wake_up(&mdev->al_wait);
                        D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
                }
                set_bit(BME_NO_WRITES, &bm_ext->flags);
                D_ASSERT(bm_ext->lce.refcnt == 1);
                mdev->resync_locked++;
                goto check_al;
        }
check_al:
        for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
                if (lc_is_used(mdev->act_log, al_enr+i))
                        goto try_again;
        }
        set_bit(BME_LOCKED, &bm_ext->flags);
proceed:
        mdev->resync_wenr = LC_FREE;
        spin_unlock_irq(&mdev->al_lock);
        return 0;

try_again:
        if (bm_ext)
                mdev->resync_wenr = enr;
        spin_unlock_irq(&mdev->al_lock);
        return -EAGAIN;
}

void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
{
        unsigned int enr = BM_SECT_TO_EXT(sector);
        struct lc_element *e;
        struct bm_extent *bm_ext;
        unsigned long flags;

        spin_lock_irqsave(&mdev->al_lock, flags);
        e = lc_find(mdev->resync, enr);
        bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
        if (!bm_ext) {
                spin_unlock_irqrestore(&mdev->al_lock, flags);
                if (__ratelimit(&drbd_ratelimit_state))
                        dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
                return;
        }

        if (bm_ext->lce.refcnt == 0) {
                spin_unlock_irqrestore(&mdev->al_lock, flags);
                dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
                    "but refcnt is 0!?\n",
                    (unsigned long long)sector, enr);
                return;
        }

        if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
                bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
                mdev->resync_locked--;
                wake_up(&mdev->al_wait);
        }

        spin_unlock_irqrestore(&mdev->al_lock, flags);
}

/**
 * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
 * @mdev:       DRBD device.
 */
void drbd_rs_cancel_all(struct drbd_conf *mdev)
{
        spin_lock_irq(&mdev->al_lock);

        if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
                lc_reset(mdev->resync);
                put_ldev(mdev);
        }
        mdev->resync_locked = 0;
        mdev->resync_wenr = LC_FREE;
        spin_unlock_irq(&mdev->al_lock);
        wake_up(&mdev->al_wait);
}

/**
 * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
 * @mdev:       DRBD device.
 *
 * Returns 0 upon success, -EAGAIN if at least one reference count was
 * not zero.
 */
int drbd_rs_del_all(struct drbd_conf *mdev)
{
        struct lc_element *e;
        struct bm_extent *bm_ext;
        int i;

        spin_lock_irq(&mdev->al_lock);

        if (get_ldev_if_state(mdev, D_FAILED)) {
                /* ok, ->resync is there. */
                for (i = 0; i < mdev->resync->nr_elements; i++) {
                        e = lc_element_by_index(mdev->resync, i);
                        bm_ext = lc_entry(e, struct bm_extent, lce);
                        if (bm_ext->lce.lc_number == LC_FREE)
                                continue;
                        if (bm_ext->lce.lc_number == mdev->resync_wenr) {
                                dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
                                     " got 'synced' by application io\n",
                                     mdev->resync_wenr);
                                D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
                                D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
                                clear_bit(BME_NO_WRITES, &bm_ext->flags);
                                mdev->resync_wenr = LC_FREE;
                                lc_put(mdev->resync, &bm_ext->lce);
                        }
                        if (bm_ext->lce.refcnt != 0) {
                                dev_info(DEV, "Retrying drbd_rs_del_all() later. "
                                     "refcnt=%d\n", bm_ext->lce.refcnt);
                                put_ldev(mdev);
                                spin_unlock_irq(&mdev->al_lock);
                                return -EAGAIN;
                        }
                        D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
                        D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
                        lc_del(mdev->resync, &bm_ext->lce);
                }
                D_ASSERT(mdev->resync->used == 0);
                put_ldev(mdev);
        }
        spin_unlock_irq(&mdev->al_lock);
        wake_up(&mdev->al_wait);

        return 0;
}

/**
 * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
 * @mdev:       DRBD device.
 * @sector:     The sector number.
 * @size:       Size of the failed IO operation, in bytes.
 */
void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
{
        /* Is called from worker and receiver context _only_ */
        unsigned long sbnr, ebnr, lbnr;
        unsigned long count;
        sector_t esector, nr_sectors;
        int wake_up = 0;

        if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
                dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
                                (unsigned long long)sector, size);
                return;
        }
        nr_sectors = drbd_get_capacity(mdev->this_bdev);
        esector = sector + (size >> 9) - 1;

        if (!expect(sector < nr_sectors))
                return;
        if (!expect(esector < nr_sectors))
                esector = nr_sectors - 1;

        lbnr = BM_SECT_TO_BIT(nr_sectors-1);

        /*
         * round up start sector, round down end sector.  we make sure we only
         * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
        if (unlikely(esector < BM_SECT_PER_BIT-1))
                return;
        if (unlikely(esector == (nr_sectors-1)))
                ebnr = lbnr;
        else
                ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
        sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);

        if (sbnr > ebnr)
                return;

        /*
         * ok, (capacity & 7) != 0 sometimes, but who cares...
         * we count rs_{total,left} in bits, not sectors.
         */
        spin_lock_irq(&mdev->al_lock);
        count = drbd_bm_count_bits(mdev, sbnr, ebnr);
        if (count) {
                mdev->rs_failed += count;

                if (get_ldev(mdev)) {
                        drbd_try_clear_on_disk_bm(mdev, sector, count, false);
                        put_ldev(mdev);
                }

                /* just wake_up unconditionally now, various lc_changed(),
                 * lc_put() in drbd_try_clear_on_disk_bm(). */
                wake_up = 1;
        }
        spin_unlock_irq(&mdev->al_lock);
        if (wake_up)
                wake_up(&mdev->al_wait);
}