/*
   drbd_actlog.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/drbd.h>
#include <linux/drbd_limits.h>
#include <linux/dynamic_debug.h>
#include "drbd_int.h"
#include "drbd_wrappers.h"


enum al_transaction_types {
        AL_TR_UPDATE = 0,
        AL_TR_INITIALIZED = 0xffff
};
/* all fields on disk in big endian */
struct __packed al_transaction_on_disk {
        /* don't we all like magic */
        __be32  magic;

        /* to identify the most recent transaction block
         * in the on disk ring buffer */
        __be32  tr_number;

        /* checksum on the full 4k block, with this field set to 0. */
        __be32  crc32c;

        /* type of transaction, special transaction types like:
         * purge-all, set-all-idle, set-all-active, ... to-be-defined
         * see also enum al_transaction_types */
        __be16  transaction_type;

        /* we currently allow only a few thousand extents,
         * so 16bit will be enough for the slot number. */

        /* how many updates in this transaction */
        __be16  n_updates;

        /* maximum slot number, "al-extents" in drbd.conf speak.
         * Having this in each transaction should make reconfiguration
         * of that parameter easier. */
        __be16  context_size;

        /* slot number the context starts with */
        __be16  context_start_slot_nr;

        /* Some reserved bytes.  Expected usage is a 64bit counter of
         * sectors-written since device creation, and other data supporting
         * a generation tag */
        __be32  __reserved[4];

        /* --- 36 bytes used --- */

        /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes
         * in one transaction, then use the remaining bytes in the 4k block
         * for context information.  A "flexible" number of updates per
         * transaction would not help: we have to account for the case when
         * all update slots are used anyway, so it would only complicate the
         * code without additional benefit.
         */
        __be16  update_slot_nr[AL_UPDATES_PER_TRANSACTION];

        /* but the extent number is 32bit, which at an extent size of 4 MiB
         * allows covering device sizes of up to 2**54 bytes (16 PiB) */
        __be32  update_extent_nr[AL_UPDATES_PER_TRANSACTION];

        /* --- 420 bytes used (36 + 64*6) --- */

        /* 4096 - 420 = 3676 = 919 * 4 */
        __be32  context[AL_CONTEXT_PER_TRANSACTION];
};
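
/* Layout sanity check (a sketch): the byte accounting above implies
 * AL_UPDATES_PER_TRANSACTION == 64 and AL_CONTEXT_PER_TRANSACTION == 919,
 * so 36 + 64*2 + 64*4 + 919*4 == 4096 and one transaction fills exactly one
 * aligned 4k block; e.g.
 *	BUILD_BUG_ON(sizeof(struct al_transaction_on_disk) != 4096);
 * would hold.
 */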

struct update_odbm_work {
        struct drbd_work w;
        unsigned int enr;
};

struct update_al_work {
        struct drbd_work w;
        struct completion event;
        int err;
};


void *drbd_md_get_buffer(struct drbd_conf *mdev)
{
        int r;

        wait_event(mdev->misc_wait,
                   (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 ||
                   mdev->state.disk <= D_FAILED);

        return r ? NULL : page_address(mdev->md_io_page);
}

void drbd_md_put_buffer(struct drbd_conf *mdev)
{
        if (atomic_dec_and_test(&mdev->md_io_in_use))
                wake_up(&mdev->misc_wait);
}
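
/* Usage sketch (this mirrors _al_write_transaction() below): the single
 * md_io_page is handed out exclusively, so callers pair get and put:
 *
 *	buffer = drbd_md_get_buffer(mdev);
 *	if (!buffer)
 *		return -ENODEV;
 *	...use the one 4k page at mdev->md_io_page...
 *	drbd_md_put_buffer(mdev);
 *
 * A NULL return means the disk failed while we were waiting for the buffer.
 */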

void wait_until_done_or_force_detached(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
                                     unsigned int *done)
{
        long dt;

        rcu_read_lock();
        dt = rcu_dereference(bdev->disk_conf)->disk_timeout;
        rcu_read_unlock();
        dt = dt * HZ / 10; /* disk_timeout is configured in tenths of a second */
        if (dt == 0)
                dt = MAX_SCHEDULE_TIMEOUT;

        dt = wait_event_timeout(mdev->misc_wait,
                        *done || test_bit(FORCE_DETACH, &mdev->flags), dt);
        if (dt == 0) {
                dev_err(DEV, "meta-data IO operation timed out\n");
                drbd_chk_io_error(mdev, 1, DRBD_FORCE_DETACH);
        }
}

static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
                                 struct drbd_backing_dev *bdev,
                                 struct page *page, sector_t sector,
                                 int rw, int size)
{
        struct bio *bio;
        int err;

        mdev->md_io.done = 0;
        mdev->md_io.error = -ENODEV;

        if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
                rw |= REQ_FUA | REQ_FLUSH;
        rw |= REQ_SYNC;

        bio = bio_alloc_drbd(GFP_NOIO);
        bio->bi_bdev = bdev->md_bdev;
        bio->bi_sector = sector;
        err = -EIO;
        if (bio_add_page(bio, page, size, 0) != size)
                goto out;
        bio->bi_private = &mdev->md_io;
        bio->bi_end_io = drbd_md_io_complete;
        bio->bi_rw = rw;

        if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL)
                /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
                ;
        else if (!get_ldev_if_state(mdev, D_ATTACHING)) {
                /* Corresponding put_ldev in drbd_md_io_complete() */
                dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
                err = -ENODEV;
                goto out;
        }

        bio_get(bio); /* one bio_put() is in the completion handler */
        atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
        if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
                bio_endio(bio, -EIO);
        else
                submit_bio(rw, bio);
        wait_until_done_or_force_detached(mdev, bdev, &mdev->md_io.done);
        if (bio_flagged(bio, BIO_UPTODATE))
                err = mdev->md_io.error;

 out:
        bio_put(bio);
        return err;
}

int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
                         sector_t sector, int rw)
{
        int err;
        struct page *iop = mdev->md_io_page;

        D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1);

        BUG_ON(!bdev->md_bdev);

        dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
             current->comm, current->pid, __func__,
             (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
             (void *)_RET_IP_);

        if (sector < drbd_md_first_sector(bdev) ||
            sector + 7 > drbd_md_last_sector(bdev))
                dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
                     current->comm, current->pid, __func__,
                     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");

        /* we do all our meta data IO in aligned 4k blocks,
         * i.e. sectors [sector, sector+7] inclusive, hence the range check above. */
        err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096);
        if (err) {
                dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
                    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
        }
        return err;
}

static struct bm_extent *find_active_resync_extent(struct drbd_conf *mdev, unsigned int enr)
{
        struct lc_element *tmp;
        tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
        if (unlikely(tmp != NULL)) {
                struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
                if (test_bit(BME_NO_WRITES, &bm_ext->flags))
                        return bm_ext;
        }
        return NULL;
}
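
/* Note: enr is an AL extent number (4MB granularity), while mdev->resync is
 * indexed by resync extent numbers (16MB granularity, see the comment above
 * drbd_try_clear_on_disk_bm()); hence the division by AL_EXT_PER_BM_SECT.
 * Assuming those sizes, AL_EXT_PER_BM_SECT == 4, so AL extents 0..3 all map
 * to resync extent 0. */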

static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr, bool nonblock)
{
        struct lc_element *al_ext;
        struct bm_extent *bm_ext;
        int wake;

        spin_lock_irq(&mdev->al_lock);
        bm_ext = find_active_resync_extent(mdev, enr);
        if (bm_ext) {
                wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
                spin_unlock_irq(&mdev->al_lock);
                if (wake)
                        wake_up(&mdev->al_wait);
                return NULL;
        }
        if (nonblock)
                al_ext = lc_try_get(mdev->act_log, enr);
        else
                al_ext = lc_get(mdev->act_log, enr);
        spin_unlock_irq(&mdev->al_lock);
        return al_ext;
}

bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i)
{
        /* for bios crossing activity log extent boundaries,
         * we may need to activate two extents in one go */
        unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
        unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);

        D_ASSERT((unsigned)(last - first) <= 1);
        D_ASSERT(atomic_read(&mdev->local_cnt) > 0);

        /* FIXME figure out a fast path for bios crossing AL extent boundaries */
        if (first != last)
                return false;

        return _al_get(mdev, first, true);
}
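
/* Worked example for the sector-to-extent math above (a sketch, assuming
 * 4MB AL extents, i.e. AL_EXTENT_SHIFT == 22, so shifting by
 * AL_EXTENT_SHIFT-9 == 13 turns 512 byte sector numbers into extent
 * numbers): an 8-sector (4k) request at sector 8188 spans sectors
 * 8188..8195, so first == 8188 >> 13 == 0 but last == 8195 >> 13 == 1;
 * it crosses an extent boundary and the fastpath refuses it. */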

bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i)
{
        /* for bios crossing activity log extent boundaries,
         * we may need to activate two extents in one go */
        unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
        unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
        unsigned enr;
        bool need_transaction = false;

        D_ASSERT(first <= last);
        D_ASSERT(atomic_read(&mdev->local_cnt) > 0);

        for (enr = first; enr <= last; enr++) {
                struct lc_element *al_ext;
                wait_event(mdev->al_wait,
                                (al_ext = _al_get(mdev, enr, false)) != NULL);
                if (al_ext->lc_number != enr)
                        need_transaction = true;
        }
        return need_transaction;
}

static int al_write_transaction(struct drbd_conf *mdev, bool delegate);

/* When called through generic_make_request(), we must delegate
 * activity log I/O to the worker thread: a further request
 * submitted via generic_make_request() within the same task
 * would be queued on current->bio_list, and would only start
 * after this function returns (see generic_make_request()).
 *
 * However, if we *are* the worker, we must not delegate to ourselves.
 */

/*
 * @delegate:   delegate activity log I/O to the worker thread
 */
void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate)
{
        bool locked = false;

        BUG_ON(delegate && current == mdev->tconn->worker.task);

        /* Serialize multiple transactions.
         * This uses test_and_set_bit, memory barrier is implicit.
         */
        wait_event(mdev->al_wait,
                        mdev->act_log->pending_changes == 0 ||
                        (locked = lc_try_lock_for_transaction(mdev->act_log)));

        if (locked) {
                /* Double check: it may have been committed by someone else,
                 * while we have been waiting for the lock. */
                if (mdev->act_log->pending_changes) {
                        bool write_al_updates;

                        rcu_read_lock();
                        write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
                        rcu_read_unlock();

                        if (write_al_updates)
                                al_write_transaction(mdev, delegate);
                        spin_lock_irq(&mdev->al_lock);
                        /* FIXME
                        if (err)
                                we need an "lc_cancel" here;
                        */
                        lc_committed(mdev->act_log);
                        spin_unlock_irq(&mdev->al_lock);
                }
                lc_unlock(mdev->act_log);
                wake_up(&mdev->al_wait);
        }
}

/*
 * @delegate:   delegate activity log I/O to the worker thread
 */
void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate)
{
        BUG_ON(delegate && current == mdev->tconn->worker.task);

        if (drbd_al_begin_io_prepare(mdev, i))
                drbd_al_begin_io_commit(mdev, delegate);
}

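/**
 * drbd_al_begin_io_nonblock() - Tries to activate all AL extents of an interval, without sleeping
 * @mdev:       DRBD device.
 * @i:          interval (sector and size) to cover.
 *
 * Returns 0 on success, taking one reference per affected AL extent.
 * Returns -EWOULDBLOCK if the necessary updates would not all fit into the
 * current transaction, or if resync is active in the area and BME_PRIORITY
 * was already set; returns -EBUSY if BME_PRIORITY had to be set first.
 */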
int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i)
{
        struct lru_cache *al = mdev->act_log;
        /* for bios crossing activity log extent boundaries,
         * we may need to activate two extents in one go */
        unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
        unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
        unsigned nr_al_extents;
        unsigned available_update_slots;
        unsigned enr;

        D_ASSERT(first <= last);

        nr_al_extents = 1 + last - first; /* worst case: all touched extents are cold. */
        available_update_slots = min(al->nr_elements - al->used,
                                al->max_pending_changes - al->pending_changes);

        /* We want all necessary updates for a given request within the same transaction.
         * We could first check how many updates are *actually* needed,
         * and use that instead of the worst-case nr_al_extents */
        if (available_update_slots < nr_al_extents)
                return -EWOULDBLOCK;

        /* Is resync active in this area? */
        for (enr = first; enr <= last; enr++) {
                struct lc_element *tmp;
                tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
                if (unlikely(tmp != NULL)) {
                        struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
                        if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
                                if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
                                        return -EBUSY;
                                return -EWOULDBLOCK;
                        }
                }
        }

        /* Check out the refcounts.
         * Given that we checked for available elements and update slots above,
         * this has to be successful. */
        for (enr = first; enr <= last; enr++) {
                struct lc_element *al_ext;
                al_ext = lc_get_cumulative(mdev->act_log, enr);
                if (!al_ext)
                        dev_info(DEV, "LOGIC BUG for enr=%u\n", enr);
        }
        return 0;
}

void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i)
{
        /* for bios crossing activity log extent boundaries,
         * we may need to activate two extents in one go */
        unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
        unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
        unsigned enr;
        struct lc_element *extent;
        unsigned long flags;

        D_ASSERT(first <= last);
        spin_lock_irqsave(&mdev->al_lock, flags);

        for (enr = first; enr <= last; enr++) {
                extent = lc_find(mdev->act_log, enr);
                if (!extent) {
                        dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
                        continue;
                }
                lc_put(mdev->act_log, extent);
        }
        spin_unlock_irqrestore(&mdev->al_lock, flags);
        wake_up(&mdev->al_wait);
}

#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
 * are still coupled, or assume too much about their relation.
 * Code below will not work if this is violated.
 * Will be cleaned up with some followup patch.
 */
# error FIXME
#endif

static unsigned int al_extent_to_bm_page(unsigned int al_enr)
{
        return al_enr >>
                /* bit to page */
                ((PAGE_SHIFT + 3) -
                /* al extent number to bit */
                 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
}

static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
{
        return rs_enr >>
                /* bit to page */
                ((PAGE_SHIFT + 3) -
                /* resync extent number to bit */
                 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
}
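
/* Worked example for both converters (a sketch, assuming the usual
 * constants: PAGE_SHIFT == 12, BM_BLOCK_SHIFT == 12 (one bitmap bit per 4k),
 * AL_EXTENT_SHIFT == 22 (4MB), BM_EXT_SHIFT == 24 (16MB)): one bitmap page
 * holds 2^(12+3) == 32768 bits; an AL extent covers 2^(22-12) == 1024 bits,
 * so al_extent_to_bm_page() is al_enr >> 5 (32 AL extents per page); a
 * resync extent covers 2^(24-12) == 4096 bits, so rs_extent_to_bm_page()
 * is rs_enr >> 3 (8 resync extents per page). */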

static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev)
{
        const unsigned int stripes = mdev->ldev->md.al_stripes;
        const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k;

        /* transaction number, modulo on-disk ring buffer wrap around */
        unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k);

        /* ... to aligned 4k on disk block */
        t = ((t % stripes) * stripe_size_4kB) + t/stripes;

        /* ... to 512 byte sector in activity log */
        t *= 8;

        /* ... plus offset to the on disk position */
        return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t;
}
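
/* Worked example (a sketch, with a hypothetical geometry of al_stripes == 4
 * and al_stripe_size_4k == 8, hence al_size_4k == 32): transaction number 13
 * stays ring slot t == 13 % 32 == 13, which maps to 4k block
 * ((13 % 4) * 8) + 13/4 == 8 + 3 == 11, i.e. sector offset 11 * 8 == 88
 * within the activity log area. */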

static int
_al_write_transaction(struct drbd_conf *mdev)
{
        struct al_transaction_on_disk *buffer;
        struct lc_element *e;
        sector_t sector;
        int i, mx;
        unsigned extent_nr;
        unsigned crc = 0;
        int err = 0;

        if (!get_ldev(mdev)) {
                dev_err(DEV, "disk is %s, cannot start al transaction\n",
                        drbd_disk_str(mdev->state.disk));
                return -EIO;
        }

        /* The bitmap write may have failed, causing a state change. */
        if (mdev->state.disk < D_INCONSISTENT) {
                dev_err(DEV,
                        "disk is %s, cannot write al transaction\n",
                        drbd_disk_str(mdev->state.disk));
                put_ldev(mdev);
                return -EIO;
        }

        buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */
        if (!buffer) {
                dev_err(DEV, "disk failed while waiting for md_io buffer\n");
                put_ldev(mdev);
                return -ENODEV;
        }

        memset(buffer, 0, sizeof(*buffer));
        buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
        buffer->tr_number = cpu_to_be32(mdev->al_tr_number);

        i = 0;

        /* Even though no one can start to change this list
         * once we set LC_LOCKED -- from drbd_al_begin_io(),
         * lc_try_lock_for_transaction() --, someone may still
         * be in the process of changing it. */
        spin_lock_irq(&mdev->al_lock);
        list_for_each_entry(e, &mdev->act_log->to_be_changed, list) {
                if (i == AL_UPDATES_PER_TRANSACTION) {
                        i++;
                        break;
                }
                buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
                buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
                if (e->lc_number != LC_FREE)
                        drbd_bm_mark_for_writeout(mdev,
                                        al_extent_to_bm_page(e->lc_number));
                i++;
        }
        spin_unlock_irq(&mdev->al_lock);
        BUG_ON(i > AL_UPDATES_PER_TRANSACTION);

        buffer->n_updates = cpu_to_be16(i);
        for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
                buffer->update_slot_nr[i] = cpu_to_be16(-1);
                buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
        }

        buffer->context_size = cpu_to_be16(mdev->act_log->nr_elements);
        buffer->context_start_slot_nr = cpu_to_be16(mdev->al_tr_cycle);

        mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
                   mdev->act_log->nr_elements - mdev->al_tr_cycle);
        for (i = 0; i < mx; i++) {
                unsigned idx = mdev->al_tr_cycle + i;
                extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
                buffer->context[i] = cpu_to_be32(extent_nr);
        }
        for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
                buffer->context[i] = cpu_to_be32(LC_FREE);

        mdev->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
        if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
                mdev->al_tr_cycle = 0;

        sector = al_tr_number_to_on_disk_sector(mdev);

        crc = crc32c(0, buffer, 4096);
        buffer->crc32c = cpu_to_be32(crc);
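        /* Note: the checksum was computed while buffer->crc32c was still
         * zero (the buffer is memset above and the field only filled in
         * here), matching the "with this field set to 0" rule stated in
         * struct al_transaction_on_disk; a verifier would zero the field
         * again before recomputing crc32c over the full 4k block. */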

        if (drbd_bm_write_hinted(mdev))
                err = -EIO;
        else {
                bool write_al_updates;
                rcu_read_lock();
                write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
                rcu_read_unlock();
                if (write_al_updates) {
                        if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
                                err = -EIO;
                                drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
                        } else {
                                mdev->al_tr_number++;
                                mdev->al_writ_cnt++;
                        }
                }
        }

        drbd_md_put_buffer(mdev);
        put_ldev(mdev);

        return err;
}


static int w_al_write_transaction(struct drbd_work *w, int unused)
{
        struct update_al_work *aw = container_of(w, struct update_al_work, w);
        struct drbd_conf *mdev = w->mdev;
        int err;

        err = _al_write_transaction(mdev);
        aw->err = err;
        complete(&aw->event);

        return err != -EIO ? err : 0;
}

/* Calls from worker context (see w_restart_disk_io()) need to write the
   transaction directly. Others came through generic_make_request();
   those need to delegate it to the worker. */
static int al_write_transaction(struct drbd_conf *mdev, bool delegate)
{
        if (delegate) {
                struct update_al_work al_work;
                init_completion(&al_work.event);
                al_work.w.cb = w_al_write_transaction;
                al_work.w.mdev = mdev;
                drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
                wait_for_completion(&al_work.event);
                return al_work.err;
        } else
                return _al_write_transaction(mdev);
}
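
/* Usage note: drbd_al_begin_io_commit(mdev, true) ends up here with
 * delegate == true when the caller is in generic_make_request() context;
 * the on-stack work item is pushed to the front of the worker's queue, and
 * we block on the completion until w_al_write_transaction() has run the
 * transaction on our behalf. */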

static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
{
        int rv;

        spin_lock_irq(&mdev->al_lock);
        rv = (al_ext->refcnt == 0);
        if (likely(rv))
                lc_del(mdev->act_log, al_ext);
        spin_unlock_irq(&mdev->al_lock);

        return rv;
}

/**
 * drbd_al_shrink() - Removes all active extents from the activity log
 * @mdev:       DRBD device.
 *
 * Removes all active extents from the activity log, waiting until
 * the reference count of each entry has dropped to 0 first, of course.
 *
 * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
 */
void drbd_al_shrink(struct drbd_conf *mdev)
{
        struct lc_element *al_ext;
        int i;

        D_ASSERT(test_bit(__LC_LOCKED, &mdev->act_log->flags));

        for (i = 0; i < mdev->act_log->nr_elements; i++) {
                al_ext = lc_element_by_index(mdev->act_log, i);
                if (al_ext->lc_number == LC_FREE)
                        continue;
                wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
        }

        wake_up(&mdev->al_wait);
}

static int w_update_odbm(struct drbd_work *w, int unused)
{
        struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
        struct drbd_conf *mdev = w->mdev;
        struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };

        if (!get_ldev(mdev)) {
                if (__ratelimit(&drbd_ratelimit_state))
                        dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
                kfree(udw);
                return 0;
        }

        drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));
        put_ldev(mdev);

        kfree(udw);

        if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
                switch (mdev->state.conn) {
                case C_SYNC_SOURCE:  case C_SYNC_TARGET:
                case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
                        drbd_resync_finished(mdev);
                default:
                        /* nothing to do */
                        break;
                }
        }
        drbd_bcast_event(mdev, &sib);

        return 0;
}


/* ATTENTION. The AL's extents are 4MB each, while the extents in the
 * resync LRU-cache are 16MB each.
 * The caller of this function has to hold a get_ldev() reference.
 *
 * TODO will be obsoleted once we have a caching lru of the on disk bitmap
 */
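
/* For the sector-to-resync-extent mapping below: assuming 16MB resync
 * extents (BM_EXT_SHIFT == 24), BM_SECT_TO_EXT(sector) is sector >> 15,
 * since 16MB is 2^15 sectors of 512 bytes. */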
static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
                                      int count, int success)
{
        struct lc_element *e;
        struct update_odbm_work *udw;

        unsigned int enr;

        D_ASSERT(atomic_read(&mdev->local_cnt));

        /* I simply assume that a sector/size pair never crosses
         * a 16 MB extent border. (Currently this is true...) */
        enr = BM_SECT_TO_EXT(sector);

        e = lc_get(mdev->resync, enr);
        if (e) {
                struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
                if (ext->lce.lc_number == enr) {
                        if (success)
                                ext->rs_left -= count;
                        else
                                ext->rs_failed += count;
                        if (ext->rs_left < ext->rs_failed) {
                                dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
                                    "rs_failed=%d count=%d cstate=%s\n",
                                     (unsigned long long)sector,
                                     ext->lce.lc_number, ext->rs_left,
                                     ext->rs_failed, count,
                                     drbd_conn_str(mdev->state.conn));

                                /* We don't expect to be able to clear more bits
                                 * than have been set when we originally counted
                                 * the set bits to cache that value in ext->rs_left.
                                 * Whatever the reason (disconnect during resync,
                                 * delayed local completion of an application write),
                                 * try to fix it up by recounting here. */
                                ext->rs_left = drbd_bm_e_weight(mdev, enr);
                        }
                } else {
                        /* Normally this element should be in the cache,
                         * since drbd_rs_begin_io() already pulled it in.
                         *
                         * But maybe an application write finished, and we set
                         * something outside the resync lru_cache in sync.
                         */
                        int rs_left = drbd_bm_e_weight(mdev, enr);
                        if (ext->flags != 0) {
                                dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
                                     " -> %d[%u;00]\n",
                                     ext->lce.lc_number, ext->rs_left,
                                     ext->flags, enr, rs_left);
                                ext->flags = 0;
                        }
                        if (ext->rs_failed) {
                                dev_warn(DEV, "Kicking resync_lru element enr=%u "
                                     "out with rs_failed=%d\n",
                                     ext->lce.lc_number, ext->rs_failed);
                        }
                        ext->rs_left = rs_left;
                        ext->rs_failed = success ? 0 : count;
                        /* we don't keep a persistent log of the resync lru,
                         * we can commit any change right away. */
                        lc_committed(mdev->resync);
                }
                lc_put(mdev->resync, &ext->lce);
                /* no race, we are within the al_lock! */

                if (ext->rs_left == ext->rs_failed) {
                        ext->rs_failed = 0;

                        udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
                        if (udw) {
                                udw->enr = ext->lce.lc_number;
                                udw->w.cb = w_update_odbm;
                                udw->w.mdev = mdev;
                                drbd_queue_work_front(&mdev->tconn->sender_work, &udw->w);
                        } else {
                                dev_warn(DEV, "Could not kmalloc a udw\n");
                        }
                }
        } else {
                dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
                    mdev->resync_locked,
                    mdev->resync->nr_elements,
                    mdev->resync->flags);
        }
}

void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go)
{
        unsigned long now = jiffies;
        unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
        int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
        if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
                if (mdev->rs_mark_left[mdev->rs_last_mark] != still_to_go &&
                    mdev->state.conn != C_PAUSED_SYNC_T &&
                    mdev->state.conn != C_PAUSED_SYNC_S) {
                        mdev->rs_mark_time[next] = now;
                        mdev->rs_mark_left[next] = still_to_go;
                        mdev->rs_last_mark = next;
                }
        }
}

/* clear the bit corresponding to the piece of storage in question:
 * size bytes of data starting from sector.  Only clear the bits of the
 * affected one or more _aligned_ BM_BLOCK_SIZE blocks.
 *
 * called by worker on C_SYNC_TARGET and receiver on SyncSource.
 *
 */
void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
                       const char *file, const unsigned int line)
{
        /* Is called from worker and receiver context _only_ */
        unsigned long sbnr, ebnr, lbnr;
        unsigned long count = 0;
        sector_t esector, nr_sectors;
        int wake_up = 0;
        unsigned long flags;

        if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
                dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
                                (unsigned long long)sector, size);
                return;
        }

        if (!get_ldev(mdev))
                return; /* no disk, no metadata, no bitmap to clear bits in */

        nr_sectors = drbd_get_capacity(mdev->this_bdev);
        esector = sector + (size >> 9) - 1;

        if (!expect(sector < nr_sectors))
                goto out;
        if (!expect(esector < nr_sectors))
                esector = nr_sectors - 1;

        lbnr = BM_SECT_TO_BIT(nr_sectors-1);

        /* we clear it (in sync).
         * round up start sector, round down end sector.  we make sure we only
         * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
        if (unlikely(esector < BM_SECT_PER_BIT-1))
                goto out;
        if (unlikely(esector == (nr_sectors-1)))
                ebnr = lbnr;
        else
                ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
        sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
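        /* Worked example for the rounding above (a sketch, assuming
         * BM_SECT_PER_BIT == 8, i.e. one bitmap bit per aligned 4k block):
         * a 4k write at sector 8188 spans sectors 8188..8195, so
         * sbnr == (8188+7) >> 3 == 1024 and ebnr == (8195-7) >> 3 == 1023;
         * sbnr > ebnr, and nothing is cleared, because the request covers
         * no full, aligned 4k block. */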

        if (sbnr > ebnr)
                goto out;

        /*
         * ok, (capacity & 7) != 0 sometimes, but who cares...
         * we count rs_{total,left} in bits, not sectors.
         */
        count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
        if (count) {
                drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev));
                spin_lock_irqsave(&mdev->al_lock, flags);
                drbd_try_clear_on_disk_bm(mdev, sector, count, true);
                spin_unlock_irqrestore(&mdev->al_lock, flags);

                /* just wake_up unconditionally now, for the various lc_changed(),
                 * lc_put() in drbd_try_clear_on_disk_bm(). */
                wake_up = 1;
        }
out:
        put_ldev(mdev);
        if (wake_up)
                wake_up(&mdev->al_wait);
}

/*
 * this is intended to set one request's worth of data out of sync.
 * affects at least 1 bit,
 * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
 *
 * called by tl_clear and drbd_send_dblock (==drbd_make_request).
 * so this can be _any_ process.
 */
int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
                            const char *file, const unsigned int line)
{
        unsigned long sbnr, ebnr, flags;
        sector_t esector, nr_sectors;
        unsigned int enr, count = 0;
        struct lc_element *e;

        /* this should be an empty REQ_FLUSH */
        if (size == 0)
                return 0;

        if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
                dev_err(DEV, "sector: %llus, size: %d\n",
                        (unsigned long long)sector, size);
                return 0;
        }

        if (!get_ldev(mdev))
                return 0; /* no disk, no metadata, no bitmap to set bits in */

        nr_sectors = drbd_get_capacity(mdev->this_bdev);
        esector = sector + (size >> 9) - 1;

        if (!expect(sector < nr_sectors))
                goto out;
        if (!expect(esector < nr_sectors))
                esector = nr_sectors - 1;

        /* we set it out of sync,
         * we do not need to round anything here */
        sbnr = BM_SECT_TO_BIT(sector);
        ebnr = BM_SECT_TO_BIT(esector);
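        /* Unlike __drbd_set_in_sync() above, partially covered blocks need
         * no rounding here: setting a few extra bits out of sync is safe,
         * they merely get resynced, whereas clearing a bit whose 4k block
         * is only partially covered would lose information. */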

        /* ok, (capacity & 7) != 0 sometimes, but who cares...
         * we count rs_{total,left} in bits, not sectors.  */
        spin_lock_irqsave(&mdev->al_lock, flags);
        count = drbd_bm_set_bits(mdev, sbnr, ebnr);

        enr = BM_SECT_TO_EXT(sector);
        e = lc_find(mdev->resync, enr);
        if (e)
                lc_entry(e, struct bm_extent, lce)->rs_left += count;
        spin_unlock_irqrestore(&mdev->al_lock, flags);

out:
        put_ldev(mdev);

        return count;
}

static
struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
{
        struct lc_element *e;
        struct bm_extent *bm_ext;
        int wakeup = 0;
        unsigned long rs_flags;

        spin_lock_irq(&mdev->al_lock);
        if (mdev->resync_locked > mdev->resync->nr_elements/2) {
                spin_unlock_irq(&mdev->al_lock);
                return NULL;
        }
        e = lc_get(mdev->resync, enr);
        bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
        if (bm_ext) {
                if (bm_ext->lce.lc_number != enr) {
                        bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
                        bm_ext->rs_failed = 0;
                        lc_committed(mdev->resync);
                        wakeup = 1;
                }
                if (bm_ext->lce.refcnt == 1)
                        mdev->resync_locked++;
                set_bit(BME_NO_WRITES, &bm_ext->flags);
        }
        rs_flags = mdev->resync->flags;
        spin_unlock_irq(&mdev->al_lock);
        if (wakeup)
                wake_up(&mdev->al_wait);

        if (!bm_ext) {
                if (rs_flags & LC_STARVING)
                        dev_warn(DEV, "Have to wait for element"
                             " (resync LRU too small?)\n");
                BUG_ON(rs_flags & LC_LOCKED);
        }

        return bm_ext;
}

static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
{
        int rv;

        spin_lock_irq(&mdev->al_lock);
        rv = lc_is_used(mdev->act_log, enr);
        spin_unlock_irq(&mdev->al_lock);

        return rv;
}

/**
 * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
 * @mdev:       DRBD device.
 * @sector:     The sector number.
 *
 * This function sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
 */
int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
        unsigned int enr = BM_SECT_TO_EXT(sector);
        struct bm_extent *bm_ext;
        int i, sig;
        int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait.
                         200 times -> 20 seconds. */

retry:
        sig = wait_event_interruptible(mdev->al_wait,
                        (bm_ext = _bme_get(mdev, enr)));
        if (sig)
                return -EINTR;

        if (test_bit(BME_LOCKED, &bm_ext->flags))
                return 0;

        for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
                sig = wait_event_interruptible(mdev->al_wait,
                                               !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i) ||
                                               test_bit(BME_PRIORITY, &bm_ext->flags));

                if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) {
                        spin_lock_irq(&mdev->al_lock);
                        if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
                                bm_ext->flags = 0; /* clears BME_NO_WRITES and, if set, BME_PRIORITY */
                                mdev->resync_locked--;
                                wake_up(&mdev->al_wait);
                        }
                        spin_unlock_irq(&mdev->al_lock);
                        if (sig)
                                return -EINTR;
                        if (schedule_timeout_interruptible(HZ/10))
                                return -EINTR;
                        if (sa && --sa == 0)
                                dev_warn(DEV, "drbd_rs_begin_io() stepped aside for 20sec. "
                                         "Resync stalled?\n");
                        goto retry;
                }
        }
        set_bit(BME_LOCKED, &bm_ext->flags);
        return 0;
}

/**
 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
 * @mdev:       DRBD device.
 * @sector:     The sector number.
 *
 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
 * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
 * if there is still application IO going on in this area.
 */
int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
        unsigned int enr = BM_SECT_TO_EXT(sector);
        const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
        struct lc_element *e;
        struct bm_extent *bm_ext;
        int i;

        spin_lock_irq(&mdev->al_lock);
        if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
                /* in case you have very heavy scattered io, it may
                 * stall the syncer indefinitely if we give up the ref count
                 * when we try again and requeue.
                 *
                 * if we don't give up the refcount, but the next time
                 * we are scheduled this extent has been "synced" by new
                 * application writes, we'd miss the lc_put on the
                 * extent we keep the refcount on.
                 * so we remember which extent we had to try again, and
                 * if the next requested one is something else, we do
                 * the lc_put here...
                 * we also have to wake_up
                 */
                e = lc_find(mdev->resync, mdev->resync_wenr);
                bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
                if (bm_ext) {
                        D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
                        D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
                        clear_bit(BME_NO_WRITES, &bm_ext->flags);
                        mdev->resync_wenr = LC_FREE;
                        if (lc_put(mdev->resync, &bm_ext->lce) == 0)
                                mdev->resync_locked--;
                        wake_up(&mdev->al_wait);
                } else {
                        dev_alert(DEV, "LOGIC BUG\n");
                }
        }
        /* TRY. */
        e = lc_try_get(mdev->resync, enr);
        bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
        if (bm_ext) {
                if (test_bit(BME_LOCKED, &bm_ext->flags))
                        goto proceed;
                if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
                        mdev->resync_locked++;
                } else {
                        /* we did set the BME_NO_WRITES,
                         * but then could not set BME_LOCKED,
                         * so we tried again.
                         * drop the extra reference. */
                        bm_ext->lce.refcnt--;
                        D_ASSERT(bm_ext->lce.refcnt > 0);
                }
                goto check_al;
        } else {
                /* do we rather want to try later? */
                if (mdev->resync_locked > mdev->resync->nr_elements-3)
                        goto try_again;
                /* Do or do not. There is no try. -- Yoda */
                e = lc_get(mdev->resync, enr);
                bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
                if (!bm_ext) {
                        const unsigned long rs_flags = mdev->resync->flags;
                        if (rs_flags & LC_STARVING)
                                dev_warn(DEV, "Have to wait for element"
                                     " (resync LRU too small?)\n");
                        BUG_ON(rs_flags & LC_LOCKED);
                        goto try_again;
                }
                if (bm_ext->lce.lc_number != enr) {
                        bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
                        bm_ext->rs_failed = 0;
                        lc_committed(mdev->resync);
                        wake_up(&mdev->al_wait);
                        D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
                }
                set_bit(BME_NO_WRITES, &bm_ext->flags);
                D_ASSERT(bm_ext->lce.refcnt == 1);
                mdev->resync_locked++;
                goto check_al;
        }
check_al:
        for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
                if (lc_is_used(mdev->act_log, al_enr+i))
                        goto try_again;
        }
        set_bit(BME_LOCKED, &bm_ext->flags);
proceed:
        mdev->resync_wenr = LC_FREE;
        spin_unlock_irq(&mdev->al_lock);
        return 0;

try_again:
        if (bm_ext)
                mdev->resync_wenr = enr;
        spin_unlock_irq(&mdev->al_lock);
        return -EAGAIN;
}

void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
{
        unsigned int enr = BM_SECT_TO_EXT(sector);
        struct lc_element *e;
        struct bm_extent *bm_ext;
        unsigned long flags;

        spin_lock_irqsave(&mdev->al_lock, flags);
        e = lc_find(mdev->resync, enr);
        bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
        if (!bm_ext) {
                spin_unlock_irqrestore(&mdev->al_lock, flags);
                if (__ratelimit(&drbd_ratelimit_state))
                        dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
                return;
        }

        if (bm_ext->lce.refcnt == 0) {
                spin_unlock_irqrestore(&mdev->al_lock, flags);
                dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
                    "but refcnt is 0!?\n",
                    (unsigned long long)sector, enr);
                return;
        }

        if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
                bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
                mdev->resync_locked--;
                wake_up(&mdev->al_wait);
        }

        spin_unlock_irqrestore(&mdev->al_lock, flags);
}

/**
 * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
 * @mdev:       DRBD device.
 */
void drbd_rs_cancel_all(struct drbd_conf *mdev)
{
        spin_lock_irq(&mdev->al_lock);

        if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
                lc_reset(mdev->resync);
                put_ldev(mdev);
        }
        mdev->resync_locked = 0;
        mdev->resync_wenr = LC_FREE;
        spin_unlock_irq(&mdev->al_lock);
        wake_up(&mdev->al_wait);
}

/**
 * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
 * @mdev:       DRBD device.
 *
 * Returns 0 upon success, -EAGAIN if at least one reference count was
 * not zero.
 */
int drbd_rs_del_all(struct drbd_conf *mdev)
{
        struct lc_element *e;
        struct bm_extent *bm_ext;
        int i;

        spin_lock_irq(&mdev->al_lock);

        if (get_ldev_if_state(mdev, D_FAILED)) {
                /* ok, ->resync is there. */
                for (i = 0; i < mdev->resync->nr_elements; i++) {
                        e = lc_element_by_index(mdev->resync, i);
                        bm_ext = lc_entry(e, struct bm_extent, lce);
                        if (bm_ext->lce.lc_number == LC_FREE)
                                continue;
                        if (bm_ext->lce.lc_number == mdev->resync_wenr) {
                                dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
                                     " got 'synced' by application io\n",
                                     mdev->resync_wenr);
                                D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
                                D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
                                clear_bit(BME_NO_WRITES, &bm_ext->flags);
                                mdev->resync_wenr = LC_FREE;
                                lc_put(mdev->resync, &bm_ext->lce);
                        }
                        if (bm_ext->lce.refcnt != 0) {
                                dev_info(DEV, "Retrying drbd_rs_del_all() later. "
                                     "refcnt=%d\n", bm_ext->lce.refcnt);
                                put_ldev(mdev);
                                spin_unlock_irq(&mdev->al_lock);
                                return -EAGAIN;
                        }
                        D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
                        D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
                        lc_del(mdev->resync, &bm_ext->lce);
                }
                D_ASSERT(mdev->resync->used == 0);
                put_ldev(mdev);
        }
        spin_unlock_irq(&mdev->al_lock);
        wake_up(&mdev->al_wait);

        return 0;
}

/**
 * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
 * @mdev:       DRBD device.
 * @sector:     The sector number.
 * @size:       Size of failed IO operation, in bytes.
 */
void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
{
        /* Is called from worker and receiver context _only_ */
        unsigned long sbnr, ebnr, lbnr;
        unsigned long count;
        sector_t esector, nr_sectors;
        int wake_up = 0;

        if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
                dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
                                (unsigned long long)sector, size);
                return;
        }
        nr_sectors = drbd_get_capacity(mdev->this_bdev);
        esector = sector + (size >> 9) - 1;

        if (!expect(sector < nr_sectors))
                return;
        if (!expect(esector < nr_sectors))
                esector = nr_sectors - 1;

        lbnr = BM_SECT_TO_BIT(nr_sectors-1);

        /*
         * round up start sector, round down end sector.  we make sure we only
         * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
        if (unlikely(esector < BM_SECT_PER_BIT-1))
                return;
        if (unlikely(esector == (nr_sectors-1)))
                ebnr = lbnr;
        else
                ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
        sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);

        if (sbnr > ebnr)
                return;

        /*
         * ok, (capacity & 7) != 0 sometimes, but who cares...
         * we count rs_{total,left} in bits, not sectors.
         */
        spin_lock_irq(&mdev->al_lock);
        count = drbd_bm_count_bits(mdev, sbnr, ebnr);
        if (count) {
                mdev->rs_failed += count;

                if (get_ldev(mdev)) {
                        drbd_try_clear_on_disk_bm(mdev, sector, count, false);
                        put_ldev(mdev);
                }

                /* just wake_up unconditionally now, for the various lc_changed(),
                 * lc_put() in drbd_try_clear_on_disk_bm(). */
                wake_up = 1;
        }
        spin_unlock_irq(&mdev->al_lock);
        if (wake_up)
                wake_up(&mdev->al_wait);
}