/*
   drbd_receiver.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */


#include <linux/module.h>

#include <asm/uaccess.h>
#include <net/sock.h>

#include <linux/drbd.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/in.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/pkt_sched.h>
#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/scatterlist.h>
#include "drbd_int.h"
#include "drbd_protocol.h"
#include "drbd_req.h"
#include "drbd_vli.h"

#define PRO_FEATURES (FF_TRIM)

struct packet_info {
        enum drbd_packet cmd;
        unsigned int size;
        unsigned int vnr;
        void *data;
};

enum finish_epoch {
        FE_STILL_LIVE,
        FE_DESTROYED,
        FE_RECYCLED,
};

static int drbd_do_features(struct drbd_connection *connection);
static int drbd_do_auth(struct drbd_connection *connection);
static int drbd_disconnected(struct drbd_peer_device *);
static void conn_wait_active_ee_empty(struct drbd_connection *connection);
static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
static int e_end_block(struct drbd_work *, int);


#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)

/*
 * some helper functions to deal with singly linked page lists,
 * page->private being our "next" pointer.
 */

/* If at least n pages are linked at head, get n pages off.
 * Otherwise, don't modify head, and return NULL.
 * Locking is the responsibility of the caller.
 */
static struct page *page_chain_del(struct page **head, int n)
{
        struct page *page;
        struct page *tmp;

        BUG_ON(!n);
        BUG_ON(!head);

        page = *head;

        if (!page)
                return NULL;

        while (page) {
                tmp = page_chain_next(page);
                if (--n == 0)
                        break; /* found sufficient pages */
                if (tmp == NULL)
                        /* insufficient pages, don't use any of them. */
                        return NULL;
                page = tmp;
        }

        /* add end of list marker for the returned list */
        set_page_private(page, 0);
        /* actual return value, and adjustment of head */
        page = *head;
        *head = tmp;
        return page;
}

/* may be used outside of locks to find the tail of a (usually short)
 * "private" page chain, before adding it back to a global chain head
 * with page_chain_add() under a spinlock. */
static struct page *page_chain_tail(struct page *page, int *len)
{
        struct page *tmp;
        int i = 1;
        while ((tmp = page_chain_next(page)))
                ++i, page = tmp;
        if (len)
                *len = i;
        return page;
}

static int page_chain_free(struct page *page)
{
        struct page *tmp;
        int i = 0;
        page_chain_for_each_safe(page, tmp) {
                put_page(page);
                ++i;
        }
        return i;
}

static void page_chain_add(struct page **head,
                struct page *chain_first, struct page *chain_last)
{
#if 1
        struct page *tmp;
        tmp = page_chain_tail(chain_first, NULL);
        BUG_ON(tmp != chain_last);
#endif

        /* add chain to head */
        set_page_private(chain_last, (unsigned long)*head);
        *head = chain_first;
}
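
/*
 * Illustrative sketch (not compiled in): how the helpers above are
 * meant to be combined.  A chain is threaded through page->private,
 * split off with page_chain_del(), and spliced back under drbd_pp_lock
 * with page_chain_tail()/page_chain_add().
 */
#if 0
static void page_chain_example(void)
{
        struct page *head = NULL, *chain, *tail;
        int i, n;

        /* push three pages onto a private chain, newest first */
        for (i = 0; i < 3; i++) {
                struct page *p = alloc_page(GFP_KERNEL);
                if (!p)
                        break;
                set_page_private(p, (unsigned long)head);
                head = p;
        }

        /* take two pages off; head now points at the remainder */
        chain = page_chain_del(&head, 2);
        if (!chain)
                return; /* fewer than two pages were linked */

        /* splice them into the global pool: find the tail, then add */
        tail = page_chain_tail(chain, &n);
        spin_lock(&drbd_pp_lock);
        page_chain_add(&drbd_pp_pool, chain, tail);
        drbd_pp_vacant += n;
        spin_unlock(&drbd_pp_lock);
}
#endif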

static struct page *__drbd_alloc_pages(struct drbd_device *device,
                                       unsigned int number)
{
        struct page *page = NULL;
        struct page *tmp = NULL;
        unsigned int i = 0;

        /* Yes, testing drbd_pp_vacant outside the lock is racy.
         * So what. It saves a spin_lock. */
        if (drbd_pp_vacant >= number) {
                spin_lock(&drbd_pp_lock);
                page = page_chain_del(&drbd_pp_pool, number);
                if (page)
                        drbd_pp_vacant -= number;
                spin_unlock(&drbd_pp_lock);
                if (page)
                        return page;
        }

        /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
         * "criss-cross" setup, that might cause write-out on some other DRBD,
         * which in turn might block on the other node at this very place.  */
        for (i = 0; i < number; i++) {
                tmp = alloc_page(GFP_TRY);
                if (!tmp)
                        break;
                set_page_private(tmp, (unsigned long)page);
                page = tmp;
        }

        if (i == number)
                return page;

        /* Not enough pages immediately available this time.
         * No need to jump around here, drbd_alloc_pages will retry this
         * function "soon". */
        if (page) {
                tmp = page_chain_tail(page, NULL);
                spin_lock(&drbd_pp_lock);
                page_chain_add(&drbd_pp_pool, page, tmp);
                drbd_pp_vacant += i;
                spin_unlock(&drbd_pp_lock);
        }
        return NULL;
}

static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
                                           struct list_head *to_be_freed)
{
        struct drbd_peer_request *peer_req, *tmp;

        /* The EEs are always appended to the end of the list. Since
           they are sent in order over the wire, they have to finish
           in order. As soon as we see the first unfinished one, we
           can stop examining the list... */

        list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
                if (drbd_peer_req_has_active_page(peer_req))
                        break;
                list_move(&peer_req->w.list, to_be_freed);
        }
}

static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
{
        LIST_HEAD(reclaimed);
        struct drbd_peer_request *peer_req, *t;

        spin_lock_irq(&device->resource->req_lock);
        reclaim_finished_net_peer_reqs(device, &reclaimed);
        spin_unlock_irq(&device->resource->req_lock);

        list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
                drbd_free_net_peer_req(device, peer_req);
}

/**
 * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
 * @peer_device: DRBD peer device.
 * @number:     number of pages requested
 * @retry:      whether to retry, if not enough pages are available right now
 *
 * Tries to allocate @number pages, first from our own page pool, then from
 * the kernel.
 * Possibly retry until DRBD frees sufficient pages somewhere else.
 *
 * If this allocation would exceed the max_buffers setting, we throttle
 * allocation (schedule_timeout) to give the system some room to breathe.
 *
 * We do not use max-buffers as a hard limit, because it could lead to
 * congestion and further to a distributed deadlock during online-verify or
 * (checksum based) resync, if the max-buffers, socket buffer sizes and
 * resync-rate settings are misconfigured.
 *
 * Returns a page chain linked via page->private.
 */
struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
                              bool retry)
{
        struct drbd_device *device = peer_device->device;
        struct page *page = NULL;
        struct net_conf *nc;
        DEFINE_WAIT(wait);
        unsigned int mxb;

        rcu_read_lock();
        nc = rcu_dereference(peer_device->connection->net_conf);
        mxb = nc ? nc->max_buffers : 1000000;
        rcu_read_unlock();

        if (atomic_read(&device->pp_in_use) < mxb)
                page = __drbd_alloc_pages(device, number);

        while (page == NULL) {
                prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);

                drbd_kick_lo_and_reclaim_net(device);

                if (atomic_read(&device->pp_in_use) < mxb) {
                        page = __drbd_alloc_pages(device, number);
                        if (page)
                                break;
                }

                if (!retry)
                        break;

                if (signal_pending(current)) {
                        drbd_warn(device, "drbd_alloc_pages interrupted!\n");
                        break;
                }

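                /* If the full timeout elapsed without a wake-up, stop
                 * throttling on max_buffers for the next attempt. */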
                if (schedule_timeout(HZ/10) == 0)
                        mxb = UINT_MAX;
        }
        finish_wait(&drbd_pp_wait, &wait);

        if (page)
                atomic_add(number, &device->pp_in_use);
        return page;
}
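
/*
 * Illustrative sketch (not compiled in): typical use of the pool
 * allocator above.  The chain is handed back with is_net = 0, so it is
 * accounted against pp_in_use, matching the atomic_add() above.
 */
#if 0
static int alloc_pages_example(struct drbd_peer_device *peer_device)
{
        struct page *page;

        page = drbd_alloc_pages(peer_device, 4, true /* retry */);
        if (!page)
                return -EINTR; /* with retry, only a signal gets us here */

        /* ... fill the chain, e.g. via page_chain_for_each() ... */

        drbd_free_pages(peer_device->device, page, 0);
        return 0;
}
#endif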

/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
 * Is also used from inside another spin_lock_irq(&resource->req_lock);
 * Either links the page chain back to the global pool,
 * or returns all pages to the system. */
static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
{
        atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
        int i;

        if (page == NULL)
                return;

        if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
                i = page_chain_free(page);
        else {
                struct page *tmp;
                tmp = page_chain_tail(page, &i);
                spin_lock(&drbd_pp_lock);
                page_chain_add(&drbd_pp_pool, page, tmp);
                drbd_pp_vacant += i;
                spin_unlock(&drbd_pp_lock);
        }
        i = atomic_sub_return(i, a);
        if (i < 0)
                drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
                        is_net ? "pp_in_use_by_net" : "pp_in_use", i);
        wake_up(&drbd_pp_wait);
}

/*
You need to hold the req_lock:
 _drbd_wait_ee_list_empty()

You must not have the req_lock:
 drbd_free_peer_req()
 drbd_alloc_peer_req()
 drbd_free_peer_reqs()
 drbd_ee_fix_bhs()
 drbd_finish_peer_reqs()
 drbd_clear_done_ee()
 drbd_wait_ee_list_empty()
*/

struct drbd_peer_request *
drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
                    unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local)
{
        struct drbd_device *device = peer_device->device;
        struct drbd_peer_request *peer_req;
        struct page *page = NULL;
        unsigned nr_pages = (data_size + PAGE_SIZE - 1) >> PAGE_SHIFT;

        if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
                return NULL;

        peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
        if (!peer_req) {
                if (!(gfp_mask & __GFP_NOWARN))
                        drbd_err(device, "%s: allocation failed\n", __func__);
                return NULL;
        }

        if (has_payload && data_size) {
                page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT));
                if (!page)
                        goto fail;
        }

        memset(peer_req, 0, sizeof(*peer_req));
        INIT_LIST_HEAD(&peer_req->w.list);
        drbd_clear_interval(&peer_req->i);
        peer_req->i.size = data_size;
        peer_req->i.sector = sector;
        peer_req->submit_jif = jiffies;
        peer_req->peer_device = peer_device;
        peer_req->pages = page;
        /*
         * The block_id is opaque to the receiver.  It is not converted to
         * host byte order, and is sent back to the sender unchanged.
         */
        peer_req->block_id = id;

        return peer_req;

 fail:
        mempool_free(peer_req, drbd_ee_mempool);
        return NULL;
}
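
/*
 * Illustrative sketch (not compiled in): pairing the allocator above
 * with its release path.  The id/sector/size values are made up, and a
 * reference on the local disk is assumed to be held, as required by
 * __must_hold(local).
 */
#if 0
static void peer_req_example(struct drbd_peer_device *peer_device)
{
        struct drbd_peer_request *peer_req;

        peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, 0 /* sector */,
                                       4096 /* bytes */, true, GFP_NOIO);
        if (!peer_req)
                return;

        /* ... fill peer_req->pages and submit; once completed: ... */

        drbd_free_peer_req(peer_device->device, peer_req);
}
#endif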

void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
                       int is_net)
{
        might_sleep();
        if (peer_req->flags & EE_HAS_DIGEST)
                kfree(peer_req->digest);
        drbd_free_pages(device, peer_req->pages, is_net);
        D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
        D_ASSERT(device, drbd_interval_empty(&peer_req->i));
        if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
                peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
                drbd_al_complete_io(device, &peer_req->i);
        }
        mempool_free(peer_req, drbd_ee_mempool);
}

int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
{
        LIST_HEAD(work_list);
        struct drbd_peer_request *peer_req, *t;
        int count = 0;
        int is_net = list == &device->net_ee;

        spin_lock_irq(&device->resource->req_lock);
        list_splice_init(list, &work_list);
        spin_unlock_irq(&device->resource->req_lock);

        list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
                __drbd_free_peer_req(device, peer_req, is_net);
                count++;
        }
        return count;
}

/*
 * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
 */
static int drbd_finish_peer_reqs(struct drbd_device *device)
{
        LIST_HEAD(work_list);
        LIST_HEAD(reclaimed);
        struct drbd_peer_request *peer_req, *t;
        int err = 0;

        spin_lock_irq(&device->resource->req_lock);
        reclaim_finished_net_peer_reqs(device, &reclaimed);
        list_splice_init(&device->done_ee, &work_list);
        spin_unlock_irq(&device->resource->req_lock);

        list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
                drbd_free_net_peer_req(device, peer_req);

        /* Possible callbacks here:
         * e_end_block, e_end_resync_block, and e_send_superseded.
         * All ignore the last argument.
         */
        list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
                int err2;

                /* list_del not necessary, next/prev members not touched */
                err2 = peer_req->w.cb(&peer_req->w, !!err);
                if (!err)
                        err = err2;
                drbd_free_peer_req(device, peer_req);
        }
        wake_up(&device->ee_wait);

        return err;
}

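/*
 * Sleep until @head is empty.  Called with req_lock held; the lock is
 * dropped around io_schedule() and re-acquired before the list is
 * checked again, so the wait itself happens without the lock.
 */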
static void _drbd_wait_ee_list_empty(struct drbd_device *device,
                                     struct list_head *head)
{
        DEFINE_WAIT(wait);

        /* avoids spin_lock/unlock
         * and calling prepare_to_wait in the fast path */
        while (!list_empty(head)) {
                prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
                spin_unlock_irq(&device->resource->req_lock);
                io_schedule();
                finish_wait(&device->ee_wait, &wait);
                spin_lock_irq(&device->resource->req_lock);
        }
}

static void drbd_wait_ee_list_empty(struct drbd_device *device,
                                    struct list_head *head)
{
        spin_lock_irq(&device->resource->req_lock);
        _drbd_wait_ee_list_empty(device, head);
        spin_unlock_irq(&device->resource->req_lock);
}

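/*
 * Receive up to @size bytes from @sock into @buf.  With no @flags
 * given, this defaults to MSG_WAITALL: block until the full @size
 * arrived, the peer closed the connection (returns 0), or an
 * error/signal occurred (returns a negative errno).
 */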
static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
{
        struct kvec iov = {
                .iov_base = buf,
                .iov_len = size,
        };
        struct msghdr msg = {
                .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
        };
        return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags);
}

static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
{
        int rv;

        rv = drbd_recv_short(connection->data.socket, buf, size, 0);

        if (rv < 0) {
                if (rv == -ECONNRESET)
                        drbd_info(connection, "sock was reset by peer\n");
                else if (rv != -ERESTARTSYS)
                        drbd_err(connection, "sock_recvmsg returned %d\n", rv);
        } else if (rv == 0) {
                if (test_bit(DISCONNECT_SENT, &connection->flags)) {
                        long t;
                        rcu_read_lock();
                        t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
                        rcu_read_unlock();

                        t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t);

                        if (t)
                                goto out;
                }
                drbd_info(connection, "sock was shut down by peer\n");
        }

        if (rv != size)
                conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);

out:
        return rv;
}

static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
{
        int err;

        err = drbd_recv(connection, buf, size);
        if (err != size) {
                if (err >= 0)
                        err = -EIO;
        } else
                err = 0;
        return err;
}

static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
{
        int err;

        err = drbd_recv_all(connection, buf, size);
        if (err && !signal_pending(current))
                drbd_warn(connection, "short read (expected size %d)\n", (int)size);
        return err;
}

/* quoting tcp(7):
 *   On individual connections, the socket buffer size must be set prior to the
 *   listen(2) or connect(2) calls in order to have it take effect.
 * This is our wrapper to do so.
 */
static void drbd_setbufsize(struct socket *sock, unsigned int snd,
                unsigned int rcv)
{
        /* open coded SO_SNDBUF, SO_RCVBUF */
        if (snd) {
                sock->sk->sk_sndbuf = snd;
                sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
        }
        if (rcv) {
                sock->sk->sk_rcvbuf = rcv;
                sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
        }
}

static struct socket *drbd_try_connect(struct drbd_connection *connection)
{
        const char *what;
        struct socket *sock;
        struct sockaddr_in6 src_in6;
        struct sockaddr_in6 peer_in6;
        struct net_conf *nc;
        int err, peer_addr_len, my_addr_len;
        int sndbuf_size, rcvbuf_size, connect_int;
        int disconnect_on_error = 1;

        rcu_read_lock();
        nc = rcu_dereference(connection->net_conf);
        if (!nc) {
                rcu_read_unlock();
                return NULL;
        }
        sndbuf_size = nc->sndbuf_size;
        rcvbuf_size = nc->rcvbuf_size;
        connect_int = nc->connect_int;
        rcu_read_unlock();

        my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
        memcpy(&src_in6, &connection->my_addr, my_addr_len);

        if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
                src_in6.sin6_port = 0;
        else
                ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */

        peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6));
        memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);

        what = "sock_create_kern";
        err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family,
                               SOCK_STREAM, IPPROTO_TCP, &sock);
        if (err < 0) {
                sock = NULL;
                goto out;
        }

        sock->sk->sk_rcvtimeo =
        sock->sk->sk_sndtimeo = connect_int * HZ;
        drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);

        /* Explicitly bind to the configured IP as source IP
         * for the outgoing connections.
         * This is needed for multihomed hosts and to be
         * able to use lo: interfaces for drbd.
         * Make sure to use 0 as port number, so Linux selects
         * a free one dynamically.
         */
        what = "bind before connect";
        err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
        if (err < 0)
                goto out;

        /* connect may fail, peer not yet available.
         * stay C_WF_CONNECTION, don't go Disconnecting! */
        disconnect_on_error = 0;
        what = "connect";
        err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);

out:
        if (err < 0) {
                if (sock) {
                        sock_release(sock);
                        sock = NULL;
                }
                switch (-err) {
                        /* timeout, busy, signal pending */
                case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
                case EINTR: case ERESTARTSYS:
                        /* peer not (yet) available, network problem */
                case ECONNREFUSED: case ENETUNREACH:
                case EHOSTDOWN:    case EHOSTUNREACH:
                        disconnect_on_error = 0;
                        break;
                default:
                        drbd_err(connection, "%s failed, err = %d\n", what, err);
                }
                if (disconnect_on_error)
                        conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
        }

        return sock;
}

struct accept_wait_data {
        struct drbd_connection *connection;
        struct socket *s_listen;
        struct completion door_bell;
        void (*original_sk_state_change)(struct sock *sk);
};

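/* sk_state_change hook installed on the listen socket: ring the
 * door_bell as soon as an incoming connection reaches TCP_ESTABLISHED,
 * then chain to the original callback. */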
static void drbd_incoming_connection(struct sock *sk)
{
        struct accept_wait_data *ad = sk->sk_user_data;
        void (*state_change)(struct sock *sk);

        state_change = ad->original_sk_state_change;
        if (sk->sk_state == TCP_ESTABLISHED)
                complete(&ad->door_bell);
        state_change(sk);
}

static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
{
        int err, sndbuf_size, rcvbuf_size, my_addr_len;
        struct sockaddr_in6 my_addr;
        struct socket *s_listen;
        struct net_conf *nc;
        const char *what;

        rcu_read_lock();
        nc = rcu_dereference(connection->net_conf);
        if (!nc) {
                rcu_read_unlock();
                return -EIO;
        }
        sndbuf_size = nc->sndbuf_size;
        rcvbuf_size = nc->rcvbuf_size;
        rcu_read_unlock();

        my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
        memcpy(&my_addr, &connection->my_addr, my_addr_len);

        what = "sock_create_kern";
        err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family,
                               SOCK_STREAM, IPPROTO_TCP, &s_listen);
        if (err) {
                s_listen = NULL;
                goto out;
        }

        s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
        drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);

        what = "bind before listen";
        err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
        if (err < 0)
                goto out;

        ad->s_listen = s_listen;
        write_lock_bh(&s_listen->sk->sk_callback_lock);
        ad->original_sk_state_change = s_listen->sk->sk_state_change;
        s_listen->sk->sk_state_change = drbd_incoming_connection;
        s_listen->sk->sk_user_data = ad;
        write_unlock_bh(&s_listen->sk->sk_callback_lock);

        what = "listen";
        err = s_listen->ops->listen(s_listen, 5);
        if (err < 0)
                goto out;

        return 0;
out:
        if (s_listen)
                sock_release(s_listen);
        if (err < 0) {
                if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
                        drbd_err(connection, "%s failed, err = %d\n", what, err);
                        conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
                }
        }

        return -EIO;
}

static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
{
        write_lock_bh(&sk->sk_callback_lock);
        sk->sk_state_change = ad->original_sk_state_change;
        sk->sk_user_data = NULL;
        write_unlock_bh(&sk->sk_callback_lock);
}

static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
{
        int timeo, connect_int, err = 0;
        struct socket *s_estab = NULL;
        struct net_conf *nc;

        rcu_read_lock();
        nc = rcu_dereference(connection->net_conf);
        if (!nc) {
                rcu_read_unlock();
                return NULL;
        }
        connect_int = nc->connect_int;
        rcu_read_unlock();

        timeo = connect_int * HZ;
        /* 28.6% random jitter: +/- timeo/7 */
        timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7;

        err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
        if (err <= 0)
                return NULL;

        err = kernel_accept(ad->s_listen, &s_estab, 0);
        if (err < 0) {
                if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
                        drbd_err(connection, "accept failed, err = %d\n", err);
                        conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
                }
        }

        if (s_estab)
                unregister_state_change(s_estab->sk, ad);

        return s_estab;
}

static int decode_header(struct drbd_connection *, void *, struct packet_info *);

static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
                             enum drbd_packet cmd)
{
        if (!conn_prepare_command(connection, sock))
                return -EIO;
        return conn_send_command(connection, sock, cmd, 0, NULL, 0);
}

static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
{
        unsigned int header_size = drbd_header_size(connection);
        struct packet_info pi;
        struct net_conf *nc;
        int err;

        rcu_read_lock();
        nc = rcu_dereference(connection->net_conf);
        if (!nc) {
                rcu_read_unlock();
                return -EIO;
        }
        sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
        rcu_read_unlock();

        err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
        if (err != header_size) {
                if (err >= 0)
                        err = -EIO;
                return err;
        }
        err = decode_header(connection, connection->data.rbuf, &pi);
        if (err)
                return err;
        return pi.cmd;
}

/**
 * drbd_socket_okay() - Free the socket if its connection is not okay
 * @sock:       pointer to the pointer to the socket.
 */
static bool drbd_socket_okay(struct socket **sock)
{
        int rr;
        char tb[4];

        if (!*sock)
                return false;

        rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);

        if (rr > 0 || rr == -EAGAIN) {
                return true;
        } else {
                sock_release(*sock);
                *sock = NULL;
                return false;
        }
}

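/* Give the peer's initial packet time to arrive on both sockets, then
 * verify that both are still usable. */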
static bool connection_established(struct drbd_connection *connection,
                                   struct socket **sock1,
                                   struct socket **sock2)
{
        struct net_conf *nc;
        int timeout;
        bool ok;

        if (!*sock1 || !*sock2)
                return false;

        rcu_read_lock();
        nc = rcu_dereference(connection->net_conf);
        timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
        rcu_read_unlock();
        schedule_timeout_interruptible(timeout);

        ok = drbd_socket_okay(sock1);
        ok = drbd_socket_okay(sock2) && ok;

        return ok;
}

/* Gets called if a connection is established, or if a new minor gets created
   in a connection */
int drbd_connected(struct drbd_peer_device *peer_device)
{
        struct drbd_device *device = peer_device->device;
        int err;

        atomic_set(&device->packet_seq, 0);
        device->peer_seq = 0;

        device->state_mutex = peer_device->connection->agreed_pro_version < 100 ?
                &peer_device->connection->cstate_mutex :
                &device->own_state_mutex;

        err = drbd_send_sync_param(peer_device);
        if (!err)
                err = drbd_send_sizes(peer_device, 0, 0);
        if (!err)
                err = drbd_send_uuids(peer_device);
        if (!err)
                err = drbd_send_current_state(peer_device);
        clear_bit(USE_DEGR_WFC_T, &device->flags);
        clear_bit(RESIZE_PENDING, &device->flags);
        atomic_set(&device->ap_in_flight, 0);
        mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
        return err;
}

/*
 * Return values:
 *   1 yes, we have a valid connection
 *   0 oops, did not work out, please try again
 *  -1 peer speaks a different language;
 *     no point in trying again, please go standalone.
 *  -2 we do not have a network config...
 */
static int conn_connect(struct drbd_connection *connection)
{
        struct drbd_socket sock, msock;
        struct drbd_peer_device *peer_device;
        struct net_conf *nc;
        int vnr, timeout, h;
        bool discard_my_data, ok;
        enum drbd_state_rv rv;
        struct accept_wait_data ad = {
                .connection = connection,
                .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
        };

        clear_bit(DISCONNECT_SENT, &connection->flags);
        if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
                return -2;

        mutex_init(&sock.mutex);
        sock.sbuf = connection->data.sbuf;
        sock.rbuf = connection->data.rbuf;
        sock.socket = NULL;
        mutex_init(&msock.mutex);
        msock.sbuf = connection->meta.sbuf;
        msock.rbuf = connection->meta.rbuf;
        msock.socket = NULL;

        /* Assume that the peer only understands protocol 80 until we know better.  */
        connection->agreed_pro_version = 80;

        if (prepare_listen_socket(connection, &ad))
                return 0;

        do {
                struct socket *s;

                s = drbd_try_connect(connection);
                if (s) {
                        if (!sock.socket) {
                                sock.socket = s;
                                send_first_packet(connection, &sock, P_INITIAL_DATA);
                        } else if (!msock.socket) {
                                clear_bit(RESOLVE_CONFLICTS, &connection->flags);
                                msock.socket = s;
                                send_first_packet(connection, &msock, P_INITIAL_META);
                        } else {
                                drbd_err(connection, "Logic error in conn_connect()\n");
                                goto out_release_sockets;
                        }
                }

                if (connection_established(connection, &sock.socket, &msock.socket))
                        break;

retry:
                s = drbd_wait_for_connect(connection, &ad);
                if (s) {
                        int fp = receive_first_packet(connection, s);
                        drbd_socket_okay(&sock.socket);
                        drbd_socket_okay(&msock.socket);
                        switch (fp) {
                        case P_INITIAL_DATA:
                                if (sock.socket) {
                                        drbd_warn(connection, "initial packet S crossed\n");
                                        sock_release(sock.socket);
                                        sock.socket = s;
                                        goto randomize;
                                }
                                sock.socket = s;
                                break;
                        case P_INITIAL_META:
                                set_bit(RESOLVE_CONFLICTS, &connection->flags);
                                if (msock.socket) {
                                        drbd_warn(connection, "initial packet M crossed\n");
                                        sock_release(msock.socket);
                                        msock.socket = s;
                                        goto randomize;
                                }
                                msock.socket = s;
                                break;
                        default:
                                drbd_warn(connection, "Error receiving initial packet\n");
                                sock_release(s);
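                        /* Initial packets crossed: both peers opened
                         * connections simultaneously.  Flip a coin on
                         * whether to retry, so the tie is eventually
                         * broken on one side. */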
randomize:
                                if (prandom_u32() & 1)
                                        goto retry;
                        }
                }

                if (connection->cstate <= C_DISCONNECTING)
                        goto out_release_sockets;
                if (signal_pending(current)) {
                        flush_signals(current);
                        smp_rmb();
                        if (get_t_state(&connection->receiver) == EXITING)
                                goto out_release_sockets;
                }

                ok = connection_established(connection, &sock.socket, &msock.socket);
        } while (!ok);

        if (ad.s_listen)
                sock_release(ad.s_listen);

        sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
        msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */

        sock.socket->sk->sk_allocation = GFP_NOIO;
        msock.socket->sk->sk_allocation = GFP_NOIO;

        sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
        msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;

        /* NOT YET ...
         * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
         * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
         * first set it to the P_CONNECTION_FEATURES timeout,
         * which we set to 4x the configured ping_timeout. */
        rcu_read_lock();
        nc = rcu_dereference(connection->net_conf);

        sock.socket->sk->sk_sndtimeo =
        sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;

        msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
        timeout = nc->timeout * HZ / 10;
        discard_my_data = nc->discard_my_data;
        rcu_read_unlock();

        msock.socket->sk->sk_sndtimeo = timeout;

        /* we don't want delays.
         * we use TCP_CORK where appropriate, though */
        drbd_tcp_nodelay(sock.socket);
        drbd_tcp_nodelay(msock.socket);

        connection->data.socket = sock.socket;
        connection->meta.socket = msock.socket;
        connection->last_received = jiffies;

        h = drbd_do_features(connection);
        if (h <= 0)
                return h;

        if (connection->cram_hmac_tfm) {
                /* drbd_request_state(device, NS(conn, WFAuth)); */
                switch (drbd_do_auth(connection)) {
                case -1:
                        drbd_err(connection, "Authentication of peer failed\n");
                        return -1;
                case 0:
                        drbd_err(connection, "Authentication of peer failed, trying again.\n");
                        return 0;
                }
        }

        connection->data.socket->sk->sk_sndtimeo = timeout;
        connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;

        if (drbd_send_protocol(connection) == -EOPNOTSUPP)
                return -1;

        /* Prevent a race between resync-handshake and
         * being promoted to Primary.
         *
         * Grab and release the state mutex, so we know that any current
         * drbd_set_role() is finished, and any incoming drbd_set_role
         * will see the STATE_SENT flag, and wait for it to be cleared.
         */
        idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
                mutex_lock(peer_device->device->state_mutex);

        set_bit(STATE_SENT, &connection->flags);

        idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
                mutex_unlock(peer_device->device->state_mutex);

        rcu_read_lock();
        idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
                struct drbd_device *device = peer_device->device;
                kref_get(&device->kref);
                rcu_read_unlock();

                if (discard_my_data)
                        set_bit(DISCARD_MY_DATA, &device->flags);
                else
                        clear_bit(DISCARD_MY_DATA, &device->flags);

                drbd_connected(peer_device);
                kref_put(&device->kref, drbd_destroy_device);
                rcu_read_lock();
        }
        rcu_read_unlock();

        rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
        if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
                clear_bit(STATE_SENT, &connection->flags);
                return 0;
        }

        drbd_thread_start(&connection->asender);

        mutex_lock(&connection->resource->conf_update);
        /* The discard_my_data flag is a single-shot modifier to the next
         * connection attempt, the handshake of which is now well underway.
         * No need for rcu style copying of the whole struct
         * just to clear a single value. */
        connection->net_conf->discard_my_data = 0;
        mutex_unlock(&connection->resource->conf_update);

        return h;

out_release_sockets:
        if (ad.s_listen)
                sock_release(ad.s_listen);
        if (sock.socket)
                sock_release(sock.socket);
        if (msock.socket)
                sock_release(msock.socket);
        return -1;
}

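/*
 * Decode one on-the-wire header into @pi.  The expected layout follows
 * from the agreed protocol version, via drbd_header_size():
 *   p_header80:  32-bit DRBD_MAGIC,     16-bit command, 16-bit length
 *   p_header95:  16-bit DRBD_MAGIC_BIG, 16-bit command, 32-bit length
 *   p_header100: 32-bit DRBD_MAGIC_100, 16-bit volume, 16-bit command,
 *                32-bit length
 * pi->data is left pointing just past the header.
 */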
static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
{
        unsigned int header_size = drbd_header_size(connection);

        if (header_size == sizeof(struct p_header100) &&
            *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
                struct p_header100 *h = header;
                if (h->pad != 0) {
                        drbd_err(connection, "Header padding is not zero\n");
                        return -EINVAL;
                }
                pi->vnr = be16_to_cpu(h->volume);
                pi->cmd = be16_to_cpu(h->command);
                pi->size = be32_to_cpu(h->length);
        } else if (header_size == sizeof(struct p_header95) &&
                   *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
                struct p_header95 *h = header;
                pi->cmd = be16_to_cpu(h->command);
                pi->size = be32_to_cpu(h->length);
                pi->vnr = 0;
        } else if (header_size == sizeof(struct p_header80) &&
                   *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
                struct p_header80 *h = header;
                pi->cmd = be16_to_cpu(h->command);
                pi->size = be16_to_cpu(h->length);
                pi->vnr = 0;
        } else {
                drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
                         be32_to_cpu(*(__be32 *)header),
                         connection->agreed_pro_version);
                return -EINVAL;
        }
        pi->data = header + header_size;
        return 0;
}

static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
{
        void *buffer = connection->data.rbuf;
        int err;

        err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection));
        if (err)
                return err;

        err = decode_header(connection, buffer, pi);
        connection->last_received = jiffies;

        return err;
}

static void drbd_flush(struct drbd_connection *connection)
{
        int rv;
        struct drbd_peer_device *peer_device;
        int vnr;

        if (connection->resource->write_ordering >= WO_bdev_flush) {
                rcu_read_lock();
                idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
                        struct drbd_device *device = peer_device->device;

                        if (!get_ldev(device))
                                continue;
                        kref_get(&device->kref);
                        rcu_read_unlock();

                        /* Right now, we have only this one synchronous code path
                         * for flushes between request epochs.
                         * We may want to make those asynchronous,
                         * or at least parallelize the flushes to the volume devices.
                         */
                        device->flush_jif = jiffies;
                        set_bit(FLUSH_PENDING, &device->flags);
                        rv = blkdev_issue_flush(device->ldev->backing_bdev,
                                        GFP_NOIO, NULL);
                        clear_bit(FLUSH_PENDING, &device->flags);
                        if (rv) {
                                drbd_info(device, "local disk flush failed with status %d\n", rv);
                                /* would rather check on EOPNOTSUPP, but that is not reliable.
                                 * don't try again for ANY return value != 0
                                 * if (rv == -EOPNOTSUPP) */
                                drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io);
                        }
                        put_ldev(device);
                        kref_put(&device->kref, drbd_destroy_device);

                        rcu_read_lock();
                        if (rv)
                                break;
                }
                rcu_read_unlock();
        }
}

/**
 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, possibly finishes it.
 * @connection: DRBD connection.
 * @epoch:      Epoch object.
 * @ev:         Epoch event.
 */
static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection,
                                               struct drbd_epoch *epoch,
                                               enum epoch_event ev)
{
        int epoch_size;
        struct drbd_epoch *next_epoch;
        enum finish_epoch rv = FE_STILL_LIVE;

        spin_lock(&connection->epoch_lock);
        do {
                next_epoch = NULL;

                epoch_size = atomic_read(&epoch->epoch_size);

                switch (ev & ~EV_CLEANUP) {
                case EV_PUT:
                        atomic_dec(&epoch->active);
                        break;
                case EV_GOT_BARRIER_NR:
                        set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
                        break;
                case EV_BECAME_LAST:
                        /* nothing to do */
                        break;
                }

                if (epoch_size != 0 &&
                    atomic_read(&epoch->active) == 0 &&
                    (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
                        if (!(ev & EV_CLEANUP)) {
                                spin_unlock(&connection->epoch_lock);
                                drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
                                spin_lock(&connection->epoch_lock);
                        }
#if 0
                        /* FIXME: dec unacked on connection, once we have
                         * something to count pending connection packets in. */
                        if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
                                dec_unacked(epoch->connection);
#endif

                        if (connection->current_epoch != epoch) {
                                next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
                                list_del(&epoch->list);
                                ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
                                connection->epochs--;
                                kfree(epoch);

                                if (rv == FE_STILL_LIVE)
                                        rv = FE_DESTROYED;
                        } else {
                                epoch->flags = 0;
                                atomic_set(&epoch->epoch_size, 0);
                                /* atomic_set(&epoch->active, 0); is already zero */
                                if (rv == FE_STILL_LIVE)
                                        rv = FE_RECYCLED;
                        }
                }

                if (!next_epoch)
                        break;

                epoch = next_epoch;
        } while (1);

        spin_unlock(&connection->epoch_lock);

        return rv;
}

static enum write_ordering_e
max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
{
        struct disk_conf *dc;

        dc = rcu_dereference(bdev->disk_conf);

        if (wo == WO_bdev_flush && !dc->disk_flushes)
                wo = WO_drain_io;
        if (wo == WO_drain_io && !dc->disk_drain)
                wo = WO_none;

        return wo;
}

/**
 * drbd_bump_write_ordering() - Fall back to another write ordering method
 * @resource:   DRBD resource.
 * @bdev:       backing device whose limits to honor, or NULL.
 * @wo:         Write ordering method to try.
 */
void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
                              enum write_ordering_e wo)
{
        struct drbd_device *device;
        enum write_ordering_e pwo;
        int vnr;
        static char *write_ordering_str[] = {
                [WO_none] = "none",
                [WO_drain_io] = "drain",
                [WO_bdev_flush] = "flush",
        };

        pwo = resource->write_ordering;
        if (wo != WO_bdev_flush)
                wo = min(pwo, wo);
        rcu_read_lock();
        idr_for_each_entry(&resource->devices, device, vnr) {
                if (get_ldev(device)) {
                        wo = max_allowed_wo(device->ldev, wo);
                        if (device->ldev == bdev)
                                bdev = NULL;
                        put_ldev(device);
                }
        }

        if (bdev)
                wo = max_allowed_wo(bdev, wo);

        rcu_read_unlock();

        resource->write_ordering = wo;
        if (pwo != resource->write_ordering || wo == WO_bdev_flush)
                drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
}

/**
 * drbd_submit_peer_request() - submit the bios for a peer request
 * @device:     DRBD device.
 * @peer_req:   peer request
 * @rw:         flag field, see bio->bi_rw
 * @fault_type: fault injection category, see drbd_generic_make_request()
 *
 * May spread the pages to multiple bios,
 * depending on bio_add_page restrictions.
 *
 * Returns 0 if all bios have been submitted,
 * -ENOMEM if we could not allocate enough bios,
 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
 *  single page to an empty bio (which should never happen and likely indicates
 *  that the lower level IO stack is in some way broken). This has been observed
 *  on certain Xen deployments.
 */
/* TODO allocate from our own bio_set. */
int drbd_submit_peer_request(struct drbd_device *device,
                             struct drbd_peer_request *peer_req,
                             const unsigned rw, const int fault_type)
{
        struct bio *bios = NULL;
        struct bio *bio;
        struct page *page = peer_req->pages;
        sector_t sector = peer_req->i.sector;
        unsigned data_size = peer_req->i.size;
        unsigned n_bios = 0;
        unsigned nr_pages = (data_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        int err = -ENOMEM;

        if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
                /* wait for all pending IO completions, before we start
                 * zeroing things out. */
                conn_wait_active_ee_empty(first_peer_device(device)->connection);
                /* add it to the active list now,
                 * so we can find it to present it in debugfs */
                peer_req->submit_jif = jiffies;
                peer_req->flags |= EE_SUBMITTED;
                spin_lock_irq(&device->resource->req_lock);
                list_add_tail(&peer_req->w.list, &device->active_ee);
                spin_unlock_irq(&device->resource->req_lock);
                if (blkdev_issue_zeroout(device->ldev->backing_bdev,
                        sector, data_size >> 9, GFP_NOIO, false))
                        peer_req->flags |= EE_WAS_ERROR;
                drbd_endio_write_sec_final(peer_req);
                return 0;
        }

        /* Discards don't have any payload.
         * But the scsi layer still expects a bio_vec it can use internally,
         * see sd_setup_discard_cmnd() and blk_add_request_payload(). */
        if (peer_req->flags & EE_IS_TRIM)
                nr_pages = 1;

        /* In most cases, we will only need one bio.  But in case the lower
         * level restrictions happen to be different at this offset on this
         * side than those of the sending peer, we may need to submit the
         * request in more than one bio.
         *
         * Plain bio_alloc is good enough here, this is no DRBD internally
         * generated bio, but a bio allocated on behalf of the peer.
         */
next_bio:
        bio = bio_alloc(GFP_NOIO, nr_pages);
        if (!bio) {
                drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
                goto fail;
        }
        /* > peer_req->i.sector, unless this is the first bio */
        bio->bi_iter.bi_sector = sector;
        bio->bi_bdev = device->ldev->backing_bdev;
        bio->bi_rw = rw;
        bio->bi_private = peer_req;
        bio->bi_end_io = drbd_peer_request_endio;

        bio->bi_next = bios;
        bios = bio;
        ++n_bios;

        if (rw & REQ_DISCARD) {
                bio->bi_iter.bi_size = data_size;
                goto submit;
        }

        page_chain_for_each(page) {
                unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
                if (!bio_add_page(bio, page, len, 0)) {
                        /* A single page must always be possible!
                         * But in case it fails anyway,
                         * we deal with it, and complain (below). */
1439                        if (bio->bi_vcnt == 0) {
1440                                drbd_err(device,
1441                                        "bio_add_page failed for len=%u, "
1442                                        "bi_vcnt=0 (bi_sector=%llu)\n",
1443                                        len, (uint64_t)bio->bi_iter.bi_sector);
1444                                err = -ENOSPC;
1445                                goto fail;
1446                        }
1447                        goto next_bio;
1448                }
1449                data_size -= len;
1450                sector += len >> 9;
1451                --nr_pages;
1452        }
1453        D_ASSERT(device, data_size == 0);
1454submit:
1455        D_ASSERT(device, page == NULL);
1456
1457        atomic_set(&peer_req->pending_bios, n_bios);
1458        /* for debugfs: update timestamp, mark as submitted */
1459        peer_req->submit_jif = jiffies;
1460        peer_req->flags |= EE_SUBMITTED;
1461        do {
1462                bio = bios;
1463                bios = bios->bi_next;
1464                bio->bi_next = NULL;
1465
1466                drbd_generic_make_request(device, fault_type, bio);
1467        } while (bios);
1468        return 0;
1469
1470fail:
1471        while (bios) {
1472                bio = bios;
1473                bios = bios->bi_next;
1474                bio_put(bio);
1475        }
1476        return err;
1477}
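/*
 * Illustration (editor's sketch, not driver logic): with 4 KiB pages, a
 * 32 KiB peer request arrives as a chain of 8 pages, which normally all
 * fit into the single bio allocated at next_bio:.  If bio_add_page()
 * refuses a page because of lower-level queue limits on this node, the
 * loop jumps back to next_bio: with `sector` already advanced, so the
 * follow-up bio continues exactly where the previous one stopped;
 * n_bios counts the chain for the pending_bios accounting.
 */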
1478
1479static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
1480                                             struct drbd_peer_request *peer_req)
1481{
1482        struct drbd_interval *i = &peer_req->i;
1483
1484        drbd_remove_interval(&device->write_requests, i);
1485        drbd_clear_interval(i);
1486
1487        /* Wake up any processes waiting for this peer request to complete.  */
1488        if (i->waiting)
1489                wake_up(&device->misc_wait);
1490}
1491
1492static void conn_wait_active_ee_empty(struct drbd_connection *connection)
1493{
1494        struct drbd_peer_device *peer_device;
1495        int vnr;
1496
1497        rcu_read_lock();
1498        idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1499                struct drbd_device *device = peer_device->device;
1500
1501                kref_get(&device->kref);
1502                rcu_read_unlock();
1503                drbd_wait_ee_list_empty(device, &device->active_ee);
1504                kref_put(&device->kref, drbd_destroy_device);
1505                rcu_read_lock();
1506        }
1507        rcu_read_unlock();
1508}
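/*
 * Note on the locking pattern above: drbd_wait_ee_list_empty() may sleep,
 * which is not allowed under rcu_read_lock().  The kref_get() pins the
 * device across the unlock/sleep/relock window, and idr_for_each_entry()
 * then resumes the iteration at the following volume number.
 */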
1509
1510static struct drbd_peer_device *
1511conn_peer_device(struct drbd_connection *connection, int volume_number)
1512{
1513        return idr_find(&connection->peer_devices, volume_number);
1514}
1515
1516static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
1517{
1518        int rv;
1519        struct p_barrier *p = pi->data;
1520        struct drbd_epoch *epoch;
1521
1522        /* FIXME these are unacked on connection,
1523         * not a specific (peer)device.
1524         */
1525        connection->current_epoch->barrier_nr = p->barrier;
1526        connection->current_epoch->connection = connection;
1527        rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR);
1528
1529        /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1530         * the activity log, which means it would not be resynced in case the
1531         * R_PRIMARY crashes now.
1532         * Therefore we must send the barrier_ack after the barrier request was
1533         * completed. */
1534        switch (connection->resource->write_ordering) {
1535        case WO_none:
1536                if (rv == FE_RECYCLED)
1537                        return 0;
1538
1539                /* receiver context, in the writeout path of the other node.
1540                 * avoid potential distributed deadlock */
1541                epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1542                if (epoch)
1543                        break;
1544                else
1545                        drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
1546                        /* Fall through */
1547
1548        case WO_bdev_flush:
1549        case WO_drain_io:
1550                conn_wait_active_ee_empty(connection);
1551                drbd_flush(connection);
1552
1553                if (atomic_read(&connection->current_epoch->epoch_size)) {
1554                        epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1555                        if (epoch)
1556                                break;
1557                }
1558
1559                return 0;
1560        default:
1561                drbd_err(connection, "Strangeness in resource->write_ordering %d\n",
1562                         connection->resource->write_ordering);
1563                return -EIO;
1564        }
1565
1566        epoch->flags = 0;
1567        atomic_set(&epoch->epoch_size, 0);
1568        atomic_set(&epoch->active, 0);
1569
1570        spin_lock(&connection->epoch_lock);
1571        if (atomic_read(&connection->current_epoch->epoch_size)) {
1572                list_add(&epoch->list, &connection->current_epoch->list);
1573                connection->current_epoch = epoch;
1574                connection->epochs++;
1575        } else {
1576                /* The current_epoch got recycled while we allocated this one... */
1577                kfree(epoch);
1578        }
1579        spin_unlock(&connection->epoch_lock);
1580
1581        return 0;
1582}
1583
1584/* used from receive_RSDataReply (recv_resync_read)
1585 * and from receive_Data */
1586static struct drbd_peer_request *
1587read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1588              struct packet_info *pi) __must_hold(local)
1589{
1590        struct drbd_device *device = peer_device->device;
1591        const sector_t capacity = drbd_get_capacity(device->this_bdev);
1592        struct drbd_peer_request *peer_req;
1593        struct page *page;
1594        int digest_size, err;
1595        unsigned int data_size = pi->size, ds;
1596        void *dig_in = peer_device->connection->int_dig_in;
1597        void *dig_vv = peer_device->connection->int_dig_vv;
1598        unsigned long *data;
1599        struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
1600
1601        digest_size = 0;
1602        if (!trim && peer_device->connection->peer_integrity_tfm) {
1603                digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
1604                /*
1605                 * FIXME: Receive the incoming digest into the receive buffer
1606                 *        here, together with its struct p_data?
1607                 */
1608                err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
1609                if (err)
1610                        return NULL;
1611                data_size -= digest_size;
1612        }
1613
1614        if (trim) {
1615                D_ASSERT(peer_device, data_size == 0);
1616                data_size = be32_to_cpu(trim->size);
1617        }
1618
1619        if (!expect(IS_ALIGNED(data_size, 512)))
1620                return NULL;
1621        /* prepare for larger trim requests. */
1622        if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE))
1623                return NULL;
1624
1625        /* even though we trust our peer,
1626         * we sometimes have to double check. */
1627        if (sector + (data_size>>9) > capacity) {
1628                drbd_err(device, "request from peer beyond end of local disk: "
1629                        "capacity: %llus < sector: %llus + size: %u\n",
1630                        (unsigned long long)capacity,
1631                        (unsigned long long)sector, data_size);
1632                return NULL;
1633        }
1634
1635        /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1636         * "criss-cross" setup, that might cause write-out on some other DRBD,
1637         * which in turn might block on the other node at this very place.  */
1638        peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO);
1639        if (!peer_req)
1640                return NULL;
1641
1642        peer_req->flags |= EE_WRITE;
1643        if (trim)
1644                return peer_req;
1645
1646        ds = data_size;
1647        page = peer_req->pages;
1648        page_chain_for_each(page) {
1649                unsigned len = min_t(int, ds, PAGE_SIZE);
1650                data = kmap(page);
1651                err = drbd_recv_all_warn(peer_device->connection, data, len);
1652                if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
1653                        drbd_err(device, "Fault injection: Corrupting data on receive\n");
1654                        data[0] = data[0] ^ (unsigned long)-1;
1655                }
1656                kunmap(page);
1657                if (err) {
1658                        drbd_free_peer_req(device, peer_req);
1659                        return NULL;
1660                }
1661                ds -= len;
1662        }
1663
1664        if (digest_size) {
1665                drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv);
1666                if (memcmp(dig_in, dig_vv, digest_size)) {
1667                        drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
1668                                (unsigned long long)sector, data_size);
1669                        drbd_free_peer_req(device, peer_req);
1670                        return NULL;
1671                }
1672        }
1673        device->recv_cnt += data_size >> 9;
1674        return peer_req;
1675}
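/*
 * Sketch of the wire layout consumed by read_in_block(): after the p_data
 * header the stream carries [ integrity digest (optional) | payload ].
 * If a peer integrity transform is configured, digest_size bytes are read
 * first and data_size shrinks accordingly; the digest recomputed over the
 * received payload must then match.  For P_TRIM there is no payload at
 * all; the affected size comes from trim->size instead.
 */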
1676
1677/* drbd_drain_block() just takes a data block
1678 * out of the socket input buffer, and discards it.
1679 */
1680static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
1681{
1682        struct page *page;
1683        int err = 0;
1684        void *data;
1685
1686        if (!data_size)
1687                return 0;
1688
1689        page = drbd_alloc_pages(peer_device, 1, 1);
1690
1691        data = kmap(page);
1692        while (data_size) {
1693                unsigned int len = min_t(int, data_size, PAGE_SIZE);
1694
1695                err = drbd_recv_all_warn(peer_device->connection, data, len);
1696                if (err)
1697                        break;
1698                data_size -= len;
1699        }
1700        kunmap(page);
1701        drbd_free_pages(peer_device->device, page, 0);
1702        return err;
1703}
1704
1705static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
1706                           sector_t sector, int data_size)
1707{
1708        struct bio_vec bvec;
1709        struct bvec_iter iter;
1710        struct bio *bio;
1711        int digest_size, err, expect;
1712        void *dig_in = peer_device->connection->int_dig_in;
1713        void *dig_vv = peer_device->connection->int_dig_vv;
1714
1715        digest_size = 0;
1716        if (peer_device->connection->peer_integrity_tfm) {
1717                digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
1718                err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
1719                if (err)
1720                        return err;
1721                data_size -= digest_size;
1722        }
1723
1724        /* optimistically update recv_cnt.  if receiving fails below,
1725         * we disconnect anyways, and counters will be reset. */
1726        peer_device->device->recv_cnt += data_size>>9;
1727
1728        bio = req->master_bio;
1729        D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
1730
1731        bio_for_each_segment(bvec, bio, iter) {
1732                void *mapped = kmap(bvec.bv_page) + bvec.bv_offset;
1733                expect = min_t(int, data_size, bvec.bv_len);
1734                err = drbd_recv_all_warn(peer_device->connection, mapped, expect);
1735                kunmap(bvec.bv_page);
1736                if (err)
1737                        return err;
1738                data_size -= expect;
1739        }
1740
1741        if (digest_size) {
1742                drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv);
1743                if (memcmp(dig_in, dig_vv, digest_size)) {
1744                        drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n");
1745                        return -EINVAL;
1746                }
1747        }
1748
1749        D_ASSERT(peer_device->device, data_size == 0);
1750        return 0;
1751}
1752
1753/*
1754 * e_end_resync_block() is called in asender context via
1755 * drbd_finish_peer_reqs().
1756 */
1757static int e_end_resync_block(struct drbd_work *w, int unused)
1758{
1759        struct drbd_peer_request *peer_req =
1760                container_of(w, struct drbd_peer_request, w);
1761        struct drbd_peer_device *peer_device = peer_req->peer_device;
1762        struct drbd_device *device = peer_device->device;
1763        sector_t sector = peer_req->i.sector;
1764        int err;
1765
1766        D_ASSERT(device, drbd_interval_empty(&peer_req->i));
1767
1768        if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1769                drbd_set_in_sync(device, sector, peer_req->i.size);
1770                err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
1771        } else {
1772                /* Record failure to sync */
1773                drbd_rs_failed_io(device, sector, peer_req->i.size);
1774
1775                err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
1776        }
1777        dec_unacked(device);
1778
1779        return err;
1780}
1781
1782static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
1783                            struct packet_info *pi) __releases(local)
1784{
1785        struct drbd_device *device = peer_device->device;
1786        struct drbd_peer_request *peer_req;
1787
1788        peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
1789        if (!peer_req)
1790                goto fail;
1791
1792        dec_rs_pending(device);
1793
1794        inc_unacked(device);
1795        /* corresponding dec_unacked() in e_end_resync_block(),
1796         * or in _drbd_clear_done_ee(), respectively */
1797
1798        peer_req->w.cb = e_end_resync_block;
1799        peer_req->submit_jif = jiffies;
1800
1801        spin_lock_irq(&device->resource->req_lock);
1802        list_add_tail(&peer_req->w.list, &device->sync_ee);
1803        spin_unlock_irq(&device->resource->req_lock);
1804
1805        atomic_add(pi->size >> 9, &device->rs_sect_ev);
1806        if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
1807                return 0;
1808
1809        /* don't care for the reason here */
1810        drbd_err(device, "submit failed, triggering re-connect\n");
1811        spin_lock_irq(&device->resource->req_lock);
1812        list_del(&peer_req->w.list);
1813        spin_unlock_irq(&device->resource->req_lock);
1814
1815        drbd_free_peer_req(device, peer_req);
1816fail:
1817        put_ldev(device);
1818        return -EIO;
1819}
1820
1821static struct drbd_request *
1822find_request(struct drbd_device *device, struct rb_root *root, u64 id,
1823             sector_t sector, bool missing_ok, const char *func)
1824{
1825        struct drbd_request *req;
1826
1827        /* Request object according to our peer */
1828        req = (struct drbd_request *)(unsigned long)id;
1829        if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
1830                return req;
1831        if (!missing_ok) {
1832                drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
1833                        (unsigned long)id, (unsigned long long)sector);
1834        }
1835        return NULL;
1836}
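/*
 * Note: the id passed in here is the pointer value of our own struct
 * drbd_request, which we sent to the peer in the corresponding data packet
 * and which the peer echoes back in its reply.  drbd_contains_interval()
 * verifies that this pointer really is a known request at that sector
 * before it is trusted any further.
 */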
1837
1838static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
1839{
1840        struct drbd_peer_device *peer_device;
1841        struct drbd_device *device;
1842        struct drbd_request *req;
1843        sector_t sector;
1844        int err;
1845        struct p_data *p = pi->data;
1846
1847        peer_device = conn_peer_device(connection, pi->vnr);
1848        if (!peer_device)
1849                return -EIO;
1850        device = peer_device->device;
1851
1852        sector = be64_to_cpu(p->sector);
1853
1854        spin_lock_irq(&device->resource->req_lock);
1855        req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
1856        spin_unlock_irq(&device->resource->req_lock);
1857        if (unlikely(!req))
1858                return -EIO;
1859
1860        /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
1861         * special casing it there for the various failure cases.
1862         * still no race with drbd_fail_pending_reads */
1863        err = recv_dless_read(peer_device, req, sector, pi->size);
1864        if (!err)
1865                req_mod(req, DATA_RECEIVED);
1866        /* else: nothing. handled from drbd_disconnect...
1867         * I don't think we may complete this just yet
1868         * in case we are "on-disconnect: freeze" */
1869
1870        return err;
1871}
1872
1873static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
1874{
1875        struct drbd_peer_device *peer_device;
1876        struct drbd_device *device;
1877        sector_t sector;
1878        int err;
1879        struct p_data *p = pi->data;
1880
1881        peer_device = conn_peer_device(connection, pi->vnr);
1882        if (!peer_device)
1883                return -EIO;
1884        device = peer_device->device;
1885
1886        sector = be64_to_cpu(p->sector);
1887        D_ASSERT(device, p->block_id == ID_SYNCER);
1888
1889        if (get_ldev(device)) {
1890                /* data is submitted to disk within recv_resync_read.
1891                 * corresponding put_ldev done below on error,
1892                 * or in drbd_peer_request_endio. */
1893                err = recv_resync_read(peer_device, sector, pi);
1894        } else {
1895                if (__ratelimit(&drbd_ratelimit_state))
1896                        drbd_err(device, "Can not write resync data to local disk.\n");
1897
1898                err = drbd_drain_block(peer_device, pi->size);
1899
1900                drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
1901        }
1902
1903        atomic_add(pi->size >> 9, &device->rs_sect_in);
1904
1905        return err;
1906}
1907
1908static void restart_conflicting_writes(struct drbd_device *device,
1909                                       sector_t sector, int size)
1910{
1911        struct drbd_interval *i;
1912        struct drbd_request *req;
1913
1914        drbd_for_each_overlap(i, &device->write_requests, sector, size) {
1915                if (!i->local)
1916                        continue;
1917                req = container_of(i, struct drbd_request, i);
1918                if (req->rq_state & RQ_LOCAL_PENDING ||
1919                    !(req->rq_state & RQ_POSTPONED))
1920                        continue;
1921                /* as it is RQ_POSTPONED, this will cause it to
1922                 * be queued on the retry workqueue. */
1923                __req_mod(req, CONFLICT_RESOLVED, NULL);
1924        }
1925}
1926
1927/*
1928 * e_end_block() is called in asender context via drbd_finish_peer_reqs().
1929 */
1930static int e_end_block(struct drbd_work *w, int cancel)
1931{
1932        struct drbd_peer_request *peer_req =
1933                container_of(w, struct drbd_peer_request, w);
1934        struct drbd_peer_device *peer_device = peer_req->peer_device;
1935        struct drbd_device *device = peer_device->device;
1936        sector_t sector = peer_req->i.sector;
1937        int err = 0, pcmd;
1938
1939        if (peer_req->flags & EE_SEND_WRITE_ACK) {
1940                if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1941                        pcmd = (device->state.conn >= C_SYNC_SOURCE &&
1942                                device->state.conn <= C_PAUSED_SYNC_T &&
1943                                peer_req->flags & EE_MAY_SET_IN_SYNC) ?
1944                                P_RS_WRITE_ACK : P_WRITE_ACK;
1945                        err = drbd_send_ack(peer_device, pcmd, peer_req);
1946                        if (pcmd == P_RS_WRITE_ACK)
1947                                drbd_set_in_sync(device, sector, peer_req->i.size);
1948                } else {
1949                        err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
1950                        /* we expect it to be marked out of sync anyways...
1951                         * maybe assert this?  */
1952                }
1953                dec_unacked(device);
1954        }
1955
1956        /* we delete from the conflict detection hash _after_ we sent out the
1957         * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
1958        if (peer_req->flags & EE_IN_INTERVAL_TREE) {
1959                spin_lock_irq(&device->resource->req_lock);
1960                D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
1961                drbd_remove_epoch_entry_interval(device, peer_req);
1962                if (peer_req->flags & EE_RESTART_REQUESTS)
1963                        restart_conflicting_writes(device, sector, peer_req->i.size);
1964                spin_unlock_irq(&device->resource->req_lock);
1965        } else
1966                D_ASSERT(device, drbd_interval_empty(&peer_req->i));
1967
1968        drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
1969
1970        return err;
1971}
1972
1973static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
1974{
1975        struct drbd_peer_request *peer_req =
1976                container_of(w, struct drbd_peer_request, w);
1977        struct drbd_peer_device *peer_device = peer_req->peer_device;
1978        int err;
1979
1980        err = drbd_send_ack(peer_device, ack, peer_req);
1981        dec_unacked(peer_device->device);
1982
1983        return err;
1984}
1985
1986static int e_send_superseded(struct drbd_work *w, int unused)
1987{
1988        return e_send_ack(w, P_SUPERSEDED);
1989}
1990
1991static int e_send_retry_write(struct drbd_work *w, int unused)
1992{
1993        struct drbd_peer_request *peer_req =
1994                container_of(w, struct drbd_peer_request, w);
1995        struct drbd_connection *connection = peer_req->peer_device->connection;
1996
1997        return e_send_ack(w, connection->agreed_pro_version >= 100 ?
1998                             P_RETRY_WRITE : P_SUPERSEDED);
1999}
2000
2001static bool seq_greater(u32 a, u32 b)
2002{
2003        /*
2004         * We assume 32-bit wrap-around here.
2005         * For 24-bit wrap-around, we would have to shift:
2006         *  a <<= 8; b <<= 8;
2007         */
2008        return (s32)a - (s32)b > 0;
2009}
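/*
 * Example of the wrap-around behavior: seq_greater(1, 0xffffffff) is true,
 * because (s32)1 - (s32)0xffffffff == 1 - (-1) == 2 > 0; a sequence number
 * that just wrapped is still recognized as newer.  Conversely, for
 * seq_greater(0xffffffff, 1) the signed difference is -2, so it is false.
 */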
2010
2011static u32 seq_max(u32 a, u32 b)
2012{
2013        return seq_greater(a, b) ? a : b;
2014}
2015
2016static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
2017{
2018        struct drbd_device *device = peer_device->device;
2019        unsigned int newest_peer_seq;
2020
2021        if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) {
2022                spin_lock(&device->peer_seq_lock);
2023                newest_peer_seq = seq_max(device->peer_seq, peer_seq);
2024                device->peer_seq = newest_peer_seq;
2025                spin_unlock(&device->peer_seq_lock);
2026                /* wake up only if we actually changed device->peer_seq */
2027                if (peer_seq == newest_peer_seq)
2028                        wake_up(&device->seq_wait);
2029        }
2030}
2031
2032static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
2033{
2034        return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
2035}
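/*
 * Example: overlaps(8, 4096, 12, 2048) is true: l1 and l2 are byte counts,
 * so the first interval covers sectors [8, 16) and the second [12, 16).
 * Two intervals are disjoint exactly if one ends at or before the start of
 * the other, which is what the negated condition expresses.
 */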
2036
2037/* maybe change sync_ee into interval trees as well? */
2038static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
2039{
2040        struct drbd_peer_request *rs_req;
2041        bool rv = false;
2042
2043        spin_lock_irq(&device->resource->req_lock);
2044        list_for_each_entry(rs_req, &device->sync_ee, w.list) {
2045                if (overlaps(peer_req->i.sector, peer_req->i.size,
2046                             rs_req->i.sector, rs_req->i.size)) {
2047                        rv = true;
2048                        break;
2049                }
2050        }
2051        spin_unlock_irq(&device->resource->req_lock);
2052
2053        return rv;
2054}
2055
2056/* Called from receive_Data.
2057 * Synchronize packets on sock with packets on msock.
2058 *
2059 * This is here so even when a P_DATA packet traveling via sock overtook an Ack
2060 * packet traveling on msock, they are still processed in the order they have
2061 * been sent.
2062 *
2063 * Note: we don't care for Ack packets overtaking P_DATA packets.
2064 *
2065 * In case packet_seq is larger than device->peer_seq number, there are
2066 * outstanding packets on the msock. We wait for them to arrive.
2067 * In case we are the logically next packet, we update device->peer_seq
2068 * ourselves. Correctly handles 32bit wrap around.
2069 *
2070 * Assume we have a 10 GBit connection, that is about 1<<30 bytes per second,
2071 * or about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
2072 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
2073 * 1<<11 == 2048 seconds aka ages for the 32bit wrap around...
2074 *
2075 * returns 0 if we may process the packet,
2076 * -ERESTARTSYS if we were interrupted (by disconnect signal). */
2077static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
2078{
2079        struct drbd_device *device = peer_device->device;
2080        DEFINE_WAIT(wait);
2081        long timeout;
2082        int ret = 0, tp;
2083
2084        if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
2085                return 0;
2086
2087        spin_lock(&device->peer_seq_lock);
2088        for (;;) {
2089                if (!seq_greater(peer_seq - 1, device->peer_seq)) {
2090                        device->peer_seq = seq_max(device->peer_seq, peer_seq);
2091                        break;
2092                }
2093
2094                if (signal_pending(current)) {
2095                        ret = -ERESTARTSYS;
2096                        break;
2097                }
2098
2099                rcu_read_lock();
2100                tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries;
2101                rcu_read_unlock();
2102
2103                if (!tp)
2104                        break;
2105
2106                /* Only need to wait if two_primaries is enabled */
2107                prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
2108                spin_unlock(&device->peer_seq_lock);
2109                rcu_read_lock();
2110                timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10;
2111                rcu_read_unlock();
2112                timeout = schedule_timeout(timeout);
2113                spin_lock(&device->peer_seq_lock);
2114                if (!timeout) {
2115                        ret = -ETIMEDOUT;
2116                        drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
2117                        break;
2118                }
2119        }
2120        spin_unlock(&device->peer_seq_lock);
2121        finish_wait(&device->seq_wait, &wait);
2122        return ret;
2123}
2124
2125/* see also bio_flags_to_wire():
2126 * we need to semantically map data packet flags to bio flags (DRBD_REQ_*)
2127 * and back, since we may replicate to other kernel versions. */
2128static unsigned long wire_flags_to_bio(u32 dpf)
2129{
2130        return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
2131                (dpf & DP_FUA ? REQ_FUA : 0) |
2132                (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
2133                (dpf & DP_DISCARD ? REQ_DISCARD : 0);
2134}
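/*
 * Round-trip example: a peer that sent DP_FUA | DP_FLUSH gets those mapped
 * back to REQ_FUA | REQ_FLUSH here, mirroring bio_flags_to_wire() on the
 * sending side, so the write ordering semantics survive the wire encoding
 * even between different kernel versions.
 */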
2135
2136static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
2137                                    unsigned int size)
2138{
2139        struct drbd_interval *i;
2140
2141    repeat:
2142        drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2143                struct drbd_request *req;
2144                struct bio_and_error m;
2145
2146                if (!i->local)
2147                        continue;
2148                req = container_of(i, struct drbd_request, i);
2149                if (!(req->rq_state & RQ_POSTPONED))
2150                        continue;
2151                req->rq_state &= ~RQ_POSTPONED;
2152                __req_mod(req, NEG_ACKED, &m);
2153                spin_unlock_irq(&device->resource->req_lock);
2154                if (m.bio)
2155                        complete_master_bio(device, &m);
2156                spin_lock_irq(&device->resource->req_lock);
2157                goto repeat;
2158        }
2159}
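/*
 * The repeat label above is needed because complete_master_bio() requires
 * dropping req_lock; once the lock has been released, the interval tree
 * may have changed under us, so the overlap walk is restarted from scratch
 * rather than continued from a possibly stale iterator.
 */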
2160
2161static int handle_write_conflicts(struct drbd_device *device,
2162                                  struct drbd_peer_request *peer_req)
2163{
2164        struct drbd_connection *connection = peer_req->peer_device->connection;
2165        bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
2166        sector_t sector = peer_req->i.sector;
2167        const unsigned int size = peer_req->i.size;
2168        struct drbd_interval *i;
2169        bool equal;
2170        int err;
2171
2172        /*
2173         * Inserting the peer request into the write_requests tree will prevent
2174         * new conflicting local requests from being added.
2175         */
2176        drbd_insert_interval(&device->write_requests, &peer_req->i);
2177
2178    repeat:
2179        drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2180                if (i == &peer_req->i)
2181                        continue;
2182                if (i->completed)
2183                        continue;
2184
2185                if (!i->local) {
2186                        /*
2187                         * Our peer has sent a conflicting remote request; this
2188                         * should not happen in a two-node setup.  Wait for the
2189                         * earlier peer request to complete.
2190                         */
2191                        err = drbd_wait_misc(device, i);
2192                        if (err)
2193                                goto out;
2194                        goto repeat;
2195                }
2196
2197                equal = i->sector == sector && i->size == size;
2198                if (resolve_conflicts) {
2199                        /*
2200                         * If the peer request is fully contained within the
2201                         * overlapping request, it can be considered overwritten
2202                         * and thus superseded; otherwise, it will be retried
2203                         * once all overlapping requests have completed.
2204                         */
2205                        bool superseded = i->sector <= sector && i->sector +
2206                                       (i->size >> 9) >= sector + (size >> 9);
2207
2208                        if (!equal)
2209                                drbd_alert(device, "Concurrent writes detected: "
2210                                               "local=%llus +%u, remote=%llus +%u, "
2211                                               "assuming %s came first\n",
2212                                          (unsigned long long)i->sector, i->size,
2213                                          (unsigned long long)sector, size,
2214                                          superseded ? "local" : "remote");
2215
2216                        peer_req->w.cb = superseded ? e_send_superseded :
2217                                                   e_send_retry_write;
2218                        list_add_tail(&peer_req->w.list, &device->done_ee);
2219                        wake_asender(connection);
2220
2221                        err = -ENOENT;
2222                        goto out;
2223                } else {
2224                        struct drbd_request *req =
2225                                container_of(i, struct drbd_request, i);
2226
2227                        if (!equal)
2228                                drbd_alert(device, "Concurrent writes detected: "
2229                                               "local=%llus +%u, remote=%llus +%u\n",
2230                                          (unsigned long long)i->sector, i->size,
2231                                          (unsigned long long)sector, size);
2232
2233                        if (req->rq_state & RQ_LOCAL_PENDING ||
2234                            !(req->rq_state & RQ_POSTPONED)) {
2235                                /*
2236                                 * Wait for the node with the discard flag to
2237                                 * decide if this request has been superseded
2238                                 * or needs to be retried.
2239                                 * Requests that have been superseded will
2240                                 * disappear from the write_requests tree.
2241                                 *
2242                                 * In addition, wait for the conflicting
2243                                 * request to finish locally before submitting
2244                                 * the conflicting peer request.
2245                                 */
2246                                err = drbd_wait_misc(device, &req->i);
2247                                if (err) {
2248                                        _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
2249                                        fail_postponed_requests(device, sector, size);
2250                                        goto out;
2251                                }
2252                                goto repeat;
2253                        }
2254                        /*
2255                         * Remember to restart the conflicting requests after
2256                         * the new peer request has completed.
2257                         */
2258                        peer_req->flags |= EE_RESTART_REQUESTS;
2259                }
2260        }
2261        err = 0;
2262
2263    out:
2264        if (err)
2265                drbd_remove_epoch_entry_interval(device, peer_req);
2266        return err;
2267}
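/*
 * Sketch of the resolve-conflicts decision above: with a local write
 * covering sectors [0, 16) and an incoming peer write covering [4, 8),
 * the peer request is fully contained and thus "superseded"
 * (e_send_superseded is queued); a peer write covering [4, 24) would only
 * partially overlap and is queued for e_send_retry_write() instead, so the
 * peer retries it once the conflicting requests have completed.
 */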
2268
2269/* mirrored write */
2270static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
2271{
2272        struct drbd_peer_device *peer_device;
2273        struct drbd_device *device;
2274        struct net_conf *nc;
2275        sector_t sector;
2276        struct drbd_peer_request *peer_req;
2277        struct p_data *p = pi->data;
2278        u32 peer_seq = be32_to_cpu(p->seq_num);
2279        int rw = WRITE;
2280        u32 dp_flags;
2281        int err, tp;
2282
2283        peer_device = conn_peer_device(connection, pi->vnr);
2284        if (!peer_device)
2285                return -EIO;
2286        device = peer_device->device;
2287
2288        if (!get_ldev(device)) {
2289                int err2;
2290
2291                err = wait_for_and_update_peer_seq(peer_device, peer_seq);
2292                drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
2293                atomic_inc(&connection->current_epoch->epoch_size);
2294                err2 = drbd_drain_block(peer_device, pi->size);
2295                if (!err)
2296                        err = err2;
2297                return err;
2298        }
2299
2300        /*
2301         * Corresponding put_ldev done either below (on various errors), or in
2302         * drbd_peer_request_endio, if we successfully submit the data at the
2303         * end of this function.
2304         */
2305
2306        sector = be64_to_cpu(p->sector);
2307        peer_req = read_in_block(peer_device, p->block_id, sector, pi);
2308        if (!peer_req) {
2309                put_ldev(device);
2310                return -EIO;
2311        }
2312
2313        peer_req->w.cb = e_end_block;
2314        peer_req->submit_jif = jiffies;
2315        peer_req->flags |= EE_APPLICATION;
2316
2317        dp_flags = be32_to_cpu(p->dp_flags);
2318        rw |= wire_flags_to_bio(dp_flags);
2319        if (pi->cmd == P_TRIM) {
2320                struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
2321                peer_req->flags |= EE_IS_TRIM;
2322                if (!blk_queue_discard(q))
2323                        peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
2324                D_ASSERT(peer_device, peer_req->i.size > 0);
2325                D_ASSERT(peer_device, rw & REQ_DISCARD);
2326                D_ASSERT(peer_device, peer_req->pages == NULL);
2327        } else if (peer_req->pages == NULL) {
2328                D_ASSERT(device, peer_req->i.size == 0);
2329                D_ASSERT(device, dp_flags & DP_FLUSH);
2330        }
2331
2332        if (dp_flags & DP_MAY_SET_IN_SYNC)
2333                peer_req->flags |= EE_MAY_SET_IN_SYNC;
2334
2335        spin_lock(&connection->epoch_lock);
2336        peer_req->epoch = connection->current_epoch;
2337        atomic_inc(&peer_req->epoch->epoch_size);
2338        atomic_inc(&peer_req->epoch->active);
2339        spin_unlock(&connection->epoch_lock);
2340
2341        rcu_read_lock();
2342        nc = rcu_dereference(peer_device->connection->net_conf);
2343        tp = nc->two_primaries;
2344        if (peer_device->connection->agreed_pro_version < 100) {
2345                switch (nc->wire_protocol) {
2346                case DRBD_PROT_C:
2347                        dp_flags |= DP_SEND_WRITE_ACK;
2348                        break;
2349                case DRBD_PROT_B:
2350                        dp_flags |= DP_SEND_RECEIVE_ACK;
2351                        break;
2352                }
2353        }
2354        rcu_read_unlock();
2355
2356        if (dp_flags & DP_SEND_WRITE_ACK) {
2357                peer_req->flags |= EE_SEND_WRITE_ACK;
2358                inc_unacked(device);
2359                /* corresponding dec_unacked() in e_end_block(),
2360                 * or in _drbd_clear_done_ee(), respectively */
2361        }
2362
2363        if (dp_flags & DP_SEND_RECEIVE_ACK) {
2364                /* I really don't like it that the receiver thread
2365                 * sends on the msock, but anyways */
2366                drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
2367        }
2368
2369        if (tp) {
2370                /* two primaries implies protocol C */
2371                D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
2372                peer_req->flags |= EE_IN_INTERVAL_TREE;
2373                err = wait_for_and_update_peer_seq(peer_device, peer_seq);
2374                if (err)
2375                        goto out_interrupted;
2376                spin_lock_irq(&device->resource->req_lock);
2377                err = handle_write_conflicts(device, peer_req);
2378                if (err) {
2379                        spin_unlock_irq(&device->resource->req_lock);
2380                        if (err == -ENOENT) {
2381                                put_ldev(device);
2382                                return 0;
2383                        }
2384                        goto out_interrupted;
2385                }
2386        } else {
2387                update_peer_seq(peer_device, peer_seq);
2388                spin_lock_irq(&device->resource->req_lock);
2389        }
2390        /* if we use the zeroout fallback code, we process synchronously,
2391         * and drbd_submit_peer_request() waits for all pending requests,
2392         * i.e. for active_ee to become empty;
2393         * better not add ourselves here. */
2394        if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
2395                list_add_tail(&peer_req->w.list, &device->active_ee);
2396        spin_unlock_irq(&device->resource->req_lock);
2397
2398        if (device->state.conn == C_SYNC_TARGET)
2399                wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
2400
2401        if (device->state.pdsk < D_INCONSISTENT) {
2402                /* In case we have the only disk of the cluster: mark the peer out of sync for this range. */
2403                drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
2404                peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
2405                drbd_al_begin_io(device, &peer_req->i);
2406                peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
2407        }
2408
2409        err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
2410        if (!err)
2411                return 0;
2412
2413        /* don't care for the reason here */
2414        drbd_err(device, "submit failed, triggering re-connect\n");
2415        spin_lock_irq(&device->resource->req_lock);
2416        list_del(&peer_req->w.list);
2417        drbd_remove_epoch_entry_interval(device, peer_req);
2418        spin_unlock_irq(&device->resource->req_lock);
2419        if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
2420                peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
2421                drbd_al_complete_io(device, &peer_req->i);
2422        }
2423
2424out_interrupted:
2425        drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
2426        put_ldev(device);
2427        drbd_free_peer_req(device, peer_req);
2428        return err;
2429}
2430
2431/* We may throttle resync, if the lower device seems to be busy,
2432 * and current sync rate is above c_min_rate.
2433 *
2434 * To decide whether or not the lower device is busy, we use a scheme similar
2435 * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
2436 * (more than 64 sectors) of activity we cannot account for with our own resync
2437 * activity, it obviously is "busy".
2438 *
2439 * The current sync rate used here uses only the most recent two step marks,
2440 * to have a short time average so we can react faster.
2441 */
2442bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
2443                bool throttle_if_app_is_waiting)
2444{
2445        struct lc_element *tmp;
2446        bool throttle = drbd_rs_c_min_rate_throttle(device);
2447
2448        if (!throttle || throttle_if_app_is_waiting)
2449                return throttle;
2450
2451        spin_lock_irq(&device->al_lock);
2452        tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
2453        if (tmp) {
2454                struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
2455                if (test_bit(BME_PRIORITY, &bm_ext->flags))
2456                        throttle = false;
2457                /* Do not slow down if app IO is already waiting for this extent,
2458                 * and our progress is necessary for application IO to complete. */
2459        }
2460        spin_unlock_irq(&device->al_lock);
2461
2462        return throttle;
2463}
2464
2465bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
2466{
2467        struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
2468        unsigned long db, dt, dbdt;
2469        unsigned int c_min_rate;
2470        int curr_events;
2471
2472        rcu_read_lock();
2473        c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
2474        rcu_read_unlock();
2475
2476        /* feature disabled? */
2477        if (c_min_rate == 0)
2478                return false;
2479
2480        curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
2481                      (int)part_stat_read(&disk->part0, sectors[1]) -
2482                      atomic_read(&device->rs_sect_ev);
2483
2484        if (atomic_read(&device->ap_actlog_cnt)
2485            || curr_events - device->rs_last_events > 64) {
2486                unsigned long rs_left;
2487                int i;
2488
2489                device->rs_last_events = curr_events;
2490
2491                /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2492                 * approx. */
2493                i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
2494
2495                if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
2496                        rs_left = device->ov_left;
2497                else
2498                        rs_left = drbd_bm_total_weight(device) - device->rs_failed;
2499
2500                dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ;
2501                if (!dt)
2502                        dt++;
2503                db = device->rs_mark_left[i] - rs_left;
2504                dbdt = Bit2KB(db/dt);
2505
2506                if (dbdt > c_min_rate)
2507                        return true;
2508        }
2509        return false;
2510}
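/*
 * Worked example for the rate check above, assuming one bitmap bit covers
 * BM_BLOCK_SIZE (4 KiB): if db == 1024 bits of resync progress happened
 * over dt == 2 seconds, then dbdt == Bit2KB(512) == 2048 KiB/s; with e.g.
 * c_min_rate == 250 (KiB/s) this exceeds the floor, so resync is throttled.
 */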
2511
2512static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
2513{
2514        struct drbd_peer_device *peer_device;
2515        struct drbd_device *device;
2516        sector_t sector;
2517        sector_t capacity;
2518        struct drbd_peer_request *peer_req;
2519        struct digest_info *di = NULL;
2520        int size, verb;
2521        unsigned int fault_type;
2522        struct p_block_req *p = pi->data;
2523
2524        peer_device = conn_peer_device(connection, pi->vnr);
2525        if (!peer_device)
2526                return -EIO;
2527        device = peer_device->device;
2528        capacity = drbd_get_capacity(device->this_bdev);
2529
2530        sector = be64_to_cpu(p->sector);
2531        size   = be32_to_cpu(p->blksize);
2532
2533        if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
2534                drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2535                                (unsigned long long)sector, size);
2536                return -EINVAL;
2537        }
2538        if (sector + (size>>9) > capacity) {
2539                drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2540                                (unsigned long long)sector, size);
2541                return -EINVAL;
2542        }
2543
2544        if (!get_ldev_if_state(device, D_UP_TO_DATE)) {
2545                verb = 1;
2546                switch (pi->cmd) {
2547                case P_DATA_REQUEST:
2548                        drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
2549                        break;
2550                case P_RS_DATA_REQUEST:
2551                case P_CSUM_RS_REQUEST:
2552                case P_OV_REQUEST:
2553                        drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY, p);
2554                        break;
2555                case P_OV_REPLY:
2556                        verb = 0;
2557                        dec_rs_pending(device);
2558                        drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
2559                        break;
2560                default:
2561                        BUG();
2562                }
2563                if (verb && __ratelimit(&drbd_ratelimit_state))
2564                        drbd_err(device, "Can not satisfy peer's read request, "
2565                            "no local data.\n");
2566
2567                /* drain possible payload */
2568                return drbd_drain_block(peer_device, pi->size);
2569        }
2570
2571        /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2572         * "criss-cross" setup, that might cause write-out on some other DRBD,
2573         * which in turn might block on the other node at this very place.  */
2574        peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
2575                        true /* has real payload */, GFP_NOIO);
2576        if (!peer_req) {
2577                put_ldev(device);
2578                return -ENOMEM;
2579        }
2580
2581        switch (pi->cmd) {
2582        case P_DATA_REQUEST:
2583                peer_req->w.cb = w_e_end_data_req;
2584                fault_type = DRBD_FAULT_DT_RD;
2585                /* application IO, don't drbd_rs_begin_io */
2586                peer_req->flags |= EE_APPLICATION;
2587                goto submit;
2588
2589        case P_RS_DATA_REQUEST:
2590                peer_req->w.cb = w_e_end_rsdata_req;
2591                fault_type = DRBD_FAULT_RS_RD;
2592                /* used in the sector offset progress display */
2593                device->bm_resync_fo = BM_SECT_TO_BIT(sector);
2594                break;
2595
2596        case P_OV_REPLY:
2597        case P_CSUM_RS_REQUEST:
2598                fault_type = DRBD_FAULT_RS_RD;
2599                di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
2600                if (!di)
2601                        goto out_free_e;
2602
2603                di->digest_size = pi->size;
2604                di->digest = (((char *)di)+sizeof(struct digest_info));
2605
2606                peer_req->digest = di;
2607                peer_req->flags |= EE_HAS_DIGEST;
2608
2609                if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
2610                        goto out_free_e;
2611
2612                if (pi->cmd == P_CSUM_RS_REQUEST) {
2613                        D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
2614                        peer_req->w.cb = w_e_end_csum_rs_req;
2615                        /* used in the sector offset progress display */
2616                        device->bm_resync_fo = BM_SECT_TO_BIT(sector);
2617                        /* remember to report stats in drbd_resync_finished */
2618                        device->use_csums = true;
2619                } else if (pi->cmd == P_OV_REPLY) {
2620                        /* track progress, we may need to throttle */
2621                        atomic_add(size >> 9, &device->rs_sect_in);
2622                        peer_req->w.cb = w_e_end_ov_reply;
2623                        dec_rs_pending(device);
2624                        /* drbd_rs_begin_io done when we sent this request,
2625                         * but accounting still needs to be done. */
2626                        goto submit_for_resync;
2627                }
2628                break;
2629
2630        case P_OV_REQUEST:
2631                if (device->ov_start_sector == ~(sector_t)0 &&
2632                    peer_device->connection->agreed_pro_version >= 90) {
2633                        unsigned long now = jiffies;
2634                        int i;
2635                        device->ov_start_sector = sector;
2636                        device->ov_position = sector;
2637                        device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
2638                        device->rs_total = device->ov_left;
2639                        for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2640                                device->rs_mark_left[i] = device->ov_left;
2641                                device->rs_mark_time[i] = now;
2642                        }
2643                        drbd_info(device, "Online Verify start sector: %llu\n",
2644                                        (unsigned long long)sector);
2645                }
2646                peer_req->w.cb = w_e_end_ov_req;
2647                fault_type = DRBD_FAULT_RS_RD;
2648                break;
2649
2650        default:
2651                BUG();
2652        }
2653
2654        /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2655         * wrt the receiver, but it is not as straightforward as it may seem.
2656         * Various places in the resync start and stop logic assume resync
2657         * requests are processed in order, requeuing this on the worker thread
2658         * introduces a bunch of new code for synchronization between threads.
2659         *
2660         * Unlimited throttling before drbd_rs_begin_io may stall the resync
2661         * "forever", throttling after drbd_rs_begin_io will lock that extent
2662         * for application writes for the same time.  For now, just throttle
2663         * here, where the rest of the code expects the receiver to sleep for
2664         * a while, anyways.
2665         */
2666
2667        /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2668         * this defers syncer requests for some time, before letting at least
2669         * one request through.  The resync controller on the receiving side
2670         * will adapt to the incoming rate accordingly.
2671         *
2672         * We cannot throttle here if remote is Primary/SyncTarget:
2673         * we would also throttle its application reads.
2674         * In that case, throttling is done on the SyncTarget only.
2675         */
2676
2677        /* Even though this may be a resync request, we do add to "read_ee";
2678         * "sync_ee" is only used for resync WRITEs.
2679         * Add to list early, so debugfs can find this request
2680         * even if we have to sleep below. */
2681        spin_lock_irq(&device->resource->req_lock);
2682        list_add_tail(&peer_req->w.list, &device->read_ee);
2683        spin_unlock_irq(&device->resource->req_lock);
2684
2685        update_receiver_timing_details(connection, drbd_rs_should_slow_down);
2686        if (device->state.peer != R_PRIMARY &&
2687            drbd_rs_should_slow_down(device, sector, false))
2688                schedule_timeout_uninterruptible(HZ/10);
2689        update_receiver_timing_details(connection, drbd_rs_begin_io);
2690        if (drbd_rs_begin_io(device, sector))
2691                goto out_free_e;
2692
2693submit_for_resync:
2694        atomic_add(size >> 9, &device->rs_sect_ev);
2695
2696submit:
2697        update_receiver_timing_details(connection, drbd_submit_peer_request);
2698        inc_unacked(device);
2699        if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
2700                return 0;
2701
2702        /* don't care for the reason here */
2703        drbd_err(device, "submit failed, triggering re-connect\n");
2704
2705out_free_e:
2706        spin_lock_irq(&device->resource->req_lock);
2707        list_del(&peer_req->w.list);
2708        spin_unlock_irq(&device->resource->req_lock);
2709        /* no drbd_rs_complete_io(), we are dropping the connection anyways */
2710
2711        put_ldev(device);
2712        drbd_free_peer_req(device, peer_req);
2713        return -EIO;
2714}
2715
2716/**
2717 * drbd_asb_recover_0p  -  Recover after split-brain with no remaining primaries
2718 * Return: 1 keep local data; -1 discard local data; -100 no automatic decision. */
2719static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
2720{
2721        struct drbd_device *device = peer_device->device;
2722        int self, peer, rv = -100;
2723        unsigned long ch_self, ch_peer;
2724        enum drbd_after_sb_p after_sb_0p;
2725
2726        self = device->ldev->md.uuid[UI_BITMAP] & 1;
2727        peer = device->p_uuid[UI_BITMAP] & 1;
2728
2729        ch_peer = device->p_uuid[UI_SIZE];
2730        ch_self = device->comm_bm_set;
2731
2732        rcu_read_lock();
2733        after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
2734        rcu_read_unlock();
2735        switch (after_sb_0p) {
2736        case ASB_CONSENSUS:
2737        case ASB_DISCARD_SECONDARY:
2738        case ASB_CALL_HELPER:
2739        case ASB_VIOLENTLY:
2740                drbd_err(device, "Configuration error.\n");
2741                break;
2742        case ASB_DISCONNECT:
2743                break;
2744        case ASB_DISCARD_YOUNGER_PRI:
2745                if (self == 0 && peer == 1) {
2746                        rv = -1;
2747                        break;
2748                }
2749                if (self == 1 && peer == 0) {
2750                        rv =  1;
2751                        break;
2752                }
2753                /* Else fall through to one of the other strategies... */
2754        case ASB_DISCARD_OLDER_PRI:
2755                if (self == 0 && peer == 1) {
2756                        rv = 1;
2757                        break;
2758                }
2759                if (self == 1 && peer == 0) {
2760                        rv = -1;
2761                        break;
2762                }
2763                /* Else fall through to one of the other strategies... */
2764                drbd_warn(device, "Discard younger/older primary did not find a decision\n"
2765                     "Using discard-least-changes instead\n");
2766        case ASB_DISCARD_ZERO_CHG:
2767                if (ch_peer == 0 && ch_self == 0) {
2768                        rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
2769                                ? -1 : 1;
2770                        break;
2771                } else {
2772                        if (ch_peer == 0) { rv =  1; break; }
2773                        if (ch_self == 0) { rv = -1; break; }
2774                }
2775                if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
2776                        break;
2777        case ASB_DISCARD_LEAST_CHG:
2778                if      (ch_self < ch_peer)
2779                        rv = -1;
2780                else if (ch_self > ch_peer)
2781                        rv =  1;
2782                else /* ( ch_self == ch_peer ) */
2783                     /* Well, then use something else. */
2784                        rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
2785                                ? -1 : 1;
2786                break;
2787        case ASB_DISCARD_LOCAL:
2788                rv = -1;
2789                break;
2790        case ASB_DISCARD_REMOTE:
2791                rv =  1;
2792        }
2793
2794        return rv;
2795}
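/*
 * Worked example (editor's illustration, not driver code): with
 * after-sb-0pri set to discard-least-changes, suppose this node changed
 * ch_self = 12 blocks since the split while the peer changed ch_peer = 40.
 * Then ch_self < ch_peer yields rv = -1: we become sync target and our 12
 * changes are discarded.  The sign convention used by all of these
 * recovery helpers is rv < 0 => sync target (discard local changes),
 * rv > 0 => sync source (discard remote changes), rv = -100 => unresolved.
 */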
2796
2797/**
2798 * drbd_asb_recover_1p  -  Recover after split-brain with one remaining primary
2799 */
2800static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
2801{
2802        struct drbd_device *device = peer_device->device;
2803        int hg, rv = -100;
2804        enum drbd_after_sb_p after_sb_1p;
2805
2806        rcu_read_lock();
2807        after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
2808        rcu_read_unlock();
2809        switch (after_sb_1p) {
2810        case ASB_DISCARD_YOUNGER_PRI:
2811        case ASB_DISCARD_OLDER_PRI:
2812        case ASB_DISCARD_LEAST_CHG:
2813        case ASB_DISCARD_LOCAL:
2814        case ASB_DISCARD_REMOTE:
2815        case ASB_DISCARD_ZERO_CHG:
2816                drbd_err(device, "Configuration error.\n");
2817                break;
2818        case ASB_DISCONNECT:
2819                break;
2820        case ASB_CONSENSUS:
2821                hg = drbd_asb_recover_0p(peer_device);
2822                if (hg == -1 && device->state.role == R_SECONDARY)
2823                        rv = hg;
2824                if (hg == 1  && device->state.role == R_PRIMARY)
2825                        rv = hg;
2826                break;
2827        case ASB_VIOLENTLY:
2828                rv = drbd_asb_recover_0p(peer_device);
2829                break;
2830        case ASB_DISCARD_SECONDARY:
2831                return device->state.role == R_PRIMARY ? 1 : -1;
2832        case ASB_CALL_HELPER:
2833                hg = drbd_asb_recover_0p(peer_device);
2834                if (hg == -1 && device->state.role == R_PRIMARY) {
2835                        enum drbd_state_rv rv2;
2836
2837                         /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE;
2838                          * we might be here in C_WF_REPORT_PARAMS, which is transient.
2839                          * We do not need to wait for the after-state-change work either. */
2840                        rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
2841                        if (rv2 != SS_SUCCESS) {
2842                                drbd_khelper(device, "pri-lost-after-sb");
2843                        } else {
2844                                drbd_warn(device, "Successfully gave up primary role.\n");
2845                                rv = hg;
2846                        }
2847                } else
2848                        rv = hg;
2849        }
2850
2851        return rv;
2852}
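/*
 * Worked example (editor's illustration, not driver code): with
 * after-sb-1pri set to consensus, the 0-primaries policy above is
 * consulted first.  If it answers hg = -1 (discard local) and this node
 * is Secondary, the verdict is accepted; if it answers hg = -1 but this
 * node is the one remaining Primary, rv stays -100 and the split brain
 * is left unresolved, leading to a disconnect further up the call chain.
 */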
2853
2854/**
2855 * drbd_asb_recover_2p  -  Recover after split-brain with two remaining primaries
2856 */
2857static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
2858{
2859        struct drbd_device *device = peer_device->device;
2860        int hg, rv = -100;
2861        enum drbd_after_sb_p after_sb_2p;
2862
2863        rcu_read_lock();
2864        after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
2865        rcu_read_unlock();
2866        switch (after_sb_2p) {
2867        case ASB_DISCARD_YOUNGER_PRI:
2868        case ASB_DISCARD_OLDER_PRI:
2869        case ASB_DISCARD_LEAST_CHG:
2870        case ASB_DISCARD_LOCAL:
2871        case ASB_DISCARD_REMOTE:
2872        case ASB_CONSENSUS:
2873        case ASB_DISCARD_SECONDARY:
2874        case ASB_DISCARD_ZERO_CHG:
2875                drbd_err(device, "Configuration error.\n");
2876                break;
2877        case ASB_VIOLENTLY:
2878                rv = drbd_asb_recover_0p(peer_device);
2879                break;
2880        case ASB_DISCONNECT:
2881                break;
2882        case ASB_CALL_HELPER:
2883                hg = drbd_asb_recover_0p(peer_device);
2884                if (hg == -1) {
2885                        enum drbd_state_rv rv2;
2886
2887                         /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE;
2888                          * we might be here in C_WF_REPORT_PARAMS, which is transient.
2889                          * We do not need to wait for the after-state-change work either. */
2890                        rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
2891                        if (rv2 != SS_SUCCESS) {
2892                                drbd_khelper(device, "pri-lost-after-sb");
2893                        } else {
2894                                drbd_warn(device, "Successfully gave up primary role.\n");
2895                                rv = hg;
2896                        }
2897                } else
2898                        rv = hg;
2899        }
2900
2901        return rv;
2902}
2903
2904static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
2905                           u64 bits, u64 flags)
2906{
2907        if (!uuid) {
2908                drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
2909                return;
2910        }
2911        drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2912             text,
2913             (unsigned long long)uuid[UI_CURRENT],
2914             (unsigned long long)uuid[UI_BITMAP],
2915             (unsigned long long)uuid[UI_HISTORY_START],
2916             (unsigned long long)uuid[UI_HISTORY_END],
2917             (unsigned long long)bits,
2918             (unsigned long long)flags);
2919}
2920
2921/*
2922  100   after split brain try auto recover
2923    2   C_SYNC_SOURCE set BitMap
2924    1   C_SYNC_SOURCE use BitMap
2925    0   no Sync
2926   -1   C_SYNC_TARGET use BitMap
2927   -2   C_SYNC_TARGET set BitMap
2928 -100   after split brain, disconnect
2929-1000   unrelated data
2930-1091   requires proto 91
2931-1096   requires proto 96
2932 */
2933static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local)
2934{
2935        struct drbd_peer_device *const peer_device = first_peer_device(device);
2936        struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
2937        u64 self, peer;
2938        int i, j;
2939
2940        self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2941        peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
2942
2943        *rule_nr = 10;
2944        if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2945                return 0;
2946
2947        *rule_nr = 20;
2948        if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2949             peer != UUID_JUST_CREATED)
2950                return -2;
2951
2952        *rule_nr = 30;
2953        if (self != UUID_JUST_CREATED &&
2954            (peer == UUID_JUST_CREATED || peer == (u64)0))
2955                return 2;
2956
2957        if (self == peer) {
2958                int rct, dc; /* roles at crash time */
2959
2960                if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2961
2962                        if (connection->agreed_pro_version < 91)
2963                                return -1091;
2964
2965                        if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2966                            (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2967                                drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
2968                                drbd_uuid_move_history(device);
2969                                device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
2970                                device->ldev->md.uuid[UI_BITMAP] = 0;
2971
2972                                drbd_uuid_dump(device, "self", device->ldev->md.uuid,
2973                                               device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
2974                                *rule_nr = 34;
2975                        } else {
2976                                drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
2977                                *rule_nr = 36;
2978                        }
2979
2980                        return 1;
2981                }
2982
2983                if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
2984
2985                        if (connection->agreed_pro_version < 91)
2986                                return -1091;
2987
2988                        if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2989                            (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2990                                drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2991
2992                                device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
2993                                device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
2994                                device->p_uuid[UI_BITMAP] = 0UL;
2995
2996                                drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
2997                                *rule_nr = 35;
2998                        } else {
2999                                drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
3000                                *rule_nr = 37;
3001                        }
3002
3003                        return -1;
3004                }
3005
3006                /* Common power [off|failure] */
3007                rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
3008                        (device->p_uuid[UI_FLAGS] & 2);
3009                /* lowest bit is set when we were primary,
3010                 * next bit (weight 2) is set when peer was primary */
3011                *rule_nr = 40;
3012
3013                switch (rct) {
3014                case 0: /* !self_pri && !peer_pri */ return 0;
3015                case 1: /*  self_pri && !peer_pri */ return 1;
3016                case 2: /* !self_pri &&  peer_pri */ return -1;
3017                case 3: /*  self_pri &&  peer_pri */
3018                        dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
3019                        return dc ? -1 : 1;
3020                }
3021        }
3022
3023        *rule_nr = 50;
3024        peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
3025        if (self == peer)
3026                return -1;
3027
3028        *rule_nr = 51;
3029        peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
3030        if (self == peer) {
3031                if (connection->agreed_pro_version < 96 ?
3032                    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
3033                    (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
3034                    peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
3035                        /* The last P_SYNC_UUID did not get through. Undo the changes the
3036                           peer made to its UUIDs when it last started a resync as sync source. */
3037
3038                        if (connection->agreed_pro_version < 91)
3039                                return -1091;
3040
3041                        device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
3042                        device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];
3043
3044                        drbd_info(device, "Lost last syncUUID packet, corrected:\n");
3045                        drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
3046
3047                        return -1;
3048                }
3049        }
3050
3051        *rule_nr = 60;
3052        self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
3053        for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
3054                peer = device->p_uuid[i] & ~((u64)1);
3055                if (self == peer)
3056                        return -2;
3057        }
3058
3059        *rule_nr = 70;
3060        self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
3061        peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
3062        if (self == peer)
3063                return 1;
3064
3065        *rule_nr = 71;
3066        self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
3067        if (self == peer) {
3068                if (connection->agreed_pro_version < 96 ?
3069                    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
3070                    (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
3071                    self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
3072                        /* The last P_SYNC_UUID did not get through. Undo the changes we
3073                           made to our own UUIDs when we last started a resync as sync source. */
3074
3075                        if (connection->agreed_pro_version < 91)
3076                                return -1091;
3077
3078                        __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
3079                        __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);
3080
3081                        drbd_info(device, "Last syncUUID did not get through, corrected:\n");
3082                        drbd_uuid_dump(device, "self", device->ldev->md.uuid,
3083                                       device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
3084
3085                        return 1;
3086                }
3087        }
3088
3089
3090        *rule_nr = 80;
3091        peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
3092        for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
3093                self = device->ldev->md.uuid[i] & ~((u64)1);
3094                if (self == peer)
3095                        return 2;
3096        }
3097
3098        *rule_nr = 90;
3099        self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
3100        peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
3101        if (self == peer && self != ((u64)0))
3102                return 100;
3103
3104        *rule_nr = 100;
3105        for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
3106                self = device->ldev->md.uuid[i] & ~((u64)1);
3107                for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
3108                        peer = device->p_uuid[j] & ~((u64)1);
3109                        if (self == peer)
3110                                return -100;
3111                }
3112        }
3113
3114        return -1000;
3115}
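/*
 * Editor's sketch, not called anywhere in the driver: every comparison in
 * drbd_uuid_compare() masks off bit 0 of a UUID slot first, because that
 * bit carries role information (see the "& 1" tests in
 * drbd_asb_recover_0p()) rather than data-generation identity.  The
 * helper below, with an invented name, restates that idiom.
 */
static inline bool example_same_data_generation(u64 a, u64 b)
{
	a &= ~(u64)1;	/* strip the role flag bit */
	b &= ~(u64)1;
	/* a zero slot means "no UUID here" and never matches (cf. rule 90) */
	return a != 0 && a == b;
}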
3116
3117/* drbd_sync_handshake() returns the new conn state on success, or
3118   C_MASK on failure.
3119 */
3120static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
3121                                           enum drbd_role peer_role,
3122                                           enum drbd_disk_state peer_disk) __must_hold(local)
3123{
3124        struct drbd_device *device = peer_device->device;
3125        enum drbd_conns rv = C_MASK;
3126        enum drbd_disk_state mydisk;
3127        struct net_conf *nc;
3128        int hg, rule_nr, rr_conflict, tentative;
3129
3130        mydisk = device->state.disk;
3131        if (mydisk == D_NEGOTIATING)
3132                mydisk = device->new_state_tmp.disk;
3133
3134        drbd_info(device, "drbd_sync_handshake:\n");
3135
3136        spin_lock_irq(&device->ldev->md.uuid_lock);
3137        drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
3138        drbd_uuid_dump(device, "peer", device->p_uuid,
3139                       device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
3140
3141        hg = drbd_uuid_compare(device, &rule_nr);
3142        spin_unlock_irq(&device->ldev->md.uuid_lock);
3143
3144        drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
3145
3146        if (hg == -1000) {
3147                drbd_alert(device, "Unrelated data, aborting!\n");
3148                return C_MASK;
3149        }
3150        if (hg < -1000) {
3151                drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
3152                return C_MASK;
3153        }
3154
3155        if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
3156            (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
3157                int f = (hg == -100) || abs(hg) == 2;
3158                hg = mydisk > D_INCONSISTENT ? 1 : -1;
3159                if (f)
3160                        hg = hg*2;
3161                drbd_info(device, "Becoming sync %s due to disk states.\n",
3162                     hg > 0 ? "source" : "target");
3163        }
3164
3165        if (abs(hg) == 100)
3166                drbd_khelper(device, "initial-split-brain");
3167
3168        rcu_read_lock();
3169        nc = rcu_dereference(peer_device->connection->net_conf);
3170
3171        if (hg == 100 || (hg == -100 && nc->always_asbp)) {
3172                int pcount = (device->state.role == R_PRIMARY)
3173                           + (peer_role == R_PRIMARY);
3174                int forced = (hg == -100);
3175
3176                switch (pcount) {
3177                case 0:
3178                        hg = drbd_asb_recover_0p(peer_device);
3179                        break;
3180                case 1:
3181                        hg = drbd_asb_recover_1p(peer_device);
3182                        break;
3183                case 2:
3184                        hg = drbd_asb_recover_2p(peer_device);
3185                        break;
3186                }
3187                if (abs(hg) < 100) {
3188                        drbd_warn(device, "Split-Brain detected, %d primaries, "
3189                             "automatically solved. Sync from %s node\n",
3190                             pcount, (hg < 0) ? "peer" : "this");
3191                        if (forced) {
3192                                drbd_warn(device, "Doing a full sync, since"
3193                                     " UUIDs were ambiguous.\n");
3194                                hg = hg*2;
3195                        }
3196                }
3197        }
3198
3199        if (hg == -100) {
3200                if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
3201                        hg = -1;
3202                if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
3203                        hg = 1;
3204
3205                if (abs(hg) < 100)
3206                        drbd_warn(device, "Split-Brain detected, manually solved. "
3207                             "Sync from %s node\n",
3208                             (hg < 0) ? "peer" : "this");
3209        }
3210        rr_conflict = nc->rr_conflict;
3211        tentative = nc->tentative;
3212        rcu_read_unlock();
3213
3214        if (hg == -100) {
3215                /* FIXME this log message is not correct if we end up here
3216                 * after an attempted attach on a diskless node.
3217                 * We just refuse to attach -- well, we drop the "connection"
3218                 * to that disk, in a way... */
3219                drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
3220                drbd_khelper(device, "split-brain");
3221                return C_MASK;
3222        }
3223
3224        if (hg > 0 && mydisk <= D_INCONSISTENT) {
3225                drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
3226                return C_MASK;
3227        }
3228
3229        if (hg < 0 && /* by intention we do not use mydisk here. */
3230            device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
3231                switch (rr_conflict) {
3232                case ASB_CALL_HELPER:
3233                        drbd_khelper(device, "pri-lost");
3234                        /* fall through */
3235                case ASB_DISCONNECT:
3236                        drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
3237                        return C_MASK;
3238                case ASB_VIOLENTLY:
3239                        drbd_warn(device, "Becoming SyncTarget, violating the stable-data"
3240                             " assumption\n");
3241                }
3242        }
3243
3244        if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
3245                if (hg == 0)
3246                        drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
3247                else
3248                        drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.\n",
3249                                 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
3250                                 abs(hg) >= 2 ? "full" : "bit-map based");
3251                return C_MASK;
3252        }
3253
3254        if (abs(hg) >= 2) {
3255                drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
3256                if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
3257                                        BM_LOCKED_SET_ALLOWED))
3258                        return C_MASK;
3259        }
3260
3261        if (hg > 0) { /* become sync source. */
3262                rv = C_WF_BITMAP_S;
3263        } else if (hg < 0) { /* become sync target */
3264                rv = C_WF_BITMAP_T;
3265        } else {
3266                rv = C_CONNECTED;
3267                if (drbd_bm_total_weight(device)) {
3268                        drbd_info(device, "No resync, but %lu bits in bitmap!\n",
3269                             drbd_bm_total_weight(device));
3270                }
3271        }
3272
3273        return rv;
3274}
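/*
 * Worked example (editor's illustration): both nodes were Primary when
 * power failed.  On reconnect the current UUIDs still match, so
 * drbd_uuid_compare() reaches rule 40 with rct = 3 (both crashed as
 * primary) and the RESOLVE_CONFLICTS tie-breaker picks hg = -1 or 1.
 * abs(hg) < 2 means a bitmap-based resync, and drbd_sync_handshake()
 * returns C_WF_BITMAP_T or C_WF_BITMAP_S accordingly.
 */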
3275
3276static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
3277{
3278        /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
3279        if (peer == ASB_DISCARD_REMOTE)
3280                return ASB_DISCARD_LOCAL;
3281
3282        /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
3283        if (peer == ASB_DISCARD_LOCAL)
3284                return ASB_DISCARD_REMOTE;
3285
3286        /* everything else is valid if they are equal on both sides. */
3287        return peer;
3288}
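/*
 * Worked example (editor's illustration): the peer sends its own view of
 * each policy, so receive_protocol() below converts it before comparing.
 * If we are configured discard-local and the peer discard-remote,
 * convert_after_sb(ASB_DISCARD_REMOTE) == ASB_DISCARD_LOCAL matches our
 * nc->after_sb_0p and the two configurations are considered compatible.
 */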
3289
3290static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi)
3291{
3292        struct p_protocol *p = pi->data;
3293        enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
3294        int p_proto, p_discard_my_data, p_two_primaries, cf;
3295        struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
3296        char integrity_alg[SHARED_SECRET_MAX] = "";
3297        struct crypto_hash *peer_integrity_tfm = NULL;
3298        void *int_dig_in = NULL, *int_dig_vv = NULL;
3299
3300        p_proto         = be32_to_cpu(p->protocol);
3301        p_after_sb_0p   = be32_to_cpu(p->after_sb_0p);
3302        p_after_sb_1p   = be32_to_cpu(p->after_sb_1p);
3303        p_after_sb_2p   = be32_to_cpu(p->after_sb_2p);
3304        p_two_primaries = be32_to_cpu(p->two_primaries);
3305        cf              = be32_to_cpu(p->conn_flags);
3306        p_discard_my_data = cf & CF_DISCARD_MY_DATA;
3307
3308        if (connection->agreed_pro_version >= 87) {
3309                int err;
3310
3311                if (pi->size > sizeof(integrity_alg))
3312                        return -EIO;
3313                err = drbd_recv_all(connection, integrity_alg, pi->size);
3314                if (err)
3315                        return err;
3316                integrity_alg[SHARED_SECRET_MAX - 1] = 0;
3317        }
3318
3319        if (pi->cmd != P_PROTOCOL_UPDATE) {
3320                clear_bit(CONN_DRY_RUN, &connection->flags);
3321
3322                if (cf & CF_DRY_RUN)
3323                        set_bit(CONN_DRY_RUN, &connection->flags);
3324
3325                rcu_read_lock();
3326                nc = rcu_dereference(connection->net_conf);
3327
3328                if (p_proto != nc->wire_protocol) {
3329                        drbd_err(connection, "incompatible %s settings\n", "protocol");
3330                        goto disconnect_rcu_unlock;
3331                }
3332
3333                if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
3334                        drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri");
3335                        goto disconnect_rcu_unlock;
3336                }
3337
3338                if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
3339                        drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri");
3340                        goto disconnect_rcu_unlock;
3341                }
3342
3343                if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
3344                        drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri");
3345                        goto disconnect_rcu_unlock;
3346                }
3347
3348                if (p_discard_my_data && nc->discard_my_data) {
3349                        drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
3350                        goto disconnect_rcu_unlock;
3351                }
3352
3353                if (p_two_primaries != nc->two_primaries) {
3354                        drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries");
3355                        goto disconnect_rcu_unlock;
3356                }
3357
3358                if (strcmp(integrity_alg, nc->integrity_alg)) {
3359                        drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg");
3360                        goto disconnect_rcu_unlock;
3361                }
3362
3363                rcu_read_unlock();
3364        }
3365
3366        if (integrity_alg[0]) {
3367                int hash_size;
3368
3369                /*
3370                 * We can only change the peer data integrity algorithm
3371                 * here.  Changing our own data integrity algorithm
3372                 * requires that we send a P_PROTOCOL_UPDATE packet at
3373                 * the same time; otherwise, the peer has no way to
3374                 * tell between which packets the algorithm should
3375                 * change.
3376                 */
3377
3378                peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
3379                if (!peer_integrity_tfm) {
3380                        drbd_err(connection, "peer data-integrity-alg %s not supported\n",
3381                                 integrity_alg);
3382                        goto disconnect;
3383                }
3384
3385                hash_size = crypto_hash_digestsize(peer_integrity_tfm);
3386                int_dig_in = kmalloc(hash_size, GFP_KERNEL);
3387                int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
3388                if (!(int_dig_in && int_dig_vv)) {
3389                        drbd_err(connection, "Allocation of buffers for data integrity checking failed\n");
3390                        goto disconnect;
3391                }
3392        }
3393
3394        new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
3395        if (!new_net_conf) {
3396                drbd_err(connection, "Allocation of new net_conf failed\n");
3397                goto disconnect;
3398        }
3399
3400        mutex_lock(&connection->data.mutex);
3401        mutex_lock(&connection->resource->conf_update);
3402        old_net_conf = connection->net_conf;
3403        *new_net_conf = *old_net_conf;
3404
3405        new_net_conf->wire_protocol = p_proto;
3406        new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
3407        new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
3408        new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
3409        new_net_conf->two_primaries = p_two_primaries;
3410
3411        rcu_assign_pointer(connection->net_conf, new_net_conf);
3412        mutex_unlock(&connection->resource->conf_update);
3413        mutex_unlock(&connection->data.mutex);
3414
3415        crypto_free_hash(connection->peer_integrity_tfm);
3416        kfree(connection->int_dig_in);
3417        kfree(connection->int_dig_vv);
3418        connection->peer_integrity_tfm = peer_integrity_tfm;
3419        connection->int_dig_in = int_dig_in;
3420        connection->int_dig_vv = int_dig_vv;
3421
3422        if (strcmp(old_net_conf->integrity_alg, integrity_alg))
3423                drbd_info(connection, "peer data-integrity-alg: %s\n",
3424                          integrity_alg[0] ? integrity_alg : "(none)");
3425
3426        synchronize_rcu();
3427        kfree(old_net_conf);
3428        return 0;
3429
3430disconnect_rcu_unlock:
3431        rcu_read_unlock();
3432disconnect:
3433        crypto_free_hash(peer_integrity_tfm);
3434        kfree(int_dig_in);
3435        kfree(int_dig_vv);
3436        conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
3437        return -EIO;
3438}
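/*
 * Editor's note: receive_protocol() above shows the update pattern used
 * throughout this file for RCU-protected configuration: allocate a new
 * struct, copy the old one into it, apply the changes, publish it with
 * rcu_assign_pointer(), wait out existing readers with synchronize_rcu(),
 * then kfree() the old copy.  receive_SyncParam() and receive_sizes()
 * below follow the same scheme for net_conf and disk_conf.
 */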
3439
3440/* helper function
3441 * input: alg name, feature name
3442 * return: NULL (alg name was "")
3443 *         ERR_PTR(error) if something goes wrong
3444 *         or the crypto hash ptr, if it worked out ok. */
3445static struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
3446                const char *alg, const char *name)
3447{
3448        struct crypto_hash *tfm;
3449
3450        if (!alg[0])
3451                return NULL;
3452
3453        tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
3454        if (IS_ERR(tfm)) {
3455                drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n",
3456                        alg, name, PTR_ERR(tfm));
3457                return tfm;
3458        }
3459        return tfm;
3460}
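/*
 * Editor's note on the contract above, as used by receive_SyncParam()
 * below: NULL means "no algorithm configured" (alg was ""), IS_ERR()
 * means allocation failed, anything else is a usable tfm:
 *
 *	tfm = drbd_crypto_alloc_digest_safe(device, p->verify_alg, "verify-alg");
 *	if (IS_ERR(tfm))
 *		goto disconnect;
 *	if (tfm)
 *		... install the new verify transform ...
 */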
3461
3462static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
3463{
3464        void *buffer = connection->data.rbuf;
3465        int size = pi->size;
3466
3467        while (size) {
3468                int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
3469                s = drbd_recv(connection, buffer, s);
3470                if (s <= 0) {
3471                        if (s < 0)
3472                                return s;
3473                        break;
3474                }
3475                size -= s;
3476        }
3477        if (size)
3478                return -EIO;
3479        return 0;
3480}
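/*
 * Editor's note: ignore_remaining_packet() drains pi->size bytes from the
 * socket in DRBD_SOCKET_BUFFER_SIZE chunks and discards them, keeping the
 * receiver in sync with the data stream while skipping a packet we cannot
 * process, as config_unknown_volume() below does.
 */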
3481
3482/*
3483 * config_unknown_volume  -  device configuration command for unknown volume
3484 *
3485 * When a device is added to an existing connection, the node on which the
3486 * device is added first will send configuration commands to its peer but the
3487 * peer will not know about the device yet.  It will warn and ignore these
3488 * commands.  Once the device is added on the second node, the second node will
3489 * send the same device configuration commands, but in the other direction.
3490 *
3491 * (We can also end up here if drbd is misconfigured.)
3492 */
3493static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
3494{
3495        drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
3496                  cmdname(pi->cmd), pi->vnr);
3497        return ignore_remaining_packet(connection, pi);
3498}
3499
3500static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
3501{
3502        struct drbd_peer_device *peer_device;
3503        struct drbd_device *device;
3504        struct p_rs_param_95 *p;
3505        unsigned int header_size, data_size, exp_max_sz;
3506        struct crypto_hash *verify_tfm = NULL;
3507        struct crypto_hash *csums_tfm = NULL;
3508        struct net_conf *old_net_conf, *new_net_conf = NULL;
3509        struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
3510        const int apv = connection->agreed_pro_version;
3511        struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
3512        int fifo_size = 0;
3513        int err;
3514
3515        peer_device = conn_peer_device(connection, pi->vnr);
3516        if (!peer_device)
3517                return config_unknown_volume(connection, pi);
3518        device = peer_device->device;
3519
3520        exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
3521                    : apv == 88 ? sizeof(struct p_rs_param)
3522                                        + SHARED_SECRET_MAX
3523                    : apv <= 94 ? sizeof(struct p_rs_param_89)
3524                    : /* apv >= 95 */ sizeof(struct p_rs_param_95);
3525
3526        if (pi->size > exp_max_sz) {
3527                drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n",
3528                    pi->size, exp_max_sz);
3529                return -EIO;
3530        }
3531
3532        if (apv <= 88) {
3533                header_size = sizeof(struct p_rs_param);
3534                data_size = pi->size - header_size;
3535        } else if (apv <= 94) {
3536                header_size = sizeof(struct p_rs_param_89);
3537                data_size = pi->size - header_size;
3538                D_ASSERT(device, data_size == 0);
3539        } else {
3540                header_size = sizeof(struct p_rs_param_95);
3541                data_size = pi->size - header_size;
3542                D_ASSERT(device, data_size == 0);
3543        }
3544
3545        /* initialize verify_alg and csums_alg */
3546        p = pi->data;
3547        memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
3548
3549        err = drbd_recv_all(peer_device->connection, p, header_size);
3550        if (err)
3551                return err;
3552
3553        mutex_lock(&connection->resource->conf_update);
3554        old_net_conf = peer_device->connection->net_conf;
3555        if (get_ldev(device)) {
3556                new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3557                if (!new_disk_conf) {
3558                        put_ldev(device);
3559                        mutex_unlock(&connection->resource->conf_update);
3560                        drbd_err(device, "Allocation of new disk_conf failed\n");
3561                        return -ENOMEM;
3562                }
3563
3564                old_disk_conf = device->ldev->disk_conf;
3565                *new_disk_conf = *old_disk_conf;
3566
3567                new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
3568        }
3569
3570        if (apv >= 88) {
3571                if (apv == 88) {
3572                        if (data_size > SHARED_SECRET_MAX || data_size == 0) {
3573                                drbd_err(device, "verify-alg of wrong size, "
3574                                        "peer wants %u, accepting only up to %u bytes\n",
3575                                        data_size, SHARED_SECRET_MAX);
3576                                err = -EIO;
3577                                goto reconnect;
3578                        }
3579
3580                        err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
3581                        if (err)
3582                                goto reconnect;
3583                        /* we expect NUL terminated string */
3584                        /* but just in case someone tries to be evil */
3585                        D_ASSERT(device, p->verify_alg[data_size-1] == 0);
3586                        p->verify_alg[data_size-1] = 0;
3587
3588                } else /* apv >= 89 */ {
3589                        /* we still expect NUL terminated strings */
3590                        /* but just in case someone tries to be evil */
3591                        D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0);
3592                        D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0);
3593                        p->verify_alg[SHARED_SECRET_MAX-1] = 0;
3594                        p->csums_alg[SHARED_SECRET_MAX-1] = 0;
3595                }
3596
3597                if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
3598                        if (device->state.conn == C_WF_REPORT_PARAMS) {
3599                                drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
3600                                    old_net_conf->verify_alg, p->verify_alg);
3601                                goto disconnect;
3602                        }
3603                        verify_tfm = drbd_crypto_alloc_digest_safe(device,
3604                                        p->verify_alg, "verify-alg");
3605                        if (IS_ERR(verify_tfm)) {
3606                                verify_tfm = NULL;
3607                                goto disconnect;
3608                        }
3609                }
3610
3611                if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
3612                        if (device->state.conn == C_WF_REPORT_PARAMS) {
3613                                drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
3614                                    old_net_conf->csums_alg, p->csums_alg);
3615                                goto disconnect;
3616                        }
3617                        csums_tfm = drbd_crypto_alloc_digest_safe(device,
3618                                        p->csums_alg, "csums-alg");
3619                        if (IS_ERR(csums_tfm)) {
3620                                csums_tfm = NULL;
3621                                goto disconnect;
3622                        }
3623                }
3624
3625                if (apv > 94 && new_disk_conf) {
3626                        new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
3627                        new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
3628                        new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
3629                        new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
3630
3631                        fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
3632                        if (fifo_size != device->rs_plan_s->size) {
3633                                new_plan = fifo_alloc(fifo_size);
3634                                if (!new_plan) {
3635                                        drbd_err(device, "kmalloc of fifo_buffer failed\n");
3636                                        put_ldev(device);
3637                                        goto disconnect;
3638                                }
3639                        }
3640                }
3641
3642                if (verify_tfm || csums_tfm) {
3643                        new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
3644                        if (!new_net_conf) {
3645                                drbd_err(device, "Allocation of new net_conf failed\n");
3646                                goto disconnect;
3647                        }
3648
3649                        *new_net_conf = *old_net_conf;
3650
3651                        if (verify_tfm) {
3652                                strcpy(new_net_conf->verify_alg, p->verify_alg);
3653                                new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
3654                                crypto_free_hash(peer_device->connection->verify_tfm);
3655                                peer_device->connection->verify_tfm = verify_tfm;
3656                                drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
3657                        }
3658                        if (csums_tfm) {
3659                                strcpy(new_net_conf->csums_alg, p->csums_alg);
3660                                new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
3661                                crypto_free_hash(peer_device->connection->csums_tfm);
3662                                peer_device->connection->csums_tfm = csums_tfm;
3663                                drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
3664                        }
3665                        rcu_assign_pointer(connection->net_conf, new_net_conf);
3666                }
3667        }
3668
3669        if (new_disk_conf) {
3670                rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
3671                put_ldev(device);
3672        }
3673
3674        if (new_plan) {
3675                old_plan = device->rs_plan_s;
3676                rcu_assign_pointer(device->rs_plan_s, new_plan);
3677        }
3678
3679        mutex_unlock(&connection->resource->conf_update);
3680        synchronize_rcu();
3681        if (new_net_conf)
3682                kfree(old_net_conf);
3683        kfree(old_disk_conf);
3684        kfree(old_plan);
3685
3686        return 0;
3687
3688reconnect:
3689        if (new_disk_conf) {
3690                put_ldev(device);
3691                kfree(new_disk_conf);
3692        }
3693        mutex_unlock(&connection->resource->conf_update);
3694        return -EIO;
3695
3696disconnect:
3697        kfree(new_plan);
3698        if (new_disk_conf) {
3699                put_ldev(device);
3700                kfree(new_disk_conf);
3701        }
3702        mutex_unlock(&connection->resource->conf_update);
3703        /* just for completeness: actually not needed,
3704         * as this is not reached if csums_tfm was ok. */
3705        crypto_free_hash(csums_tfm);
3706        /* but free the verify_tfm again, if csums_tfm did not work out */
3707        crypto_free_hash(verify_tfm);
3708        conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
3709        return -EIO;
3710}
3711
3712/* warn if the arguments differ by more than 12.5% */
3713static void warn_if_differ_considerably(struct drbd_device *device,
3714        const char *s, sector_t a, sector_t b)
3715{
3716        sector_t d;
3717        if (a == 0 || b == 0)
3718                return;
3719        d = (a > b) ? (a - b) : (b - a);
3720        if (d > (a>>3) || d > (b>>3))
3721                drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
3722                     (unsigned long long)a, (unsigned long long)b);
3723}
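/*
 * Worked example (editor's illustration): a >> 3 is a / 8, i.e. 12.5% of
 * a.  For a = 1000 and b = 900, d = 100 exceeds neither 1000 >> 3 = 125
 * nor 900 >> 3 = 112, so nothing is printed; for b = 800, d = 200 exceeds
 * both thresholds and the warning fires.
 */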
3724
3725static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
3726{
3727        struct drbd_peer_device *peer_device;
3728        struct drbd_device *device;
3729        struct p_sizes *p = pi->data;
3730        enum determine_dev_size dd = DS_UNCHANGED;
3731        sector_t p_size, p_usize, p_csize, my_usize;
3732        int ldsc = 0; /* local disk size changed */
3733        enum dds_flags ddsf;
3734
3735        peer_device = conn_peer_device(connection, pi->vnr);
3736        if (!peer_device)
3737                return config_unknown_volume(connection, pi);
3738        device = peer_device->device;
3739
3740        p_size = be64_to_cpu(p->d_size);
3741        p_usize = be64_to_cpu(p->u_size);
3742        p_csize = be64_to_cpu(p->c_size);
3743
3744        /* just store the peer's disk size for now.
3745         * we still need to figure out whether we accept that. */
3746        device->p_size = p_size;
3747
3748        if (get_ldev(device)) {
3749                rcu_read_lock();
3750                my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
3751                rcu_read_unlock();
3752
3753                warn_if_differ_considerably(device, "lower level device sizes",
3754                           p_size, drbd_get_max_capacity(device->ldev));
3755                warn_if_differ_considerably(device, "user requested size",
3756                                            p_usize, my_usize);
3757
3758                /* if this is the first connect, or an otherwise expected
3759                 * param exchange, choose the minimum */
3760                if (device->state.conn == C_WF_REPORT_PARAMS)
3761                        p_usize = min_not_zero(my_usize, p_usize);
3762
3763                /* Never shrink a device with usable data during connect.
3764                   But allow online shrinking if we are connected. */
3765                if (drbd_new_dev_size(device, device->ldev, p_usize, 0) <
3766                    drbd_get_capacity(device->this_bdev) &&
3767                    device->state.disk >= D_OUTDATED &&
3768                    device->state.conn < C_CONNECTED) {
3769                        drbd_err(device, "The peer's disk size is too small!\n");
3770                        conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
3771                        put_ldev(device);
3772                        return -EIO;
3773                }
3774
3775                if (my_usize != p_usize) {
3776                        struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
3777
3778                        new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3779                        if (!new_disk_conf) {
3780                                drbd_err(device, "Allocation of new disk_conf failed\n");
3781                                put_ldev(device);
3782                                return -ENOMEM;
3783                        }
3784
3785                        mutex_lock(&connection->resource->conf_update);
3786                        old_disk_conf = device->ldev->disk_conf;
3787                        *new_disk_conf = *old_disk_conf;
3788                        new_disk_conf->disk_size = p_usize;
3789
3790                        rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
3791                        mutex_unlock(&connection->resource->conf_update);
3792                        synchronize_rcu();
3793                        kfree(old_disk_conf);
3794
3795                        drbd_info(device, "Peer sets u_size to %lu sectors\n",
3796                                 (unsigned long)p_usize);
3797                }
3798
3799                put_ldev(device);
3800        }
3801
3802        device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
3803        /* Keep the call to drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
3804           In case we cleared the QUEUE_FLAG_DISCARD from our queue in
3805           drbd_reconsider_max_bio_size(), we can be sure that after
3806           drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
3807
3808        ddsf = be16_to_cpu(p->dds_flags);
3809        if (get_ldev(device)) {
3810                drbd_reconsider_max_bio_size(device, device->ldev);
3811                dd = drbd_determine_dev_size(device, ddsf, NULL);
3812                put_ldev(device);
3813                if (dd == DS_ERROR)
3814                        return -EIO;
3815                drbd_md_sync(device);
3816        } else {
3817                /*
3818                 * I am diskless and need to accept the peer's *current* size.
3819                 * I must NOT accept the peer's backing disk size;
3820                 * it may have been larger than mine all along...
3821                 *
3822                 * At this point, the peer knows more about my disk, or at
3823                 * least about what we last agreed upon, than I do.
3824                 * So if its c_size is less than its d_size, the most likely
3825                 * reason is that *my* d_size was smaller the last time we checked.
3826                 *
3827                 * However, if the peer sends a zero current size,
3828                 * take its (user-capped or) backing disk size anyway.
3829                 */
3830                drbd_reconsider_max_bio_size(device, NULL);
3831                drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
3832        }
3833
3834        if (get_ldev(device)) {
3835                if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
3836                        device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
3837                        ldsc = 1;
3838                }
3839
3840                put_ldev(device);
3841        }
3842
3843        if (device->state.conn > C_WF_REPORT_PARAMS) {
3844                if (be64_to_cpu(p->c_size) !=
3845                    drbd_get_capacity(device->this_bdev) || ldsc) {
3846                        /* we have different sizes, probably peer
3847                         * needs to know my new size... */
3848                        drbd_send_sizes(peer_device, 0, ddsf);
3849                }
3850                if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
3851                    (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
3852                        if (device->state.pdsk >= D_INCONSISTENT &&
3853                            device->state.disk >= D_INCONSISTENT) {
3854                                if (ddsf & DDSF_NO_RESYNC)
3855                                        drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
3856                                else
3857                                        resync_after_online_grow(device);
3858                        } else
3859                                set_bit(RESYNC_AFTER_NEG, &device->flags);
3860                }
3861        }
3862
3863        return 0;
3864}
3865
3866static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
3867{
3868        struct drbd_peer_device *peer_device;
3869        struct drbd_device *device;
3870        struct p_uuids *p = pi->data;
3871        u64 *p_uuid;
3872        int i, updated_uuids = 0;
3873
3874        peer_device = conn_peer_device(connection, pi->vnr);
3875        if (!peer_device)
3876                return config_unknown_volume(connection, pi);
3877        device = peer_device->device;
3878
3879        p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
3880        if (!p_uuid) {
3881                drbd_err(device, "kmalloc of p_uuid failed\n");
3882                return -ENOMEM;
3883        }
3884
3885        for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
3886                p_uuid[i] = be64_to_cpu(p->uuid[i]);
3887
3888        kfree(device->p_uuid);
3889        device->p_uuid = p_uuid;
3890
3891        if (device->state.conn < C_CONNECTED &&
3892            device->state.disk < D_INCONSISTENT &&
3893            device->state.role == R_PRIMARY &&
3894            (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
3895                drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
3896                    (unsigned long long)device->ed_uuid);
3897                conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
3898                return -EIO;
3899        }
3900
3901        if (get_ldev(device)) {
3902                int skip_initial_sync =
3903                        device->state.conn == C_CONNECTED &&
3904                        peer_device->connection->agreed_pro_version >= 90 &&
3905                        device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
3906                        (p_uuid[UI_FLAGS] & 8);
3907                if (skip_initial_sync) {
3908                        drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
3909                        drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
3910                                        "clear_n_write from receive_uuids",
3911                                        BM_LOCKED_TEST_ALLOWED);
3912                        _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
3913                        _drbd_uuid_set(device, UI_BITMAP, 0);
3914                        _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3915                                        CS_VERBOSE, NULL);
3916                        drbd_md_sync(device);
3917                        updated_uuids = 1;
3918                }
3919                put_ldev(device);
3920        } else if (device->state.disk < D_INCONSISTENT &&
3921                   device->state.role == R_PRIMARY) {
3922                /* I am a diskless primary, the peer just created a new current UUID
3923                   for me. */
3924                updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
3925        }
3926
3927        /* Before we test for the disk state, we should wait until a possibly
3928           ongoing cluster-wide state change has finished. That is important if
3929           we are primary and are detaching from our disk. We need to see the
3930           new disk state... */
3931        mutex_lock(device->state_mutex);
3932        mutex_unlock(device->state_mutex);
3933        if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
3934                updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
3935
3936        if (updated_uuids)
3937                drbd_print_uuids(device, "receiver updated UUIDs to");
3938
3939        return 0;
3940}
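/*
 * Editor's note on the bare mutex_lock()/mutex_unlock() pair above: taking
 * and immediately releasing device->state_mutex protects nothing by
 * itself; it only blocks until any state change currently holding the
 * mutex has completed, so the disk-state test that follows sees the
 * result of that change.
 */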
3941
3942/**
3943 * convert_state() - Converts the peer's view of the cluster state to our point of view
3944 * @ps:         The state as seen by the peer.
3945 */
3946static union drbd_state convert_state(union drbd_state ps)
3947{
3948        union drbd_state ms;
3949
3950        static enum drbd_conns c_tab[] = {
3951                [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
3952                [C_CONNECTED] = C_CONNECTED,
3953
3954                [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3955                [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3956                [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3957                [C_VERIFY_S]       = C_VERIFY_T,
3958                [C_MASK]   = C_MASK,
3959        };
3960
3961        ms.i = ps.i;
3962
3963        ms.conn = c_tab[ps.conn];
3964        ms.peer = ps.role;
3965        ms.role = ps.peer;
3966        ms.pdsk = ps.disk;
3967        ms.disk = ps.pdsk;
3968        ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3969
3970        return ms;
3971}
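/*
 * Worked example (editor's illustration): if the peer reports
 * role=Primary, peer=Secondary, disk=UpToDate, pdsk=Inconsistent,
 * conn=C_STARTING_SYNC_S, then from our point of view that is
 * peer=Primary, role=Secondary, pdsk=UpToDate, disk=Inconsistent,
 * conn=C_STARTING_SYNC_T: role and disk fields swap sides, and the
 * connection state is mirrored through c_tab[].
 */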
3972
3973static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
3974{
3975        struct drbd_peer_device *peer_device;
3976        struct drbd_device *device;
3977        struct p_req_state *p = pi->data;
3978        union drbd_state mask, val;
3979        enum drbd_state_rv rv;
3980
3981        peer_device = conn_peer_device(connection, pi->vnr);
3982        if (!peer_device)
3983                return -EIO;
3984        device = peer_device->device;
3985
3986        mask.i = be32_to_cpu(p->mask);
3987        val.i = be32_to_cpu(p->val);
3988
3989        if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
3990            mutex_is_locked(device->state_mutex)) {
3991                drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
3992                return 0;
3993        }
3994
3995        mask = convert_state(mask);
3996        val = convert_state(val);
3997
3998        rv = drbd_change_state(device, CS_VERBOSE, mask, val);
3999        drbd_send_sr_reply(peer_device, rv);
4000
4001        drbd_md_sync(device);
4002
4003        return 0;
4004}
4005
4006static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
4007{
4008        struct p_req_state *p = pi->data;
4009        union drbd_state mask, val;
4010        enum drbd_state_rv rv;
4011
4012        mask.i = be32_to_cpu(p->mask);
4013        val.i = be32_to_cpu(p->val);
4014
4015        if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
4016            mutex_is_locked(&connection->cstate_mutex)) {
4017                conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
4018                return 0;
4019        }
4020
4021        mask = convert_state(mask);
4022        val = convert_state(val);
4023
4024        rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
4025        conn_send_sr_reply(connection, rv);
4026
4027        return 0;
4028}
4029
4030static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
4031{
4032        struct drbd_peer_device *peer_device;
4033        struct drbd_device *device;
4034        struct p_state *p = pi->data;
4035        union drbd_state os, ns, peer_state;
4036        enum drbd_disk_state real_peer_disk;
4037        enum chg_state_flags cs_flags;
4038        int rv;
4039
4040        peer_device = conn_peer_device(connection, pi->vnr);
4041        if (!peer_device)
4042                return config_unknown_volume(connection, pi);
4043        device = peer_device->device;
4044
4045        peer_state.i = be32_to_cpu(p->state);
4046
4047        real_peer_disk = peer_state.disk;
4048        if (peer_state.disk == D_NEGOTIATING) {
4049                real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
4050                drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
4051        }
4052
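            /* We sample the local state here, once; the heavy handshake work
             * below runs without req_lock held, so before committing we re-read
             * the state under the lock and jump back to "retry" if it changed
             * in the meantime. */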
4053        spin_lock_irq(&device->resource->req_lock);
4054 retry:
4055        os = ns = drbd_read_state(device);
4056        spin_unlock_irq(&device->resource->req_lock);
4057
4058        /* If some other part of the code (asender thread, timeout)
4059         * already decided to close the connection again,
4060         * we must not "re-establish" it here. */
4061        if (os.conn <= C_TEAR_DOWN)
4062                return -ECONNRESET;
4063
4064        /* If this is the "end of sync" confirmation, usually the peer disk
4065         * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
4066         * set) resync started in PausedSyncT, or if the timing of pause-/
4067         * unpause-sync events has been "just right", the peer disk may
4068         * transition from D_CONSISTENT to D_UP_TO_DATE as well.
4069         */
4070        if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
4071            real_peer_disk == D_UP_TO_DATE &&
4072            os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
4073                /* If we are (becoming) SyncSource, but peer is still in sync
4074                 * preparation, ignore its uptodate-ness to avoid flapping, it
4075                 * will change to inconsistent once the peer reaches active
4076                 * syncing states.
4077                 * It may have changed syncer-paused flags, however, so we
4078                 * cannot ignore this completely. */
4079                if (peer_state.conn > C_CONNECTED &&
4080                    peer_state.conn < C_SYNC_SOURCE)
4081                        real_peer_disk = D_INCONSISTENT;
4082
4083                /* if peer_state changes to connected at the same time,
4084                 * it explicitly notifies us that it finished resync.
4085                 * Maybe we should finish it up, too? */
4086                else if (os.conn >= C_SYNC_SOURCE &&
4087                         peer_state.conn == C_CONNECTED) {
4088                        if (drbd_bm_total_weight(device) <= device->rs_failed)
4089                                drbd_resync_finished(device);
4090                        return 0;
4091                }
4092        }
4093
4094        /* explicit verify finished notification, stop sector reached. */
4095        if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
4096            peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
4097                ov_out_of_sync_print(device);
4098                drbd_resync_finished(device);
4099                return 0;
4100        }
4101
4102        /* peer says his disk is inconsistent, while we think it is uptodate,
4103         * and this happens while the peer still thinks we have a sync going on,
4104         * but we think we are already done with the sync.
4105         * We ignore this to avoid flapping pdsk.
4106         * This should not happen if the peer is a recent version of drbd. */
4107        if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
4108            os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
4109                real_peer_disk = D_UP_TO_DATE;
4110
4111        if (ns.conn == C_WF_REPORT_PARAMS)
4112                ns.conn = C_CONNECTED;
4113
4114        if (peer_state.conn == C_AHEAD)
4115                ns.conn = C_BEHIND;
4116
4117        if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
4118            get_ldev_if_state(device, D_NEGOTIATING)) {
4119                int cr; /* consider resync */
4120
4121                /* if we established a new connection */
4122                cr  = (os.conn < C_CONNECTED);
4123                /* if we had an established connection
4124                 * and one of the nodes newly attaches a disk */
4125                cr |= (os.conn == C_CONNECTED &&
4126                       (peer_state.disk == D_NEGOTIATING ||
4127                        os.disk == D_NEGOTIATING));
4128                /* if we have both been inconsistent, and the peer has been
4129                 * forced to be UpToDate with --overwrite-data */
4130                cr |= test_bit(CONSIDER_RESYNC, &device->flags);
4131                /* if we had been plain connected, and the admin requested to
4132                 * start a sync by "invalidate" or "invalidate-remote" */
4133                cr |= (os.conn == C_CONNECTED &&
4134                                (peer_state.conn >= C_STARTING_SYNC_S &&
4135                                 peer_state.conn <= C_WF_BITMAP_T));
4136
4137                if (cr)
4138                        ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);
4139
4140                put_ldev(device);
4141                if (ns.conn == C_MASK) {
4142                        ns.conn = C_CONNECTED;
4143                        if (device->state.disk == D_NEGOTIATING) {
4144                                drbd_force_state(device, NS(disk, D_FAILED));
4145                        } else if (peer_state.disk == D_NEGOTIATING) {
4146                                drbd_err(device, "Disk attach process on the peer node was aborted.\n");
4147                                peer_state.disk = D_DISKLESS;
4148                                real_peer_disk = D_DISKLESS;
4149                        } else {
4150                                if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
4151                                        return -EIO;
4152                                D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
4153                                conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4154                                return -EIO;
4155                        }
4156                }
4157        }
4158
4159        spin_lock_irq(&device->resource->req_lock);
4160        if (os.i != drbd_read_state(device).i)
4161                goto retry;
4162        clear_bit(CONSIDER_RESYNC, &device->flags);
4163        ns.peer = peer_state.role;
4164        ns.pdsk = real_peer_disk;
4165        ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
4166        if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
4167                ns.disk = device->new_state_tmp.disk;
4168        cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
4169        if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
4170            test_bit(NEW_CUR_UUID, &device->flags)) {
4171                /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
4172                   for temporary network outages! */
4173                spin_unlock_irq(&device->resource->req_lock);
4174                drbd_err(device, "Aborting Connect: cannot thaw IO with a peer that is only Consistent\n");
4175                tl_clear(peer_device->connection);
4176                drbd_uuid_new_current(device);
4177                clear_bit(NEW_CUR_UUID, &device->flags);
4178                conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
4179                return -EIO;
4180        }
4181        rv = _drbd_set_state(device, ns, cs_flags, NULL);
4182        ns = drbd_read_state(device);
4183        spin_unlock_irq(&device->resource->req_lock);
4184
4185        if (rv < SS_SUCCESS) {
4186                conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4187                return -EIO;
4188        }
4189
4190        if (os.conn > C_WF_REPORT_PARAMS) {
4191                if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
4192                    peer_state.disk != D_NEGOTIATING) {
4193                        /* we want resync, peer has not yet decided to sync... */
4194                        /* Nowadays only used when forcing a node into primary role and
4195                           setting its disk to UpToDate with that */
4196                        drbd_send_uuids(peer_device);
4197                        drbd_send_current_state(peer_device);
4198                }
4199        }
4200
4201        clear_bit(DISCARD_MY_DATA, &device->flags);
4202
4203        drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */
4204
4205        return 0;
4206}
4207
4208static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
4209{
4210        struct drbd_peer_device *peer_device;
4211        struct drbd_device *device;
4212        struct p_rs_uuid *p = pi->data;
4213
4214        peer_device = conn_peer_device(connection, pi->vnr);
4215        if (!peer_device)
4216                return -EIO;
4217        device = peer_device->device;
4218
4219        wait_event(device->misc_wait,
4220                   device->state.conn == C_WF_SYNC_UUID ||
4221                   device->state.conn == C_BEHIND ||
4222                   device->state.conn < C_CONNECTED ||
4223                   device->state.disk < D_NEGOTIATING);
4224
4225        /* D_ASSERT(device,  device->state.conn == C_WF_SYNC_UUID ); */
4226
4227        /* Here the _drbd_uuid_ functions are right, current should
4228           _not_ be rotated into the history */
4229        if (get_ldev_if_state(device, D_NEGOTIATING)) {
4230                _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid));
4231                _drbd_uuid_set(device, UI_BITMAP, 0UL);
4232
4233                drbd_print_uuids(device, "updated sync uuid");
4234                drbd_start_resync(device, C_SYNC_TARGET);
4235
4236                put_ldev(device);
4237        } else
4238                drbd_err(device, "Ignoring SyncUUID packet!\n");
4239
4240        return 0;
4241}
4242
4243/**
4244 * receive_bitmap_plain() - receive one plain-encoded chunk of the peer's bitmap
4245 *
4246 * Return 0 when done, 1 when another iteration is needed, and a negative error
4247 * code upon failure.
4248 */
4249static int
4250receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
4251                     unsigned long *p, struct bm_xfer_ctx *c)
4252{
4253        unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
4254                                 drbd_header_size(peer_device->connection);
4255        unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
4256                                       c->bm_words - c->word_offset);
4257        unsigned int want = num_words * sizeof(*p);
4258        int err;
4259
4260        if (want != size) {
4261                drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
4262                return -EIO;
4263        }
4264        if (want == 0)
4265                return 0;
4266        err = drbd_recv_all(peer_device->connection, p, want);
4267        if (err)
4268                return err;
4269
4270        drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p);
4271
4272        c->word_offset += num_words;
4273        c->bit_offset = c->word_offset * BITS_PER_LONG;
4274        if (c->bit_offset > c->bm_bits)
4275                c->bit_offset = c->bm_bits;
4276
4277        return 1;
4278}
4279
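    /* Layout of the "encoding" byte of a compressed bitmap packet, as read
       by the three accessors below: bits 0-3 hold the encoding code
       (enum drbd_bitmap_code), bits 4-6 the number of padding bits at the
       end of the bit stream, and bit 7 whether the first run-length
       describes set bits. */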
4280static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
4281{
4282        return (enum drbd_bitmap_code)(p->encoding & 0x0f);
4283}
4284
4285static int dcbp_get_start(struct p_compressed_bm *p)
4286{
4287        return (p->encoding & 0x80) != 0;
4288}
4289
4290static int dcbp_get_pad_bits(struct p_compressed_bm *p)
4291{
4292        return (p->encoding >> 4) & 0x7;
4293}
4294
4295/**
4296 * recv_bm_rle_bits() - decode one RLE+VLI-compressed chunk of the peer's bitmap
4297 *
4298 * Return 0 when done, 1 when another iteration is needed, and a negative error
4299 * code upon failure.
4300 */
4301static int
4302recv_bm_rle_bits(struct drbd_peer_device *peer_device,
4303                 struct p_compressed_bm *p,
4304                 struct bm_xfer_ctx *c,
4305                 unsigned int len)
4306{
4307        struct bitstream bs;
4308        u64 look_ahead;
4309        u64 rl;
4310        u64 tmp;
4311        unsigned long s = c->bit_offset;
4312        unsigned long e;
4313        int toggle = dcbp_get_start(p);
4314        int have;
4315        int bits;
4316
4317        bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
4318
4319        bits = bitstream_get_bits(&bs, &look_ahead, 64);
4320        if (bits < 0)
4321                return -EIO;
4322
4323        for (have = bits; have > 0; s += rl, toggle = !toggle) {
4324                bits = vli_decode_bits(&rl, look_ahead);
4325                if (bits <= 0)
4326                        return -EIO;
4327
4328                if (toggle) {
4329                        e = s + rl - 1;
4330                        if (e >= c->bm_bits) {
4331                                drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
4332                                return -EIO;
4333                        }
4334                        _drbd_bm_set_bits(peer_device->device, s, e);
4335                }
4336
4337                if (have < bits) {
4338                        drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
4339                                have, bits, look_ahead,
4340                                (unsigned int)(bs.cur.b - p->code),
4341                                (unsigned int)bs.buf_len);
4342                        return -EIO;
4343                }
4344                /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
4345                if (likely(bits < 64))
4346                        look_ahead >>= bits;
4347                else
4348                        look_ahead = 0;
4349                have -= bits;
4350
4351                bits = bitstream_get_bits(&bs, &tmp, 64 - have);
4352                if (bits < 0)
4353                        return -EIO;
4354                look_ahead |= tmp << have;
4355                have += bits;
4356        }
4357
4358        c->bit_offset = s;
4359        bm_xfer_ctx_bit_to_word_offset(c);
4360
4361        return (s != c->bm_bits);
4362}
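
    /* Example of the scheme decoded above: the stream is a sequence of
       VLI-encoded run lengths describing alternating runs of clear and set
       bits.  With dcbp_get_start(p) == 0, decoded run lengths 5, 3, 7 mean
       "5 bits clear, 3 bits set, 7 bits clear", so only bits s+5 .. s+7 of
       the chunk starting at bit offset s are set in the bitmap. */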
4363
4364/**
4365 * decode_bitmap_c() - dispatch on the compressed bitmap encoding
4366 *
4367 * Return 0 when done, 1 when another iteration is needed, and a negative error
4368 * code upon failure.
4369 */
4370static int
4371decode_bitmap_c(struct drbd_peer_device *peer_device,
4372                struct p_compressed_bm *p,
4373                struct bm_xfer_ctx *c,
4374                unsigned int len)
4375{
4376        if (dcbp_get_code(p) == RLE_VLI_Bits)
4377                return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
4378
4379        /* other variants had been implemented for evaluation,
4380         * but have been dropped as this one turned out to be "best"
4381         * during all our tests. */
4382
4383        drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
4384        conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
4385        return -EIO;
4386}
4387
4388void INFO_bm_xfer_stats(struct drbd_device *device,
4389                const char *direction, struct bm_xfer_ctx *c)
4390{
4391        /* what would it take to transfer it "plaintext" */
4392        unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
4393        unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
4394        unsigned int plain =
4395                header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
4396                c->bm_words * sizeof(unsigned long);
4397        unsigned int total = c->bytes[0] + c->bytes[1];
4398        unsigned int r;
4399
4400        /* total cannot be zero, but just in case: */
4401        if (total == 0)
4402                return;
4403
4404        /* don't report if not compressed */
4405        if (total >= plain)
4406                return;
4407
4408        /* total < plain. check for overflow, still */
4409        r = (total > UINT_MAX/1000) ? (total / (plain/1000))
4410                                    : (1000 * total / plain);
4411
4412        if (r > 1000)
4413                r = 1000;
4414
4415        r = 1000 - r;
4416        drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
4417             "total %u; compression: %u.%u%%\n",
4418                        direction,
4419                        c->bytes[1], c->packets[1],
4420                        c->bytes[0], c->packets[0],
4421                        total, r/10, r % 10);
4422}
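
    /* Example: if transferring the bitmap plain would have taken
       plain = 1000000 bytes, but only total = 250000 bytes went over the
       wire, r = 1000 - 1000 * 250000 / 1000000 = 750 above, which is
       reported as "compression: 75.0%". */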
4423
4424/* Since we are processing the bitfield from lower addresses to higher,
4425   it does not matter whether we process it in 32 bit or 64 bit chunks,
4426   as long as it is little endian. (Understand it as a byte stream,
4427   beginning with the lowest byte...) If we used big endian, we would
4428   need to process it from the highest address to the lowest in order
4429   to be agnostic to the 32 vs 64 bit issue.
4430
4431   Returns 0 on success, or a negative error code on failure. */
4432static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
4433{
4434        struct drbd_peer_device *peer_device;
4435        struct drbd_device *device;
4436        struct bm_xfer_ctx c;
4437        int err;
4438
4439        peer_device = conn_peer_device(connection, pi->vnr);
4440        if (!peer_device)
4441                return -EIO;
4442        device = peer_device->device;
4443
4444        drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
4445        /* you are supposed to send additional out-of-sync information
4446         * if you actually set bits during this phase */
4447
4448        c = (struct bm_xfer_ctx) {
4449                .bm_bits = drbd_bm_bits(device),
4450                .bm_words = drbd_bm_words(device),
4451        };
4452
4453        for (;;) {
4454                if (pi->cmd == P_BITMAP)
4455                        err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
4456                else if (pi->cmd == P_COMPRESSED_BITMAP) {
4457                        /* MAYBE: sanity check that we speak proto >= 90,
4458                         * and the feature is enabled! */
4459                        struct p_compressed_bm *p = pi->data;
4460
4461                        if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
4462                                drbd_err(device, "ReportCBitmap packet too large\n");
4463                                err = -EIO;
4464                                goto out;
4465                        }
4466                        if (pi->size <= sizeof(*p)) {
4467                                drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
4468                                err = -EIO;
4469                                goto out;
4470                        }
4471                        err = drbd_recv_all(peer_device->connection, p, pi->size);
4472                        if (err)
4473                                goto out;
4474                        err = decode_bitmap_c(peer_device, p, &c, pi->size);
4475                } else {
4476                        drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)\n", pi->cmd);
4477                        err = -EIO;
4478                        goto out;
4479                }
4480
4481                c.packets[pi->cmd == P_BITMAP]++;
4482                c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
4483
4484                if (err <= 0) {
4485                        if (err < 0)
4486                                goto out;
4487                        break;
4488                }
4489                err = drbd_recv_header(peer_device->connection, pi);
4490                if (err)
4491                        goto out;
4492        }
4493
4494        INFO_bm_xfer_stats(device, "receive", &c);
4495
4496        if (device->state.conn == C_WF_BITMAP_T) {
4497                enum drbd_state_rv rv;
4498
4499                err = drbd_send_bitmap(device);
4500                if (err)
4501                        goto out;
4502                /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
4503                rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
4504                D_ASSERT(device, rv == SS_SUCCESS);
4505        } else if (device->state.conn != C_WF_BITMAP_S) {
4506                /* admin may have requested C_DISCONNECTING,
4507                 * other threads may have noticed network errors */
4508                drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
4509                    drbd_conn_str(device->state.conn));
4510        }
4511        err = 0;
4512
4513 out:
4514        drbd_bm_unlock(device);
4515        if (!err && device->state.conn == C_WF_BITMAP_S)
4516                drbd_start_resync(device, C_SYNC_SOURCE);
4517        return err;
4518}
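
    /* Note on the bitmap exchange above: the received bits are merged
       (OR-ed) into our own bitmap via drbd_bm_merge_lel() or
       _drbd_bm_set_bits(), so after both directions have been exchanged,
       both nodes know the union of the bits set on either side.  The
       bitmap target replies with its own bitmap and proceeds to
       C_WF_SYNC_UUID; the bitmap source starts the resync once its peer's
       bitmap has arrived. */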
4519
4520static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
4521{
4522        drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
4523                 pi->cmd, pi->size);
4524
4525        return ignore_remaining_packet(connection, pi);
4526}
4527
4528static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
4529{
4530        /* Make sure we've acked all the TCP data associated
4531         * with the data requests being unplugged */
4532        drbd_tcp_quickack(connection->data.socket);
4533
4534        return 0;
4535}
4536
4537static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
4538{
4539        struct drbd_peer_device *peer_device;
4540        struct drbd_device *device;
4541        struct p_block_desc *p = pi->data;
4542
4543        peer_device = conn_peer_device(connection, pi->vnr);
4544        if (!peer_device)
4545                return -EIO;
4546        device = peer_device->device;
4547
4548        switch (device->state.conn) {
4549        case C_WF_SYNC_UUID:
4550        case C_WF_BITMAP_T:
4551        case C_BEHIND:
4552                break;
4553        default:
4554                drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
4555                                drbd_conn_str(device->state.conn));
4556        }
4557
4558        drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
4559
4560        return 0;
4561}
4562
4563struct data_cmd {
4564        int expect_payload;
4565        size_t pkt_size;
4566        int (*fn)(struct drbd_connection *, struct packet_info *);
4567};
4568
4569static struct data_cmd drbd_cmd_handler[] = {
4570        [P_DATA]            = { 1, sizeof(struct p_data), receive_Data },
4571        [P_DATA_REPLY]      = { 1, sizeof(struct p_data), receive_DataReply },
4572        [P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply },
4573        [P_BARRIER]         = { 0, sizeof(struct p_barrier), receive_Barrier },
4574        [P_BITMAP]          = { 1, 0, receive_bitmap },
4575        [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap },
4576        [P_UNPLUG_REMOTE]   = { 0, 0, receive_UnplugRemote },
4577        [P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
4578        [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
4579        [P_SYNC_PARAM]      = { 1, 0, receive_SyncParam },
4580        [P_SYNC_PARAM89]    = { 1, 0, receive_SyncParam },
4581        [P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
4582        [P_UUIDS]           = { 0, sizeof(struct p_uuids), receive_uuids },
4583        [P_SIZES]           = { 0, sizeof(struct p_sizes), receive_sizes },
4584        [P_STATE]           = { 0, sizeof(struct p_state), receive_state },
4585        [P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
4586        [P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
4587        [P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
4588        [P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
4589        [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
4590        [P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
4591        [P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
4592        [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
4593        [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
4594        [P_TRIM]            = { 0, sizeof(struct p_trim), receive_Data },
4595};
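
    /* drbdd() below dispatches on this table: pkt_size is the size of the
       fixed sub-header that is read into the socket buffer before fn is
       invoked, and expect_payload says whether the packet may carry
       additional payload beyond that sub-header (pi.size is reduced by the
       sub-header size before the handler sees it). */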
4596
4597static void drbdd(struct drbd_connection *connection)
4598{
4599        struct packet_info pi;
4600        size_t shs; /* sub header size */
4601        int err;
4602
4603        while (get_t_state(&connection->receiver) == RUNNING) {
4604                struct data_cmd *cmd;
4605
4606                drbd_thread_current_set_cpu(&connection->receiver);
4607                update_receiver_timing_details(connection, drbd_recv_header);
4608                if (drbd_recv_header(connection, &pi))
4609                        goto err_out;
4610
4611                cmd = &drbd_cmd_handler[pi.cmd];
4612                if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
4613                        drbd_err(connection, "Unexpected data packet %s (0x%04x)\n",
4614                                 cmdname(pi.cmd), pi.cmd);
4615                        goto err_out;
4616                }
4617
4618                shs = cmd->pkt_size;
4619                if (pi.size > shs && !cmd->expect_payload) {
4620                        drbd_err(connection, "No payload expected %s l:%d\n",
4621                                 cmdname(pi.cmd), pi.size);
4622                        goto err_out;
4623                }
4624
4625                if (shs) {
4626                        update_receiver_timing_details(connection, drbd_recv_all_warn);
4627                        err = drbd_recv_all_warn(connection, pi.data, shs);
4628                        if (err)
4629                                goto err_out;
4630                        pi.size -= shs;
4631                }
4632
4633                update_receiver_timing_details(connection, cmd->fn);
4634                err = cmd->fn(connection, &pi);
4635                if (err) {
4636                        drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
4637                                 cmdname(pi.cmd), err, pi.size);
4638                        goto err_out;
4639                }
4640        }
4641        return;
4642
4643    err_out:
4644        conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
4645}
4646
4647static void conn_disconnect(struct drbd_connection *connection)
4648{
4649        struct drbd_peer_device *peer_device;
4650        enum drbd_conns oc;
4651        int vnr;
4652
4653        if (connection->cstate == C_STANDALONE)
4654                return;
4655
4656        /* We are about to start the cleanup after connection loss.
4657         * Make sure drbd_make_request knows about that.
4658         * Usually we should be in some network failure state already,
4659         * but just in case we are not, we fix it up here.
4660         */
4661        conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
4662
4663        /* asender does not clean up anything. it must not interfere, either */
4664        drbd_thread_stop(&connection->asender);
4665        drbd_free_sock(connection);
4666
4667        rcu_read_lock();
4668        idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
4669                struct drbd_device *device = peer_device->device;
4670                kref_get(&device->kref);
4671                rcu_read_unlock();
4672                drbd_disconnected(peer_device);
4673                kref_put(&device->kref, drbd_destroy_device);
4674                rcu_read_lock();
4675        }
4676        rcu_read_unlock();
4677
4678        if (!list_empty(&connection->current_epoch->list))
4679                drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
4680        /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
4681        atomic_set(&connection->current_epoch->epoch_size, 0);
4682        connection->send.seen_any_write_yet = false;
4683
4684        drbd_info(connection, "Connection closed\n");
4685
4686        if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
4687                conn_try_outdate_peer_async(connection);
4688
4689        spin_lock_irq(&connection->resource->req_lock);
4690        oc = connection->cstate;
4691        if (oc >= C_UNCONNECTED)
4692                _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
4693
4694        spin_unlock_irq(&connection->resource->req_lock);
4695
4696        if (oc == C_DISCONNECTING)
4697                conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
4698}
4699
4700static int drbd_disconnected(struct drbd_peer_device *peer_device)
4701{
4702        struct drbd_device *device = peer_device->device;
4703        unsigned int i;
4704
4705        /* wait for current activity to cease. */
4706        spin_lock_irq(&device->resource->req_lock);
4707        _drbd_wait_ee_list_empty(device, &device->active_ee);
4708        _drbd_wait_ee_list_empty(device, &device->sync_ee);
4709        _drbd_wait_ee_list_empty(device, &device->read_ee);
4710        spin_unlock_irq(&device->resource->req_lock);
4711
4712        /* We do not have data structures that would allow us to
4713         * get the rs_pending_cnt down to 0 again.
4714         *  * On C_SYNC_TARGET we do not have any data structures describing
4715         *    the pending RSDataRequest's we have sent.
4716         *  * On C_SYNC_SOURCE there is no data structure that tracks
4717         *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
4718         *  And no, it is not the sum of the reference counts in the
4719         *  resync_LRU. The resync_LRU tracks the whole operation including
4720         *  the disk-IO, while the rs_pending_cnt only tracks the blocks
4721         *  on the fly. */
4722        drbd_rs_cancel_all(device);
4723        device->rs_total = 0;
4724        device->rs_failed = 0;
4725        atomic_set(&device->rs_pending_cnt, 0);
4726        wake_up(&device->misc_wait);
4727
4728        del_timer_sync(&device->resync_timer);
4729        resync_timer_fn((unsigned long)device);
4730
4731        /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
4732         * w_make_resync_request etc. which may still be on the worker queue
4733         * to be "canceled" */
4734        drbd_flush_workqueue(&peer_device->connection->sender_work);
4735
4736        drbd_finish_peer_reqs(device);
4737
4738        /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
4739           might have issued a work again. The one before drbd_finish_peer_reqs() is
4740           necessary to reclaim net_ee in drbd_finish_peer_reqs(). */
4741        drbd_flush_workqueue(&peer_device->connection->sender_work);
4742
4743        /* need to do it again, drbd_finish_peer_reqs() may have populated it
4744         * again via drbd_try_clear_on_disk_bm(). */
4745        drbd_rs_cancel_all(device);
4746
4747        kfree(device->p_uuid);
4748        device->p_uuid = NULL;
4749
4750        if (!drbd_suspended(device))
4751                tl_clear(peer_device->connection);
4752
4753        drbd_md_sync(device);
4754
4755        /* serialize with bitmap writeout triggered by the state change,
4756         * if any. */
4757        wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
4758
4759        /* tcp_close and release of sendpage pages can be deferred.  I don't
4760         * want to use SO_LINGER, because apparently it can be deferred for
4761         * more than 20 seconds (longest time I checked).
4762         *
4763         * Actually we don't care for exactly when the network stack does its
4764         * put_page(), but release our reference on these pages right here.
4765         */
4766        i = drbd_free_peer_reqs(device, &device->net_ee);
4767        if (i)
4768                drbd_info(device, "net_ee not empty, killed %u entries\n", i);
4769        i = atomic_read(&device->pp_in_use_by_net);
4770        if (i)
4771                drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
4772        i = atomic_read(&device->pp_in_use);
4773        if (i)
4774                drbd_info(device, "pp_in_use = %d, expected 0\n", i);
4775
4776        D_ASSERT(device, list_empty(&device->read_ee));
4777        D_ASSERT(device, list_empty(&device->active_ee));
4778        D_ASSERT(device, list_empty(&device->sync_ee));
4779        D_ASSERT(device, list_empty(&device->done_ee));
4780
4781        return 0;
4782}
4783
4784/*
4785 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
4786 * we can agree on is stored in agreed_pro_version.
4787 *
4788 * feature flags and the reserved array should be enough room for future
4789 * enhancements of the handshake protocol, and possible plugins...
4790 *
4791 * for now, they are expected to be zero, but ignored.
4792 */
4793static int drbd_send_features(struct drbd_connection *connection)
4794{
4795        struct drbd_socket *sock;
4796        struct p_connection_features *p;
4797
4798        sock = &connection->data;
4799        p = conn_prepare_command(connection, sock);
4800        if (!p)
4801                return -EIO;
4802        memset(p, 0, sizeof(*p));
4803        p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
4804        p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
4805        p->feature_flags = cpu_to_be32(PRO_FEATURES);
4806        return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
4807}
4808
4809/*
4810 * return values:
4811 *   1 yes, we have a valid connection
4812 *   0 oops, did not work out, please try again
4813 *  -1 peer talks different language,
4814 *     no point in trying again, please go standalone.
4815 */
4816static int drbd_do_features(struct drbd_connection *connection)
4817{
4818        /* ASSERT current == connection->receiver ... */
4819        struct p_connection_features *p;
4820        const int expect = sizeof(struct p_connection_features);
4821        struct packet_info pi;
4822        int err;
4823
4824        err = drbd_send_features(connection);
4825        if (err)
4826                return 0;
4827
4828        err = drbd_recv_header(connection, &pi);
4829        if (err)
4830                return 0;
4831
4832        if (pi.cmd != P_CONNECTION_FEATURES) {
4833                drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
4834                         cmdname(pi.cmd), pi.cmd);
4835                return -1;
4836        }
4837
4838        if (pi.size != expect) {
4839                drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n",
4840                     expect, pi.size);
4841                return -1;
4842        }
4843
4844        p = pi.data;
4845        err = drbd_recv_all_warn(connection, p, expect);
4846        if (err)
4847                return 0;
4848
4849        p->protocol_min = be32_to_cpu(p->protocol_min);
4850        p->protocol_max = be32_to_cpu(p->protocol_max);
4851        if (p->protocol_max == 0)
4852                p->protocol_max = p->protocol_min;
4853
4854        if (PRO_VERSION_MAX < p->protocol_min ||
4855            PRO_VERSION_MIN > p->protocol_max)
4856                goto incompat;
4857
4858        connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
4859        connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
4860
4861        drbd_info(connection, "Handshake successful: "
4862             "Agreed network protocol version %d\n", connection->agreed_pro_version);
4863
4864        drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n",
4865                  connection->agreed_features & FF_TRIM ? " " : " not ");
4866
4867        return 1;
4868
4869 incompat:
4870        drbd_err(connection, "incompatible DRBD dialects: "
4871            "I support %d-%d, peer supports %d-%d\n",
4872            PRO_VERSION_MIN, PRO_VERSION_MAX,
4873            p->protocol_min, p->protocol_max);
4874        return -1;
4875}
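
    /* Version negotiation example: we agree on
       min(PRO_VERSION_MAX, peer's protocol_max), provided the ranges
       overlap at all.  E.g. if we support up to protocol 101 and the peer
       announces 86..96, agreed_pro_version becomes 96; a peer announcing
       only versions below PRO_VERSION_MIN takes the "incompat" path
       above. */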
4876
4877#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
4878static int drbd_do_auth(struct drbd_connection *connection)
4879{
4880        drbd_err(connection, "This kernel was built without CONFIG_CRYPTO_HMAC.\n");
4881        drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
4882        return -1;
4883}
4884#else
4885#define CHALLENGE_LEN 64
4886
4887/* Return value:
4888        1 - auth succeeded,
4889        0 - failed, try again (network error),
4890        -1 - auth failed, don't try again.
4891*/
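
    /* The challenge/response handshake below, in order:
     *   1. send P_AUTH_CHALLENGE with CHALLENGE_LEN random bytes
     *   2. receive the peer's P_AUTH_CHALLENGE (it must differ from ours)
     *   3. send P_AUTH_RESPONSE = HMAC(shared_secret, peer's challenge)
     *   4. receive the peer's P_AUTH_RESPONSE and compare it against
     *      HMAC(shared_secret, my_challenge) computed locally.
     * Both sides run this symmetrically, so each proves knowledge of the
     * shared secret without ever putting the secret itself on the wire. */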
4892
4893static int drbd_do_auth(struct drbd_connection *connection)
4894{
4895        struct drbd_socket *sock;
4896        char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
4897        struct scatterlist sg;
4898        char *response = NULL;
4899        char *right_response = NULL;
4900        char *peers_ch = NULL;
4901        unsigned int key_len;
4902        char secret[SHARED_SECRET_MAX]; /* 64 byte */
4903        unsigned int resp_size;
4904        struct hash_desc desc;
4905        struct packet_info pi;
4906        struct net_conf *nc;
4907        int err, rv;
4908
4909        /* FIXME: Put the challenge/response into the preallocated socket buffer.  */
4910
4911        rcu_read_lock();
4912        nc = rcu_dereference(connection->net_conf);
4913        key_len = strlen(nc->shared_secret);
4914        memcpy(secret, nc->shared_secret, key_len);
4915        rcu_read_unlock();
4916
4917        desc.tfm = connection->cram_hmac_tfm;
4918        desc.flags = 0;
4919
4920        rv = crypto_hash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
4921        if (rv) {
4922                drbd_err(connection, "crypto_hash_setkey() failed with %d\n", rv);
4923                rv = -1;
4924                goto fail;
4925        }
4926
4927        get_random_bytes(my_challenge, CHALLENGE_LEN);
4928
4929        sock = &connection->data;
4930        if (!conn_prepare_command(connection, sock)) {
4931                rv = 0;
4932                goto fail;
4933        }
4934        rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
4935                                my_challenge, CHALLENGE_LEN);
4936        if (!rv)
4937                goto fail;
4938
4939        err = drbd_recv_header(connection, &pi);
4940        if (err) {
4941                rv = 0;
4942                goto fail;
4943        }
4944
4945        if (pi.cmd != P_AUTH_CHALLENGE) {
4946                drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
4947                         cmdname(pi.cmd), pi.cmd);
4948                rv = 0;
4949                goto fail;
4950        }
4951
4952        if (pi.size > CHALLENGE_LEN * 2) {
4953                drbd_err(connection, "AuthChallenge payload too big.\n");
4954                rv = -1;
4955                goto fail;
4956        }
4957
4958        if (pi.size < CHALLENGE_LEN) {
4959                drbd_err(connection, "AuthChallenge payload too small.\n");
4960                rv = -1;
4961                goto fail;
4962        }
4963
4964        peers_ch = kmalloc(pi.size, GFP_NOIO);
4965        if (peers_ch == NULL) {
4966                drbd_err(connection, "kmalloc of peers_ch failed\n");
4967                rv = -1;
4968                goto fail;
4969        }
4970
4971        err = drbd_recv_all_warn(connection, peers_ch, pi.size);
4972        if (err) {
4973                rv = 0;
4974                goto fail;
4975        }
4976
4977        if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
4978                drbd_err(connection, "Peer presented the same challenge!\n");
4979                rv = -1;
4980                goto fail;
4981        }
4982
4983        resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm);
4984        response = kmalloc(resp_size, GFP_NOIO);
4985        if (response == NULL) {
4986                drbd_err(connection, "kmalloc of response failed\n");
4987                rv = -1;
4988                goto fail;
4989        }
4990
4991        sg_init_table(&sg, 1);
4992        sg_set_buf(&sg, peers_ch, pi.size);
4993
4994        rv = crypto_hash_digest(&desc, &sg, sg.length, response);
4995        if (rv) {
4996                drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
4997                rv = -1;
4998                goto fail;
4999        }
5000
5001        if (!conn_prepare_command(connection, sock)) {
5002                rv = 0;
5003                goto fail;
5004        }
5005        rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
5006                                response, resp_size);
5007        if (!rv)
5008                goto fail;
5009
5010        err = drbd_recv_header(connection, &pi);
5011        if (err) {
5012                rv = 0;
5013                goto fail;
5014        }
5015
5016        if (pi.cmd != P_AUTH_RESPONSE) {
5017                drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
5018                         cmdname(pi.cmd), pi.cmd);
5019                rv = 0;
5020                goto fail;
5021        }
5022
5023        if (pi.size != resp_size) {
5024                drbd_err(connection, "AuthResponse payload of wrong size.\n");
5025                rv = 0;
5026                goto fail;
5027        }
5028
5029        err = drbd_recv_all_warn(connection, response, resp_size);
5030        if (err) {
5031                rv = 0;
5032                goto fail;
5033        }
5034
5035        right_response = kmalloc(resp_size, GFP_NOIO);
5036        if (right_response == NULL) {
5037                drbd_err(connection, "kmalloc of right_response failed\n");
5038                rv = -1;
5039                goto fail;
5040        }
5041
5042        sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
5043
5044        rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
5045        if (rv) {
5046                drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
5047                rv = -1;
5048                goto fail;
5049        }
5050
5051        rv = !memcmp(response, right_response, resp_size);
5052
5053        if (rv)
5054                drbd_info(connection, "Peer authenticated using %d bytes HMAC\n",
5055                     resp_size);
5056        else
5057                rv = -1;
5058
5059 fail:
5060        kfree(peers_ch);
5061        kfree(response);
5062        kfree(right_response);
5063
5064        return rv;
5065}
5066#endif
5067
5068int drbd_receiver(struct drbd_thread *thi)
5069{
5070        struct drbd_connection *connection = thi->connection;
5071        int h;
5072
5073        drbd_info(connection, "receiver (re)started\n");
5074
5075        do {
5076                h = conn_connect(connection);
5077                if (h == 0) {
5078                        conn_disconnect(connection);
5079                        schedule_timeout_interruptible(HZ);
5080                }
5081                if (h == -1) {
5082                        drbd_warn(connection, "Discarding network configuration.\n");
5083                        conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
5084                }
5085        } while (h == 0);
5086
5087        if (h > 0)
5088                drbdd(connection);
5089
5090        conn_disconnect(connection);
5091
5092        drbd_info(connection, "receiver terminated\n");
5093        return 0;
5094}
5095
5096/* ********* acknowledge sender ******** */
5097
5098static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
5099{
5100        struct p_req_state_reply *p = pi->data;
5101        int retcode = be32_to_cpu(p->retcode);
5102
5103        if (retcode >= SS_SUCCESS) {
5104                set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
5105        } else {
5106                set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
5107                drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
5108                         drbd_set_st_err_str(retcode), retcode);
5109        }
5110        wake_up(&connection->ping_wait);
5111
5112        return 0;
5113}
5114
5115static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
5116{
5117        struct drbd_peer_device *peer_device;
5118        struct drbd_device *device;
5119        struct p_req_state_reply *p = pi->data;
5120        int retcode = be32_to_cpu(p->retcode);
5121
5122        peer_device = conn_peer_device(connection, pi->vnr);
5123        if (!peer_device)
5124                return -EIO;
5125        device = peer_device->device;
5126
5127        if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
5128                D_ASSERT(device, connection->agreed_pro_version < 100);
5129                return got_conn_RqSReply(connection, pi);
5130        }
5131
5132        if (retcode >= SS_SUCCESS) {
5133                set_bit(CL_ST_CHG_SUCCESS, &device->flags);
5134        } else {
5135                set_bit(CL_ST_CHG_FAIL, &device->flags);
5136                drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
5137                        drbd_set_st_err_str(retcode), retcode);
5138        }
5139        wake_up(&device->state_wait);
5140
5141        return 0;
5142}
5143
5144static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
5145{
5146        return drbd_send_ping_ack(connection);
5147
5148}
5149
5150static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
5151{
5152        /* restore idle timeout */
5153        connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ;
5154        if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
5155                wake_up(&connection->ping_wait);
5156
5157        return 0;
5158}
5159
5160static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi)
5161{
5162        struct drbd_peer_device *peer_device;
5163        struct drbd_device *device;
5164        struct p_block_ack *p = pi->data;
5165        sector_t sector = be64_to_cpu(p->sector);
5166        int blksize = be32_to_cpu(p->blksize);
5167
5168        peer_device = conn_peer_device(connection, pi->vnr);
5169        if (!peer_device)
5170                return -EIO;
5171        device = peer_device->device;
5172
5173        D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
5174
5175        update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5176
5177        if (get_ldev(device)) {
5178                drbd_rs_complete_io(device, sector);
5179                drbd_set_in_sync(device, sector, blksize);
5180                /* rs_same_csum is supposed to count in units of BM_BLOCK_SIZE */
5181                device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
5182                put_ldev(device);
5183        }
5184        dec_rs_pending(device);
5185        atomic_add(blksize >> 9, &device->rs_sect_in);
5186
5187        return 0;
5188}
5189
5190static int
5191validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
5192                              struct rb_root *root, const char *func,
5193                              enum drbd_req_event what, bool missing_ok)
5194{
5195        struct drbd_request *req;
5196        struct bio_and_error m;
5197
5198        spin_lock_irq(&device->resource->req_lock);
5199        req = find_request(device, root, id, sector, missing_ok, func);
5200        if (unlikely(!req)) {
5201                spin_unlock_irq(&device->resource->req_lock);
5202                return -EIO;
5203        }
5204        __req_mod(req, what, &m);
5205        spin_unlock_irq(&device->resource->req_lock);
5206
5207        if (m.bio)
5208                complete_master_bio(device, &m);
5209        return 0;
5210}
5211
5212static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
5213{
5214        struct drbd_peer_device *peer_device;
5215        struct drbd_device *device;
5216        struct p_block_ack *p = pi->data;
5217        sector_t sector = be64_to_cpu(p->sector);
5218        int blksize = be32_to_cpu(p->blksize);
5219        enum drbd_req_event what;
5220
5221        peer_device = conn_peer_device(connection, pi->vnr);
5222        if (!peer_device)
5223                return -EIO;
5224        device = peer_device->device;
5225
5226        update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5227
5228        if (p->block_id == ID_SYNCER) {
5229                drbd_set_in_sync(device, sector, blksize);
5230                dec_rs_pending(device);
5231                return 0;
5232        }
5233        switch (pi->cmd) {
5234        case P_RS_WRITE_ACK:
5235                what = WRITE_ACKED_BY_PEER_AND_SIS;
5236                break;
5237        case P_WRITE_ACK:
5238                what = WRITE_ACKED_BY_PEER;
5239                break;
5240        case P_RECV_ACK:
5241                what = RECV_ACKED_BY_PEER;
5242                break;
5243        case P_SUPERSEDED:
5244                what = CONFLICT_RESOLVED;
5245                break;
5246        case P_RETRY_WRITE:
5247                what = POSTPONE_WRITE;
5248                break;
5249        default:
5250                BUG();
5251        }
5252
5253        return validate_req_change_req_state(device, p->block_id, sector,
5254                                             &device->write_requests, __func__,
5255                                             what, false);
5256}
5257
5258static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
5259{
5260        struct drbd_peer_device *peer_device;
5261        struct drbd_device *device;
5262        struct p_block_ack *p = pi->data;
5263        sector_t sector = be64_to_cpu(p->sector);
5264        int size = be32_to_cpu(p->blksize);
5265        int err;
5266
5267        peer_device = conn_peer_device(connection, pi->vnr);
5268        if (!peer_device)
5269                return -EIO;
5270        device = peer_device->device;
5271
5272        update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5273
5274        if (p->block_id == ID_SYNCER) {
5275                dec_rs_pending(device);
5276                drbd_rs_failed_io(device, sector, size);
5277                return 0;
5278        }
5279
5280        err = validate_req_change_req_state(device, p->block_id, sector,
5281                                            &device->write_requests, __func__,
5282                                            NEG_ACKED, true);
5283        if (err) {
5284                /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
5285                   The master bio might already be completed, therefore the
5286                   request is no longer in the collision hash. */
5287                /* In Protocol B we might already have got a P_RECV_ACK
5288                   but then get a P_NEG_ACK afterwards. */
5289                drbd_set_out_of_sync(device, sector, size);
5290        }
5291        return 0;
5292}
5293
5294static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
5295{
5296        struct drbd_peer_device *peer_device;
5297        struct drbd_device *device;
5298        struct p_block_ack *p = pi->data;
5299        sector_t sector = be64_to_cpu(p->sector);
5300
5301        peer_device = conn_peer_device(connection, pi->vnr);
5302        if (!peer_device)
5303                return -EIO;
5304        device = peer_device->device;
5305
5306        update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5307
5308        drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
5309            (unsigned long long)sector, be32_to_cpu(p->blksize));
5310
5311        return validate_req_change_req_state(device, p->block_id, sector,
5312                                             &device->read_requests, __func__,
5313                                             NEG_ACKED, false);
5314}
5315
5316static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
5317{
5318        struct drbd_peer_device *peer_device;
5319        struct drbd_device *device;
5320        sector_t sector;
5321        int size;
5322        struct p_block_ack *p = pi->data;
5323
5324        peer_device = conn_peer_device(connection, pi->vnr);
5325        if (!peer_device)
5326                return -EIO;
5327        device = peer_device->device;
5328
5329        sector = be64_to_cpu(p->sector);
5330        size = be32_to_cpu(p->blksize);
5331
5332        update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5333
5334        dec_rs_pending(device);
5335
5336        if (get_ldev_if_state(device, D_FAILED)) {
5337                drbd_rs_complete_io(device, sector);
5338                switch (pi->cmd) {
5339                case P_NEG_RS_DREPLY:
5340                        drbd_rs_failed_io(device, sector, size);
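                            /* fall through */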
5341                case P_RS_CANCEL:
5342                        break;
5343                default:
5344                        BUG();
5345                }
5346                put_ldev(device);
5347        }
5348
5349        return 0;
5350}
5351
5352static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
5353{
5354        struct p_barrier_ack *p = pi->data;
5355        struct drbd_peer_device *peer_device;
5356        int vnr;
5357
5358        tl_release(connection, p->barrier, be32_to_cpu(p->set_size));
5359
5360        rcu_read_lock();
5361        idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5362                struct drbd_device *device = peer_device->device;
5363
5364                if (device->state.conn == C_AHEAD &&
5365                    atomic_read(&device->ap_in_flight) == 0 &&
5366                    !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) {
5367                        device->start_resync_timer.expires = jiffies + HZ;
5368                        add_timer(&device->start_resync_timer);
5369                }
5370        }
5371        rcu_read_unlock();
5372
5373        return 0;
5374}
5375
5376static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
5377{
5378        struct drbd_peer_device *peer_device;
5379        struct drbd_device *device;
5380        struct p_block_ack *p = pi->data;
5381        struct drbd_device_work *dw;
5382        sector_t sector;
5383        int size;
5384
5385        peer_device = conn_peer_device(connection, pi->vnr);
5386        if (!peer_device)
5387                return -EIO;
5388        device = peer_device->device;
5389
5390        sector = be64_to_cpu(p->sector);
5391        size = be32_to_cpu(p->blksize);
5392
5393        update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5394
5395        if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
5396                drbd_ov_out_of_sync_found(device, sector, size);
5397        else
5398                ov_out_of_sync_print(device);
5399
5400        if (!get_ldev(device))
5401                return 0;
5402
5403        drbd_rs_complete_io(device, sector);
5404        dec_rs_pending(device);
5405
5406        --device->ov_left;
5407
5408        /* let's advance progress step marks only for every other megabyte */
5409        if ((device->ov_left & 0x200) == 0x200)
5410                drbd_advance_rs_marks(device, device->ov_left);
5411
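            /* All blocks verified: queue w_ov_finished on the sender work
             * queue; if the allocation fails, finish the verify synchronously
             * right here. */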
5412        if (device->ov_left == 0) {
5413                dw = kmalloc(sizeof(*dw), GFP_NOIO);
5414                if (dw) {
5415                        dw->w.cb = w_ov_finished;
5416                        dw->device = device;
5417                        drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
5418                } else {
5419                        drbd_err(device, "kmalloc(dw) failed.\n");
5420                        ov_out_of_sync_print(device);
5421                        drbd_resync_finished(device);
5422                }
5423        }
5424        put_ldev(device);
5425        return 0;
5426}
5427
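    /* P_DELAY_PROBE is ignored on this side; its payload was already received
     * together with the header, so there is nothing left to consume. */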
5428static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
5429{
5430        return 0;
5431}
5432
5433static int connection_finish_peer_reqs(struct drbd_connection *connection)
5434{
5435        struct drbd_peer_device *peer_device;
5436        int vnr, not_empty = 0;
5437
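            /* Drain the done_ee lists of all volumes.  drbd_finish_peer_reqs()
             * may sleep, so the rcu read lock is dropped around the call; a
             * kref keeps the device alive meanwhile. */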
5438        do {
5439                clear_bit(SIGNAL_ASENDER, &connection->flags);
5440                flush_signals(current);
5441
5442                rcu_read_lock();
5443                idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5444                        struct drbd_device *device = peer_device->device;
5445                        kref_get(&device->kref);
5446                        rcu_read_unlock();
5447                        if (drbd_finish_peer_reqs(device)) {
5448                                kref_put(&device->kref, drbd_destroy_device);
5449                                return 1;
5450                        }
5451                        kref_put(&device->kref, drbd_destroy_device);
5452                        rcu_read_lock();
5453                }
5454                set_bit(SIGNAL_ASENDER, &connection->flags);
5455
5456                spin_lock_irq(&connection->resource->req_lock);
5457                idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5458                        struct drbd_device *device = peer_device->device;
5459                        not_empty = !list_empty(&device->done_ee);
5460                        if (not_empty)
5461                                break;
5462                }
5463                spin_unlock_irq(&connection->resource->req_lock);
5464                rcu_read_unlock();
5465        } while (not_empty);
5466
5467        return 0;
5468}
5469
5470struct asender_cmd {
5471        size_t pkt_size;
5472        int (*fn)(struct drbd_connection *connection, struct packet_info *);
5473};
5474
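    /* Dispatch table for packets received on the meta socket, indexed by
     * packet type.  pkt_size is the payload size expected after the header;
     * packet types without an fn are rejected as unexpected. */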
5475static struct asender_cmd asender_tbl[] = {
5476        [P_PING]            = { 0, got_Ping },
5477        [P_PING_ACK]        = { 0, got_PingAck },
5478        [P_RECV_ACK]        = { sizeof(struct p_block_ack), got_BlockAck },
5479        [P_WRITE_ACK]       = { sizeof(struct p_block_ack), got_BlockAck },
5480        [P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
5481        [P_SUPERSEDED]      = { sizeof(struct p_block_ack), got_BlockAck },
5482        [P_NEG_ACK]         = { sizeof(struct p_block_ack), got_NegAck },
5483        [P_NEG_DREPLY]      = { sizeof(struct p_block_ack), got_NegDReply },
5484        [P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply },
5485        [P_OV_RESULT]       = { sizeof(struct p_block_ack), got_OVResult },
5486        [P_BARRIER_ACK]     = { sizeof(struct p_barrier_ack), got_BarrierAck },
5487        [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
5488        [P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
5489        [P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
5490        [P_RS_CANCEL]       = { sizeof(struct p_block_ack), got_NegRSDReply },
5491        [P_CONN_ST_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_conn_RqSReply },
5492        [P_RETRY_WRITE]     = { sizeof(struct p_block_ack), got_BlockAck },
5493};
5494
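    /* The asender thread owns the meta socket: it sends pings, flushes
     * completed peer requests, and receives and dispatches acknowledgment
     * packets through asender_tbl. */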
5495int drbd_asender(struct drbd_thread *thi)
5496{
5497        struct drbd_connection *connection = thi->connection;
5498        struct asender_cmd *cmd = NULL;
5499        struct packet_info pi;
5500        int rv;
5501        void *buf    = connection->meta.rbuf;
5502        int received = 0;
5503        unsigned int header_size = drbd_header_size(connection);
5504        int expect   = header_size;
5505        bool ping_timeout_active = false;
5506        struct net_conf *nc;
5507        int ping_timeo, tcp_cork, ping_int;
5508        struct sched_param param = { .sched_priority = 2 };
5509
5510        rv = sched_setscheduler(current, SCHED_RR, &param);
5511        if (rv < 0)
5512                drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv);
5513
5514        while (get_t_state(thi) == RUNNING) {
5515                drbd_thread_current_set_cpu(thi);
5516
5517                rcu_read_lock();
5518                nc = rcu_dereference(connection->net_conf);
5519                ping_timeo = nc->ping_timeo;
5520                tcp_cork = nc->tcp_cork;
5521                ping_int = nc->ping_int;
5522                rcu_read_unlock();
5523
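                /* A ping was requested: send it and shorten the receive
                 * timeout (ping_timeo is configured in tenths of a second)
                 * so a missing PingAck is detected. */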
5524                if (test_and_clear_bit(SEND_PING, &connection->flags)) {
5525                        if (drbd_send_ping(connection)) {
5526                                drbd_err(connection, "drbd_send_ping has failed\n");
5527                                goto reconnect;
5528                        }
5529                        connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10;
5530                        ping_timeout_active = true;
5531                }
5532
5533                /* TODO: conditionally cork; it may hurt latency if we cork without
5534                   much to send */
5535                if (tcp_cork)
5536                        drbd_tcp_cork(connection->meta.socket);
5537                if (connection_finish_peer_reqs(connection)) {
5538                        drbd_err(connection, "connection_finish_peer_reqs() failed\n");
5539                        goto reconnect;
5540                }
5541                /* but unconditionally uncork unless disabled */
5542                if (tcp_cork)
5543                        drbd_tcp_uncork(connection->meta.socket);
5544
5545                /* short circuit, recv_msg would return EINTR anyway. */
5546                if (signal_pending(current))
5547                        continue;
5548
5549                rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
5550                clear_bit(SIGNAL_ASENDER, &connection->flags);
5551
5552                flush_signals(current);
5553
5554                /* Note:
5555                 * -EINTR        (on meta) we got a signal
5556                 * -EAGAIN       (on meta) rcvtimeo expired
5557                 * -ECONNRESET   other side closed the connection
5558                 * -ERESTARTSYS  (on data) we got a signal
5559                 * rv <  0       other than above: unexpected error!
5560                 * rv == expected: full header or command
5561                 * rv <  expected: "woken" by signal during receive
5562                 * rv == 0       : "connection shut down by peer"
5563                 */
5564received_more:
5565                if (likely(rv > 0)) {
5566                        received += rv;
5567                        buf      += rv;
5568                } else if (rv == 0) {
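                            /* If we initiated the disconnect ourselves, a
                             * shutdown by the peer is expected: wait for the
                             * state change instead of flagging an error. */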
5569                        if (test_bit(DISCONNECT_SENT, &connection->flags)) {
5570                                long t;
5571                                rcu_read_lock();
5572                                t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
5573                                rcu_read_unlock();
5574
5575                                t = wait_event_timeout(connection->ping_wait,
5576                                                       connection->cstate < C_WF_REPORT_PARAMS,
5577                                                       t);
5578                                if (t)
5579                                        break;
5580                        }
5581                        drbd_err(connection, "meta connection shut down by peer.\n");
5582                        goto reconnect;
5583                } else if (rv == -EAGAIN) {
5584                        /* If the data socket received something meanwhile,
5585                         * that is good enough: peer is still alive. */
5586                        if (time_after(connection->last_received,
5587                                jiffies - connection->meta.socket->sk->sk_rcvtimeo))
5588                                continue;
5589                        if (ping_timeout_active) {
5590                                drbd_err(connection, "PingAck did not arrive in time.\n");
5591                                goto reconnect;
5592                        }
5593                        set_bit(SEND_PING, &connection->flags);
5594                        continue;
5595                } else if (rv == -EINTR) {
5596                        continue;
5597                } else {
5598                        drbd_err(connection, "sock_recvmsg returned %d\n", rv);
5599                        goto reconnect;
5600                }
5601
5602                if (received == expect && cmd == NULL) {
5603                        if (decode_header(connection, connection->meta.rbuf, &pi))
5604                                goto reconnect;
5605                        if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !asender_tbl[pi.cmd].fn) {
5606                                drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
5607                                         cmdname(pi.cmd), pi.cmd);
5608                                goto disconnect;
5609                        }
5610                        cmd = &asender_tbl[pi.cmd];
5611                        expect = header_size + cmd->pkt_size;
5612                        if (pi.size != expect - header_size) {
5613                                drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
5614                                        pi.cmd, pi.size);
5615                                goto reconnect;
5616                        }
5617                }
5618                if (received == expect) {
5619                        int err;
5620
5621                        err = cmd->fn(connection, &pi);
5622                        if (err) {
5623                                drbd_err(connection, "%pf failed\n", cmd->fn);
5624                                goto reconnect;
5625                        }
5626
5627                        connection->last_received = jiffies;
5628
5629                        if (cmd == &asender_tbl[P_PING_ACK]) {
5630                                /* restore idle timeout */
5631                                connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
5632                                ping_timeout_active = false;
5633                        }
5634
5635                        buf      = connection->meta.rbuf;
5636                        received = 0;
5637                        expect   = header_size;
5638                        cmd      = NULL;
5639                }
5640                if (test_bit(SEND_PING, &connection->flags))
5641                        continue;
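                /* Drain whatever already arrived without blocking, so several
                 * acks can be handled per wakeup. */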
5642                rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT);
5643                if (rv > 0)
5644                        goto received_more;
5645        }
5646
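            /* The if (0) blocks are reachable only through their labels; they
             * keep the two error exits, reconnect and disconnect, out of the
             * main receive loop. */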
5647        if (0) {
5648reconnect:
5649                conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
5650                conn_md_sync(connection);
5651        }
5652        if (0) {
5653disconnect:
5654                conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
5655        }
5656        clear_bit(SIGNAL_ASENDER, &connection->flags);
5657
5658        drbd_info(connection, "asender terminated\n");
5659
5660        return 0;
5661}
5662