linux/drivers/block/drbd/drbd_nl.c
   1/*
   2   drbd_nl.c
   3
   4   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   5
   6   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   7   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   8   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
   9
  10   drbd is free software; you can redistribute it and/or modify
  11   it under the terms of the GNU General Public License as published by
  12   the Free Software Foundation; either version 2, or (at your option)
  13   any later version.
  14
  15   drbd is distributed in the hope that it will be useful,
  16   but WITHOUT ANY WARRANTY; without even the implied warranty of
  17   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18   GNU General Public License for more details.
  19
  20   You should have received a copy of the GNU General Public License
  21   along with drbd; see the file COPYING.  If not, write to
  22   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  23
  24 */
  25
  26#include <linux/module.h>
  27#include <linux/drbd.h>
  28#include <linux/in.h>
  29#include <linux/fs.h>
  30#include <linux/file.h>
  31#include <linux/slab.h>
  32#include <linux/blkpg.h>
  33#include <linux/cpumask.h>
  34#include "drbd_int.h"
  35#include "drbd_req.h"
  36#include "drbd_wrappers.h"
  37#include <asm/unaligned.h>
  38#include <linux/drbd_limits.h>
  39#include <linux/kthread.h>
  40
  41#include <net/genetlink.h>
  42
  43/* .doit */
  44// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info);
  45// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info);
  46
  47int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info);
  48int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info);
  49
  50int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
  51int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
  52int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);
  53
  54int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
  55int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
  56int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
  57int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
  58int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
  59int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
  60int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
  61int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
  62int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
  63int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
  64int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
  65int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
  66int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
  67int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
  68int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
  69int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
  70int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
  71int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
  72int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
  73int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
  74/* .dumpit */
  75int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
  76
  77#include <linux/drbd_genl_api.h>
  78#include "drbd_nla.h"
  79#include <linux/genl_magic_func.h>
  80
  81/* used blkdev_get_by_path, to claim our meta data device(s) */
  82static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
  83
  84/* Configuration is strictly serialized, because generic netlink message
  85 * processing is strictly serialized by the genl_lock().
  86 * Which means we can use one static global drbd_config_context struct.
  87 */
  88static struct drbd_config_context {
  89        /* assigned from drbd_genlmsghdr */
  90        unsigned int minor;
  91        /* assigned from request attributes, if present */
  92        unsigned int volume;
  93#define VOLUME_UNSPECIFIED              (-1U)
  94        /* pointer into the request skb,
  95         * limited lifetime! */
  96        char *resource_name;
  97        struct nlattr *my_addr;
  98        struct nlattr *peer_addr;
  99
 100        /* reply buffer */
 101        struct sk_buff *reply_skb;
 102        /* pointer into reply buffer */
 103        struct drbd_genlmsghdr *reply_dh;
 104        /* resolved from attributes, if possible */
 105        struct drbd_conf *mdev;
 106        struct drbd_tconn *tconn;
 107} adm_ctx;
 108
 109static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
 110{
 111        genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
 112        if (genlmsg_reply(skb, info))
 113                printk(KERN_ERR "drbd: error sending genl reply\n");
 114}
 115
 116/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
 117 * reason it could fail would be no space in the skb, and there are 4k available. */
 118int drbd_msg_put_info(const char *info)
 119{
 120        struct sk_buff *skb = adm_ctx.reply_skb;
 121        struct nlattr *nla;
 122        int err = -EMSGSIZE;
 123
 124        if (!info || !info[0])
 125                return 0;
 126
 127        nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY);
 128        if (!nla)
 129                return err;
 130
 131        err = nla_put_string(skb, T_info_text, info);
 132        if (err) {
 133                nla_nest_cancel(skb, nla);
 134                return err;
 135        } else
 136                nla_nest_end(skb, nla);
 137        return 0;
 138}
 139
 140/* This would be a good candidate for a "pre_doit" hook,
 141 * and per-family private info->pointers.
 142 * But we need to stay compatible with older kernels.
 143 * If it returns successfully, adm_ctx members are valid.
 144 */
 145#define DRBD_ADM_NEED_MINOR     1
 146#define DRBD_ADM_NEED_RESOURCE  2
 147#define DRBD_ADM_NEED_CONNECTION 4
 148static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info,
 149                unsigned flags)
 150{
 151        struct drbd_genlmsghdr *d_in = info->userhdr;
 152        const u8 cmd = info->genlhdr->cmd;
 153        int err;
 154
 155        memset(&adm_ctx, 0, sizeof(adm_ctx));
 156
 157        /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
 158        if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
 159               return -EPERM;
 160
 161        adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
 162        if (!adm_ctx.reply_skb) {
 163                err = -ENOMEM;
 164                goto fail;
 165        }
 166
 167        adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb,
 168                                        info, &drbd_genl_family, 0, cmd);
 169        /* putting a few bytes into a fresh skb of >= 4k will always succeed,
 170         * but check anyway */
 171        if (!adm_ctx.reply_dh) {
 172                err = -ENOMEM;
 173                goto fail;
 174        }
 175
 176        adm_ctx.reply_dh->minor = d_in->minor;
 177        adm_ctx.reply_dh->ret_code = NO_ERROR;
 178
 179        adm_ctx.volume = VOLUME_UNSPECIFIED;
 180        if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
 181                struct nlattr *nla;
 182                /* parse and validate only */
 183                err = drbd_cfg_context_from_attrs(NULL, info);
 184                if (err)
 185                        goto fail;
 186
 187                /* It was present, and valid,
 188                 * copy it over to the reply skb. */
 189                err = nla_put_nohdr(adm_ctx.reply_skb,
 190                                info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
 191                                info->attrs[DRBD_NLA_CFG_CONTEXT]);
 192                if (err)
 193                        goto fail;
 194
 195                /* and assign stuff to the global adm_ctx */
 196                nla = nested_attr_tb[__nla_type(T_ctx_volume)];
 197                if (nla)
 198                        adm_ctx.volume = nla_get_u32(nla);
 199                nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
 200                if (nla)
 201                        adm_ctx.resource_name = nla_data(nla);
 202                adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
 203                adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
 204                if ((adm_ctx.my_addr &&
 205                     nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.tconn->my_addr)) ||
 206                    (adm_ctx.peer_addr &&
 207                     nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.tconn->peer_addr))) {
 208                        err = -EINVAL;
 209                        goto fail;
 210                }
 211        }
 212
 213        adm_ctx.minor = d_in->minor;
 214        adm_ctx.mdev = minor_to_mdev(d_in->minor);
 215        adm_ctx.tconn = conn_get_by_name(adm_ctx.resource_name);
 216
 217        if (!adm_ctx.mdev && (flags & DRBD_ADM_NEED_MINOR)) {
 218                drbd_msg_put_info("unknown minor");
 219                return ERR_MINOR_INVALID;
 220        }
 221        if (!adm_ctx.tconn && (flags & DRBD_ADM_NEED_RESOURCE)) {
 222                drbd_msg_put_info("unknown resource");
 223                return ERR_INVALID_REQUEST;
 224        }
 225
 226        if (flags & DRBD_ADM_NEED_CONNECTION) {
 227                if (adm_ctx.tconn && !(flags & DRBD_ADM_NEED_RESOURCE)) {
 228                        drbd_msg_put_info("no resource name expected");
 229                        return ERR_INVALID_REQUEST;
 230                }
 231                if (adm_ctx.mdev) {
 232                        drbd_msg_put_info("no minor number expected");
 233                        return ERR_INVALID_REQUEST;
 234                }
 235                if (adm_ctx.my_addr && adm_ctx.peer_addr)
 236                        adm_ctx.tconn = conn_get_by_addrs(nla_data(adm_ctx.my_addr),
 237                                                          nla_len(adm_ctx.my_addr),
 238                                                          nla_data(adm_ctx.peer_addr),
 239                                                          nla_len(adm_ctx.peer_addr));
 240                if (!adm_ctx.tconn) {
 241                        drbd_msg_put_info("unknown connection");
 242                        return ERR_INVALID_REQUEST;
 243                }
 244        }
 245
 246        /* some more paranoia, if the request was over-determined */
 247        if (adm_ctx.mdev && adm_ctx.tconn &&
 248            adm_ctx.mdev->tconn != adm_ctx.tconn) {
 249                pr_warning("request: minor=%u, resource=%s; but that minor belongs to connection %s\n",
 250                                adm_ctx.minor, adm_ctx.resource_name,
 251                                adm_ctx.mdev->tconn->name);
 252                drbd_msg_put_info("minor exists in different resource");
 253                return ERR_INVALID_REQUEST;
 254        }
 255        if (adm_ctx.mdev &&
 256            adm_ctx.volume != VOLUME_UNSPECIFIED &&
 257            adm_ctx.volume != adm_ctx.mdev->vnr) {
 258                pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
 259                                adm_ctx.minor, adm_ctx.volume,
 260                                adm_ctx.mdev->vnr, adm_ctx.mdev->tconn->name);
 261                drbd_msg_put_info("minor exists as different volume");
 262                return ERR_INVALID_REQUEST;
 263        }
 264
 265        return NO_ERROR;
 266
 267fail:
 268        nlmsg_free(adm_ctx.reply_skb);
 269        adm_ctx.reply_skb = NULL;
 270        return err;
 271}
 272
 273static int drbd_adm_finish(struct genl_info *info, int retcode)
 274{
 275        if (adm_ctx.tconn) {
 276                kref_put(&adm_ctx.tconn->kref, &conn_destroy);
 277                adm_ctx.tconn = NULL;
 278        }
 279
 280        if (!adm_ctx.reply_skb)
 281                return -ENOMEM;
 282
 283        adm_ctx.reply_dh->ret_code = retcode;
 284        drbd_adm_send_reply(adm_ctx.reply_skb, info);
 285        return 0;
 286}
 287
 288static void setup_khelper_env(struct drbd_tconn *tconn, char **envp)
 289{
 290        char *afs;
 291
 292        /* FIXME: A future version will not allow this case. */
 293        if (tconn->my_addr_len == 0 || tconn->peer_addr_len == 0)
 294                return;
 295
 296        switch (((struct sockaddr *)&tconn->peer_addr)->sa_family) {
 297        case AF_INET6:
 298                afs = "ipv6";
 299                snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
 300                         &((struct sockaddr_in6 *)&tconn->peer_addr)->sin6_addr);
 301                break;
 302        case AF_INET:
 303                afs = "ipv4";
 304                snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
 305                         &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr);
 306                break;
 307        default:
 308                afs = "ssocks";
 309                snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
 310                         &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr);
 311        }
 312        snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
 313}
 314
 315int drbd_khelper(struct drbd_conf *mdev, char *cmd)
 316{
 317        char *envp[] = { "HOME=/",
 318                        "TERM=linux",
 319                        "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
 320                         (char[20]) { }, /* address family */
 321                         (char[60]) { }, /* address */
 322                        NULL };
 323        char mb[12];
 324        char *argv[] = {usermode_helper, cmd, mb, NULL };
 325        struct drbd_tconn *tconn = mdev->tconn;
 326        struct sib_info sib;
 327        int ret;
 328
 329        if (current == tconn->worker.task)
 330                set_bit(CALLBACK_PENDING, &tconn->flags);
 331
 332        snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
 333        setup_khelper_env(tconn, envp);
 334
 335        /* The helper may take some time.
 336         * write out any unsynced meta data changes now */
 337        drbd_md_sync(mdev);
 338
 339        dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
 340        sib.sib_reason = SIB_HELPER_PRE;
 341        sib.helper_name = cmd;
 342        drbd_bcast_event(mdev, &sib);
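            /* With UMH_WAIT_PROC, call_usermodehelper() returns the helper's
             * wait() status word; the exit code decoded below lives in bits 8..15. */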
 343        ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
 344        if (ret)
 345                dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
 346                                usermode_helper, cmd, mb,
 347                                (ret >> 8) & 0xff, ret);
 348        else
 349                dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
 350                                usermode_helper, cmd, mb,
 351                                (ret >> 8) & 0xff, ret);
 352        sib.sib_reason = SIB_HELPER_POST;
 353        sib.helper_exit_code = ret;
 354        drbd_bcast_event(mdev, &sib);
 355
 356        if (current == tconn->worker.task)
 357                clear_bit(CALLBACK_PENDING, &tconn->flags);
 358
 359        if (ret < 0) /* Ignore any ERRNOs we got. */
 360                ret = 0;
 361
 362        return ret;
 363}
 364
 365int conn_khelper(struct drbd_tconn *tconn, char *cmd)
 366{
 367        char *envp[] = { "HOME=/",
 368                        "TERM=linux",
 369                        "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
 370                         (char[20]) { }, /* address family */
 371                         (char[60]) { }, /* address */
 372                        NULL };
 373        char *argv[] = {usermode_helper, cmd, tconn->name, NULL };
 374        int ret;
 375
 376        setup_khelper_env(tconn, envp);
 377        conn_md_sync(tconn);
 378
 379        conn_info(tconn, "helper command: %s %s %s\n", usermode_helper, cmd, tconn->name);
 380        /* TODO: conn_bcast_event() ?? */
 381
 382        ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
 383        if (ret)
 384                conn_warn(tconn, "helper command: %s %s %s exit code %u (0x%x)\n",
 385                          usermode_helper, cmd, tconn->name,
 386                          (ret >> 8) & 0xff, ret);
 387        else
 388                conn_info(tconn, "helper command: %s %s %s exit code %u (0x%x)\n",
 389                          usermode_helper, cmd, tconn->name,
 390                          (ret >> 8) & 0xff, ret);
 391        /* TODO: conn_bcast_event() ?? */
 392
 393        if (ret < 0) /* Ignore any ERRNOs we got. */
 394                ret = 0;
 395
 396        return ret;
 397}
 398
 399static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn)
 400{
 401        enum drbd_fencing_p fp = FP_NOT_AVAIL;
 402        struct drbd_conf *mdev;
 403        int vnr;
 404
 405        rcu_read_lock();
 406        idr_for_each_entry(&tconn->volumes, mdev, vnr) {
 407                if (get_ldev_if_state(mdev, D_CONSISTENT)) {
 408                        fp = max_t(enum drbd_fencing_p, fp,
 409                                   rcu_dereference(mdev->ldev->disk_conf)->fencing);
 410                        put_ldev(mdev);
 411                }
 412        }
 413        rcu_read_unlock();
 414
 415        return fp;
 416}
 417
 418bool conn_try_outdate_peer(struct drbd_tconn *tconn)
 419{
 420        unsigned int connect_cnt;
 421        union drbd_state mask = { };
 422        union drbd_state val = { };
 423        enum drbd_fencing_p fp;
 424        char *ex_to_string;
 425        int r;
 426
 427        if (tconn->cstate >= C_WF_REPORT_PARAMS) {
 428                conn_err(tconn, "Expected cstate < C_WF_REPORT_PARAMS\n");
 429                return false;
 430        }
 431
 432        spin_lock_irq(&tconn->req_lock);
 433        connect_cnt = tconn->connect_cnt;
 434        spin_unlock_irq(&tconn->req_lock);
 435
 436        fp = highest_fencing_policy(tconn);
 437        switch (fp) {
 438        case FP_NOT_AVAIL:
 439                conn_warn(tconn, "Not fencing peer, I'm not even Consistent myself.\n");
 440                goto out;
 441        case FP_DONT_CARE:
 442                return true;
 443        default: ;
 444        }
 445
 446        r = conn_khelper(tconn, "fence-peer");
 447
 448        switch ((r>>8) & 0xff) {
 449        case 3: /* peer is inconsistent */
 450                ex_to_string = "peer is inconsistent or worse";
 451                mask.pdsk = D_MASK;
 452                val.pdsk = D_INCONSISTENT;
 453                break;
 454        case 4: /* peer got outdated, or was already outdated */
 455                ex_to_string = "peer was fenced";
 456                mask.pdsk = D_MASK;
 457                val.pdsk = D_OUTDATED;
 458                break;
 459        case 5: /* peer was down */
 460                if (conn_highest_disk(tconn) == D_UP_TO_DATE) {
 461                        /* we will create (or have created) a new UUID anyway... */
 462                        ex_to_string = "peer is unreachable, assumed to be dead";
 463                        mask.pdsk = D_MASK;
 464                        val.pdsk = D_OUTDATED;
 465                } else {
 466                        ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
 467                }
 468                break;
 469        case 6: /* Peer is primary, voluntarily outdate myself.
 470                 * This is useful when an unconnected R_SECONDARY is asked to
 471                 * become R_PRIMARY, but finds the other peer being active. */
 472                ex_to_string = "peer is active";
 473                conn_warn(tconn, "Peer is primary, outdating myself.\n");
 474                mask.disk = D_MASK;
 475                val.disk = D_OUTDATED;
 476                break;
 477        case 7:
 478                if (fp != FP_STONITH)
 479                        conn_err(tconn, "fence-peer() = 7 && fencing != Stonith !!!\n");
 480                ex_to_string = "peer was stonithed";
 481                mask.pdsk = D_MASK;
 482                val.pdsk = D_OUTDATED;
 483                break;
 484        default:
 485                /* The script is broken ... */
 486                conn_err(tconn, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
 487                return false; /* Eventually leave IO frozen */
 488        }
 489
 490        conn_info(tconn, "fence-peer helper returned %d (%s)\n",
 491                  (r>>8) & 0xff, ex_to_string);
 492
 493 out:
 494
 495        /* Not using
 496           conn_request_state(tconn, mask, val, CS_VERBOSE);
 497           here, because we might have been able to re-establish the connection in the
 498           meantime. */
 499        spin_lock_irq(&tconn->req_lock);
 500        if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) {
 501                if (tconn->connect_cnt != connect_cnt)
 502                        /* In case the connection was established and dropped
 503                           while the fence-peer handler was running, ignore it */
 504                        conn_info(tconn, "Ignoring fence-peer exit code\n");
 505                else
 506                        _conn_request_state(tconn, mask, val, CS_VERBOSE);
 507        }
 508        spin_unlock_irq(&tconn->req_lock);
 509
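            /* Returning "true" means every peer disk state we know of is at most
             * Outdated, i.e. the caller may treat the peer as successfully fenced. */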
 510        return conn_highest_pdsk(tconn) <= D_OUTDATED;
 511}
 512
 513static int _try_outdate_peer_async(void *data)
 514{
 515        struct drbd_tconn *tconn = (struct drbd_tconn *)data;
 516
 517        conn_try_outdate_peer(tconn);
 518
 519        kref_put(&tconn->kref, &conn_destroy);
 520        return 0;
 521}
 522
 523void conn_try_outdate_peer_async(struct drbd_tconn *tconn)
 524{
 525        struct task_struct *opa;
 526
 527        kref_get(&tconn->kref);
 528        opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h");
 529        if (IS_ERR(opa)) {
 530                conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n");
 531                kref_put(&tconn->kref, &conn_destroy);
 532        }
 533}
 534
 535enum drbd_state_rv
 536drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 537{
 538        const int max_tries = 4;
 539        enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
 540        struct net_conf *nc;
 541        int try = 0;
 542        int forced = 0;
 543        union drbd_state mask, val;
 544
 545        if (new_role == R_PRIMARY)
 546                request_ping(mdev->tconn); /* Detect a dead peer ASAP */
 547
 548        mutex_lock(mdev->state_mutex);
 549
 550        mask.i = 0; mask.role = R_MASK;
 551        val.i  = 0; val.role  = new_role;
 552
 553        while (try++ < max_tries) {
 554                rv = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
 555
 556                /* in case we first succeeded to outdate,
 557                 * but now suddenly could establish a connection */
 558                if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
 559                        val.pdsk = 0;
 560                        mask.pdsk = 0;
 561                        continue;
 562                }
 563
 564                if (rv == SS_NO_UP_TO_DATE_DISK && force &&
 565                    (mdev->state.disk < D_UP_TO_DATE &&
 566                     mdev->state.disk >= D_INCONSISTENT)) {
 567                        mask.disk = D_MASK;
 568                        val.disk  = D_UP_TO_DATE;
 569                        forced = 1;
 570                        continue;
 571                }
 572
 573                if (rv == SS_NO_UP_TO_DATE_DISK &&
 574                    mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
 575                        D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
 576
 577                        if (conn_try_outdate_peer(mdev->tconn)) {
 578                                val.disk = D_UP_TO_DATE;
 579                                mask.disk = D_MASK;
 580                        }
 581                        continue;
 582                }
 583
 584                if (rv == SS_NOTHING_TO_DO)
 585                        goto out;
 586                if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
 587                        if (!conn_try_outdate_peer(mdev->tconn) && force) {
 588                                dev_warn(DEV, "Forced into split brain situation!\n");
 589                                mask.pdsk = D_MASK;
 590                                val.pdsk  = D_OUTDATED;
 591
 592                        }
 593                        continue;
 594                }
 595                if (rv == SS_TWO_PRIMARIES) {
 596                        /* Maybe the peer is detected as dead very soon...
 597                           retry at most once more in this case. */
 598                        int timeo;
 599                        rcu_read_lock();
 600                        nc = rcu_dereference(mdev->tconn->net_conf);
 601                        timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
 602                        rcu_read_unlock();
 603                        schedule_timeout_interruptible(timeo);
 604                        if (try < max_tries)
 605                                try = max_tries - 1;
 606                        continue;
 607                }
 608                if (rv < SS_SUCCESS) {
 609                        rv = _drbd_request_state(mdev, mask, val,
 610                                                CS_VERBOSE + CS_WAIT_COMPLETE);
 611                        if (rv < SS_SUCCESS)
 612                                goto out;
 613                }
 614                break;
 615        }
 616
 617        if (rv < SS_SUCCESS)
 618                goto out;
 619
 620        if (forced)
 621                dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
 622
 623        /* Wait until nothing is on the fly :) */
 624        wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
 625
 626        /* FIXME also wait for all pending P_BARRIER_ACK? */
 627
 628        if (new_role == R_SECONDARY) {
 629                set_disk_ro(mdev->vdisk, true);
 630                if (get_ldev(mdev)) {
 631                        mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
 632                        put_ldev(mdev);
 633                }
 634        } else {
 635                mutex_lock(&mdev->tconn->conf_update);
 636                nc = mdev->tconn->net_conf;
 637                if (nc)
 638                        nc->discard_my_data = 0; /* without copy; single bit op is atomic */
 639                mutex_unlock(&mdev->tconn->conf_update);
 640
 641                set_disk_ro(mdev->vdisk, false);
 642                if (get_ldev(mdev)) {
 643                        if (((mdev->state.conn < C_CONNECTED ||
 644                               mdev->state.pdsk <= D_FAILED)
 645                              && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced)
 646                                drbd_uuid_new_current(mdev);
 647
 648                        mdev->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
 649                        put_ldev(mdev);
 650                }
 651        }
 652
 653        /* writeout of activity log covered areas of the bitmap
 654         * to stable storage done in after state change already */
 655
 656        if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
 657                /* if this was forced, we should consider sync */
 658                if (forced)
 659                        drbd_send_uuids(mdev);
 660                drbd_send_current_state(mdev);
 661        }
 662
 663        drbd_md_sync(mdev);
 664
 665        kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
 666out:
 667        mutex_unlock(mdev->state_mutex);
 668        return rv;
 669}
 670
 671static const char *from_attrs_err_to_txt(int err)
 672{
 673        return  err == -ENOMSG ? "required attribute missing" :
 674                err == -EOPNOTSUPP ? "unknown mandatory attribute" :
 675                err == -EEXIST ? "can not change invariant setting" :
 676                "invalid attribute value";
 677}
 678
 679int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
 680{
 681        struct set_role_parms parms;
 682        int err;
 683        enum drbd_ret_code retcode;
 684
 685        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
 686        if (!adm_ctx.reply_skb)
 687                return retcode;
 688        if (retcode != NO_ERROR)
 689                goto out;
 690
 691        memset(&parms, 0, sizeof(parms));
 692        if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) {
 693                err = set_role_parms_from_attrs(&parms, info);
 694                if (err) {
 695                        retcode = ERR_MANDATORY_TAG;
 696                        drbd_msg_put_info(from_attrs_err_to_txt(err));
 697                        goto out;
 698                }
 699        }
 700
 701        if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
 702                retcode = drbd_set_role(adm_ctx.mdev, R_PRIMARY, parms.assume_uptodate);
 703        else
 704                retcode = drbd_set_role(adm_ctx.mdev, R_SECONDARY, 0);
 705out:
 706        drbd_adm_finish(info, retcode);
 707        return 0;
 708}
 709
 710/* Initializes the md.*_offset members, so we are able to find
 711 * the on disk meta data.
 712 *
 713 * We currently have two possible layouts:
 714 * external:
 715 *   |----------- md_size_sect ------------------|
 716 *   [ 4k superblock ][ activity log ][  Bitmap  ]
 717 *   | al_offset == 8 |
 718 *   | bm_offset = al_offset + X      |
 719 *  ==> bitmap sectors = md_size_sect - bm_offset
 720 *
 721 * internal:
 722 *            |----------- md_size_sect ------------------|
 723 * [data.....][  Bitmap  ][ activity log ][ 4k superblock ]
 724 *                        | al_offset < 0 |
 725 *            | bm_offset = al_offset - Y |
 726 *  ==> bitmap sectors = Y = al_offset - bm_offset
 727 *
 728 *  Activity log size used to be fixed 32kB,
 729 *  but is about to become configurable.
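     *
     *  Worked example (external meta data with the historical fixed 32kB
     *  activity log, i.e. al_size_sect = 64): al_offset = 8 right after the
     *  4k superblock, and bm_offset = 8 + 64 = 72 sectors into the meta data area.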
 730 */
 731static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
 732                                       struct drbd_backing_dev *bdev)
 733{
 734        sector_t md_size_sect = 0;
 735        unsigned int al_size_sect = bdev->md.al_size_4k * 8;
 736
 737        bdev->md.md_offset = drbd_md_ss(bdev);
 738
 739        switch (bdev->md.meta_dev_idx) {
 740        default:
 741                /* v07 style fixed size indexed meta data */
 742                bdev->md.md_size_sect = MD_128MB_SECT;
 743                bdev->md.al_offset = MD_4kB_SECT;
 744                bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
 745                break;
 746        case DRBD_MD_INDEX_FLEX_EXT:
 747                /* just occupy the full device; unit: sectors */
 748                bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
 749                bdev->md.al_offset = MD_4kB_SECT;
 750                bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
 751                break;
 752        case DRBD_MD_INDEX_INTERNAL:
 753        case DRBD_MD_INDEX_FLEX_INT:
 754                /* al size is still fixed */
 755                bdev->md.al_offset = -al_size_sect;
 756                /* we need (slightly less than) ~ this many bitmap sectors: */
 757                md_size_sect = drbd_get_capacity(bdev->backing_bdev);
 758                md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
 759                md_size_sect = BM_SECT_TO_EXT(md_size_sect);
 760                md_size_sect = ALIGN(md_size_sect, 8);
 761
 762                /* plus the "drbd meta data super block",
 763                 * and the activity log; */
 764                md_size_sect += MD_4kB_SECT + al_size_sect;
 765
 766                bdev->md.md_size_sect = md_size_sect;
 767                /* bitmap offset is adjusted by 'super' block size */
 768                bdev->md.bm_offset   = -md_size_sect + MD_4kB_SECT;
 769                break;
 770        }
 771}
 772
 773/* input size is expected to be in KB */
 774char *ppsize(char *buf, unsigned long long size)
 775{
 776        /* Needs 9 bytes at max including trailing NUL:
 777         * -1ULL ==> "16384 EB" */
 778        static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
 779        int base = 0;
 780        while (size >= 10000 && base < sizeof(units)-1) {
 781                /* shift + round */
 782                size = (size >> 10) + !!(size & (1<<9));
 783                base++;
 784        }
 785        sprintf(buf, "%u %cB", (unsigned)size, units[base]);
 786
 787        return buf;
 788}
 789
 790/* there is still a theoretical deadlock when called from receiver
 791 * on an D_INCONSISTENT R_PRIMARY:
 792 *  remote READ does inc_ap_bio, receiver would need to receive answer
 793 *  packet from remote to dec_ap_bio again.
 794 *  receiver receive_sizes(), comes here,
 795 *  waits for ap_bio_cnt == 0. -> deadlock.
 796 * but this cannot happen, actually, because:
 797 *  R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
 798 *  (not connected, or bad/no disk on peer):
 799 *  see drbd_fail_request_early, ap_bio_cnt is zero.
 800 *  R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
 801 *  peer may not initiate a resize.
 802 */
 803/* Note these are not to be confused with
 804 * drbd_adm_suspend_io/drbd_adm_resume_io,
 805 * which are (sub) state changes triggered by admin (drbdsetup),
 806 * and can be long lived.
 807 * This changes an mdev->flag, is triggered by drbd internals,
 808 * and should be short-lived. */
 809void drbd_suspend_io(struct drbd_conf *mdev)
 810{
 811        set_bit(SUSPEND_IO, &mdev->flags);
 812        if (drbd_suspended(mdev))
 813                return;
 814        wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
 815}
 816
 817void drbd_resume_io(struct drbd_conf *mdev)
 818{
 819        clear_bit(SUSPEND_IO, &mdev->flags);
 820        wake_up(&mdev->misc_wait);
 821}
 822
 823/**
 824 * drbd_determine_dev_size() -  Sets the right device size obeying all constraints
 825 * @mdev:       DRBD device.
 826 *
 827 * Returns 0 on success, negative return values indicate errors.
 828 * You should call drbd_md_sync() after calling this function.
 829 */
 830enum determine_dev_size
 831drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
 832{
 833        sector_t prev_first_sect, prev_size; /* previous meta location */
 834        sector_t la_size_sect, u_size;
 835        struct drbd_md *md = &mdev->ldev->md;
 836        u32 prev_al_stripe_size_4k;
 837        u32 prev_al_stripes;
 838        sector_t size;
 839        char ppb[10];
 840        void *buffer;
 841
 842        int md_moved, la_size_changed;
 843        enum determine_dev_size rv = DS_UNCHANGED;
 844
 845        /* race:
 846         * application request passes inc_ap_bio,
 847         * but then cannot get an AL-reference.
 848         * this function later may wait on ap_bio_cnt == 0. -> deadlock.
 849         *
 850         * to avoid that:
 851         * Suspend IO right here.
 852         * still lock the act_log to not trigger ASSERTs there.
 853         */
 854        drbd_suspend_io(mdev);
 855        buffer = drbd_md_get_buffer(mdev); /* Lock meta-data IO */
 856        if (!buffer) {
 857                drbd_resume_io(mdev);
 858                return DS_ERROR;
 859        }
 860
 861        /* no wait necessary anymore, actually we could assert that */
 862        wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
 863
 864        prev_first_sect = drbd_md_first_sector(mdev->ldev);
 865        prev_size = mdev->ldev->md.md_size_sect;
 866        la_size_sect = mdev->ldev->md.la_size_sect;
 867
 868        if (rs) {
 869                /* rs is non NULL if we should change the AL layout only */
 870
 871                prev_al_stripes = md->al_stripes;
 872                prev_al_stripe_size_4k = md->al_stripe_size_4k;
 873
 874                md->al_stripes = rs->al_stripes;
 875                md->al_stripe_size_4k = rs->al_stripe_size / 4;
 876                md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
 877        }
 878
 879        drbd_md_set_sector_offsets(mdev, mdev->ldev);
 880
 881        rcu_read_lock();
 882        u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
 883        rcu_read_unlock();
 884        size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED);
 885
 886        if (size < la_size_sect) {
 887                if (rs && u_size == 0) {
 888                        /* Remove "rs &&" later. This check should always be active, but
 889                           right now the receiver expects the permissive behavior */
 890                        dev_warn(DEV, "Implicit shrink not allowed. "
 891                                 "Use --size=%llus for explicit shrink.\n",
 892                                 (unsigned long long)size);
 893                        rv = DS_ERROR_SHRINK;
 894                }
 895                if (u_size > size)
 896                        rv = DS_ERROR_SPACE_MD;
 897                if (rv != DS_UNCHANGED)
 898                        goto err_out;
 899        }
 900
 901        if (drbd_get_capacity(mdev->this_bdev) != size ||
 902            drbd_bm_capacity(mdev) != size) {
 903                int err;
 904                err = drbd_bm_resize(mdev, size, !(flags & DDSF_NO_RESYNC));
 905                if (unlikely(err)) {
 906                        /* currently there is only one error: ENOMEM! */
 907                        size = drbd_bm_capacity(mdev)>>1;
 908                        if (size == 0) {
 909                                dev_err(DEV, "OUT OF MEMORY! "
 910                                    "Could not allocate bitmap!\n");
 911                        } else {
 912                                dev_err(DEV, "BM resizing failed. "
 913                                    "Leaving size unchanged at size = %lu KB\n",
 914                                    (unsigned long)size);
 915                        }
 916                        rv = DS_ERROR;
 917                }
 918                /* racy, see comments above. */
 919                drbd_set_my_capacity(mdev, size);
 920                mdev->ldev->md.la_size_sect = size;
 921                dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
 922                     (unsigned long long)size>>1);
 923        }
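            /* note: the more specific DS_ERROR_SHRINK / DS_ERROR_SPACE_MD codes are
             * negative and sort at or below DS_ERROR, hence the "<=" */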
 924        if (rv <= DS_ERROR)
 925                goto err_out;
 926
 927        la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect);
 928
 929        md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
 930                || prev_size       != mdev->ldev->md.md_size_sect;
 931
 932        if (la_size_changed || md_moved || rs) {
 933                u32 prev_flags;
 934
 935                drbd_al_shrink(mdev); /* All extents inactive. */
 936
 937                prev_flags = md->flags;
 938                md->flags &= ~MDF_PRIMARY_IND;
 939                drbd_md_write(mdev, buffer);
 940
 941                dev_info(DEV, "Writing the whole bitmap, %s\n",
 942                         la_size_changed && md_moved ? "size changed and md moved" :
 943                         la_size_changed ? "size changed" : "md moved");
 944                /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
 945                drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
 946                               "size changed", BM_LOCKED_MASK);
 947                drbd_initialize_al(mdev, buffer);
 948
 949                md->flags = prev_flags;
 950                drbd_md_write(mdev, buffer);
 951
 952                if (rs)
 953                        dev_info(DEV, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
 954                                 md->al_stripes, md->al_stripe_size_4k * 4);
 955        }
 956
 957        if (size > la_size_sect)
 958                rv = DS_GREW;
 959        if (size < la_size_sect)
 960                rv = DS_SHRUNK;
 961
 962        if (0) {
 963        err_out:
 964                if (rs) {
 965                        md->al_stripes = prev_al_stripes;
 966                        md->al_stripe_size_4k = prev_al_stripe_size_4k;
 967                        md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;
 968
 969                        drbd_md_set_sector_offsets(mdev, mdev->ldev);
 970                }
 971        }
 972        lc_unlock(mdev->act_log);
 973        wake_up(&mdev->al_wait);
 974        drbd_md_put_buffer(mdev);
 975        drbd_resume_io(mdev);
 976
 977        return rv;
 978}
 979
 980sector_t
 981drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
 982                  sector_t u_size, int assume_peer_has_space)
 983{
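            /* In short: use the smaller of the local and peer capacities; if one of
             * them is unknown, fall back to the last agreed size, clamped to whatever
             * is known; an explicitly requested size (u_size) is honored only if it
             * does not exceed what is available. */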
 984        sector_t p_size = mdev->p_size;   /* partner's disk size. */
 985        sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
 986        sector_t m_size; /* my size */
 987        sector_t size = 0;
 988
 989        m_size = drbd_get_max_capacity(bdev);
 990
 991        if (mdev->state.conn < C_CONNECTED && assume_peer_has_space) {
 992                dev_warn(DEV, "Resize while not connected was forced by the user!\n");
 993                p_size = m_size;
 994        }
 995
 996        if (p_size && m_size) {
 997                size = min_t(sector_t, p_size, m_size);
 998        } else {
 999                if (la_size_sect) {
1000                        size = la_size_sect;
1001                        if (m_size && m_size < size)
1002                                size = m_size;
1003                        if (p_size && p_size < size)
1004                                size = p_size;
1005                } else {
1006                        if (m_size)
1007                                size = m_size;
1008                        if (p_size)
1009                                size = p_size;
1010                }
1011        }
1012
1013        if (size == 0)
1014                dev_err(DEV, "Both nodes diskless!\n");
1015
1016        if (u_size) {
1017                if (u_size > size)
1018                        dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
1019                            (unsigned long)u_size>>1, (unsigned long)size>>1);
1020                else
1021                        size = u_size;
1022        }
1023
1024        return size;
1025}
1026
1027/**
1028 * drbd_check_al_size() - Ensures that the AL is of the right size
1029 * @mdev:       DRBD device.
1030 *
1031 * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
1032 * failed, and 0 on success. You should call drbd_md_sync() after you called
1033 * this function.
1034 */
1035static int drbd_check_al_size(struct drbd_conf *mdev, struct disk_conf *dc)
1036{
1037        struct lru_cache *n, *t;
1038        struct lc_element *e;
1039        unsigned int in_use;
1040        int i;
1041
1042        if (mdev->act_log &&
1043            mdev->act_log->nr_elements == dc->al_extents)
1044                return 0;
1045
1046        in_use = 0;
1047        t = mdev->act_log;
1048        n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
1049                dc->al_extents, sizeof(struct lc_element), 0);
1050
1051        if (n == NULL) {
1052                dev_err(DEV, "Cannot allocate act_log lru!\n");
1053                return -ENOMEM;
1054        }
1055        spin_lock_irq(&mdev->al_lock);
1056        if (t) {
1057                for (i = 0; i < t->nr_elements; i++) {
1058                        e = lc_element_by_index(t, i);
1059                        if (e->refcnt)
1060                                dev_err(DEV, "refcnt(%d)==%d\n",
1061                                    e->lc_number, e->refcnt);
1062                        in_use += e->refcnt;
1063                }
1064        }
1065        if (!in_use)
1066                mdev->act_log = n;
1067        spin_unlock_irq(&mdev->al_lock);
1068        if (in_use) {
1069                dev_err(DEV, "Activity log still in use!\n");
1070                lc_destroy(n);
1071                return -EBUSY;
1072        } else {
1073                if (t)
1074                        lc_destroy(t);
1075        }
1076        drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elements */
1077        return 0;
1078}
1079
1080static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size)
1081{
1082        struct request_queue * const q = mdev->rq_queue;
1083        unsigned int max_hw_sectors = max_bio_size >> 9;
1084        unsigned int max_segments = 0;
1085
1086        if (get_ldev_if_state(mdev, D_ATTACHING)) {
1087                struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
1088
1089                max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
1090                rcu_read_lock();
1091                max_segments = rcu_dereference(mdev->ldev->disk_conf)->max_bio_bvecs;
1092                rcu_read_unlock();
1093                put_ldev(mdev);
1094        }
1095
1096        blk_queue_logical_block_size(q, 512);
1097        blk_queue_max_hw_sectors(q, max_hw_sectors);
1098        /* This is the workaround for "bio would need to, but cannot, be split" */
1099        blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
1100        blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
1101
1102        if (get_ldev_if_state(mdev, D_ATTACHING)) {
1103                struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
1104
1105                blk_queue_stack_limits(q, b);
1106
1107                if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
1108                        dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
1109                                 q->backing_dev_info.ra_pages,
1110                                 b->backing_dev_info.ra_pages);
1111                        q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
1112                }
1113                put_ldev(mdev);
1114        }
1115}
1116
1117void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
1118{
1119        unsigned int now, new, local, peer;
1120
1121        now = queue_max_hw_sectors(mdev->rq_queue) << 9;
1122        local = mdev->local_max_bio_size; /* possibly the last known value, from volatile memory */
1123        peer = mdev->peer_max_bio_size; /* possibly the last known value, from meta data */
1124
1125        if (get_ldev_if_state(mdev, D_ATTACHING)) {
1126                local = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
1127                mdev->local_max_bio_size = local;
1128                put_ldev(mdev);
1129        }
1130        local = min(local, DRBD_MAX_BIO_SIZE);
1131
1132        /* We may ignore peer limits if the peer is modern enough.
1133           Starting with 8.3.8, the peer can use multiple
1134           BIOs for a single peer_request. */
1135        if (mdev->state.conn >= C_CONNECTED) {
1136                if (mdev->tconn->agreed_pro_version < 94)
1137                        peer = min( mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
1138                        /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
1139                else if (mdev->tconn->agreed_pro_version == 94)
1140                        peer = DRBD_MAX_SIZE_H80_PACKET;
1141                else if (mdev->tconn->agreed_pro_version < 100)
1142                        peer = DRBD_MAX_BIO_SIZE_P95;  /* drbd 8.3.8 onwards, before 8.4.0 */
1143                else
1144                        peer = DRBD_MAX_BIO_SIZE;
1145        }
1146
1147        new = min(local, peer);
1148
1149        if (mdev->state.role == R_PRIMARY && new < now)
1150                dev_err(DEV, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
1151
1152        if (new != now)
1153                dev_info(DEV, "max BIO size = %u\n", new);
1154
1155        drbd_setup_queue_param(mdev, new);
1156}
1157
1158/* Starts the worker thread */
1159static void conn_reconfig_start(struct drbd_tconn *tconn)
1160{
1161        drbd_thread_start(&tconn->worker);
1162        conn_flush_workqueue(tconn);
1163}
1164
1165/* if still unconfigured, stops worker again. */
1166static void conn_reconfig_done(struct drbd_tconn *tconn)
1167{
1168        bool stop_threads;
1169        spin_lock_irq(&tconn->req_lock);
1170        stop_threads = conn_all_vols_unconf(tconn) &&
1171                tconn->cstate == C_STANDALONE;
1172        spin_unlock_irq(&tconn->req_lock);
1173        if (stop_threads) {
1174                /* asender is implicitly stopped by receiver
1175                 * in conn_disconnect() */
1176                drbd_thread_stop(&tconn->receiver);
1177                drbd_thread_stop(&tconn->worker);
1178        }
1179}
1180
1181/* Make sure IO is suspended before calling this function. */
1182static void drbd_suspend_al(struct drbd_conf *mdev)
1183{
1184        int s = 0;
1185
1186        if (!lc_try_lock(mdev->act_log)) {
1187                dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
1188                return;
1189        }
1190
1191        drbd_al_shrink(mdev);
1192        spin_lock_irq(&mdev->tconn->req_lock);
1193        if (mdev->state.conn < C_CONNECTED)
1194                s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags);
1195        spin_unlock_irq(&mdev->tconn->req_lock);
1196        lc_unlock(mdev->act_log);
1197
1198        if (s)
1199                dev_info(DEV, "Suspended AL updates\n");
1200}
1201
1202
1203static bool should_set_defaults(struct genl_info *info)
1204{
1205        unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags;
1206        return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
1207}
1208
1209static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
1210{
1211        /* This is limited by 16 bit "slot" numbers,
1212         * and by available on-disk context storage.
1213         *
1214         * Also (u16)~0 is special (denotes a "free" extent).
1215         *
1216         * One transaction occupies one 4kB on-disk block,
1217         * we have n such blocks in the on disk ring buffer,
1218         * the "current" transaction may fail (n-1),
1219         * and there are 919 context slot numbers per transaction.
1220         *
1221         * 72 transaction blocks amounts to more than 2**16 context slots,
1222         * so cap there first.
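             * (with 919 context slots per transaction: 919 * 71 = 65249 < 65536,
             *  while 919 * 72 = 66168 > 65536)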
1223         */
1224        const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
1225        const unsigned int sufficient_on_disk =
1226                (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
1227                /AL_CONTEXT_PER_TRANSACTION;
1228
1229        unsigned int al_size_4k = bdev->md.al_size_4k;
1230
1231        if (al_size_4k > sufficient_on_disk)
1232                return max_al_nr;
1233
1234        return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
1235}
1236
1237int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1238{
1239        enum drbd_ret_code retcode;
1240        struct drbd_conf *mdev;
1241        struct disk_conf *new_disk_conf, *old_disk_conf;
1242        struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
1243        int err, fifo_size;
1244
1245        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
1246        if (!adm_ctx.reply_skb)
1247                return retcode;
1248        if (retcode != NO_ERROR)
1249                goto out;
1250
1251        mdev = adm_ctx.mdev;
1252
1253        /* we also need a disk
1254         * to change the options on */
1255        if (!get_ldev(mdev)) {
1256                retcode = ERR_NO_DISK;
1257                goto out;
1258        }
1259
1260        new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
1261        if (!new_disk_conf) {
1262                retcode = ERR_NOMEM;
1263                goto fail;
1264        }
1265
1266        mutex_lock(&mdev->tconn->conf_update);
1267        old_disk_conf = mdev->ldev->disk_conf;
1268        *new_disk_conf = *old_disk_conf;
1269        if (should_set_defaults(info))
1270                set_disk_conf_defaults(new_disk_conf);
1271
1272        err = disk_conf_from_attrs_for_change(new_disk_conf, info);
1273        if (err && err != -ENOMSG) {
1274                retcode = ERR_MANDATORY_TAG;
1275                drbd_msg_put_info(from_attrs_err_to_txt(err));
1276        }
1277
1278        if (!expect(new_disk_conf->resync_rate >= 1))
1279                new_disk_conf->resync_rate = 1;
1280
1281        if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
1282                new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1283        if (new_disk_conf->al_extents > drbd_al_extents_max(mdev->ldev))
1284                new_disk_conf->al_extents = drbd_al_extents_max(mdev->ldev);
1285
1286        if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
1287                new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
1288
1289        fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
1290        if (fifo_size != mdev->rs_plan_s->size) {
1291                new_plan = fifo_alloc(fifo_size);
1292                if (!new_plan) {
1293                        dev_err(DEV, "kmalloc of fifo_buffer failed");
1294                        retcode = ERR_NOMEM;
1295                        goto fail_unlock;
1296                }
1297        }
1298
1299        drbd_suspend_io(mdev);
1300        wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
1301        drbd_al_shrink(mdev);
1302        err = drbd_check_al_size(mdev, new_disk_conf);
1303        lc_unlock(mdev->act_log);
1304        wake_up(&mdev->al_wait);
1305        drbd_resume_io(mdev);
1306
1307        if (err) {
1308                retcode = ERR_NOMEM;
1309                goto fail_unlock;
1310        }
1311
1312        write_lock_irq(&global_state_lock);
1313        retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after);
1314        if (retcode == NO_ERROR) {
1315                rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
1316                drbd_resync_after_changed(mdev);
1317        }
1318        write_unlock_irq(&global_state_lock);
1319
1320        if (retcode != NO_ERROR)
1321                goto fail_unlock;
1322
1323        if (new_plan) {
1324                old_plan = mdev->rs_plan_s;
1325                rcu_assign_pointer(mdev->rs_plan_s, new_plan);
1326        }
1327
1328        mutex_unlock(&mdev->tconn->conf_update);
1329
1330        if (new_disk_conf->al_updates)
1331                mdev->ldev->md.flags &= ~MDF_AL_DISABLED;
1332        else
1333                mdev->ldev->md.flags |= MDF_AL_DISABLED;
1334
1335        if (new_disk_conf->md_flushes)
1336                clear_bit(MD_NO_FUA, &mdev->flags);
1337        else
1338                set_bit(MD_NO_FUA, &mdev->flags);
1339
1340        drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush);
1341
1342        drbd_md_sync(mdev);
1343
1344        if (mdev->state.conn >= C_CONNECTED)
1345                drbd_send_sync_param(mdev);
1346
1347        synchronize_rcu();
1348        kfree(old_disk_conf);
1349        kfree(old_plan);
1350        mod_timer(&mdev->request_timer, jiffies + HZ);
1351        goto success;
1352
1353fail_unlock:
1354        mutex_unlock(&mdev->tconn->conf_update);
1355 fail:
1356        kfree(new_disk_conf);
1357        kfree(new_plan);
1358success:
1359        put_ldev(mdev);
1360 out:
1361        drbd_adm_finish(info, retcode);
1362        return 0;
1363}
1364
1365int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1366{
1367        struct drbd_conf *mdev;
1368        int err;
1369        enum drbd_ret_code retcode;
1370        enum determine_dev_size dd;
1371        sector_t max_possible_sectors;
1372        sector_t min_md_device_sectors;
1373        struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
1374        struct disk_conf *new_disk_conf = NULL;
1375        struct block_device *bdev;
1376        struct lru_cache *resync_lru = NULL;
1377        struct fifo_buffer *new_plan = NULL;
1378        union drbd_state ns, os;
1379        enum drbd_state_rv rv;
1380        struct net_conf *nc;
1381
1382        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
1383        if (!adm_ctx.reply_skb)
1384                return retcode;
1385        if (retcode != NO_ERROR)
1386                goto finish;
1387
1388        mdev = adm_ctx.mdev;
1389        conn_reconfig_start(mdev->tconn);
1390
1391        /* if you want to reconfigure, please tear down first */
1392        if (mdev->state.disk > D_DISKLESS) {
1393                retcode = ERR_DISK_CONFIGURED;
1394                goto fail;
1395        }
1396        /* It may just now have detached because of IO error.  Make sure
1397         * drbd_ldev_destroy is done already, we may end up here very fast,
1398         * e.g. if someone calls attach from the on-io-error handler,
1399         * to realize a "hot spare" feature (not that I'd recommend that) */
1400        wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1401
1402        /* make sure there is no leftover from previous force-detach attempts */
1403        clear_bit(FORCE_DETACH, &mdev->flags);
1404        clear_bit(WAS_IO_ERROR, &mdev->flags);
1405        clear_bit(WAS_READ_ERROR, &mdev->flags);
1406
1407        /* and no leftover from previously aborted resync or verify, either */
1408        mdev->rs_total = 0;
1409        mdev->rs_failed = 0;
1410        atomic_set(&mdev->rs_pending_cnt, 0);
1411
1412        /* allocation not in the IO path, drbdsetup context */
1413        nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
1414        if (!nbc) {
1415                retcode = ERR_NOMEM;
1416                goto fail;
1417        }
1418        spin_lock_init(&nbc->md.uuid_lock);
1419
1420        new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
1421        if (!new_disk_conf) {
1422                retcode = ERR_NOMEM;
1423                goto fail;
1424        }
1425        nbc->disk_conf = new_disk_conf;
1426
1427        set_disk_conf_defaults(new_disk_conf);
1428        err = disk_conf_from_attrs(new_disk_conf, info);
1429        if (err) {
1430                retcode = ERR_MANDATORY_TAG;
1431                drbd_msg_put_info(from_attrs_err_to_txt(err));
1432                goto fail;
1433        }
1434
1435        if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
1436                new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
1437
1438        new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
1439        if (!new_plan) {
1440                retcode = ERR_NOMEM;
1441                goto fail;
1442        }
1443
1444        if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
1445                retcode = ERR_MD_IDX_INVALID;
1446                goto fail;
1447        }
1448
1449        write_lock_irq(&global_state_lock);
1450        retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after);
1451        write_unlock_irq(&global_state_lock);
1452        if (retcode != NO_ERROR)
1453                goto fail;
1454
1455        rcu_read_lock();
1456        nc = rcu_dereference(mdev->tconn->net_conf);
1457        if (nc) {
1458                if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
1459                        rcu_read_unlock();
1460                        retcode = ERR_STONITH_AND_PROT_A;
1461                        goto fail;
1462                }
1463        }
1464        rcu_read_unlock();
1465
1466        bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
1467                                  FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev);
1468        if (IS_ERR(bdev)) {
1469                dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
1470                        PTR_ERR(bdev));
1471                retcode = ERR_OPEN_DISK;
1472                goto fail;
1473        }
1474        nbc->backing_bdev = bdev;
1475
1476        /*
1477         * meta_dev_idx >= 0: external fixed size, possibly multiple
1478         * drbd sharing one meta device.  TODO in that case, paranoia
1479         * check that [md_bdev, meta_dev_idx] is not yet used by some
1480         * other drbd minor!  (if you use drbd.conf + drbdadm, that
1481         * should check it for you already; but if you don't, or
1482         * someone fooled it, we need to double check here)
1483         */
1484        bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
1485                                  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1486                                  (new_disk_conf->meta_dev_idx < 0) ?
1487                                  (void *)mdev : (void *)drbd_m_holder);
1488        if (IS_ERR(bdev)) {
1489                dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
1490                        PTR_ERR(bdev));
1491                retcode = ERR_OPEN_MD_DISK;
1492                goto fail;
1493        }
1494        nbc->md_bdev = bdev;
1495
1496        if ((nbc->backing_bdev == nbc->md_bdev) !=
1497            (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
1498             new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
1499                retcode = ERR_MD_IDX_INVALID;
1500                goto fail;
1501        }
1502
1503        resync_lru = lc_create("resync", drbd_bm_ext_cache,
1504                        1, 61, sizeof(struct bm_extent),
1505                        offsetof(struct bm_extent, lce));
1506        if (!resync_lru) {
1507                retcode = ERR_NOMEM;
1508                goto fail;
1509        }
1510
1511        /* Read our meta data super block early.
1512         * This also sets other on-disk offsets. */
1513        retcode = drbd_md_read(mdev, nbc);
1514        if (retcode != NO_ERROR)
1515                goto fail;
1516
1517        if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
1518                new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1519        if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
1520                new_disk_conf->al_extents = drbd_al_extents_max(nbc);
1521
1522        if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
1523                dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
1524                        (unsigned long long) drbd_get_max_capacity(nbc),
1525                        (unsigned long long) new_disk_conf->disk_size);
1526                retcode = ERR_DISK_TOO_SMALL;
1527                goto fail;
1528        }
1529
1530        if (new_disk_conf->meta_dev_idx < 0) {
1531                max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
1532                /* at least one MB, otherwise it does not make sense */
1533                min_md_device_sectors = (2<<10);
1534        } else {
1535                max_possible_sectors = DRBD_MAX_SECTORS;
1536                min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
1537        }
1538
1539        if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
1540                retcode = ERR_MD_DISK_TOO_SMALL;
1541                dev_warn(DEV, "refusing attach: md-device too small, "
1542                     "at least %llu sectors needed for this meta-disk type\n",
1543                     (unsigned long long) min_md_device_sectors);
1544                goto fail;
1545        }
1546
1547        /* Make sure the new disk is big enough
1548         * (we may currently be R_PRIMARY with no local disk...) */
1549        if (drbd_get_max_capacity(nbc) <
1550            drbd_get_capacity(mdev->this_bdev)) {
1551                retcode = ERR_DISK_TOO_SMALL;
1552                goto fail;
1553        }
1554
1555        nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
1556
1557        if (nbc->known_size > max_possible_sectors) {
1558                dev_warn(DEV, "==> truncating very big lower level device "
1559                        "to currently maximum possible %llu sectors <==\n",
1560                        (unsigned long long) max_possible_sectors);
1561                if (new_disk_conf->meta_dev_idx >= 0)
1562                        dev_warn(DEV, "==>> using internal or flexible "
1563                                      "meta data may help <<==\n");
1564        }
1565
1566        drbd_suspend_io(mdev);
1567        /* also wait for the last barrier ack. */
1568        /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171
1569         * We need a way to either ignore barrier acks for barriers sent before a device
1570         * was attached, or a way to wait for all pending barrier acks to come in.
1571         * As barriers are counted per resource,
1572         * we'd need to suspend io on all devices of a resource.
1573         */
1574        wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || drbd_suspended(mdev));
1575        /* and for any other previously queued work */
1576        drbd_flush_workqueue(mdev);
1577
1578        rv = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
1579        retcode = rv;  /* FIXME: Type mismatch. */
1580        drbd_resume_io(mdev);
1581        if (rv < SS_SUCCESS)
1582                goto fail;
1583
1584        if (!get_ldev_if_state(mdev, D_ATTACHING))
1585                goto force_diskless;
1586
1587        if (!mdev->bitmap) {
1588                if (drbd_bm_init(mdev)) {
1589                        retcode = ERR_NOMEM;
1590                        goto force_diskless_dec;
1591                }
1592        }
1593
1594        if (mdev->state.conn < C_CONNECTED &&
1595            mdev->state.role == R_PRIMARY &&
1596            (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
1597                dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
1598                    (unsigned long long)mdev->ed_uuid);
1599                retcode = ERR_DATA_NOT_CURRENT;
1600                goto force_diskless_dec;
1601        }
1602
1603        /* Since we are diskless, fix the activity log first... */
1604        if (drbd_check_al_size(mdev, new_disk_conf)) {
1605                retcode = ERR_NOMEM;
1606                goto force_diskless_dec;
1607        }
1608
1609        /* Prevent shrinking of consistent devices ! */
1610        if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
1611            drbd_new_dev_size(mdev, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) {
1612                dev_warn(DEV, "refusing to truncate a consistent device\n");
1613                retcode = ERR_DISK_TOO_SMALL;
1614                goto force_diskless_dec;
1615        }
1616
1617        /* Reset the "barriers don't work" bits here, then force meta data to
1618         * be written, to ensure we determine if barriers are supported. */
1619        if (new_disk_conf->md_flushes)
1620                clear_bit(MD_NO_FUA, &mdev->flags);
1621        else
1622                set_bit(MD_NO_FUA, &mdev->flags);
1623
1624        /* Point of no return reached.
1625         * Devices and memory are no longer released by error cleanup below.
1626         * Now mdev takes over responsibility, and the state engine should
1627         * clean it up somewhere.  */
1628        D_ASSERT(mdev->ldev == NULL);
1629        mdev->ldev = nbc;
1630        mdev->resync = resync_lru;
1631        mdev->rs_plan_s = new_plan;
1632        nbc = NULL;
1633        resync_lru = NULL;
1634        new_disk_conf = NULL;
1635        new_plan = NULL;
1636
1637        drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush);
1638
1639        if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
1640                set_bit(CRASHED_PRIMARY, &mdev->flags);
1641        else
1642                clear_bit(CRASHED_PRIMARY, &mdev->flags);
1643
1644        if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
1645            !(mdev->state.role == R_PRIMARY && mdev->tconn->susp_nod))
1646                set_bit(CRASHED_PRIMARY, &mdev->flags);
1647
1648        mdev->send_cnt = 0;
1649        mdev->recv_cnt = 0;
1650        mdev->read_cnt = 0;
1651        mdev->writ_cnt = 0;
1652
1653        drbd_reconsider_max_bio_size(mdev);
1654
1655        /* If I am currently not R_PRIMARY,
1656         * but meta data primary indicator is set,
1657         * I just now recover from a hard crash,
1658         * and have been R_PRIMARY before that crash.
1659         *
1660         * Now, if I had no connection before that crash
1661         * (have been degraded R_PRIMARY), chances are that
1662         * I won't find my peer now either.
1663         *
1664         * In that case, and _only_ in that case,
1665         * we use the degr-wfc-timeout instead of the default,
1666         * so we can automatically recover from a crash of a
1667         * degraded but active "cluster" after a certain timeout.
1668         */
1669        clear_bit(USE_DEGR_WFC_T, &mdev->flags);
1670        if (mdev->state.role != R_PRIMARY &&
1671             drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
1672            !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
1673                set_bit(USE_DEGR_WFC_T, &mdev->flags);
1674
1675        dd = drbd_determine_dev_size(mdev, 0, NULL);
1676        if (dd <= DS_ERROR) {
1677                retcode = ERR_NOMEM_BITMAP;
1678                goto force_diskless_dec;
1679        } else if (dd == DS_GREW)
1680                set_bit(RESYNC_AFTER_NEG, &mdev->flags);
1681
1682        if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) ||
1683            (test_bit(CRASHED_PRIMARY, &mdev->flags) &&
1684             drbd_md_test_flag(mdev->ldev, MDF_AL_DISABLED))) {
1685                dev_info(DEV, "Assuming that all blocks are out of sync "
1686                     "(aka FullSync)\n");
1687                if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
1688                        "set_n_write from attaching", BM_LOCKED_MASK)) {
1689                        retcode = ERR_IO_MD_DISK;
1690                        goto force_diskless_dec;
1691                }
1692        } else {
1693                if (drbd_bitmap_io(mdev, &drbd_bm_read,
1694                        "read from attaching", BM_LOCKED_MASK)) {
1695                        retcode = ERR_IO_MD_DISK;
1696                        goto force_diskless_dec;
1697                }
1698        }
1699
1700        if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
1701                drbd_suspend_al(mdev); /* IO is still suspended here... */
1702
1703        spin_lock_irq(&mdev->tconn->req_lock);
1704        os = drbd_read_state(mdev);
1705        ns = os;
1706        /* If MDF_CONSISTENT is not set go into inconsistent state,
1707           otherwise investigate MDF_WAS_UP_TO_DATE...
1708           If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
1709           otherwise into D_CONSISTENT state.
1710        */
1711        if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) {
1712                if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE))
1713                        ns.disk = D_CONSISTENT;
1714                else
1715                        ns.disk = D_OUTDATED;
1716        } else {
1717                ns.disk = D_INCONSISTENT;
1718        }
1719
1720        if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
1721                ns.pdsk = D_OUTDATED;
1722
1723        rcu_read_lock();
1724        if (ns.disk == D_CONSISTENT &&
1725            (ns.pdsk == D_OUTDATED || rcu_dereference(mdev->ldev->disk_conf)->fencing == FP_DONT_CARE))
1726                ns.disk = D_UP_TO_DATE;
1727
1728        /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
1729           MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
1730           this point, because drbd_request_state() modifies these
1731           flags. */
1732
1733        if (rcu_dereference(mdev->ldev->disk_conf)->al_updates)
1734                mdev->ldev->md.flags &= ~MDF_AL_DISABLED;
1735        else
1736                mdev->ldev->md.flags |= MDF_AL_DISABLED;
1737
1738        rcu_read_unlock();
1739
1740        /* In case we are C_CONNECTED postpone any decision on the new disk
1741           state after the negotiation phase. */
1742        if (mdev->state.conn == C_CONNECTED) {
1743                mdev->new_state_tmp.i = ns.i;
1744                ns.i = os.i;
1745                ns.disk = D_NEGOTIATING;
1746
1747                /* We expect to receive up-to-date UUIDs soon.
1748                   To avoid a race in receive_state, free p_uuid while
1749                   holding req_lock. I.e. atomic with the state change */
1750                kfree(mdev->p_uuid);
1751                mdev->p_uuid = NULL;
1752        }
1753
1754        rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
1755        spin_unlock_irq(&mdev->tconn->req_lock);
1756
1757        if (rv < SS_SUCCESS)
1758                goto force_diskless_dec;
1759
1760        mod_timer(&mdev->request_timer, jiffies + HZ);
1761
1762        if (mdev->state.role == R_PRIMARY)
1763                mdev->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
1764        else
1765                mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
1766
1767        drbd_md_mark_dirty(mdev);
1768        drbd_md_sync(mdev);
1769
1770        kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
1771        put_ldev(mdev);
1772        conn_reconfig_done(mdev->tconn);
1773        drbd_adm_finish(info, retcode);
1774        return 0;
1775
1776 force_diskless_dec:
1777        put_ldev(mdev);
1778 force_diskless:
1779        drbd_force_state(mdev, NS(disk, D_DISKLESS));
1780        drbd_md_sync(mdev);
1781 fail:
1782        conn_reconfig_done(mdev->tconn);
1783        if (nbc) {
1784                if (nbc->backing_bdev)
1785                        blkdev_put(nbc->backing_bdev,
1786                                   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1787                if (nbc->md_bdev)
1788                        blkdev_put(nbc->md_bdev,
1789                                   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1790                kfree(nbc);
1791        }
1792        kfree(new_disk_conf);
1793        lc_destroy(resync_lru);
1794        kfree(new_plan);
1795
1796 finish:
1797        drbd_adm_finish(info, retcode);
1798        return 0;
1799}
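
/*
 * Editor's sketch (not part of drbd_nl.c): drbd_adm_attach() above claims
 * both the backing device and the meta-data device with blkdev_get_by_path()
 * in FMODE_EXCL mode, passing a "holder" cookie that identifies the claimant;
 * that is also why the failure path must call blkdev_put() with the very same
 * mode flags.  The basic claim/release pairing, with hypothetical path and
 * holder arguments, looks like this:
 */
#include <linux/fs.h>
#include <linux/err.h>

static int example_claim_and_release(const char *path, void *holder)
{
        struct block_device *bdev;

        /* FMODE_EXCL + holder: a second exclusive open of the same device
         * with a different holder fails with -EBUSY until blkdev_put(). */
        bdev = blkdev_get_by_path(path,
                                  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
                                  holder);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);

        /* ... use the device ... */

        /* the mode passed here must match the one used on open */
        blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
        return 0;
}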
1800
1801static int adm_detach(struct drbd_conf *mdev, int force)
1802{
1803        enum drbd_state_rv retcode;
1804        int ret;
1805
1806        if (force) {
1807                set_bit(FORCE_DETACH, &mdev->flags);
1808                drbd_force_state(mdev, NS(disk, D_FAILED));
1809                retcode = SS_SUCCESS;
1810                goto out;
1811        }
1812
1813        drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
1814        drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */
1815        retcode = drbd_request_state(mdev, NS(disk, D_FAILED));
1816        drbd_md_put_buffer(mdev);
1817        /* D_FAILED will transition to DISKLESS. */
1818        ret = wait_event_interruptible(mdev->misc_wait,
1819                        mdev->state.disk != D_FAILED);
1820        drbd_resume_io(mdev);
1821        if ((int)retcode == (int)SS_IS_DISKLESS)
1822                retcode = SS_NOTHING_TO_DO;
1823        if (ret)
1824                retcode = ERR_INTR;
1825out:
1826        return retcode;
1827}
1828
1829/* Detaching the disk is a process in multiple stages.  First we need to lock
1830 * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
1831 * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
1832 * internal references as well.
1833 * Only then we have finally detached. */
1834int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
1835{
1836        enum drbd_ret_code retcode;
1837        struct detach_parms parms = { };
1838        int err;
1839
1840        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
1841        if (!adm_ctx.reply_skb)
1842                return retcode;
1843        if (retcode != NO_ERROR)
1844                goto out;
1845
1846        if (info->attrs[DRBD_NLA_DETACH_PARMS]) {
1847                err = detach_parms_from_attrs(&parms, info);
1848                if (err) {
1849                        retcode = ERR_MANDATORY_TAG;
1850                        drbd_msg_put_info(from_attrs_err_to_txt(err));
1851                        goto out;
1852                }
1853        }
1854
1855        retcode = adm_detach(adm_ctx.mdev, parms.force_detach);
1856out:
1857        drbd_adm_finish(info, retcode);
1858        return 0;
1859}
1860
1861static bool conn_resync_running(struct drbd_tconn *tconn)
1862{
1863        struct drbd_conf *mdev;
1864        bool rv = false;
1865        int vnr;
1866
1867        rcu_read_lock();
1868        idr_for_each_entry(&tconn->volumes, mdev, vnr) {
1869                if (mdev->state.conn == C_SYNC_SOURCE ||
1870                    mdev->state.conn == C_SYNC_TARGET ||
1871                    mdev->state.conn == C_PAUSED_SYNC_S ||
1872                    mdev->state.conn == C_PAUSED_SYNC_T) {
1873                        rv = true;
1874                        break;
1875                }
1876        }
1877        rcu_read_unlock();
1878
1879        return rv;
1880}
1881
1882static bool conn_ov_running(struct drbd_tconn *tconn)
1883{
1884        struct drbd_conf *mdev;
1885        bool rv = false;
1886        int vnr;
1887
1888        rcu_read_lock();
1889        idr_for_each_entry(&tconn->volumes, mdev, vnr) {
1890                if (mdev->state.conn == C_VERIFY_S ||
1891                    mdev->state.conn == C_VERIFY_T) {
1892                        rv = true;
1893                        break;
1894                }
1895        }
1896        rcu_read_unlock();
1897
1898        return rv;
1899}
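
/*
 * Editor's sketch (not part of drbd_nl.c): conn_resync_running() and
 * conn_ov_running() above share one shape -- walk all volumes of a connection
 * under rcu_read_lock() and report whether any of them matches a condition.
 * The generic form of that walk (conn_any_volume is a hypothetical helper;
 * the types come from drbd_int.h, which this file already includes):
 */
static bool conn_any_volume(struct drbd_tconn *tconn,
                            bool (*match)(struct drbd_conf *mdev))
{
        struct drbd_conf *mdev;
        bool rv = false;
        int vnr;

        rcu_read_lock();
        idr_for_each_entry(&tconn->volumes, mdev, vnr) {
                if (match(mdev)) {
                        rv = true;
                        break;
                }
        }
        rcu_read_unlock();

        return rv;
}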
1900
1901static enum drbd_ret_code
1902_check_net_options(struct drbd_tconn *tconn, struct net_conf *old_conf, struct net_conf *new_conf)
1903{
1904        struct drbd_conf *mdev;
1905        int i;
1906
1907        if (old_conf && tconn->cstate == C_WF_REPORT_PARAMS && tconn->agreed_pro_version < 100) {
1908                if (new_conf->wire_protocol != old_conf->wire_protocol)
1909                        return ERR_NEED_APV_100;
1910
1911                if (new_conf->two_primaries != old_conf->two_primaries)
1912                        return ERR_NEED_APV_100;
1913
1914                if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg))
1915                        return ERR_NEED_APV_100;
1916        }
1917
1918        if (!new_conf->two_primaries &&
1919            conn_highest_role(tconn) == R_PRIMARY &&
1920            conn_highest_peer(tconn) == R_PRIMARY)
1921                return ERR_NEED_ALLOW_TWO_PRI;
1922
1923        if (new_conf->two_primaries &&
1924            (new_conf->wire_protocol != DRBD_PROT_C))
1925                return ERR_NOT_PROTO_C;
1926
1927        idr_for_each_entry(&tconn->volumes, mdev, i) {
1928                if (get_ldev(mdev)) {
1929                        enum drbd_fencing_p fp = rcu_dereference(mdev->ldev->disk_conf)->fencing;
1930                        put_ldev(mdev);
1931                        if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH)
1932                                return ERR_STONITH_AND_PROT_A;
1933                }
1934                if (mdev->state.role == R_PRIMARY && new_conf->discard_my_data)
1935                        return ERR_DISCARD_IMPOSSIBLE;
1936        }
1937
1938        if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A)
1939                return ERR_CONG_NOT_PROTO_A;
1940
1941        return NO_ERROR;
1942}
1943
1944static enum drbd_ret_code
1945check_net_options(struct drbd_tconn *tconn, struct net_conf *new_conf)
1946{
1947        enum drbd_ret_code rv;
1948        struct drbd_conf *mdev;
1949        int i;
1950
1951        rcu_read_lock();
1952        rv = _check_net_options(tconn, rcu_dereference(tconn->net_conf), new_conf);
1953        rcu_read_unlock();
1954
1955        /* tconn->volumes protected by genl_lock() here */
1956        idr_for_each_entry(&tconn->volumes, mdev, i) {
1957                if (!mdev->bitmap) {
1958                        if(drbd_bm_init(mdev))
1959                                return ERR_NOMEM;
1960                }
1961        }
1962
1963        return rv;
1964}
1965
1966struct crypto {
1967        struct crypto_hash *verify_tfm;
1968        struct crypto_hash *csums_tfm;
1969        struct crypto_hash *cram_hmac_tfm;
1970        struct crypto_hash *integrity_tfm;
1971};
1972
1973static int
1974alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg)
1975{
1976        if (!tfm_name[0])
1977                return NO_ERROR;
1978
1979        *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC);
1980        if (IS_ERR(*tfm)) {
1981                *tfm = NULL;
1982                return err_alg;
1983        }
1984
1985        return NO_ERROR;
1986}
1987
1988static enum drbd_ret_code
1989alloc_crypto(struct crypto *crypto, struct net_conf *new_conf)
1990{
1991        char hmac_name[CRYPTO_MAX_ALG_NAME];
1992        enum drbd_ret_code rv;
1993
1994        rv = alloc_hash(&crypto->csums_tfm, new_conf->csums_alg,
1995                       ERR_CSUMS_ALG);
1996        if (rv != NO_ERROR)
1997                return rv;
1998        rv = alloc_hash(&crypto->verify_tfm, new_conf->verify_alg,
1999                       ERR_VERIFY_ALG);
2000        if (rv != NO_ERROR)
2001                return rv;
2002        rv = alloc_hash(&crypto->integrity_tfm, new_conf->integrity_alg,
2003                       ERR_INTEGRITY_ALG);
2004        if (rv != NO_ERROR)
2005                return rv;
2006        if (new_conf->cram_hmac_alg[0] != 0) {
2007                snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
2008                         new_conf->cram_hmac_alg);
2009
2010                rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name,
2011                               ERR_AUTH_ALG);
2012        }
2013
2014        return rv;
2015}
2016
2017static void free_crypto(struct crypto *crypto)
2018{
2019        crypto_free_hash(crypto->cram_hmac_tfm);
2020        crypto_free_hash(crypto->integrity_tfm);
2021        crypto_free_hash(crypto->csums_tfm);
2022        crypto_free_hash(crypto->verify_tfm);
2023}
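
/*
 * Editor's sketch (not part of drbd_nl.c): struct crypto above is a scratch
 * container.  alloc_crypto() fills it, the caller moves the transforms it
 * wants to keep into the connection and NULLs the corresponding fields, and
 * free_crypto() releases whatever is left (crypto_free_hash() is a no-op for
 * NULL pointers, which is what the sweep relies on).  In outline, for a
 * single transform (adopt_verify_tfm is a hypothetical helper name):
 */
static enum drbd_ret_code adopt_verify_tfm(struct drbd_tconn *tconn,
                                           struct net_conf *new_conf)
{
        struct crypto crypto = { };
        enum drbd_ret_code rv;

        rv = alloc_crypto(&crypto, new_conf);
        if (rv != NO_ERROR)
                return rv;

        crypto_free_hash(tconn->verify_tfm);    /* drop the old transform */
        tconn->verify_tfm = crypto.verify_tfm;  /* keep the new one ... */
        crypto.verify_tfm = NULL;               /* ... so free_crypto() skips it */

        free_crypto(&crypto);                   /* releases the unused tfms */
        return NO_ERROR;
}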
2024
2025int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
2026{
2027        enum drbd_ret_code retcode;
2028        struct drbd_tconn *tconn;
2029        struct net_conf *old_conf, *new_conf = NULL;
2030        int err;
2031        int ovr; /* online verify running */
2032        int rsr; /* re-sync running */
2033        struct crypto crypto = { };
2034
2035        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
2036        if (!adm_ctx.reply_skb)
2037                return retcode;
2038        if (retcode != NO_ERROR)
2039                goto out;
2040
2041        tconn = adm_ctx.tconn;
2042
2043        new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
2044        if (!new_conf) {
2045                retcode = ERR_NOMEM;
2046                goto out;
2047        }
2048
2049        conn_reconfig_start(tconn);
2050
2051        mutex_lock(&tconn->data.mutex);
2052        mutex_lock(&tconn->conf_update);
2053        old_conf = tconn->net_conf;
2054
2055        if (!old_conf) {
2056                drbd_msg_put_info("net conf missing, try connect");
2057                retcode = ERR_INVALID_REQUEST;
2058                goto fail;
2059        }
2060
2061        *new_conf = *old_conf;
2062        if (should_set_defaults(info))
2063                set_net_conf_defaults(new_conf);
2064
2065        err = net_conf_from_attrs_for_change(new_conf, info);
2066        if (err && err != -ENOMSG) {
2067                retcode = ERR_MANDATORY_TAG;
2068                drbd_msg_put_info(from_attrs_err_to_txt(err));
2069                goto fail;
2070        }
2071
2072        retcode = check_net_options(tconn, new_conf);
2073        if (retcode != NO_ERROR)
2074                goto fail;
2075
2076        /* re-sync running */
2077        rsr = conn_resync_running(tconn);
2078        if (rsr && strcmp(new_conf->csums_alg, old_conf->csums_alg)) {
2079                retcode = ERR_CSUMS_RESYNC_RUNNING;
2080                goto fail;
2081        }
2082
2083        /* online verify running */
2084        ovr = conn_ov_running(tconn);
2085        if (ovr && strcmp(new_conf->verify_alg, old_conf->verify_alg)) {
2086                retcode = ERR_VERIFY_RUNNING;
2087                goto fail;
2088        }
2089
2090        retcode = alloc_crypto(&crypto, new_conf);
2091        if (retcode != NO_ERROR)
2092                goto fail;
2093
2094        rcu_assign_pointer(tconn->net_conf, new_conf);
2095
2096        if (!rsr) {
2097                crypto_free_hash(tconn->csums_tfm);
2098                tconn->csums_tfm = crypto.csums_tfm;
2099                crypto.csums_tfm = NULL;
2100        }
2101        if (!ovr) {
2102                crypto_free_hash(tconn->verify_tfm);
2103                tconn->verify_tfm = crypto.verify_tfm;
2104                crypto.verify_tfm = NULL;
2105        }
2106
2107        crypto_free_hash(tconn->integrity_tfm);
2108        tconn->integrity_tfm = crypto.integrity_tfm;
2109        if (tconn->cstate >= C_WF_REPORT_PARAMS && tconn->agreed_pro_version >= 100)
2110                /* Do this without trying to take tconn->data.mutex again.  */
2111                __drbd_send_protocol(tconn, P_PROTOCOL_UPDATE);
2112
2113        crypto_free_hash(tconn->cram_hmac_tfm);
2114        tconn->cram_hmac_tfm = crypto.cram_hmac_tfm;
2115
2116        mutex_unlock(&tconn->conf_update);
2117        mutex_unlock(&tconn->data.mutex);
2118        synchronize_rcu();
2119        kfree(old_conf);
2120
2121        if (tconn->cstate >= C_WF_REPORT_PARAMS)
2122                drbd_send_sync_param(minor_to_mdev(conn_lowest_minor(tconn)));
2123
2124        goto done;
2125
2126 fail:
2127        mutex_unlock(&tconn->conf_update);
2128        mutex_unlock(&tconn->data.mutex);
2129        free_crypto(&crypto);
2130        kfree(new_conf);
2131 done:
2132        conn_reconfig_done(tconn);
2133 out:
2134        drbd_adm_finish(info, retcode);
2135        return 0;
2136}
2137
2138int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
2139{
2140        struct drbd_conf *mdev;
2141        struct net_conf *old_conf, *new_conf = NULL;
2142        struct crypto crypto = { };
2143        struct drbd_tconn *tconn;
2144        enum drbd_ret_code retcode;
2145        int i;
2146        int err;
2147
2148        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
2149
2150        if (!adm_ctx.reply_skb)
2151                return retcode;
2152        if (retcode != NO_ERROR)
2153                goto out;
2154        if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) {
2155                drbd_msg_put_info("connection endpoint(s) missing");
2156                retcode = ERR_INVALID_REQUEST;
2157                goto out;
2158        }
2159
2160        /* No need for _rcu here. All reconfiguration is
2161         * strictly serialized on genl_lock(). We are protected against
2162         * concurrent reconfiguration/addition/deletion */
2163        list_for_each_entry(tconn, &drbd_tconns, all_tconn) {
2164                if (nla_len(adm_ctx.my_addr) == tconn->my_addr_len &&
2165                    !memcmp(nla_data(adm_ctx.my_addr), &tconn->my_addr, tconn->my_addr_len)) {
2166                        retcode = ERR_LOCAL_ADDR;
2167                        goto out;
2168                }
2169
2170                if (nla_len(adm_ctx.peer_addr) == tconn->peer_addr_len &&
2171                    !memcmp(nla_data(adm_ctx.peer_addr), &tconn->peer_addr, tconn->peer_addr_len)) {
2172                        retcode = ERR_PEER_ADDR;
2173                        goto out;
2174                }
2175        }
2176
2177        tconn = adm_ctx.tconn;
2178        conn_reconfig_start(tconn);
2179
2180        if (tconn->cstate > C_STANDALONE) {
2181                retcode = ERR_NET_CONFIGURED;
2182                goto fail;
2183        }
2184
2185        /* allocation not in the IO path, drbdsetup / netlink process context */
2186        new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL);
2187        if (!new_conf) {
2188                retcode = ERR_NOMEM;
2189                goto fail;
2190        }
2191
2192        set_net_conf_defaults(new_conf);
2193
2194        err = net_conf_from_attrs(new_conf, info);
2195        if (err && err != -ENOMSG) {
2196                retcode = ERR_MANDATORY_TAG;
2197                drbd_msg_put_info(from_attrs_err_to_txt(err));
2198                goto fail;
2199        }
2200
2201        retcode = check_net_options(tconn, new_conf);
2202        if (retcode != NO_ERROR)
2203                goto fail;
2204
2205        retcode = alloc_crypto(&crypto, new_conf);
2206        if (retcode != NO_ERROR)
2207                goto fail;
2208
2209        ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
2210
2211        conn_flush_workqueue(tconn);
2212
2213        mutex_lock(&tconn->conf_update);
2214        old_conf = tconn->net_conf;
2215        if (old_conf) {
2216                retcode = ERR_NET_CONFIGURED;
2217                mutex_unlock(&tconn->conf_update);
2218                goto fail;
2219        }
2220        rcu_assign_pointer(tconn->net_conf, new_conf);
2221
2222        conn_free_crypto(tconn);
2223        tconn->cram_hmac_tfm = crypto.cram_hmac_tfm;
2224        tconn->integrity_tfm = crypto.integrity_tfm;
2225        tconn->csums_tfm = crypto.csums_tfm;
2226        tconn->verify_tfm = crypto.verify_tfm;
2227
2228        tconn->my_addr_len = nla_len(adm_ctx.my_addr);
2229        memcpy(&tconn->my_addr, nla_data(adm_ctx.my_addr), tconn->my_addr_len);
2230        tconn->peer_addr_len = nla_len(adm_ctx.peer_addr);
2231        memcpy(&tconn->peer_addr, nla_data(adm_ctx.peer_addr), tconn->peer_addr_len);
2232
2233        mutex_unlock(&tconn->conf_update);
2234
2235        rcu_read_lock();
2236        idr_for_each_entry(&tconn->volumes, mdev, i) {
2237                mdev->send_cnt = 0;
2238                mdev->recv_cnt = 0;
2239        }
2240        rcu_read_unlock();
2241
2242        retcode = conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE);
2243
2244        conn_reconfig_done(tconn);
2245        drbd_adm_finish(info, retcode);
2246        return 0;
2247
2248fail:
2249        free_crypto(&crypto);
2250        kfree(new_conf);
2251
2252        conn_reconfig_done(tconn);
2253out:
2254        drbd_adm_finish(info, retcode);
2255        return 0;
2256}
2257
2258static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool force)
2259{
2260        enum drbd_state_rv rv;
2261
2262        rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING),
2263                        force ? CS_HARD : 0);
2264
2265        switch (rv) {
2266        case SS_NOTHING_TO_DO:
2267                break;
2268        case SS_ALREADY_STANDALONE:
2269                return SS_SUCCESS;
2270        case SS_PRIMARY_NOP:
2271                /* Our state checking code wants to see the peer outdated. */
2272                rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0);
2273
2274                if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */
2275                        rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_VERBOSE);
2276
2277                break;
2278        case SS_CW_FAILED_BY_PEER:
2279                /* The peer probably wants to see us outdated. */
2280                rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING,
2281                                                        disk, D_OUTDATED), 0);
2282                if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) {
2283                        rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING),
2284                                        CS_HARD);
2285                }
2286                break;
2287        default:;
2288                /* no special handling necessary */
2289        }
2290
2291        if (rv >= SS_SUCCESS) {
2292                enum drbd_state_rv rv2;
2293                /* No one else can reconfigure the network while I am here.
2294                 * The state handling only uses drbd_thread_stop_nowait(),
2295                 * we want to really wait here until the receiver is no more.
2296                 */
2297                drbd_thread_stop(&adm_ctx.tconn->receiver);
2298
2299                /* Race breaker.  This additional state change request may be
2300                 * necessary, if this was a forced disconnect during a receiver
2301                 * restart.  We may have "killed" the receiver thread just
2302                 * after drbdd_init() returned.  Typically, we should be
2303                 * C_STANDALONE already, now, and this becomes a no-op.
2304                 */
2305                rv2 = conn_request_state(tconn, NS(conn, C_STANDALONE),
2306                                CS_VERBOSE | CS_HARD);
2307                if (rv2 < SS_SUCCESS)
2308                        conn_err(tconn,
2309                                "unexpected rv2=%d in conn_try_disconnect()\n",
2310                                rv2);
2311        }
2312        return rv;
2313}
2314
2315int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
2316{
2317        struct disconnect_parms parms;
2318        struct drbd_tconn *tconn;
2319        enum drbd_state_rv rv;
2320        enum drbd_ret_code retcode;
2321        int err;
2322
2323        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
2324        if (!adm_ctx.reply_skb)
2325                return retcode;
2326        if (retcode != NO_ERROR)
2327                goto fail;
2328
2329        tconn = adm_ctx.tconn;
2330        memset(&parms, 0, sizeof(parms));
2331        if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) {
2332                err = disconnect_parms_from_attrs(&parms, info);
2333                if (err) {
2334                        retcode = ERR_MANDATORY_TAG;
2335                        drbd_msg_put_info(from_attrs_err_to_txt(err));
2336                        goto fail;
2337                }
2338        }
2339
2340        rv = conn_try_disconnect(tconn, parms.force_disconnect);
2341        if (rv < SS_SUCCESS)
2342                retcode = rv;  /* FIXME: Type mismatch. */
2343        else
2344                retcode = NO_ERROR;
2345 fail:
2346        drbd_adm_finish(info, retcode);
2347        return 0;
2348}
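
/*
 * Editor's sketch (not part of drbd_nl.c): the "FIXME: Type mismatch"
 * comments in this file refer to two distinct enum spaces ending up in the
 * same reply field -- enum drbd_state_rv (the SS_* codes returned by the
 * state engine) and enum drbd_ret_code (NO_ERROR and the ERR_* codes).  The
 * checks here only rely on the SS_* ordering: every successful outcome
 * compares >= SS_SUCCESS, every failure compares below it.  As a predicate
 * (hypothetical helper name):
 */
static bool state_change_succeeded(enum drbd_state_rv rv)
{
        /* covers SS_SUCCESS as well as "already done" style results
         * such as SS_NOTHING_TO_DO */
        return rv >= SS_SUCCESS;
}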
2349
2350void resync_after_online_grow(struct drbd_conf *mdev)
2351{
2352        int iass; /* I am sync source */
2353
2354        dev_info(DEV, "Resync of new storage after online grow\n");
2355        if (mdev->state.role != mdev->state.peer)
2356                iass = (mdev->state.role == R_PRIMARY);
2357        else
2358                iass = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags);
2359
2360        if (iass)
2361                drbd_start_resync(mdev, C_SYNC_SOURCE);
2362        else
2363                _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
2364}
2365
2366int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2367{
2368        struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
2369        struct resize_parms rs;
2370        struct drbd_conf *mdev;
2371        enum drbd_ret_code retcode;
2372        enum determine_dev_size dd;
2373        bool change_al_layout = false;
2374        enum dds_flags ddsf;
2375        sector_t u_size;
2376        int err;
2377
2378        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2379        if (!adm_ctx.reply_skb)
2380                return retcode;
2381        if (retcode != NO_ERROR)
2382                goto fail;
2383
2384        mdev = adm_ctx.mdev;
2385        if (!get_ldev(mdev)) {
2386                retcode = ERR_NO_DISK;
2387                goto fail;
2388        }
2389
2390        memset(&rs, 0, sizeof(struct resize_parms));
2391        rs.al_stripes = mdev->ldev->md.al_stripes;
2392        rs.al_stripe_size = mdev->ldev->md.al_stripe_size_4k * 4;
2393        if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
2394                err = resize_parms_from_attrs(&rs, info);
2395                if (err) {
2396                        retcode = ERR_MANDATORY_TAG;
2397                        drbd_msg_put_info(from_attrs_err_to_txt(err));
2398                        goto fail_ldev;
2399                }
2400        }
2401
2402        if (mdev->state.conn > C_CONNECTED) {
2403                retcode = ERR_RESIZE_RESYNC;
2404                goto fail_ldev;
2405        }
2406
2407        if (mdev->state.role == R_SECONDARY &&
2408            mdev->state.peer == R_SECONDARY) {
2409                retcode = ERR_NO_PRIMARY;
2410                goto fail_ldev;
2411        }
2412
2413        if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) {
2414                retcode = ERR_NEED_APV_93;
2415                goto fail_ldev;
2416        }
2417
2418        rcu_read_lock();
2419        u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
2420        rcu_read_unlock();
2421        if (u_size != (sector_t)rs.resize_size) {
2422                new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
2423                if (!new_disk_conf) {
2424                        retcode = ERR_NOMEM;
2425                        goto fail_ldev;
2426                }
2427        }
2428
2429        if (mdev->ldev->md.al_stripes != rs.al_stripes ||
2430            mdev->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) {
2431                u32 al_size_k = rs.al_stripes * rs.al_stripe_size;
2432
2433                if (al_size_k > (16 * 1024 * 1024)) {
2434                        retcode = ERR_MD_LAYOUT_TOO_BIG;
2435                        goto fail_ldev;
2436                }
2437
2438                if (al_size_k < MD_32kB_SECT/2) {
2439                        retcode = ERR_MD_LAYOUT_TOO_SMALL;
2440                        goto fail_ldev;
2441                }
2442
2443                if (mdev->state.conn != C_CONNECTED) {
2444                        retcode = ERR_MD_LAYOUT_CONNECTED;
2445                        goto fail_ldev;
2446                }
2447
2448                change_al_layout = true;
2449        }
2450
2451        if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
2452                mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
2453
2454        if (new_disk_conf) {
2455                mutex_lock(&mdev->tconn->conf_update);
2456                old_disk_conf = mdev->ldev->disk_conf;
2457                *new_disk_conf = *old_disk_conf;
2458                new_disk_conf->disk_size = (sector_t)rs.resize_size;
2459                rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
2460                mutex_unlock(&mdev->tconn->conf_update);
2461                synchronize_rcu();
2462                kfree(old_disk_conf);
2463        }
2464
2465        ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
2466        dd = drbd_determine_dev_size(mdev, ddsf, change_al_layout ? &rs : NULL);
2467        drbd_md_sync(mdev);
2468        put_ldev(mdev);
2469        if (dd == DS_ERROR) {
2470                retcode = ERR_NOMEM_BITMAP;
2471                goto fail;
2472        } else if (dd == DS_ERROR_SPACE_MD) {
2473                retcode = ERR_MD_LAYOUT_NO_FIT;
2474                goto fail;
2475        } else if (dd == DS_ERROR_SHRINK) {
2476                retcode = ERR_IMPLICIT_SHRINK;
2477                goto fail;
2478        }
2479
2480        if (mdev->state.conn == C_CONNECTED) {
2481                if (dd == DS_GREW)
2482                        set_bit(RESIZE_PENDING, &mdev->flags);
2483
2484                drbd_send_uuids(mdev);
2485                drbd_send_sizes(mdev, 1, ddsf);
2486        }
2487
2488 fail:
2489        drbd_adm_finish(info, retcode);
2490        return 0;
2491
2492 fail_ldev:
2493        put_ldev(mdev);
2494        goto fail;
2495}
2496
2497int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
2498{
2499        enum drbd_ret_code retcode;
2500        struct drbd_tconn *tconn;
2501        struct res_opts res_opts;
2502        int err;
2503
2504        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
2505        if (!adm_ctx.reply_skb)
2506                return retcode;
2507        if (retcode != NO_ERROR)
2508                goto fail;
2509        tconn = adm_ctx.tconn;
2510
2511        res_opts = tconn->res_opts;
2512        if (should_set_defaults(info))
2513                set_res_opts_defaults(&res_opts);
2514
2515        err = res_opts_from_attrs(&res_opts, info);
2516        if (err && err != -ENOMSG) {
2517                retcode = ERR_MANDATORY_TAG;
2518                drbd_msg_put_info(from_attrs_err_to_txt(err));
2519                goto fail;
2520        }
2521
2522        err = set_resource_options(tconn, &res_opts);
2523        if (err) {
2524                retcode = ERR_INVALID_REQUEST;
2525                if (err == -ENOMEM)
2526                        retcode = ERR_NOMEM;
2527        }
2528
2529fail:
2530        drbd_adm_finish(info, retcode);
2531        return 0;
2532}
2533
2534int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
2535{
2536        struct drbd_conf *mdev;
2537        int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
2538
2539        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2540        if (!adm_ctx.reply_skb)
2541                return retcode;
2542        if (retcode != NO_ERROR)
2543                goto out;
2544
2545        mdev = adm_ctx.mdev;
2546
2547        /* If there is still bitmap IO pending, probably because of a previous
2548         * resync just being finished, wait for it before requesting a new resync.
2549         * Also wait for its after_state_ch(). */
2550        drbd_suspend_io(mdev);
2551        wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
2552        drbd_flush_workqueue(mdev);
2553
2554        /* If we happen to be C_STANDALONE R_SECONDARY, just change to
2555         * D_INCONSISTENT, and set all bits in the bitmap.  Otherwise,
2556         * try to start a resync handshake as sync target for full sync.
2557         */
2558        if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_SECONDARY) {
2559                retcode = drbd_request_state(mdev, NS(disk, D_INCONSISTENT));
2560                if (retcode >= SS_SUCCESS) {
2561                        if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
2562                                "set_n_write from invalidate", BM_LOCKED_MASK))
2563                                retcode = ERR_IO_MD_DISK;
2564                }
2565        } else
2566                retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
2567        drbd_resume_io(mdev);
2568
2569out:
2570        drbd_adm_finish(info, retcode);
2571        return 0;
2572}
2573
2574static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
2575                union drbd_state mask, union drbd_state val)
2576{
2577        enum drbd_ret_code retcode;
2578
2579        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2580        if (!adm_ctx.reply_skb)
2581                return retcode;
2582        if (retcode != NO_ERROR)
2583                goto out;
2584
2585        retcode = drbd_request_state(adm_ctx.mdev, mask, val);
2586out:
2587        drbd_adm_finish(info, retcode);
2588        return 0;
2589}
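
/*
 * Editor's sketch (not part of drbd_nl.c): drbd_request_state() takes a pair
 * of union drbd_state values -- "mask" selects which bit-fields are to be
 * changed, "val" carries the new values for exactly those fields.  The
 * NS()/NS2()/NS3() macros used all over this file build such pairs; spelled
 * out by hand, "outdate the disk" looks roughly like this (example_outdate is
 * hypothetical; D_MASK is assumed here to be the all-ones value of the disk
 * field that the NS() expansion uses):
 */
static enum drbd_state_rv example_outdate(struct drbd_conf *mdev)
{
        union drbd_state mask, val;

        mask.i = 0;
        val.i = 0;
        mask.disk = D_MASK;     /* only the .disk field is affected */
        val.disk = D_OUTDATED;  /* ... and this is its new value */

        return drbd_request_state(mdev, mask, val);
}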
2590
2591static int drbd_bmio_set_susp_al(struct drbd_conf *mdev)
2592{
2593        int rv;
2594
2595        rv = drbd_bmio_set_n_write(mdev);
2596        drbd_suspend_al(mdev);
2597        return rv;
2598}
2599
2600int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
2601{
2602        int retcode; /* drbd_ret_code, drbd_state_rv */
2603        struct drbd_conf *mdev;
2604
2605        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2606        if (!adm_ctx.reply_skb)
2607                return retcode;
2608        if (retcode != NO_ERROR)
2609                goto out;
2610
2611        mdev = adm_ctx.mdev;
2612
2613        /* If there is still bitmap IO pending, probably because of a previous
2614         * resync just being finished, wait for it before requesting a new resync.
2615         * Also wait for its after_state_ch(). */
2616        drbd_suspend_io(mdev);
2617        wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
2618        drbd_flush_workqueue(mdev);
2619
2620        /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits
2621         * in the bitmap.  Otherwise, try to start a resync handshake
2622         * as sync source for full sync.
2623         */
2624        if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_PRIMARY) {
2625                /* The peer will get a resync upon connect anyway. Just make that
2626                   into a full resync. */
2627                retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
2628                if (retcode >= SS_SUCCESS) {
2629                        if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
2630                                "set_n_write from invalidate_peer",
2631                                BM_LOCKED_SET_ALLOWED))
2632                                retcode = ERR_IO_MD_DISK;
2633                }
2634        } else
2635                retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
2636        drbd_resume_io(mdev);
2637
2638out:
2639        drbd_adm_finish(info, retcode);
2640        return 0;
2641}
2642
2643int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
2644{
2645        enum drbd_ret_code retcode;
2646
2647        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2648        if (!adm_ctx.reply_skb)
2649                return retcode;
2650        if (retcode != NO_ERROR)
2651                goto out;
2652
2653        if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
2654                retcode = ERR_PAUSE_IS_SET;
2655out:
2656        drbd_adm_finish(info, retcode);
2657        return 0;
2658}
2659
2660int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
2661{
2662        union drbd_dev_state s;
2663        enum drbd_ret_code retcode;
2664
2665        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2666        if (!adm_ctx.reply_skb)
2667                return retcode;
2668        if (retcode != NO_ERROR)
2669                goto out;
2670
2671        if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
2672                s = adm_ctx.mdev->state;
2673                if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
2674                        retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
2675                                  s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
2676                } else {
2677                        retcode = ERR_PAUSE_IS_CLEAR;
2678                }
2679        }
2680
2681out:
2682        drbd_adm_finish(info, retcode);
2683        return 0;
2684}
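
/*
 * Editor's sketch (not part of drbd_nl.c): pause/resume above only toggle the
 * user-initiated pause flag (user_isp).  A resync may stay paused because the
 * peer paused it (peer_isp) or because a resync-after dependency is still
 * syncing (aftr_isp); the ERR_PIC_PEER_DEP / ERR_PIC_AFTER_DEP codes report
 * which of those is still in effect.  Expressed as one predicate
 * (hypothetical helper name):
 */
static bool example_sync_is_paused(union drbd_dev_state s)
{
        return s.user_isp || s.peer_isp || s.aftr_isp;
}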
2685
2686int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info)
2687{
2688        return drbd_adm_simple_request_state(skb, info, NS(susp, 1));
2689}
2690
2691int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
2692{
2693        struct drbd_conf *mdev;
2694        int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
2695
2696        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2697        if (!adm_ctx.reply_skb)
2698                return retcode;
2699        if (retcode != NO_ERROR)
2700                goto out;
2701
2702        mdev = adm_ctx.mdev;
2703        if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
2704                drbd_uuid_new_current(mdev);
2705                clear_bit(NEW_CUR_UUID, &mdev->flags);
2706        }
2707        drbd_suspend_io(mdev);
2708        retcode = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
2709        if (retcode == SS_SUCCESS) {
2710                if (mdev->state.conn < C_CONNECTED)
2711                        tl_clear(mdev->tconn);
2712                if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED)
2713                        tl_restart(mdev->tconn, FAIL_FROZEN_DISK_IO);
2714        }
2715        drbd_resume_io(mdev);
2716
2717out:
2718        drbd_adm_finish(info, retcode);
2719        return 0;
2720}
2721
2722int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info)
2723{
2724        return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED));
2725}
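
/*
 * Editor's sketch (not part of drbd_nl.c): every drbd_adm_*() function in
 * this file is a generic-netlink .doit handler; the real command table,
 * attribute policies and the drbd-specific family header come from the
 * generated glue around drbd_genl.h.  Registering one such handler by hand
 * would look roughly like this -- command id, family name and the omitted
 * policy are placeholders, not the real DRBD values:
 */
static struct genl_ops example_ops[] = {
        {
                .cmd    = 1,                    /* placeholder command id */
                .flags  = GENL_ADMIN_PERM,      /* require CAP_NET_ADMIN */
                .doit   = drbd_adm_outdate,
        },
};

static struct genl_family example_family = {
        .id      = GENL_ID_GENERATE,
        .name    = "example_drbd",              /* placeholder family name */
        .version = 1,
        .maxattr = 0,
};

static int __init example_genl_register(void)
{
        return genl_register_family_with_ops(&example_family, example_ops,
                                             ARRAY_SIZE(example_ops));
}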
2726
2727int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_tconn *tconn, unsigned vnr)
2728{
2729        struct nlattr *nla;
2730        nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT);
2731        if (!nla)
2732                goto nla_put_failure;
2733        if (vnr != VOLUME_UNSPECIFIED &&
2734            nla_put_u32(skb, T_ctx_volume, vnr))
2735                goto nla_put_failure;
2736        if (nla_put_string(skb, T_ctx_resource_name, tconn->name))
2737                goto nla_put_failure;
2738        if (tconn->my_addr_len &&
2739            nla_put(skb, T_ctx_my_addr, tconn->my_addr_len, &tconn->my_addr))
2740                goto nla_put_failure;
2741        if (tconn->peer_addr_len &&
2742            nla_put(skb, T_ctx_peer_addr, tconn->peer_addr_len, &tconn->peer_addr))
2743                goto nla_put_failure;
2744        nla_nest_end(skb, nla);
2745        return 0;
2746
2747nla_put_failure:
2748        if (nla)
2749                nla_nest_cancel(skb, nla);
2750        return -EMSGSIZE;
2751}
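
/*
 * Editor's sketch (not part of drbd_nl.c): the DRBD_NLA_CFG_CONTEXT nest
 * written above is normally unpacked by the generated drbd_genl.h helpers on
 * the receiving side.  Conceptually, pulling two of its attributes back out
 * of such a nest with the plain netlink helpers looks like this
 * (example_read_cfg_context is hypothetical; ctx is the received nest
 * attribute):
 */
static void example_read_cfg_context(struct nlattr *ctx)
{
        struct nlattr *na;

        na = nla_find_nested(ctx, T_ctx_volume);
        if (na)
                pr_info("volume %u\n", nla_get_u32(na));

        na = nla_find_nested(ctx, T_ctx_resource_name);
        if (na)
                pr_info("resource %s\n", (char *)nla_data(na));
}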
2752
2753int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev,
2754                const struct sib_info *sib)
2755{
2756        struct state_info *si = NULL; /* for sizeof(si->member); */
2757        struct nlattr *nla;
2758        int got_ldev;
2759        int err = 0;
2760        int exclude_sensitive;
2761
2762        /* If sib != NULL, this is drbd_bcast_event, which anyone can listen
2763         * to.  So we had better exclude_sensitive information.
2764         *
2765         * If sib == NULL, this is drbd_adm_get_status, executed synchronously
2766         * in the context of the requesting user process. Exclude sensitive
2767         * information, unless current has superuser.
2768         *
2769         * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and
2770         * relies on the current implementation of netlink_dump(), which
2771         * executes the dump callback successively from netlink_recvmsg(),
2772         * always in the context of the receiving process */
2773        exclude_sensitive = sib || !capable(CAP_SYS_ADMIN);
2774
2775        got_ldev = get_ldev(mdev);
2776
2777        /* We still need to add connection name and volume number information.
2778         * Minor number is in drbd_genlmsghdr. */
2779        if (nla_put_drbd_cfg_context(skb, mdev->tconn, mdev->vnr))
2780                goto nla_put_failure;
2781
2782        if (res_opts_to_skb(skb, &mdev->tconn->res_opts, exclude_sensitive))
2783                goto nla_put_failure;
2784
2785        rcu_read_lock();
2786        if (got_ldev) {
2787                struct disk_conf *disk_conf;
2788
2789                disk_conf = rcu_dereference(mdev->ldev->disk_conf);
2790                err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive);
2791        }
2792        if (!err) {
2793                struct net_conf *nc;
2794
2795                nc = rcu_dereference(mdev->tconn->net_conf);
2796                if (nc)
2797                        err = net_conf_to_skb(skb, nc, exclude_sensitive);
2798        }
2799        rcu_read_unlock();
2800        if (err)
2801                goto nla_put_failure;
2802
2803        nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO);
2804        if (!nla)
2805                goto nla_put_failure;
2806        if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
2807            nla_put_u32(skb, T_current_state, mdev->state.i) ||
2808            nla_put_u64(skb, T_ed_uuid, mdev->ed_uuid) ||
2809            nla_put_u64(skb, T_capacity, drbd_get_capacity(mdev->this_bdev)) ||
2810            nla_put_u64(skb, T_send_cnt, mdev->send_cnt) ||
2811            nla_put_u64(skb, T_recv_cnt, mdev->recv_cnt) ||
2812            nla_put_u64(skb, T_read_cnt, mdev->read_cnt) ||
2813            nla_put_u64(skb, T_writ_cnt, mdev->writ_cnt) ||
2814            nla_put_u64(skb, T_al_writ_cnt, mdev->al_writ_cnt) ||
2815            nla_put_u64(skb, T_bm_writ_cnt, mdev->bm_writ_cnt) ||
2816            nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&mdev->ap_bio_cnt)) ||
2817            nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&mdev->ap_pending_cnt)) ||
2818            nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&mdev->rs_pending_cnt)))
2819                goto nla_put_failure;
2820
2821        if (got_ldev) {
2822                int err;
2823
2824                spin_lock_irq(&mdev->ldev->md.uuid_lock);
2825                err = nla_put(skb, T_uuids, sizeof(si->uuids), mdev->ldev->md.uuid);
2826                spin_unlock_irq(&mdev->ldev->md.uuid_lock);
2827
2828                if (err)
2829                        goto nla_put_failure;
2830
2831                if (nla_put_u32(skb, T_disk_flags, mdev->ldev->md.flags) ||
2832                    nla_put_u64(skb, T_bits_total, drbd_bm_bits(mdev)) ||
2833                    nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(mdev)))
2834                        goto nla_put_failure;
2835                if (C_SYNC_SOURCE <= mdev->state.conn &&
2836                    C_PAUSED_SYNC_T >= mdev->state.conn) {
2837                        if (nla_put_u64(skb, T_bits_rs_total, mdev->rs_total) ||
2838                            nla_put_u64(skb, T_bits_rs_failed, mdev->rs_failed))
2839                                goto nla_put_failure;
2840                }
2841        }
2842
2843        if (sib) {
2844                switch (sib->sib_reason) {
2845                case SIB_SYNC_PROGRESS:
2846                case SIB_GET_STATUS_REPLY:
2847                        break;
2848                case SIB_STATE_CHANGE:
2849                        if (nla_put_u32(skb, T_prev_state, sib->os.i) ||
2850                            nla_put_u32(skb, T_new_state, sib->ns.i))
2851                                goto nla_put_failure;
2852                        break;
2853                case SIB_HELPER_POST:
2854                        if (nla_put_u32(skb, T_helper_exit_code,
2855                                        sib->helper_exit_code))
2856                                goto nla_put_failure;
2857                        /* fall through */
2858                case SIB_HELPER_PRE:
2859                        if (nla_put_string(skb, T_helper, sib->helper_name))
2860                                goto nla_put_failure;
2861                        break;
2862                }
2863        }
2864        nla_nest_end(skb, nla);
2865
2866        if (0)
2867nla_put_failure:
2868                err = -EMSGSIZE;
2869        if (got_ldev)
2870                put_ldev(mdev);
2871        return err;
2872}
2873
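/* Reply with the current status of a single minor. */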
2874int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
2875{
2876        enum drbd_ret_code retcode;
2877        int err;
2878
2879        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2880        if (!adm_ctx.reply_skb)
2881                return retcode;
2882        if (retcode != NO_ERROR)
2883                goto out;
2884
2885        err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.mdev, NULL);
2886        if (err) {
2887                nlmsg_free(adm_ctx.reply_skb);
2888                return err;
2889        }
2890out:
2891        drbd_adm_finish(info, retcode);
2892        return 0;
2893}
2894
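/* Netlink dump callback for DRBD_ADM_GET_STATUS.
 * Emits the status of at most one volume per invocation; the dump
 * machinery keeps calling back until we return an empty skb.  The
 * iteration state is kept in cb->args[], as described below. */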
2895int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
2896{
2897        struct drbd_conf *mdev;
2898        struct drbd_genlmsghdr *dh;
2899        struct drbd_tconn *pos = (struct drbd_tconn*)cb->args[0];
2900        struct drbd_tconn *tconn = NULL;
2901        struct drbd_tconn *tmp;
2902        unsigned volume = cb->args[1];
2903
2904        /* Open-coded, deferred iteration:
2905         * list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) {
2906         *      idr_for_each_entry(&tconn->volumes, mdev, i) {
2907         *        ...
2908         *      }
2909         * }
2910         * where tconn is cb->args[0];
2911         * and i is cb->args[1];
2912         *
2913         * cb->args[2] indicates if we shall loop over all resources,
2914         * or just dump all volumes of a single resource.
2915         *
2916         * This may miss entries inserted after this dump started,
2917         * or entries deleted before they are reached.
2918         *
2919         * We need to make sure the mdev won't disappear while
2920         * we are looking at it, and revalidate our iterators
2921         * on each iteration.
2922         */
2923
2924        /* synchronize with conn_create()/conn_destroy() */
2925        rcu_read_lock();
2926        /* revalidate iterator position */
2927        list_for_each_entry_rcu(tmp, &drbd_tconns, all_tconn) {
2928                if (pos == NULL) {
2929                        /* first iteration */
2930                        pos = tmp;
2931                        tconn = pos;
2932                        break;
2933                }
2934                if (tmp == pos) {
2935                        tconn = pos;
2936                        break;
2937                }
2938        }
2939        if (tconn) {
2940next_tconn:
2941                mdev = idr_get_next(&tconn->volumes, &volume);
2942                if (!mdev) {
2943                        /* No more volumes to dump on this tconn.
2944                         * Advance tconn iterator. */
2945                        pos = list_entry_rcu(tconn->all_tconn.next,
2946                                             struct drbd_tconn, all_tconn);
2947                        /* Did we dump any volume on this tconn yet? */
2948                        if (volume != 0) {
2949                                /* If we reached the end of the list,
2950                                 * or only a single resource dump was requested,
2951                                 * we are done. */
2952                                if (&pos->all_tconn == &drbd_tconns || cb->args[2])
2953                                        goto out;
2954                                volume = 0;
2955                                tconn = pos;
2956                                goto next_tconn;
2957                        }
2958                }
2959
2960                dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
2961                                cb->nlh->nlmsg_seq, &drbd_genl_family,
2962                                NLM_F_MULTI, DRBD_ADM_GET_STATUS);
2963                if (!dh)
2964                        goto out;
2965
2966                if (!mdev) {
2967                        /* This is a tconn without a single volume.
2968                         * Surprisingly enough, it may have a network
2969                         * configuration. */
2970                        struct net_conf *nc;
2971                        dh->minor = -1U;
2972                        dh->ret_code = NO_ERROR;
2973                        if (nla_put_drbd_cfg_context(skb, tconn, VOLUME_UNSPECIFIED))
2974                                goto cancel;
2975                        nc = rcu_dereference(tconn->net_conf);
2976                        if (nc && net_conf_to_skb(skb, nc, 1) != 0)
2977                                goto cancel;
2978                        goto done;
2979                }
2980
2981                D_ASSERT(mdev->vnr == volume);
2982                D_ASSERT(mdev->tconn == tconn);
2983
2984                dh->minor = mdev_to_minor(mdev);
2985                dh->ret_code = NO_ERROR;
2986
2987                if (nla_put_status_info(skb, mdev, NULL)) {
2988cancel:
2989                        genlmsg_cancel(skb, dh);
2990                        goto out;
2991                }
2992done:
2993                genlmsg_end(skb, dh);
2994        }
2995
2996out:
2997        rcu_read_unlock();
2998        /* where to start the next iteration */
2999        cb->args[0] = (long)pos;
3000        cb->args[1] = (pos == tconn) ? volume + 1 : 0;
3001
3002        /* If no more tconns/volumes/minors are found, the skb stays empty,
3003         * which terminates the dump. */
3004        return skb->len;
3005}
3006
3007/*
3008 * Request status of all resources, or of all volumes within a single resource.
3009 *
3010 * This is a dump, because the answer may not fit into a single reply skb.
3011 * That means we cannot use family->attrbuf or similar members, because a
3012 * dump is NOT protected by the genl_lock().  During a dump we only have
3013 * access to the incoming skb, and need to open-code "parsing" of the nlattr payload.
3014 *
3015 * Once things are setup properly, we call into get_one_status().
3016 */
3017int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb)
3018{
3019        const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
3020        struct nlattr *nla;
3021        const char *resource_name;
3022        struct drbd_tconn *tconn;
3023        int maxtype;
3024
3025        /* Is this a follow-up call? */
3026        if (cb->args[0]) {
3027                /* ... of a single resource dump,
3028                 * and the resource iterator has been advanced already? */
3029                if (cb->args[2] && cb->args[2] != cb->args[0])
3030                        return 0; /* DONE. */
3031                goto dump;
3032        }
3033
3034        /* First call (from netlink_dump_start).  We need to figure out
3035         * which resource(s) the user wants us to dump. */
3036        nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen),
3037                        nlmsg_attrlen(cb->nlh, hdrlen),
3038                        DRBD_NLA_CFG_CONTEXT);
3039
3040        /* No explicit context given.  Dump all. */
3041        if (!nla)
3042                goto dump;
3043        maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
3044        nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name));
3045        if (IS_ERR(nla))
3046                return PTR_ERR(nla);
3047        /* context given, but no name present? */
3048        if (!nla)
3049                return -EINVAL;
3050        resource_name = nla_data(nla);
3051        tconn = conn_get_by_name(resource_name);
3052
3053        if (!tconn)
3054                return -ENODEV;
3055
3056        kref_put(&tconn->kref, &conn_destroy); /* get_one_status() (re)validates tconn by itself */
3057
3058        /* prime iterators, and set "filter" mode mark:
3059         * only dump this tconn. */
3060        cb->args[0] = (long)tconn;
3061        /* cb->args[1] = 0; passed in this way. */
3062        cb->args[2] = (long)tconn;
3063
3064dump:
3065        return get_one_status(skb, cb);
3066}
3067
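/* Tell user space which timeout applies to this minor:
 * UT_PEER_OUTDATED if the peer's disk is known to be Outdated,
 * UT_DEGRADED if we started up degraded (USE_DEGR_WFC_T set),
 * UT_DEFAULT otherwise. */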
3068int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
3069{
3070        enum drbd_ret_code retcode;
3071        struct timeout_parms tp;
3072        int err;
3073
3074        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3075        if (!adm_ctx.reply_skb)
3076                return retcode;
3077        if (retcode != NO_ERROR)
3078                goto out;
3079
3080        tp.timeout_type =
3081                adm_ctx.mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
3082                test_bit(USE_DEGR_WFC_T, &adm_ctx.mdev->flags) ? UT_DEGRADED :
3083                UT_DEFAULT;
3084
3085        err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp);
3086        if (err) {
3087                nlmsg_free(adm_ctx.reply_skb);
3088                return err;
3089        }
3090out:
3091        drbd_adm_finish(info, retcode);
3092        return 0;
3093}
3094
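/* Start online verify on this minor.  Unless explicit start/stop sectors
 * are passed in, verification resumes from the last known position and
 * runs to the end of the device. */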
3095int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
3096{
3097        struct drbd_conf *mdev;
3098        enum drbd_ret_code retcode;
3099        struct start_ov_parms parms;
3100
3101        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3102        if (!adm_ctx.reply_skb)
3103                return retcode;
3104        if (retcode != NO_ERROR)
3105                goto out;
3106
3107        mdev = adm_ctx.mdev;
3108
3109        /* resume from last known position, if possible */
3110        parms.ov_start_sector = mdev->ov_start_sector;
3111        parms.ov_stop_sector = ULLONG_MAX;
3112        if (info->attrs[DRBD_NLA_START_OV_PARMS]) {
3113                int err = start_ov_parms_from_attrs(&parms, info);
3114                if (err) {
3115                        retcode = ERR_MANDATORY_TAG;
3116                        drbd_msg_put_info(from_attrs_err_to_txt(err));
3117                        goto out;
3118                }
3119        }
3120        /* w_make_ov_request expects position to be aligned */
3121        mdev->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
3122        mdev->ov_stop_sector = parms.ov_stop_sector;
3123
3124        /* If there is still bitmap IO pending, e.g. a previous resync or verify
3125         * just finishing, wait for it before starting the online verify. */
3126        drbd_suspend_io(mdev);
3127        wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
3128        retcode = drbd_request_state(mdev, NS(conn, C_VERIFY_S));
3129        drbd_resume_io(mdev);
3130out:
3131        drbd_adm_finish(info, retcode);
3132        return 0;
3133}
3134
3135
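/* Generate a new current UUID.  With clear_bm set, on a freshly created
 * device (UI_CURRENT == UUID_JUST_CREATED) that is connected to a peer
 * speaking protocol >= 90, this also clears the bitmap and marks both
 * disks UpToDate, i.e. the initial full sync is skipped. */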
3136int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
3137{
3138        struct drbd_conf *mdev;
3139        enum drbd_ret_code retcode;
3140        int skip_initial_sync = 0;
3141        int err;
3142        struct new_c_uuid_parms args;
3143
3144        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3145        if (!adm_ctx.reply_skb)
3146                return retcode;
3147        if (retcode != NO_ERROR)
3148                goto out_nolock;
3149
3150        mdev = adm_ctx.mdev;
3151        memset(&args, 0, sizeof(args));
3152        if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) {
3153                err = new_c_uuid_parms_from_attrs(&args, info);
3154                if (err) {
3155                        retcode = ERR_MANDATORY_TAG;
3156                        drbd_msg_put_info(from_attrs_err_to_txt(err));
3157                        goto out_nolock;
3158                }
3159        }
3160
3161        mutex_lock(mdev->state_mutex); /* Protects us against serialized state changes. */
3162
3163        if (!get_ldev(mdev)) {
3164                retcode = ERR_NO_DISK;
3165                goto out;
3166        }
3167
3168        /* this is "skip initial sync", assumed to be clean */
3169        if (mdev->state.conn == C_CONNECTED && mdev->tconn->agreed_pro_version >= 90 &&
3170            mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
3171                dev_info(DEV, "Preparing to skip initial sync\n");
3172                skip_initial_sync = 1;
3173        } else if (mdev->state.conn != C_STANDALONE) {
3174                retcode = ERR_CONNECTED;
3175                goto out_dec;
3176        }
3177
3178        drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
3179        drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */
3180
3181        if (args.clear_bm) {
3182                err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
3183                        "clear_n_write from new_c_uuid", BM_LOCKED_MASK);
3184                if (err) {
3185                        dev_err(DEV, "Writing bitmap failed with %d\n", err);
3186                        retcode = ERR_IO_MD_DISK;
3187                }
3188                if (skip_initial_sync) {
3189                        drbd_send_uuids_skip_initial_sync(mdev);
3190                        _drbd_uuid_set(mdev, UI_BITMAP, 0);
3191                        drbd_print_uuids(mdev, "cleared bitmap UUID");
3192                        spin_lock_irq(&mdev->tconn->req_lock);
3193                        _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3194                                        CS_VERBOSE, NULL);
3195                        spin_unlock_irq(&mdev->tconn->req_lock);
3196                }
3197        }
3198
3199        drbd_md_sync(mdev);
3200out_dec:
3201        put_ldev(mdev);
3202out:
3203        mutex_unlock(mdev->state_mutex);
3204out_nolock:
3205        drbd_adm_finish(info, retcode);
3206        return 0;
3207}
3208
3209static enum drbd_ret_code
3210drbd_check_resource_name(const char *name)
3211{
3212        if (!name || !name[0]) {
3213                drbd_msg_put_info("resource name missing");
3214                return ERR_MANDATORY_TAG;
3215        }
3216        /* if we want to use these in sysfs/configfs/debugfs some day,
3217         * we must not allow slashes */
3218        if (strchr(name, '/')) {
3219                drbd_msg_put_info("invalid resource name");
3220                return ERR_INVALID_REQUEST;
3221        }
3222        return NO_ERROR;
3223}
3224
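/* Create a new resource (connection object) with the given resource
 * options.  If a resource of that name already exists, this succeeds
 * silently unless NLM_F_EXCL was requested. */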
3225int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
3226{
3227        enum drbd_ret_code retcode;
3228        struct res_opts res_opts;
3229        int err;
3230
3231        retcode = drbd_adm_prepare(skb, info, 0);
3232        if (!adm_ctx.reply_skb)
3233                return retcode;
3234        if (retcode != NO_ERROR)
3235                goto out;
3236
3237        set_res_opts_defaults(&res_opts);
3238        err = res_opts_from_attrs(&res_opts, info);
3239        if (err && err != -ENOMSG) {
3240                retcode = ERR_MANDATORY_TAG;
3241                drbd_msg_put_info(from_attrs_err_to_txt(err));
3242                goto out;
3243        }
3244
3245        retcode = drbd_check_resource_name(adm_ctx.resource_name);
3246        if (retcode != NO_ERROR)
3247                goto out;
3248
3249        if (adm_ctx.tconn) {
3250                if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) {
3251                        retcode = ERR_INVALID_REQUEST;
3252                        drbd_msg_put_info("resource exists");
3253                }
3254                /* else: still NO_ERROR */
3255                goto out;
3256        }
3257
3258        if (!conn_create(adm_ctx.resource_name, &res_opts))
3259                retcode = ERR_NOMEM;
3260out:
3261        drbd_adm_finish(info, retcode);
3262        return 0;
3263}
3264
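/* Add a new minor (volume) to an existing resource.  The minor number
 * comes from the genl header, the volume id from the config context. */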
3265int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info)
3266{
3267        struct drbd_genlmsghdr *dh = info->userhdr;
3268        enum drbd_ret_code retcode;
3269
3270        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
3271        if (!adm_ctx.reply_skb)
3272                return retcode;
3273        if (retcode != NO_ERROR)
3274                goto out;
3275
3276        if (dh->minor > MINORMASK) {
3277                drbd_msg_put_info("requested minor out of range");
3278                retcode = ERR_INVALID_REQUEST;
3279                goto out;
3280        }
3281        if (adm_ctx.volume > DRBD_VOLUME_MAX) {
3282                drbd_msg_put_info("requested volume id out of range");
3283                retcode = ERR_INVALID_REQUEST;
3284                goto out;
3285        }
3286
3287        /* drbd_adm_prepare made sure already
3288         * that mdev->tconn and mdev->vnr match the request. */
3289        if (adm_ctx.mdev) {
3290                if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
3291                        retcode = ERR_MINOR_EXISTS;
3292                /* else: still NO_ERROR */
3293                goto out;
3294        }
3295
3296        retcode = conn_new_minor(adm_ctx.tconn, dh->minor, adm_ctx.volume);
3297out:
3298        drbd_adm_finish(info, retcode);
3299        return 0;
3300}
3301
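/* A minor may only be deleted while it is Diskless and Secondary;
 * it does not have to be StandAlone, so that volumes can be removed
 * from a live replication group. */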
3302static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev)
3303{
3304        if (mdev->state.disk == D_DISKLESS &&
3305            /* no need to require mdev->state.conn == C_STANDALONE here;
3306             * we may want to delete a minor from a live replication group.
3307             */
3308            mdev->state.role == R_SECONDARY) {
3309                _drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS),
3310                                    CS_VERBOSE + CS_WAIT_COMPLETE);
3311                idr_remove(&mdev->tconn->volumes, mdev->vnr);
3312                idr_remove(&minors, mdev_to_minor(mdev));
3313                destroy_workqueue(mdev->submit.wq);
3314                del_gendisk(mdev->vdisk);
3315                synchronize_rcu();
3316                kref_put(&mdev->kref, &drbd_minor_destroy);
3317                return NO_ERROR;
3318        } else
3319                return ERR_MINOR_CONFIGURED;
3320}
3321
3322int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info)
3323{
3324        enum drbd_ret_code retcode;
3325
3326        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3327        if (!adm_ctx.reply_skb)
3328                return retcode;
3329        if (retcode != NO_ERROR)
3330                goto out;
3331
3332        retcode = adm_delete_minor(adm_ctx.mdev);
3333out:
3334        drbd_adm_finish(info, retcode);
3335        return 0;
3336}
3337
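/* Tear down a whole resource: demote all volumes to Secondary,
 * disconnect, detach, delete all volumes, then delete the connection
 * itself.  Any failure along the way aborts the sequence. */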
3338int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
3339{
3340        int retcode; /* enum drbd_ret_code or enum drbd_state_rv, respectively */
3341        struct drbd_conf *mdev;
3342        unsigned i;
3343
3344        retcode = drbd_adm_prepare(skb, info, 0);
3345        if (!adm_ctx.reply_skb)
3346                return retcode;
3347        if (retcode != NO_ERROR)
3348                goto out;
3349
3350        if (!adm_ctx.tconn) {
3351                retcode = ERR_RES_NOT_KNOWN;
3352                goto out;
3353        }
3354
3355        /* demote */
3356        idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) {
3357                retcode = drbd_set_role(mdev, R_SECONDARY, 0);
3358                if (retcode < SS_SUCCESS) {
3359                        drbd_msg_put_info("failed to demote");
3360                        goto out;
3361                }
3362        }
3363
3364        retcode = conn_try_disconnect(adm_ctx.tconn, 0);
3365        if (retcode < SS_SUCCESS) {
3366                drbd_msg_put_info("failed to disconnect");
3367                goto out;
3368        }
3369
3370        /* detach */
3371        idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) {
3372                retcode = adm_detach(mdev, 0);
3373                if (retcode < SS_SUCCESS || retcode > NO_ERROR) {
3374                        drbd_msg_put_info("failed to detach");
3375                        goto out;
3376                }
3377        }
3378
3379        /* If we reach this, all volumes (of this tconn) are Secondary,
3380         * Disconnected, Diskless, aka Unconfigured. Make sure all threads have
3381         * actually stopped; state handling only does drbd_thread_stop_nowait(). */
3382        drbd_thread_stop(&adm_ctx.tconn->worker);
3383
3384        /* Now, nothing can fail anymore */
3385
3386        /* delete volumes */
3387        idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) {
3388                retcode = adm_delete_minor(mdev);
3389                if (retcode != NO_ERROR) {
3390                        /* "can not happen" */
3391                        drbd_msg_put_info("failed to delete volume");
3392                        goto out;
3393                }
3394        }
3395
3396        /* delete connection */
3397        if (conn_lowest_minor(adm_ctx.tconn) < 0) {
3398                list_del_rcu(&adm_ctx.tconn->all_tconn);
3399                synchronize_rcu();
3400                kref_put(&adm_ctx.tconn->kref, &conn_destroy);
3401
3402                retcode = NO_ERROR;
3403        } else {
3404                /* "can not happen" */
3405                retcode = ERR_RES_IN_USE;
3406                drbd_msg_put_info("failed to delete connection");
3407        }
3408        goto out;
3409out:
3410        drbd_adm_finish(info, retcode);
3411        return 0;
3412}
3413
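/* Delete an (already empty) resource.  Refused with ERR_RES_IN_USE
 * while the resource still has any minors. */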
3414int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
3415{
3416        enum drbd_ret_code retcode;
3417
3418        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
3419        if (!adm_ctx.reply_skb)
3420                return retcode;
3421        if (retcode != NO_ERROR)
3422                goto out;
3423
3424        if (conn_lowest_minor(adm_ctx.tconn) < 0) {
3425                list_del_rcu(&adm_ctx.tconn->all_tconn);
3426                synchronize_rcu();
3427                kref_put(&adm_ctx.tconn->kref, &conn_destroy);
3428
3429                retcode = NO_ERROR;
3430        } else {
3431                retcode = ERR_RES_IN_USE;
3432        }
3433
3434        if (retcode == NO_ERROR)
3435                drbd_thread_stop(&adm_ctx.tconn->worker);
3436out:
3437        drbd_adm_finish(info, retcode);
3438        return 0;
3439}
3440
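/* Broadcast a state info ("sib") event to the DRBD events multicast
 * group.  Sync progress events are rate limited to about one per
 * second; all other events are sent unconditionally. */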
3441void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib)
3442{
3443        static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
3444        struct sk_buff *msg;
3445        struct drbd_genlmsghdr *d_out;
3446        unsigned seq;
3447        int err = -ENOMEM;
3448
3449        if (sib->sib_reason == SIB_SYNC_PROGRESS) {
3450                if (time_after(jiffies, mdev->rs_last_bcast + HZ))
3451                        mdev->rs_last_bcast = jiffies;
3452                else
3453                        return;
3454        }
3455
3456        seq = atomic_inc_return(&drbd_genl_seq);
3457        msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
3458        if (!msg)
3459                goto failed;
3460
3461        err = -EMSGSIZE;
3462        d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT);
3463        if (!d_out) /* cannot happen, but anyway. */
3464                goto nla_put_failure;
3465        d_out->minor = mdev_to_minor(mdev);
3466        d_out->ret_code = NO_ERROR;
3467
3468        if (nla_put_status_info(msg, mdev, sib))
3469                goto nla_put_failure;
3470        genlmsg_end(msg, d_out);
3471        err = drbd_genl_multicast_events(msg, 0);
3472        /* msg has been consumed or freed in netlink_broadcast() */
3473        if (err && err != -ESRCH)
3474                goto failed;
3475
3476        return;
3477
3478nla_put_failure:
3479        nlmsg_free(msg);
3480failed:
3481        dev_err(DEV, "Error %d while broadcasting event. "
3482                        "Event seq:%u sib_reason:%u\n",
3483                        err, seq, sib->sib_reason);
3484}
3485